diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..6313b56 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +* text=auto eol=lf diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml new file mode 100644 index 0000000..af35f04 --- /dev/null +++ b/.github/workflows/build.yaml @@ -0,0 +1,72 @@ +# SPDX-License-Identifier: Apache-2.0 + +on: [push, pull_request] + +name: Build and test + +jobs: + check: + name: Check + runs-on: ubuntu-latest + steps: + - name: Checkout sources + uses: actions/checkout@v4 + + - name: Install stable toolchain + uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: stable + override: true + + - name: Run cargo check + run: cargo check + - name: Run cargo check --no-default-features + run: cargo check --no-default-features + + test: + name: Tests + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + include: + # Default configuration (int64 + float) + - name: Default (int64 + float) + features: "" + + # No features baseline + - name: No features + features: --no-default-features + + # int32 configurations + - name: int32 + float + features: --no-default-features --features "int32,float" + - name: int32 + float-skip + features: --no-default-features --features "int32,float-skip" + - name: int32 + float-error + features: --no-default-features --features "int32,float-error" + - name: int32 + float-truncate + features: --no-default-features --features "int32,float-truncate" + + # int64 configurations (beyond default) + - name: int64 + float-skip + features: --no-default-features --features "int64,float-skip" + - name: int64 + float-error + features: --no-default-features --features "int64,float-error" + - name: int64 + float-truncate + features: --no-default-features --features "int64,float-truncate" + steps: + - name: Checkout sources + uses: actions/checkout@v4 + + - name: Install stable toolchain + uses: actions-rs/toolchain@v1 + with: + profile: minimal + 
toolchain: stable + override: true + + - name: Run cargo test + working-directory: stax + run: cargo test ${{ matrix.features }} diff --git a/.gitignore b/.gitignore index ad67955..9546fb5 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,4 @@ target # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ +Cargo.lock diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..2cb20e3 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,23 @@ +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-added-large-files + - id: check-merge-conflict + - id: check-json + exclude: | + (?x)^( + .vscode/.*.json + )$ + - id: check-yaml + - id: check-toml + - id: mixed-line-ending + args: ["--fix=lf"] + +- repo: https://github.com/doublify/pre-commit-rust + rev: v1.0 + hooks: + - id: fmt + name: Fmt diff --git a/CONFIGURATION.md b/CONFIGURATION.md new file mode 100644 index 0000000..d4affa1 --- /dev/null +++ b/CONFIGURATION.md @@ -0,0 +1,141 @@ +# Configurable Number Handling + +The stax JSON parser provides comprehensive configurability for number handling, making it suitable for both full-featured and embedded environments. 
+ +## Feature Flags + +### Integer Width +Choose the integer type to avoid pulling in unnecessary math routines: + +- **`int64`** (default): Use `i64` for full range integer support +- **`int32`**: Use `i32` for embedded targets (no 64-bit math routines) + +### Float Support +Control float parsing behavior: + +- **`float`**: Enable full f64 parsing support +- **No float feature**: Disable float parsing (multiple behavior options available) + +### Float Behavior (when `float` feature is disabled) +Choose what happens when floats are encountered: + +- **Default**: Return `FloatDisabled` with raw string preserved for manual parsing +- **`float-error`**: Fail parsing when floats are encountered (embedded fail-fast) +- **`float-truncate`**: Truncate simple decimals to integers (1.7 → 1, errors on scientific notation) +- **`float-skip`**: Skip float values during parsing (continue with next token) [TODO] + +## Configuration Examples + +### Full Featured (Default) +```toml +[dependencies] +stax = { path = "../stax", features = ["int64", "float"] } +``` +- 64-bit integers, full float support +- Best for desktop/server applications + +### Embedded Friendly +```toml +[dependencies] +stax = { path = "../stax", features = ["int32", "float-error"] } +``` +- 32-bit integers (no 64-bit math) +- Error on floats (fail fast) +- Minimal code size for embedded systems + +### Embedded with Float Tolerance +```toml +[dependencies] +stax = { path = "../stax", features = ["int32", "float-truncate"] } +``` +- 32-bit integers +- Truncate simple decimals to integers (1.7 → 1) +- Error on scientific notation (avoids float math) + +### Legacy Float Disabled +```toml +[dependencies] +stax = { path = "../stax", features = ["int64"] } +``` +- 64-bit integers +- Floats return `FloatDisabled` with raw string preserved +- Manual parsing available via `JsonNumber::parse()` + +## API Usage + +All configurations preserve the exact raw string while providing different parsed representations: + +```rust 
+match event { + Event::Number(num) => { + // Raw string always available (exact precision) + println!("Raw: {}", num.as_str()); + + // Parsed value depends on configuration + match num.parsed { + NumberResult::Integer(i) => println!("Integer: {}", i), + NumberResult::IntegerOverflow => println!("Overflow: {}", num.as_str()), + + #[cfg(feature = "float")] + NumberResult::Float(f) => println!("Float: {}", f), + + #[cfg(all(not(feature = "float"), feature = "float-truncate"))] + NumberResult::FloatTruncated(i) => println!("Truncated: {}", i), + + #[cfg(not(feature = "float"))] + NumberResult::FloatDisabled => { + // Manual parsing still available + let manual: f64 = num.parse().unwrap(); + } + } + + // Convenience methods adapt to configuration + if let Some(int_val) = num.as_int() { + println!("As configured int: {}", int_val); + } + } +} +``` + +## Testing Different Configurations + +Run the demo with different configurations. The truncate mode shows both error and success paths: + +```bash +# Basic no-float (raw strings preserved) +cargo run --example no_float_demo --no-default-features + +# Embedded-friendly with error on floats +cargo run --example no_float_demo --features int32,float-error + +# Embedded with float truncation (demonstrates both error and success scenarios) +cargo run --example no_float_demo --features int32,float-truncate + +# Full featured +cargo run --example no_float_demo --features int64,float +``` + +**Note**: The `float-truncate` configuration demonstrates both successful truncation (with simple decimals) and error handling (with scientific notation) by testing two different JSON inputs. 
+ +## Scientific Notation Handling + +Different configurations handle scientific notation (`1e3`, `2.5e-1`, `1.23e+2`) differently: + +| Configuration | Behavior | Rationale | +|---------------|----------|-----------| +| `float` enabled | Full evaluation: `1e3` → 1000.0 | Complete f64 math available | +| `float-error` | Error: `FloatNotAllowed` | Fail fast on any float syntax | +| `float-truncate` | Error: `InvalidNumber` | Avoid float math entirely | +| Default (disabled) | Raw string: `"1e3"` preserved | Manual parsing available | + +**Why truncate mode errors on scientific notation?** +Properly evaluating `1e3` to `1000` requires floating-point arithmetic, which defeats the purpose of embedded no-float configurations. The truncate mode is designed for simple cases like `1.7` → `1` where no exponentiation is needed. + +## Benefits + +- **Zero runtime overhead**: Behavior configured at compile time +- **Exact precision**: Raw strings always preserved +- **Embedded friendly**: Avoid 64-bit math and float routines when not needed +- **Flexible**: Choose the right tradeoffs for your use case +- **no_std compatible**: No heap allocations +- **Fail fast**: Error configurations catch incompatible data early diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..4ce6f5a --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,3 @@ +[workspace] +resolver = "2" +members = [ "stax","tokenizer", "demos"] diff --git a/DESIGN.md b/DESIGN.md new file mode 100644 index 0000000..3596ef9 --- /dev/null +++ b/DESIGN.md @@ -0,0 +1,170 @@ + +--- + +# Stax Parser Design Notes + +## 1. Goals and Philosophy + +This document outlines the design for the `stax` crate, a high-level, allocation-free JSON pull-parser. + +The primary philosophy is to build upon the lean, compact, and low-level `ujson` tokenizer to provide an ergonomic and highly efficient API for consumers. 
+ +The core design goals are: +- **Zero Heap Allocations**: The parser must not perform any heap allocations during its operation. All memory will be provided by the caller. +- **Ergonomic API**: The parser should be easy to use and feel idiomatic to a Rust developer. +- **Correctness**: The parser must correctly handle all aspects of the JSON spec, including complex string escapes. +- **Footprint**: As minimal resource footprint as possible. This may come at the cost of execution speed. + +## 2. Core API Design: The `Iterator` Trait + +To provide the most idiomatic API, `PullParser` will implement the standard `Iterator` trait. This allows consumers to process JSON events using a simple `for` loop, integrating seamlessly with the rest of the Rust ecosystem. + +```rust +// The user-facing API will be clean and simple: +let mut scratch = [0; 1024]; +let parser = PullParser::new_with_buffer(json_input, &mut scratch); + +for event_result in parser { + let event = event_result?; + // ... process event +} +``` + +The iterator's item will be a `Result` to allow for robust error handling. + +```rust +impl<'a, 'b> Iterator for PullParser<'a, 'b> { + type Item = Result, ParseError>; + + fn next(&mut self) -> Option { + // ... parsing logic ... + } +} +``` + +## 3. Memory Management: External Scratch Buffer + +To achieve the zero-allocation goal while still handling complex cases like string un-escaping, the parser will not manage its own memory. Instead, the caller must provide a temporary "scratch" buffer during instantiation. + +This design was chosen over an internal, fixed-size buffer to avoid complex lifetime issues with the borrow checker and to give the user full control over the memory's size and location (stack, static arena, etc.). + +The parser's constructor will have the following signature: + +```rust +impl<'a, 'b> PullParser<'a, 'b> { + /// Creates a new parser for the given JSON input. 
+ /// + /// - `input`: A string slice containing the JSON data to be parsed. + /// - `scratch_buffer`: A mutable byte slice for temporary operations, + /// like string un-escaping. + pub fn new(input: &'a str, scratch_buffer: &'b mut [u8]) -> Self { + // ... + } +} +``` + +The `'a` lifetime is tied to the input data, while `'b` is tied to the scratch buffer. + +## 4. Handling String Values: The `String` Enum + +To handle string values efficiently, we will use a custom "Copy-on-Write"-like enum called `String`. This avoids allocations by returning either a view into the original input or a view into the scratch buffer. + +```rust +/// Represents a JSON string. +/// 'a is the lifetime of the original input buffer. +/// 'b is the lifetime of the scratch buffer. +#[derive(Debug, PartialEq, Eq)] +pub enum String<'a, 'b> { + /// A raw slice from the original input, used when no un-escaping is needed. + Borrowed(&'a str), + /// A slice from the scratch buffer, used when a string had to be un-escaped. + Unescaped(&'b str), +} +``` + +This enum will implement `Deref` so it can be used almost exactly like a standard `&str`, providing excellent ergonomics. + +## 5. String Parsing Strategy: "Copy-on-Escape" + +To minimize overhead, the parser will adopt a lazy "copy-on-escape" strategy for strings and keys. This optimizes for the most common case where strings do not contain any escape sequences. + +The algorithm is as follows: + +1. **Optimistic Fast Path**: When a string token begins, the parser assumes no escapes will be found. It does not perform any copying. If the end of the string is reached without encountering a `\` character, it returns a `String::Borrowed` variant containing a slice of the original input. This is a zero-copy operation. + +2. **Triggered Slow Path**: If a `\` character *is* encountered while scanning the string: + a. The parser immediately switches to "unescaping mode". + b. 
It performs a one-time copy of the string prefix (all characters from the start of the string up to the `\`) into the provided scratch buffer. + c. It continues processing the rest of the string, un-escaping sequences and writing the processed characters directly into the scratch buffer. + d. When the end of the string is reached, it returns a `String::Unescaped` variant containing a slice of the now-populated scratch buffer. + +This ensures that work is only done when absolutely necessary. + +## 6. Final Data Structures + +Here is a summary of the core public-facing data structures. + +```rust +// The main parser struct +pub struct PullParser<'a, 'b> { /* ... private fields ... */ } + +// The custom "Cow-like" string type +#[derive(Debug, PartialEq, Eq)] +pub enum String<'a, 'b> { + Borrowed(&'a str), + Unescaped(&'b str), +} + +// The events yielded by the iterator +#[derive(Debug, PartialEq)] +pub enum Event<'a, 'b> { + StartObject, + EndObject, + StartArray, + EndArray, + Key(String<'a, 'b>), + String(String<'a, 'b>), + Number(f64), // Assuming f64 for now + Bool(bool), + Null, +} + +// The comprehensive error type +#[derive(Debug, PartialEq)] +pub enum ParseError { + /// An error bubbled up from the underlying tokenizer. + Tokenizer(ujson::Error), + /// The provided scratch buffer was not large enough for an operation. + ScratchBufferFull, + /// A string slice was not valid UTF-8. + InvalidUtf8(core::str::Utf8Error), + /// A number string could not be parsed. + InvalidNumber(core::num::ParseFloatError), + /// The parser entered an unexpected internal state. + UnexpectedState(&'static str), +} +``` + +## 6. Dealing with non-slice input + +IMPORTANT!!! + +More: In addition of taking just slice [u8] as input, we should accept an `impl Reader` of some sort. 
+So that the input can come no-copy from any source with low buffering + +Note std::io has Read trait, but unfortunately that's not available in core::, so probably have to +make our own, and auto-implement it for arrays and slices or for anything that looks like AsRef<[u8]> + +## 7. TODO: Working with returned values + +String values in stax now have Deref, AsRef and Format support, so using them in default examples +with things like println! is convenient and easy. + +Same should be done with Number, but it's a little more tricky to design, given the configuration +variability + +## 8. TODO: Add direct defmt support for user API + +For any user of the Stax parser with defmt:: enabled, all the formatting should do sensible +default things. Most tricky is number formatting. The objective is to have clean, ergonomic, readable +examples diff --git a/README.md b/README.md index eddf566..2cc4c82 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,12 @@ # picojson-rs -A minimal Rust JSON parser + +A minimal Rust JSON parser for resource constrained environments. 
+ +- Pull style parsers from byte slices or Reader interface - e.g. streaming +- No recursion +- No allocations +- No required dependencies +- User-configured max parsing tree depth +- Configuration of int32 / int64 support +- Configuration and disabling of float support +- no_std by default diff --git a/TODO.md b/TODO.md new file mode 100644 index 0000000..849e624 --- /dev/null +++ b/TODO.md @@ -0,0 +1,13 @@ +## TODO list +- API cleanup, rename things +- Constify what's possible +- Remove .unwrap()'s +- Dependency cleanup +- Clippy cleanup +- Put all shippable features in one crate ( tokenizer, pull + push parsers ) +- Clean up reference docs +- Provide user guide docs +- Direct defmt support +- Stack size benchmarks +- Code size benchmarks +- Sax-style push parser diff --git a/demos/Cargo.toml b/demos/Cargo.toml new file mode 100644 index 0000000..f53fccc --- /dev/null +++ b/demos/Cargo.toml @@ -0,0 +1,4 @@ +[package] +name = "demos" +version = "0.0.1" +edition = "2021" diff --git a/demos/src/lib.rs b/demos/src/lib.rs new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/demos/src/lib.rs @@ -0,0 +1 @@ + diff --git a/stax/Cargo.toml b/stax/Cargo.toml new file mode 100644 index 0000000..b2d5593 --- /dev/null +++ b/stax/Cargo.toml @@ -0,0 +1,29 @@ +[package] +name = "stax" +version = "0.1.0" +edition = "2021" + +[features] +default = ["int64", "float"] # Default to full support: 64-bit integers and floating point +float = [] # Enable f64 parsing support + +# Integer width options (mutually exclusive) +int32 = [] # Use i32 for integers (embedded-friendly) +int64 = [] # Use i64 for integers (full range) + +# Float behavior when float feature is disabled (mutually exclusive) +float-skip = [] # Skip float values during parsing (continue with next token) +float-error = [] # Error when encountering floats +float-truncate = [] # Truncate floats to integers (1.7 → 1) +defmt = ["dep:defmt"] + +[dependencies] +defmt = { version = "1.0.1", optional = true } +# TODO: 
Optional, should be swappable with defmt +log = "0.4.26" +ujson = { path= "../tokenizer" } + +[dev-dependencies] +test-log = "0.2" +env_logger = "0.11.3" +test-env-log = "0.2.8" diff --git a/stax/README.md b/stax/README.md new file mode 100644 index 0000000..4137104 --- /dev/null +++ b/stax/README.md @@ -0,0 +1,59 @@ +### no_std Rust pull parser + +This crate is designed for no_std environment JSON pull parsing. + +Note: For "document" style parsing where all or most of the document is fully +built in memory, please use serde-json with no_std. + +However - pull parsing is useful when you need to process large streams within +constrained memory, without building the entire document, and just picking +elements from the dataset that the application needs. + +Example usage: +```rust +use stax::{PullParser, Event, String}; + +// Simple usage (no string escapes expected) +let json = r#"{"switch": 1}"#; +let parser = PullParser::new(json); +for event in parser { + match event? { + Event::Key(String::Borrowed(key)) => { + println!("Key: '{}'", key); + } + Event::Number(num) => { + println!("Number: {}", num.as_str()); + } + Event::EndDocument => break, + _ => {} + } +} + +// With escape support +let json = r#"{"message": "Hello\nWorld"}"#; +let mut scratch = [0u8; 1024]; +let parser = PullParser::new_with_buffer(json, &mut scratch); +// ... use parser +``` + +PullParser takes the input stream, and an optional scratch buffer +to write unescaped strings to. If the input string is known not +to contain any escapes ( like newlines or unicodes ) the buffer +is not used and strings are returned as slices over input. + +The parser also uses storage for tracking parsing state, one bit for +every nesting level. By default this is a 32-bit int, but can be changed +to arbitrary depth. 
+ +This crate has a few configuration features relevant for embedded targets: + + * int64 ( default ) - numbers are returned in int64 values + * int32 - integers are returned as int32, to avoid 64-bit math on constrained targets, e.g. Cortex-M0 + * float - full float support is included. + * float-error - Any floating point input will yield an error, to reduce float math dependency + * float-skip - Float values are skipped. + * float-truncate - float values are truncated to integers. Scientific notation will generate an error + + Please see examples/no_float_demo.rs + + By default, full float and int64 support is enabled. diff --git a/stax/examples/advanced_bitstack_demo.rs b/stax/examples/advanced_bitstack_demo.rs new file mode 100644 index 0000000..d083c75 --- /dev/null +++ b/stax/examples/advanced_bitstack_demo.rs @@ -0,0 +1,90 @@ +// Example demonstrating configurable BitStack storage for different nesting depths + +use stax::{Event, PullParserFlex}; + +fn main() -> Result<(), stax::ParseError> { + println!("BitStack Configuration Examples"); + println!("==============================="); + + // Test 1: Default PullParser (uses u32 BitStack and DummyReader) + println!("1. Standard PullParser (u32 BitStack, ~32 levels max):"); + let json = r#"{"deeply": {"nested": {"object": {"with": {"data": "test"}}}}}"#; + let mut scratch = [0u8; 512]; + let mut parser = stax::PullParser::new_with_buffer(json, &mut scratch); + let mut depth = 0; + while let Some(event) = parser.next() { + match event? 
{ + Event::StartObject => { + depth += 1; + println!(" {}StartObject (depth: {})", " ".repeat(depth), depth); + } + Event::EndObject => { + println!(" {}EndObject (depth: {})", " ".repeat(depth), depth); + depth -= 1; + } + Event::Key(k) => println!(" {}Key: {:?}", " ".repeat(depth + 1), &*k), + Event::String(s) => println!(" {}String: {:?}", " ".repeat(depth + 1), &*s), + Event::EndDocument => break, + _ => {} + } + } + println!(" Maximum depth reached: {}\n", depth); + + // Test 2: u8 BitStack (8-bit depth, more memory efficient for shallow data) + println!("2. Memory-efficient PullParserFlex (u8 BitStack, ~8 levels max):"); + let json = r#"{"shallow": {"data": [1, 2, 3]}}"#; + let mut scratch = [0u8; 256]; + let mut parser: PullParserFlex = PullParserFlex::new_with_buffer(json, &mut scratch); + let mut depth = 0; + while let Some(event) = parser.next() { + match event? { + Event::StartObject => { + depth += 1; + println!(" {}StartObject (depth: {})", " ".repeat(depth), depth); + } + Event::StartArray => { + depth += 1; + println!(" {}StartArray (depth: {})", " ".repeat(depth), depth); + } + Event::EndObject => { + println!(" {}EndObject (depth: {})", " ".repeat(depth), depth); + depth -= 1; + } + Event::EndArray => { + println!(" {}EndArray (depth: {})", " ".repeat(depth), depth); + depth -= 1; + } + Event::Key(k) => println!(" {}Key: {:?}", " ".repeat(depth + 1), &*k), + Event::Number(n) => println!(" {}Number: {}", " ".repeat(depth + 1), n), + Event::EndDocument => break, + _ => {} + } + } + println!(" Maximum depth reached: {}\n", depth); + + // Test 3: u64 BitStack (64-bit depth, for very deep nesting) + println!("3. 
Deep-nesting PullParserFlex (u64 BitStack, ~64 levels max):"); + let json = r#"{"very": {"deeply": {"nested": {"structure": {"with": {"many": {"levels": {"data": "deep"}}}}}}}}"#; + let mut scratch = [0u8; 1024]; + let mut parser: PullParserFlex = PullParserFlex::new_with_buffer(json, &mut scratch); + let mut depth = 0; + while let Some(event) = parser.next() { + match event? { + Event::StartObject => { + depth += 1; + println!(" {}StartObject (depth: {})", " ".repeat(depth), depth); + } + Event::EndObject => { + println!(" {}EndObject (depth: {})", " ".repeat(depth), depth); + depth -= 1; + } + Event::Key(k) => println!(" {}Key: {:?}", " ".repeat(depth + 1), &*k), + Event::String(s) => println!(" {}String: {:?}", " ".repeat(depth + 1), &*s), + Event::EndDocument => break, + _ => {} + } + } + println!(" Maximum depth reached: {}", depth); + + Ok(()) +} diff --git a/stax/examples/array_bitstack_demo.rs b/stax/examples/array_bitstack_demo.rs new file mode 100644 index 0000000..e048570 --- /dev/null +++ b/stax/examples/array_bitstack_demo.rs @@ -0,0 +1,259 @@ +// Example demonstrating ArrayBitStack for large nesting depths + +use stax::{Event, PullParserFlex}; +use ujson::bitstack::ArrayBitStack; + +fn main() -> Result<(), stax::ParseError> { + println!("=== ArrayBitStack Demo ===\n"); + + // Generate deeply nested JSON with mixed objects and arrays (70+ levels) + let deep_json = generate_deep_mixed_json(65); + println!("1. ArrayBitStack<3, u32> (96-bit depth) - Mixed {{}} and [] nesting to depth ~65:"); + println!( + " Generated JSON (first 100 chars): {}", + &deep_json[..deep_json.len().min(100)] + ); + println!(" JSON structure: obj->arr->obj->arr->... 
(alternating pattern)"); + + let mut scratch = [0u8; 2048]; + let mut parser: PullParserFlex, u16> = + PullParserFlex::new_with_buffer(&deep_json, &mut scratch); + let mut depth = 0; + let mut max_depth = 0; + + loop { + match parser.next() { + Some(Ok(event)) => match event { + Event::StartObject => { + depth += 1; + max_depth = max_depth.max(depth); + if depth <= 5 || depth % 10 == 0 { + println!( + " {}StartObject (depth: {})", + " ".repeat((depth - 1).min(3)), + depth + ); + } + } + Event::StartArray => { + depth += 1; + max_depth = max_depth.max(depth); + if depth <= 5 || depth % 10 == 0 { + println!( + " {}StartArray (depth: {})", + " ".repeat((depth - 1).min(3)), + depth + ); + } + } + Event::EndObject => { + if depth <= 5 || depth % 10 == 0 { + println!( + " {}EndObject (depth: {})", + " ".repeat((depth - 1).min(3)), + depth + ); + } + depth -= 1; + } + Event::EndArray => { + if depth <= 5 || depth % 10 == 0 { + println!( + " {}EndArray (depth: {})", + " ".repeat((depth - 1).min(3)), + depth + ); + } + depth -= 1; + } + Event::Key(key) => { + if depth <= 5 { + println!(" {}Key: '{}'", " ".repeat(depth.min(3)), key); + } + } + Event::String(s) => { + println!( + " {}String: '{}' (at max depth: {})", + " ".repeat(depth.min(3)), + s, + depth + ); + } + Event::Number(num) => { + println!( + " {}Number: {} (at max depth: {})", + " ".repeat(depth.min(3)), + num, + depth + ); + } + Event::EndDocument => break, + _ => {} + }, + Some(Err(_)) => { + println!( + " ! Parse error encountered at depth {}, continuing...", + depth + ); + break; + } + None => break, + } + } + println!( + " ✅ Successfully parsed {} levels of mixed nesting!\n", + max_depth + ); + + // Test ArrayBitStack with smaller elements for memory efficiency + println!("2. 
ArrayBitStack<8, u8> (64-bit depth tracking) - Complex nested structure:"); + let complex_json = generate_complex_nested_json(25); + println!(" JSON structure: Objects with arrays containing objects with data"); + + let mut scratch = [0u8; 1024]; + let mut parser: PullParserFlex, u8> = + PullParserFlex::new_with_buffer(&complex_json, &mut scratch); + let mut depth = 0; + let mut max_depth = 0; + + while let Some(event) = parser.next() { + match event? { + Event::StartArray => { + depth += 1; + max_depth = max_depth.max(depth); + if depth <= 8 { + println!(" {}StartArray (depth: {})", " ".repeat(depth), depth); + } + } + Event::StartObject => { + depth += 1; + max_depth = max_depth.max(depth); + if depth <= 8 { + println!(" {}StartObject (depth: {})", " ".repeat(depth), depth); + } + } + Event::EndArray => { + if depth <= 8 { + println!(" {}EndArray (depth: {})", " ".repeat(depth), depth); + } + depth -= 1; + } + Event::EndObject => { + if depth <= 8 { + println!(" {}EndObject (depth: {})", " ".repeat(depth), depth); + } + depth -= 1; + } + Event::Key(key) => { + if depth <= 8 { + println!(" {}Key: '{}'", " ".repeat(depth), key); + } + } + Event::Number(num) => { + if depth <= 8 { + println!(" {}Number: {}", " ".repeat(depth), num); + } + } + Event::String(s) => { + if depth <= 8 { + println!(" {}String: '{}'", " ".repeat(depth), s); + } + } + Event::EndDocument => break, + _ => {} + } + } + println!( + " ✅ Successfully parsed {} levels of complex nesting!\n", + max_depth + ); + + println!("✅ ArrayBitStack configurations working!"); + println!(); + + println!("ArrayBitStack Summary:"); + println!("• ArrayBitStack<3, u32>: 96-bit depth (3 × 32 bits)"); + println!("• ArrayBitStack<8, u8>: 64-bit depth (8 × 8 bits) - memory efficient"); + println!("• ArrayBitStack<16, u32>: 512-bit depth (16 × 32 bits) - ultra deep"); + println!("• Configurable element type (u8, u16, u32, u64) and array size"); + + Ok(()) +} + +/// Generate deeply nested JSON with alternating objects 
and arrays +/// Pattern: {"level0": [{"level2": [{"level4": ... "data"}]}]} +fn generate_deep_mixed_json(depth: usize) -> String { + let mut json = String::new(); + + // Opening structures (alternating object/array) + for i in 0..depth { + if i % 2 == 0 { + // Object level + json.push_str(&format!(r#"{{"level{}":"#, i)); + } else { + // Array level + json.push('['); + } + } + + // Core data at the deepest level + json.push_str(r#""reached_the_deep_end""#); + + // Closing structures (reverse order) + for i in (0..depth).rev() { + if i % 2 == 0 { + // Close object + json.push('}'); + } else { + // Close array + json.push(']'); + } + } + + json +} + +/// Generate complex nested JSON with realistic structure +/// Pattern: [{"data": [{"data": [{"value": 123}]}]}] +fn generate_complex_nested_json(depth: usize) -> String { + let mut json = String::new(); + + // Start with array + json.push('['); + + for i in 0..depth { + if i % 3 == 0 { + // Object with "data" key + json.push_str(r#"{"data":"#); + } else if i % 3 == 1 { + // Array + json.push('['); + } else { + // Object with "nested" key + json.push_str(r#"{"nested":"#); + } + } + + // Core data + json.push_str(&format!( + r#"{{"value": {}, "msg": "depth_{}_reached"}}"#, + depth * 42, + depth + )); + + // Close all structures + for i in (0..depth).rev() { + if i % 3 == 0 || i % 3 == 2 { + // Close object + json.push('}'); + } else { + // Close array + json.push(']'); + } + } + + // Close initial array + json.push(']'); + + json +} diff --git a/stax/examples/direct_parser_demo.rs b/stax/examples/direct_parser_demo.rs new file mode 100644 index 0000000..9567979 --- /dev/null +++ b/stax/examples/direct_parser_demo.rs @@ -0,0 +1,130 @@ +// Example demonstrating DirectParser with a Reader over a fixed-size array + +use stax::{DirectParser, Event, Reader}; + +/// Simple Reader implementation that reads from a fixed-size byte array +/// This simulates reading from a stream, network socket, or any other byte source +struct 
ArrayReader<'a> { + data: &'a [u8], + position: usize, + chunk_size: usize, // Simulate streaming by reading in chunks +} + +impl<'a> ArrayReader<'a> { + /// Create a new ArrayReader from a byte slice + /// chunk_size controls how many bytes are read at once (simulates network packets) + fn new(data: &'a [u8], chunk_size: usize) -> Self { + Self { + data, + position: 0, + chunk_size, + } + } +} + +impl<'a> Reader for ArrayReader<'a> { + type Error = std::io::Error; + + fn read(&mut self, buf: &mut [u8]) -> Result { + let remaining = self.data.len().saturating_sub(self.position); + if remaining == 0 { + return Ok(0); // EOF + } + + // Read at most chunk_size bytes to simulate streaming behavior + let to_read = remaining.min(buf.len()).min(self.chunk_size); + let end_pos = self.position + to_read; + + buf[..to_read].copy_from_slice(&self.data[self.position..end_pos]); + self.position = end_pos; + + println!( + " 📖 Reader: read {} bytes (pos: {}/{})", + to_read, + self.position, + self.data.len() + ); + Ok(to_read) + } +} + +fn main() -> Result<(), stax::ParseError> { + println!("🚀 DirectParser Demo with ArrayReader"); + println!("====================================="); + + // Test JSON with various data types including escape sequences + let json = br#"{"name": "hello\nworld", "items": [1, 2.5, true, null], "count": 42}"#; + + println!("📄 Input JSON: {}", std::str::from_utf8(json).unwrap()); + println!("📏 Total size: {} bytes", json.len()); + println!(); + + // Create ArrayReader that reads in small chunks (simulates network streaming) + let reader = ArrayReader::new(json, 8); // Read 8 bytes at a time + + // Create DirectParser with a reasonably sized buffer + let mut buffer = [0u8; 256]; + let buffer_size = buffer.len(); + let mut parser: DirectParser = DirectParser::new(reader, &mut buffer); + + println!("🔄 Starting DirectParser with streaming ArrayReader:"); + println!(" Buffer size: {} bytes", buffer_size); + println!(" Chunk size: 8 bytes (simulates small 
network packets)"); + println!(); + + let mut event_count = 0; + loop { + match parser.next_event() { + Ok(event) => { + event_count += 1; + match event { + Event::StartObject => println!(" 🏁 StartObject"), + Event::EndObject => println!(" 🏁 EndObject"), + Event::StartArray => println!(" 📋 StartArray"), + Event::EndArray => println!(" 📋 EndArray"), + Event::Key(key) => { + println!(" 🔑 Key: '{}'", key.as_str()); + } + Event::String(s) => { + println!(" 📝 String: '{}'", s.as_str()); + } + Event::Number(num) => { + println!(" 🔢 Number: {}", num); + } + Event::Bool(b) => { + println!(" ✅ Bool: {}", b); + } + Event::Null => { + println!(" ⭕ Null"); + } + Event::EndDocument => { + println!(" 🏁 EndDocument"); + break; + } + } + } + Err(e) => { + println!("❌ Parse error: {:?}", e); + return Err(e); + } + } + } + + println!(); + println!( + "✅ Successfully parsed {} events with DirectParser!", + event_count + ); + println!("💡 Notice how the Reader was called multiple times in small chunks,"); + println!(" demonstrating true streaming behavior with a fixed-size buffer."); + + // Show buffer statistics + let stats = parser.buffer_stats(); + println!(); + println!("📊 Final buffer statistics:"); + println!(" Total capacity: {} bytes", stats.total_capacity); + println!(" Data processed: {} bytes", stats.data_end); + println!(" Remaining: {} bytes", stats.remaining_bytes); + + Ok(()) +} diff --git a/stax/examples/no_float_demo.rs b/stax/examples/no_float_demo.rs new file mode 100644 index 0000000..e88ef32 --- /dev/null +++ b/stax/examples/no_float_demo.rs @@ -0,0 +1,170 @@ +// Example demonstrating configurable number handling for embedded targets +// Shows both successful parsing and error scenarios based on input data. 
+// +// Try different configurations: +// cargo run --example no_float_demo --no-default-features # Basic no-float +// cargo run --example no_float_demo --features int32 # 32-bit integers +// cargo run --example no_float_demo --features int32,float-truncate # Truncate floats (shows both error and success paths) +// cargo run --example no_float_demo --features int32,float-error # Error on floats (embedded-friendly) +// cargo run --example no_float_demo --features float # Full float support + +use stax::{Event, NumberResult, PullParser, String}; + +fn main() { + // Full JSON with scientific notation + let json_full = r#"{"integers": [1, 2, 3], "floats": [1.5, 2.7, 3.14], "scientific": [1e3, 2.5e-1, 1.23e+2], "mixed": [42, 1.618, 100]}"#; + + // Limited JSON without scientific notation (for truncate mode demonstration) + let json_limited = + r#"{"integers": [1, 2, 3], "floats": [1.5, 2.7, 3.14], "mixed": [42, 1.618, 100]}"#; + + println!("Parsing JSON with configurable number handling:"); + + // Show configuration being used + #[cfg(feature = "int32")] + println!("Configuration: Using i32 integers (embedded-friendly)"); + #[cfg(not(feature = "int32"))] + println!("Configuration: Using i64 integers (full range)"); + + #[cfg(feature = "float")] + println!("Configuration: Float support enabled"); + #[cfg(all(not(feature = "float"), feature = "float-error"))] + println!("Configuration: Error on floats (fail-fast for embedded)"); + #[cfg(all(not(feature = "float"), feature = "float-truncate"))] + println!("Configuration: Truncate floats to integers"); + #[cfg(all( + not(feature = "float"), + not(any(feature = "float-error", feature = "float-truncate")) + ))] + println!("Configuration: Float support disabled (raw strings only)"); + + println!(); + + // Determine which inputs to test based on configuration + let test_cases = [ + ("Full JSON (with scientific notation)", json_full), + ("Limited JSON (no scientific notation)", json_limited), + ]; + + // For float-truncate mode, 
test both to show error and success paths + // For other modes, skip the second test if behavior would be identical + let should_test_both = cfg!(all(not(feature = "float"), feature = "float-truncate")); + + for (i, (description, json)) in test_cases.iter().enumerate() { + // Skip second test for non-truncate modes (behavior would be identical) + if i == 1 && !should_test_both { + break; + } + + println!("=== {} ===", description); + println!("Input: {}", json); + println!(); + + let mut scratch = [0u8; 1024]; + let mut parser = PullParser::new_with_buffer(json, &mut scratch); + + parse_and_display(&mut parser); + + if i == 0 && should_test_both { + println!("\n--- Now testing without scientific notation ---\n"); + } + } + + print_summary(); +} + +fn parse_and_display(parser: &mut PullParser) { + loop { + match parser.next_event() { + Ok(Event::Number(num)) => { + println!("Number: raw='{}', parsed={:?}", num.as_str(), num.parsed()); + + // Show behavior based on configuration + match num.parsed() { + NumberResult::Integer(i) => println!(" → Integer: {}", i), + NumberResult::IntegerOverflow => { + println!(" → Integer overflow (use raw string): '{}'", num.as_str()) + } + #[cfg(feature = "float")] + NumberResult::Float(f) => println!(" → Float: {}", f), + #[cfg(all(not(feature = "float"), feature = "float-truncate"))] + NumberResult::FloatTruncated(i) => { + println!( + " → Float truncated to integer: {} (from '{}')", + i, + num.as_str() + ) + } + #[cfg(feature = "float-skip")] + NumberResult::FloatSkipped => { + println!(" → Float skipped (use raw string): '{}'", num.as_str()) + } + #[cfg(not(feature = "float"))] + NumberResult::FloatDisabled => { + println!( + " → Float disabled - raw string available: '{}'", + num.as_str() + ); + + // User could still parse manually if needed: + if let Ok(f) = num.as_str().parse::() { + println!(" → Manual parse as f64: {}", f); + } + } + } + } + Ok(Event::Key(String::Borrowed(key))) => { + println!("Key: '{}'", key); + } + 
Ok(Event::StartObject) => println!("StartObject"), + Ok(Event::EndObject) => println!("EndObject"), + Ok(Event::StartArray) => println!("StartArray"), + Ok(Event::EndArray) => println!("EndArray"), + Ok(Event::EndDocument) => { + println!("EndDocument"); + break; + } + Ok(other) => println!("Other event: {:?}", other), + Err(e) => { + println!("Error: {:?}", e); + break; + } + } + } +} + +fn print_summary() { + println!("\n=== Summary ==="); + #[cfg(feature = "int32")] + println!("- Using i32 integers (no 64-bit math routines needed)"); + #[cfg(not(feature = "int32"))] + println!("- Using i64 integers (full range)"); + + #[cfg(feature = "float")] + println!("- Float support enabled (f64 parsing)"); + #[cfg(all(not(feature = "float"), feature = "float-error"))] + println!("- Error on floats (embedded fail-fast behavior)"); + #[cfg(all(not(feature = "float"), feature = "float-truncate"))] + println!("- Truncate floats to integers (simple decimals only, errors on scientific notation)"); + #[cfg(all( + not(feature = "float"), + not(any(feature = "float-error", feature = "float-truncate")) + ))] + println!("- Floats disabled (raw strings preserved for manual parsing)"); + + println!("- Raw strings always preserved for exact precision"); + println!("- Zero heap allocations (no_std compatible)"); + + println!("\nScientific notation handling:"); + #[cfg(feature = "float")] + println!("- 1e3 = 1000, 2.5e-1 = 0.25, 1.23e+2 = 123 (full evaluation)"); + #[cfg(all(not(feature = "float"), feature = "float-error"))] + println!("- All floats including scientific notation trigger FloatNotAllowed error"); + #[cfg(all(not(feature = "float"), feature = "float-truncate"))] + println!("- Scientific notation triggers InvalidNumber error (would require float math)"); + #[cfg(all( + not(feature = "float"), + not(any(feature = "float-error", feature = "float-truncate")) + ))] + println!("- Raw strings preserved: '1e3', '2.5e-1', '1.23e+2' (manual parsing available)"); +} diff --git 
a/stax/examples/simple_api_demo.rs b/stax/examples/simple_api_demo.rs new file mode 100644 index 0000000..a269198 --- /dev/null +++ b/stax/examples/simple_api_demo.rs @@ -0,0 +1,40 @@ +// Example demonstrating the simple new API + +use stax::{Event, PullParser}; + +fn main() -> Result<(), stax::ParseError> { + // Test the new simple API + let json = r#"{"name": "value", "number": 42, "flag": true}"#; + let mut parser = PullParser::new(json); + println!("Using PullParser::new() - simple API:"); + println!("Input: {}", json); + + while let Some(event) = parser.next() { + match event? { + Event::StartObject => println!("StartObject"), + Event::EndObject => println!("EndObject"), + Event::Key(key) => { + println!("Key: '{}'", key); + } + Event::String(s) => { + println!("String: '{}'", s); + } + Event::Number(num) => { + // Now with ergonomic Display trait - shows parsed value when available, raw string otherwise + println!("Number: {}", num); + } + Event::Bool(b) => { + println!("Bool: {}", b); + } + Event::EndDocument => { + println!("EndDocument"); + break; + } + other => println!("Other: {:?}", other), + } + } + + println!(); + println!("✅ Successfully parsed with simple API!"); + Ok(()) +} diff --git a/stax/src/config_check.rs b/stax/src/config_check.rs new file mode 100644 index 0000000..db64dd4 --- /dev/null +++ b/stax/src/config_check.rs @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! Compile-time configuration validation +//! +//! This module contains compile-time checks to ensure that mutually exclusive +//! features are not enabled simultaneously. 
+ +// Compile-time checks for mutually exclusive integer width features +#[cfg(all(feature = "int32", feature = "int64"))] +compile_error!( + "Cannot enable both 'int32' and 'int64' features simultaneously: choose one integer width" +); + +// Compile-time checks for mutually exclusive float behavior features +#[cfg(all(feature = "float-skip", feature = "float-error"))] +compile_error!("Cannot enable both 'float-skip' and 'float-error' features simultaneously"); + +#[cfg(all(feature = "float-skip", feature = "float-truncate"))] +compile_error!("Cannot enable both 'float-skip' and 'float-truncate' features simultaneously"); + +#[cfg(all(feature = "float-error", feature = "float-truncate"))] +compile_error!("Cannot enable both 'float-error' and 'float-truncate' features simultaneously"); + +#[cfg(all( + feature = "float-skip", + feature = "float-error", + feature = "float-truncate" +))] +compile_error!("Cannot enable multiple float behavior features: choose only one of 'float-skip', 'float-error', or 'float-truncate'"); + +// Compile-time checks to prevent 'float' feature conflicts with float-behavior features +#[cfg(all(feature = "float", feature = "float-skip"))] +compile_error!("Cannot enable both 'float' and 'float-skip' features: 'float-skip' is only for when float parsing is disabled"); + +#[cfg(all(feature = "float", feature = "float-error"))] +compile_error!("Cannot enable both 'float' and 'float-error' features: 'float-error' is only for when float parsing is disabled"); + +#[cfg(all(feature = "float", feature = "float-truncate"))] +compile_error!("Cannot enable both 'float' and 'float-truncate' features: 'float-truncate' is only for when float parsing is disabled"); diff --git a/stax/src/copy_on_escape.rs b/stax/src/copy_on_escape.rs new file mode 100644 index 0000000..5e964ce --- /dev/null +++ b/stax/src/copy_on_escape.rs @@ -0,0 +1,339 @@ +// SPDX-License-Identifier: Apache-2.0 + +use crate::{ParseError, String}; + +/// A struct that encapsulates 
copy-on-escape string processing with full buffer ownership. +/// +/// This version owns the scratch buffer for the entire parser lifetime, eliminating +/// borrow checker issues. The buffer is reused across multiple string operations +/// via reset() calls. +pub struct CopyOnEscape<'a, 'b> { + /// Reference to the input data being parsed + input: &'a [u8], + /// Owned mutable reference to the scratch buffer for unescaping + scratch: &'b mut [u8], + /// Global position in the scratch buffer (never resets) + global_scratch_pos: usize, + + // Current string processing state (resets per string) + /// Where the current string started in the input + string_start: usize, + /// Position in input where we last copied from (for span copying) + last_copied_pos: usize, + /// Whether we've encountered any escapes (and thus are using scratch buffer) + using_scratch: bool, + /// Starting position in scratch buffer for this string + scratch_start: usize, + /// Current position in scratch buffer for this string + scratch_pos: usize, +} + +impl<'a, 'b> CopyOnEscape<'a, 'b> { + /// Creates a new CopyOnEscape processor with full buffer ownership. + /// + /// # Arguments + /// * `input` - The input byte slice being parsed + /// * `scratch` - Mutable scratch buffer for escape processing (owned for parser lifetime) + pub fn new(input: &'a [u8], scratch: &'b mut [u8]) -> Self { + Self { + input, + scratch, + global_scratch_pos: 0, + string_start: 0, + last_copied_pos: 0, + using_scratch: false, + scratch_start: 0, + scratch_pos: 0, + } + } + + /// Resets the processor for a new string at the given position. + /// The scratch buffer position continues from where previous strings left off. 
+ /// + /// # Arguments + /// * `pos` - Position in input where the string content starts + pub fn begin_string(&mut self, pos: usize) { + self.string_start = pos; + self.last_copied_pos = pos; + self.using_scratch = false; // Start with zero-copy optimization + self.scratch_start = self.global_scratch_pos; + self.scratch_pos = self.global_scratch_pos; + } + + /// Copies a span from last_copied_pos to end position with bounds checking. + /// + /// # Arguments + /// * `end` - End position in input (exclusive) + /// * `extra_space` - Additional space needed beyond the span (e.g., for escape character) + fn copy_span_to_scratch(&mut self, end: usize, extra_space: usize) -> Result<(), ParseError> { + if end > self.last_copied_pos { + let span = &self.input[self.last_copied_pos..end]; + if self.scratch_pos + span.len() + extra_space > self.scratch.len() { + return Err(ParseError::ScratchBufferFull); + } + self.scratch[self.scratch_pos..self.scratch_pos + span.len()].copy_from_slice(span); + self.scratch_pos += span.len(); + } + Ok(()) + } + + /// Handles an escape sequence at the given position. + /// + /// This triggers copy-on-escape if this is the first escape encountered. + /// For subsequent escapes, it continues the unescaping process. 
+ /// + /// # Arguments + /// * `pos` - Current position in input (pointing just after the escape sequence) + /// * `unescaped_char` - The unescaped character to write to scratch buffer + pub fn handle_escape(&mut self, pos: usize, unescaped_char: u8) -> Result<(), ParseError> { + if !self.using_scratch { + // First escape found - trigger copy-on-escape + self.using_scratch = true; + } + + // Copy the span from last_copied_pos to the backslash position + // The backslash is at pos-2 (since pos points after the escape sequence) + let backslash_pos = pos.saturating_sub(2); + self.copy_span_to_scratch(backslash_pos, 1)?; + + // Write the unescaped character + if self.scratch_pos >= self.scratch.len() { + return Err(ParseError::ScratchBufferFull); + } + self.scratch[self.scratch_pos] = unescaped_char; + self.scratch_pos += 1; + + // Update last copied position to after the escape sequence + self.last_copied_pos = pos; + + Ok(()) + } + + /// Handles a Unicode escape sequence by writing the UTF-8 encoded bytes to scratch buffer. + /// + /// This triggers copy-on-escape if this is the first escape encountered. + /// Unicode escapes span 6 bytes in input (\uXXXX) but produce 1-4 bytes of UTF-8 output. 
+ /// + /// # Arguments + /// * `start_pos` - Position in input where the \uXXXX sequence starts (at the backslash) + /// * `utf8_bytes` - The UTF-8 encoded bytes to write (1-4 bytes) + pub fn handle_unicode_escape( + &mut self, + start_pos: usize, + utf8_bytes: &[u8], + ) -> Result<(), ParseError> { + if !self.using_scratch { + // First escape found - trigger copy-on-escape + self.using_scratch = true; + } + + // Copy the span from last_copied_pos to the backslash position + self.copy_span_to_scratch(start_pos, utf8_bytes.len())?; + + // Write the UTF-8 encoded bytes + if self.scratch_pos + utf8_bytes.len() > self.scratch.len() { + return Err(ParseError::ScratchBufferFull); + } + self.scratch[self.scratch_pos..self.scratch_pos + utf8_bytes.len()] + .copy_from_slice(utf8_bytes); + self.scratch_pos += utf8_bytes.len(); + + // Update last copied position to after the 6-byte Unicode escape sequence + self.last_copied_pos = start_pos + 6; // \uXXXX is always 6 bytes + + Ok(()) + } + + /// Completes string processing and returns the final String. + /// Updates the global scratch position for the next string. + /// + /// # Arguments + /// * `pos` - Position in input where the string ends + /// + /// # Returns + /// The final String (either borrowed or unescaped) + pub fn end_string(&mut self, pos: usize) -> Result { + if self.using_scratch { + // Copy final span from last_copied_pos to end + self.copy_span_to_scratch(pos, 0)?; + // Update global position for next string + self.global_scratch_pos = self.scratch_pos; + + // Return unescaped string from scratch buffer + let unescaped_slice = &self.scratch[self.scratch_start..self.scratch_pos]; + let unescaped_str = + core::str::from_utf8(unescaped_slice).map_err(ParseError::InvalidUtf8)?; + Ok(String::Unescaped(unescaped_str)) + } else { + // No escapes found - return borrowed slice (zero-copy!) 
+ let borrowed_bytes = &self.input[self.string_start..pos]; + let borrowed_str = + core::str::from_utf8(borrowed_bytes).map_err(ParseError::InvalidUtf8)?; + Ok(String::Borrowed(borrowed_str)) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_coe2_no_escapes() { + let input = b"hello world"; + let mut scratch = [0u8; 100]; + let mut processor = CopyOnEscape::new(input, &mut scratch); + + processor.begin_string(0); + let result = processor.end_string(input.len()).unwrap(); + + // Should return borrowed (zero-copy) + assert!(matches!(result, String::Borrowed("hello world"))); + } + + #[test] + fn test_coe2_with_escapes() { + let input = b"hello\\nworld"; + let mut scratch = [0u8; 100]; + let mut processor = CopyOnEscape::new(input, &mut scratch); + + processor.begin_string(0); + processor.handle_escape(7, b'\n').unwrap(); // Position after "hello\n" + let result = processor.end_string(input.len()).unwrap(); + + // Should return unescaped + assert!(matches!(result, String::Unescaped(s) if s == "hello\nworld")); + } + + #[test] + fn test_coe2_multiple_strings() { + let mut scratch = [0u8; 100]; + let mut processor = CopyOnEscape::new(b"dummy", &mut scratch); + + // First string with escapes + let input1 = b"first\\tstring"; + processor.input = input1; + processor.begin_string(0); + processor.handle_escape(7, b'\t').unwrap(); // After "first\t" + let result1 = processor.end_string(input1.len()).unwrap(); + + assert!(matches!(result1, String::Unescaped(s) if s == "first\tstring")); + + // Second string without escapes + let input2 = b"second string"; + processor.input = input2; + processor.begin_string(0); + let result2 = processor.end_string(input2.len()).unwrap(); + + // Should be borrowed (no scratch used) + assert!(matches!(result2, String::Borrowed("second string"))); + + // Third string with escapes + let input3 = b"third\\nstring"; + processor.input = input3; + processor.begin_string(0); + processor.handle_escape(7, b'\n').unwrap(); + 
let result3 = processor.end_string(input3.len()).unwrap(); + + assert!(matches!(result3, String::Unescaped(s) if s == "third\nstring")); + } + + #[test] + fn test_coe2_multiple_escapes() { + let input = b"a\\nb\\tc"; + let mut scratch = [0u8; 100]; + let mut processor = CopyOnEscape::new(input, &mut scratch); + + processor.begin_string(0); + processor.handle_escape(3, b'\n').unwrap(); // After "a\n" + processor.handle_escape(6, b'\t').unwrap(); // After "b\t" + let result = processor.end_string(input.len()).unwrap(); + + assert!(matches!(result, String::Unescaped(s) if s == "a\nb\tc")); + } + + #[test] + fn test_coe2_buffer_reuse() { + let mut scratch = [0u8; 50]; // Larger buffer + let mut processor = CopyOnEscape::new(b"dummy", &mut scratch); + + // Fill up buffer with first string + let input1 = b"long\\tstring\\nwith\\rescapes"; + processor.input = input1; + processor.begin_string(0); + processor.handle_escape(6, b'\t').unwrap(); + processor.handle_escape(14, b'\n').unwrap(); + processor.handle_escape(20, b'\r').unwrap(); + let result1 = processor.end_string(input1.len()).unwrap(); + + assert!(matches!(result1, String::Unescaped(_))); + + // Use buffer for second string (will use remaining space) + let input2 = b"new\\tstring"; + processor.input = input2; + processor.begin_string(0); + processor.handle_escape(5, b'\t').unwrap(); + let result2 = processor.end_string(input2.len()).unwrap(); + + assert!(matches!(result2, String::Unescaped(s) if s == "new\tstring")); + } + + #[test] + fn test_coe2_buffer_full() { + let input = b"very long string with escape\\n"; + let mut scratch = [0u8; 5]; // Intentionally small + let mut processor = CopyOnEscape::new(input, &mut scratch); + + processor.begin_string(0); + let result = processor.handle_escape(30, b'\n'); + + assert!(matches!(result, Err(ParseError::ScratchBufferFull))); + } + + #[test] + fn test_coe2_unicode_escape() { + let input = b"hello\\u0041world"; // \u0041 = 'A' + let mut scratch = [0u8; 100]; + let mut 
processor = CopyOnEscape::new(input, &mut scratch); + + processor.begin_string(0); + // Unicode escape: \u0041 -> UTF-8 'A' (1 byte) + let utf8_a = b"A"; + processor.handle_unicode_escape(5, utf8_a).unwrap(); // Position at backslash + let result = processor.end_string(input.len()).unwrap(); + + // Should return unescaped with 'A' substituted + assert!(matches!(result, String::Unescaped(s) if s == "helloAworld")); + } + + #[test] + fn test_coe2_unicode_escape_multibyte() { + let input = b"test\\u03B1end"; // \u03B1 = Greek alpha 'α' (2 bytes in UTF-8) + let mut scratch = [0u8; 100]; + let mut processor = CopyOnEscape::new(input, &mut scratch); + + processor.begin_string(0); + // Unicode escape: \u03B1 -> UTF-8 'α' (2 bytes: 0xCE, 0xB1) + let utf8_alpha = "α".as_bytes(); // UTF-8 encoding of Greek alpha + processor.handle_unicode_escape(4, utf8_alpha).unwrap(); // Position at backslash + let result = processor.end_string(input.len()).unwrap(); + + // Should return unescaped with 'α' substituted + assert!(matches!(result, String::Unescaped(s) if s == "testαend")); + } + + #[test] + fn test_coe2_unicode_escape_no_prior_escapes() { + let input = b"plain\\u0041"; // \u0041 = 'A' + let mut scratch = [0u8; 100]; + let mut processor = CopyOnEscape::new(input, &mut scratch); + + processor.begin_string(0); + // Should trigger copy-on-escape since this is first escape + let utf8_a = b"A"; + processor.handle_unicode_escape(5, utf8_a).unwrap(); + let result = processor.end_string(input.len()).unwrap(); + + assert!(matches!(result, String::Unescaped(s) if s == "plainA")); + } +} diff --git a/stax/src/direct_buffer.rs b/stax/src/direct_buffer.rs new file mode 100644 index 0000000..8e7692f --- /dev/null +++ b/stax/src/direct_buffer.rs @@ -0,0 +1,533 @@ +// SPDX-License-Identifier: Apache-2.0 + +use crate::ParseError; + +/// Error types for DirectBuffer operations +#[derive(Debug, PartialEq)] +pub enum DirectBufferError { + /// Buffer is full and cannot accommodate more data + 
BufferFull, + /// Attempted to read beyond available data + EndOfData, + /// Invalid buffer state or operation + InvalidState(&'static str), +} + +impl From for ParseError { + fn from(err: DirectBufferError) -> Self { + match err { + DirectBufferError::BufferFull => ParseError::ScratchBufferFull, + DirectBufferError::EndOfData => ParseError::EndOfData, + DirectBufferError::InvalidState(msg) => ParseError::UnexpectedState(msg), + } + } +} + +/// DirectBuffer manages a single buffer for both input and escape processing +/// +/// Key design principles: +/// - Reader fills unused portions of buffer directly +/// - Unescaped content is copied to buffer start when needed +/// - Zero-copy string extraction when no escapes are present +/// - Guaranteed space for escape processing (unescaped ≤ escaped) +pub struct DirectBuffer<'a> { + /// The entire buffer slice + buffer: &'a mut [u8], + /// Current position where tokenizer is reading + tokenize_pos: usize, + /// End of valid data from Reader (buffer[0..data_end] contains valid data) + data_end: usize, + /// Length of unescaped content at buffer start (0 if no unescaping active) + unescaped_len: usize, + /// Minimum space to reserve for escape processing + escape_reserve: usize, +} + +impl<'a> DirectBuffer<'a> { + /// Create a new DirectBuffer with the given buffer slice + pub fn new(buffer: &'a mut [u8]) -> Self { + // Reserve 10% of buffer for escape processing, minimum 64 bytes + let escape_reserve = (buffer.len() / 10).max(64); + + Self { + buffer, + tokenize_pos: 0, + data_end: 0, + unescaped_len: 0, + escape_reserve, + } + } + + /// Get the current byte at tokenize position + pub fn current_byte(&self) -> Result { + if self.tokenize_pos >= self.data_end { + return Err(DirectBufferError::EndOfData); + } + Ok(self.buffer[self.tokenize_pos]) + } + + /// Advance the tokenize position by one byte + pub fn advance(&mut self) -> Result<(), DirectBufferError> { + if self.tokenize_pos >= self.data_end { + return 
Err(DirectBufferError::EndOfData); + } + self.tokenize_pos += 1; + Ok(()) + } + + /// Get remaining bytes available for reading + pub fn remaining_bytes(&self) -> usize { + self.data_end.saturating_sub(self.tokenize_pos) + } + + /// Get slice for Reader to fill with new data + /// Returns None if no space available + pub fn get_fill_slice(&mut self) -> Option<&mut [u8]> { + if self.data_end >= self.buffer.len() { + return None; + } + Some(&mut self.buffer[self.data_end..]) + } + + /// Mark that Reader filled `bytes_read` bytes + pub fn mark_filled(&mut self, bytes_read: usize) -> Result<(), DirectBufferError> { + if self.data_end + bytes_read > self.buffer.len() { + return Err(DirectBufferError::InvalidState( + "Attempted to mark more bytes than buffer space", + )); + } + self.data_end += bytes_read; + Ok(()) + } + + /// Start unescaping and copy existing content from a range in the buffer + /// This handles the common case of starting escape processing partway through a string + pub fn start_unescaping_with_copy( + &mut self, + max_escaped_len: usize, + copy_start: usize, + copy_end: usize, + ) -> Result<(), DirectBufferError> { + // Clear any previous unescaped content + self.unescaped_len = 0; + + // Ensure we have space at the start for unescaping + if max_escaped_len > self.buffer.len() { + return Err(DirectBufferError::BufferFull); + } + + // Copy existing content if there is any + if copy_end > copy_start && copy_start < self.data_end { + let span_len = copy_end - copy_start; + + // Ensure the span fits in the buffer - return error instead of silent truncation + if span_len > self.buffer.len() { + return Err(DirectBufferError::BufferFull); + } + + // Copy within the same buffer: move data from [copy_start..copy_end] to [0..span_len] + // Use copy_within to handle overlapping ranges safely + self.buffer + .copy_within(copy_start..copy_start + span_len, 0); + self.unescaped_len = span_len; + } + + Ok(()) + } + + /// Get the unescaped content slice + pub fn 
get_unescaped_slice(&self) -> Result<&[u8], DirectBufferError> { + if self.unescaped_len == 0 { + return Err(DirectBufferError::InvalidState( + "No unescaped content available", + )); + } + Ok(&self.buffer[0..self.unescaped_len]) + } + + /// Clear unescaped content (call after yielding unescaped string) + pub fn clear_unescaped(&mut self) { + self.unescaped_len = 0; + } + + /// Get current tokenize position (for string start tracking) + pub fn current_position(&self) -> usize { + self.tokenize_pos + } + + /// Check if buffer is empty (no more data to process) + pub fn is_empty(&self) -> bool { + self.tokenize_pos >= self.data_end + } + + /// Check if we have unescaped content ready + pub fn has_unescaped_content(&self) -> bool { + self.unescaped_len > 0 + } + + /// Append a single byte to the unescaped content + pub fn append_unescaped_byte(&mut self, byte: u8) -> Result<(), DirectBufferError> { + let available_space = self.buffer.len().saturating_sub(self.escape_reserve); + if self.unescaped_len >= available_space { + return Err(DirectBufferError::BufferFull); + } + + self.buffer[self.unescaped_len] = byte; + self.unescaped_len += 1; + Ok(()) + } + + /// Get a string slice from the buffer (zero-copy) + /// Used for strings without escapes + pub fn get_string_slice(&self, start: usize, end: usize) -> Result<&[u8], DirectBufferError> { + if start > end || end > self.data_end { + return Err(DirectBufferError::InvalidState("Invalid slice bounds")); + } + Ok(&self.buffer[start..end]) + } + + /// Get buffer statistics for debugging + pub fn stats(&self) -> DirectBufferStats { + DirectBufferStats { + total_capacity: self.buffer.len(), + tokenize_pos: self.tokenize_pos, + data_end: self.data_end, + unescaped_len: self.unescaped_len, + remaining_bytes: self.remaining_bytes(), + available_space: self.buffer.len().saturating_sub(self.data_end), + escape_reserve: self.escape_reserve, + } + } +} + +/// Statistics for DirectBuffer state (useful for debugging and testing) 
+#[derive(Debug, PartialEq)] +pub struct DirectBufferStats { + pub total_capacity: usize, + pub tokenize_pos: usize, + pub data_end: usize, + pub unescaped_len: usize, + pub remaining_bytes: usize, + pub available_space: usize, + pub escape_reserve: usize, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_lifetime_expectations() { + // This test demonstrates how DirectBuffer lifetimes should work + let mut buffer = [0u8; 100]; + let mut direct_buffer = DirectBuffer::new(&mut buffer); + + // Simulate some data being in the buffer + let test_data = b"hello world"; + direct_buffer.buffer[0..test_data.len()].copy_from_slice(test_data); + direct_buffer.data_end = test_data.len(); + + // Test that we can get buffer data + + // Test unescaped content - add some unescaped data + direct_buffer.unescaped_len = 3; + direct_buffer.buffer[0..3].copy_from_slice(b"abc"); + + let unescaped_slice = direct_buffer.get_unescaped_slice().unwrap(); + assert_eq!(unescaped_slice, b"abc"); + + // The key expectation: these slices should live as long as the original buffer + // and be usable to create String::Borrowed(&'buffer str) and String::Unescaped(&'buffer str) + } + + #[test] + fn test_new_direct_buffer() { + let mut buffer = [0u8; 100]; + let db = DirectBuffer::new(&mut buffer); + + assert_eq!(db.tokenize_pos, 0); + assert_eq!(db.data_end, 0); + assert_eq!(db.unescaped_len, 0); + assert_eq!(db.escape_reserve, 64); // 10% of 100, minimum 64 + assert!(db.is_empty()); + } + + #[test] + fn test_fill_and_advance() { + let mut buffer = [0u8; 100]; + let mut db = DirectBuffer::new(&mut buffer); + + // Fill with some data + { + let fill_slice = db.get_fill_slice().unwrap(); + fill_slice[0..5].copy_from_slice(b"hello"); + } + db.mark_filled(5).unwrap(); + + assert_eq!(db.data_end, 5); + assert_eq!(db.remaining_bytes(), 5); + + // Read bytes + assert_eq!(db.current_byte().unwrap(), b'h'); + db.advance().unwrap(); + assert_eq!(db.current_byte().unwrap(), b'e'); + 
assert_eq!(db.remaining_bytes(), 4); + } + + #[test] + fn test_error_conditions() { + let mut buffer = [0u8; 10]; + let mut db = DirectBuffer::new(&mut buffer); + + // EndOfData errors + assert_eq!(db.current_byte().unwrap_err(), DirectBufferError::EndOfData); + assert_eq!(db.advance().unwrap_err(), DirectBufferError::EndOfData); + + // No unescaped content + assert!(db.get_unescaped_slice().is_err()); + } + + #[test] + fn test_buffer_stats() { + let mut buffer = [0u8; 100]; + let mut db = DirectBuffer::new(&mut buffer); + + { + let fill_slice = db.get_fill_slice().unwrap(); + fill_slice[0..10].copy_from_slice(b"0123456789"); + } + db.mark_filled(10).unwrap(); + + for _ in 0..3 { + db.advance().unwrap(); + } + + let stats = db.stats(); + assert_eq!(stats.total_capacity, 100); + assert_eq!(stats.tokenize_pos, 3); + assert_eq!(stats.data_end, 10); + assert_eq!(stats.remaining_bytes, 7); + assert_eq!(stats.available_space, 90); + } + + #[test] + fn test_buffer_full_scenario() { + // Test what happens when buffer gets completely full + let mut buffer = [0u8; 10]; + let mut db = DirectBuffer::new(&mut buffer); + + // Fill buffer completely + { + let fill_slice = db.get_fill_slice().unwrap(); + fill_slice.copy_from_slice(b"0123456789"); + } + db.mark_filled(10).unwrap(); + + // No more space for filling + assert!(db.get_fill_slice().is_none()); + + // We can still read from buffer + assert_eq!(db.current_byte().unwrap(), b'0'); + assert_eq!(db.remaining_bytes(), 10); + } + + #[test] + fn test_minimal_buffer_with_long_token() { + // Test very small buffer with a token that doesn't fit + let mut buffer = [0u8; 8]; // Very small buffer + let mut db = DirectBuffer::new(&mut buffer); + + // Try to put a string that's almost as big as the buffer + { + let fill_slice = db.get_fill_slice().unwrap(); + fill_slice[0..6].copy_from_slice(b"\"hello"); // Start of a long string, no closing quote + } + db.mark_filled(6).unwrap(); + + // Advance through the data + for _ in 0..6 { + 
db.advance().unwrap(); + } + + // Now buffer is exhausted but we don't have a complete token + assert!(db.is_empty()); + assert_eq!(db.remaining_bytes(), 0); + + // This simulates the scenario where we need more data but can't fit it + // The parser would need to handle this by buffering the incomplete token + } + + #[test] + fn test_reader_returns_zero_bytes() { + let mut buffer = [0u8; 20]; + let mut db = DirectBuffer::new(&mut buffer); + + // Simulate Reader returning 0 bytes (EOF) + { + let fill_slice = db.get_fill_slice().unwrap(); + assert_eq!(fill_slice.len(), 20); + // Reader returns 0 bytes - simulating EOF or no data available + } + db.mark_filled(0).unwrap(); // Reader returned 0 + + assert!(db.is_empty()); + assert_eq!(db.data_end, 0); + assert_eq!(db.remaining_bytes(), 0); + + // Should still be able to get fill slice for next attempt + let fill_slice = db.get_fill_slice().unwrap(); + assert_eq!(fill_slice.len(), 20); + } + + #[test] + fn test_maximum_escape_reserve_scenario() { + let mut buffer = [0u8; 100]; + let db = DirectBuffer::new(&mut buffer); + + // Check escape reserve calculation + let stats = db.stats(); + assert_eq!(stats.escape_reserve, 64); // max(100/10, 64) = 64 + + // Test with smaller buffer + let mut small_buffer = [0u8; 50]; + let small_db = DirectBuffer::new(&mut small_buffer); + let small_stats = small_db.stats(); + assert_eq!(small_stats.escape_reserve, 64); // Still 64 (minimum) + + // Test with larger buffer + let mut large_buffer = [0u8; 1000]; + let large_db = DirectBuffer::new(&mut large_buffer); + let large_stats = large_db.stats(); + assert_eq!(large_stats.escape_reserve, 100); // 1000/10 = 100 + } + + #[test] + fn test_boundary_conditions() { + let mut buffer = [0u8; 3]; // Absolute minimum + let mut db = DirectBuffer::new(&mut buffer); + + // Can't even hold a proper JSON token, but should not crash + { + let fill_slice = db.get_fill_slice().unwrap(); + fill_slice.copy_from_slice(b"\"a\""); + } + 
db.mark_filled(3).unwrap(); + + // Should be able to read through it + assert_eq!(db.current_byte().unwrap(), b'"'); + db.advance().unwrap(); + assert_eq!(db.current_byte().unwrap(), b'a'); + db.advance().unwrap(); + assert_eq!(db.current_byte().unwrap(), b'"'); + db.advance().unwrap(); + + assert!(db.is_empty()); + } + + #[test] + fn test_start_unescaping_with_copy_span_too_large() { + let mut buffer = [0u8; 10]; // Small buffer + let mut db = DirectBuffer::new(&mut buffer); + + // Fill buffer with some data + { + let fill_slice = db.get_fill_slice().unwrap(); + fill_slice.copy_from_slice(b"0123456789"); + } + db.mark_filled(10).unwrap(); + + // Try to copy a span that's larger than the entire buffer + let copy_start = 0; + let copy_end = 15; // This span (15 bytes) is larger than buffer (10 bytes) + let max_escaped_len = 5; // This is fine + + // Should return BufferFull error instead of silently truncating + let result = db.start_unescaping_with_copy(max_escaped_len, copy_start, copy_end); + assert_eq!(result.unwrap_err(), DirectBufferError::BufferFull); + + // Test boundary case: span exactly equals buffer size should work + let copy_end_exact = 10; // Span of exactly 10 bytes (buffer size) + let result = db.start_unescaping_with_copy(max_escaped_len, 0, copy_end_exact); + assert!(result.is_ok()); + assert_eq!(db.unescaped_len, 10); + + // Test valid smaller span should work + db.clear_unescaped(); + let result = db.start_unescaping_with_copy(max_escaped_len, 2, 6); // 4 byte span + assert!(result.is_ok()); + assert_eq!(db.unescaped_len, 4); + assert_eq!(db.get_unescaped_slice().unwrap(), b"2345"); + } + + #[test] + fn test_append_unescaped_byte_respects_escape_reserve() { + let mut buffer = [0u8; 100]; // 100 byte buffer + let mut db = DirectBuffer::new(&mut buffer); + + // Check escape reserve was set correctly (10% of 100, minimum 64) + let stats = db.stats(); + assert_eq!(stats.escape_reserve, 64); + + // Should be able to append up to (buffer_len - 
escape_reserve) bytes + let max_unescaped = 100 - db.escape_reserve; // 100 - 64 = 36 + + // Fill up to the limit - should succeed + for i in 0..max_unescaped { + let result = db.append_unescaped_byte(b'A'); + assert!(result.is_ok(), "Failed at byte {}", i); + } + + assert_eq!(db.unescaped_len, max_unescaped); + + // One more byte should fail due to escape reserve constraint + let result = db.append_unescaped_byte(b'B'); + assert_eq!(result.unwrap_err(), DirectBufferError::BufferFull); + + // Verify we didn't exceed the escape reserve boundary + assert_eq!(db.unescaped_len, max_unescaped); + } + + #[test] + fn test_append_unescaped_byte_escape_reserve_larger_than_buffer() { + let mut buffer = [0u8; 10]; // Very small buffer + let mut db = DirectBuffer::new(&mut buffer); + + // Even small buffers get minimum 64 byte escape reserve, but that's larger than buffer + let stats = db.stats(); + assert_eq!(stats.escape_reserve, 64); // minimum + + // Since escape_reserve (64) > buffer.len() (10), no bytes should be appendable + // This should not panic with underflow, but return BufferFull error + let result = db.append_unescaped_byte(b'A'); + assert_eq!(result.unwrap_err(), DirectBufferError::BufferFull); + + // Test with even smaller buffer to ensure we handle underflow correctly + let mut tiny_buffer = [0u8; 3]; + let mut tiny_db = DirectBuffer::new(&mut tiny_buffer); + let tiny_stats = tiny_db.stats(); + assert_eq!(tiny_stats.escape_reserve, 64); // Still minimum 64 + + // Should handle this gracefully without panic + let result = tiny_db.append_unescaped_byte(b'B'); + assert_eq!(result.unwrap_err(), DirectBufferError::BufferFull); + } +} + +impl<'b> crate::number_parser::NumberExtractor for DirectBuffer<'b> { + fn get_number_slice( + &self, + start: usize, + end: usize, + ) -> Result<&[u8], crate::shared::ParseError> { + self.get_string_slice(start, end) + .map_err(|_| crate::shared::ParseError::UnexpectedState("Invalid number slice bounds")) + } + + fn 
current_position(&self) -> usize {
+        self.tokenize_pos
+    }
+
+    fn is_empty(&self) -> bool {
+        self.tokenize_pos >= self.data_end
+    }
+}
diff --git a/stax/src/direct_parser.rs b/stax/src/direct_parser.rs
new file mode 100644
index 0000000..9d253dd
--- /dev/null
+++ b/stax/src/direct_parser.rs
@@ -0,0 +1,1437 @@
+// SPDX-License-Identifier: Apache-2.0
+
+use crate::direct_buffer::DirectBuffer;
+use crate::escape_processor::{EscapeProcessor, UnicodeEscapeCollector};
+use crate::shared::{ContentRange, Event, ParseError, ParserErrorHandler, ParserState};
+use ujson::BitStackCore;
+use ujson::{BitStack, EventToken, Tokenizer};
+
+/// Trait for input sources that can provide data to the streaming parser
+pub trait Reader {
+    /// The error type returned by read operations
+    type Error;
+
+    /// Read data into the provided buffer.
+    /// Returns the number of bytes read, or an error.
+    ///
+    /// # Contract
+    /// - A return value of 0 **MUST** indicate true end of stream
+    /// - Implementations **MUST NOT** return 0 unless no more data will ever be available
+    /// - Returning 0 followed by non-zero reads in subsequent calls violates this contract
+    fn read(&mut self, buf: &mut [u8]) -> Result<usize, Self::Error>;
+}
+
+/// Result of processing a tokenizer event
+enum EventResult {
+    /// Event processing complete, return this event
+    Complete(Event<'static, 'static>),
+    /// Continue processing, no event to return yet
+    Continue,
+    /// Extract string content from current state
+    ExtractString,
+    /// Extract key content from current state
+    ExtractKey,
+    /// Extract number content from current state
+    ExtractNumber,
+    /// Extract number content from current state (came from container end - exclude delimiter)
+    ExtractNumberFromContainer,
+}
+
+/// Represents a pending container end event that needs to be emitted after number extraction
+#[derive(Debug, Clone, Copy, PartialEq)]
+enum PendingContainerEnd {
+    /// Pending ArrayEnd event
+    ArrayEnd,
+    /// Pending ObjectEnd event
+    ObjectEnd,
+}
+
+/// 
Represents the processing state of the DirectParser
+/// Enforces logical invariants: once Finished, no other processing states are possible
+#[derive(Debug)]
+enum ProcessingState {
+    /// Normal active processing
+    Active {
+        unescaped_reset_queued: bool,
+        in_escape_sequence: bool,
+    },
+    /// All input consumed, tokenizer finished
+    Finished,
+}
+
+/// A streaming JSON parser using DirectBuffer for single-buffer input and escape processing
+pub struct DirectParser<'b, T: BitStack, D, R: Reader> {
+    /// The tokenizer that processes JSON tokens
+    tokenizer: Tokenizer<T, D>,
+    /// Parser state tracking
+    parser_state: ParserState,
+    /// Reader for streaming input
+    reader: R,
+    /// DirectBuffer for single-buffer input and escape processing
+    direct_buffer: DirectBuffer<'b>,
+
+    // NEW: Future state machine - will gradually replace fields below
+    /// Processing state machine that enforces logical invariants
+    processing_state: ProcessingState,
+
+    // PHASE 2.4 COMPLETE: Escape sequence state migrated to processing_state enum
+    /// Pending container end event to emit after number extraction
+    pending_container_end: Option<PendingContainerEnd>,
+    /// Shared Unicode escape collector for \uXXXX sequences
+    unicode_escape_collector: UnicodeEscapeCollector,
+}
+
+impl<'b, T: BitStack + core::fmt::Debug, D: BitStackCore, R: Reader> DirectParser<'b, T, D, R> {
+    /// Create a new DirectParser
+    pub fn new(reader: R, buffer: &'b mut [u8]) -> Self {
+        Self {
+            tokenizer: Tokenizer::new(),
+            parser_state: ParserState::new(),
+            reader,
+            direct_buffer: DirectBuffer::new(buffer),
+
+            // Initialize new state machine to Active with default values
+            processing_state: ProcessingState::Active {
+                unescaped_reset_queued: false,
+                in_escape_sequence: false,
+            },
+
+            // Phase 2.4 complete: escape sequence state now in enum
+            pending_container_end: None,
+            unicode_escape_collector: UnicodeEscapeCollector::new(),
+        }
+    }
+
+    /// Iterator-compatible method that returns None when parsing is complete.
+    /// This method returns None when EndDocument is reached, Some(Ok(event)) for successful events,
+    /// and Some(Err(error)) for parsing errors.
+    pub fn next(&mut self) -> Option<Result<Event<'_, '_>, ParseError>> {
+        match self.next_event() {
+            Ok(Event::EndDocument) => None,
+            other => Some(other),
+        }
+    }
+
+    /// Get the next JSON event from the stream - very simple increment
+    pub fn next_event(&mut self) -> Result<Event<'_, '_>, ParseError> {
+        // Apply any queued unescaped content reset from previous call
+        self.apply_unescaped_reset_if_queued();
+
+        // Check if we have pending events to emit
+        if let Some(pending) = self.pending_container_end.take() {
+            match pending {
+                PendingContainerEnd::ArrayEnd => {
+                    return Ok(Event::EndArray);
+                }
+                PendingContainerEnd::ObjectEnd => {
+                    return Ok(Event::EndObject);
+                }
+            }
+        }
+
+        loop {
+            // Make sure we have data in buffer
+            self.fill_buffer_from_reader()?;
+
+            if self.direct_buffer.is_empty() {
+                // End of data - call tokenizer finish to handle any pending tokens (only once)
+                if !matches!(self.processing_state, ProcessingState::Finished) {
+                    // Transition to Finished state
+                    self.processing_state = ProcessingState::Finished;
+                    self.parser_state.evts[0] = None;
+                    let mut callback = |event, _len| {
+                        self.parser_state.evts[0] = Some(event);
+                    };
+
+                    match self.tokenizer.finish(&mut callback) {
+                        Ok(_) => {
+                            // Check if finish generated an event
+                            if let Some(event) = self.parser_state.evts[0].take() {
+                                match self.process_tokenizer_event(event)? 
{ + EventResult::Complete(parsed_event) => return Ok(parsed_event), + EventResult::ExtractString => { + return self.extract_string_from_state(); + } + EventResult::ExtractKey => { + return self.extract_key_from_state(); + } + EventResult::ExtractNumber => { + return self.extract_number_from_state_with_context(false); + } + EventResult::ExtractNumberFromContainer => { + return self.extract_number_from_state_with_context(true); + } + EventResult::Continue => { + // Continue to EndDocument + } + } + } + } + Err(_) => { + return Err(ParseError::TokenizerError); + } + } + } + + return Ok(Event::EndDocument); + } + + // Get byte and advance in separate steps to avoid borrow conflicts + let byte = self.direct_buffer.current_byte()?; + self.direct_buffer.advance()?; + + // Process byte through tokenizer + self.parser_state.evts[0] = None; + let mut callback = |event, _len| { + self.parser_state.evts[0] = Some(event); + }; + + match self.tokenizer.parse_chunk(&[byte], &mut callback) { + Ok(_) => { + // Handle special cases for Begin events that include the current byte + if let Some(event) = &self.parser_state.evts[0] { + match event { + ujson::Event::Begin(EventToken::UnicodeEscape) => { + // Current byte is the first hex digit - reset collector and add it + self.unicode_escape_collector.reset(); + self.unicode_escape_collector.add_hex_digit(byte)?; + } + ujson::Event::End(EventToken::UnicodeEscape) => { + // Current byte is the fourth hex digit - add it to complete the sequence + self.unicode_escape_collector.add_hex_digit(byte)?; + } + _ => {} + } + } + + // Check if we got an event + if let Some(event) = self.parser_state.evts[0].take() { + // Process the event and see what to do + match self.process_tokenizer_event(event)? 
{ + EventResult::Complete(parsed_event) => return Ok(parsed_event), + EventResult::ExtractString => { + // Extract string content after buffer operations are done + return self.extract_string_from_state(); + } + EventResult::ExtractKey => { + // Extract key content after buffer operations are done + return self.extract_key_from_state(); + } + EventResult::ExtractNumber => { + // Extract number content after buffer operations are done + return self.extract_number_from_state_with_context(false); + } + EventResult::ExtractNumberFromContainer => { + // Extract number content that was terminated by container end + return self.extract_number_from_state_with_context(true); + } + EventResult::Continue => { + // Continue processing + } + } + } else { + // No event was generated, handle accumulation + self.handle_byte_accumulation(byte)?; + } + // Continue processing if no event produced + } + Err(_) => { + return Err(ParseError::TokenizerError); + } + } + } + } + + /// Process event and update state, but defer complex processing + fn process_tokenizer_event(&mut self, event: ujson::Event) -> Result { + Ok(match event { + // Container events + ujson::Event::ObjectStart => EventResult::Complete(Event::StartObject), + ujson::Event::ObjectEnd => { + // Check if we're in the middle of parsing a number - if so, extract it first + if matches!(self.parser_state.state, crate::shared::State::Number(_)) { + // Extract the number first, then we'll emit EndObject on the next call + self.pending_container_end = Some(PendingContainerEnd::ObjectEnd); + EventResult::ExtractNumberFromContainer + } else { + EventResult::Complete(Event::EndObject) + } + } + ujson::Event::ArrayStart => EventResult::Complete(Event::StartArray), + ujson::Event::ArrayEnd => { + // Check if we're in the middle of parsing a number - if so, extract it first + if matches!(self.parser_state.state, crate::shared::State::Number(_)) { + // Extract the number first, then we'll emit EndArray on the next call + 
self.pending_container_end = Some(PendingContainerEnd::ArrayEnd); + EventResult::ExtractNumberFromContainer + } else { + EventResult::Complete(Event::EndArray) + } + } + + // String/Key events + ujson::Event::Begin(EventToken::Key) => { + // Mark start position for key (current position is AFTER opening quote was processed) + // We want to store the position of the opening quote, so back up by 1 + let current_pos = self.direct_buffer.current_position(); + let quote_pos = ContentRange::quote_position_from_current(current_pos); + self.parser_state.state = crate::shared::State::Key(quote_pos); + + // DirectBuffer will handle escape processing state internally + + EventResult::Continue // Continue processing + } + ujson::Event::End(EventToken::Key) => { + // Mark that we need to extract key, but defer the actual extraction + EventResult::ExtractKey + } + ujson::Event::Begin(EventToken::String) => { + // Mark start position for string (current position is AFTER opening quote was processed) + // We want to store the position of the opening quote, so back up by 1 + let current_pos = self.direct_buffer.current_position(); + let quote_pos = ContentRange::quote_position_from_current(current_pos); + self.parser_state.state = crate::shared::State::String(quote_pos); + + // DirectBuffer will handle escape processing state internally + + EventResult::Continue // Continue processing + } + ujson::Event::End(EventToken::String) => { + // Mark that we need to extract string, but defer the actual extraction + EventResult::ExtractString + } + + // Number events + ujson::Event::Begin(EventToken::Number) => { + // Mark start position for number (current position is where number starts) + let current_pos = self.direct_buffer.current_position(); + let number_start = ContentRange::number_start_from_current(current_pos); + self.parser_state.state = crate::shared::State::Number(number_start); + EventResult::Continue + } + ujson::Event::End(EventToken::Number) => { + // Extract number content 
after buffer operations are done (standalone number) + EventResult::ExtractNumber + } + ujson::Event::End(EventToken::NumberAndArray) => { + // Extract number content, but the tokenizer will handle the array end separately + EventResult::ExtractNumber + } + ujson::Event::End(EventToken::NumberAndObject) => { + // Extract number content, but the tokenizer will handle the object end separately + EventResult::ExtractNumber + } + + // Boolean and null values + ujson::Event::Begin(EventToken::True | EventToken::False | EventToken::Null) => { + EventResult::Continue + } + ujson::Event::End(EventToken::True) => EventResult::Complete(Event::Bool(true)), + ujson::Event::End(EventToken::False) => EventResult::Complete(Event::Bool(false)), + ujson::Event::End(EventToken::Null) => EventResult::Complete(Event::Null), + + // Escape sequence handling + ujson::Event::Begin(EventToken::EscapeSequence) => { + // Start of escape sequence - we'll handle escapes by unescaping to buffer start + return self.start_escape_processing(); + } + ujson::Event::End( + escape_token @ (EventToken::EscapeQuote + | EventToken::EscapeBackslash + | EventToken::EscapeSlash + | EventToken::EscapeBackspace + | EventToken::EscapeFormFeed + | EventToken::EscapeNewline + | EventToken::EscapeCarriageReturn + | EventToken::EscapeTab), + ) => { + // Process simple escape sequence + self.handle_simple_escape(&escape_token)? 
+ } + ujson::Event::Begin(EventToken::UnicodeEscape) => { + // Start Unicode escape - initialize hex collection + self.start_unicode_escape() + } + ujson::Event::End(EventToken::UnicodeEscape) => { + // End Unicode escape - process collected hex digits + return self.finish_unicode_escape(); + } + ujson::Event::End(EventToken::EscapeSequence) => { + // End of escape sequence - should not occur as individual event + // Escape sequences should end with specific escape types + return Err(ParseError::TokenizerError); + } + + // Handle any unexpected Begin events defensively + ujson::Event::Begin( + EventToken::EscapeQuote + | EventToken::EscapeBackslash + | EventToken::EscapeSlash + | EventToken::EscapeBackspace + | EventToken::EscapeFormFeed + | EventToken::EscapeNewline + | EventToken::EscapeCarriageReturn + | EventToken::EscapeTab, + ) => { + // These should never have Begin events, only End events + return Err(ParseError::TokenizerError); + } + ujson::Event::Begin(EventToken::NumberAndArray | EventToken::NumberAndObject) => { + // These tokens should only appear as End events, not Begin events + return Err(ParseError::TokenizerError); + } + }) + } + + /// Extract string after all buffer operations are complete + fn extract_string_from_state(&mut self) -> Result { + let crate::shared::State::String(start_pos) = self.parser_state.state else { + return Err(ParserErrorHandler::state_mismatch("string", "extract")); + }; + + self.parser_state.state = crate::shared::State::None; + + if self.direct_buffer.has_unescaped_content() { + self.create_unescaped_string() + } else { + self.create_borrowed_string(start_pos) + } + } + + /// Helper to create an unescaped string from DirectBuffer + fn create_unescaped_string(&mut self) -> Result { + self.queue_unescaped_reset(); + let unescaped_slice = self.direct_buffer.get_unescaped_slice()?; + let str_content = ParserErrorHandler::bytes_to_utf8_str(unescaped_slice)?; + Ok(Event::String(crate::String::Unescaped(str_content))) + } + + 
/// Helper to create a borrowed string from DirectBuffer + fn create_borrowed_string(&mut self, start_pos: usize) -> Result { + let current_pos = self.direct_buffer.current_position(); + let (content_start, content_end) = + ContentRange::string_content_bounds(start_pos, current_pos); + + let bytes = self + .direct_buffer + .get_string_slice(content_start, content_end)?; + let str_content = ParserErrorHandler::bytes_to_utf8_str(bytes)?; + Ok(Event::String(crate::String::Borrowed(str_content))) + } + + /// Extract key after all buffer operations are complete + fn extract_key_from_state(&mut self) -> Result { + let crate::shared::State::Key(start_pos) = self.parser_state.state else { + return Err(ParserErrorHandler::state_mismatch("key", "extract")); + }; + + self.parser_state.state = crate::shared::State::None; + + if self.direct_buffer.has_unescaped_content() { + self.create_unescaped_key() + } else { + self.create_borrowed_key(start_pos) + } + } + + /// Helper to create an unescaped key from DirectBuffer + fn create_unescaped_key(&mut self) -> Result { + self.queue_unescaped_reset(); + let unescaped_slice = self.direct_buffer.get_unescaped_slice()?; + let str_content = ParserErrorHandler::bytes_to_utf8_str(unescaped_slice)?; + Ok(Event::Key(crate::String::Unescaped(str_content))) + } + + /// Helper to create a borrowed key from DirectBuffer + fn create_borrowed_key(&mut self, start_pos: usize) -> Result { + let current_pos = self.direct_buffer.current_position(); + let (content_start, content_end) = + ContentRange::string_content_bounds(start_pos, current_pos); + + let bytes = self + .direct_buffer + .get_string_slice(content_start, content_end)?; + let str_content = ParserErrorHandler::bytes_to_utf8_str(bytes)?; + Ok(Event::Key(crate::String::Borrowed(str_content))) + } + + /// Extract number with delimiter context using unified parsing logic + fn extract_number_from_state_with_context( + &mut self, + from_container_end: bool, + ) -> Result { + let 
crate::shared::State::Number(start_pos) = self.parser_state.state else { + return Err(ParserErrorHandler::state_mismatch("number", "extract")); + }; + + self.parser_state.state = crate::shared::State::None; + + // Use unified number parsing logic + crate::number_parser::parse_number_event(&self.direct_buffer, start_pos, from_container_end) + } + /// Fill buffer from reader + fn fill_buffer_from_reader(&mut self) -> Result<(), ParseError> { + if let Some(fill_slice) = self.direct_buffer.get_fill_slice() { + let bytes_read = self + .reader + .read(fill_slice) + .map_err(|_| ParseError::ReaderError)?; + + self.direct_buffer.mark_filled(bytes_read)?; + + // Note: bytes_read == 0 indicates end-of-stream per trait contract. + // The main loop will handle transitioning to Finished state when buffer is empty. + } + Ok(()) + } + + /// Get buffer statistics for debugging + pub fn buffer_stats(&self) -> crate::direct_buffer::DirectBufferStats { + self.direct_buffer.stats() + } + + /// Handle byte accumulation for strings/keys and Unicode escape sequences + fn handle_byte_accumulation(&mut self, byte: u8) -> Result<(), ParseError> { + // Check if we're in a string or key state + let in_string_mode = matches!( + self.parser_state.state, + crate::shared::State::String(_) | crate::shared::State::Key(_) + ); + + if in_string_mode { + // Access escape state from enum + let in_escape = if let ProcessingState::Active { + in_escape_sequence, .. 
+ } = &self.processing_state + { + *in_escape_sequence + } else { + false + }; + + // Check if we're collecting Unicode hex digits (2nd and 3rd) + let hex_count = self.unicode_escape_collector.hex_count(); + if in_escape && hex_count > 0 && hex_count < 3 { + // We're in a Unicode escape - collect 2nd and 3rd hex digits + self.unicode_escape_collector.add_hex_digit(byte)?; + } else if !in_escape { + // Normal byte - if we're doing escape processing, accumulate it + if self.direct_buffer.has_unescaped_content() { + self.append_byte_to_escape_buffer(byte)?; + } + } + } + + Ok(()) + } + + /// Start escape processing using DirectBuffer + fn start_escape_processing(&mut self) -> Result { + // Update escape state in enum + if let ProcessingState::Active { + ref mut in_escape_sequence, + .. + } = self.processing_state + { + *in_escape_sequence = true; + } + + // Initialize escape processing with DirectBuffer if not already started + if !self.direct_buffer.has_unescaped_content() { + if let crate::shared::State::String(start_pos) | crate::shared::State::Key(start_pos) = + self.parser_state.state + { + let current_pos = self.direct_buffer.current_position(); + let (content_start, content_end) = + ContentRange::string_content_bounds_before_escape(start_pos, current_pos); + + // Estimate max length needed for unescaping (content so far + remaining buffer) + let max_escaped_len = + self.direct_buffer.remaining_bytes() + (content_end - content_start); + + // Start unescaping with DirectBuffer and copy existing content + self.direct_buffer.start_unescaping_with_copy( + max_escaped_len, + content_start, + content_end, + )?; + } + } + + Ok(EventResult::Continue) + } + + /// Handle simple escape sequence using unified EscapeProcessor + fn handle_simple_escape( + &mut self, + escape_token: &EventToken, + ) -> Result { + // Update escape state in enum + if let ProcessingState::Active { + ref mut in_escape_sequence, + .. 
+ } = self.processing_state + { + *in_escape_sequence = false; + } + + // Use unified escape token processing from EscapeProcessor + if let Ok(unescaped_char) = EscapeProcessor::process_escape_token(escape_token) { + self.append_byte_to_escape_buffer(unescaped_char)?; + } + + Ok(EventResult::Continue) + } + + /// Start Unicode escape sequence + fn start_unicode_escape(&mut self) -> EventResult { + // Update escape state in enum + if let ProcessingState::Active { + ref mut in_escape_sequence, + .. + } = self.processing_state + { + *in_escape_sequence = true; + } + // Note: unicode_hex_pos and first hex digit are set in the special case handler + EventResult::Continue + } + + /// Finish Unicode escape sequence using shared UnicodeEscapeCollector + fn finish_unicode_escape(&mut self) -> Result { + // Update escape state + if let ProcessingState::Active { + ref mut in_escape_sequence, + .. + } = self.processing_state + { + *in_escape_sequence = false; + } else { + return Err(ParserErrorHandler::state_mismatch("active", "process")); + } + + // Verify we have collected all 4 hex digits + if !self.unicode_escape_collector.is_complete() { + return Err(ParserErrorHandler::invalid_unicode_escape()); + } + + // Process Unicode escape using the shared collector + let mut utf8_buf = [0u8; 4]; + let utf8_bytes = self + .unicode_escape_collector + .process_to_utf8(&mut utf8_buf)?; + + // Append UTF-8 bytes to escape buffer + for &byte in utf8_bytes { + self.append_byte_to_escape_buffer(byte)?; + } + + Ok(EventResult::Continue) + } + + /// Append a byte to the DirectBuffer's unescaped content + fn append_byte_to_escape_buffer(&mut self, byte: u8) -> Result<(), ParseError> { + self.direct_buffer + .append_unescaped_byte(byte) + .map_err(|e| e.into()) + } + + /// Queue a reset of unescaped content for the next next_event() call + fn queue_unescaped_reset(&mut self) { + // Set the reset flag in the Active state + if let ProcessingState::Active { + ref mut unescaped_reset_queued, + .. 
+ } = self.processing_state + { + *unescaped_reset_queued = true; + } + // Legacy field removed - now fully using enum + } + + /// Apply queued unescaped content reset if flag is set + fn apply_unescaped_reset_if_queued(&mut self) { + // Check the enum field first + let should_reset = if let ProcessingState::Active { + ref mut unescaped_reset_queued, + .. + } = self.processing_state + { + let needs_reset = *unescaped_reset_queued; + *unescaped_reset_queued = false; // Clear the flag + needs_reset + } else { + false + }; + + if should_reset { + self.direct_buffer.clear_unescaped(); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + /// Simple test reader that reads from a byte slice + pub struct SliceReader<'a> { + data: &'a [u8], + position: usize, + } + + impl<'a> SliceReader<'a> { + pub fn new(data: &'a [u8]) -> Self { + Self { data, position: 0 } + } + } + + impl<'a> Reader for SliceReader<'a> { + type Error = (); + + fn read(&mut self, buf: &mut [u8]) -> Result { + let remaining = self.data.len() - self.position; + if remaining == 0 { + return Ok(0); // EOF + } + + let to_copy = remaining.min(buf.len()); + buf[..to_copy].copy_from_slice(&self.data[self.position..self.position + to_copy]); + self.position += to_copy; + Ok(to_copy) + } + } + + type TestDirectParser<'b> = DirectParser<'b, u32, u8, SliceReader<'static>>; + + #[test] + fn test_direct_parser_simple_object() { + let json = b"{}"; + let reader = SliceReader::new(json); + let mut buffer = [0u8; 256]; + let mut parser = TestDirectParser::new(reader, &mut buffer); + + // Should get ObjectStart + let event = parser.next_event().unwrap(); + assert!(matches!(event, Event::StartObject)); + + // Should get ObjectEnd + let event = parser.next_event().unwrap(); + assert!(matches!(event, Event::EndObject)); + + // Should get EndDocument + let event = parser.next_event().unwrap(); + assert!(matches!(event, Event::EndDocument)); + } + + #[test] + fn test_direct_parser_simple_array() { + let json = b"[]"; + 
        let reader = SliceReader::new(json);
        let mut buffer = [0u8; 256];
        let mut parser = TestDirectParser::new(reader, &mut buffer);

        // Should get ArrayStart
        let event = parser.next_event().unwrap();
        assert!(matches!(event, Event::StartArray));

        // Should get ArrayEnd
        let event = parser.next_event().unwrap();
        assert!(matches!(event, Event::EndArray));

        // Should get EndDocument
        let event = parser.next_event().unwrap();
        assert!(matches!(event, Event::EndDocument));
    }

    #[test]
    fn test_direct_parser_simple_escape() {
        let json = b"\"hello\\nworld\"";
        let reader = SliceReader::new(json);
        let mut buffer = [0u8; 256];
        let mut parser = TestDirectParser::new(reader, &mut buffer);

        if let Event::String(json_string) = parser.next_event().unwrap() {
            // For now, test will fail as escapes aren't implemented yet
            // This will be fixed once escape handling is added
            println!("Got string: '{}'", json_string.as_str());
        } else {
            panic!("Expected String event");
        }
    }

    /// Exercises the parser's "pending event" bookkeeping around container
    /// boundaries: nested endings, values after numbers, empty containers,
    /// and runs of consecutive numbers.
    #[test]
    fn test_pending_state_edge_cases() {
        // Test 1: Complex nested container endings
        let json1 = br#"{"a": {"b": [{"c": 123}]}}"#;
        let reader1 = SliceReader::new(json1);
        let mut buffer1 = [0u8; 256];
        let mut parser1 = TestDirectParser::new(reader1, &mut buffer1);

        // Drain all events until EndDocument
        let mut events = Vec::new();
        loop {
            match parser1.next_event() {
                Ok(Event::EndDocument) => break,
                Ok(event) => events.push(format!("{:?}", event)),
                Err(e) => panic!("Nested containers failed: {:?}", e),
            }
        }

        // Should contain all expected events
        assert!(events.len() >= 8); // StartObject, Key, StartObject, Key, StartArray, StartObject, Key, Number, EndObject, EndArray, EndObject, EndObject

        // Test 2: Mixed types after numbers in array
        let json2 = br#"[123, "string", true, null, 456]"#;
        let reader2 = SliceReader::new(json2);
        let mut buffer2 = [0u8; 256];
        let mut parser2 = TestDirectParser::new(reader2, &mut buffer2);

        let mut number_count = 0;
        loop {
            match parser2.next_event() {
                Ok(Event::EndDocument) => break,
                Ok(Event::Number(_)) => number_count += 1,
                Ok(_) => {}
                Err(e) => panic!("Mixed types failed: {:?}", e),
            }
        }
        assert_eq!(number_count, 2); // Should find both 123 and 456

        // Test 3: Empty containers
        let json3 = br#"[[], {}, [{}], {"empty": []}]"#;
        let reader3 = SliceReader::new(json3);
        let mut buffer3 = [0u8; 256];
        let mut parser3 = TestDirectParser::new(reader3, &mut buffer3);

        loop {
            match parser3.next_event() {
                Ok(Event::EndDocument) => break,
                Ok(_) => {}
                Err(e) => panic!("Empty containers failed: {:?}", e),
            }
        }

        // Test 4: Multiple consecutive numbers
        let json4 = br#"[1, 2, 3, 4, 5]"#;
        let reader4 = SliceReader::new(json4);
        let mut buffer4 = [0u8; 256];
        let mut parser4 = TestDirectParser::new(reader4, &mut buffer4);

        let mut consecutive_numbers = Vec::new();
        loop {
            match parser4.next_event() {
                Ok(Event::EndDocument) => break,
                Ok(Event::Number(n)) => consecutive_numbers.push(n.as_str().to_string()),
                Ok(_) => {}
                Err(e) => panic!("Consecutive numbers failed: {:?}", e),
            }
        }
        assert_eq!(consecutive_numbers, vec!["1", "2", "3", "4", "5"]);
    }

    #[test]
    fn test_error_recovery_with_pending_state() {
        // Test error handling - this should fail gracefully without hanging onto pending state
        let invalid_json = br#"{"key": 123,"#; // Missing closing brace
        let reader = SliceReader::new(invalid_json);
        let mut buffer = [0u8; 256];
        let mut parser = TestDirectParser::new(reader, &mut buffer);

        // Parse until we hit an error or EOF
        loop {
            match parser.next_event() {
                Ok(Event::EndDocument) => break, // EOF reached
                Ok(_) => {}
                Err(_) => break, // Error occurred - this is expected
            }
        }

        // The important thing is that we don't panic or hang
        // The specific error behavior may vary
    }

    #[test]
    fn test_multiple_rapid_container_ends() {
        // Test deeply nested structures that end with numbers
        // This tests whether we can handle multiple rapid container ends correctly

        // Test 1: Deeply nested arrays ending with number
        let json1 = br#"[[[123]]]"#;
        let reader1 = SliceReader::new(json1);
        let mut buffer1 = [0u8; 256];
        let mut parser1 = TestDirectParser::new(reader1, &mut buffer1);

        let mut events1 = Vec::new();
        loop {
            match parser1.next_event() {
                Ok(Event::EndDocument) => break,
                Ok(event) => events1.push(format!("{:?}", event)),
                Err(e) => panic!("Deeply nested arrays failed: {:?}", e),
            }
        }

        // Should have: StartArray, StartArray, StartArray, Number(123), EndArray, EndArray, EndArray
        assert_eq!(events1.len(), 7);
        assert!(events1[3].contains("Number"));
        assert_eq!(&events1[4], "EndArray");
        assert_eq!(&events1[5], "EndArray");
        assert_eq!(&events1[6], "EndArray");

        // Test 2: Mixed nested containers ending with number
        let json2 = br#"{"a": [{"b": 456}]}"#;
        let reader2 = SliceReader::new(json2);
        let mut buffer2 = [0u8; 256];
        let mut parser2 = TestDirectParser::new(reader2, &mut buffer2);

        let mut events2 = Vec::new();
        loop {
            match parser2.next_event() {
                Ok(Event::EndDocument) => break,
                Ok(event) => events2.push(format!("{:?}", event)),
                Err(e) => panic!("Mixed nested containers failed: {:?}", e),
            }
        }

        // Should properly handle the sequence of: number -> EndObject -> EndArray -> EndObject
        assert!(events2.len() >= 8);

        // Test 3: Multiple numbers at different nesting levels
        let json3 = br#"[123, [456, [789]]]"#;
        let reader3 = SliceReader::new(json3);
        let mut buffer3 = [0u8; 256];
        let mut parser3 = TestDirectParser::new(reader3, &mut buffer3);

        let mut number_count = 0;
        let mut events3 = Vec::new();
        loop {
            match parser3.next_event() {
                Ok(Event::EndDocument) => break,
                Ok(Event::Number(n)) => {
                    number_count += 1;
                    events3.push(format!("Number({})", n.as_str()));
                }
                Ok(event) => events3.push(format!("{:?}", event)),
                Err(e) => panic!("Multiple nested numbers failed: {:?}", e),
            }
        }

        assert_eq!(number_count, 3); // Should find all three numbers: 123, 456, 789
    }

    #[test]
    fn test_pending_flag_priority() {
        // Defensive test: ensure that if both pending flags were somehow set,
        // we handle it gracefully (this shouldn't happen in normal operation)

        let json = br#"[{"key": 123}]"#;
        let reader = SliceReader::new(json);
        let mut buffer = [0u8; 256];
        let mut parser = TestDirectParser::new(reader, &mut buffer);

        // Parse normally - this should work fine and never set both flags
        let mut events = Vec::new();
        loop {
            match parser.next_event() {
                Ok(Event::EndDocument) => break,
                Ok(event) => events.push(format!("{:?}", event)),
                Err(e) => panic!("Flag priority test failed: {:?}", e),
            }
        }

        // Should successfully parse: StartArray, StartObject, Key, Number, EndObject, EndArray
        assert_eq!(events.len(), 6);
        assert!(events[3].contains("Number"));
        assert_eq!(&events[4], "EndObject");
        assert_eq!(&events[5], "EndArray");
    }

    /// Differential test: FlexParser and DirectParser must emit identical
    /// event streams for the same document.
    #[test_log::test]
    fn test_number_parsing_comparison() {
        // Test case to reproduce numbers problem - numbers at end of containers
        let problematic_json = r#"{"key": 123, "arr": [456, 789]}"#;

        println!("=== Testing FlexParser ===");
        let mut scratch = [0u8; 1024];
        let mut flex_parser = crate::PullParser::new_with_buffer(problematic_json, &mut scratch);

        // Parse with FlexParser and collect events
        let mut flex_events = Vec::new();
        loop {
            match flex_parser.next_event() {
                Ok(Event::EndDocument) => break,
                Ok(event) => flex_events.push(format!("{:?}", event)),
                Err(e) => panic!("FlexParser error: {:?}", e),
            }
        }

        println!("FlexParser events: {:?}", flex_events);

        println!("=== Testing DirectParser ===");
        let json_bytes = problematic_json.as_bytes();
        let reader = SliceReader::new(json_bytes);
        let mut buffer = [0u8; 1024];
        let mut direct_parser = TestDirectParser::new(reader, &mut buffer);

        // Parse with DirectParser and collect events
        let mut direct_events = Vec::new();
        loop {
            match direct_parser.next_event() {
                Ok(Event::EndDocument) => break,
                Ok(event) => direct_events.push(format!("{:?}", event)),
                Err(e) => panic!("DirectParser error: {:?}", e),
            }
        }

        println!("DirectParser events: {:?}", direct_events);

        // Compare results
        assert_eq!(
            flex_events, direct_events,
            "Parsers should produce identical events"
        );
    }

    #[test]
    fn test_direct_parser_array_of_strings() {
        let json = b"[\"first\", \"second\"]";
        let reader = SliceReader::new(json);
        let mut buffer = [0u8; 256];
        let mut parser = TestDirectParser::new(reader, &mut buffer);

        assert!(matches!(parser.next_event().unwrap(), Event::StartArray));

        if let Event::String(s1) = parser.next_event().unwrap() {
            assert_eq!(s1.as_str(), "first");
        } else {
            panic!("Expected String event");
        }

        if let Event::String(s2) = parser.next_event().unwrap() {
            assert_eq!(s2.as_str(), "second");
        } else {
            panic!("Expected String event");
        }

        assert!(matches!(parser.next_event().unwrap(), Event::EndArray));
    }

    #[test]
    fn test_direct_parser_object_with_keys() {
        let json = b"{\"name\": \"value\", \"count\": \"42\"}";
        let reader = SliceReader::new(json);
        let mut buffer = [0u8; 256];
        let mut parser = TestDirectParser::new(reader, &mut buffer);

        assert!(matches!(parser.next_event().unwrap(), Event::StartObject));

        // First key-value pair
        if let Event::Key(key1) = parser.next_event().unwrap() {
            assert_eq!(key1.as_str(), "name");
        } else {
            panic!("Expected Key event");
        }

        if let Event::String(val1) = parser.next_event().unwrap() {
            assert_eq!(val1.as_str(), "value");
        } else {
            panic!("Expected String event");
        }

        // Second key-value pair
        if let Event::Key(key2) = parser.next_event().unwrap() {
            assert_eq!(key2.as_str(), "count");
        } else {
            panic!("Expected Key event");
        }

        if let Event::String(val2) = parser.next_event().unwrap() {
            assert_eq!(val2.as_str(), "42");
        } else {
            panic!("Expected String event");
        }

        assert!(matches!(parser.next_event().unwrap(), Event::EndObject));
    }

    #[test]
    fn test_direct_parser_multiple_escapes() {
        let json = b"\"line1\\nline2\\ttab\\\"quote\"";
        let reader = SliceReader::new(json);
        let mut buffer = [0u8; 256];
        let mut parser = TestDirectParser::new(reader, &mut buffer);

        if let Event::String(json_string) = parser.next_event().unwrap() {
            let content = json_string.as_str();
            println!("Multiple escapes result: '{}'", content);
            println!("Content bytes: {:?}", content.as_bytes());

            // Check that escape sequences were properly processed
            let has_newline = content.contains('\n');
            let has_tab = content.contains('\t');
            let has_quote = content.contains('"');

            println!(
                "Has newline: {}, Has tab: {}, Has quote: {}",
                has_newline, has_tab, has_quote
            );

            // These should be real control characters, not literal \n \t \"
            assert!(has_newline, "Should contain actual newline character");
            assert!(has_tab, "Should contain actual tab character");
            assert!(has_quote, "Should contain actual quote character");
        } else {
            panic!("Expected String event");
        }
    }

    #[test]
    fn test_direct_parser_unicode_escape() {
        let json = b"\"Hello \\u0041\\u03B1\""; // Hello A(alpha)
        let reader = SliceReader::new(json);
        let mut buffer = [0u8; 256];
        let mut parser = TestDirectParser::new(reader, &mut buffer);

        if let Event::String(json_string) = parser.next_event().unwrap() {
            let content = json_string.as_str();
            println!("Unicode escape result: '{}'", content);
            // Should be "Hello Aα" (with the actual 'A' and Greek alpha characters)
            assert!(content.contains('A'));
            // Note: This test will initially fail until we implement Unicode escapes
        } else {
            panic!("Expected String event");
        }
    }

    #[test]
    fn test_direct_parser_boolean_true() {
        let json = b"true";
        let reader = SliceReader::new(json);
        let mut buffer = [0u8; 256];
        let mut parser = TestDirectParser::new(reader, &mut buffer);

        let event = parser.next_event().unwrap();
        assert_eq!(event, Event::Bool(true));

        let event = parser.next_event().unwrap();
        assert_eq!(event, Event::EndDocument);
    }

    #[test]
    fn test_direct_parser_boolean_false() {
        let json = b"false";
        let reader = SliceReader::new(json);
        let mut buffer = [0u8; 256];
        let mut parser = TestDirectParser::new(reader, &mut buffer);

        let event = parser.next_event().unwrap();
        assert_eq!(event, Event::Bool(false));

        let event = parser.next_event().unwrap();
        assert_eq!(event, Event::EndDocument);
    }

    #[test]
    fn test_direct_parser_null() {
        let json = b"null";
        let reader = SliceReader::new(json);
        let mut buffer = [0u8; 256];
        let mut parser = TestDirectParser::new(reader, &mut buffer);

        let event = parser.next_event().unwrap();
        assert_eq!(event, Event::Null);

        let event = parser.next_event().unwrap();
        assert_eq!(event, Event::EndDocument);
    }

    #[test]
    fn test_direct_parser_booleans_in_array() {
        let json = b"[true, false, null]";
        let reader = SliceReader::new(json);
        let mut buffer = [0u8; 256];
        let mut parser = TestDirectParser::new(reader, &mut buffer);

        assert_eq!(parser.next_event().unwrap(), Event::StartArray);
        assert_eq!(parser.next_event().unwrap(), Event::Bool(true));
        assert_eq!(parser.next_event().unwrap(), Event::Bool(false));
        assert_eq!(parser.next_event().unwrap(), Event::Null);
        assert_eq!(parser.next_event().unwrap(), Event::EndArray);
        assert_eq!(parser.next_event().unwrap(), Event::EndDocument);
    }

    #[test_log::test]
    fn test_direct_parser_number_simple() {
        let json = b"42";
        let reader = SliceReader::new(json);
        let mut buffer = [0u8; 256];
        let mut parser = TestDirectParser::new(reader, &mut buffer);

        let event = parser.next_event().unwrap();
        if let Event::Number(json_number) = event {
            assert_eq!(json_number.as_str(), "42");
        } else {
            panic!("Expected Number event, got: {:?}", event);
        }

        let event = parser.next_event().unwrap();
        assert_eq!(event, Event::EndDocument);
    }

    #[test]
    fn test_direct_parser_number_negative() {
        let json = b"-123";
        let reader = SliceReader::new(json);
        let mut buffer = [0u8; 256];
        let mut parser = TestDirectParser::new(reader, &mut buffer);

        let event = parser.next_event().unwrap();
        if let Event::Number(json_number) = event {
            assert_eq!(json_number.as_str(), "-123");
        } else {
            panic!("Expected Number event, got: {:?}", event);
        }

        let event = parser.next_event().unwrap();
        assert_eq!(event, Event::EndDocument);
    }

    #[test]
    fn test_direct_parser_number_float() {
        let json = b"3.14159";
        let reader = SliceReader::new(json);
        let mut buffer = [0u8; 256];
        let mut parser = TestDirectParser::new(reader, &mut buffer);

        #[cfg(feature = "float-error")]
        {
            // float-error configuration should return an error for float values
            let result = parser.next_event();
            assert!(
                result.is_err(),
                "Expected error for float with float-error configuration"
            );
            return;
        }

        #[cfg(not(feature = "float-error"))]
        {
            let event = parser.next_event().unwrap();
            if let Event::Number(json_number) = event {
                assert_eq!(json_number.as_str(), "3.14159");
            } else {
                panic!("Expected Number event, got: {:?}", event);
            }

            let event = parser.next_event().unwrap();
            assert_eq!(event, Event::EndDocument);
        }
    }

    #[test_log::test]
    fn test_direct_parser_numbers_in_array() {
        #[cfg(feature = "float-error")]
        let json = b"[42, -7]"; // No floats for float-error config
        #[cfg(not(feature = "float-error"))]
        let json = b"[42, -7, 3.14]"; // Include float for other configs

        let reader = SliceReader::new(json);
        let mut buffer = [0u8; 256];
        let mut parser = TestDirectParser::new(reader, &mut buffer);

        assert_eq!(parser.next_event().unwrap(), Event::StartArray);

        let event = parser.next_event().unwrap();
        if let Event::Number(json_number) = event {
            assert_eq!(json_number.as_str(), "42");
        } else {
            panic!("Expected Number event, got: {:?}", event);
        }

        let event = parser.next_event().unwrap();
        if let Event::Number(json_number) = event {
            assert_eq!(json_number.as_str(), "-7");
        } else {
            panic!("Expected Number event, got: {:?}", event);
        }

        #[cfg(not(feature = "float-error"))]
        {
            let event = parser.next_event().unwrap();
            if let Event::Number(json_number) = event {
                assert_eq!(json_number.as_str(), "3.14");
            } else {
                panic!("Expected Number event, got: {:?}", event);
            }
        }

        assert_eq!(parser.next_event().unwrap(), Event::EndArray);
        assert_eq!(parser.next_event().unwrap(), Event::EndDocument);
    }

    #[test_log::test]
    fn test_direct_parser_numbers_in_object() {
        #[cfg(feature = "float-error")]
        let json = b"{\"count\": 42, \"score\": -7}"; // No floats for float-error config
        #[cfg(not(feature = "float-error"))]
        let json = b"{\"count\": 42, \"score\": -7.5}"; // Include float for other configs

        let reader = SliceReader::new(json);
        let mut buffer = [0u8; 256];
        let mut parser = TestDirectParser::new(reader, &mut buffer);

        assert_eq!(parser.next_event().unwrap(), Event::StartObject);

        // First key-value pair
        if let Event::Key(key1) = parser.next_event().unwrap() {
            assert_eq!(key1.as_str(), "count");
        } else {
            panic!("Expected Key event");
        }

        if let Event::Number(val1) = parser.next_event().unwrap() {
            assert_eq!(val1.as_str(), "42");
        } else {
            panic!("Expected Number event");
        }

        // Second key-value pair
        if let Event::Key(key2) = parser.next_event().unwrap() {
            assert_eq!(key2.as_str(), "score");
        } else {
            panic!("Expected Key event");
        }

        if let Event::Number(val2) = parser.next_event().unwrap() {
            #[cfg(feature = "float-error")]
            assert_eq!(val2.as_str(), "-7");
            #[cfg(not(feature = "float-error"))]
            assert_eq!(val2.as_str(), "-7.5");
        } else {
            panic!("Expected Number event");
        }

        assert_eq!(parser.next_event().unwrap(), Event::EndObject);
        assert_eq!(parser.next_event().unwrap(), Event::EndDocument);
    }

    #[test]
    fn test_direct_parser_no_float_configuration() {
        // Test that DirectParser properly uses unified number parsing with no-float config
        let json = br#"{"integer": 42, "float": 3.14, "scientific": 1e3}"#;
        let reader = SliceReader::new(json);
        let mut buffer = [0u8; 256];
        let mut parser = TestDirectParser::new(reader, &mut buffer);

        // Parse through the JSON and verify number handling
        assert_eq!(parser.next_event().unwrap(), Event::StartObject);

        // Integer key-value
        assert_eq!(
            parser.next_event().unwrap(),
            Event::Key(crate::String::Borrowed("integer"))
        );
        if let Event::Number(num) = parser.next_event().unwrap() {
            assert_eq!(num.as_str(), "42");
            match num.parsed() {
                crate::NumberResult::Integer(i) => assert_eq!(*i, 42),
                _ => panic!("Expected integer parsing"),
            }
        } else {
            panic!("Expected Number event");
        }

        // Float key-value - behavior varies by configuration
        assert_eq!(
            parser.next_event().unwrap(),
            Event::Key(crate::String::Borrowed("float"))
        );

        #[cfg(feature = "float-error")]
        {
            // float-error should return an error when encountering floats
            let result = parser.next_event();
            assert!(
                result.is_err(),
                "Expected error for float with float-error configuration"
            );
            return; // Test ends here for float-error
        }

        #[cfg(not(feature = "float-error"))]
        {
            if let Event::Number(num) = parser.next_event().unwrap() {
                assert_eq!(num.as_str(), "3.14");
                // In no-float configuration, this should be FloatDisabled
                match num.parsed() {
                    #[cfg(not(feature = "float"))]
                    crate::NumberResult::FloatDisabled => {
                        // This is expected in no-float build
                    }
                    #[cfg(feature = "float")]
                    crate::NumberResult::Float(f) => {
                        // This is expected in float-enabled build
                        assert!((f - 3.14).abs() < f64::EPSILON);
                    }
                    #[cfg(feature = "float-skip")]
                    crate::NumberResult::FloatSkipped => {
                        // This is expected in float-skip build
                    }
                    #[cfg(feature = "float-truncate")]
                    crate::NumberResult::FloatTruncated(i) => {
                        // This is expected in float-truncate build (3.14 -> 3)
                        assert_eq!(*i, 3);
                    }
                    _ => panic!("Unexpected number parsing result for float"),
                }
            } else {
                panic!("Expected Number event");
            }
        }

        // Scientific notation handling varies by float configuration
        assert_eq!(
            parser.next_event().unwrap(),
            Event::Key(crate::String::Borrowed("scientific"))
        );

        // float-truncate rejects scientific notation, so test should end early for that config
        #[cfg(feature = "float-truncate")]
        {
            // float-truncate rejects scientific notation since it would require float math
            let result = parser.next_event();
            assert!(
                result.is_err(),
                "Expected error for scientific notation with float-truncate"
            );
            return; // Test ends here for float-truncate
        }

        #[cfg(not(feature = "float-truncate"))]
        {
            // All remaining configs surface the raw "1e3" text; how it parses
            // depends on the active float feature.
            if let Event::Number(num) = parser.next_event().unwrap() {
                assert_eq!(num.as_str(), "1e3");
                match num.parsed() {
                    #[cfg(not(feature = "float"))]
                    crate::NumberResult::FloatDisabled => {
                        // This is expected in no-float build - raw string preserved for manual parsing
                    }
                    #[cfg(feature = "float-skip")]
                    crate::NumberResult::FloatSkipped => {
                        // This is expected in float-skip build
                    }
                    #[cfg(feature = "float")]
                    crate::NumberResult::Float(f) => {
                        // This is expected in float-enabled build
                        assert!((f - 1000.0).abs() < f64::EPSILON);
                    }
                    _ => panic!("Unexpected number parsing result for scientific notation"),
                }
            } else {
                panic!("Expected Number event");
            }

            assert_eq!(parser.next_event().unwrap(), Event::EndObject);
            assert_eq!(parser.next_event().unwrap(), Event::EndDocument);
        }
    }
}
diff --git a/stax/src/escape_processor.rs b/stax/src/escape_processor.rs
new file mode 100644
index 0000000..08e6499
--- /dev/null
+++ b/stax/src/escape_processor.rs
@@ -0,0 +1,468 @@
// SPDX-License-Identifier: Apache-2.0

use crate::{shared::ParserErrorHandler,
ParseError}; + +/// Shared utilities for processing JSON escape sequences. +/// This module contains pure functions for escape processing that can be used +/// by both CopyOnEscape and StreamingBuffer components. +pub(crate) struct EscapeProcessor; + +impl EscapeProcessor { + /// Convert an escape token from the tokenizer to the corresponding escape character. + /// This extracts the character that follows the backslash in the escape sequence. + /// + /// # Arguments + /// * `escape_token` - The escape token from the tokenizer + /// + /// # Returns + /// The character that follows the backslash, or None if the token is not a simple escape. + /// + /// # Examples + /// ```ignore + /// // Internal API - see unit tests for usage examples + /// assert_eq!(EscapeProcessor::token_to_escape_char(&EventToken::EscapeNewline).unwrap(), b'n'); + /// ``` + pub fn token_to_escape_char(escape_token: &ujson::EventToken) -> Option { + match escape_token { + ujson::EventToken::EscapeQuote => Some(b'"'), + ujson::EventToken::EscapeBackslash => Some(b'\\'), + ujson::EventToken::EscapeSlash => Some(b'/'), + ujson::EventToken::EscapeBackspace => Some(b'b'), + ujson::EventToken::EscapeFormFeed => Some(b'f'), + ujson::EventToken::EscapeNewline => Some(b'n'), + ujson::EventToken::EscapeCarriageReturn => Some(b'r'), + ujson::EventToken::EscapeTab => Some(b't'), + _ => None, + } + } + + /// Process an escape token directly to the unescaped byte value. + /// This is a convenience method that combines token_to_escape_char and process_simple_escape. + /// + /// # Arguments + /// * `escape_token` - The escape token from the tokenizer + /// + /// # Returns + /// The unescaped byte value, or an error if the token is invalid or not a simple escape. 
+ /// + /// # Examples + /// ```ignore + /// // Internal API - see unit tests for usage examples + /// assert_eq!(EscapeProcessor::process_escape_token(&EventToken::EscapeNewline).unwrap(), b'\n'); + /// ``` + pub fn process_escape_token(escape_token: &ujson::EventToken) -> Result { + let escape_char = Self::token_to_escape_char(escape_token) + .ok_or(ParserErrorHandler::unexpected_state("Invalid escape token"))?; + Self::process_simple_escape(escape_char) + } + + /// Process a simple escape sequence character and return the unescaped byte. + /// + /// # Arguments + /// * `escape_char` - The character following the backslash in an escape sequence + /// + /// # Returns + /// The unescaped byte value, or an error if the escape sequence is invalid. + /// + /// # Examples + /// ```ignore + /// // Internal API - see unit tests for usage examples + /// assert_eq!(EscapeProcessor::process_simple_escape(b'n').unwrap(), b'\n'); + /// ``` + pub fn process_simple_escape(escape_char: u8) -> Result { + match escape_char { + b'n' => Ok(b'\n'), + b't' => Ok(b'\t'), + b'r' => Ok(b'\r'), + b'\\' => Ok(b'\\'), + b'"' => Ok(b'"'), + b'/' => Ok(b'/'), + b'b' => Ok(0x08), // Backspace + b'f' => Ok(0x0C), // Form feed + _ => Err(ParseError::InvalidEscapeSequence), + } + } + + /// Validate that a byte represents a valid hexadecimal digit. + /// + /// # Arguments + /// * `byte` - The byte to validate + /// + /// # Returns + /// The numeric value (0-15) of the hex digit, or an error if invalid. + pub fn validate_hex_digit(byte: u8) -> Result { + match byte { + b'0'..=b'9' => Ok((byte - b'0') as u32), + b'a'..=b'f' => Ok((byte - b'a' + 10) as u32), + b'A'..=b'F' => Ok((byte - b'A' + 10) as u32), + _ => Err(ParseError::InvalidUnicodeHex), + } + } + + /// Process a Unicode escape sequence (\uXXXX) and return the UTF-8 encoded bytes. 
+ /// + /// # Arguments + /// * `hex_slice` - A 4-byte slice containing the hexadecimal digits + /// * `utf8_buffer` - A buffer to write the UTF-8 encoded result (must be at least 4 bytes) + /// + /// # Returns + /// A slice containing the UTF-8 encoded bytes, or an error if the escape is invalid. + /// + /// # Examples + /// ```ignore + /// // Internal API - see unit tests for usage examples + /// let mut buffer = [0u8; 4]; + /// let result = EscapeProcessor::process_unicode_escape(b"0041", &mut buffer).unwrap(); + /// assert_eq!(result, b"A"); + /// ``` + pub fn process_unicode_escape<'a>( + hex_slice: &[u8], + utf8_buffer: &'a mut [u8], + ) -> Result<&'a [u8], ParseError> { + if hex_slice.len() != 4 { + return Err(ParseError::InvalidUnicodeHex); + } + + // Convert hex bytes to Unicode codepoint + let mut codepoint = 0u32; + for &byte in hex_slice { + let digit = Self::validate_hex_digit(byte)?; + codepoint = (codepoint << 4) | digit; + } + + // Convert codepoint to character and encode as UTF-8 + let ch = char::from_u32(codepoint).ok_or(ParseError::InvalidUnicodeCodepoint)?; + let utf8_str = ch.encode_utf8(utf8_buffer); + Ok(utf8_str.as_bytes()) + } +} + +/// Shared Unicode escape hex digit collector for both parsers. +/// Provides a common interface for collecting the 4 hex digits in \uXXXX sequences. 
+#[derive(Debug)] +pub(crate) struct UnicodeEscapeCollector { + /// Buffer to collect the 4 hex digits + hex_buffer: [u8; 4], + /// Current position in the hex buffer (0-4) + hex_pos: usize, +} + +impl UnicodeEscapeCollector { + /// Create a new Unicode escape collector + pub fn new() -> Self { + Self { + hex_buffer: [0u8; 4], + hex_pos: 0, + } + } + + /// Reset the collector for a new Unicode escape sequence + pub fn reset(&mut self) { + self.hex_pos = 0; + } + + /// Add a hex digit to the collector + /// Returns true if this completes the 4-digit sequence + pub fn add_hex_digit(&mut self, digit: u8) -> Result { + // Validate the hex digit first + EscapeProcessor::validate_hex_digit(digit)?; + + if self.hex_pos >= 4 { + return Err(ParserErrorHandler::unexpected_state( + "Too many hex digits in Unicode escape", + )); + } + + self.hex_buffer[self.hex_pos] = digit; + self.hex_pos += 1; + + Ok(self.hex_pos == 4) + } + + /// Process the collected hex digits and return UTF-8 bytes + /// Should only be called when is_complete() returns true + pub fn process_to_utf8<'a>(&self, utf8_buffer: &'a mut [u8]) -> Result<&'a [u8], ParseError> { + if self.hex_pos != 4 { + return Err(ParserErrorHandler::incomplete_unicode_escape()); + } + + EscapeProcessor::process_unicode_escape(&self.hex_buffer, utf8_buffer) + } + + /// Check if we have collected all 4 hex digits + pub fn is_complete(&self) -> bool { + self.hex_pos == 4 + } + + /// Get the current number of collected hex digits + pub fn hex_count(&self) -> usize { + self.hex_pos + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_simple_escapes() { + assert_eq!(EscapeProcessor::process_simple_escape(b'n').unwrap(), b'\n'); + assert_eq!(EscapeProcessor::process_simple_escape(b't').unwrap(), b'\t'); + assert_eq!(EscapeProcessor::process_simple_escape(b'r').unwrap(), b'\r'); + assert_eq!( + EscapeProcessor::process_simple_escape(b'\\').unwrap(), + b'\\' + ); + 
assert_eq!(EscapeProcessor::process_simple_escape(b'"').unwrap(), b'"'); + assert_eq!(EscapeProcessor::process_simple_escape(b'/').unwrap(), b'/'); + assert_eq!(EscapeProcessor::process_simple_escape(b'b').unwrap(), 0x08); + assert_eq!(EscapeProcessor::process_simple_escape(b'f').unwrap(), 0x0C); + } + + #[test] + fn test_invalid_simple_escape() { + assert!(EscapeProcessor::process_simple_escape(b'x').is_err()); + assert!(EscapeProcessor::process_simple_escape(b'z').is_err()); + assert!(EscapeProcessor::process_simple_escape(b'1').is_err()); + } + + #[test] + fn test_hex_digit_validation() { + // Valid digits + assert_eq!(EscapeProcessor::validate_hex_digit(b'0').unwrap(), 0); + assert_eq!(EscapeProcessor::validate_hex_digit(b'9').unwrap(), 9); + assert_eq!(EscapeProcessor::validate_hex_digit(b'a').unwrap(), 10); + assert_eq!(EscapeProcessor::validate_hex_digit(b'f').unwrap(), 15); + assert_eq!(EscapeProcessor::validate_hex_digit(b'A').unwrap(), 10); + assert_eq!(EscapeProcessor::validate_hex_digit(b'F').unwrap(), 15); + + // Invalid digits + assert!(EscapeProcessor::validate_hex_digit(b'g').is_err()); + assert!(EscapeProcessor::validate_hex_digit(b'G').is_err()); + assert!(EscapeProcessor::validate_hex_digit(b'z').is_err()); + assert!(EscapeProcessor::validate_hex_digit(b' ').is_err()); + } + + #[test] + fn test_unicode_escape_basic() { + let mut buffer = [0u8; 4]; + + // Test basic ASCII character \u0041 -> 'A' + let result = EscapeProcessor::process_unicode_escape(b"0041", &mut buffer).unwrap(); + assert_eq!(result, b"A"); + + // Test another ASCII character \u0048 -> 'H' + let result = EscapeProcessor::process_unicode_escape(b"0048", &mut buffer).unwrap(); + assert_eq!(result, b"H"); + } + + #[test] + fn test_unicode_escape_multibyte() { + let mut buffer = [0u8; 4]; + + // Test Greek alpha \u03B1 -> 'α' (2 bytes in UTF-8: 0xCE, 0xB1) + let result = EscapeProcessor::process_unicode_escape(b"03B1", &mut buffer).unwrap(); + assert_eq!(result, "α".as_bytes()); + + 
        // \u{1F60A} (😊) is outside the BMP, so a single \uXXXX escape cannot
        // encode it; JSON would require a UTF-16 surrogate pair for it.
+        let _result = EscapeProcessor::process_unicode_escape(b"1F60", &mut buffer).unwrap();
+        // "1F60" decodes to the BMP codepoint U+1F60, so this only verifies
+        // that 4-digit hex parsing succeeds — not supplementary-plane handling.
+    }
+
+    #[test]
+    fn test_unicode_escape_invalid_hex() {
+        let mut buffer = [0u8; 4];
+
+        // Invalid hex characters
+        assert!(EscapeProcessor::process_unicode_escape(b"00GG", &mut buffer).is_err());
+        assert!(EscapeProcessor::process_unicode_escape(b"ZZZZ", &mut buffer).is_err());
+
+        // Wrong length
+        assert!(EscapeProcessor::process_unicode_escape(b"123", &mut buffer).is_err());
+        assert!(EscapeProcessor::process_unicode_escape(b"12345", &mut buffer).is_err());
+    }
+
+    #[test]
+    fn test_unicode_escape_invalid_codepoint() {
+        let mut buffer = [0u8; 4];
+
+        // Note: Most values in the BMP are valid Unicode codepoints
+        // Invalid surrogate codepoints would be D800-DFFF but they're complex to test
+        // For now, test basic valid cases to ensure the function works
+        let result = EscapeProcessor::process_unicode_escape(b"0000", &mut buffer).unwrap();
+        assert_eq!(result, "\0".as_bytes());
+    }
+
+    #[test]
+    fn test_token_to_escape_char() {
+        use ujson::EventToken;
+
+        // Test all valid escape tokens
+        assert_eq!(
+            EscapeProcessor::token_to_escape_char(&EventToken::EscapeQuote).unwrap(),
+            b'"'
+        );
+        assert_eq!(
+            EscapeProcessor::token_to_escape_char(&EventToken::EscapeBackslash).unwrap(),
+            b'\\'
+        );
+        assert_eq!(
+            EscapeProcessor::token_to_escape_char(&EventToken::EscapeSlash).unwrap(),
+            b'/'
+        );
+        assert_eq!(
+            EscapeProcessor::token_to_escape_char(&EventToken::EscapeBackspace).unwrap(),
+            b'b'
+        );
+        assert_eq!(
+            EscapeProcessor::token_to_escape_char(&EventToken::EscapeFormFeed).unwrap(),
+            b'f'
+        );
+        assert_eq!(
+            EscapeProcessor::token_to_escape_char(&EventToken::EscapeNewline).unwrap(),
+            b'n'
+        );
+        assert_eq!(
EscapeProcessor::token_to_escape_char(&EventToken::EscapeCarriageReturn).unwrap(), + b'r' + ); + assert_eq!( + EscapeProcessor::token_to_escape_char(&EventToken::EscapeTab).unwrap(), + b't' + ); + + // Test invalid token + assert_eq!( + EscapeProcessor::token_to_escape_char(&EventToken::String), + None + ); + } + + #[test] + fn test_process_escape_token() { + use ujson::EventToken; + + // Test valid escape tokens that produce correct unescaped bytes + assert_eq!( + EscapeProcessor::process_escape_token(&EventToken::EscapeQuote).unwrap(), + b'"' + ); + assert_eq!( + EscapeProcessor::process_escape_token(&EventToken::EscapeBackslash).unwrap(), + b'\\' + ); + assert_eq!( + EscapeProcessor::process_escape_token(&EventToken::EscapeSlash).unwrap(), + b'/' + ); + assert_eq!( + EscapeProcessor::process_escape_token(&EventToken::EscapeBackspace).unwrap(), + 0x08 + ); + assert_eq!( + EscapeProcessor::process_escape_token(&EventToken::EscapeFormFeed).unwrap(), + 0x0C + ); + assert_eq!( + EscapeProcessor::process_escape_token(&EventToken::EscapeNewline).unwrap(), + b'\n' + ); + assert_eq!( + EscapeProcessor::process_escape_token(&EventToken::EscapeCarriageReturn).unwrap(), + b'\r' + ); + assert_eq!( + EscapeProcessor::process_escape_token(&EventToken::EscapeTab).unwrap(), + b'\t' + ); + + // Test invalid token + assert!(EscapeProcessor::process_escape_token(&EventToken::String).is_err()); + } + + #[test] + fn test_unicode_escape_collector_basic() { + let mut collector = UnicodeEscapeCollector::new(); + let mut utf8_buffer = [0u8; 4]; + + assert_eq!(collector.hex_count(), 0); + assert!(!collector.is_complete()); + + // Add hex digits for \u0041 -> 'A' + assert!(!collector.add_hex_digit(b'0').unwrap()); // Not complete yet + assert!(!collector.add_hex_digit(b'0').unwrap()); // Not complete yet + assert!(!collector.add_hex_digit(b'4').unwrap()); // Not complete yet + assert!(collector.add_hex_digit(b'1').unwrap()); // Complete! 
+ + assert_eq!(collector.hex_count(), 4); + assert!(collector.is_complete()); + + // Process to UTF-8 + let result = collector.process_to_utf8(&mut utf8_buffer).unwrap(); + assert_eq!(result, b"A"); + } + + #[test] + fn test_unicode_escape_collector_invalid_hex() { + let mut collector = UnicodeEscapeCollector::new(); + + // Valid digits first + assert!(!collector.add_hex_digit(b'0').unwrap()); + assert!(!collector.add_hex_digit(b'0').unwrap()); + + // Invalid hex digit should fail + assert!(collector.add_hex_digit(b'G').is_err()); + + // State should be preserved after error + assert_eq!(collector.hex_count(), 2); + assert!(!collector.is_complete()); + } + + #[test] + fn test_unicode_escape_collector_reset() { + let mut collector = UnicodeEscapeCollector::new(); + + // Add some digits + assert!(!collector.add_hex_digit(b'0').unwrap()); + assert!(!collector.add_hex_digit(b'1').unwrap()); + assert_eq!(collector.hex_count(), 2); + + // Reset should clear state + collector.reset(); + assert_eq!(collector.hex_count(), 0); + assert!(!collector.is_complete()); + + // Should be able to start fresh + assert!(!collector.add_hex_digit(b'A').unwrap()); + assert_eq!(collector.hex_count(), 1); + } + + #[test] + fn test_unicode_escape_collector_multibyte() { + let mut collector = UnicodeEscapeCollector::new(); + let mut utf8_buffer = [0u8; 4]; + + // Add hex digits for \u03B1 -> 'α' (Greek alpha) + assert!(!collector.add_hex_digit(b'0').unwrap()); + assert!(!collector.add_hex_digit(b'3').unwrap()); + assert!(!collector.add_hex_digit(b'B').unwrap()); + assert!(collector.add_hex_digit(b'1').unwrap()); + + let result = collector.process_to_utf8(&mut utf8_buffer).unwrap(); + assert_eq!(result, "α".as_bytes()); + } + + #[test] + fn test_unicode_escape_collector_incomplete_processing() { + let mut collector = UnicodeEscapeCollector::new(); + let mut utf8_buffer = [0u8; 4]; + + // Add only 2 digits + assert!(!collector.add_hex_digit(b'0').unwrap()); + 
assert!(!collector.add_hex_digit(b'0').unwrap()); + + // Should fail to process incomplete sequence + assert!(collector.process_to_utf8(&mut utf8_buffer).is_err()); + } +} diff --git a/stax/src/flex_parser.rs b/stax/src/flex_parser.rs new file mode 100644 index 0000000..ecbad23 --- /dev/null +++ b/stax/src/flex_parser.rs @@ -0,0 +1,744 @@ +// SPDX-License-Identifier: Apache-2.0 + +use crate::copy_on_escape::CopyOnEscape; +use crate::escape_processor::{EscapeProcessor, UnicodeEscapeCollector}; +use crate::shared::{ContentRange, Event, ParseError, ParserErrorHandler, ParserState, State}; +use crate::slice_input_buffer::{InputBuffer, SliceInputBuffer}; +use ujson::BitStackCore; +use ujson::{BitStack, EventToken, Tokenizer}; + +/// Result of processing a tokenizer event +enum EventResult<'a, 'b> { + /// Event processing complete, return this event + Complete(Event<'a, 'b>), + /// Continue processing, no event to return yet + Continue, + /// Extract string content from current state + ExtractString, + /// Extract key content from current state + ExtractKey, + /// Extract number content from current state, + ExtractNumber, +} + +/// A flexible pull parser for JSON that yields events on demand. +/// Generic over BitStack storage type for configurable nesting depth. +// Lifetime 'a is the input buffer lifetime +// lifetime 'b is the scratch/copy buffer lifetime +pub struct PullParserFlex<'a, 'b, T: BitStack, D> { + tokenizer: Tokenizer, + buffer: SliceInputBuffer<'a>, + parser_state: ParserState, + copy_on_escape: CopyOnEscape<'a, 'b>, + /// Zero-length internal buffer for when no external scratch buffer is provided + _internal_scratch: [u8; 0], + /// Shared Unicode escape collector for \uXXXX sequences + unicode_escape_collector: UnicodeEscapeCollector, +} + +/// Type alias for the standard pull parser with default BitStack configuration. +/// Uses u32 BitStack (32-bit depth) and u8 depth counter. 
+pub type PullParser<'a, 'b> = PullParserFlex<'a, 'b, u32, u8>; + +/// Methods for the pull parser. +impl<'a, 'b, T: BitStack + core::fmt::Debug, D: BitStackCore> PullParserFlex<'a, 'b, T, D> { + /// Creates a new parser for the given JSON input. + /// + /// This parser assumes no string escapes will be encountered. If escapes are found, + /// parsing will fail with `ScratchBufferFull` error. + /// + /// For JSON with potential string escapes, use `new_with_buffer()` instead. + /// + /// # Arguments + /// * `input` - A string slice containing the JSON data to be parsed. + /// + /// # Example + /// ``` + /// use stax::PullParser; + /// let parser = PullParser::new(r#"{"name": "value"}"#); + /// ``` + pub fn new(input: &'a str) -> Self { + let data = input.as_bytes(); + // Use a mutable reference to the internal zero-length buffer + let internal_buffer: &mut [u8] = &mut []; + let copy_on_escape = CopyOnEscape::new(data, internal_buffer); + PullParserFlex { + tokenizer: Tokenizer::new(), + buffer: SliceInputBuffer::new(data), + parser_state: ParserState::new(), + copy_on_escape, + _internal_scratch: [], + unicode_escape_collector: UnicodeEscapeCollector::new(), + } + } + + /// Creates a new parser for the given JSON input with external scratch buffer. + /// + /// Use this when your JSON contains string escapes (like `\n`, `\"`, `\u0041`) that + /// need to be unescaped during parsing. + /// + /// # Arguments + /// * `input` - A string slice containing the JSON data to be parsed. + /// * `scratch_buffer` - A mutable byte slice for temporary string unescaping operations. 
+ /// + /// # Example + /// ``` + /// use stax::PullParser; + /// let mut scratch = [0u8; 1024]; + /// let parser = PullParser::new_with_buffer(r#"{"msg": "Hello\nWorld"}"#, &mut scratch); + /// ``` + pub fn new_with_buffer(input: &'a str, scratch_buffer: &'b mut [u8]) -> Self { + let data = input.as_bytes(); + let copy_on_escape = CopyOnEscape::new(data, scratch_buffer); + PullParserFlex { + tokenizer: Tokenizer::new(), + buffer: SliceInputBuffer::new(data), + parser_state: ParserState::new(), + copy_on_escape, + _internal_scratch: [], + unicode_escape_collector: UnicodeEscapeCollector::new(), + } + } + + fn have_events(&self) -> bool { + self.parser_state.evts.iter().any(|evt| evt.is_some()) + } + + /// Helper function to parse a number from the buffer given a start position. + /// Uses unified number parsing logic. + fn parse_number_from_buffer(&mut self, start: usize) -> Result { + crate::number_parser::parse_number_event_simple(&self.buffer, start) + } + + /// Helper method to handle simple escape tokens using EscapeProcessor + /// Converts EventToken back to original escape character and processes it + fn handle_simple_escape_token( + &mut self, + escape_token: &EventToken, + ) -> Result, ParseError> { + // Use unified escape token processing + let unescaped_char = EscapeProcessor::process_escape_token(escape_token)?; + + // Handle the escape using existing logic + self.handle_escape_event(unescaped_char) + } + + /// Handles escape sequence events by delegating to CopyOnEscape if we're inside a string or key + fn handle_escape_event(&mut self, escape_char: u8) -> Result, ParseError> { + if let State::String(_) | State::Key(_) = self.parser_state.state { + self.copy_on_escape + .handle_escape(self.buffer.current_pos(), escape_char)?; + } + Ok(None) + } + + /// Process Unicode escape sequence using shared UnicodeEscapeCollector + /// Extracts hex digits from buffer and processes them through the collector + fn process_unicode_escape_with_collector(&mut self) -> 
Result<(), ParseError> { + // Current position is right after the 4 hex digits + let current_pos = self.buffer.current_pos(); + let (hex_start, hex_end, escape_start_pos) = + ContentRange::unicode_escape_bounds(current_pos); + + // Extract the 4 hex digits from buffer + let hex_slice = self.buffer.slice(hex_start, hex_end)?; + + if hex_slice.len() != 4 { + return Err(ParserErrorHandler::invalid_unicode_length()); + } + + // Feed hex digits to the shared collector + for &hex_digit in hex_slice { + self.unicode_escape_collector.add_hex_digit(hex_digit)?; + } + + // Process the complete sequence to UTF-8 + let mut utf8_buf = [0u8; 4]; + let utf8_bytes = self + .unicode_escape_collector + .process_to_utf8(&mut utf8_buf)?; + + // Handle the Unicode escape via CopyOnEscape + self.copy_on_escape + .handle_unicode_escape(escape_start_pos, utf8_bytes)?; + + Ok(()) + } + + fn pull_tokenizer_events(&mut self) -> Result<(), ParseError> { + use crate::slice_input_buffer::InputBuffer; + if self.buffer.is_past_end() { + return Err(ParseError::EndOfData); + } + let mut callback = |event, _len| { + for evt in self.parser_state.evts.iter_mut() { + if evt.is_none() { + *evt = Some(event); + return; + } + } + }; + + let res = match self.buffer.consume_byte() { + Err(crate::slice_input_buffer::Error::ReachedEnd) => { + self.tokenizer.finish(&mut callback) + } + Err(crate::slice_input_buffer::Error::InvalidSliceBounds) => { + return Err(ParseError::UnexpectedState( + "Invalid slice bounds in consume_byte", + )); + } + Ok(byte) => self.tokenizer.parse_chunk(&[byte], &mut callback), + }; + + if let Err(_tokenizer_error) = res { + return Err(ParseError::TokenizerError); + } + Ok(()) + } + + pub fn next(&mut self) -> Option> { + match self.next_event() { + Ok(Event::EndDocument) => None, + other => Some(other), + } + } + + /// Returns the next JSON event or an error if parsing fails. + /// Parsing continues until `EndDocument` is returned or an error occurs. 
+ pub fn next_event(&mut self) -> Result { + if self.buffer.is_past_end() { + return Ok(Event::EndDocument); + } + loop { + while !self.have_events() { + self.pull_tokenizer_events()?; + if self.buffer.is_past_end() { + return Ok(Event::EndDocument); + } + } + // Find and move out the first available event to avoid holding mutable borrow during processing + let taken_event = { + let mut found_event = None; + for evt in self.parser_state.evts.iter_mut() { + if evt.is_some() { + found_event = evt.take(); + break; + } + } + found_event + }; + + if let Some(taken) = taken_event { + let res = match taken { + // Container events + ujson::Event::ObjectStart => EventResult::Complete(Event::StartObject), + ujson::Event::ObjectEnd => EventResult::Complete(Event::EndObject), + ujson::Event::ArrayStart => EventResult::Complete(Event::StartArray), + ujson::Event::ArrayEnd => EventResult::Complete(Event::EndArray), + + // String/Key events + ujson::Event::Begin(EventToken::Key) => { + self.parser_state.state = State::Key(self.buffer.current_pos()); + self.copy_on_escape.begin_string(self.buffer.current_pos()); + EventResult::Continue + } + ujson::Event::End(EventToken::Key) => EventResult::ExtractKey, + ujson::Event::Begin(EventToken::String) => { + self.parser_state.state = State::String(self.buffer.current_pos()); + self.copy_on_escape.begin_string(self.buffer.current_pos()); + EventResult::Continue + } + ujson::Event::End(EventToken::String) => EventResult::ExtractString, + + // Number events + ujson::Event::Begin( + EventToken::Number + | EventToken::NumberAndArray + | EventToken::NumberAndObject, + ) => { + let number_start = + ContentRange::number_start_from_current(self.buffer.current_pos()); + self.parser_state.state = State::Number(number_start); + EventResult::Continue + } + ujson::Event::End(EventToken::Number) => EventResult::ExtractNumber, + ujson::Event::End(EventToken::NumberAndArray) => EventResult::ExtractNumber, + ujson::Event::End(EventToken::NumberAndObject) 
=> EventResult::ExtractNumber, + // Boolean and null values + ujson::Event::Begin( + EventToken::True | EventToken::False | EventToken::Null, + ) => EventResult::Continue, + ujson::Event::End(EventToken::True) => EventResult::Complete(Event::Bool(true)), + ujson::Event::End(EventToken::False) => { + EventResult::Complete(Event::Bool(false)) + } + ujson::Event::End(EventToken::Null) => EventResult::Complete(Event::Null), + // Escape sequence handling + ujson::Event::Begin( + escape_token @ (EventToken::EscapeQuote + | EventToken::EscapeBackslash + | EventToken::EscapeSlash + | EventToken::EscapeBackspace + | EventToken::EscapeFormFeed + | EventToken::EscapeNewline + | EventToken::EscapeCarriageReturn + | EventToken::EscapeTab), + ) => { + // Use EscapeProcessor for all simple escape sequences + self.handle_simple_escape_token(&escape_token)?; + EventResult::Continue + } + ujson::Event::Begin(EventToken::UnicodeEscape) => { + // Start Unicode escape collection - reset collector for new sequence + // Only handle if we're inside a string or key + match self.parser_state.state { + State::String(_) | State::Key(_) => { + self.unicode_escape_collector.reset(); + } + _ => {} // Ignore if not in string/key + } + EventResult::Continue + } + ujson::Event::End(EventToken::UnicodeEscape) => { + // Handle end of Unicode escape sequence (\uXXXX) using shared collector + match self.parser_state.state { + State::String(_) | State::Key(_) => { + // Process Unicode escape using shared collector logic + self.process_unicode_escape_with_collector()?; + } + _ => {} // Ignore if not in string/key context + } + EventResult::Continue + } + // EscapeSequence events (only emitted when flag is enabled, ignored in original parser) + ujson::Event::Begin(EventToken::EscapeSequence) => { + // Ignore in original parser since it uses slice-based parsing + EventResult::Continue + } + ujson::Event::End(EventToken::EscapeSequence) => { + // Ignore in original parser since it uses slice-based parsing + 
EventResult::Continue + } + ujson::Event::End( + EventToken::EscapeQuote + | EventToken::EscapeBackslash + | EventToken::EscapeSlash + | EventToken::EscapeBackspace + | EventToken::EscapeFormFeed + | EventToken::EscapeNewline + | EventToken::EscapeCarriageReturn + | EventToken::EscapeTab, + ) => { + // End of escape sequence - ignored here + EventResult::Continue + } + }; + match res { + EventResult::Complete(event) => break Ok(event), + EventResult::Continue => continue, + EventResult::ExtractKey => { + if let State::Key(_start) = self.parser_state.state { + self.parser_state.state = State::None; + // Use CopyOnEscape to get the final key result + let end_pos = ContentRange::end_position_excluding_delimiter( + self.buffer.current_pos(), + ); + let key_result = self.copy_on_escape.end_string(end_pos)?; + break Ok(Event::Key(key_result)); + } else { + break Err(ParserErrorHandler::state_mismatch("key", "end")); + } + } + EventResult::ExtractString => { + if let State::String(_value) = self.parser_state.state { + self.parser_state.state = State::None; + // Use CopyOnEscape to get the final string result + let end_pos = ContentRange::end_position_excluding_delimiter( + self.buffer.current_pos(), + ); + let value_result = self.copy_on_escape.end_string(end_pos)?; + break Ok(Event::String(value_result)); + } else { + break Err(ParserErrorHandler::state_mismatch("string", "end")); + } + } + EventResult::ExtractNumber => { + if let State::Number(start) = self.parser_state.state { + // Reset state before parsing to stop selective copying + self.parser_state.state = State::None; + let event = self.parse_number_from_buffer(start)?; + break Ok(event); + } else { + break Err(ParseError::UnexpectedState( + "Number end without Number start", + )); + } + } + } + } else { + // No event available - this shouldn't happen since we ensured have_events() above + break Err(ParseError::UnexpectedState( + "No events available after ensuring events exist".into(), + )); + } + } + } +} + 
+#[cfg(test)] +mod tests { + use super::*; + use crate::String; + use test_log::test; + + #[test] + fn make_parser() { + let input = r#"{"key": "value"}"#; + let mut scratch = [0u8; 1024]; + let mut parser = PullParser::new_with_buffer(input, &mut scratch); + assert_eq!(parser.next_event(), Ok(Event::StartObject)); + assert_eq!(parser.next_event(), Ok(Event::Key(String::Borrowed("key")))); + assert_eq!( + parser.next_event(), + Ok(Event::String(String::Borrowed("value"))) + ); + assert_eq!(parser.next_event(), Ok(Event::EndObject)); + assert_eq!(parser.next_event(), Ok(Event::EndDocument)); + assert_eq!(parser.next_event(), Ok(Event::EndDocument)); + } + + #[test] + fn parse_number() { + let input = r#"{"key": 1242}"#; + let mut scratch = [0u8; 1024]; + let mut parser = PullParser::new_with_buffer(input, &mut scratch); + assert_eq!(parser.next_event(), Ok(Event::StartObject)); + assert_eq!(parser.next_event(), Ok(Event::Key(String::Borrowed("key")))); + // Check number value using new JsonNumber API + match parser.next_event() { + Ok(Event::Number(num)) => { + assert_eq!(num.as_str(), "1242"); + assert_eq!(num.as_int(), Some(1242)); + } + other => panic!("Expected Number, got: {:?}", other), + } + assert_eq!(parser.next_event(), Ok(Event::EndObject)); + assert_eq!(parser.next_event(), Ok(Event::EndDocument)); + } + + #[test] + fn parse_bool_and_null() { + let input = r#"{"key": true, "key2": false, "key3": null}"#; + let mut scratch = [0u8; 1024]; + let mut parser = PullParser::new_with_buffer(input, &mut scratch); + assert_eq!(parser.next_event(), Ok(Event::StartObject)); + assert_eq!(parser.next_event(), Ok(Event::Key(String::Borrowed("key")))); + assert_eq!(parser.next_event(), Ok(Event::Bool(true))); + assert_eq!( + parser.next_event(), + Ok(Event::Key(String::Borrowed("key2"))) + ); + assert_eq!(parser.next_event(), Ok(Event::Bool(false))); + assert_eq!( + parser.next_event(), + Ok(Event::Key(String::Borrowed("key3"))) + ); + assert_eq!(parser.next_event(), 
Ok(Event::Null)); + assert_eq!(parser.next_event(), Ok(Event::EndObject)); + assert_eq!(parser.next_event(), Ok(Event::EndDocument)); + } + + #[test] + fn parse_array() { + #[cfg(feature = "float-error")] + let input = r#"{"key": [1, 2, 3]}"#; // No floats for float-error config + #[cfg(not(feature = "float-error"))] + let input = r#"{"key": [1, 2.2, 3]}"#; // Include float for other configs + + let mut scratch = [0u8; 1024]; + let mut parser = PullParser::new_with_buffer(input, &mut scratch); + assert_eq!(parser.next_event(), Ok(Event::StartObject)); + assert_eq!(parser.next_event(), Ok(Event::Key(String::Borrowed("key")))); + assert_eq!(parser.next_event(), Ok(Event::StartArray)); + + // First number: 1 (integer) + match parser.next_event() { + Ok(Event::Number(num)) => { + assert_eq!(num.as_str(), "1"); + assert_eq!(num.as_int(), Some(1)); + } + other => panic!("Expected Number(1), got: {:?}", other), + } + + // Second number: depends on configuration + match parser.next_event() { + Ok(Event::Number(num)) => { + #[cfg(feature = "float-error")] + { + assert_eq!(num.as_str(), "2"); + assert_eq!(num.as_int(), Some(2)); + } + #[cfg(not(feature = "float-error"))] + { + assert_eq!(num.as_str(), "2.2"); + #[cfg(feature = "float")] + assert_eq!(num.as_f64(), Some(2.2)); + #[cfg(not(feature = "float-error"))] + assert!(num.is_float()); + } + } + other => panic!("Expected Number, got: {:?}", other), + } + + // Third number: 3 (integer) + match parser.next_event() { + Ok(Event::Number(num)) => { + assert_eq!(num.as_str(), "3"); + assert_eq!(num.as_int(), Some(3)); + } + other => panic!("Expected Number(3), got: {:?}", other), + } + + assert_eq!(parser.next_event(), Ok(Event::EndArray)); + assert_eq!(parser.next_event(), Ok(Event::EndObject)); + assert_eq!(parser.next_event(), Ok(Event::EndDocument)); + } + + #[test] + fn test_simple_parser_api() { + let input = r#"{"name": "test"}"#; + let mut scratch = [0u8; 1024]; + let mut parser = PullParser::new_with_buffer(input, 
&mut scratch); + + assert_eq!(parser.next_event(), Ok(Event::StartObject)); + assert_eq!( + parser.next_event(), + Ok(Event::Key(String::Borrowed("name"))) + ); + assert_eq!( + parser.next_event(), + Ok(Event::String(String::Borrowed("test"))) + ); + assert_eq!(parser.next_event(), Ok(Event::EndObject)); + assert_eq!(parser.next_event(), Ok(Event::EndDocument)); + } + + #[test] + fn test_parser_with_escaped_strings() { + // Use regular string literal to properly include escape sequences + let input = "{\"name\": \"John\\nDoe\", \"message\": \"Hello\\tWorld!\"}"; + let mut scratch = [0u8; 1024]; + let mut parser = PullParser::new_with_buffer(input, &mut scratch); + + // Test that the parser correctly handles escaped strings + assert_eq!(parser.next_event(), Ok(Event::StartObject)); + + // Key should be simple (no escapes) -> Borrowed + if let Ok(Event::Key(key)) = parser.next_event() { + assert_eq!(&*key, "name"); + // This should be the fast path (borrowed) + assert!(matches!(key, String::Borrowed(_))); + } else { + panic!("Expected Key event"); + } + + // Value should have escapes -> Unescaped + if let Ok(Event::String(value)) = parser.next_event() { + assert_eq!(&*value, "John\nDoe"); + // This should be the slow path (unescaped) + assert!(matches!(value, String::Unescaped(_))); + } else { + panic!("Expected String event"); + } + + // Second key should be simple -> Borrowed + if let Ok(Event::Key(key)) = parser.next_event() { + assert_eq!(&*key, "message"); + assert!(matches!(key, String::Borrowed(_))); + } else { + panic!("Expected Key event"); + } + + // Second value should have escapes -> Unescaped + if let Ok(Event::String(value)) = parser.next_event() { + assert_eq!(&*value, "Hello\tWorld!"); + assert!(matches!(value, String::Unescaped(_))); + } else { + panic!("Expected String event"); + } + + assert_eq!(parser.next_event(), Ok(Event::EndObject)); + } + + #[test] + fn test_copy_on_escape_optimization() { + // Use regular string literal to include proper 
escape sequences + let input = "{\"simple\": \"no escapes\", \"complex\": \"has\\nescapes\"}"; + let mut scratch = [0u8; 1024]; + let mut parser = PullParser::new_with_buffer(input, &mut scratch); + + assert_eq!(parser.next_event(), Ok(Event::StartObject)); + + // "simple" key should be borrowed (fast path) + if let Ok(Event::Key(key)) = parser.next_event() { + assert_eq!(&*key, "simple"); + assert!(matches!(key, String::Borrowed(_))); + } else { + panic!("Expected Key event"); + } + + // "no escapes" value should be borrowed (fast path) + if let Ok(Event::String(value)) = parser.next_event() { + assert_eq!(&*value, "no escapes"); + assert!(matches!(value, String::Borrowed(_))); + } else { + panic!("Expected String event"); + } + + // "complex" key should be borrowed (fast path) + if let Ok(Event::Key(key)) = parser.next_event() { + assert_eq!(&*key, "complex"); + assert!(matches!(key, String::Borrowed(_))); + } else { + panic!("Expected Key event"); + } + + // "has\\nescapes" value should be unescaped (slow path) + if let Ok(Event::String(value)) = parser.next_event() { + assert_eq!(&*value, "has\nescapes"); + assert!(matches!(value, String::Unescaped(_))); + } else { + panic!("Expected String event"); + } + + assert_eq!(parser.next_event(), Ok(Event::EndObject)); + assert_eq!(parser.next_event(), Ok(Event::EndDocument)); + } + + #[test] + fn test_coe2_integration_multiple_escapes() { + let input = r#"{"key": "a\nb\tc\rd"}"#; + let mut scratch = [0u8; 1024]; + let mut parser = PullParser::new_with_buffer(input, &mut scratch); + + assert_eq!(parser.next_event(), Ok(Event::StartObject)); + assert_eq!(parser.next_event(), Ok(Event::Key(String::Borrowed("key")))); + + let string_event = parser.next_event().unwrap(); + match string_event { + Event::String(String::Unescaped(s)) => { + assert_eq!(s, "a\nb\tc\rd"); + } + _ => panic!("Expected unescaped string value, got: {:?}", string_event), + } + assert_eq!(parser.next_event(), Ok(Event::EndObject)); + 
assert_eq!(parser.next_event(), Ok(Event::EndDocument)); + } + + #[test] + fn test_coe2_integration_zero_copy_path() { + let input = r#"{"simple": "no_escapes_here"}"#; + let mut scratch = [0u8; 1024]; + let mut parser = PullParser::new_with_buffer(input, &mut scratch); + + assert_eq!(parser.next_event(), Ok(Event::StartObject)); + assert_eq!( + parser.next_event(), + Ok(Event::Key(String::Borrowed("simple"))) + ); + + // This should be borrowed (zero-copy) since no escapes + let string_event = parser.next_event().unwrap(); + match string_event { + Event::String(String::Borrowed(s)) => { + assert_eq!(s, "no_escapes_here"); + } + _ => panic!( + "Expected borrowed string value for zero-copy, got: {:?}", + string_event + ), + } + assert_eq!(parser.next_event(), Ok(Event::EndObject)); + assert_eq!(parser.next_event(), Ok(Event::EndDocument)); + } + + #[test] + fn test_coe2_integration_mixed_strings() { + let input = r#"["plain", "with\nescapes", "plain2", "more\tescapes"]"#; + let mut scratch = [0u8; 1024]; + let mut parser = PullParser::new_with_buffer(input, &mut scratch); + + assert_eq!(parser.next_event(), Ok(Event::StartArray)); + + // First string: no escapes -> borrowed + match parser.next_event().unwrap() { + Event::String(String::Borrowed(s)) => assert_eq!(s, "plain"), + other => panic!("Expected borrowed string, got: {:?}", other), + } + + // Second string: has escapes -> unescaped + match parser.next_event().unwrap() { + Event::String(String::Unescaped(s)) => assert_eq!(s, "with\nescapes"), + other => panic!("Expected unescaped string, got: {:?}", other), + } + + // Third string: no escapes -> borrowed + match parser.next_event().unwrap() { + Event::String(String::Borrowed(s)) => assert_eq!(s, "plain2"), + other => panic!("Expected borrowed string, got: {:?}", other), + } + + // Fourth string: has escapes -> unescaped + match parser.next_event().unwrap() { + Event::String(String::Unescaped(s)) => assert_eq!(s, "more\tescapes"), + other => panic!("Expected 
unescaped string, got: {:?}", other), + } + + assert_eq!(parser.next_event(), Ok(Event::EndArray)); + assert_eq!(parser.next_event(), Ok(Event::EndDocument)); + } + + #[test] + fn test_unicode_escape_integration() { + let input = r#"{"key": "Hello\u0041World"}"#; // \u0041 = 'A' + let mut scratch = [0u8; 1024]; + let mut parser = PullParser::new_with_buffer(input, &mut scratch); + + assert_eq!(parser.next_event(), Ok(Event::StartObject)); + assert_eq!(parser.next_event(), Ok(Event::Key(String::Borrowed("key")))); + + // The string with Unicode escape should be unescaped + match parser.next_event().unwrap() { + Event::String(String::Unescaped(s)) => { + assert_eq!(s, "HelloAWorld"); + } + other => panic!("Expected unescaped string value, got: {:?}", other), + } + + assert_eq!(parser.next_event(), Ok(Event::EndObject)); + assert_eq!(parser.next_event(), Ok(Event::EndDocument)); + } + + // Tests for JsonNumber foundation (Phase 1) + + #[test_log::test] + fn test_original_parser_escape_trace() { + // Test escape sequence processing with logging + let input = r#""a\nb""#; + let mut scratch = [0u8; 1024]; + let mut parser = PullParser::new_with_buffer(input, &mut scratch); + + // Should get String with unescaped content + let event = parser.next_event().unwrap(); + if let Event::String(s) = event { + assert_eq!(&*s, "a\nb"); + } else { + panic!("Expected String event, got {:?}", event); + } + + // Should get EndDocument + let event = parser.next_event().unwrap(); + assert_eq!(event, Event::EndDocument); + } +} diff --git a/stax/src/json_number.rs b/stax/src/json_number.rs new file mode 100644 index 0000000..1897e95 --- /dev/null +++ b/stax/src/json_number.rs @@ -0,0 +1,348 @@ +// SPDX-License-Identifier: Apache-2.0 + +use core::ops::Deref; +use core::str::FromStr; + +use crate::ParseError; + +// Type alias for the configured integer type +#[cfg(feature = "int32")] +type ConfiguredInt = i32; +#[cfg(not(feature = "int32"))] +type ConfiguredInt = i64; + +/// Represents the 
parsed result of a JSON number. +#[derive(Debug, PartialEq)] +pub enum NumberResult { + /// Integer that fits in the configured integer type + Integer(ConfiguredInt), + /// Integer too large for configured type (use raw string for exact representation) + IntegerOverflow, + /// Float value (only available with float feature) + #[cfg(feature = "float")] + Float(f64), + /// Float parsing disabled - behavior depends on configuration + #[cfg(not(feature = "float"))] + FloatDisabled, + /// Float encountered but skipped due to float-skip configuration + #[cfg(all(not(feature = "float"), feature = "float-skip"))] + FloatSkipped, + /// Float truncated to integer due to float-truncate configuration + #[cfg(all(not(feature = "float"), feature = "float-truncate"))] + FloatTruncated(ConfiguredInt), +} + +/// Represents a JSON number with both exact string representation and parsed value. +/// +/// This preserves the exact number string from the tokenizer while providing +/// convenient access to parsed representations based on compilation features. +/// +/// Lifetimes: 'a is the input slice lifetime, 'b is the scratch/copy buffer lifetime +#[derive(Debug, PartialEq)] +pub enum JsonNumber<'a, 'b> { + /// A raw slice from the original input, used when no copying is needed. + Borrowed { raw: &'a str, parsed: NumberResult }, + /// A slice from the scratch/copy buffer, used when number had to be copied. + Copied { raw: &'b str, parsed: NumberResult }, +} + +impl<'a, 'b> JsonNumber<'a, 'b> { + /// Get the parsed NumberResult. + pub fn parsed(&self) -> &NumberResult { + match self { + JsonNumber::Borrowed { parsed, .. } => parsed, + JsonNumber::Copied { parsed, .. } => parsed, + } + } + + /// Get the number as the configurable integer type if it's an integer that fits. 
+    pub fn as_int(&self) -> Option<ConfiguredInt> {
+        let parsed = self.parsed();
+        match parsed {
+            NumberResult::Integer(val) => Some(*val),
+            #[cfg(all(not(feature = "float"), feature = "float-truncate"))]
+            NumberResult::FloatTruncated(val) => Some(*val),
+            _ => None,
+        }
+    }
+
+    /// Get the number as an f64 if float support is enabled.
+    /// For integers, converts to f64. For overflowing integers, returns None.
+    #[cfg(feature = "float")]
+    pub fn as_f64(&self) -> Option<f64> {
+        let parsed = self.parsed();
+        match parsed {
+            NumberResult::Float(val) => Some(*val),
+            NumberResult::Integer(val) => Some(*val as f64),
+            _ => None,
+        }
+    }
+
+    /// Always available: get the exact string representation.
+    /// This preserves full precision and never loses information.
+    pub fn as_str(&self) -> &str {
+        match self {
+            JsonNumber::Borrowed { raw, .. } => raw,
+            JsonNumber::Copied { raw, .. } => raw,
+        }
+    }
+
+    /// Parse the number as a custom type using the exact string representation.
+    /// This allows using external libraries like BigDecimal, arbitrary precision, etc.
+    pub fn parse<T: FromStr>(&self) -> Result<T, T::Err> {
+        T::from_str(self.as_str())
+    }
+
+    /// Check if this number represents an integer (no decimal point or exponent).
+    pub fn is_integer(&self) -> bool {
+        let parsed = self.parsed();
+        matches!(
+            parsed,
+            NumberResult::Integer(_) | NumberResult::IntegerOverflow
+        )
+    }
+
+    /// Returns true if this number is not an integer (i.e., has a decimal point or exponent).
+    ///
+    /// Note: This does not guarantee that float values are supported or enabled in this build.
+    /// It only indicates that the number is not an integer, regardless of float support.
</doc_update>
+    pub fn is_float(&self) -> bool {
+        !self.is_integer()
+    }
+}
+
+impl<'a, 'b> AsRef<str> for JsonNumber<'a, 'b> {
+    fn as_ref(&self) -> &str {
+        self.as_str()
+    }
+}
+
+impl Deref for JsonNumber<'_, '_> {
+    type Target = str;
+
+    fn deref(&self) -> &Self::Target {
+        self.as_str()
+    }
+}
+
+impl<'a, 'b> core::fmt::Display for JsonNumber<'a, 'b> {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        // Display strategy: Show parsed value when available, fall back to raw string
+        // This provides the most meaningful representation across all configurations
+        let (raw, parsed) = match self {
+            JsonNumber::Borrowed { raw, parsed } => (raw, parsed),
+            JsonNumber::Copied { raw, parsed } => (raw, parsed),
+        };
+        match parsed {
+            NumberResult::Integer(val) => write!(f, "{}", val),
+            #[cfg(feature = "float")]
+            NumberResult::Float(val) => write!(f, "{}", val),
+            #[cfg(all(not(feature = "float"), feature = "float-truncate"))]
+            NumberResult::FloatTruncated(val) => write!(f, "{}", val),
+            // For overflow, disabled, or skipped cases, show the exact raw string
+            // This preserves full precision and is least surprising to users
+            _ => f.write_str(raw),
+        }
+    }
+}
+
+/// Detects if a number string represents an integer (no decimal point or exponent).
+pub(super) fn is_integer(s: &str) -> bool {
+    !s.contains('.') && !s.contains('e') && !s.contains('E')
+}
+
+/// Parses an integer string into NumberResult using configured integer type.
+pub(super) fn parse_integer(s: &str) -> NumberResult {
+    match ConfiguredInt::from_str(s) {
+        Ok(val) => NumberResult::Integer(val),
+        Err(_) => NumberResult::IntegerOverflow,
+    }
+}
+
+/// Parses a float string into NumberResult (only available with float feature).
+#[cfg(feature = "float")]
+pub(super) fn parse_float(s: &str) -> NumberResult {
+    match f64::from_str(s) {
+        Ok(val) if val.is_finite() => NumberResult::Float(val),
+        _ => NumberResult::IntegerOverflow, // Infinity/NaN -> treat as overflow, use raw string
+    }
+}
+
+/// Parses a float string when float feature is disabled - behavior depends on configuration.
+#[cfg(not(feature = "float"))]
+pub(super) fn parse_float(s: &str) -> Result<NumberResult, ParseError> {
+    #[cfg(feature = "float-error")]
+    {
+        Err(ParseError::FloatNotAllowed)
+    }
+    #[cfg(feature = "float-skip")]
+    {
+        Ok(NumberResult::FloatSkipped)
+    }
+    #[cfg(feature = "float-truncate")]
+    {
+        // Scientific notation (1e3, 2.5e-1) would require float math to evaluate properly.
+        // For embedded targets avoiding float math, we error on scientific notation.
+        if s.contains(['e', 'E']) {
+            return Err(ParseError::InvalidNumber);
+        }
+
+        // Extract integer part before decimal point for simple decimals like 1.5 → 1
+        let int_part = if let Some(dot_pos) = s.find('.') {
+            &s[..dot_pos]
+        } else {
+            s // Should not happen since we detected it's a float, but handle gracefully
+        };
+
+        match ConfiguredInt::from_str(int_part) {
+            Ok(val) => Ok(NumberResult::FloatTruncated(val)),
+            Err(_) => Ok(NumberResult::IntegerOverflow),
+        }
+    }
+    #[cfg(not(any(
+        feature = "float-error",
+        feature = "float-skip",
+        feature = "float-truncate"
+    )))]
+    {
+        Ok(NumberResult::FloatDisabled)
+    }
+}
+
+/// Parses a JSON number from a string slice.
+///
+/// This is the main entry point for parsing numbers with all the configured
+/// behavior (int32/int64, float support, etc.).
+pub(super) fn parse_number_from_str(s: &str) -> Result<NumberResult, ParseError> {
+    if is_integer(s) {
+        Ok(parse_integer(s))
+    } else {
+        // With the float feature, parse_float is infallible; without it, the
+        // configuration-dependent variant already returns a Result.
+        #[cfg(feature = "float")]
+        {
+            Ok(parse_float(s))
+        }
+        #[cfg(not(feature = "float"))]
+        {
+            parse_float(s)
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_json_number_integer() {
+        let number = JsonNumber::Borrowed {
+            raw: "42",
+            parsed: NumberResult::Integer(42),
+        };
+        assert_eq!(number.as_str(), "42");
+        assert_eq!(number.as_int(), Some(42));
+        assert!(number.is_integer());
+        assert!(!number.is_float());
+    }
+
+    #[test]
+    fn test_json_number_negative_integer() {
+        let number = JsonNumber::Borrowed {
+            raw: "-123",
+            parsed: NumberResult::Integer(-123),
+        };
+        assert_eq!(number.as_str(), "-123");
+        assert_eq!(number.as_int(), Some(-123));
+        assert!(number.is_integer());
+    }
+
+    #[test]
+    fn test_json_number_large_integer() {
+        let large_int_str = "12345678901234567890"; // Larger than configured integer max
+        let number = JsonNumber::Borrowed {
+            raw: large_int_str,
+            parsed: NumberResult::IntegerOverflow,
+        };
+        assert_eq!(number.as_str(), large_int_str);
+        assert_eq!(number.as_int(), None); // Should be None due to overflow
+        match number {
+            JsonNumber::Borrowed {
+                parsed: NumberResult::IntegerOverflow,
+                ..
+            } => {}
+            _ => panic!("Expected IntegerOverflow"),
+        }
+        assert!(number.is_integer());
+    }
+
+    // Float-specific accessors are only compiled with the `float` feature.
+    #[test]
+    #[cfg(feature = "float")]
+    fn test_json_number_float() {
+        let number = JsonNumber::Borrowed {
+            raw: "3.14159",
+            parsed: NumberResult::Float(3.14159),
+        };
+        assert_eq!(number.as_str(), "3.14159");
+        assert_eq!(number.as_int(), None);
+        assert_eq!(number.as_f64(), Some(3.14159));
+        assert!(!number.is_integer());
+        assert!(number.is_float());
+    }
+
+    #[test]
+    #[cfg(feature = "float")]
+    fn test_json_number_exponent() {
+        let number = JsonNumber::Borrowed {
+            raw: "1.5e10",
+            parsed: NumberResult::Float(1.5e10),
+        };
+        assert_eq!(number.as_str(), "1.5e10");
+        assert_eq!(number.as_f64(), Some(1.5e10));
+        assert!(number.is_float());
+    }
+
+    #[test]
+    #[cfg(not(feature = "float"))]
+    fn test_json_number_float_disabled() {
+        let number = JsonNumber::Borrowed {
+            raw: "3.14159",
+            parsed: NumberResult::FloatDisabled,
+        };
+        // The raw string stays available even though float parsing is disabled.
+        assert_eq!(number.as_str(), "3.14159");
+        assert_eq!(number.as_int(), None);
+        match number {
+            JsonNumber::Borrowed {
+                parsed: NumberResult::FloatDisabled,
+                ..
+            } => {}
+            _ => panic!("Expected FloatDisabled"),
+        }
+        assert!(number.is_float());
+    }
+
+    #[test]
+    fn test_json_number_parse_custom() {
+        let number = JsonNumber::Borrowed {
+            raw: "42",
+            parsed: NumberResult::Integer(42),
+        };
+        let parsed: u32 = number.parse().unwrap();
+        assert_eq!(parsed, 42u32);
+
+        let float_number = JsonNumber::Borrowed {
+            raw: "3.14",
+            parsed: NumberResult::Integer(3), // Mock for test, would be Float in real usage
+        };
+        // parse() works off the raw string, so f32 parsing succeeds regardless
+        // of the stored NumberResult.
+        let parsed_f32: Result<f32, _> = float_number.parse();
+        assert!(parsed_f32.is_ok());
+    }
+
+    #[test]
+    fn test_is_integer_detection() {
+        assert!(is_integer("42"));
+        assert!(is_integer("-123"));
+        assert!(is_integer("0"));
+        assert!(!is_integer("3.14"));
+        assert!(!is_integer("1e10"));
+        assert!(!is_integer("2.5E-3"));
+    }
+}
diff --git a/stax/src/json_string.rs b/stax/src/json_string.rs
new file mode 100644
index 0000000..0527e7c
--- /dev/null
+++ b/stax/src/json_string.rs
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: Apache-2.0
+
+use core::ops::Deref;
+
+/// Represents a JSON string.
+/// 'a is the lifetime of the original input buffer.
+/// 'b is the lifetime of the scratch buffer.
+#[derive(Debug, PartialEq, Eq)]
+pub enum String<'a, 'b> {
+    /// A raw slice from the original input, used when no un-escaping is needed.
+    Borrowed(&'a str),
+    /// A slice from the scratch buffer, used when a string had to be un-escaped.
+    Unescaped(&'b str),
+}
+
+impl<'a, 'b> String<'a, 'b> {
+    /// Returns the string as a `&str`, whether borrowed or unescaped.
+    pub fn as_str(&self) -> &str {
+        match self {
+            String::Borrowed(s) => s,
+            String::Unescaped(s) => s,
+        }
+    }
+}
+
+impl<'a, 'b> AsRef<str> for String<'a, 'b> {
+    fn as_ref(&self) -> &str {
+        self.as_str()
+    }
+}
+
+impl Deref for String<'_, '_> {
+    type Target = str;
+
+    fn deref(&self) -> &Self::Target {
+        match self {
+            String::Borrowed(s) => s,
+            String::Unescaped(s) => s,
+        }
+    }
+}
+
+impl<'a, 'b> core::fmt::Display for String<'a, 'b> {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        f.write_str(self.as_str())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_json_string_deref() {
+        let borrowed = String::Borrowed("test");
+        assert_eq!(&*borrowed, "test");
+        assert_eq!(borrowed.len(), 4);
+
+        // Test that it works as a string reference
+        fn takes_str(s: &str) -> usize {
+            s.len()
+        }
+        assert_eq!(takes_str(&borrowed), 4);
+    }
+}
diff --git a/stax/src/lib.rs b/stax/src/lib.rs
new file mode 100644
index 0000000..94c0d28
--- /dev/null
+++ b/stax/src/lib.rs
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#![cfg_attr(not(test), no_std)]
+
+// Compile-time configuration validation
+mod config_check;
+
+mod copy_on_escape;
+
+mod escape_processor;
+
+mod direct_buffer;
+
+mod direct_parser;
+
+mod flex_parser;
+
+mod shared;
+pub use shared::{Event, ParseError};
+pub use ujson::BitStackCore;
+
+mod slice_input_buffer;
+
+mod json_number;
+use json_number::parse_number_from_str;
+pub use json_number::{JsonNumber, NumberResult};
+
+mod json_string;
+pub use json_string::String;
+
+mod number_parser;
+
+pub use direct_parser::{DirectParser, Reader};
+pub use flex_parser::{PullParser, PullParserFlex};
+
+// Map low-level input-buffer errors onto the public ParseError type.
+impl From<slice_input_buffer::Error> for ParseError {
+    fn from(err: slice_input_buffer::Error) -> Self {
+        match err {
+            slice_input_buffer::Error::ReachedEnd => ParseError::EndOfData,
+            slice_input_buffer::Error::InvalidSliceBounds => {
+                ParseError::UnexpectedState("Invalid slice bounds in input buffer")
+            }
+        }
+    }
+}
diff --git
a/stax/src/number_parser.rs b/stax/src/number_parser.rs
new file mode 100644
index 0000000..8242bfa
--- /dev/null
+++ b/stax/src/number_parser.rs
@@ -0,0 +1,166 @@
+// SPDX-License-Identifier: Apache-2.0
+
+use crate::shared::{Event, ParseError, ParserErrorHandler};
+use crate::JsonNumber;
+
+/// Trait for extracting number slices from different buffer implementations.
+/// This allows unified number parsing logic between FlexParser and DirectParser.
+pub trait NumberExtractor {
+    /// Extract a slice of bytes representing a number from start to end position.
+    ///
+    /// # Arguments
+    /// * `start` - The starting position of the number (inclusive)
+    /// * `end` - The ending position of the number (exclusive)
+    ///
+    /// # Returns
+    /// A byte slice containing the number content
+    ///
+    /// # Errors
+    /// Returns a `ParseError` when the range is invalid for the underlying buffer.
+    fn get_number_slice(&self, start: usize, end: usize) -> Result<&[u8], ParseError>;
+
+    /// Get the current position in the buffer for end position calculation.
+    fn current_position(&self) -> usize;
+
+    /// Check if the buffer is empty (used for delimiter logic).
+    fn is_empty(&self) -> bool;
+}
+
+/// Unified number parsing logic shared between FlexParser and DirectParser.
+///
+/// This function encapsulates the common pattern:
+/// 1. Extract number slice from buffer
+/// 2. Convert to UTF-8 string
+/// 3. Parse using shared number parsing logic
+/// 4.
Create JsonNumber::Borrowed event
+pub fn parse_number_event<T: NumberExtractor>(
+    extractor: &T,
+    start_pos: usize,
+    from_container_end: bool,
+) -> Result<Event<'_, '_>, ParseError> {
+    let current_pos = extractor.current_position();
+
+    // Determine if we should exclude a delimiter from the number
+    let number_end = if from_container_end || (!extractor.is_empty()) {
+        // Came from container end OR not at EOF - number was terminated by delimiter, exclude it
+        current_pos.saturating_sub(1)
+    } else {
+        // At EOF and not from container end - number wasn't terminated by delimiter, use full span
+        current_pos
+    };
+
+    // Extract number bytes and convert to string
+    let number_bytes = extractor.get_number_slice(start_pos, number_end)?;
+    let number_str = ParserErrorHandler::bytes_to_utf8_str(number_bytes)?;
+
+    // Parse number using shared logic
+    let parsed_result = crate::parse_number_from_str(number_str)?;
+
+    // Create event
+    Ok(Event::Number(JsonNumber::Borrowed {
+        raw: number_str,
+        parsed: parsed_result,
+    }))
+}
+
+/// Simple version for FlexParser that doesn't need container context.
+/// Uses current buffer position as end without delimiter exclusion logic.
+pub fn parse_number_event_simple<T: NumberExtractor>(
+    extractor: &T,
+    start_pos: usize,
+) -> Result<Event<'_, '_>, ParseError> {
+    let current_pos = extractor.current_position();
+
+    // Extract number bytes and convert to string
+    let number_bytes = extractor.get_number_slice(start_pos, current_pos)?;
+    let number_str = ParserErrorHandler::bytes_to_utf8_str(number_bytes)?;
+
+    // Parse number using shared logic
+    let parsed_result = crate::parse_number_from_str(number_str)?;
+
+    // Create event
+    Ok(Event::Number(JsonNumber::Borrowed {
+        raw: number_str,
+        parsed: parsed_result,
+    }))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // Mock extractor for testing
+    struct MockExtractor {
+        data: &'static [u8],
+        position: usize,
+        empty: bool,
+    }
+
+    impl MockExtractor {
+        fn new(data: &'static [u8], position: usize, empty: bool) -> Self {
+            Self {
+                data,
+                position,
+                empty,
+            }
+        }
+    }
+
+    impl NumberExtractor for MockExtractor {
+        fn get_number_slice(&self, start: usize, end: usize) -> Result<&[u8], ParseError> {
+            if end > self.data.len() {
+                return Err(ParserErrorHandler::unexpected_state(
+                    "End position beyond buffer",
+                ));
+            }
+            Ok(&self.data[start..end])
+        }
+
+        fn current_position(&self) -> usize {
+            self.position
+        }
+
+        fn is_empty(&self) -> bool {
+            self.empty
+        }
+    }
+
+    #[test]
+    fn test_parse_number_event_simple() {
+        let data = b"123";
+        let extractor = MockExtractor::new(data, 3, false);
+
+        let result = parse_number_event_simple(&extractor, 0).unwrap();
+        if let Event::Number(num) = result {
+            assert_eq!(num.as_str(), "123");
+            assert_eq!(num.as_int(), Some(123));
+        } else {
+            panic!("Expected Number event");
+        }
+    }
+
+    #[test]
+    fn test_parse_number_event_with_container() {
+        let data = b"456}"; // Number followed by container end
+        let extractor = MockExtractor::new(data, 4, false); // Position after '}'
+
+        let result = parse_number_event(&extractor, 0, true).unwrap();
+        if let Event::Number(num) = result {
+            assert_eq!(num.as_str(), "456"); // Should exclude the '}'
assert_eq!(num.as_int(), Some(456));
+        } else {
+            panic!("Expected Number event");
+        }
+    }
+
+    #[test]
+    fn test_parse_number_event_at_eof() {
+        let data = b"789";
+        let extractor = MockExtractor::new(data, 3, true); // At EOF
+
+        let result = parse_number_event(&extractor, 0, false).unwrap();
+        if let Event::Number(num) = result {
+            assert_eq!(num.as_str(), "789"); // Should include full number
+            assert_eq!(num.as_int(), Some(789));
+        } else {
+            panic!("Expected Number event");
+        }
+    }
+}
diff --git a/stax/src/shared.rs b/stax/src/shared.rs
new file mode 100644
index 0000000..4743ff3
--- /dev/null
+++ b/stax/src/shared.rs
@@ -0,0 +1,251 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/// Shared components for JSON parsers
+use crate::{JsonNumber, String};
+
+/// Events produced by JSON parsers
+/// Lifetimes mirror String/JsonNumber: 'a = original input buffer, 'b = scratch buffer.
+#[derive(Debug, PartialEq)]
+pub enum Event<'a, 'b> {
+    /// The start of an object (e.g., `{`).
+    StartObject,
+    /// The end of an object (e.g., `}`).
+    EndObject,
+    /// The start of an array (e.g., `[`).
+    StartArray,
+    /// The end of an array (e.g., `]`).
+    EndArray,
+    /// An object key (e.g., `"key":`).
+    Key(String<'a, 'b>),
+    /// A string value (e.g., `"value"`).
+    String(String<'a, 'b>),
+    /// A number value (e.g., `42` or `3.14`).
+    Number(JsonNumber<'a, 'b>),
+    /// A boolean value (e.g., `true` or `false`).
+    Bool(bool),
+    /// A null value (e.g., `null`).
+    Null,
+    /// End of the document.
+    EndDocument,
+}
+
+/// Errors that can occur during JSON parsing
+#[derive(Debug, PartialEq)]
+pub enum ParseError {
+    /// An error bubbled up from the underlying tokenizer.
+    TokenizerError,
+    /// The provided scratch buffer was not large enough for an operation.
+    ScratchBufferFull,
+    /// A string slice was not valid UTF-8.
+    InvalidUtf8(core::str::Utf8Error),
+    /// A number string could not be parsed.
+    InvalidNumber,
+    /// The parser entered an unexpected internal state.
+    UnexpectedState(&'static str),
+    /// End of input data.
+    EndOfData,
+    /// Invalid hex digits in Unicode escape sequence.
+    InvalidUnicodeHex,
+    /// Valid hex but invalid Unicode codepoint.
+    InvalidUnicodeCodepoint,
+    /// Invalid escape sequence character.
+    InvalidEscapeSequence,
+    /// Float encountered but float support is disabled and float-error is configured
+    #[cfg(all(not(feature = "float"), feature = "float-error"))]
+    FloatNotAllowed,
+    /// A JSON token was too large to fit in the available buffer space
+    TokenTooLarge {
+        token_size: usize,
+        buffer_size: usize,
+        suggestion: &'static str,
+    },
+    /// End of input stream was reached unexpectedly
+    EndOfStream,
+    /// Error from the underlying reader (I/O error, not end-of-stream)
+    ReaderError,
+}
+
+impl From<core::str::Utf8Error> for ParseError {
+    fn from(err: core::str::Utf8Error) -> Self {
+        ParseError::InvalidUtf8(err)
+    }
+}
+
+/// Internal parser state tracking
+#[derive(Debug, PartialEq)]
+pub enum State {
+    None,
+    Key(usize),
+    String(usize),
+    Number(usize),
+}
+
+/// Parser state and event storage
+pub(super) struct ParserState {
+    pub state: State,
+    // NOTE(review): the element type parameter of this array was lost in
+    // transit (angle-bracket stripping) — restore the original generic
+    // argument (the queued tokenizer event type) from upstream.
+    pub evts: [Option; 2],
+}
+
+impl ParserState {
+    pub fn new() -> Self {
+        Self {
+            state: State::None,
+            evts: core::array::from_fn(|_| None),
+        }
+    }
+}
+
+impl Default for ParserState {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// Utility for calculating common content range boundaries in JSON parsing.
+/// Provides consistent position arithmetic for string/number content extraction.
+// Zero-sized namespace struct; all helpers below are associated functions.
+pub(crate) struct ContentRange;
+
+impl ContentRange {
+    /// Calculate string content boundaries from quote positions
+    ///
+    /// # Arguments
+    /// * `quote_start` - Position of opening quote
+    /// * `current_pos` - Current parser position (typically after closing quote)
+    ///
+    /// # Returns
+    /// (content_start, content_end) where content_start is after opening quote
+    /// and content_end is before closing quote
+    pub fn string_content_bounds(quote_start: usize, current_pos: usize) -> (usize, usize) {
+        let content_start = quote_start + 1; // Skip opening quote
+        let content_end = current_pos.saturating_sub(1); // Back up to exclude closing quote
+        (content_start, content_end)
+    }
+
+    /// Calculate string content boundaries when escape sequence is in progress
+    ///
+    /// # Arguments
+    /// * `quote_start` - Position of opening quote
+    /// * `current_pos` - Current parser position (typically at escape sequence)
+    ///
+    /// # Returns
+    /// (content_start, content_end) where content_end is before the backslash
+    pub fn string_content_bounds_before_escape(
+        quote_start: usize,
+        current_pos: usize,
+    ) -> (usize, usize) {
+        let content_start = quote_start + 1; // Skip opening quote
+        let content_end = current_pos.saturating_sub(2); // Back up to before the backslash
+        (content_start, content_end)
+    }
+
+    /// Calculate number content start from current position
+    ///
+    /// # Arguments
+    /// * `current_pos` - Current parser position (typically after first digit was processed)
+    ///
+    /// # Returns
+    /// Position that includes the first digit of the number
+    pub fn number_start_from_current(current_pos: usize) -> usize {
+        current_pos.saturating_sub(1) // Back up to include first digit
+    }
+
+    /// Calculate quote position from current position
+    /// Used when tokenizer position is after a quote was processed
+    ///
+    /// # Arguments
+    /// * `current_pos` - Current parser position (after quote was processed)
+    ///
+    /// # Returns
+    /// Position of the quote itself
+    pub fn quote_position_from_current(current_pos: usize) -> usize {
+        current_pos.saturating_sub(1) // Back up to the quote
+    }
+
+    /// Calculate Unicode escape sequence boundaries
+    ///
+    /// # Arguments
+    /// * `current_pos` - Current position (after 4 hex digits)
+    ///
+    /// # Returns
+    /// (hex_start, hex_end, escape_start) where hex_start/hex_end bound the XXXX
+    /// and escape_start is the position of the backslash in \uXXXX
+    pub fn unicode_escape_bounds(current_pos: usize) -> (usize, usize, usize) {
+        let hex_start = current_pos.saturating_sub(4); // Start of XXXX
+        let hex_end = current_pos; // End of XXXX
+        let escape_start = current_pos.saturating_sub(6); // Start of \uXXXX
+        (hex_start, hex_end, escape_start)
+    }
+
+    /// Calculate end position for string content in FlexParser style
+    /// Used when the parser position needs to exclude the delimiter
+    ///
+    /// # Arguments
+    /// * `current_pos` - Current parser position
+    ///
+    /// # Returns
+    /// Position excluding the final delimiter
+    pub fn end_position_excluding_delimiter(current_pos: usize) -> usize {
+        current_pos.saturating_sub(1)
+    }
+}
+
+/// Utility for common error handling patterns in JSON parsing.
+/// Provides consistent error creation and UTF-8 validation across parsers.
+// Zero-sized namespace for shared error constructors.
+pub(crate) struct ParserErrorHandler;
+
+impl ParserErrorHandler {
+    /// Convert bytes to UTF-8 string with consistent error handling
+    ///
+    /// # Arguments
+    /// * `bytes` - The byte slice to validate and convert
+    ///
+    /// # Returns
+    /// A UTF-8 string slice or ParseError::InvalidUtf8
+    pub fn bytes_to_utf8_str(bytes: &[u8]) -> Result<&str, ParseError> {
+        core::str::from_utf8(bytes).map_err(ParseError::InvalidUtf8)
+    }
+
+    /// Create an UnexpectedState error with context
+    ///
+    /// # Arguments
+    /// * `context` - Description of what state was unexpected
+    ///
+    /// # Returns
+    /// ParseError::UnexpectedState with the given context
+    pub fn unexpected_state(context: &'static str) -> ParseError {
+        ParseError::UnexpectedState(context)
+    }
+
+    /// Create a state mismatch error for parser state validation
+    ///
+    /// # Arguments
+    /// * `expected` - The expected parser state
+    /// * `operation` - The operation that failed
+    ///
+    /// # Returns
+    /// ParseError::UnexpectedState with formatted message
+    pub fn state_mismatch(expected: &'static str, operation: &'static str) -> ParseError {
+        // Since we can't use format! in no_std, we'll use predefined common patterns
+        match (expected, operation) {
+            ("string", "end") => ParseError::UnexpectedState("String end without String start"),
+            ("key", "end") => ParseError::UnexpectedState("Key end without Key start"),
+            ("number", "extract") => ParseError::UnexpectedState("Not in number state"),
+            ("active", "process") => ParseError::UnexpectedState("Not in active processing state"),
+            _ => ParseError::UnexpectedState("State mismatch"),
+        }
+    }
+
+    /// Create error for invalid Unicode escape sequences
+    pub fn invalid_unicode_escape() -> ParseError {
+        ParseError::InvalidUnicodeHex
+    }
+
+    /// Create error for invalid Unicode escape length
+    pub fn invalid_unicode_length() -> ParseError {
+        ParseError::UnexpectedState("Invalid Unicode escape length")
+    }
+
+    /// Create error for incomplete Unicode escape sequences
+    pub fn incomplete_unicode_escape() -> ParseError {
+        ParseError::UnexpectedState("Incomplete Unicode escape sequence")
+    }
+}
diff --git a/stax/src/slice_input_buffer.rs b/stax/src/slice_input_buffer.rs
new file mode 100644
index 0000000..bb7904f
--- /dev/null
+++ b/stax/src/slice_input_buffer.rs
@@ -0,0 +1,132 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/// Error type for SliceInputBuffer operations.
+#[derive(Debug, PartialEq)]
+pub enum Error {
+    /// Reached the end of input data.
+    ReachedEnd,
+    /// Invalid slice bounds provided.
+    InvalidSliceBounds,
+}
+
+/// A buffer that manages input data and current parsing position.
+/// This encapsulates the data slice and position that are always used together.
+#[derive(Debug)]
+pub struct SliceInputBuffer<'a> {
+    data: &'a [u8],
+    pos: usize,
+}
+
+pub trait InputBuffer {
+    fn is_past_end(&self) -> bool;
+    fn consume_byte(&mut self) -> Result<u8, Error>;
+}
+
+impl<'a> InputBuffer for SliceInputBuffer<'a> {
+    fn is_past_end(&self) -> bool {
+        self.pos > self.data.len()
+    }
+    fn consume_byte(&mut self) -> Result<u8, Error> {
+        if self.pos >= self.data.len() {
+            self.pos += 1; // Still increment position like original logic
+            return Err(Error::ReachedEnd);
+        }
+        let byte = self.data[self.pos];
+        self.pos += 1;
+        Ok(byte)
+    }
+}
+impl<'a> SliceInputBuffer<'a> {
+    pub fn current_pos(&self) -> usize {
+        self.pos
+    }
+    /// Creates a new SliceInputBuffer with the given data.
+    pub fn new(data: &'a [u8]) -> Self {
+        Self { data, pos: 0 }
+    }
+
+    /// Gets a slice of the data from start to end positions, with bounds checking.
+    pub fn slice(&self, start: usize, end: usize) -> Result<&'a [u8], Error> {
+        if start > end || end > self.data.len() {
+            return Err(Error::InvalidSliceBounds);
+        }
+        Ok(&self.data[start..end])
+    }
+}
+
+impl<'a> crate::number_parser::NumberExtractor for SliceInputBuffer<'a> {
+    fn get_number_slice(
+        &self,
+        start: usize,
+        end: usize,
+    ) -> Result<&[u8], crate::shared::ParseError> {
+        self.slice(start, end)
+            .map_err(|_| crate::shared::ParseError::InvalidNumber)
+    }
+
+    fn current_position(&self) -> usize {
+        // FlexParser's position is AFTER the delimiter that ended the number
+        // We need to return the position BEFORE that delimiter for consistent behavior
+        self.pos.saturating_sub(1)
+    }
+
+    fn is_empty(&self) -> bool {
+        self.pos >= self.data.len()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_buffer_boundary_behavior() {
+        let data = b"abc"; // 3 bytes: positions 0, 1, 2 are valid
+        let mut buffer = SliceInputBuffer::new(data);
+
+        // Position 0: start, should have data
+        assert_eq!(buffer.current_pos(), 0);
+        assert!(!buffer.is_past_end(), "pos=0 should not be past end");
assert_eq!(buffer.consume_byte(), Ok(b'a'));
+
+        // Position 1: middle, should have data
+        assert_eq!(buffer.current_pos(), 1);
+        assert!(!buffer.is_past_end(), "pos=1 should not be past end");
+        assert_eq!(buffer.consume_byte(), Ok(b'b'));
+
+        // Position 2: last byte, should have data
+        assert_eq!(buffer.current_pos(), 2);
+        assert!(!buffer.is_past_end(), "pos=2 should not be past end");
+        assert_eq!(buffer.consume_byte(), Ok(b'c'));
+
+        // Position 3: exactly at end (pos == data.len()), no more data
+        assert_eq!(buffer.current_pos(), 3);
+        assert_eq!(
+            buffer.current_pos(),
+            data.len(),
+            "pos should equal data.len()"
+        );
+
+        // INTENTIONAL DESIGN: Different semantics when pos == data.len()
+        // - is_past_end() returns false (parser can still finish processing)
+        // - consume_byte() returns Err (no more bytes to read)
+        // This allows the tokenizer to complete final events (like EndObject)
+        // even when no input bytes remain to be consumed
+        assert!(
+            !buffer.is_past_end(),
+            "pos == data.len() should NOT be past end (allows tokenizer.finish())"
+        );
+        assert!(
+            buffer.consume_byte().is_err(),
+            "consume_byte() should fail when pos == data.len() (no bytes)"
+        );
+
+        // Position 4: past end (pos > data.len()), definitely error
+        assert_eq!(buffer.current_pos(), 4);
+        assert!(buffer.is_past_end(), "pos > data.len() should be past end");
+        assert!(
+            buffer.consume_byte().is_err(),
+            "consume_byte() should fail when pos > data.len()"
+        );
+    }
+}
diff --git a/stax/tests/api_test.rs b/stax/tests/api_test.rs
new file mode 100644
index 0000000..9a47e3f
--- /dev/null
+++ b/stax/tests/api_test.rs
@@ -0,0 +1,135 @@
+// Test the new API entry points
+// `new` = borrowed-only parsing (no scratch buffer); `new_with_buffer` handles escapes.
+
+use stax::{Event, ParseError, PullParser, String};
+
+#[test]
+fn test_new_no_escapes() {
+    let json = r#"{"name": "value", "number": 42, "bool": true}"#;
+    let mut parser = PullParser::new(json);
+
+    // Should parse successfully since there are no escapes
+    // Events: StartObject, Key, String, Key, Number, Key, Bool,
EndObject
+    assert_eq!(parser.next_event(), Ok(Event::StartObject));
+    assert_eq!(
+        parser.next_event(),
+        Ok(Event::Key(String::Borrowed("name")))
+    );
+    assert_eq!(
+        parser.next_event(),
+        Ok(Event::String(String::Borrowed("value")))
+    );
+    // Skip to end for brevity
+    let mut remaining_count = 0;
+    loop {
+        match parser.next_event() {
+            Ok(Event::EndDocument) => break,
+            Ok(_) => remaining_count += 1,
+            Err(e) => panic!("Parse error: {:?}", e),
+        }
+    }
+    assert_eq!(remaining_count, 5); // Key, Number, Key, Bool, EndObject
+}
+
+#[test]
+fn test_new_with_escapes_fails() {
+    let json = r#"{"message": "Hello\nWorld"}"#; // Contains escape sequence
+    let mut parser = PullParser::new(json);
+
+    // Should parse until it hits the escape
+    assert_eq!(parser.next_event(), Ok(Event::StartObject));
+    assert_eq!(
+        parser.next_event(),
+        Ok(Event::Key(String::Borrowed("message")))
+    );
+
+    // Should fail on the escaped string
+    // (ScratchBufferFull: `new` provides no scratch space for unescaping)
+    match parser.next_event() {
+        Err(ParseError::ScratchBufferFull) => {
+            // Expected behavior
+        }
+        other => panic!("Expected ScratchBufferFull error, got: {:?}", other),
+    }
+}
+
+#[test]
+fn test_new_with_buffer_handles_escapes() {
+    let json = r#"{"message": "Hello\nWorld"}"#;
+    let mut scratch = [0u8; 1024];
+    let mut parser = PullParser::new_with_buffer(json, &mut scratch);
+
+    // Should parse successfully with escape handling
+    assert_eq!(parser.next_event(), Ok(Event::StartObject));
+    assert_eq!(
+        parser.next_event(),
+        Ok(Event::Key(String::Borrowed("message")))
+    );
+
+    // The escaped string should be unescaped
+    match parser.next_event() {
+        Ok(Event::String(String::Unescaped(s))) => {
+            assert_eq!(s, "Hello\nWorld");
+        }
+        other => panic!("Expected unescaped string, got: {:?}", other),
+    }
+
+    assert_eq!(parser.next_event(), Ok(Event::EndObject));
+    assert_eq!(parser.next_event(), Ok(Event::EndDocument));
+}
+
+#[test]
+fn test_new_with_numbers_and_arrays() {
+    #[cfg(feature = "float-error")]
+    let json = r#"[1, 2, true, false, null]"#;
// No floats for float-error config + #[cfg(not(feature = "float-error"))] + let json = r#"[1, 2.5, true, false, null]"#; // Include float for other configs + + let mut parser = PullParser::new(json); + + // Should handle all basic types without issues + assert_eq!(parser.next_event(), Ok(Event::StartArray)); + assert!(matches!(parser.next_event(), Ok(Event::Number(_)))); + assert!(matches!(parser.next_event(), Ok(Event::Number(_)))); + assert_eq!(parser.next_event(), Ok(Event::Bool(true))); + assert_eq!(parser.next_event(), Ok(Event::Bool(false))); + assert_eq!(parser.next_event(), Ok(Event::Null)); + assert_eq!(parser.next_event(), Ok(Event::EndArray)); + assert_eq!(parser.next_event(), Ok(Event::EndDocument)); +} + +#[test] +fn test_mixed_string_types() { + let json = r#"{"simple": "no_escapes", "complex": "with\tescapes"}"#; + let mut scratch = [0u8; 1024]; + let mut parser = PullParser::new_with_buffer(json, &mut scratch); + + // Events: StartObject, Key("simple"), String("no_escapes"), Key("complex"), String("with\tescapes"), EndObject + assert_eq!(parser.next_event(), Ok(Event::StartObject)); + assert_eq!( + parser.next_event(), + Ok(Event::Key(String::Borrowed("simple"))) + ); + + // First string should be borrowed (no escapes) + match parser.next_event() { + Ok(Event::String(String::Borrowed(s))) => { + assert_eq!(s, "no_escapes"); + } + other => panic!("Expected borrowed string, got: {:?}", other), + } + + assert_eq!( + parser.next_event(), + Ok(Event::Key(String::Borrowed("complex"))) + ); + + // Second string should be unescaped (has escapes) + match parser.next_event() { + Ok(Event::String(String::Unescaped(s))) => { + assert_eq!(s, "with\tescapes"); + } + other => panic!("Expected unescaped string, got: {:?}", other), + } + + assert_eq!(parser.next_event(), Ok(Event::EndObject)); + assert_eq!(parser.next_event(), Ok(Event::EndDocument)); +} diff --git a/stax/tests/api_test_errors.rs b/stax/tests/api_test_errors.rs new file mode 100644 index 
0000000..e6e490d --- /dev/null +++ b/stax/tests/api_test_errors.rs @@ -0,0 +1,212 @@ +// Additional error handling tests for the API + +use stax::{Event, ParseError, PullParser, String}; + +#[test] +fn test_malformed_json_missing_quotes() { + let json = r#"{name: "value"}"#; // Missing quotes around key + let mut parser = PullParser::new(json); + + assert_eq!(parser.next_event(), Ok(Event::StartObject)); + + // Should fail when parsing the unquoted key + match parser.next_event() { + Err(ParseError::TokenizerError) => { + // Expected - tokenizer should reject unquoted keys + } + other => panic!("Expected TokenizerError for unquoted key, got: {:?}", other), + } +} + +#[test] +fn test_malformed_json_unterminated_string() { + let json = r#"{"unterminated": "missing quote}"#; // Missing closing quote + let mut parser = PullParser::new(json); + + assert_eq!(parser.next_event(), Ok(Event::StartObject)); + assert_eq!( + parser.next_event(), + Ok(Event::Key(String::Borrowed("unterminated"))) + ); + + // Should fail when trying to parse the unterminated string + match parser.next_event() { + Err(ParseError::TokenizerError) => { + // Expected behavior + } + other => panic!( + "Expected TokenizerError for unterminated string, got: {:?}", + other + ), + } +} + +#[test] +fn test_malformed_json_invalid_escape() { + let json = r#"{"bad_escape": "invalid\x"}"#; // Invalid escape sequence + let mut scratch = [0u8; 1024]; + let mut parser = PullParser::new_with_buffer(json, &mut scratch); + + assert_eq!(parser.next_event(), Ok(Event::StartObject)); + assert_eq!( + parser.next_event(), + Ok(Event::Key(String::Borrowed("bad_escape"))) + ); + + // Should fail on invalid escape sequence + match parser.next_event() { + Err(ParseError::InvalidEscapeSequence) => { + // Expected behavior + } + Err(ParseError::TokenizerError) => { + // Also acceptable - tokenizer might catch this first + } + other => panic!("Expected escape sequence error, got: {:?}", other), + } +} + +#[test] +fn 
test_malformed_json_invalid_unicode_escape() { + let json = r#"{"bad_unicode": "test\uXYZ"}"#; // Invalid Unicode hex + let mut scratch = [0u8; 1024]; + let mut parser = PullParser::new_with_buffer(json, &mut scratch); + + assert_eq!(parser.next_event(), Ok(Event::StartObject)); + assert_eq!( + parser.next_event(), + Ok(Event::Key(String::Borrowed("bad_unicode"))) + ); + + // Should fail on invalid Unicode escape + match parser.next_event() { + Err(ParseError::InvalidUnicodeHex) => { + // Expected behavior + } + Err(ParseError::TokenizerError) => { + // Also acceptable - tokenizer might catch this first + } + other => panic!("Expected Unicode hex error, got: {:?}", other), + } +} + +#[test] +fn test_buffer_overflow_error() { + let json = r#"{"large_string": "This is a very long string with escapes\nand more escapes\tand even more content that might overflow a small buffer"}"#; + let mut small_scratch = [0u8; 10]; // Deliberately small buffer + let mut parser = PullParser::new_with_buffer(json, &mut small_scratch); + + assert_eq!(parser.next_event(), Ok(Event::StartObject)); + assert_eq!( + parser.next_event(), + Ok(Event::Key(String::Borrowed("large_string"))) + ); + + // Should fail due to insufficient scratch buffer space + match parser.next_event() { + Err(ParseError::ScratchBufferFull) => { + // Expected behavior + } + other => panic!("Expected ScratchBufferFull error, got: {:?}", other), + } +} + +#[test] +fn test_empty_input_error() { + let json = ""; + let mut parser = PullParser::new(json); + + // Should handle empty input gracefully + match parser.next_event() { + Ok(Event::EndDocument) => { + // This is acceptable - empty input could be treated as end + } + Err(ParseError::EndOfData) => { + // This is also acceptable + } + Err(ParseError::TokenizerError) => { + // This is also acceptable + } + other => panic!("Unexpected result for empty input: {:?}", other), + } +} + +#[test] +fn test_incomplete_json_error() { + let json = r#"{"incomplete""#; // 
Incomplete JSON + let mut parser = PullParser::new(json); + + assert_eq!(parser.next_event(), Ok(Event::StartObject)); + + // Actually parses the key since it's well-formed so far + assert_eq!( + parser.next_event(), + Ok(Event::Key(String::Borrowed("incomplete"))) + ); + + // Should fail when trying to find the value or colon + match parser.next_event() { + Err(ParseError::TokenizerError) => { + // Expected behavior when tokenizer hits end unexpectedly + } + Err(ParseError::EndOfData) => { + // Also acceptable + } + Ok(Event::EndDocument) => { + // Parser might be lenient and treat as end + } + other => panic!( + "Expected error or EndDocument for incomplete JSON, got: {:?}", + other + ), + } +} + +#[test] +fn test_malformed_json_unexpected_comma() { + let json = r#"{"key": "value",}"#; // Trailing comma + let mut parser = PullParser::new(json); + + assert_eq!(parser.next_event(), Ok(Event::StartObject)); + assert_eq!(parser.next_event(), Ok(Event::Key(String::Borrowed("key")))); + assert_eq!( + parser.next_event(), + Ok(Event::String(String::Borrowed("value"))) + ); + + // Parser is lenient with trailing commas, just ends the object + match parser.next_event() { + Ok(Event::EndObject) => { + // Parser accepts trailing comma (lenient behavior) + } + Err(ParseError::TokenizerError) => { + // Strict parser would reject trailing comma + } + other => panic!( + "Expected EndObject or TokenizerError for trailing comma, got: {:?}", + other + ), + } +} + +#[test] +fn test_malformed_json_invalid_number() { + let json = r#"{"number": 123.456.789}"#; // Invalid number format + let mut parser = PullParser::new(json); + + assert_eq!(parser.next_event(), Ok(Event::StartObject)); + assert_eq!( + parser.next_event(), + Ok(Event::Key(String::Borrowed("number"))) + ); + + // Should fail on invalid number format + match parser.next_event() { + Err(ParseError::TokenizerError) => { + // Expected behavior + } + other => panic!( + "Expected TokenizerError for invalid number, got: {:?}", 
+ other + ), + } +} diff --git a/stax/tests/configurable_numbers.rs b/stax/tests/configurable_numbers.rs new file mode 100644 index 0000000..9e5e362 --- /dev/null +++ b/stax/tests/configurable_numbers.rs @@ -0,0 +1,251 @@ +// Comprehensive tests for configurable number handling +// These tests demonstrate the various compilation configurations + +use stax::{Event, NumberResult, PullParser}; + +#[test] +#[cfg(feature = "int32")] +fn test_int32_overflow() { + let input = r#"{"value": 9999999999}"#; // Larger than i32::MAX (2,147,483,647) + let mut scratch = [0u8; 1024]; + let mut parser = PullParser::new_with_buffer(input, &mut scratch); + + assert!(matches!(parser.next_event(), Ok(Event::StartObject))); + assert!(matches!(parser.next_event(), Ok(Event::Key(_)))); + + match parser.next_event() { + Ok(Event::Number(num)) => { + assert_eq!(num.as_str(), "9999999999"); + assert!(matches!(num.parsed(), NumberResult::IntegerOverflow)); + assert_eq!(num.as_int(), None); // Too large for i32 + } + other => panic!("Expected Number, got: {:?}", other), + } +} + +#[test] +#[cfg(feature = "int64")] +fn test_int64_handles_large_numbers() { + let input = r#"{"value": 9999999999}"#; // Within i64 range + let mut scratch = [0u8; 1024]; + let mut parser = PullParser::new_with_buffer(input, &mut scratch); + + assert!(matches!(parser.next_event(), Ok(Event::StartObject))); + assert!(matches!(parser.next_event(), Ok(Event::Key(_)))); + + match parser.next_event() { + Ok(Event::Number(num)) => { + assert_eq!(num.as_str(), "9999999999"); + assert!(matches!(num.parsed(), NumberResult::Integer(9999999999))); + assert_eq!(num.as_int(), Some(9999999999)); + } + other => panic!("Expected Number, got: {:?}", other), + } +} + +#[test] +#[cfg(all(not(feature = "float"), feature = "float-error"))] +fn test_float_error_behavior() { + let input = r#"{"value": 3.14}"#; + let mut scratch = [0u8; 1024]; + let mut parser = PullParser::new_with_buffer(input, &mut scratch); + + // Should parse normally 
until we hit the float + assert!(matches!(parser.next_event(), Ok(Event::StartObject))); + assert!(matches!(parser.next_event(), Ok(Event::Key(_)))); + + // Float should cause an error + match parser.next_event() { + Err(stax::ParseError::FloatNotAllowed) => { + // Expected behavior - test passes + } + other => panic!("Expected FloatNotAllowed error, got: {:?}", other), + } +} + +#[test] +#[cfg(all(not(feature = "float"), feature = "float-truncate", feature = "int32"))] +fn test_float_truncate_to_i32() { + let input = r#"[1.7, 2.9, 3.1]"#; + let mut scratch = [0u8; 1024]; + let mut parser = PullParser::new_with_buffer(input, &mut scratch); + + assert!(matches!(parser.next_event(), Ok(Event::StartArray))); + + // 1.7 -> 1 + match parser.next_event() { + Ok(Event::Number(num)) => { + assert_eq!(num.as_str(), "1.7"); + assert!(matches!(num.parsed(), NumberResult::FloatTruncated(1))); + assert_eq!(num.as_int(), Some(1)); + } + other => panic!("Expected truncated Number(1), got: {:?}", other), + } + + // 2.9 -> 2 + match parser.next_event() { + Ok(Event::Number(num)) => { + assert_eq!(num.as_str(), "2.9"); + assert!(matches!(num.parsed(), NumberResult::FloatTruncated(2))); + assert_eq!(num.as_int(), Some(2)); + } + other => panic!("Expected truncated Number(2), got: {:?}", other), + } + + // 3.1 -> 3 + match parser.next_event() { + Ok(Event::Number(num)) => { + assert_eq!(num.as_str(), "3.1"); + assert!(matches!(num.parsed(), NumberResult::FloatTruncated(3))); + assert_eq!(num.as_int(), Some(3)); + } + other => panic!("Expected truncated Number(3), got: {:?}", other), + } + + assert!(matches!(parser.next_event(), Ok(Event::EndArray))); +} + +#[test] +#[cfg(all( + not(feature = "float"), + feature = "float-truncate", + not(feature = "int32") +))] +fn test_float_truncate_to_i64() { + let input = r#"[1.7, 2.9, 3.1]"#; + let mut scratch = [0u8; 1024]; + let mut parser = PullParser::new_with_buffer(input, &mut scratch); + + assert!(matches!(parser.next_event(), 
Ok(Event::StartArray))); + + // Should truncate to i64 values + match parser.next_event() { + Ok(Event::Number(num)) => { + assert_eq!(num.as_str(), "1.7"); + assert!(matches!(num.parsed(), NumberResult::FloatTruncated(1i64))); + } + other => panic!("Expected truncated Number, got: {:?}", other), + } +} + +#[test] +#[cfg(all(not(feature = "float"), feature = "float-truncate"))] +fn test_float_truncate_scientific_notation() { + let input = r#"{"value": 1.5e2}"#; // Scientific notation should error in truncate mode + let mut scratch = [0u8; 1024]; + let mut parser = PullParser::new_with_buffer(input, &mut scratch); + + assert!(matches!(parser.next_event(), Ok(Event::StartObject))); + assert!(matches!(parser.next_event(), Ok(Event::Key(_)))); + + // Scientific notation should cause InvalidNumber error to avoid float math + match parser.next_event() { + Err(stax::ParseError::InvalidNumber) => { + // Expected behavior - test passes + } + other => panic!( + "Expected InvalidNumber error for scientific notation, got: {:?}", + other + ), + } +} + +#[test] +#[cfg(all( + not(feature = "float"), + feature = "int64", + not(any( + feature = "float-error", + feature = "float-skip", + feature = "float-truncate" + )) +))] +fn test_default_float_disabled_behavior() { + let input = r#"{"value": 3.14}"#; + let mut scratch = [0u8; 1024]; + let mut parser = PullParser::new_with_buffer(input, &mut scratch); + + assert!(matches!(parser.next_event(), Ok(Event::StartObject))); + assert!(matches!(parser.next_event(), Ok(Event::Key(_)))); + + match parser.next_event() { + Ok(Event::Number(num)) => { + assert_eq!(num.as_str(), "3.14"); + assert!(matches!(num.parsed(), NumberResult::FloatDisabled)); + assert_eq!(num.as_int(), None); + + // Raw string should still be available for manual parsing + assert_eq!(num.as_str(), "3.14"); + let manual_parse: Result = num.parse(); + assert!(manual_parse.is_ok()); + } + other => panic!("Expected Number with FloatDisabled, got: {:?}", other), + } +} + 
+#[test] +#[cfg(feature = "int32")] +fn test_mixed_numbers_with_i32() { + let input = r#"{"small": 42, "large": 999999999999}"#; // large > i32::MAX + let mut scratch = [0u8; 1024]; + let mut parser = PullParser::new_with_buffer(input, &mut scratch); + + assert!(matches!(parser.next_event(), Ok(Event::StartObject))); + assert!(matches!(parser.next_event(), Ok(Event::Key(_)))); + + // Small number should parse fine + match parser.next_event() { + Ok(Event::Number(num)) => { + assert_eq!(num.as_str(), "42"); + assert!(matches!(num.parsed(), NumberResult::Integer(42))); + assert_eq!(num.as_int(), Some(42)); + } + other => panic!("Expected Number(42), got: {:?}", other), + } + + assert!(matches!(parser.next_event(), Ok(Event::Key(_)))); + + // Large number should overflow + match parser.next_event() { + Ok(Event::Number(num)) => { + assert_eq!(num.as_str(), "999999999999"); + assert!(matches!(num.parsed(), NumberResult::IntegerOverflow)); + assert_eq!(num.as_int(), None); + + // But raw string is still available + assert_eq!(num.as_str(), "999999999999"); + } + other => panic!("Expected Number with overflow, got: {:?}", other), + } +} + +// This test ensures the library compiles and works with the most restrictive embedded configuration +#[test] +#[cfg(all(feature = "int32", not(feature = "float"), feature = "float-error"))] +fn test_embedded_friendly_config() { + // This configuration uses: + // - i32 integers (no 64-bit math) + // - No float support + // - Error on floats (fail fast) + + let input = r#"{"sensor": 42, "status": 1}"#; + let mut scratch = [0u8; 256]; // Small buffer for embedded + let mut parser = PullParser::new_with_buffer(input, &mut scratch); + + // Should parse integers normally + assert!(matches!(parser.next_event(), Ok(Event::StartObject))); + assert!(matches!(parser.next_event(), Ok(Event::Key(_)))); + + match parser.next_event() { + Ok(Event::Number(num)) => { + assert_eq!(num.as_str(), "42"); + assert!(matches!(num.parsed(), 
NumberResult::Integer(42i32))); + assert_eq!(num.as_int(), Some(42i32)); + } + other => panic!("Expected Number(42), got: {:?}", other), + } + + assert!(matches!(parser.next_event(), Ok(Event::Key(_)))); + assert!(matches!(parser.next_event(), Ok(Event::Number(_)))); + assert!(matches!(parser.next_event(), Ok(Event::EndObject))); +} diff --git a/stax/tests/debug_root_numbers.rs b/stax/tests/debug_root_numbers.rs new file mode 100644 index 0000000..8283b0b --- /dev/null +++ b/stax/tests/debug_root_numbers.rs @@ -0,0 +1,48 @@ +// Debug root-level number parsing issue +use stax::{Event, PullParser}; + +fn test_json(input: &str, description: &str) { + println!("\n=== Testing: {} ===", description); + println!("Input: '{}'", input); + + let mut scratch = [0u8; 1024]; + let mut parser = PullParser::new_with_buffer(input, &mut scratch); + + let mut event_count = 0; + loop { + match parser.next_event() { + Ok(event) => { + event_count += 1; + println!("Event {}: {:?}", event_count, event); + if matches!(event, Event::EndDocument) { + break; + } + if event_count > 10 { + println!("Too many events, stopping..."); + break; + } + } + Err(e) => { + println!("Error: {:?}", e); + break; + } + } + } + println!("Total events: {}", event_count); +} + +#[test] +fn debug_root_level_numbers() { + // Test root-level primitives + test_json("42", "Root number"); + test_json(r#""hello""#, "Root string"); + test_json("true", "Root boolean true"); + test_json("false", "Root boolean false"); + test_json("null", "Root null"); + + // Compare with structured JSON + test_json(r#"{"value": 42}"#, "Small number in object"); + test_json(r#"{"value": 9999999999}"#, "Large number in object"); + test_json("[42]", "Small number in array"); + test_json("[9999999999]", "Large number in array"); +} diff --git a/tokenizer/Cargo.toml b/tokenizer/Cargo.toml new file mode 100644 index 0000000..795c37e --- /dev/null +++ b/tokenizer/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "ujson" +version = "0.1.0" 
+edition = "2021" + +[dependencies] +clap = { version = "4.5.31", optional = true, features = ["derive"] } +log = "0.4.26" + +[dev-dependencies] +env_logger = "0.11.3" +test-log = { version = "0.2.14", features = ["trace"] } + +[features] +clap = ["dep:clap"] diff --git a/tokenizer/README.md b/tokenizer/README.md new file mode 100644 index 0000000..6219d8b --- /dev/null +++ b/tokenizer/README.md @@ -0,0 +1,3 @@ +Non-recursive JSON stream tokenizer. + +It uses 1 bit per nesting depth to track whether the level represents an array [] or an object {} diff --git a/tokenizer/src/bin/main.rs b/tokenizer/src/bin/main.rs new file mode 100644 index 0000000..8087f49 --- /dev/null +++ b/tokenizer/src/bin/main.rs @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: Apache-2.0 + +use std::env; +use std::fs::File; +use std::io::Read; +//use std::process; + +fn main() { + println!("Hello, world!"); + + let args: Vec<_> = env::args().collect(); + if args.len() != 2 { + println!("Usage: {} file.json", args[0]); + std::process::exit(1); + } + let path = &args[1]; + let mut s = String::new(); + let mut f = match File::open(path) { + Ok(file) => file, + Err(e) => { + eprintln!("Error: Unable to open file '{}': {}", path, e); + std::process::exit(1); + } + }; + + match f.read_to_string(&mut s) { + Err(e) => { + eprintln!("Error: Unable to read file '{}': {}", path, e); + std::process::exit(1); + } + Ok(_) => println!("{}", s), + } + + let mut parser = ujson::Tokenizer::::new(); + match parser.parse_full(s.as_bytes(), &mut |_, _| {}) { + Err(e) => { + eprintln!("Error: JSON parsing failed: {:?}", e); + std::process::exit(1); + } + Ok(_) => std::process::exit(0), + }; +} diff --git a/tokenizer/src/bitstack/mod.rs b/tokenizer/src/bitstack/mod.rs new file mode 100644 index 0000000..6f82b94 --- /dev/null +++ b/tokenizer/src/bitstack/mod.rs @@ -0,0 +1,185 @@ +// SPDX-License-Identifier: Apache-2.0 + +use core::cmp::PartialEq; +use core::ops::{BitAnd, Shl, Shr}; + +pub trait BitStack { + fn 
default() -> Self; + /// Pushes a bit (true for 1, false for 0) onto the stack. + fn push(&mut self, bit: bool); + /// Pops the top bit off the stack, returning it if the stack isn’t empty. + fn pop(&mut self) -> bool; + /// Returns the top bit without removing it. + fn top(&self) -> bool; +} + +impl BitStack for T +where + T: Shl + + Shr + + BitAnd + + core::ops::BitOr + + PartialEq + + Clone, + T: From, // To create 0 and 1 constants +{ + fn default() -> Self { + T::from(0) + } + fn push(&mut self, bit: bool) { + *self = (self.clone() << 1u8) | T::from(bit as u8); + } + + fn pop(&mut self) -> bool { + let bit = (self.clone() & T::from(1)) != T::from(0); + *self = self.clone() >> 1u8; + bit + } + + fn top(&self) -> bool { + (self.clone() & T::from(1)) != T::from(0) + } +} + +// Newtype wrapper for arrays to implement BitStack trait +// Provides large BitStack storage using multiple elements +#[derive(Debug)] +pub struct ArrayBitStack(pub [T; N]); + +impl BitStack for ArrayBitStack +where + T: Shl + + Shr + + BitAnd + + core::ops::BitOr + + PartialEq + + Clone + + From, +{ + fn default() -> Self { + ArrayBitStack(core::array::from_fn(|_| T::from(0))) + } + + fn push(&mut self, bit: bool) { + // Strategy: Use array as big-endian storage, with leftmost element as most significant + // Shift all elements left, carrying overflow from right to left + let bit_val = T::from(bit as u8); + let mut carry = bit_val; + let element_bits = (core::mem::size_of::() * 8) as u8; + let msb_shift = element_bits - 1; + + // Start from the rightmost (least significant) element and work left + for i in (0..N).rev() { + let old_msb = (self.0[i].clone() >> msb_shift) & T::from(1); // Extract MSB that will be lost + self.0[i] = (self.0[i].clone() << 1u8) | carry; + carry = old_msb; + } + // Note: carry from leftmost element is discarded (overflow) + } + + fn pop(&mut self) -> bool { + // Extract rightmost bit from least significant element + let bit = (self.0[N - 1].clone() & T::from(1)) != 
T::from(0); + + // Shift all elements right, carrying underflow from left to right + let mut carry = T::from(0); + let element_bits = (core::mem::size_of::() * 8) as u8; + let msb_shift = element_bits - 1; + + // Start from the leftmost (most significant) element and work right + for i in 0..N { + let old_lsb = self.0[i].clone() & T::from(1); // Extract LSB that will be lost + self.0[i] = (self.0[i].clone() >> 1u8) | (carry << msb_shift); + carry = old_lsb; + } + + bit + } + + fn top(&self) -> bool { + // Return rightmost bit from least significant element without modifying + (self.0[N - 1].clone() & T::from(1)) != T::from(0) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_bitstack() { + let mut bitstack = 0; + bitstack.push(true); + bitstack.push(false); + assert_eq!(bitstack.pop(), false); + assert_eq!(bitstack.pop(), true); + } + + #[test] + fn test_array_bitstack() { + // Test ArrayBitStack with 2 u8 elements (16-bit total capacity) + let mut bitstack: ArrayBitStack<2, u8> = ArrayBitStack::default(); + + // Test basic push/pop operations + bitstack.push(true); + bitstack.push(false); + bitstack.push(true); + + // Verify top() doesn't modify stack + assert_eq!(bitstack.top(), true); + assert_eq!(bitstack.top(), true); + + // Verify LIFO order + assert_eq!(bitstack.pop(), true); + assert_eq!(bitstack.pop(), false); + assert_eq!(bitstack.pop(), true); + } + + #[test] + fn test_array_bitstack_large_capacity() { + // Test larger ArrayBitStack (320-bit capacity with 10 u32 elements) + let mut bitstack: ArrayBitStack<10, u32> = ArrayBitStack::default(); + + // Push many bits to test multi-element handling + let pattern = [true, false, true, true, false, false, true, false]; + for &bit in &pattern { + bitstack.push(bit); + } + + // Pop and verify reverse order (LIFO) + for &expected in pattern.iter().rev() { + assert_eq!(bitstack.pop(), expected); + } + } + + #[test] + fn test_element_size_handling() { + // Test that bitstack correctly handles 
different element sizes + + // Test u8 elements (8-bit each) + let mut bitstack_u8: ArrayBitStack<1, u8> = ArrayBitStack::default(); + + // Fill all 8 bits of a u8 element + for i in 0..8 { + bitstack_u8.push(i % 2 == 0); // alternating pattern: true, false, true, false... + } + + // Verify we can retrieve all 8 bits in LIFO order + for i in (0..8).rev() { + assert_eq!(bitstack_u8.pop(), i % 2 == 0); + } + + // Test u32 elements (32-bit each) + let mut bitstack_u32: ArrayBitStack<1, u32> = ArrayBitStack::default(); + + // Fill all 32 bits of a u32 element + for i in 0..32 { + bitstack_u32.push(i % 3 == 0); // pattern: true, false, false, true, false, false... + } + + // Verify we can retrieve all 32 bits in LIFO order + for i in (0..32).rev() { + assert_eq!(bitstack_u32.pop(), i % 3 == 0); + } + } +} diff --git a/tokenizer/src/lib.rs b/tokenizer/src/lib.rs new file mode 100644 index 0000000..79fa504 --- /dev/null +++ b/tokenizer/src/lib.rs @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: Apache-2.0 + +#![cfg_attr(not(test), no_std)] + +pub mod bitstack; +pub use bitstack::BitStack; +mod tokenizer; + +pub use tokenizer::Tokenizer; +pub use tokenizer::{Event, EventToken}; + +/// Trait that combines all the required trait bounds for depth counter types. +/// This is automatically implemented for any type that satisfies the individual bounds. 
+pub trait BitStackCore: + From + + core::cmp::PartialEq + + core::ops::AddAssign + + core::ops::SubAssign + + core::ops::Not + + core::fmt::Debug +{ +} + +impl BitStackCore for T where + T: From + + core::cmp::PartialEq + + core::ops::AddAssign + + core::ops::SubAssign + + core::ops::Not + + core::fmt::Debug +{ +} diff --git a/tokenizer/src/tokenizer/mod.rs b/tokenizer/src/tokenizer/mod.rs new file mode 100644 index 0000000..709e38d --- /dev/null +++ b/tokenizer/src/tokenizer/mod.rs @@ -0,0 +1,2197 @@ +// SPDX-License-Identifier: Apache-2.0 + +use crate::bitstack::BitStack; +use crate::BitStackCore; + +#[derive(Debug, Clone)] +struct ParseContext { + /// Keeps track of the depth of the object/array + depth: D, + /// Keeps track of the stack of objects/arrays + stack: T, + /// Keeps track of the last comma and its position + after_comma: Option<(u8, usize)>, +} + +impl ParseContext { + // We can expect an unsigned with From requirement + // So this math usually works + fn max_depth() -> D { + D::from(0u8).not() + } + fn new() -> Self { + ParseContext { + depth: 0u8.into(), + stack: T::default(), + after_comma: None, + } + } + fn enter_object(&mut self, data: u8, pos: usize) -> Result<(), Error> { + if self.depth == Self::max_depth() { + return Error::new(ErrKind::MaxDepthReached, data, pos); + } + self.stack.push(true); + self.depth += 1u8.into(); + Ok(()) + } + fn exit_object(&mut self, pos: usize) -> Result<(), Error> { + if self.depth == 0u8.into() { + return Error::new(ErrKind::UnopenedObject, b'}', pos); + } + self.stack.pop(); + self.depth -= 1u8.into(); + Ok(()) + } + fn enter_array(&mut self, data: u8, pos: usize) -> Result<(), Error> { + if self.depth == Self::max_depth() { + return Error::new(ErrKind::MaxDepthReached, data, pos); + } + self.stack.push(false); + self.depth += 1u8.into(); + Ok(()) + } + fn exit_array(&mut self, pos: usize) -> Result<(), Error> { + if self.depth == 0u8.into() { + return Error::new(ErrKind::UnopenedArray, b']', pos); + } + 
self.stack.pop(); + self.depth -= 1u8.into(); + Ok(()) + } + fn is_object(&self) -> bool { + if self.depth == 0u8.into() { + return false; + } + self.stack.top() + } + fn is_array(&self) -> bool { + if self.depth == 0u8.into() { + return false; + } + !self.stack.top() + } +} + +#[derive(Debug, Clone)] +enum State { + Idle, + String { state: String, key: bool }, + Number { state: Num }, + Token { token: Token }, + Object { expect: Object }, + Array { expect: Array }, + Finished, +} + +#[derive(Debug, Clone)] +enum String { + Normal, + Escaping, + Unicode0, // Just tracks number of hex digits seen (0-3) + Unicode1, + Unicode2, + Unicode3, +} + +#[derive(Debug, Clone)] +enum Num { + Sign, + LeadingZero, + BeforeDecimalPoint, + Decimal, + AfterDecimalPoint, + Exponent, + ExponentSign, + AfterExponent, +} + +#[derive(Debug, Clone)] +enum True { + R, + U, + E, +} +#[derive(Debug, Clone)] +enum False { + A, + L, + S, + E, +} +#[derive(Debug, Clone)] +enum Null { + U, + L1, + L2, +} + +#[derive(Debug, Clone)] +enum Token { + True(True), + False(False), + Null(Null), +} + +#[derive(Debug, Clone, PartialEq)] +enum Object { + Key, + Colon, + Value, + CommaOrEnd, +} + +#[derive(Debug, Clone, PartialEq)] +enum Array { + ItemOrEnd, + CommaOrEnd, +} + +#[derive(Debug, Clone, PartialEq)] +pub enum EventToken { + True, + False, + Null, + String, + Key, + Number, + NumberAndArray, // used for closing arrays after numbers + NumberAndObject, // used for closing objects after numbers + UnicodeEscape, + EscapeSequence, // emitted when \ is encountered (start of any escape) + // Simple escape sequences + EscapeQuote, // \" + EscapeBackslash, // \\ + EscapeSlash, // \/ + EscapeBackspace, // \b + EscapeFormFeed, // \f + EscapeNewline, // \n + EscapeCarriageReturn, // \r + EscapeTab, // \t +} + +// todo: expose number events: sign, decimal, fraction, exponent +// update when a part of number has finished tokenizing ? 
+ +#[derive(Debug, Clone, PartialEq)] +pub enum Event { + Begin(EventToken), + End(EventToken), + ObjectStart, + ObjectEnd, + ArrayStart, + ArrayEnd, + #[cfg(test)] + Uninitialized, +} + +pub struct Tokenizer { + state: State, + total_consumed: usize, + context: ParseContext, +} + +#[derive(PartialEq)] +pub struct Error { + kind: ErrKind, + character: u8, + position: usize, +} + +#[derive(PartialEq, Debug)] +pub enum ErrKind { + EmptyStream, + UnfinishedStream, + InvalidRoot, + InvalidToken, + UnescapedControlCharacter, + TrailingComma, + ContentEnded, + UnopenedArray, + UnopenedObject, + MaxDepthReached, + InvalidNumber, + InvalidUnicodeEscape, + InvalidStringEscape, + ExpectedObjectKey, + ExpectedObjectValue, + ExpectedColon, + ExpectedArrayItem, +} + +impl Error { + pub fn new(kind: ErrKind, character: u8, position: usize) -> Result { + Err(Self { + kind, + character, + position, + }) + } +} + +impl core::fmt::Debug for Error { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + write!( + f, + "{:?}({}) at {}", + self.kind, self.character as char, self.position + ) + } +} + +impl Default for Tokenizer { + fn default() -> Self { + Self::new() + } +} + +impl Tokenizer { + pub fn new() -> Self { + Tokenizer { + state: State::Idle, + total_consumed: 0, + context: ParseContext::new(), + } + } + + fn check_trailing_comma(&mut self, data: u8) -> Result<(), Error> { + // Check for trailing comma if we're at a closing bracket/brace + if (data == b']' || data == b'}') && self.context.after_comma.is_some() { + let (c, pos) = self.context.after_comma.unwrap(); + return Error::new(ErrKind::TrailingComma, c, pos); + } + + // Only reset after_comma for non-whitespace characters + if !matches!(data, b' ' | b'\t' | b'\n' | b'\r') { + self.context.after_comma = None; + } + Ok(()) + } + + pub fn parse_full( + &mut self, + data: &[u8], + callback: &mut dyn FnMut(Event, usize), + ) -> Result { + self.parse_chunk(data, callback)?; + self.finish(callback) + } + 
+ pub fn finish(&mut self, callback: &mut F) -> Result + where + F: FnMut(Event, usize) + ?Sized, + { + // we check that parser was idle, at zero nesting depth + if self.context.depth != 0u8.into() { + return Error::new(ErrKind::UnfinishedStream, b' ', self.total_consumed); + } + if self.total_consumed == 0 { + return Error::new(ErrKind::EmptyStream, b' ', self.total_consumed); + } + + match &self.state { + State::Finished => Ok(self.total_consumed), + State::Number { + state: Num::LeadingZero, + } + | State::Number { + state: Num::BeforeDecimalPoint, + } + | State::Number { + state: Num::AfterDecimalPoint, + } + | State::Number { + state: Num::AfterExponent, + } => { + callback(Event::End(EventToken::Number), self.total_consumed); + Ok(self.total_consumed) + } + _ => Error::new(ErrKind::UnfinishedStream, b' ', self.total_consumed), + } + } + + pub fn parse_chunk(&mut self, data: &[u8], callback: &mut F) -> Result + where + F: FnMut(Event, usize) + ?Sized, + { + self.p(data, callback)?; + Ok(self.total_consumed) + } + + // testing helper + #[cfg(test)] + fn t(&mut self, data: &[u8]) -> Result { + self.p(data, &mut |_, _| {}) + } + // testing helper + fn p(&mut self, data: &[u8], callback: &mut F) -> Result + where + F: FnMut(Event, usize) + ?Sized, + { + let consumed = self.parse_chunk_inner(data, callback)?; + self.total_consumed += consumed; + Ok(consumed) + } + + fn maybe_exit_level(&self) -> State { + if self.context.is_object() { + State::Object { + expect: Object::CommaOrEnd, + } + } else if self.context.is_array() { + State::Array { + expect: Array::CommaOrEnd, + } + } else if self.context.depth == 0u8.into() { + State::Finished + } else { + State::Idle + } + } + + fn saw_a_comma_now_what(&mut self) -> State { + if self.context.is_object() { + State::Object { + expect: Object::Key, + } + } else if self.context.is_array() { + State::Array { + expect: Array::ItemOrEnd, + } + } else { + State::Idle + } + } + + fn start_token( + &mut self, + token: u8, + pos: 
usize, + callback: &mut dyn FnMut(Event, usize), + ) -> Result { + match token { + b't' => { + callback(Event::Begin(EventToken::True), pos); + Ok(State::Token { + token: Token::True(True::R), + }) + } + b'f' => { + callback(Event::Begin(EventToken::False), pos); + Ok(State::Token { + token: Token::False(False::A), + }) + } + b'n' => { + callback(Event::Begin(EventToken::Null), pos); + Ok(State::Token { + token: Token::Null(Null::U), + }) + } + _ => Error::new(ErrKind::InvalidToken, token, pos), + } + } + + fn parse_chunk_inner(&mut self, data: &[u8], mut callback: &mut F) -> Result + where + F: FnMut(Event, usize) + ?Sized, + { + let mut pos = 0; + while pos < data.len() { + // Special case - this needs to be done for every Array match arm + if let State::Array { + expect: Array::ItemOrEnd, + } = &self.state + { + self.check_trailing_comma(data[pos])?; + } + + self.state = match (&self.state, data[pos]) { + (State::Number { state: Num::Sign }, b'0') => State::Number { + state: Num::LeadingZero, + }, + (State::Number { state: Num::Sign }, b'1'..=b'9') => State::Number { + state: Num::BeforeDecimalPoint, + }, + (State::Number { state: Num::Sign }, _) => { + return Error::new(ErrKind::InvalidNumber, data[pos], pos); + } + ( + State::Number { + state: Num::LeadingZero, + }, + b'e' | b'E', + ) => State::Number { + state: Num::Exponent, + }, + ( + State::Number { + state: Num::LeadingZero, + }, + b'.', + ) => State::Number { + state: Num::Decimal, + }, + ( + State::Number { + state: Num::BeforeDecimalPoint, + }, + b'0'..=b'9', + ) => State::Number { + state: Num::BeforeDecimalPoint, + }, + ( + State::Number { + state: Num::BeforeDecimalPoint, + }, + b'.', + ) => State::Number { + state: Num::Decimal, + }, + ( + State::Number { + state: Num::BeforeDecimalPoint, + }, + b'e' | b'E', + ) => State::Number { + state: Num::Exponent, + }, + ( + State::Number { + state: Num::Decimal, + }, + b'0'..=b'9', + ) => State::Number { + state: Num::AfterDecimalPoint, + }, + ( + 
State::Number { + state: Num::Decimal, + }, + _, + ) => { + return Error::new(ErrKind::InvalidNumber, data[pos], pos); + } + ( + State::Number { + state: Num::AfterDecimalPoint, + }, + b'0'..=b'9', + ) => State::Number { + state: Num::AfterDecimalPoint, + }, + ( + State::Number { + state: Num::AfterDecimalPoint, + }, + b'e' | b'E', + ) => State::Number { + state: Num::Exponent, + }, + ( + State::Number { + state: Num::Exponent, + }, + b'0'..=b'9', + ) => State::Number { + state: Num::AfterExponent, + }, + ( + State::Number { + state: Num::Exponent, + }, + b'+' | b'-', + ) => State::Number { + state: Num::ExponentSign, + }, + ( + State::Number { + state: Num::Exponent, + }, + _, + ) => { + return Error::new(ErrKind::InvalidNumber, data[pos], pos); + } + ( + State::Number { + state: Num::ExponentSign, + }, + b'0'..=b'9', + ) => State::Number { + state: Num::AfterExponent, + }, + ( + State::Number { + state: Num::ExponentSign, + }, + _, + ) => { + return Error::new(ErrKind::InvalidNumber, data[pos], pos); + } + ( + State::Number { + state: Num::AfterExponent, + }, + b'0'..=b'9', + ) => State::Number { + state: Num::AfterExponent, + }, + (State::Number { state: _ }, b',') => { + callback(Event::End(EventToken::Number), pos); + self.context.after_comma = Some((data[pos], pos)); + self.saw_a_comma_now_what() + } + (State::Number { state: _ }, b' ' | b'\t' | b'\n' | b'\r') => { + callback(Event::End(EventToken::Number), pos); + self.maybe_exit_level() + } + (State::Number { state: _ }, b']') => { + callback(Event::End(EventToken::NumberAndArray), pos); + callback(Event::ArrayEnd, pos); + self.context.exit_array(pos)?; + self.maybe_exit_level() + } + (State::Number { state: _ }, b'}') => { + callback(Event::End(EventToken::NumberAndObject), pos); + callback(Event::ObjectEnd, pos); + self.context.exit_object(pos)?; + self.maybe_exit_level() + } + (State::Number { state: _ }, _) => { + return Error::new(ErrKind::InvalidNumber, data[pos], pos); + } + ( + State::String { + 
state: String::Normal, + key, + }, + b'"', + ) => { + if *key { + callback(Event::End(EventToken::Key), pos); + State::Object { + expect: Object::Colon, + } + } else { + callback(Event::End(EventToken::String), pos); + self.maybe_exit_level() + } + } + ( + State::String { + state: String::Normal, + key, + }, + b'\\', + ) => { + callback(Event::Begin(EventToken::EscapeSequence), pos); + State::String { + state: String::Escaping, + key: *key, + } + } + ( + State::String { + state: String::Normal, + key: _, + }, + b'\x00'..=b'\x1F', + ) => { + return Error::new(ErrKind::UnescapedControlCharacter, data[pos], pos); + } + ( + State::String { + state: String::Normal, + key: _, + }, + _, + ) => self.state.clone(), + // Handle simple escape sequences with lookup table + ( + State::String { + state: String::Escaping, + key, + }, + escape_char @ (b'"' | b'\\' | b'/' | b'b' | b'f' | b'n' | b'r' | b't'), + ) => { + let escape_token = match escape_char { + b'"' => EventToken::EscapeQuote, + b'\\' => EventToken::EscapeBackslash, + b'/' => EventToken::EscapeSlash, + b'b' => EventToken::EscapeBackspace, + b'f' => EventToken::EscapeFormFeed, + b'n' => EventToken::EscapeNewline, + b'r' => EventToken::EscapeCarriageReturn, + b't' => EventToken::EscapeTab, + _ => unreachable!(), + }; + callback(Event::Begin(escape_token.clone()), pos); + callback(Event::End(escape_token), pos); + State::String { + state: String::Normal, + key: *key, + } + } + ( + State::String { + state: String::Escaping, + key, + }, + b'u', + ) => State::String { + state: String::Unicode0, + key: *key, + }, + ( + State::String { + state: String::Unicode0, + key, + }, + b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F', + ) => { + callback(Event::Begin(EventToken::UnicodeEscape), pos); + State::String { + state: String::Unicode1, + key: *key, + } + } + ( + State::String { + state: String::Unicode1, + key, + }, + b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F', + ) => State::String { + state: String::Unicode2, + key: *key, + }, + ( + 
State::String { + state: String::Unicode2, + key, + }, + b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F', + ) => State::String { + state: String::Unicode3, + key: *key, + }, + ( + State::String { + state: String::Unicode3, + key, + }, + b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F', + ) => { + callback(Event::End(EventToken::UnicodeEscape), pos); + State::String { + state: String::Normal, + key: *key, + } + } + ( + State::String { + state: String::Unicode0, + key: _, + } + | State::String { + state: String::Unicode1, + key: _, + } + | State::String { + state: String::Unicode2, + key: _, + } + | State::String { + state: String::Unicode3, + key: _, + }, + _, + ) => { + return Error::new(ErrKind::InvalidUnicodeEscape, data[pos], pos); + } + ( + State::Idle + | State::Object { expect: _ } + | State::Array { expect: _ } + | State::Finished, + b' ' | b'\t' | b'\n' | b'\r', + ) => self.state.clone(), + ( + State::Idle + | State::Object { + expect: Object::Value, + } + | State::Array { + expect: Array::ItemOrEnd, + }, + b'[', + ) => { + self.context.enter_array(data[pos], pos)?; + callback(Event::ArrayStart, pos); + State::Array { + expect: Array::ItemOrEnd, + } + } + ( + State::Idle + | State::Object { + expect: Object::Value, + } + | State::Array { + expect: Array::ItemOrEnd, + }, + b'{', + ) => { + self.context.enter_object(data[pos], pos)?; + callback(Event::ObjectStart, pos); + State::Object { + expect: Object::Key, + } + } + ( + State::Idle + | State::Object { + expect: Object::Value, + } + | State::Array { + expect: Array::ItemOrEnd, + }, + b'"', + ) => { + callback(Event::Begin(EventToken::String), pos); + State::String { + state: String::Normal, + key: false, + } + } + ( + State::Idle + | State::Object { + expect: Object::Value, + } + | State::Array { + expect: Array::ItemOrEnd, + }, + b't' | b'f' | b'n', + ) => self.start_token(data[pos], pos, &mut callback)?, + ( + State::Idle + | State::Object { + expect: Object::Value, + } + | State::Array { + expect: Array::ItemOrEnd, + }, 
+ b'-', /*| b'+' */ + ) => { + callback(Event::Begin(EventToken::Number), pos); + State::Number { state: Num::Sign } + } + ( + State::Idle + | State::Object { + expect: Object::Value, + } + | State::Array { + expect: Array::ItemOrEnd, + }, + b'0', + ) => { + callback(Event::Begin(EventToken::Number), pos); + State::Number { + state: Num::LeadingZero, + } + } + ( + State::Idle + | State::Object { + expect: Object::Value, + } + | State::Array { + expect: Array::ItemOrEnd, + }, + b'1'..=b'9', + ) => { + callback(Event::Begin(EventToken::Number), pos); + State::Number { + state: Num::BeforeDecimalPoint, + } + } + ( + State::Object { + expect: Object::Value, + }, + _, + ) => return Error::new(ErrKind::ExpectedObjectValue, data[pos], pos), + ( + State::Array { + expect: Array::ItemOrEnd, + }, + b']', + ) => { + callback(Event::ArrayEnd, pos); + self.context.exit_array(pos)?; + self.maybe_exit_level() + } + ( + State::Object { + expect: Object::Key, + }, + b'"', + ) => { + callback(Event::Begin(EventToken::Key), pos); + State::String { + state: String::Normal, + key: true, + } + } + ( + State::Object { + expect: Object::Key, + }, + b'}', + ) => { + if self.context.after_comma.is_some() { + return Error::new( + ErrKind::TrailingComma, + self.context.after_comma.unwrap().0, + pos, + ); + } + self.context.exit_object(pos)?; + callback(Event::ObjectEnd, pos); + self.maybe_exit_level() + } + ( + State::Object { + expect: Object::Colon, + }, + b':', + ) => State::Object { + expect: Object::Value, + }, + ( + State::Object { + expect: Object::CommaOrEnd, + }, + b',', + ) => State::Object { + expect: Object::Key, + }, + ( + State::Object { + expect: Object::CommaOrEnd, + }, + b'}', + ) => { + self.context.exit_object(pos)?; + callback(Event::ObjectEnd, pos); + self.maybe_exit_level() + } + ( + State::Array { + expect: Array::CommaOrEnd, + }, + b',', + ) => { + self.context.after_comma = Some((data[pos], pos)); + State::Array { + expect: Array::ItemOrEnd, + } + } + ( + State::Array 
{ + expect: Array::CommaOrEnd, + }, + b']', + ) => { + callback(Event::ArrayEnd, pos); + self.context.exit_array(pos)?; + self.maybe_exit_level() + } + ( + State::Token { + token: Token::True(True::R), + }, + b'r', + ) => State::Token { + token: Token::True(True::U), + }, + ( + State::Token { + token: Token::True(True::U), + }, + b'u', + ) => State::Token { + token: Token::True(True::E), + }, + ( + State::Token { + token: Token::True(True::E), + }, + b'e', + ) => { + callback(Event::End(EventToken::True), pos); + self.maybe_exit_level() + } + ( + State::Token { + token: Token::False(False::A), + }, + b'a', + ) => State::Token { + token: Token::False(False::L), + }, + ( + State::Token { + token: Token::False(False::L), + }, + b'l', + ) => State::Token { + token: Token::False(False::S), + }, + ( + State::Token { + token: Token::False(False::S), + }, + b's', + ) => State::Token { + token: Token::False(False::E), + }, + ( + State::Token { + token: Token::False(False::E), + }, + b'e', + ) => { + callback(Event::End(EventToken::False), pos); + self.maybe_exit_level() + } + ( + State::Token { + token: Token::Null(Null::U), + }, + b'u', + ) => State::Token { + token: Token::Null(Null::L1), + }, + ( + State::Token { + token: Token::Null(Null::L1), + }, + b'l', + ) => State::Token { + token: Token::Null(Null::L2), + }, + ( + State::Token { + token: Token::Null(Null::L2), + }, + b'l', + ) => { + callback(Event::End(EventToken::Null), pos); + self.maybe_exit_level() + } + + // Wrong tokens + (State::Idle, _) => { + return Error::new(ErrKind::InvalidRoot, data[pos], pos); + } + ( + State::String { + state: String::Escaping, + key: _, + }, + _, + ) => return Error::new(ErrKind::InvalidStringEscape, data[pos], pos), + ( + State::Object { + expect: Object::Key, + }, + _, + ) => return Error::new(ErrKind::ExpectedObjectKey, data[pos], pos), + ( + State::Object { + expect: Object::Colon, + }, + _, + ) => return Error::new(ErrKind::ExpectedColon, data[pos], pos), + ( + State::Object 
{ + expect: Object::CommaOrEnd, + }, + _, + ) => return Error::new(ErrKind::ExpectedObjectValue, data[pos], pos), + ( + State::Array { + expect: Array::ItemOrEnd, + } + | State::Array { + expect: Array::CommaOrEnd, + }, + _, + ) => return Error::new(ErrKind::ExpectedArrayItem, data[pos], pos), + (State::Finished, _) => return Error::new(ErrKind::ContentEnded, data[pos], pos), + (State::Token { token: _ }, _) => { + return Error::new(ErrKind::InvalidToken, data[pos], pos) + } + }; + pos += 1; + } + Ok(pos) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use log::warn; + use test_log::test; + + #[test] + fn test_zero_input() { + let res = Tokenizer::::new().t(b""); + assert_eq!(res, Ok(0)); + } + #[test] + fn test_root_is_garbage() { + assert_eq!( + Tokenizer::::new().t(b"a"), + Error::new(ErrKind::InvalidRoot, b'a', 0) + ); + assert_eq!( + Tokenizer::::new().t(b" a"), + Error::new(ErrKind::InvalidRoot, b'a', 1) + ); + } + #[test] + fn test_root_is_a_token() { + assert_eq!(Tokenizer::::new().t(b"t"), Ok(1)); + assert_eq!(Tokenizer::::new().t(b"f"), Ok(1)); + assert_eq!(Tokenizer::::new().t(b"n"), Ok(1)); + } + #[test] + fn test_root_is_an_object() { + assert_eq!(Tokenizer::::new().t(b"{"), Ok(1)); + } + #[test] + fn test_root_is_an_array() { + assert_eq!(Tokenizer::::new().t(b"["), Ok(1)); + } + #[test] + fn test_root_is_a_string() { + assert_eq!(Tokenizer::::new().t(b"\"a\""), Ok(3)); + } + + #[test] + fn test_no_garbage_after_root() { + let mut parser = Tokenizer::new(); + let mut events: [Event; 16] = core::array::from_fn(|_| Event::Uninitialized); + let result = collect_with_result(&mut parser, b"true extra", &mut events); + assert_eq!(result, Error::new(ErrKind::ContentEnded, b'e', 5)); + } + + fn collect<'a, 'b, 'c>( + parser: &'c mut Tokenizer, + data: &'b [u8], + store: &'a mut [Event], + ) -> (usize, &'a [Event]) + where + 'b: 'a, + { + let mut index = 0; + let consumed = parser + .p(data, &mut |event, _pos| { + warn!("Event: {:?}", event); + 
store[index] = event.clone(); + index += 1; + }) + .unwrap(); + (consumed, &store[..index]) + } + + fn collect_with_result<'a, 'b, 'c>( + parser: &'c mut Tokenizer, + data: &'b [u8], + store: &'a mut [Event], + ) -> Result<(usize, &'a [Event]), Error> { + let mut index = 0; + let consumed = parser.p(data, &mut |event, _pos| { + warn!("Event: {:?}", event); + store[index] = event.clone(); + index += 1; + })?; + Ok((consumed, &store[..index])) + } + + #[test] + fn test_parse_root_token_true() { + let mut m: [Event; 6] = core::array::from_fn(|_| Event::Uninitialized); + let r = collect(&mut Tokenizer::new(), b" true ", &mut m); + assert_eq!( + r, + ( + 6, + [Event::Begin(EventToken::True), Event::End(EventToken::True),].as_slice() + ) + ); + + // sending the same in two, three chunks should yield the same + let mut parser = Tokenizer::::new(); + parser + .p(b" tr", &mut |ev, _pos| { + assert_eq!(ev, Event::Begin(EventToken::True)); + }) + .unwrap(); + parser + .p(b"ue ", &mut |ev, _pos| { + assert_eq!(ev, Event::End(EventToken::True)); + }) + .unwrap(); + } + + #[test] + fn test_after_root_should_not_accept_comma() { + let mut m: [Event; 2] = core::array::from_fn(|_| Event::Uninitialized); + let r = collect_with_result(&mut Tokenizer::new(), b" true,", &mut m); + assert_eq!(r, Error::new(ErrKind::ContentEnded, b',', 5)); + } + + #[test] + fn test_parse_root_token_false() { + let mut m: [Event; 6] = core::array::from_fn(|_| Event::Uninitialized); + let r = collect(&mut Tokenizer::new(), b" false ", &mut m); + assert_eq!( + r, + ( + 7, + [ + Event::Begin(EventToken::False), + Event::End(EventToken::False), + ] + .as_slice() + ) + ); + } + + #[test] + fn test_parse_root_token_null() { + let mut m: [Event; 4] = core::array::from_fn(|_| Event::Uninitialized); + let r = collect(&mut Tokenizer::new(), b"\tnull\n\r", &mut m); + assert_eq!( + r, + ( + 7, + [Event::Begin(EventToken::Null), Event::End(EventToken::Null),].as_slice() + ) + ); + } + + #[test] + fn 
test_parse_root_token_string() { + let mut m: [Event; 6] = core::array::from_fn(|_| Event::Uninitialized); + let r = collect(&mut Tokenizer::new(), b" \"a\" ", &mut m); + assert_eq!( + r, + ( + 5, + [ + Event::Begin(EventToken::String), + Event::End(EventToken::String), + ] + .as_slice() + ) + ); + } + + #[test] + fn test_boolean_null() { + let mut parser = Tokenizer::new(); + let mut events: [Event; 16] = core::array::from_fn(|_| Event::Uninitialized); + let (consumed, result) = collect(&mut parser, b"{\"flag\":true,\"nil\":null}", &mut events); + assert_eq!(consumed, 24); + assert_eq!( + result, + [ + Event::ObjectStart, + Event::Begin(EventToken::Key), + Event::End(EventToken::Key), + Event::Begin(EventToken::True), + Event::End(EventToken::True), + Event::Begin(EventToken::Key), + Event::End(EventToken::Key), + Event::Begin(EventToken::Null), + Event::End(EventToken::Null), + Event::ObjectEnd, + ] + ); + } + + #[test] + fn test_empty_object() { + let mut m: [Event; 2] = core::array::from_fn(|_| Event::Uninitialized); + let r = collect(&mut Tokenizer::new(), b"{}", &mut m); + assert_eq!(r, (2, [Event::ObjectStart, Event::ObjectEnd].as_slice())); + } + + #[test] + fn test_object_with_whitespace() { + let mut m: [Event; 2] = core::array::from_fn(|_| Event::Uninitialized); + let r = collect(&mut Tokenizer::new(), b"{ \n\t\r}", &mut m); + assert_eq!(r, (6, [Event::ObjectStart, Event::ObjectEnd].as_slice())); + } + + #[test] + fn test_invalid_object_key() { + let mut m: [Event; 1] = core::array::from_fn(|_| Event::Uninitialized); + let r = collect_with_result(&mut Tokenizer::new(), b"{true", &mut m); + assert_eq!(r, Error::new(ErrKind::ExpectedObjectKey, b't', 1)); + } + + #[test] + fn test_object_missing_colon() { + let mut m: [Event; 3] = core::array::from_fn(|_| Event::Uninitialized); + let r = collect_with_result(&mut Tokenizer::new(), b"{\"key\"true}", &mut m); + assert_eq!(r, Error::new(ErrKind::ExpectedColon, b't', 6)); + } + + #[test] + fn 
test_object_missing_value() { + let mut m: [Event; 3] = core::array::from_fn(|_| Event::Uninitialized); + let r = collect_with_result(&mut Tokenizer::new(), b"{\"key\":}", &mut m); + assert_eq!(r, Error::new(ErrKind::ExpectedObjectValue, b'}', 7)); + } + + #[test] + fn test_object_missing_comma() { + let mut m: [Event; 6] = core::array::from_fn(|_| Event::Uninitialized); + let r = collect_with_result(&mut Tokenizer::new(), b"{\"a\":true\"b\":true}", &mut m); + assert_eq!(r, Error::new(ErrKind::ExpectedObjectValue, b'"', 9)); + } + + #[test] + fn test_nested_empty_objects() { + let mut m: [Event; 10] = core::array::from_fn(|_| Event::Uninitialized); + let r = collect(&mut Tokenizer::new(), b"{\"a\":{}}", &mut m); + assert_eq!( + r, + ( + 8, + [ + Event::ObjectStart, + Event::Begin(EventToken::Key), + Event::End(EventToken::Key), + Event::ObjectStart, + Event::ObjectEnd, + Event::ObjectEnd, + ] + .as_slice() + ) + ); + } + + #[test] + fn test_deeply_nested_object() { + let mut m: [Event; 16] = core::array::from_fn(|_| Event::Uninitialized); + let r = collect( + &mut Tokenizer::new(), + b"{\"a\":{\"b\":{\"c\":true}}}", + &mut m, + ); + assert_eq!( + r, + ( + 22, + [ + Event::ObjectStart, + Event::Begin(EventToken::Key), + Event::End(EventToken::Key), + Event::ObjectStart, + Event::Begin(EventToken::Key), + Event::End(EventToken::Key), + Event::ObjectStart, + Event::Begin(EventToken::Key), + Event::End(EventToken::Key), + Event::Begin(EventToken::True), + Event::End(EventToken::True), + Event::ObjectEnd, + Event::ObjectEnd, + Event::ObjectEnd, + ] + .as_slice() + ) + ); + } + + #[test] + fn test_multiple_nested_objects() { + let mut m: [Event; 20] = core::array::from_fn(|_| Event::Uninitialized); + let r = collect( + &mut Tokenizer::new(), + b"{\"a\":{\"x\":true},\"b\":{\"y\":null}}", + &mut m, + ); + assert_eq!( + r, + ( + 31, + [ + Event::ObjectStart, + Event::Begin(EventToken::Key), + Event::End(EventToken::Key), + Event::ObjectStart, + 
Event::Begin(EventToken::Key), + Event::End(EventToken::Key), + Event::Begin(EventToken::True), + Event::End(EventToken::True), + Event::ObjectEnd, + Event::Begin(EventToken::Key), + Event::End(EventToken::Key), + Event::ObjectStart, + Event::Begin(EventToken::Key), + Event::End(EventToken::Key), + Event::Begin(EventToken::Null), + Event::End(EventToken::Null), + Event::ObjectEnd, + Event::ObjectEnd, + ] + .as_slice() + ) + ); + } + + #[test] + fn test_partial_nested_object() { + let mut m: [Event; 10] = core::array::from_fn(|_| Event::Uninitialized); + let r = collect(&mut Tokenizer::new(), b"{\"a\":{\"b\":true", &mut m); + assert_eq!( + r, + ( + 14, + [ + Event::ObjectStart, + Event::Begin(EventToken::Key), + Event::End(EventToken::Key), + Event::ObjectStart, + Event::Begin(EventToken::Key), + Event::End(EventToken::Key), + Event::Begin(EventToken::True), + Event::End(EventToken::True), + ] + .as_slice() + ) + ); + } + + #[test] + fn test_simple_array() { + let mut m: [Event; 8] = core::array::from_fn(|_| Event::Uninitialized); + let r = collect(&mut Tokenizer::new(), b"[true, false, null]", &mut m); + assert_eq!( + r, + ( + 19, + [ + Event::ArrayStart, + Event::Begin(EventToken::True), + Event::End(EventToken::True), + Event::Begin(EventToken::False), + Event::End(EventToken::False), + Event::Begin(EventToken::Null), + Event::End(EventToken::Null), + Event::ArrayEnd, + ] + .as_slice() + ) + ); + } + + #[test] + fn test_array_with_objects() { + let mut m: [Event; 14] = core::array::from_fn(|_| Event::Uninitialized); + let r = collect( + &mut Tokenizer::new(), + b"[{\"a\":true}, {\"b\":null}]", + &mut m, + ); + assert_eq!( + r, + ( + 24, + [ + Event::ArrayStart, + Event::ObjectStart, + Event::Begin(EventToken::Key), + Event::End(EventToken::Key), + Event::Begin(EventToken::True), + Event::End(EventToken::True), + Event::ObjectEnd, + Event::ObjectStart, + Event::Begin(EventToken::Key), + Event::End(EventToken::Key), + Event::Begin(EventToken::Null), + 
Event::End(EventToken::Null), + Event::ObjectEnd, + Event::ArrayEnd, + ] + .as_slice() + ) + ); + } + + #[test] + fn test_empty_array() { + let mut m: [Event; 2] = core::array::from_fn(|_| Event::Uninitialized); + let r = collect(&mut Tokenizer::new(), b"[]", &mut m); + assert_eq!(r, (2, [Event::ArrayStart, Event::ArrayEnd].as_slice())); + } + + #[test] + fn test_array_with_trailing_comma() { + let mut m: [Event; 6] = core::array::from_fn(|_| Event::Uninitialized); + let r = collect_with_result(&mut Tokenizer::new(), b"[1,]", &mut m); + assert_eq!(r, Error::new(ErrKind::TrailingComma, b',', 2)); + } + + #[test] + fn test_array_with_trailing_comma_true() { + let mut m: [Event; 6] = core::array::from_fn(|_| Event::Uninitialized); + let r = collect_with_result(&mut Tokenizer::new(), b"[true,]", &mut m); + assert_eq!(r, Error::new(ErrKind::TrailingComma, b',', 5)); + } + + #[test] + fn test_array_with_trailing_comma_in_nested_array() { + let mut m: [Event; 16] = core::array::from_fn(|_| Event::Uninitialized); + let r = collect_with_result(&mut Tokenizer::new(), b"{ \"d\": [\"f\",\"b\",] }", &mut m); + assert_eq!(r, Error::new(ErrKind::TrailingComma, b',', 15)); + } + + #[test] + fn test_unicode_escape() { + let mut m: [Event; 5] = core::array::from_fn(|_| Event::Uninitialized); + let r = collect(&mut Tokenizer::new(), b"\"\\u0041\"", &mut m); + assert_eq!( + r, + ( + 8, + [ + Event::Begin(EventToken::String), + Event::Begin(EventToken::EscapeSequence), + Event::Begin(EventToken::UnicodeEscape), + Event::End(EventToken::UnicodeEscape), + Event::End(EventToken::String), + ] + .as_slice() + ) + ); + } + + #[test] + fn test_invalid_unicode_escape() { + let mut m: [Event; 4] = core::array::from_fn(|_| Event::Uninitialized); + let r = collect_with_result(&mut Tokenizer::new(), b"\"\\u00g\"", &mut m); + assert_eq!(r, Error::new(ErrKind::InvalidUnicodeEscape, b'g', 5)); + } + + #[test] + fn test_incomplete_unicode_escape() { + let mut m: [Event; 4] = core::array::from_fn(|_| 
Event::Uninitialized); + let r = collect_with_result(&mut Tokenizer::new(), b"\"\\u001\"", &mut m); + assert_eq!(r, Error::new(ErrKind::InvalidUnicodeEscape, b'"', 6)); + } + + #[test] + fn test_u8_bitstack() { + // Test BitStack with u8 type (8-bit depth) + let mut parser: Tokenizer = Tokenizer::new(); + + // Test simple array - should work with 8-bit depth + let mut events = Vec::new(); + let result = parser.parse_full(b"[1,2,3]", &mut |event, _pos| { + events.push(event); + }); + + assert!(result.is_ok()); + assert_eq!(events.len(), 8); // ArrayStart + 3*(Begin+End Number) + ArrayEnd + } + + #[test] + fn test_u64_bitstack() { + // Test BitStack with u64 type (64-bit depth = much deeper nesting) + let mut parser: Tokenizer = Tokenizer::new(); + + // Test deeply nested structure + let json = b"[[[[1]]]]"; // 4 levels of nesting + let mut events = Vec::new(); + let result = parser.parse_full(json, &mut |event, _pos| { + events.push(event); + }); + + assert!(result.is_ok()); + // Should handle deep nesting easily with 64-bit storage + assert!(events.len() > 8); // Multiple ArrayStart/End + Number events + } + + // TODO: Array BitStack support needs custom implementation + // Arrays don't implement the required bit operations for BitStack trait +} + +#[cfg(test)] +mod conformance { + use super::*; + use test_log::test; + + fn assert_check( + actual: (Result, &[(Event, usize)]), + expected: (Result, &[(Event, usize)]), + file: &str, + line: u32, + ) { + if actual != expected { + panic!( + "assertion failed at {}:{}\n left: {:?}\n right: {:?}", + file, line, actual, expected + ); + } + } + + fn check_impl( + data: &[u8], + expect: Result, + expected_events: &[(Event, usize)], + file: &str, + line: u32, + ) { + let mut parser = Tokenizer::::new(); + let mut results: [(Event, usize); 1024] = + core::array::from_fn(|_| (Event::Uninitialized, 0)); + let mut received = 0; + let parse_result = parser.parse_full(data, &mut |event, pos| { + results[received] = (event, pos); + 
received += 1; + }); + let result_slice = &results[0..received]; + assert_check( + (parse_result, result_slice), + (expect, expected_events), + file, + line, + ); + } + + macro_rules! check { + ($data:expr, $expect:expr, $events:expr) => { + check_impl($data, $expect, $events, file!(), line!()) + }; + } + + #[test] + fn test_conformance_null() { + check!( + b"[null] ", + Ok(7), + &[ + (Event::ArrayStart, 0), + (Event::Begin(EventToken::Null), 1), + (Event::End(EventToken::Null), 4), + (Event::ArrayEnd, 5) + ] + ); + check!( + b"[true] ", + Ok(7), + &[ + (Event::ArrayStart, 0), + (Event::Begin(EventToken::True), 1), + (Event::End(EventToken::True), 4), + (Event::ArrayEnd, 5) + ] + ); + check!( + b"[false] ", + Ok(8), + &[ + (Event::ArrayStart, 0), + (Event::Begin(EventToken::False), 1), + (Event::End(EventToken::False), 5), + (Event::ArrayEnd, 6) + ] + ); + check!( + b"[\"a\"] ", + Ok(6), + &[ + (Event::ArrayStart, 0), + (Event::Begin(EventToken::String), 1), + (Event::End(EventToken::String), 3), + (Event::ArrayEnd, 4) + ] + ); + } + + #[test] + fn test_conformance_1() { + check!( + b"[2] ", + Ok(4), + &[ + (Event::ArrayStart, 0), + (Event::Begin(EventToken::Number), 1), + (Event::End(EventToken::NumberAndArray), 2), + (Event::ArrayEnd, 2) + ] + ); + } + + #[test] + fn test_negative_number() { + check!( + b"[-1]", + Ok(4), + &[ + (Event::ArrayStart, 0), + (Event::Begin(EventToken::Number), 1), + (Event::End(EventToken::NumberAndArray), 3), + (Event::ArrayEnd, 3) + ] + ); + check!( + b"[-1.0]", + Ok(6), + &[ + (Event::ArrayStart, 0), + (Event::Begin(EventToken::Number), 1), + (Event::End(EventToken::NumberAndArray), 5), + (Event::ArrayEnd, 5) + ] + ); + } + + // Add some tests for string escape sequences + #[test] + fn test_conformance_string_escape_sequences() { + check!( + b"[\"\\\"\"]", + Ok(6), + &[ + (Event::ArrayStart, 0), + (Event::Begin(EventToken::String), 1), + (Event::Begin(EventToken::EscapeSequence), 2), + (Event::Begin(EventToken::EscapeQuote), 3), + 
(Event::End(EventToken::EscapeQuote), 3), + (Event::End(EventToken::String), 4), + (Event::ArrayEnd, 5) + ] + ); + } + + #[test] + fn test_confformance_invalid_string_escape() { + // valid escapes are \\, \t and \n and so on, lets do \x + check!( + b"[\"\\x\"]", + Error::new(ErrKind::InvalidStringEscape, b'x', 3), + &[ + (Event::ArrayStart, 0), + (Event::Begin(EventToken::String), 1), + (Event::Begin(EventToken::EscapeSequence), 2), + ] + ); + } + + // Try leaving an array and an object with a "broken" numer that ends in sign + // or an exponent + #[test] + fn test_conformance_broken_numbers_in_array() { + // leave at minus sign + check!( + b"[-]", + Error::new(ErrKind::InvalidNumber, b']', 2), + &[ + (Event::ArrayStart, 0), + (Event::Begin(EventToken::Number), 1), + ] + ); + // leave at decimal point + check!( + b"[123.]", + Error::new(ErrKind::InvalidNumber, b']', 5), + &[ + (Event::ArrayStart, 0), + (Event::Begin(EventToken::Number), 1), + ] + ); + // leave at exponent + check!( + b"[123e]", + Error::new(ErrKind::InvalidNumber, b']', 5), + &[ + (Event::ArrayStart, 0), + (Event::Begin(EventToken::Number), 1), + ] + ); + } + + // number followed by space, tab, newline + #[test] + fn test_conformance_number_followed_by_space_tab_newline() { + check!( + b"123 ", + Ok(4), + &[ + (Event::Begin(EventToken::Number), 0), + (Event::End(EventToken::Number), 3), + ] + ); + check!( + b"123.42\t", + Ok(7), + &[ + (Event::Begin(EventToken::Number), 0), + (Event::End(EventToken::Number), 6), + ] + ); + } + + // Same tests for objects + #[test] + fn test_conformance_broken_numbers_in_object() { + // leave at minus sign + check!( + b"{ \"a\" : -}", + Error::new(ErrKind::InvalidNumber, b'}', 9), + &[ + (Event::ObjectStart, 0), + (Event::Begin(EventToken::Key), 2), + (Event::End(EventToken::Key), 4), + (Event::Begin(EventToken::Number), 8), + ] + ); + // leave at decimal point + check!( + b"{ \"a\" : 123.}", + Error::new(ErrKind::InvalidNumber, b'}', 12), + &[ + 
(Event::ObjectStart, 0), + (Event::Begin(EventToken::Key), 2), + (Event::End(EventToken::Key), 4), + (Event::Begin(EventToken::Number), 8), + ] + ); + // leave at exponent sign + check!( + b"{ \"a\" : 123e+}", + Error::new(ErrKind::InvalidNumber, b'}', 13), + &[ + (Event::ObjectStart, 0), + (Event::Begin(EventToken::Key), 2), + (Event::End(EventToken::Key), 4), + (Event::Begin(EventToken::Number), 8), + ] + ); + + // leave at exponent + check!( + b"{ \"a\" : 123e}", + Error::new(ErrKind::InvalidNumber, b'}', 12), + &[ + (Event::ObjectStart, 0), + (Event::Begin(EventToken::Key), 2), + (Event::End(EventToken::Key), 4), + (Event::Begin(EventToken::Number), 8), + ] + ); + } + + #[test] + fn test_confformance_2_str() { + check!( + b"[\"a\",,\"b\"]", + Error::new(ErrKind::ExpectedArrayItem, b',', 5), + &[ + (Event::ArrayStart, 0), + (Event::Begin(EventToken::String), 1), + (Event::End(EventToken::String), 3) + ] + ); + } + + #[test] + fn test_confformance_2_num() { + check!( + b"[1,,2]", + Error::new(ErrKind::ExpectedArrayItem, b',', 3), + &[ + (Event::ArrayStart, 0), + (Event::Begin(EventToken::Number), 1), + (Event::End(EventToken::Number), 2) + ] + ); + } + + #[test] + fn test_conformance_unopened_array() { + check!( + b"1]", + Error::new(ErrKind::UnopenedArray, b']', 1), + &[ + (Event::Begin(EventToken::Number), 0), + (Event::End(EventToken::NumberAndArray), 1), + (Event::ArrayEnd, 1) + ] + ); + } + + #[test] + fn test_conformance_lonely_int() { + check!( + b"42", + Ok(2), + &[ + (Event::Begin(EventToken::Number), 0), + (Event::End(EventToken::Number), 2) + ] + ); + } + + #[test] + fn test_conformance_trailing_object_comm() { + check!( + b"{\"id\":0,}", + Error::new(ErrKind::TrailingComma, b',', 8), + &[ + (Event::ObjectStart, 0), + (Event::Begin(EventToken::Key), 1), + (Event::End(EventToken::Key), 4), + (Event::Begin(EventToken::Number), 6), + (Event::End(EventToken::Number), 7) + ] + ); + } + + #[test] + fn test_conformance_double_array() { + check!( + b"false 
false", + Error::new(ErrKind::ContentEnded, b'f', 6), + &[ + (Event::Begin(EventToken::False), 0), + (Event::End(EventToken::False), 4) + ] + ); + } + + #[test] + fn test_conformance_i_structure_500_nested_arrays() { + let data = include_bytes!("testdata/i_structure_500_nested_arrays.json"); + let starts: [(Event, usize); 255] = core::array::from_fn(|x: usize| (Event::ArrayStart, x)); + check!( + data, + Error::new(ErrKind::MaxDepthReached, b'[', 255), + starts.as_slice() + ); + } + + #[test] + fn concormance_test_n_array_just_minus() { + check!( + b"[-]", + Error::new(ErrKind::InvalidNumber, b']', 2), + &[ + (Event::ArrayStart, 0), + (Event::Begin(EventToken::Number), 1) + ] + ); + } + + #[test] + fn conformance_test_n_number_real_without_fractional_part() { + check!( + b"[1.]", + Error::new(ErrKind::InvalidNumber, b']', 3), + &[ + (Event::ArrayStart, 0), + (Event::Begin(EventToken::Number), 1) + ] + ); + } + + #[test] + fn conformance_test_n_number_plus_one() { + check!( + b"[+1]", + Error::new(ErrKind::ExpectedArrayItem, b'+', 1), + &[(Event::ArrayStart, 0)] + ); + } + + #[test] + fn conformance_test_n_number_minus_zero_one() { + check!( + b"[-01]", + Error::new(ErrKind::InvalidNumber, b'1', 3), + &[ + (Event::ArrayStart, 0), + (Event::Begin(EventToken::Number), 1) + ] + ); + } + + #[test] + fn conformance_test_n_number_neg_int_starting_with_zero() { + check!( + b"[-012]", + Error::new(ErrKind::InvalidNumber, b'1', 3), + &[ + (Event::ArrayStart, 0), + (Event::Begin(EventToken::Number), 1) + ] + ); + } + + #[test] + fn conformance_test_n_number_with_leading_zero() { + check!( + b"[012]", + Error::new(ErrKind::InvalidNumber, b'1', 2), + &[ + (Event::ArrayStart, 0), + (Event::Begin(EventToken::Number), 1) + ] + ); + } + + #[test] + fn conformance_test_y_number() { + check!( + b"[123e65]", + Ok(8), + &[ + (Event::ArrayStart, 0), + (Event::Begin(EventToken::Number), 1), + (Event::End(EventToken::NumberAndArray), 7), + (Event::ArrayEnd, 7) + ] + ); + } + + #[test] + 
fn conformance_test_y_number_0e_plus_1() { + check!( + b"[0e+1]", + Ok(6), + &[ + (Event::ArrayStart, 0), + (Event::Begin(EventToken::Number), 1), + (Event::End(EventToken::NumberAndArray), 5), + (Event::ArrayEnd, 5) + ] + ); + } + + #[test] + fn conformance_test_y_number_0e_1() { + check!( + b"[0e1]", + Ok(5), + &[ + (Event::ArrayStart, 0), + (Event::Begin(EventToken::Number), 1), + (Event::End(EventToken::NumberAndArray), 4), + (Event::ArrayEnd, 4) + ] + ); + } + + #[test] + fn conformance_test_y_number_0e_1_with_object() { + check!( + b"{\"a\":0e1}", + Ok(9), + &[ + (Event::ObjectStart, 0), + (Event::Begin(EventToken::Key), 1), + (Event::End(EventToken::Key), 3), + (Event::Begin(EventToken::Number), 5), + (Event::End(EventToken::NumberAndObject), 8), + (Event::ObjectEnd, 8) + ] + ); + } + + #[test] + fn conformance_test_y_number_int_with_exp() { + check!( + b"[20e1]", + Ok(6), + &[ + (Event::ArrayStart, 0), + (Event::Begin(EventToken::Number), 1), + (Event::End(EventToken::NumberAndArray), 5), + (Event::ArrayEnd, 5) + ] + ); + } + + #[test] + fn conformance_test_y_number_real_capital_e() { + check!( + b"[1E22]", + Ok(6), + &[ + (Event::ArrayStart, 0), + (Event::Begin(EventToken::Number), 1), + (Event::End(EventToken::NumberAndArray), 5), + (Event::ArrayEnd, 5) + ] + ); + } + + #[test] + fn conformance_test_y_number_real_fraction_exponent() { + check!( + b"[123.456e78]", + Ok(12), + &[ + (Event::ArrayStart, 0), + (Event::Begin(EventToken::Number), 1), + (Event::End(EventToken::NumberAndArray), 11), + (Event::ArrayEnd, 11) + ] + ); + } + + #[test] + fn conformance_test_n_number_1_0e_minus() { + check!( + b"[1.0e-]", + Error::new(ErrKind::InvalidNumber, b']', 6), + &[ + (Event::ArrayStart, 0), + (Event::Begin(EventToken::Number), 1) + ] + ); + } + + #[test] + fn conformance_test_y_structure_lonely_negative_real() { + check!( + b"-0.1", + Ok(4), + &[ + (Event::Begin(EventToken::Number), 0), + (Event::End(EventToken::Number), 4) + ] + ); + } + + #[test] + fn 
conformance_n_structure_no_data() { + check!(b"", Error::new(ErrKind::EmptyStream, b' ', 0), &[]); + } + + #[test] + fn conformance_n_string_unescaped_tab() { + check!( + b"[\"\t\"]", + Error::new(ErrKind::UnescapedControlCharacter, b'\t', 2), + &[ + (Event::ArrayStart, 0), + (Event::Begin(EventToken::String), 1) + ] + ); + } + #[test] + fn conformance_n_unescaped_ctrl_char() { + check!( + b"[\"a\x00a\"]", + Error::new(ErrKind::UnescapedControlCharacter, b'\x00', 3), + &[ + (Event::ArrayStart, 0), + (Event::Begin(EventToken::String), 1) + ] + ); + } + + #[test] + fn conformance_test_n_single_space() { + check!(b" ", Error::new(ErrKind::UnfinishedStream, b' ', 1), &[]); + } + + #[test] + fn conformance_test_n_string_1_surrogate_then_escape_u1() { + check!( + b"[\"\\uD800\\u1\"]", + Error::new(ErrKind::InvalidUnicodeEscape, b'"', 11), + &[ + (Event::ArrayStart, 0), + (Event::Begin(EventToken::String), 1), + (Event::Begin(EventToken::EscapeSequence), 2), + (Event::Begin(EventToken::UnicodeEscape), 4), + (Event::End(EventToken::UnicodeEscape), 7), + (Event::Begin(EventToken::EscapeSequence), 8), + (Event::Begin(EventToken::UnicodeEscape), 10) + ] + ); + } + + #[test] + fn conformance_test_n_string_1_surrogate_then_escape_u1x() { + check!( + b"[\"\\uD800\\u1x\"]", + Error::new(ErrKind::InvalidUnicodeEscape, b'x', 11), + &[ + (Event::ArrayStart, 0), + (Event::Begin(EventToken::String), 1), + (Event::Begin(EventToken::EscapeSequence), 2), + (Event::Begin(EventToken::UnicodeEscape), 4), + (Event::End(EventToken::UnicodeEscape), 7), + (Event::Begin(EventToken::EscapeSequence), 8), + (Event::Begin(EventToken::UnicodeEscape), 10) + ] + ); + } + + #[test] + fn conformance_test_n_string_unescaped_tab() { + check!( + b"[\"\t\"]", + Error::new(ErrKind::UnescapedControlCharacter, b'\t', 2), + &[ + (Event::ArrayStart, 0), + (Event::Begin(EventToken::String), 1) + ] + ); + } + + #[test] + fn conformance_test_n_string_incomplete_escaped_character() { + check!( + b"[\"\\u00A\"]", + 
Error::new(ErrKind::InvalidUnicodeEscape, b'"', 7), + &[ + (Event::ArrayStart, 0), + (Event::Begin(EventToken::String), 1), + (Event::Begin(EventToken::EscapeSequence), 2), + (Event::Begin(EventToken::UnicodeEscape), 4), + ] + ); + } + + #[test] + fn conformance_test_n_string_incomplete_surrogate() { + check!( + b"[\"\\uD834\\uDd\"]", + Error::new(ErrKind::InvalidUnicodeEscape, b'"', 12), + &[ + (Event::ArrayStart, 0), + (Event::Begin(EventToken::String), 1), + (Event::Begin(EventToken::EscapeSequence), 2), + (Event::Begin(EventToken::UnicodeEscape), 4), + (Event::End(EventToken::UnicodeEscape), 7), + (Event::Begin(EventToken::EscapeSequence), 8), + (Event::Begin(EventToken::UnicodeEscape), 10) + ] + ); + } +} diff --git a/tokenizer/src/tokenizer/testdata/i_structure_500_nested_arrays.json b/tokenizer/src/tokenizer/testdata/i_structure_500_nested_arrays.json new file mode 100644 index 0000000..48b442a --- /dev/null +++ b/tokenizer/src/tokenizer/testdata/i_structure_500_nested_arrays.json @@ -0,0 +1 @@ 
+[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]] diff --git a/tokenizer/tests/array_bitstack_test.rs b/tokenizer/tests/array_bitstack_test.rs new file mode 100644 index 0000000..589b966 --- /dev/null +++ b/tokenizer/tests/array_bitstack_test.rs @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: Apache-2.0 + +use ujson::bitstack::{ArrayBitStack, BitStack}; + +#[test] +fn test_array_bitstack_basic() { + // Test ArrayBitStack with 2 u8 elements (16-bit total capacity) + let mut bitstack: ArrayBitStack<2, u8> = ArrayBitStack::default(); + + // Test basic push/pop operations + bitstack.push(true); + bitstack.push(false); + bitstack.push(true); + + // Verify top() doesn't modify stack + assert_eq!(bitstack.top(), true); + assert_eq!(bitstack.top(), true); + + // Verify LIFO order + assert_eq!(bitstack.pop(), true); + assert_eq!(bitstack.pop(), false); + assert_eq!(bitstack.pop(), true); +} + +#[test] +fn test_array_bitstack_large_capacity() { + // Test larger ArrayBitStack (320-bit capacity with 10 u32 elements) + let mut bitstack: 
ArrayBitStack<10, u32> = ArrayBitStack::default(); + + // Push many bits to test multi-element handling + let pattern = [true, false, true, true, false, false, true, false]; + for &bit in &pattern { + bitstack.push(bit); + } + + // Pop and verify reverse order (LIFO) + for &expected in pattern.iter().rev() { + assert_eq!(bitstack.pop(), expected); + } +} + +#[test] +fn test_array_bitstack_element_overflow() { + // Test ArrayBitStack with 2 u8 elements to verify cross-element operations + let mut bitstack: ArrayBitStack<2, u8> = ArrayBitStack::default(); + + // Push more than 8 bits to force usage of multiple elements + let bits = [ + true, false, true, false, true, false, true, false, true, true, + ]; + for &bit in &bits { + bitstack.push(bit); + } + + // Pop all bits and verify order + for &expected in bits.iter().rev() { + assert_eq!(bitstack.pop(), expected); + } +} + +#[test] +fn test_array_bitstack_empty_behavior() { + // Test behavior when popping from an empty ArrayBitStack + // With the new API, empty stacks return false (no depth tracking needed) + let mut bitstack: ArrayBitStack<2, u8> = ArrayBitStack::default(); + + // CURRENT BEHAVIOR: Empty stack returns false (was Some(false) before API change) + // This behavior is now the intended design - no depth tracking needed + assert_eq!(bitstack.pop(), false, "Empty stack returns false"); + assert_eq!(bitstack.top(), false, "Empty stack top() returns false"); + + // Test that underflow doesn't panic (at least it's safe) + assert_eq!( + bitstack.pop(), + false, + "Multiple underflow calls don't panic" + ); + assert_eq!( + bitstack.pop(), + false, + "Multiple underflow calls don't panic" + ); +} + +#[test] +fn test_array_bitstack_underflow_does_not_panic() { + // Test that multiple underflow attempts are safe (don't panic) + // This is important for robustness with the current API (empty pops return false by design) + let mut bitstack: ArrayBitStack<1, u8> = ArrayBitStack::default(); + + // Multiple calls to pop() on empty stack 
should not panic + for i in 0..5 { + let result = bitstack.pop(); + // With new API, just ensure it doesn't panic and returns a bool + assert_eq!( + result, + false, + "Empty ArrayBitStack pop() attempt {} should return false", + i + 1 + ); + + let top_result = bitstack.top(); + assert_eq!( + top_result, + false, + "Empty ArrayBitStack top() attempt {} should return false", + i + 1 + ); + } +}