From c78555ef9f336668b49f7a0d2e4d4b48656d765c Mon Sep 17 00:00:00 2001 From: Yorick Peterse Date: Fri, 21 Jul 2023 16:12:18 +0200 Subject: [PATCH] Use ByteArray as input for json.Parser The JSON parser now takes a `ref ByteArray` as its input, instead of a `String`. This makes parsing files and the like a bit more efficient, as you don't need to first convert the bytes into a `String` just so you can parse it. As part of this, json.parse is moved to Json.parse_string, and Json.parse_bytes is added to parse a `ByteArray`. Finally, the runtime functions for parsing strings are changed to simply take a pointer and a size, instead of a string and a range. This removes the need for an intermediate `String`. This fixes https://github.com/inko-lang/inko/issues/357. Changelog: changed --- rt/src/runtime/string.rs | 30 +++++-------- std/src/std/float.inko | 7 ++- std/src/std/int.inko | 17 +++++--- std/src/std/json.inko | 85 ++++++++++++++++++++++--------------- std/test/std/test_json.inko | 29 +++++++++---- 5 files changed, 95 insertions(+), 73 deletions(-) diff --git a/rt/src/runtime/string.rs b/rt/src/runtime/string.rs index bf8fd6558..c0c2c482b 100644 --- a/rt/src/runtime/string.rs +++ b/rt/src/runtime/string.rs @@ -7,6 +7,7 @@ use std::cmp::min; use std::ffi::CStr; use std::os::raw::c_char; use std::slice; +use std::str; use unicode_segmentation::{Graphemes, UnicodeSegmentation}; #[no_mangle] @@ -77,16 +78,11 @@ pub unsafe extern "system" fn inko_string_to_byte_array( #[no_mangle] pub unsafe extern "system" fn inko_string_to_float( state: *const State, - string: *const InkoString, - start: i64, - end: i64, + bytes: *mut u8, + size: i64, ) -> InkoResult { - let string = InkoString::read(string); - let slice = if start >= 0 && end >= 0 { - &string[start as usize..end as usize] - } else { - string - }; + let slice = + str::from_utf8_unchecked(slice::from_raw_parts(bytes, size as _)); let parsed = match slice { "Infinity" => Ok(f64::INFINITY), @@ -103,27 +99,21 @@ pub 
unsafe extern "system" fn inko_string_to_float( pub unsafe extern "system" fn inko_string_to_int( state: *const State, process: ProcessPointer, - string: *const InkoString, + bytes: *mut u8, + size: i64, radix: i64, - start: i64, - end: i64, ) -> InkoResult { - let string = InkoString::read(string); - if !(2..=36).contains(&radix) { panic(process, &format!("The radix '{}' is invalid", radix)); } - let slice = if start >= 0 && end >= 0 { - &string[start as usize..end as usize] - } else { - string - }; + let slice = + str::from_utf8_unchecked(slice::from_raw_parts(bytes, size as _)); // Rust doesn't handle parsing strings like "-0x4a3f043013b2c4d1" out of the // box. let parsed = if radix == 16 { - if let Some(tail) = string.strip_prefix("-0x") { + if let Some(tail) = slice.strip_prefix("-0x") { i64::from_str_radix(tail, 16).map(|v| 0_i64.wrapping_sub(v)) } else { i64::from_str_radix(slice, 16) diff --git a/std/src/std/float.inko b/std/src/std/float.inko index 6a7275218..4899bf166 100644 --- a/std/src/std/float.inko +++ b/std/src/std/float.inko @@ -15,9 +15,8 @@ class extern AnyResult { fn extern inko_float_to_string(state: Pointer[Int8], float: Float64) -> String fn extern inko_string_to_float( state: Pointer[Int8], - string: String, - start: Int, - end: Int, + bytes: Pointer[Int8], + size: Int, ) -> AnyResult # A type that can be converted to a Float. 
@@ -74,7 +73,7 @@ class builtin Float { # # Float.parse('1.2e1') # => Option.Some(12.0) fn pub static parse(string: String) -> Option[Float] { - match inko_string_to_float(_INKO.state, string, -1, -1) { + match inko_string_to_float(_INKO.state, string.to_pointer, string.size) { case { @tag = 0, @value = v } -> Option.Some(v as Float) case _ -> Option.None } diff --git a/std/src/std/int.inko b/std/src/std/int.inko index d5d3b3315..3eebc8195 100644 --- a/std/src/std/int.inko +++ b/std/src/std/int.inko @@ -31,10 +31,9 @@ class extern IntResult { fn extern inko_string_to_int( state: Pointer[Int8], process: Pointer[Int8], - string: String, + bytes: Pointer[Int8], + size: Int, radix: Int, - start: Int, - end: Int, ) -> IntResult fn extern inko_int_pow(process: Pointer[Int8], left: Int, right: Int) -> Int64 @@ -59,7 +58,9 @@ class builtin Int { # Int.from_base2('11') # => Option.Some(3) # Int.from_base2('ff') # => Option.None fn pub static from_base2(string: String) -> Option[Int] { - match inko_string_to_int(_INKO.state, _INKO.process, string, 2, -1, -1) { + match inko_string_to_int( + _INKO.state, _INKO.process, string.to_pointer, string.size, 2 + ) { case { @tag = 0, @value = v } -> Option.Some(v) case _ -> Option.None } @@ -75,7 +76,9 @@ class builtin Int { # Int.from_base10('12') # => Option.Some(12) # Int.from_base10('ff') # => Option.None fn pub static from_base10(string: String) -> Option[Int] { - match inko_string_to_int(_INKO.state, _INKO.process, string, 10, -1, -1) { + match inko_string_to_int( + _INKO.state, _INKO.process, string.to_pointer, string.size, 10 + ) { case { @tag = 0, @value = v } -> Option.Some(v) case _ -> Option.None } @@ -95,7 +98,9 @@ class builtin Int { # Int.from_base16('ef') # => Option.Some(239) # Int.from_base16('zz') # => Option.None fn pub static from_base16(string: String) -> Option[Int] { - match inko_string_to_int(_INKO.state, _INKO.process, string, 16, -1, -1) { + match inko_string_to_int( + _INKO.state, _INKO.process, 
string.to_pointer, string.size, 16 + ) { case { @tag = 0, @value = v } -> Option.Some(v) case _ -> Option.None } diff --git a/std/src/std/json.inko b/std/src/std/json.inko index e4eec8d5f..1f94714ba 100644 --- a/std/src/std/json.inko +++ b/std/src/std/json.inko @@ -76,17 +76,15 @@ class extern AnyResult { fn extern inko_string_to_int( state: Pointer[Int8], process: Pointer[Int8], - string: String, + bytes: Pointer[Int8], + size: Int, radix: Int, - start: Int, - end: Int, ) -> IntResult fn extern inko_string_to_float( state: Pointer[Int8], - string: String, - start: Int, - end: Int, + bytes: Pointer[Int8], + size: Int, ) -> AnyResult let EOF = -1 @@ -215,6 +213,30 @@ class pub enum Json { case Bool(Bool) case Null + # Parses a JSON `String` into a `Json` value. + # + # # Examples + # + # import std.json.Json + # + # Json.parse_string('[10]').unwrap # => Json.Array([Json.Int(10)]) + fn pub static parse_string(string: String) -> Result[Json, Error] { + let bytes = string.to_byte_array + + Parser.new(bytes).parse + } + + # Parses a `ByteArray` into a `Json` value. + # + # # Examples + # + # import std.json.Json + # + # Json.parse_bytes('[10]'.to_byte_array) # => Result.Ok(Json.Array([Json.Int(10)])) + fn pub static parse_bytes(bytes: ref ByteArray) -> Result[Json, Error] { + Parser.new(bytes).parse + } + # Formats `self` as a JSON string using indentation for nested objects. # # This method uses two spaces per indentation. To customise the amount of @@ -319,7 +341,7 @@ impl Equal[Json] for Json { # A type for parsing a stream of bytes into a JSON object. # -# This parser only supports parsing `String` values as input. If you need to +# This parser only supports parsing `ByteArray` values as input. If you need to # parse very large documents, it's best to separate the objects on a per line # basis, then parse the document one line at a time. # @@ -336,7 +358,7 @@ impl Equal[Json] for Json { # 10 MiB _per string_. 
You can change this limit by adjusting the value of the # `max_string_size` field. class pub Parser { - let @string: String + let @input: ref ByteArray let @index: Int let @size: Int let @line: Int @@ -352,12 +374,12 @@ class pub Parser { # When parsing a string that exceeds this limit, an error is thrown. let pub @max_string_size: Int - # Returns a new parser that will parse the given `String`. - fn pub static new(string: String) -> Parser { + # Returns a new parser that will parse the given `ByteArray`. + fn pub static new(input: ref ByteArray) -> Parser { Parser { - @string = string, + @input = input, @index = 0, - @size = string.size, + @size = input.size, @line = 1, @depth = 0, @max_depth = 100, @@ -451,8 +473,7 @@ class pub Parser { } fn mut string_with_escape_sequence(started_at: Int) -> Result[String, Error] { - let buffer = - @string.slice_bytes(started_at, @index - started_at).to_byte_array + let buffer = @input.slice(started_at, size: @index - started_at) loop { match current { @@ -532,9 +553,10 @@ class pub Parser { throw error("Expected four hexadecimal digits, but we ran out of input") } - match inko_string_to_int( - _INKO.state, _INKO.process, @string, 16, start, @index - ) { + let ptr = @input.to_pointer as Int + start as Pointer[Int8] + let size = @index - start + + match inko_string_to_int(_INKO.state, _INKO.process, ptr, size, 16) { case { @tag = 0, @value = v } -> Result.Ok(v) case _ -> Result.Error( error("'{slice_string(start)}' is an invalid Unicode codepoint") @@ -635,9 +657,10 @@ class pub Parser { # number parser. As part of parsing the JSON number we already validate # it. This means we can bypass `Int.from_base10` (and `Float.parse` # below), and instead use the underlying runtime functions. 
- match inko_string_to_int( - _INKO.state, _INKO.process, @string, 10, start, @index - ) { + let ptr = @input.to_pointer as Int + start as Pointer[Int8] + let size = @index - start + + match inko_string_to_int(_INKO.state, _INKO.process, ptr, size, 10) { # If the number is too big to fit in an integer, we'll promote the # number to a float. case { @tag = 0, @value = v } -> return Result.Ok(Json.Int(v)) @@ -649,8 +672,11 @@ class pub Parser { # At this point we've already validated the input format, and it's # compatible with the underlying float parser, so no extra checks are # needed. + let ptr = @input.to_pointer as Int + start as Pointer[Int8] + let size = @index - start + Result.Ok(Json.Float( - inko_string_to_float(_INKO.state, @string, start, @index).value as Float + inko_string_to_float(_INKO.state, ptr, size).value as Float )) } @@ -675,7 +701,7 @@ class pub Parser { fn current -> Int { if @index < @size { - @string.byte(@index) + @input.get(@index) } else { EOF } @@ -684,7 +710,7 @@ class pub Parser { fn peek -> Int { let index = @index + 1 - if index < @size { @string.byte(index) } else { EOF } + if index < @size { @input.get(index) } else { EOF } } fn mut identifier(name: String) -> Result[Nil, Error] { @@ -741,7 +767,7 @@ class pub Parser { } fn slice_string(start: Int) -> String { - @string.slice_bytes(start, size: @index - start) + @input.slice(start, size: @index - start).into_string } fn error(message: String) -> Error { @@ -857,14 +883,3 @@ class pub Generator { @buffer.push(if @pretty { ",\n" } else { ', ' }) } } - -# Parses a JSON string into a `Json` value. 
-# -# # Examples -# -# import std.json -# -# json.parse('[10]').unwrap # => Json.Array([Json.Int(10)]) -fn pub parse(string: String) -> Result[Json, Error] { - Parser.new(string).parse -} diff --git a/std/test/std/test_json.inko b/std/test/std/test_json.inko index f202f1917..6da662111 100644 --- a/std/test/std/test_json.inko +++ b/std/test/std/test_json.inko @@ -1,13 +1,17 @@ import helpers.(fmt) -import std.json.(self, Error, Json, Parser) +import std.json.(Error, Json, Parser) import std.test.Tests fn parse(input: String) -> Result[Json, Error] { - Parser.new(input).parse + let bytes = input.to_byte_array + + Parser.new(bytes).parse } fn parse_invalid(input: String) -> Option[String] { - Parser.new(input).parse.error.map fn (v) { v.to_string } + let bytes = input.to_byte_array + + Parser.new(bytes).parse.error.map fn (v) { v.to_string } } fn pub tests(t: mut Tests) { @@ -261,7 +265,8 @@ fn pub tests(t: mut Tests) { t.true(parse('[],').error?) { - let parser = Parser.new('[[[[10]]]]') + let bytes = '[[[[10]]]]'.to_byte_array + let parser = Parser.new(bytes) parser.max_depth = 2 t.true(parser.parse.error?) @@ -328,7 +333,8 @@ fn pub tests(t: mut Tests) { t.true(parse('"\uDFFF\uDFFF"').error?) { - let parser = Parser.new('"foo"') + let bytes = '"foo"'.to_byte_array + let parser = Parser.new(bytes) parser.max_string_size = 2 t.true(parser.parse.error?) @@ -380,7 +386,8 @@ fn pub tests(t: mut Tests) { t.true(parse('{"a": true} "x"').error?) { - let parser = Parser.new('{"a": {"b": {"c": 10}}}') + let bytes = '{"a": {"b": {"c": 10}}}'.to_byte_array + let parser = Parser.new(bytes) parser.max_depth = 2 t.true(parser.parse.error?) @@ -398,7 +405,13 @@ fn pub tests(t: mut Tests) { t.true(parse("\u{EF}\u{BB}\u{BF}10").error?) 
} - t.test('json.parse') fn (t) { - t.equal(json.parse('[10]'), Result.Ok(Json.Array([Json.Int(10)]))) + t.test('Json.parse_string') fn (t) { + t.equal(Json.parse_string('[10]'), Result.Ok(Json.Array([Json.Int(10)]))) + } + + t.test('Json.parse_bytes') fn (t) { + let bytes = '[10]'.to_byte_array + + t.equal(Json.parse_bytes(bytes), Result.Ok(Json.Array([Json.Int(10)]))) } }