diff --git a/CHANGELOG.md b/CHANGELOG.md
index dd9709f..68aa994 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,14 @@
 # Python JSONPath RFC 9535 Change Log
 
+## Version 0.1.3 (unreleased)
+
+**Fixes**
+
+- Fixed decoding of escape sequences in quoted name selectors and string literals. We now raise a `JSONPathSyntaxError` for invalid code points.
+- Fixed parsing of number literals with an exponent. We now allow 'e' to be upper case.
+- Fixed handling of trailing commas in bracketed segments. We now raise a `JSONPathSyntaxError` in such cases.
+- Fixed handling of invalid number literals. We now raise a syntax error for invalid leading zeros and extra negative signs.
+
 ## Version 0.1.2
 
 **Fixes**
diff --git a/jsonpath_rfc9535/__about__.py b/jsonpath_rfc9535/__about__.py
index b3f4756..ae73625 100644
--- a/jsonpath_rfc9535/__about__.py
+++ b/jsonpath_rfc9535/__about__.py
@@ -1 +1 @@
-__version__ = "0.1.2"
+__version__ = "0.1.3"
diff --git a/jsonpath_rfc9535/lex.py b/jsonpath_rfc9535/lex.py
index f7c65df..ba081be 100644
--- a/jsonpath_rfc9535/lex.py
+++ b/jsonpath_rfc9535/lex.py
@@ -18,8 +18,8 @@
 RE_PROPERTY = re.compile(r"[\u0080-\uFFFFa-zA-Z_][\u0080-\uFFFFa-zA-Z0-9_-]*")
 RE_INDEX = re.compile(r"-?[0-9]+")
 RE_INT = re.compile(r"-?[0-9]+")
-RE_EXPONENT = re.compile(r"e[+-]?[0-9]+")
-RE_NEGATIVE_EXPONENT = re.compile(r"e-[0-9]+")
+RE_EXPONENT = re.compile(r"[eE][+-]?[0-9]+")
+RE_NEGATIVE_EXPONENT = re.compile(r"[eE]-[0-9]+")
 RE_FUNCTION_NAME = re.compile(r"[a-z][a-z_0-9]*")
 RE_AND = re.compile(r"&&")
 RE_OR = re.compile(r"\|\|")
diff --git a/jsonpath_rfc9535/parse.py b/jsonpath_rfc9535/parse.py
index fab3d05..e722bc1 100644
--- a/jsonpath_rfc9535/parse.py
+++ b/jsonpath_rfc9535/parse.py
@@ -2,7 +2,6 @@
 
 from __future__ import annotations
 
-import json
 from typing import TYPE_CHECKING
 from typing import Callable
 from typing import Dict
@@ -312,6 +311,7 @@ def parse_bracketed_selection(self, stream: TokenStream) -> List[JSONPathSelecto
             if stream.peek.type_ != TokenType.RBRACKET:
                 stream.expect_peek(TokenType.COMMA)
                 stream.next_token()
+                stream.expect_peek_not(TokenType.RBRACKET, "unexpected trailing comma")
 
             stream.next_token()
 
@@ -362,11 +362,31 @@ def parse_string_literal(self, stream: TokenStream) -> Expression:
         )
 
     def parse_integer_literal(self, stream: TokenStream) -> Expression:
+        value = stream.current.value
+        # A zero followed by another digit is a leading zero, signed or not.
+        if value.lstrip("-")[:1] == "0" and value.lstrip("-")[1:2].isdigit():
+            raise JSONPathSyntaxError("invalid integer literal", token=stream.current)
+
         # Convert to float first to handle scientific notation.
-        return IntegerLiteral(stream.current, value=int(float(stream.current.value)))
+        try:
+            return IntegerLiteral(stream.current, value=int(float(value)))
+        except ValueError as err:
+            raise JSONPathSyntaxError(
+                "invalid integer literal", token=stream.current
+            ) from err
 
     def parse_float_literal(self, stream: TokenStream) -> Expression:
-        return FloatLiteral(stream.current, value=float(stream.current.value))
+        value = stream.current.value
+        # A zero followed by another digit is a leading zero, signed or not.
+        if value.lstrip("-")[:1] == "0" and value.lstrip("-")[1:2].isdigit():
+            raise JSONPathSyntaxError("invalid float literal", token=stream.current)
+
+        try:
+            return FloatLiteral(stream.current, value=float(value))
+        except ValueError as err:
+            raise JSONPathSyntaxError(
+                "invalid float literal", token=stream.current
+            ) from err
 
     def parse_prefix_expression(self, stream: TokenStream) -> Expression:
         tok = stream.next_token()
@@ -514,12 +534,143 @@ def _decode_string_literal(self, token: Token) -> str:
             value = token.value.replace('"', '\\"').replace("\\'", "'")
         else:
             value = token.value
-        try:
-            rv = json.loads(f'"{value}"')
-            assert isinstance(rv, str)
-            return rv
-        except json.JSONDecodeError as err:
-            raise JSONPathSyntaxError(str(err).split(":")[1], token=token) from None
+
+        return self._unescape_string(value, token)
+
+    def _unescape_string(self, value: str, token: Token) -> str:
+        """Return _value_ with all escape sequences decoded."""
+        unescaped: List[str] = []
+        index = 0
+
+        while index < len(value):
+            ch = value[index]
+            if ch == "\\":
+                index += 1
+                _ch, index = self._decode_escape_sequence(value, index, token)
+                unescaped.append(_ch)
+            else:
+                # Called for validation only; raises for unescaped control chars.
+                self._string_from_codepoint(ord(ch), token)
+                unescaped.append(ch)
+            index += 1
+        return "".join(unescaped)
+
+    def _decode_escape_sequence(  # noqa: PLR0911
+        self, value: str, index: int, token: Token
+    ) -> Tuple[str, int]:
+        """Decode the escape sequence starting at _value[index]_.
+
+        _index_ is the position of the character following the backslash.
+        Returns the decoded string and the index of the sequence's last character.
+        """
+        ch = value[index]
+        if ch == '"':
+            return '"', index
+        if ch == "\\":
+            return "\\", index
+        if ch == "/":
+            return "/", index
+        if ch == "b":
+            return "\x08", index
+        if ch == "f":
+            return "\x0c", index
+        if ch == "n":
+            return "\n", index
+        if ch == "r":
+            return "\r", index
+        if ch == "t":
+            return "\t", index
+        if ch == "u":
+            codepoint, index = self._decode_hex_char(value, index, token)
+            return self._string_from_codepoint(codepoint, token), index
+
+        raise JSONPathSyntaxError(
+            f"unknown escape sequence at index {token.index + index - 1}",
+            token=token,
+        )
+
+    def _decode_hex_char(self, value: str, index: int, token: Token) -> Tuple[int, int]:
+        """Decode a `\\uXXXX` escape, or a surrogate pair of two such escapes.
+
+        _index_ is the position of the `u`. Returns the code point and the index
+        of the last hex digit consumed.
+        """
+        length = len(value)
+
+        if index + 4 >= length:
+            raise JSONPathSyntaxError(
+                f"incomplete escape sequence at index {token.index + index - 1}",
+                token=token,
+            )
+
+        index += 1  # move past 'u'
+        codepoint = self._parse_hex_digits(value[index : index + 4], token)
+
+        if self._is_low_surrogate(codepoint):
+            raise JSONPathSyntaxError(
+                f"unexpected low surrogate at index {token.index + index - 1}",
+                token=token,
+            )
+
+        if self._is_high_surrogate(codepoint):
+            # expect a surrogate pair
+            if not (
+                index + 9 < length
+                and value[index + 4] == "\\"
+                and value[index + 5] == "u"
+            ):
+                raise JSONPathSyntaxError(
+                    f"incomplete escape sequence at index {token.index + index - 2}",
+                    token=token,
+                )
+
+            low_surrogate = self._parse_hex_digits(value[index + 6 : index + 10], token)
+
+            if not self._is_low_surrogate(low_surrogate):
+                raise JSONPathSyntaxError(
+                    f"unexpected codepoint at index {token.index + index + 4}",
+                    token=token,
+                )
+
+            codepoint = 0x10000 + (
+                ((codepoint & 0x03FF) << 10) | (low_surrogate & 0x03FF)
+            )
+
+            return (codepoint, index + 9)
+
+        return (codepoint, index + 3)
+
+    def _parse_hex_digits(self, digits: str, token: Token) -> int:
+        """Return the integer value of the hexadecimal string _digits_."""
+        codepoint = 0
+        for digit in digits.encode():
+            codepoint <<= 4
+            if digit >= 48 and digit <= 57:
+                codepoint |= digit - 48
+            elif digit >= 65 and digit <= 70:
+                codepoint |= digit - 65 + 10
+            elif digit >= 97 and digit <= 102:
+                codepoint |= digit - 97 + 10
+            else:
+                raise JSONPathSyntaxError(
+                    "invalid \\uXXXX escape sequence",
+                    token=token,
+                )
+        return codepoint
+
+    def _string_from_codepoint(self, codepoint: int, token: Token) -> str:
+        """Return the character for _codepoint_, rejecting control characters."""
+        if codepoint <= 0x1F:
+            raise JSONPathSyntaxError("invalid character", token=token)
+        return chr(codepoint)
+
+    def _is_high_surrogate(self, codepoint: int) -> bool:
+        """Return True if _codepoint_ is in the range U+D800 to U+DBFF."""
+        return codepoint >= 0xD800 and codepoint <= 0xDBFF
+
+    def _is_low_surrogate(self, codepoint: int) -> bool:
+        """Return True if _codepoint_ is in the range U+DC00 to U+DFFF."""
+        return codepoint >= 0xDC00 and codepoint <= 0xDFFF
 
     def _raise_for_non_comparable_function(
         self, expr: Expression, token: Token
diff --git a/jsonpath_rfc9535/tokens.py b/jsonpath_rfc9535/tokens.py
index a51c524..93d6674 100644
--- a/jsonpath_rfc9535/tokens.py
+++ b/jsonpath_rfc9535/tokens.py
@@ -193,3 +193,8 @@ def expect_peek(self, *typ: TokenType) -> None:
                 f"expected {_typ}, found {self.peek.type_.name!r}",
                 token=self.peek,
             )
+
+    def expect_peek_not(self, typ: TokenType, message: str) -> None:
+        """Raise an exception if the next token type is _typ_."""
+        if self.peek.type_ == typ:
+            raise JSONPathSyntaxError(message, token=self.peek)
diff --git a/pyproject.toml b/pyproject.toml
index fa6c675..af19fe3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -179,6 +179,7 @@ convention = "google"
 "scripts/__init__.py" = ["D104"]
 "tests/*" = ["D100", "D101", "D104", "D103"]
 "jsonpath_rfc9535/lex.py" = ["E741"]
+"jsonpath_rfc9535/parse.py" = ["PLR2004"]
 "jsonpath_rfc9535/utils/nondeterministic_descent.py" = [
     "D103",
     "D101",
diff --git a/tests/cts b/tests/cts
index 18a5245..733fe52 160000
--- a/tests/cts
+++ b/tests/cts
@@ -1 +1 @@
-Subproject commit 18a52450177dad29e0c122a3e3f712c8fc4a8034
+Subproject commit 733fe526ceccb183ed0e3397340f2ee9feec0d67
diff --git a/tests/test_compliance.py b/tests/test_compliance.py
index d2c5ae5..2623bdb 100644
--- a/tests/test_compliance.py
+++ b/tests/test_compliance.py
@@ -7,6 +7,7 @@
 import json
 import operator
 from dataclasses import dataclass
+from dataclasses import field
 from typing import Any
 from typing import Dict
 from typing import List
@@ -26,6 +27,7 @@ class Case:
     result: Any = None
     results: Optional[List[Any]] = None
     invalid_selector: Optional[bool] = None
+    tags: List[str] = field(default_factory=list)
 
 
 SKIP: Dict[str, str] = {}
diff --git a/tests/test_cts_nondeterminism.py b/tests/test_cts_nondeterminism.py
index 0d97522..ea22e42 100644
--- a/tests/test_cts_nondeterminism.py
+++ b/tests/test_cts_nondeterminism.py
@@ -7,6 +7,7 @@
 import json
 import operator
 from dataclasses import dataclass
+from dataclasses import field
 from typing import Any
 from typing import List
 from typing import Optional
@@ -26,6 +27,7 @@ class Case:
     result: Any = None
     results: Optional[List[Any]] = None
     invalid_selector: Optional[bool] = None
+    tags: List[str] = field(default_factory=list)
 
 
 def cases() -> List[Case]: