From 0bca70bac47f718b68df383c5b69a9f3f01669f0 Mon Sep 17 00:00:00 2001
From: Zoltan Herczeg
Date: Mon, 13 Jan 2020 06:41:49 -0800
Subject: [PATCH] Correctly handle celestial plane codepoints in ES5.1.

Fixes #3498.

JerryScript-DCO-1.0-Signed-off-by: Zoltan Herczeg zherczeg.u-szeged@partner.samsung.com
---
Review notes (not part of the commit message):

Without ES2015 support, lexer_parse_identifier() only calls
lit_read_code_point_from_utf8() when the leading byte is below
LIT_UTF8_4_BYTE_MARKER; for a four-byte UTF-8 sequence it sets code_point
to 0 instead of decoding it.  The identifier helpers in lit-char-helpers.c
cast the code point to ecma_char_t, so the ES5.1 build asserts that the
value is strictly below LIT_UTF8_4_BYTE_CODE_POINT_MIN.  That is the exact
complement of the "code_point >= LIT_UTF8_4_BYTE_CODE_POINT_MIN" test used
in the ES2015 branch, hence "<" rather than "<=".  The lexer-private
LEXER_UTF8_4BYTE_START macro is removed and its users switch to
LIT_UTF8_4_BYTE_MARKER.  The unit test checks that parsing a source that
consists of a single four-byte sequence produces a syntax error.

 jerry-core/lit/lit-char-helpers.c    |  4 ++++
 jerry-core/parser/js/js-lexer.c      | 24 ++++++++++++++++++------
 jerry-core/parser/js/js-lexer.h      |  1 -
 tests/unit-core/test-api-errortype.c | 12 ++++++++++++
 4 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/jerry-core/lit/lit-char-helpers.c b/jerry-core/lit/lit-char-helpers.c
index df12732745..74c235cbc8 100644
--- a/jerry-core/lit/lit-char-helpers.c
+++ b/jerry-core/lit/lit-char-helpers.c
@@ -223,6 +223,8 @@ lit_code_point_is_identifier_start (lit_code_point_t code_point) /**< code point
     /* TODO: detect these ranges correctly. */
     return (code_point >= 0x10C80 && code_point <= 0x10CF2);
   }
+#else /* !ENABLED (JERRY_ES2015) */
+  JERRY_ASSERT (code_point < LIT_UTF8_4_BYTE_CODE_POINT_MIN);
 #endif /* ENABLED (JERRY_ES2015) */
 
   return lit_char_is_unicode_letter ((ecma_char_t) code_point);
@@ -252,6 +254,8 @@ lit_code_point_is_identifier_part (lit_code_point_t code_point) /**< code point
     /* TODO: detect these ranges correctly. */
     return (code_point >= 0x10C80 && code_point <= 0x10CF2);
   }
+#else /* !ENABLED (JERRY_ES2015) */
+  JERRY_ASSERT (code_point < LIT_UTF8_4_BYTE_CODE_POINT_MIN);
 #endif /* ENABLED (JERRY_ES2015) */
 
   return (lit_char_is_unicode_letter ((ecma_char_t) code_point)
diff --git a/jerry-core/parser/js/js-lexer.c b/jerry-core/parser/js/js-lexer.c
index 6921ab77a5..809532673b 100644
--- a/jerry-core/parser/js/js-lexer.c
+++ b/jerry-core/parser/js/js-lexer.c
@@ -693,12 +693,12 @@ lexer_parse_identifier (parser_context_t *context_p, /**< context */
 
     if (JERRY_UNLIKELY (code_point >= LIT_UTF8_2_BYTE_MARKER))
     {
+#if ENABLED (JERRY_ES2015)
       utf8_length = lit_read_code_point_from_utf8 (source_p,
                                                    (lit_utf8_size_t) (source_end_p - source_p),
                                                    &code_point);
       decoded_length = utf8_length;
 
-#if ENABLED (JERRY_ES2015)
       /* Only ES2015 supports code points outside of the basic plane which can be part of an identifier. */
       if ((code_point >= LIT_UTF16_HIGH_SURROGATE_MIN && code_point <= LIT_UTF16_HIGH_SURROGATE_MAX)
           && source_p + 3 < source_end_p)
@@ -717,11 +717,23 @@ lexer_parse_identifier (parser_context_t *context_p, /**< context */
           char_count = 2;
         }
       }
-      else if (source_p[0] >= LEXER_UTF8_4BYTE_START)
+      else if (source_p[0] >= LIT_UTF8_4_BYTE_MARKER)
       {
         decoded_length = 2 * 3;
         has_escape = true;
       }
+#else /* !ENABLED (JERRY_ES2015) */
+      if (code_point < LIT_UTF8_4_BYTE_MARKER)
+      {
+        utf8_length = lit_read_code_point_from_utf8 (source_p,
+                                                     (lit_utf8_size_t) (source_end_p - source_p),
+                                                     &code_point);
+        decoded_length = utf8_length;
+      }
+      else
+      {
+        code_point = 0;
+      }
 #endif /* ENABLED (JERRY_ES2015) */
     }
 
@@ -1091,7 +1103,7 @@ lexer_parse_string (parser_context_t *context_p, /**< context */
     }
 #endif /* ENABLED (JERRY_ES2015) */
 
-    if (*source_p >= LEXER_UTF8_4BYTE_START)
+    if (*source_p >= LIT_UTF8_4_BYTE_MARKER)
     {
       /* Processing 4 byte unicode sequence (even if it is
        * after a backslash). Always converted to two 3 byte
@@ -1893,7 +1905,7 @@ lexer_convert_ident_to_cesu8 (uint8_t *destination_p, /**< destination string */
     }
 
 #if ENABLED (JERRY_ES2015)
-    if (*source_p >= LEXER_UTF8_4BYTE_START)
+    if (*source_p >= LIT_UTF8_4_BYTE_MARKER)
     {
       lit_four_byte_utf8_char_to_cesu8 (destination_p, source_p);
 
@@ -2113,7 +2125,7 @@ lexer_convert_literal_to_chars (parser_context_t *context_p, /**< context */
     }
 #endif /* ENABLED (JERRY_ES2015) */
 
-    if (*source_p >= LEXER_UTF8_4BYTE_START)
+    if (*source_p >= LIT_UTF8_4_BYTE_MARKER)
     {
       /* Processing 4 byte unicode sequence (even if it is
        * after a backslash). Always converted to two 3 byte
@@ -3028,7 +3040,7 @@ lexer_compare_identifier_to_chars (const uint8_t *left_p, /**< left identifier *
 
       escape_size = lit_code_point_to_cesu8_bytes (utf8_buf, code_point);
     }
-    else if (*left_p >= LEXER_UTF8_4BYTE_START)
+    else if (*left_p >= LIT_UTF8_4_BYTE_MARKER)
     {
       lit_four_byte_utf8_char_to_cesu8 (utf8_buf, left_p);
       escape_size = 3 * 2;
diff --git a/jerry-core/parser/js/js-lexer.h b/jerry-core/parser/js/js-lexer.h
index 5ab77c3d2e..0b39f0721e 100644
--- a/jerry-core/parser/js/js-lexer.h
+++ b/jerry-core/parser/js/js-lexer.h
@@ -201,7 +201,6 @@ typedef enum
 #define LEXER_NEWLINE_LS_PS_BYTE_1 0xe2
 #define LEXER_NEWLINE_LS_PS_BYTE_23(source) \
   ((source)[1] == LIT_UTF8_2_BYTE_CODE_POINT_MIN && ((source)[2] | 0x1) == 0xa9)
-#define LEXER_UTF8_4BYTE_START 0xf0
 
 #define LEXER_IS_LEFT_BRACKET(type) \
   ((type) == LEXER_LEFT_BRACE || (type) == LEXER_LEFT_PAREN || (type) == LEXER_LEFT_SQUARE)
diff --git a/tests/unit-core/test-api-errortype.c b/tests/unit-core/test-api-errortype.c
index 1c755a1d53..305f5c3161 100644
--- a/tests/unit-core/test-api-errortype.c
+++ b/tests/unit-core/test-api-errortype.c
@@ -62,5 +62,17 @@ main (void)
     jerry_release_value (test_values[idx]);
   }
 
+  char test_source[] = "\xF0\x9D\x84\x9E";
+
+  jerry_value_t result = jerry_parse (NULL,
+                                      0,
+                                      (const jerry_char_t *) test_source,
+                                      sizeof (test_source) - 1,
+                                      JERRY_PARSE_NO_OPTS);
+  TEST_ASSERT (jerry_value_is_error (result));
+  TEST_ASSERT (jerry_get_error_type (result) == JERRY_ERROR_SYNTAX);
+
+  jerry_release_value (result);
+
   jerry_cleanup ();
 } /* main */