Skip to content

Commit

Permalink
Rework RegExp engine and add support for proper unicode matching (#3746)
Browse files Browse the repository at this point in the history
This change includes several bugfixes, general improvements, and support
for additional features.
- Added full support for web compatibility syntax defined in Annex B
- Implemented parsing and matching of patterns in Unicode mode
- Fixed capture results when iterating with nested capturing groups
- Significantly reduced regexp bytecode size
- Reduced stack usage during regexp execution
- Improved matching performance

JerryScript-DCO-1.0-Signed-off-by: Dániel Bátyai <dbatyai@inf.u-szeged.hu>
  • Loading branch information
dbatyai committed May 26, 2020
1 parent 908240b commit 8f76a1f
Show file tree
Hide file tree
Showing 30 changed files with 3,373 additions and 2,379 deletions.
7 changes: 2 additions & 5 deletions jerry-core/api/jerry-snapshot.c
Expand Up @@ -559,18 +559,15 @@ snapshot_load_compiled_code (const uint8_t *base_addr_p, /**< base address of th
#if ENABLED (JERRY_BUILTIN_REGEXP)
if (!(bytecode_p->status_flags & CBC_CODE_FLAGS_FUNCTION))
{
const re_compiled_code_t *re_bytecode_p = NULL;

const uint8_t *regex_start_p = ((const uint8_t *) bytecode_p) + sizeof (ecma_compiled_code_t);

/* Real size is stored in refs. */
ecma_string_t *pattern_str_p = ecma_new_ecma_string_from_utf8 (regex_start_p,
bytecode_p->refs);

re_compile_bytecode (&re_bytecode_p,
pattern_str_p,
bytecode_p->status_flags);

const re_compiled_code_t *re_bytecode_p = re_compile_bytecode (pattern_str_p,
bytecode_p->status_flags);
ecma_deref_ecma_string (pattern_str_p);

return (ecma_compiled_code_t *) re_bytecode_p;
Expand Down
2 changes: 1 addition & 1 deletion jerry-core/ecma/base/ecma-gc.c
Expand Up @@ -1467,7 +1467,7 @@ ecma_gc_run (void)

#if ENABLED (JERRY_BUILTIN_REGEXP)
/* Free RegExp bytecodes stored in cache */
re_cache_gc_run ();
re_cache_gc ();
#endif /* ENABLED (JERRY_BUILTIN_REGEXP) */
} /* ecma_gc_run */

Expand Down
6 changes: 2 additions & 4 deletions jerry-core/ecma/base/ecma-helpers-string.c
Expand Up @@ -2362,8 +2362,7 @@ ecma_string_trim_helper (const lit_utf8_byte_t **utf8_str_p, /**< [in, out] curr
{
read_size = lit_read_code_unit_from_utf8 (current_p, &ch);

if (!lit_char_is_white_space (ch)
&& !lit_char_is_line_terminator (ch))
if (!lit_char_is_white_space (ch))
{
nonws_start_p = current_p;
break;
Expand All @@ -2378,8 +2377,7 @@ ecma_string_trim_helper (const lit_utf8_byte_t **utf8_str_p, /**< [in, out] curr
{
read_size = lit_read_prev_code_unit_from_utf8 (current_p, &ch);

if (!lit_char_is_white_space (ch)
&& !lit_char_is_line_terminator (ch))
if (!lit_char_is_white_space (ch))
{
break;
}
Expand Down
23 changes: 10 additions & 13 deletions jerry-core/ecma/builtin-objects/ecma-builtin-global.c
Expand Up @@ -223,13 +223,13 @@ ecma_builtin_global_object_decode_uri_helper (lit_utf8_byte_t *input_start_p, /*
continue;
}

ecma_char_t decoded_byte;

if (!lit_read_code_unit_from_hex (input_char_p + 1, 2, &decoded_byte))
uint32_t hex_value = lit_char_hex_lookup (input_char_p + 1, input_end_p, 2);
if (hex_value == UINT32_MAX)
{
return ecma_raise_uri_error (ECMA_ERR_MSG ("Invalid hexadecimal value."));
}

ecma_char_t decoded_byte = (ecma_char_t) hex_value;
input_char_p += URI_ENCODED_BYTE_SIZE;

if (decoded_byte <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
Expand Down Expand Up @@ -272,20 +272,18 @@ ecma_builtin_global_object_decode_uri_helper (lit_utf8_byte_t *input_start_p, /*
/* Input decode. */
if (*input_char_p != '%')
{
*output_char_p = *input_char_p;
output_char_p++;
input_char_p++;
*output_char_p++ = *input_char_p++;
continue;
}

ecma_char_t decoded_byte;

if (!lit_read_code_unit_from_hex (input_char_p + 1, 2, &decoded_byte))
uint32_t hex_value = lit_char_hex_lookup (input_char_p + 1, input_end_p, 2);
if (hex_value == UINT32_MAX)
{
ret_value = ecma_raise_uri_error (ECMA_ERR_MSG ("Invalid hexadecimal value."));
break;
}

ecma_char_t decoded_byte = (ecma_char_t) hex_value;
input_char_p += URI_ENCODED_BYTE_SIZE;

if (decoded_byte <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
Expand Down Expand Up @@ -337,17 +335,16 @@ ecma_builtin_global_object_decode_uri_helper (lit_utf8_byte_t *input_start_p, /*
}
else
{
ecma_char_t chr;
hex_value = lit_char_hex_lookup (input_char_p + 1, input_end_p, 2);

if (!lit_read_code_unit_from_hex (input_char_p + 1, 2, &chr)
|| ((chr & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER))
if (hex_value == UINT32_MAX || (hex_value & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER)
{
is_valid = false;
break;
}

octets[i] = (lit_utf8_byte_t) chr;
input_char_p += URI_ENCODED_BYTE_SIZE;
octets[i] = (lit_utf8_byte_t) hex_value;
}
}

Expand Down
11 changes: 3 additions & 8 deletions jerry-core/ecma/builtin-objects/ecma-builtin-json.c
Expand Up @@ -174,18 +174,13 @@ ecma_builtin_json_parse_string (ecma_json_token_t *token_p) /**< token argument
}
case LIT_CHAR_LOWERCASE_U:
{
if ((end_p - current_p <= ECMA_JSON_HEX_ESCAPE_SEQUENCE_LENGTH))
uint32_t hex_value = lit_char_hex_lookup (current_p + 1, end_p, ECMA_JSON_HEX_ESCAPE_SEQUENCE_LENGTH);
if (hex_value == UINT32_MAX)
{
goto invalid_string;
}

ecma_char_t code_unit;
if (!(lit_read_code_unit_from_hex (current_p + 1, ECMA_JSON_HEX_ESCAPE_SEQUENCE_LENGTH, &code_unit)))
{
goto invalid_string;
}

ecma_stringbuilder_append_char (&result_builder, code_unit);
ecma_stringbuilder_append_char (&result_builder, (ecma_char_t) hex_value);
current_p += ECMA_JSON_HEX_ESCAPE_SEQUENCE_LENGTH + 1;
break;
}
Expand Down
8 changes: 3 additions & 5 deletions jerry-core/ecma/builtin-objects/ecma-builtins.c
Expand Up @@ -505,12 +505,10 @@ ecma_instantiate_builtin (ecma_builtin_id_t obj_builtin_id) /**< built-in id */

ext_object_p->u.class_prop.class_id = LIT_MAGIC_STRING_REGEXP_UL;

const re_compiled_code_t *bc_p = NULL;
ecma_value_t ret_value = re_compile_bytecode (&bc_p,
ecma_get_magic_string (LIT_MAGIC_STRING_EMPTY_NON_CAPTURE_GROUP),
RE_FLAG_EMPTY);
re_compiled_code_t *bc_p = re_compile_bytecode (ecma_get_magic_string (LIT_MAGIC_STRING_EMPTY_NON_CAPTURE_GROUP),
RE_FLAG_EMPTY);

JERRY_ASSERT (ecma_is_value_empty (ret_value));
JERRY_ASSERT (bc_p != NULL);

ECMA_SET_INTERNAL_VALUE_POINTER (ext_object_p->u.class_prop.u.value, bc_p);

Expand Down

0 comments on commit 8f76a1f

Please sign in to comment.