Skip to content

Commit

Permalink
Rework RegExp engine and add support for proper unicode matching
Browse files Browse the repository at this point in the history
This change includes several bugfixes, general improvements, and support
for additional features.
- Added full support for web compatibility syntax defined in Annex B
- Implemented parsing and matching patterns in unicode mode
- Fixed capture results when iterating with nested capturing groups
- Significantly reduced regexp bytecode size
- Reduced stack usage during regexp execution
- Improved matching performance

JerryScript-DCO-1.0-Signed-off-by: Dániel Bátyai <dbatyai@inf.u-szeged.hu>
  • Loading branch information
dbatyai committed May 21, 2020
1 parent 87b1d1e commit 4ea579f
Show file tree
Hide file tree
Showing 30 changed files with 3,388 additions and 2,360 deletions.
7 changes: 2 additions & 5 deletions jerry-core/api/jerry-snapshot.c
Expand Up @@ -559,18 +559,15 @@ snapshot_load_compiled_code (const uint8_t *base_addr_p, /**< base address of th
#if ENABLED (JERRY_BUILTIN_REGEXP)
if (!(bytecode_p->status_flags & CBC_CODE_FLAGS_FUNCTION))
{
const re_compiled_code_t *re_bytecode_p = NULL;

const uint8_t *regex_start_p = ((const uint8_t *) bytecode_p) + sizeof (ecma_compiled_code_t);

/* Real size is stored in refs. */
ecma_string_t *pattern_str_p = ecma_new_ecma_string_from_utf8 (regex_start_p,
bytecode_p->refs);

re_compile_bytecode (&re_bytecode_p,
pattern_str_p,
bytecode_p->status_flags);

const re_compiled_code_t *re_bytecode_p = re_compile_bytecode (pattern_str_p,
bytecode_p->status_flags);
ecma_deref_ecma_string (pattern_str_p);

return (ecma_compiled_code_t *) re_bytecode_p;
Expand Down
2 changes: 1 addition & 1 deletion jerry-core/ecma/base/ecma-gc.c
Expand Up @@ -1464,7 +1464,7 @@ ecma_gc_run (void)

#if ENABLED (JERRY_BUILTIN_REGEXP)
/* Free RegExp bytecodes stored in cache */
re_cache_gc_run ();
re_cache_gc ();
#endif /* ENABLED (JERRY_BUILTIN_REGEXP) */
} /* ecma_gc_run */

Expand Down
6 changes: 2 additions & 4 deletions jerry-core/ecma/base/ecma-helpers-string.c
Expand Up @@ -2362,8 +2362,7 @@ ecma_string_trim_helper (const lit_utf8_byte_t **utf8_str_p, /**< [in, out] curr
{
read_size = lit_read_code_unit_from_utf8 (current_p, &ch);

if (!lit_char_is_white_space (ch)
&& !lit_char_is_line_terminator (ch))
if (!lit_char_is_white_space (ch))
{
nonws_start_p = current_p;
break;
Expand All @@ -2378,8 +2377,7 @@ ecma_string_trim_helper (const lit_utf8_byte_t **utf8_str_p, /**< [in, out] curr
{
read_size = lit_read_prev_code_unit_from_utf8 (current_p, &ch);

if (!lit_char_is_white_space (ch)
&& !lit_char_is_line_terminator (ch))
if (!lit_char_is_white_space (ch))
{
break;
}
Expand Down
27 changes: 14 additions & 13 deletions jerry-core/ecma/builtin-objects/ecma-builtin-global.c
Expand Up @@ -223,13 +223,12 @@ ecma_builtin_global_object_decode_uri_helper (lit_utf8_byte_t *input_start_p, /*
continue;
}

ecma_char_t decoded_byte;

if (!lit_read_code_unit_from_hex (input_char_p + 1, 2, &decoded_byte))
if (!lit_char_hex_lookup (input_char_p + 1, input_end_p, 2))
{
return ecma_raise_uri_error (ECMA_ERR_MSG ("Invalid hexadecimal value."));
}

ecma_char_t decoded_byte = lit_read_code_unit_from_hex (input_char_p + 1, 2);
input_char_p += URI_ENCODED_BYTE_SIZE;

if (decoded_byte <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
Expand Down Expand Up @@ -272,20 +271,17 @@ ecma_builtin_global_object_decode_uri_helper (lit_utf8_byte_t *input_start_p, /*
/* Input decode. */
if (*input_char_p != '%')
{
*output_char_p = *input_char_p;
output_char_p++;
input_char_p++;
*output_char_p++ = *input_char_p++;
continue;
}

ecma_char_t decoded_byte;

if (!lit_read_code_unit_from_hex (input_char_p + 1, 2, &decoded_byte))
if (!lit_char_hex_lookup (input_char_p + 1, input_end_p, 2))
{
ret_value = ecma_raise_uri_error (ECMA_ERR_MSG ("Invalid hexadecimal value."));
break;
}

ecma_char_t decoded_byte = lit_read_code_unit_from_hex (input_char_p + 1, 2);
input_char_p += URI_ENCODED_BYTE_SIZE;

if (decoded_byte <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
Expand Down Expand Up @@ -337,17 +333,22 @@ ecma_builtin_global_object_decode_uri_helper (lit_utf8_byte_t *input_start_p, /*
}
else
{
ecma_char_t chr;
if (!lit_char_hex_lookup (input_char_p + 1, input_end_p, 2))
{
is_valid = false;
break;
}

ecma_char_t ch = lit_read_code_unit_from_hex (input_char_p + 1, 2);

if (!lit_read_code_unit_from_hex (input_char_p + 1, 2, &chr)
|| ((chr & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER))
if ((ch & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER)
{
is_valid = false;
break;
}

octets[i] = (lit_utf8_byte_t) chr;
input_char_p += URI_ENCODED_BYTE_SIZE;
octets[i] = (lit_utf8_byte_t) ch;
}
}

Expand Down
10 changes: 3 additions & 7 deletions jerry-core/ecma/builtin-objects/ecma-builtin-json.c
Expand Up @@ -174,17 +174,13 @@ ecma_builtin_json_parse_string (ecma_json_token_t *token_p) /**< token argument
}
case LIT_CHAR_LOWERCASE_U:
{
if ((end_p - current_p <= ECMA_JSON_HEX_ESCAPE_SEQUENCE_LENGTH))
{
goto invalid_string;
}

ecma_char_t code_unit;
if (!(lit_read_code_unit_from_hex (current_p + 1, ECMA_JSON_HEX_ESCAPE_SEQUENCE_LENGTH, &code_unit)))
if (!lit_char_hex_lookup (current_p + 1, end_p, ECMA_JSON_HEX_ESCAPE_SEQUENCE_LENGTH))
{
goto invalid_string;
}

const ecma_char_t code_unit = lit_read_code_unit_from_hex (current_p + 1,
ECMA_JSON_HEX_ESCAPE_SEQUENCE_LENGTH);
ecma_stringbuilder_append_char (&result_builder, code_unit);
current_p += ECMA_JSON_HEX_ESCAPE_SEQUENCE_LENGTH + 1;
break;
Expand Down
8 changes: 3 additions & 5 deletions jerry-core/ecma/builtin-objects/ecma-builtins.c
Expand Up @@ -505,12 +505,10 @@ ecma_instantiate_builtin (ecma_builtin_id_t obj_builtin_id) /**< built-in id */

ext_object_p->u.class_prop.class_id = LIT_MAGIC_STRING_REGEXP_UL;

const re_compiled_code_t *bc_p = NULL;
ecma_value_t ret_value = re_compile_bytecode (&bc_p,
ecma_get_magic_string (LIT_MAGIC_STRING_EMPTY_NON_CAPTURE_GROUP),
RE_FLAG_EMPTY);
re_compiled_code_t *bc_p = re_compile_bytecode (ecma_get_magic_string (LIT_MAGIC_STRING_EMPTY_NON_CAPTURE_GROUP),
RE_FLAG_EMPTY);

JERRY_ASSERT (ecma_is_value_empty (ret_value));
JERRY_ASSERT (bc_p != NULL);

ECMA_SET_INTERNAL_VALUE_POINTER (ext_object_p->u.class_prop.u.value, bc_p);

Expand Down

0 comments on commit 4ea579f

Please sign in to comment.