26 changes: 22 additions & 4 deletions doc/API
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
Oniguruma API Version 6.9.2 2019/03/25
Oniguruma API Version 6.9.3 2019/07/06

#include <oniguruma.h>

Expand Down Expand Up @@ -168,6 +168,9 @@ Oniguruma API Version 6.9.2 2019/03/25
# int onig_new_deluxe(regex_t** reg, const UChar* pattern, const UChar* pattern_end,
OnigCompileInfo* ci, OnigErrorInfo* einfo)

This function is deprecated, and it does not allow the pattern and
the target to use different encodings.

Create a regex object.
This function is deluxe version of onig_new().

Expand Down Expand Up @@ -299,6 +302,7 @@ Oniguruma API Version 6.9.2 2019/03/25
const UChar* range, OnigRegion* region, OnigOptionType option)

Search string and return search result and matching region.
Do not pass a byte string that is invalid in the regex character encoding.

normal return: match position offset (i.e. p - str >= 0)
not found: ONIG_MISMATCH (< 0)
Expand All @@ -323,15 +327,19 @@ Oniguruma API Version 6.9.2 2019/03/25
const UChar* start, const UChar* range, OnigRegion* region,
OnigOptionType option, OnigMatchParam* mp)

arguments
1-7: same as onig_search()
8 mp: match parameter values (match_stack_limit, retry_limit_in_match)
Search string and return search result and matching region.
Do not pass a byte string that is invalid in the regex character encoding.

arguments
1-7: same as onig_search()
8 mp: match parameter values (match_stack_limit, retry_limit_in_match)


# int onig_match(regex_t* reg, const UChar* str, const UChar* end, const UChar* at,
OnigRegion* region, OnigOptionType option)

Match string and return result and matching region.
Do not pass a byte string that is invalid in the regex character encoding.

normal return: match length (>= 0)
not match: ONIG_MISMATCH ( < 0)
Expand All @@ -353,6 +361,9 @@ Oniguruma API Version 6.9.2 2019/03/25
const UChar* at, OnigRegion* region,
OnigOptionType option, OnigMatchParam* mp)

Match string and return result and matching region.
Do not pass a byte string that is invalid in the regex character encoding.

arguments
1-6: same as onig_match()
7 mp: match parameter values (match_stack_limit, retry_limit_in_match)
Expand All @@ -364,6 +375,7 @@ Oniguruma API Version 6.9.2 2019/03/25
void* callback_arg)

Scan string and callback with matching region.
Do not pass a byte string that is invalid in the regex character encoding.

normal return: number of matching times
error: error code
Expand Down Expand Up @@ -611,14 +623,20 @@ Oniguruma API Version 6.9.2 2019/03/25


# int onigenc_strlen(OnigEncoding enc, const UChar* s, const UChar* end)

Return number of characters in the string.


# int onigenc_strlen_null(OnigEncoding enc, const UChar* s)

Return number of characters in the string.
Do not pass a byte string that is invalid in the character encoding.


# int onigenc_str_bytelen_null(OnigEncoding enc, const UChar* s)

Return number of bytes in the string.
Do not pass a byte string that is invalid in the character encoding.


# int onig_set_default_syntax(OnigSyntaxType* syntax)
Expand Down
20 changes: 19 additions & 1 deletion doc/API.ja
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
鬼車インターフェース Version 6.9.2 2019/03/29
鬼車インターフェース Version 6.9.3 2019/07/06

#include <oniguruma.h>

Expand Down Expand Up @@ -167,6 +167,9 @@
# int onig_new_deluxe(regex_t** reg, const UChar* pattern, const UChar* pattern_end,
OnigCompileInfo* ci, OnigErrorInfo* einfo)

この関数は廃止予定。
パターンと対象文字列の文字エンコーディングが異なる場合を許さなくなった。

正規表現オブジェクト(regex)を作成する。
この関数は、onig_new()のデラックス版。

Expand Down Expand Up @@ -298,6 +301,7 @@
const UChar* range, OnigRegion* region, OnigOptionType option)

正規表現で文字列を検索し、検索結果とマッチ領域を返す。
正規表現オブジェクトの文字エンコーディングで、検索文字列として不正な文字列を渡してはいけない。

正常終了戻り値: マッチ位置 (p - str >= 0)
検索失敗: ONIG_MISMATCH (< 0)
Expand All @@ -322,6 +326,9 @@
const UChar* start, const UChar* range, OnigRegion* region,
OnigOptionType option, OnigMatchParam* mp)

正規表現で文字列を検索し、検索結果とマッチ領域を返す。
正規表現オブジェクトの文字エンコーディングで、検索文字列として不正な文字列を渡してはいけない。

引数
1-7: onig_search()と同じ
8 mp: マッチパラメタ値 (match_stack_limit, retry_limit_in_match)
Expand All @@ -331,6 +338,7 @@
const UChar* at, OnigRegion* region, OnigOptionType option)

文字列の指定位置でマッチングを行い、結果とマッチ領域を返す。
正規表現オブジェクトの文字エンコーディングで、検索文字列として不正な文字列を渡してはいけない。

正常終了戻り値: マッチしたバイト長 (>= 0)
not match: ONIG_MISMATCH ( < 0)
Expand All @@ -352,6 +360,9 @@
const UChar* at, OnigRegion* region,
OnigOptionType option, OnigMatchParam* mp)

文字列の指定位置でマッチングを行い、結果とマッチ領域を返す。
正規表現オブジェクトの文字エンコーディングで、検索文字列として不正な文字列を渡してはいけない。

引数
1-6: onig_match()と同じ
7 mp: マッチパラメタ値 (match_stack_limit, retry_limit_in_match)
Expand All @@ -363,6 +374,7 @@
void* callback_arg)

正規表現で文字列をスキャンして、マッチングする毎にコールバック関数を呼び出す。
正規表現オブジェクトの文字エンコーディングで、検索文字列として不正な文字列を渡してはいけない。

正常終了: マッチ回数 (0回も含める)
エラー: エラーコード (< 0)
Expand Down Expand Up @@ -616,14 +628,20 @@


# int onigenc_strlen(OnigEncoding enc, const UChar* s, const UChar* end)

文字列の文字数を返す。


# int onigenc_strlen_null(OnigEncoding enc, const UChar* s)

文字列の文字数を返す。
文字エンコーディングに対して、不正な文字列を渡してはいけない。


# int onigenc_str_bytelen_null(OnigEncoding enc, const UChar* s)

文字列のバイト数を返す。
文字エンコーディングに対して、不正な文字列を渡してはいけない。


# int onig_set_default_syntax(OnigSyntaxType* syntax)
Expand Down
2 changes: 1 addition & 1 deletion doc/UNICODE_PROPERTIES
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
Unicode Properties (from Unicode Version: 12.0.0)
Unicode Properties (from Unicode Version: 12.1.0)

15: ASCII_Hex_Digit
16: Adlam
Expand Down
111 changes: 111 additions & 0 deletions harnesses/ascii_compatible.dict
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# First-pass fuzzing dictionary for Oniguruma by Mark Griffin
"\\o{17777777777}"
"\\777"
"\\u"
"\\uFFFF"
"\\xFF"
"\\x{70000000}"
"\\C-"
"\\M-\\C-"
"\\X"
"\\p{"
"\\p{^"
"}"
"]"
")"
"\\n"
"\\r"
"\\R"
"\\W"
"\\w"
"\\s"
"\\S"
"\\d"
"\\O"
"\\X"
"\\b"
"\\y"
"\\Y"
"\\A"
"\\z"
"\\K"
"\\G"
"\\p{Print}"
"\\p{ASCII}"
"\\p{Alnum}"
"{0,2}"
"{3,}"
"{,3}"
"{5}"
"{4,2}"
"??"
"*?"
"+?"
"*+"
"{1,3}+"
"(?>"
"\\B"
"(?y{"
"[abcd1-9]"
"[\\w\\d"
"[\\p{Alphabetic}"
"[\\P{Arabic}"
"[\\x{ffff}"
"[a-w&&"
"[^"
"[:graph:]"
"[^:cntrl:]"
"(?i:"
"(?i)"
"(?m:"
"(?x:"
"(?W:"
"(?y-:"
"(?y{w}:"
"(?P:"
"(?#"
"(?:"
"(?="
"(?!"
"(?<="
"(?<!"
"(?>"
"(?<name>"
"(?{"
"(?{....}[x])"
"(?{.}[x]>)"
"(?{{{.}}})"
"(?~"
"(?~a)"
"(?~|a|.*)"
"(?~|(?:a|b))"
"(?~|)"
"(?(.) |.)"
"(?('-n'))"
"(?(n+0))"
"(?(n+1))"
"(?(n-1))"
"(?(<name+0>))"
"(?(<name+1>))"
"(?(<name-1>))"
"(*ERROR{-2000})"
"(*COUNT[tag]{X})"
"\\1"
"\\2"
"\\k<name>"
"\\k<1>"
"\\k<2>"
"\\k<-1>"
"\\k<-2>"
"\\k<name+0>"
"\\k<name+1>"
"\\k<name-1>"
"\\g<-1>"
"\\g<name>"
"name"
"(?<name>a|b\\g<name>c)"
"(?-i:\\g<name>)"
"\\N{name}"
"\\p{Hiragana}"
"\\p{Katakana}"
"\\p{Emoji}"
239 changes: 239 additions & 0 deletions harnesses/deluxe-encode-harness.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,239 @@
/*
* deluxe-encode-harness.c
* contributed by Mark Griffin
*/
#include <stdio.h>
#include "oniguruma.h"

#include <stdlib.h>
#include <string.h>

#define DEFAULT_LIMIT 120
typedef unsigned char uint8_t;

/*
 * Search the whole of [str, end) with `reg` and print the outcome to stdout.
 *
 * Returns 0 when the search ran to completion (match or mismatch) and -1
 * when onig_search() reported an error code.
 */
static int
search(regex_t* reg, unsigned char* str, unsigned char* end)
{
  int status = 0;
  OnigRegion* region = onig_region_new();
  /* Both the start position and the search range cover the full subject. */
  int result = onig_search(reg, str, end, str, end, region, ONIG_OPTION_NONE);

  if (result >= 0) {
    int k = 0;

    fprintf(stdout, "match at %d (%s)\n", result,
            ONIGENC_NAME(onig_get_encoding(reg)));
    while (k < region->num_regs) {
      fprintf(stdout, "%d: (%d-%d)\n", k, region->beg[k], region->end[k]);
      k++;
    }
  }
  else if (result == ONIG_MISMATCH) {
    fprintf(stdout, "search fail (%s)\n",
            ONIGENC_NAME(onig_get_encoding(reg)));
  }
  else {
    /* Any other negative value is an error code. */
    char msg[ONIG_MAX_ERROR_MESSAGE_LEN];

    onig_error_code_to_str((UChar* )msg, result);
    fprintf(stdout, "ERROR: %s\n", msg);
    fprintf(stdout, " (%s)\n", ONIGENC_NAME(onig_get_encoding(reg)));
    status = -1;
  }

  /* Single release point: 1 frees the region object itself as well. */
  onig_region_free(region, 1);
  return status;
}

/*
 * Compile [apattern, apattern_end) for a single encoding and search
 * [astr, astr_end) with it, reporting results on stdout.
 *
 * The subject is searched only when it is a valid byte string for `enc`,
 * as exec_deluxe() does; onig_search() must not be handed an invalid
 * byte string.
 *
 * Returns -1 if the pattern fails to compile, 0 otherwise (the outcome of
 * the search itself is only printed, not returned).
 */
static int
exec(OnigEncoding enc, OnigOptionType options,
     char* apattern, char* apattern_end, char* astr, char* astr_end)
{
  int r;
  regex_t* reg;
  OnigErrorInfo einfo;
  UChar* pattern = (UChar* )apattern;
  UChar* str = (UChar* )astr;
  UChar* pattern_end = (UChar* )apattern_end;
  unsigned char *end = (unsigned char* )astr_end;

  onig_initialize(&enc, 1);
  /* Bound the work done per input. */
  onig_set_retry_limit_in_match(DEFAULT_LIMIT);
  onig_set_parse_depth_limit(DEFAULT_LIMIT);

  r = onig_new(&reg, pattern, pattern_end,
               options, enc, ONIG_SYNTAX_DEFAULT, &einfo);
  if (r != ONIG_NORMAL) {
    char s[ONIG_MAX_ERROR_MESSAGE_LEN];
    onig_error_code_to_str((UChar* )s, r, &einfo);
    fprintf(stdout, "ERROR: %s\n", s);
    onig_end();
    return -1;
  }

  /* Skip subjects that are not well-formed in the regex encoding. */
  if (onigenc_is_valid_mbc_string(enc, str, end) != 0) {
    (void )search(reg, str, end);
  }

  onig_free(reg);
  onig_end();
  return 0;
}

/* Case-fold flag passed to onig_new_deluxe() through OnigCompileInfo. */
static OnigCaseFoldType CF = ONIGENC_CASE_FOLD_MIN;

/*
 * Compile [apattern, apattern_end) with onig_new_deluxe(), using separate
 * pattern and target encodings, then search [astr, astr_end) if that
 * subject is a valid byte string in the target encoding.
 *
 * Returns -1 if compilation fails, 0 otherwise; search results are printed
 * by search() and not propagated.
 */
static int
exec_deluxe(OnigEncoding pattern_enc, OnigEncoding str_enc,
            OnigOptionType options, char* apattern, char* apattern_end,
            char* astr, char* astr_end)
{
  regex_t* reg;
  OnigErrorInfo einfo;
  OnigCompileInfo ci;
  UChar* subject = (UChar* )astr;
  unsigned char* subject_end = (unsigned char* )astr_end;
  int rc;

  onig_initialize(&str_enc, 1);
  onig_set_retry_limit_in_match(DEFAULT_LIMIT);
  onig_set_parse_depth_limit(DEFAULT_LIMIT);

  /* Fill in the five compile-info fields announced by num_of_elements. */
  ci.num_of_elements = 5;
  ci.pattern_enc     = pattern_enc;
  ci.target_enc      = str_enc;
  ci.syntax          = ONIG_SYNTAX_DEFAULT;
  ci.option          = options;
  ci.case_fold_flag  = CF;

  rc = onig_new_deluxe(&reg, (UChar* )apattern, (UChar* )apattern_end,
                       &ci, &einfo);
  if (rc == ONIG_NORMAL) {
    if (onigenc_is_valid_mbc_string(str_enc, subject, subject_end) != 0) {
      rc = search(reg, subject, subject_end);
    }

    onig_free(reg);
    onig_end();
    return 0;
  }

  /* Compilation failed: report and shut the library down. */
  {
    char msg[ONIG_MAX_ERROR_MESSAGE_LEN];

    onig_error_code_to_str((UChar* )msg, rc, &einfo);
    fprintf(stdout, "ERROR: %s\n", msg);
    onig_end();
    return -1;
  }
}

#define PATTERN_SIZE 48
#define NUM_CONTROL_BYTES 2
#define MIN_STR_SIZE 2
/*
 * libFuzzer entry point.
 *
 * Input layout:
 *   byte 0                  selector for the pattern encoding
 *   byte 1                  selector for the subject encoding
 *   next PATTERN_SIZE bytes the pattern
 *   remaining bytes         the subject string (at least MIN_STR_SIZE)
 *
 * Inputs that are too short, or longer than 0x1000 bytes, are ignored.
 * Returns the result of exec_deluxe(), or 0 when the input is ignored or
 * a working buffer cannot be allocated.
 */
int LLVMFuzzerTestOneInput(const uint8_t * Data, size_t Size)
{
  int r;
  size_t remaining_size;
  const unsigned char *data;
  unsigned char pat_encoding_choice;
  unsigned char str_encoding_choice;
  unsigned char *pattern;
  unsigned char *str;
  unsigned char *pattern_end;
  unsigned char *str_end;
  unsigned int num_encodings;
  OnigEncodingType *pattern_enc;
  OnigEncodingType *str_enc;

  OnigEncodingType *encodings[] = {
    ONIG_ENCODING_ASCII,
    ONIG_ENCODING_ISO_8859_1,
    ONIG_ENCODING_ISO_8859_2,
    ONIG_ENCODING_ISO_8859_3,
    ONIG_ENCODING_ISO_8859_4,
    ONIG_ENCODING_ISO_8859_5,
    ONIG_ENCODING_ISO_8859_6,
    ONIG_ENCODING_ISO_8859_7,
    ONIG_ENCODING_ISO_8859_8,
    ONIG_ENCODING_ISO_8859_9,
    ONIG_ENCODING_ISO_8859_10,
    ONIG_ENCODING_ISO_8859_11,
    ONIG_ENCODING_ISO_8859_13,
    ONIG_ENCODING_ISO_8859_14,
    ONIG_ENCODING_ISO_8859_15,
    ONIG_ENCODING_ISO_8859_16,
    ONIG_ENCODING_UTF8,
    ONIG_ENCODING_UTF16_BE,
    ONIG_ENCODING_UTF16_LE,
    ONIG_ENCODING_UTF32_BE,
    ONIG_ENCODING_UTF32_LE,
    ONIG_ENCODING_EUC_JP,
    ONIG_ENCODING_EUC_TW,
    ONIG_ENCODING_EUC_KR,
    ONIG_ENCODING_EUC_CN,
    ONIG_ENCODING_SJIS,
    //ONIG_ENCODING_KOI8,
    ONIG_ENCODING_KOI8_R,
    ONIG_ENCODING_CP1251,
    ONIG_ENCODING_BIG5,
    ONIG_ENCODING_GB18030,
  };

  /* Two selector bytes are consumed below, so require them plus a full
     pattern and MIN_STR_SIZE subject bytes. */
  if (Size < (NUM_CONTROL_BYTES + PATTERN_SIZE + MIN_STR_SIZE))
    return 0;
  if (Size > 0x1000)
    return 0;

  remaining_size = Size;
  data = Data;

  // pull off bytes to switch off
  pat_encoding_choice = data[0];
  data++;
  remaining_size--;
  str_encoding_choice = data[0];
  data++;
  remaining_size--;

  remaining_size -= PATTERN_SIZE;

  /* Both buffers are zero-filled and carry 4 spare bytes beyond the
     copied data (presumably NUL padding for wide encodings - TODO confirm). */
  pattern = (unsigned char *)calloc(1, PATTERN_SIZE + 4);
  str = (unsigned char *)calloc(1, remaining_size + 4);
  if (pattern == NULL || str == NULL) {
    free(pattern);
    free(str);
    return 0;
  }

  // copy first PATTERN_SIZE bytes off to be the pattern
  memcpy(pattern, data, PATTERN_SIZE);
  pattern_end = pattern + PATTERN_SIZE;
  data += PATTERN_SIZE;

  // everything after the pattern is the subject string
  memcpy(str, data, remaining_size);
  str_end = str + remaining_size;

  num_encodings = sizeof(encodings) / sizeof(encodings[0]);
  pattern_enc = encodings[pat_encoding_choice % num_encodings];
  str_enc = encodings[str_encoding_choice % num_encodings];

  r = exec_deluxe(pattern_enc, str_enc, ONIG_OPTION_NONE,
                  (char *)pattern, (char *)pattern_end,
                  (char *)str, (char *)str_end);

  free(pattern);
  free(str);

  return r;
}


#ifdef WITH_READ_MAIN

#include <unistd.h>

/*
 * Stand-alone driver (built only with WITH_READ_MAIN): performs a single
 * read() of up to sizeof(Data) bytes from standard input and hands them
 * to LLVMFuzzerTestOneInput().
 *
 * Returns 0 normally, 1 if reading standard input fails.
 */
extern int main(int argc, char* argv[])
{
  ssize_t n;
  uint8_t Data[10000];

  (void )argc;
  (void )argv;

  /* read() reports failure as -1; keep the signed type so it is visible. */
  n = read(0, Data, sizeof(Data));
  if (n < 0) {
    perror("read");
    return 1;
  }

  fprintf(stdout, "n: %ld\n", (long )n);
  LLVMFuzzerTestOneInput(Data, (size_t )n);

  return 0;
}
#endif /* WITH_READ_MAIN */
72 changes: 72 additions & 0 deletions harnesses/dict_conv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# -*- coding: utf-8 -*-
# dict_conv.py (Python3 script)

import sys

# Encoding selectors understood by add_char() / conv().
ENC_UTF16_BE = 1
ENC_UTF16_LE = 2


def add_char(enc, s, c):
    """Return s with the dictionary text c appended as one code unit.

    For ENC_UTF16_BE a literal backslash-x00 escape is written before c,
    for ENC_UTF16_LE it is written after c; any other enc appends c alone.
    """
    if enc == ENC_UTF16_BE:
        s += "\\x00"

    s += c
    if enc == ENC_UTF16_LE:
        s += "\\x00"

    return s


def conv(enc, s):
    """Convert the body of one quoted dictionary entry for encoding enc.

    s is the text between the surrounding double quotes.  The only escapes
    accepted are backslash-backslash and backslash-quote; each is kept as a
    single unit and passed through add_char() together with its backslash.

    Raises ValueError for any other escape, including a backslash that is
    the last character of s.
    """
    n = len(s)
    r = ""
    i = 0
    while i < n:
        c = s[i]
        if c == '\\':
            # A backslash must be followed by '\\' or '"'.
            if i + 1 >= n:
                raise ValueError("Unknown escape {0}".format(s))
            c = s[i + 1]
            if c != '\\' and c != '"':
                raise ValueError("Unknown escape {0}".format(s))
            r = add_char(enc, r, "\\" + c)
            i += 2
            continue

        r = add_char(enc, r, c)
        i += 1

    return r

def main(enc):
    """Read a fuzzing dictionary from stdin and print it converted for enc.

    A generated-file banner is printed first.  Comment lines (starting with
    '#') and blank lines are passed through; every other line must be a
    double-quoted entry, whose body is converted with conv().

    Raises ValueError for a line that is not a comment, blank, or a
    double-quoted entry.
    """
    print("# This file was generated by dict_conv.py.")
    for line in sys.stdin:
        s = line.strip()
        if not s:
            # Nothing to convert on a blank line; keep it in the output.
            print()
            continue

        if s[0] == '#':
            print(s)
            continue

        # Require both an opening and a closing quote (a lone '"' is not an entry).
        if len(s) >= 2 and s[0] == '"' and s[-1] == '"':
            s = conv(enc, s[1:-1])
            print("\"{0}\"".format(s))
        else:
            raise ValueError("Invalid format {0}".format(s))

def usage(argv):
    """Abort by raising RuntimeError with a usage line naming argv[0]."""
    message = "Usage: python {0} utf16_be/utf16_le".format(argv[0])
    raise RuntimeError(message)


if __name__ == "__main__":
    # Map the single command-line argument to an encoding selector;
    # a missing or unrecognised argument ends in usage(), which raises.
    argv = sys.argv
    known = {'utf16_be': ENC_UTF16_BE, 'utf16_le': ENC_UTF16_LE}

    if len(argv) < 2 or argv[1] not in known:
        usage(argv)

    main(known[argv[1]])
170 changes: 170 additions & 0 deletions harnesses/encode-harness.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
/*
* encode-harness.c
* contributed by Mark Griffin
*/
#include <stdio.h>
#include "oniguruma.h"

#include <stdlib.h>
#include <string.h>

#define PARSE_DEPTH_LIMIT 120
#define RETRY_LIMIT 4000

typedef unsigned char uint8_t;

/*
 * Run onig_search() across [str, end) and describe the result on stdout:
 * the match offset and every captured region, a "search fail" line, or
 * the library's error message.
 *
 * Returns -1 for an error code from onig_search(), 0 otherwise.
 */
static int
search(regex_t* reg, unsigned char* str, unsigned char* end)
{
  OnigRegion* region = onig_region_new();
  int rc = onig_search(reg, str, end, str, end, region, ONIG_OPTION_NONE);

  /* Negative values other than ONIG_MISMATCH are error codes. */
  if (rc < 0 && rc != ONIG_MISMATCH) {
    char msg[ONIG_MAX_ERROR_MESSAGE_LEN];

    onig_error_code_to_str((UChar* )msg, rc);
    fprintf(stdout, "ERROR: %s\n", msg);
    fprintf(stdout, " (%s)\n", ONIGENC_NAME(onig_get_encoding(reg)));
    onig_region_free(region, 1 /* 1:free self, 0:free contents only */);
    return -1;
  }

  if (rc >= 0) {
    int idx;

    fprintf(stdout, "match at %d (%s)\n", rc,
            ONIGENC_NAME(onig_get_encoding(reg)));
    for (idx = 0; idx < region->num_regs; idx++) {
      fprintf(stdout, "%d: (%d-%d)\n", idx,
              region->beg[idx], region->end[idx]);
    }
  }
  else {
    fprintf(stdout, "search fail (%s)\n",
            ONIGENC_NAME(onig_get_encoding(reg)));
  }

  onig_region_free(region, 1 /* 1:free self, 0:free contents only */);
  return 0;
}

/*
 * Compile [apattern, apattern_end) in encoding `enc` and, when the subject
 * [astr, end) is a valid byte string for that encoding, search it.
 *
 * Returns -1 if the pattern does not compile, 0 otherwise; the search
 * result is printed by search() and not returned.
 */
static int
exec(OnigEncoding enc, OnigOptionType options,
     char* apattern, char* apattern_end, char* astr, UChar* end)
{
  regex_t* reg;
  OnigErrorInfo einfo;
  UChar* subject = (UChar* )astr;
  int rc;

  onig_initialize(&enc, 1);
  onig_set_retry_limit_in_match(RETRY_LIMIT);
  onig_set_parse_depth_limit(PARSE_DEPTH_LIMIT);

  rc = onig_new(&reg, (UChar* )apattern, (UChar* )apattern_end,
                options, enc, ONIG_SYNTAX_DEFAULT, &einfo);
  if (rc == ONIG_NORMAL) {
    /* Only well-formed subjects are handed to the matcher. */
    if (onigenc_is_valid_mbc_string(enc, subject, end) != 0) {
      rc = search(reg, subject, end);
    }

    onig_free(reg);
    onig_end();
    return 0;
  }

  /* Compile error: print the message and release library state. */
  {
    char msg[ONIG_MAX_ERROR_MESSAGE_LEN];

    onig_error_code_to_str((UChar* )msg, rc, &einfo);
    fprintf(stdout, "ERROR: %s\n", msg);
    onig_end();
    return -1;
  }
}

#define PATTERN_SIZE 32
#define NUM_CONTROL_BYTES 1
#define MIN_STR_SIZE 1
/*
 * libFuzzer entry point.
 *
 * Input layout:
 *   byte 0                  selector for the encoding (ignored when built
 *                           with UTF16_BE or UTF16_LE defined)
 *   next PATTERN_SIZE bytes the pattern
 *   remaining bytes         the subject string
 *
 * Inputs that are too short, or longer than 0x1000 bytes, are ignored.
 * Returns the result of exec(), or 0 when the input is ignored or a
 * working buffer cannot be allocated.
 */
int LLVMFuzzerTestOneInput(const uint8_t * Data, size_t Size)
{
  if (Size <= (NUM_CONTROL_BYTES + PATTERN_SIZE + MIN_STR_SIZE))
    return 0;
  if (Size > 0x1000)
    return 0;

  unsigned char *pattern_end;
  unsigned char *str_null_end;

  size_t remaining_size = Size;
  const unsigned char *data = Data;

  // pull off one byte to switch off
  unsigned char encoding_choice = data[0];
  data++;
  remaining_size--;

  remaining_size -= PATTERN_SIZE;

  /* Both buffers are zero-filled and carry 4 spare bytes beyond the
     copied data (presumably NUL padding for wide encodings - TODO confirm). */
  unsigned char *pattern = (unsigned char *)calloc(1, PATTERN_SIZE + 4);
  unsigned char *str = (unsigned char *)calloc(1, remaining_size + 4);
  if (pattern == NULL || str == NULL) {
    free(pattern);
    free(str);
    return 0;
  }

  // copy first PATTERN_SIZE bytes off to be the pattern
  memcpy(pattern, data, PATTERN_SIZE);
  pattern_end = pattern + PATTERN_SIZE;
  data += PATTERN_SIZE;

  // everything after the pattern is the subject string
  memcpy(str, data, remaining_size);
  str_null_end = str + remaining_size;

  int r;
  OnigEncodingType *encodings[] = {
    ONIG_ENCODING_SJIS,
    ONIG_ENCODING_EUC_JP,
    ONIG_ENCODING_CP1251,
    ONIG_ENCODING_ISO_8859_1,
    ONIG_ENCODING_UTF8,
    ONIG_ENCODING_KOI8_R,
    ONIG_ENCODING_BIG5
  };

  OnigEncodingType *enc;

#ifdef UTF16_BE
  enc = ONIG_ENCODING_UTF16_BE;
#else
#ifdef UTF16_LE
  enc = ONIG_ENCODING_UTF16_LE;
#else
  int num_encodings = sizeof(encodings)/sizeof(encodings[0]);
  enc = encodings[encoding_choice % num_encodings];
#endif
#endif

  r = exec(enc, ONIG_OPTION_NONE, (char *)pattern, (char *)pattern_end,
           (char *)str, str_null_end);

  free(pattern);
  free(str);

  return r;
}

#ifdef WITH_READ_MAIN

#include <unistd.h>

/*
 * Stand-alone driver (built only with WITH_READ_MAIN): performs a single
 * read() of up to sizeof(Data) bytes from standard input and hands them
 * to LLVMFuzzerTestOneInput().
 *
 * Returns 0 normally, 1 if reading standard input fails.
 */
extern int main(int argc, char* argv[])
{
  ssize_t n;
  uint8_t Data[10000];

  (void )argc;
  (void )argv;

  /* read() reports failure as -1; keep the signed type so it is visible. */
  n = read(0, Data, sizeof(Data));
  if (n < 0) {
    perror("read");
    return 1;
  }

  fprintf(stdout, "n: %ld\n", (long )n);
  LLVMFuzzerTestOneInput(Data, (size_t )n);

  return 0;
}
#endif /* WITH_READ_MAIN */
120 changes: 120 additions & 0 deletions harnesses/syntax-harness.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
/*
* syntax-harness.c
* contributed by Mark Griffin
*/
#include <stdio.h>
#include <string.h>
#include "oniguruma.h"

#include <stdlib.h>

#define DEFAULT_LIMIT 120
typedef unsigned char uint8_t;

/*
 * Compile the NUL-terminated `apattern` as ASCII under `syntax`, search
 * the NUL-terminated `astr` with it, and print the outcome to stdout.
 *
 * Returns -1 if compilation fails or onig_search() reports an error,
 * 0 on match or mismatch.
 */
extern int exec(OnigSyntaxType* syntax, char* apattern, char* astr)
{
  regex_t* reg;
  OnigErrorInfo einfo;
  OnigRegion* region;
  UChar* pat = (UChar* )apattern;
  UChar* subject = (UChar* )astr;
  UChar* subject_end;
  int status = 0;
  int rc;

  rc = onig_new(&reg, pat, pat + strlen((char* )pat),
                ONIG_OPTION_DEFAULT, ONIG_ENCODING_ASCII, syntax, &einfo);
  if (rc != ONIG_NORMAL) {
    char msg[ONIG_MAX_ERROR_MESSAGE_LEN];

    onig_error_code_to_str((UChar* )msg, rc, &einfo);
    fprintf(stdout, "ERROR: %s\n", msg);
    return -1;
  }

  region = onig_region_new();
  subject_end = subject + strlen((char* )subject);

  /* Start at the beginning and allow matches anywhere up to the end. */
  rc = onig_search(reg, subject, subject_end, subject, subject_end,
                   region, ONIG_OPTION_NONE);
  if (rc >= 0) {
    int k = 0;

    fprintf(stdout, "match at %d\n", rc);
    while (k < region->num_regs) {
      fprintf(stdout, "%d: (%d-%d)\n", k, region->beg[k], region->end[k]);
      k++;
    }
  }
  else if (rc == ONIG_MISMATCH) {
    fprintf(stdout, "search fail\n");
  }
  else {
    /* Any other negative value is an error code. */
    char msg[ONIG_MAX_ERROR_MESSAGE_LEN];

    onig_error_code_to_str((UChar* )msg, rc);
    fprintf(stdout, "ERROR: %s\n", msg);
    status = -1;
  }

  /* Common cleanup for every path that got past compilation. */
  onig_region_free(region, 1 /* 1:free self, 0:free contents only */);
  onig_free(reg);
  return status;
}

#define PATTERN_SIZE 64
#define NUM_CONTROL_BYTES 1
#define MIN_STR_SIZE 1
/*
 * libFuzzer entry point.
 *
 * Input layout:
 *   byte 0                  selector for the regex syntax
 *   next PATTERN_SIZE bytes the pattern
 *   remaining bytes         the subject string
 *
 * Pattern and subject are copied into zero-filled buffers one byte larger
 * than the data, so both are NUL-terminated for exec(), which measures
 * them with strlen().
 *
 * Always returns 0.
 */
int LLVMFuzzerTestOneInput(const uint8_t * Data, size_t Size)
{
  if (Size <= (NUM_CONTROL_BYTES + PATTERN_SIZE + MIN_STR_SIZE))
    return 0;
  if (Size > 0x1000)
    return 0;
  size_t remaining_size = Size;
  const unsigned char *data = Data;

  // pull off one byte to switch syntax choice
  unsigned char syntax_choice = data[0];
  data++;
  remaining_size--;

  remaining_size -= PATTERN_SIZE;

  unsigned char *pattern = (unsigned char *)calloc(1, PATTERN_SIZE + 1);
  unsigned char *str = (unsigned char *)calloc(1, remaining_size + 1);
  if (pattern == NULL || str == NULL) {
    free(pattern);
    free(str);
    return 0;
  }

  // copy first PATTERN_SIZE bytes off to be the pattern
  memcpy(pattern, data, PATTERN_SIZE);
  data += PATTERN_SIZE;

  // everything after the pattern is the subject string
  memcpy(str, data, remaining_size);

  OnigEncoding use_encs[] = { ONIG_ENCODING_ASCII };
  onig_initialize(use_encs, sizeof(use_encs)/sizeof(use_encs[0]));

  onig_set_retry_limit_in_match(DEFAULT_LIMIT);
  onig_set_parse_depth_limit(DEFAULT_LIMIT);

  OnigSyntaxType *syntaxes[] = {
    ONIG_SYNTAX_POSIX_EXTENDED,
    ONIG_SYNTAX_EMACS,
    ONIG_SYNTAX_GREP,
    ONIG_SYNTAX_GNU_REGEX,
    ONIG_SYNTAX_JAVA,
    ONIG_SYNTAX_PERL_NG,
    ONIG_SYNTAX_RUBY,
    ONIG_SYNTAX_ONIGURUMA,
  };
  /* Derive the modulus from the table so the two cannot drift apart. */
  size_t num_syntaxes = sizeof(syntaxes) / sizeof(syntaxes[0]);
  OnigSyntaxType *syntax = syntaxes[syntax_choice % num_syntaxes];

  /* The outcome is printed by exec(); its status is not propagated. */
  (void )exec(syntax, (char *)pattern, (char *)str);

  onig_end();

  free(pattern);
  free(str);

  return 0;
}
4 changes: 3 additions & 1 deletion index.html
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,16 @@
<h1>Oniguruma</h1> (<a href="index_ja.html">Japanese</a>)

<p>
(c) K.Kosako, updated at: 2018/12/06
(c) K.Kosako, updated at: 2019/08/05
</p>

<dl>
<font color="orange">
<dt><b>What's new</b>
</font>
<ul>
<li>2019/08/06: Version 6.9.3 released.</li>
<li>2019/05/07: Version 6.9.2 released.</li>
<li>2018/12/11: Version 6.9.1 released.</li>
<li>2018/09/03: Version 6.9.0 released.</li>
<li>2018/04/17: Version 6.8.2 released.</li>
Expand Down
4 changes: 3 additions & 1 deletion index_ja.html
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,16 @@
<h1>鬼車</h1>

<p>
(c) K.Kosako, 最終更新: 2018/12/06
(c) K.Kosako, 最終更新: 2019/08/05
</p>

<dl>
<font color="orange">
<dt><b>更新情報</b>
</font>
<ul>
<li>2019/08/06: Version 6.9.3 リリース</li>
<li>2019/05/07: Version 6.9.2 リリース</li>
<li>2018/12/11: Version 6.9.1 リリース</li>
<li>2018/09/03: Version 6.9.0 リリース</li>
<li>2018/04/17: Version 6.8.2 リリース</li>
Expand Down
56 changes: 9 additions & 47 deletions sample/bug_fix.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@
#include <stdio.h>
#include "oniguruma.h"

static OnigCaseFoldType CF = ONIGENC_CASE_FOLD_MIN;

static int
search(regex_t* reg, unsigned char* str, unsigned char* end)
{
Expand Down Expand Up @@ -36,52 +34,14 @@ search(regex_t* reg, unsigned char* str, unsigned char* end)
onig_error_code_to_str((UChar* )s, r);
fprintf(stderr, "ERROR: %s\n", s);
fprintf(stderr, " (%s)\n", ONIGENC_NAME(onig_get_encoding(reg)));
onig_region_free(region, 1 /* 1:free self, 0:free contents only */);
return -1;
}

onig_region_free(region, 1 /* 1:free self, 0:free contents only */);
return 0;
}

static int
exec_deluxe(OnigEncoding pattern_enc, OnigEncoding str_enc,
OnigOptionType options, char* apattern, char* astr)
{
int r;
unsigned char *end;
regex_t* reg;
OnigCompileInfo ci;
OnigErrorInfo einfo;
UChar* pattern = (UChar* )apattern;
UChar* str = (UChar* )astr;

onig_initialize(&str_enc, 1);

ci.num_of_elements = 5;
ci.pattern_enc = pattern_enc;
ci.target_enc = str_enc;
ci.syntax = ONIG_SYNTAX_DEFAULT;
ci.option = options;
ci.case_fold_flag = CF;

r = onig_new_deluxe(&reg, pattern,
pattern + onigenc_str_bytelen_null(pattern_enc, pattern),
&ci, &einfo);
if (r != ONIG_NORMAL) {
char s[ONIG_MAX_ERROR_MESSAGE_LEN];
onig_error_code_to_str((UChar* )s, r, &einfo);
fprintf(stderr, "ERROR: %s\n", s);
return -1;
}

end = str + onigenc_str_bytelen_null(str_enc, str);
r = search(reg, str, end);

onig_free(reg);
onig_end();
return 0;
}

static int
exec(OnigEncoding enc, OnigOptionType options, char* apattern, char* astr)
{
Expand All @@ -92,8 +52,6 @@ exec(OnigEncoding enc, OnigOptionType options, char* apattern, char* astr)
UChar* pattern = (UChar* )apattern;
UChar* str = (UChar* )astr;

onig_initialize(&enc, 1);

r = onig_new(&reg, pattern,
pattern + onigenc_str_bytelen_null(enc, pattern),
options, enc, ONIG_SYNTAX_DEFAULT, &einfo);
Expand All @@ -108,24 +66,28 @@ exec(OnigEncoding enc, OnigOptionType options, char* apattern, char* astr)
r = search(reg, str, end);

onig_free(reg);
onig_end();
return 0;
}



extern int main(int argc, char* argv[])
{
OnigEncoding use_encs[1];

use_encs[0] = ONIG_ENCODING_UTF8;
onig_initialize(use_encs, 1);

/* fix ignore case in look-behind
commit: 3340ec2cc5627172665303fe248c9793354d2251 */
exec_deluxe(ONIG_ENCODING_UTF8, ONIG_ENCODING_UTF8,
ONIG_OPTION_IGNORECASE,
"(?<=\305\211)a", "\312\274na"); /* \u{0149}a \u{02bc}na */
exec(ONIG_ENCODING_UTF8, ONIG_OPTION_IGNORECASE,
"(?<=\305\211)a", "\312\274na"); /* \u{0149}a \u{02bc}na */

exec(ONIG_ENCODING_UTF8, ONIG_OPTION_NONE, "(\\2)(\\1)", "aa"); /* fail. */

exec(ONIG_ENCODING_UTF8, ONIG_OPTION_FIND_LONGEST,
"a*", "aa aaa aaaa aaaaa "); /* match 12-17 */

onig_end();
return 0;
}
2 changes: 2 additions & 0 deletions sample/crnl.c
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ x(int no, char* pattern_arg, char* str_arg,
char s[ONIG_MAX_ERROR_MESSAGE_LEN];
onig_error_code_to_str(s, r);
fprintf(stderr, "ERROR: %s\n", s);
onig_region_free(region, 1 /* 1:free self, 0:free contents only */);
onig_free(reg);
return -1;
}

Expand Down
142 changes: 24 additions & 118 deletions sample/encode.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ search(regex_t* reg, unsigned char* str, unsigned char* end)
onig_error_code_to_str((UChar* )s, r);
fprintf(stderr, "ERROR: %s\n", s);
fprintf(stderr, " (%s)\n", ONIGENC_NAME(onig_get_encoding(reg)));
onig_region_free(region, 1 /* 1:free self, 0:free contents only */);
return -1;
}

Expand Down Expand Up @@ -72,55 +73,6 @@ exec(OnigEncoding enc, OnigOptionType options,
return 0;
}

static OnigCaseFoldType CF = ONIGENC_CASE_FOLD_MIN;

#if 0
static void
set_case_fold(OnigCaseFoldType cf)
{
CF = cf;
}
#endif

static int
exec_deluxe(OnigEncoding pattern_enc, OnigEncoding str_enc,
OnigOptionType options, char* apattern, char* astr)
{
int r;
unsigned char *end;
regex_t* reg;
OnigCompileInfo ci;
OnigErrorInfo einfo;
UChar* pattern = (UChar* )apattern;
UChar* str = (UChar* )astr;

onig_initialize(&str_enc, 1);

ci.num_of_elements = 5;
ci.pattern_enc = pattern_enc;
ci.target_enc = str_enc;
ci.syntax = ONIG_SYNTAX_DEFAULT;
ci.option = options;
ci.case_fold_flag = CF;

r = onig_new_deluxe(&reg, pattern,
pattern + onigenc_str_bytelen_null(pattern_enc, pattern),
&ci, &einfo);
if (r != ONIG_NORMAL) {
char s[ONIG_MAX_ERROR_MESSAGE_LEN];
onig_error_code_to_str((UChar* )s, r, &einfo);
fprintf(stderr, "ERROR: %s\n", s);
return -1;
}

end = str + onigenc_str_bytelen_null(str_enc, str);
r = search(reg, str, end);

onig_free(reg);
onig_end();
return 0;
}

extern int main(int argc, char* argv[])
{
int r;
Expand Down Expand Up @@ -196,39 +148,6 @@ extern int main(int argc, char* argv[])
r = exec(ONIG_ENCODING_ISO_8859_1, ONIG_OPTION_IGNORECASE,
"is", "iss");

r = exec_deluxe(ONIG_ENCODING_ASCII, ONIG_ENCODING_UTF16_BE,
ONIG_OPTION_NONE, "a+",
"\000b\000a\000a\000a\000c\000c\000\000");

r = exec_deluxe(ONIG_ENCODING_ASCII, ONIG_ENCODING_UTF16_LE,
ONIG_OPTION_NONE, "a+",
"b\000a\000a\000a\000a\000c\000\000\000");

r = exec_deluxe(ONIG_ENCODING_UTF16_BE, ONIG_ENCODING_UTF16_LE,
ONIG_OPTION_NONE,
"\000b\000a\000a\000a\000c\000c\000\000",
"x\000b\000a\000a\000a\000c\000c\000\000\000");

r = exec_deluxe(ONIG_ENCODING_ISO_8859_1, ONIG_ENCODING_UTF16_BE,
ONIG_OPTION_IGNORECASE,
"\337", "\000S\000S\000\000");

r = exec_deluxe(ONIG_ENCODING_ISO_8859_1, ONIG_ENCODING_UTF16_BE,
ONIG_OPTION_IGNORECASE,
"SS", "\000\337\000\000");

r = exec_deluxe(ONIG_ENCODING_ISO_8859_1, ONIG_ENCODING_UTF16_LE,
ONIG_OPTION_IGNORECASE,
"\337", "S\000S\000\000\000");

r = exec_deluxe(ONIG_ENCODING_ISO_8859_1, ONIG_ENCODING_UTF32_BE,
ONIG_OPTION_IGNORECASE,
"SS", "\000\000\000\337\000\000\000\000");

r = exec_deluxe(ONIG_ENCODING_ISO_8859_1, ONIG_ENCODING_UTF32_LE,
ONIG_OPTION_IGNORECASE,
"\337", "S\000\000\000S\000\000\000\000\000\000\000");

r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_NONE,
"\000[\000[\000:\000a\000l\000n\000u\000m\000:\000]\000]\000+\000\000",
"\000#\002\120\000a\000Z\012\077\012\076\012\075\000\000");
Expand All @@ -242,62 +161,49 @@ extern int main(int argc, char* argv[])
r = exec(ONIG_ENCODING_GB18030, ONIG_OPTION_IGNORECASE,
"(Aa\\d)+", "BaA5Aa0234");

r = exec_deluxe(ONIG_ENCODING_ISO_8859_1, ONIG_ENCODING_UTF16_BE,
ONIG_OPTION_NONE,
"^\\P{Hiragana}\\p{^Hiragana}(\\p{Hiragana}+)$",
"\060\100\060\240\060\101\060\102\060\226\060\237\000\000");

r = exec_deluxe(ONIG_ENCODING_UTF16_BE, ONIG_ENCODING_UTF16_BE,
ONIG_OPTION_IGNORECASE,
"\000[\000\337\000]\000\000", "\000S\000S\000\000");
r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_IGNORECASE,
"\000[\000\337\000]\000\000", "\000S\000S\000\000");

r = exec_deluxe(ONIG_ENCODING_UTF16_BE, ONIG_ENCODING_UTF16_BE,
ONIG_OPTION_IGNORECASE,
"\000[\000\337\000]\000\000", "\000s\000S\000\000");
r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_IGNORECASE,
"\000[\000\337\000]\000\000", "\000s\000S\000\000");

r = exec_deluxe(ONIG_ENCODING_UTF16_BE, ONIG_ENCODING_UTF16_BE,
ONIG_OPTION_IGNORECASE,
"\000^\000[\000\001\000-\377\375\000]\000$\000\000",
"\000s\000S\000\000");
r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_IGNORECASE,
"\000^\000[\000\001\000-\377\375\000]\000$\000\000",
"\000s\000S\000\000");

r = exec_deluxe(ONIG_ENCODING_UTF16_BE, ONIG_ENCODING_UTF16_BE,
ONIG_OPTION_IGNORECASE,
"\000S\000S\000\000",
"\000S\000T\000\337\000\000");
r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_IGNORECASE,
"\000S\000S\000\000",
"\000S\000T\000\337\000\000");

r = exec_deluxe(ONIG_ENCODING_UTF16_BE, ONIG_ENCODING_UTF16_BE,
ONIG_OPTION_IGNORECASE,
"\000S\000T\000S\000S\000\000",
"\000S\000t\000s\000S\000\000");
r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_IGNORECASE,
"\000S\000T\000S\000S\000\000",
"\000S\000t\000s\000S\000\000");

{
UChar pat[] = { 0x1f, 0xfc, 0x00, 0x00 };
UChar str1[] = { 0x21, 0x26, 0x1f, 0xbe, 0x00, 0x00 };
UChar str2[] = { 0x1f, 0xf3, 0x00, 0x00 };

r = exec_deluxe(ONIG_ENCODING_UTF16_BE, ONIG_ENCODING_UTF16_BE,
ONIG_OPTION_IGNORECASE, (char* )pat, (char* )str1);
r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_IGNORECASE,
(char* )pat, (char* )str1);

r = exec_deluxe(ONIG_ENCODING_UTF16_BE, ONIG_ENCODING_UTF16_BE,
ONIG_OPTION_IGNORECASE, (char* )pat, (char* )str2);
r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_IGNORECASE,
(char* )pat, (char* )str2);
}

#if 0
/* You should define USE_UNICODE_CASE_FOLD_TURKISH_AZERI in regenc.h. */

set_case_fold(ONIGENC_CASE_FOLD_TURKISH_AZERI);

r = exec_deluxe(ONIG_ENCODING_UTF8, ONIG_ENCODING_UTF8,
ONIG_OPTION_IGNORECASE,
"Ii", "\304\261\304\260");
r = exec(ONIG_ENCODING_UTF8, ONIG_ENCODING_UTF8, ONIG_OPTION_IGNORECASE,
"Ii", "\304\261\304\260");

r = exec_deluxe(ONIG_ENCODING_UTF16_BE, ONIG_ENCODING_UTF16_BE,
ONIG_OPTION_IGNORECASE,
"\000I\000i\000\000", "\001\061\001\060\000\000");
r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_IGNORECASE,
"\000I\000i\000\000", "\001\061\001\060\000\000");

r = exec_deluxe(ONIG_ENCODING_UTF16_BE, ONIG_ENCODING_UTF16_BE,
ONIG_OPTION_IGNORECASE,
"\001\061\001\060\000\000", "\000I\000i\000\000");
r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_IGNORECASE,
"\001\061\001\060\000\000", "\000I\000i\000\000");

set_case_fold(ONIGENC_CASE_FOLD_MIN);
#endif
Expand Down
2 changes: 2 additions & 0 deletions sample/listcap.c
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ extern int ex(unsigned char* str, unsigned char* pattern,
else { /* error */
char s[ONIG_MAX_ERROR_MESSAGE_LEN];
onig_error_code_to_str((UChar* )s, r);
onig_region_free(region, 1 /* 1:free self, 0:free contents only */);
onig_free(reg);
return -1;
}

Expand Down
3 changes: 3 additions & 0 deletions sample/names.c
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,9 @@ extern int main(int argc, char* argv[])
else { /* error */
char s[ONIG_MAX_ERROR_MESSAGE_LEN];
onig_error_code_to_str((UChar* )s, r);
onig_region_free(region, 1 /* 1:free self, 0:free contents only */);
onig_free(reg);
onig_end();
return -1;
}

Expand Down
5 changes: 5 additions & 0 deletions sample/posix.c
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ extern int main(int argc, char* argv[])
regerror(r, &reg, buf, sizeof(buf));
fprintf(stderr, "ERROR: %s\n", buf);
regfree(&reg);
onig_end();
return -1;
}
x(&reg, pattern, (UChar* )"aaabbbbd");
Expand All @@ -60,6 +61,7 @@ extern int main(int argc, char* argv[])
regerror(r, &reg, buf, sizeof(buf));
fprintf(stderr, "ERROR: %s\n", buf);
regfree(&reg);
onig_end();
return -1;
}
x(&reg, pattern, (UChar* )"a+b{2,7}d?|uuu");
Expand All @@ -71,6 +73,7 @@ extern int main(int argc, char* argv[])
regerror(r, &reg, buf, sizeof(buf));
fprintf(stderr, "ERROR: %s\n", buf);
regfree(&reg);
onig_end();
return -1;
}
x(&reg, pattern, (UChar* )"aaaabbbbbbd");
Expand All @@ -83,6 +86,7 @@ extern int main(int argc, char* argv[])
regerror(r, &reg, buf, sizeof(buf));
fprintf(stderr, "ERROR: %s\n", buf);
regfree(&reg);
onig_end();
return -1;
}
x(&reg, pattern, (UChar* )"aaabbbbd)");
Expand All @@ -93,6 +97,7 @@ extern int main(int argc, char* argv[])
regerror(r, &reg, buf, sizeof(buf));
fprintf(stderr, "ERROR: %s\n", buf);
regfree(&reg);
onig_end();
return -1;
}
x(&reg, pattern, (UChar* )"a\nb\n");
Expand Down
2 changes: 2 additions & 0 deletions sample/scan.c
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ scan(regex_t* reg, unsigned char* str, unsigned char* end)
char s[ONIG_MAX_ERROR_MESSAGE_LEN];
onig_error_code_to_str((OnigUChar* )s, r);
fprintf(stderr, "ERROR: %s\n", s);
onig_region_free(region, 1 /* 1:free self, 0:free contents only */);
return -1;
}

Expand Down Expand Up @@ -63,6 +64,7 @@ exec(OnigEncoding enc, OnigOptionType options, char* apattern, char* astr)
char s[ONIG_MAX_ERROR_MESSAGE_LEN];
onig_error_code_to_str((OnigUChar* )s, r, &einfo);
fprintf(stderr, "ERROR: %s\n", s);
onig_end();
return -1;
}

Expand Down
3 changes: 3 additions & 0 deletions sample/simple.c
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ extern int main(int argc, char* argv[])
char s[ONIG_MAX_ERROR_MESSAGE_LEN];
onig_error_code_to_str((UChar* )s, r);
fprintf(stderr, "ERROR: %s\n", s);
onig_region_free(region, 1 /* 1:free self, 0:free contents only */);
onig_free(reg);
onig_end();
return -1;
}

Expand Down
4 changes: 4 additions & 0 deletions sample/sql.c
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ extern int main(int argc, char* argv[])
char s[ONIG_MAX_ERROR_MESSAGE_LEN];
onig_error_code_to_str((UChar* )s, r, &einfo);
fprintf(stderr, "ERROR: %s\n", s);
onig_end();
return -1;
}

Expand All @@ -66,6 +67,9 @@ extern int main(int argc, char* argv[])
char s[ONIG_MAX_ERROR_MESSAGE_LEN];
onig_error_code_to_str((UChar* )s, r);
fprintf(stderr, "ERROR: %s\n", s);
onig_region_free(region, 1 /* 1:free self, 0:free contents only */);
onig_free(reg);
onig_end();
return -1;
}

Expand Down
2 changes: 2 additions & 0 deletions sample/syntax.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ extern int exec(OnigSyntaxType* syntax, char* apattern, char* astr)
char s[ONIG_MAX_ERROR_MESSAGE_LEN];
onig_error_code_to_str((UChar* )s, r);
fprintf(stderr, "ERROR: %s\n", s);
onig_region_free(region, 1 /* 1:free self, 0:free contents only */);
onig_free(reg);
return -1;
}

Expand Down
5 changes: 5 additions & 0 deletions sample/user_property.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ main(int argc, char* argv[])
char s[ONIG_MAX_ERROR_MESSAGE_LEN];
onig_error_code_to_str((UChar* )s, r);
fprintf(stderr, "ERROR: %s\n", s);
onig_end();
return -1;
}

Expand All @@ -52,6 +53,7 @@ main(int argc, char* argv[])
char s[ONIG_MAX_ERROR_MESSAGE_LEN];
onig_error_code_to_str((UChar* )s, r, &einfo);
fprintf(stderr, "onig_new: ERROR: %s\n", s);
onig_end();
return -1;
}

Expand All @@ -76,6 +78,9 @@ main(int argc, char* argv[])
char s[ONIG_MAX_ERROR_MESSAGE_LEN];
onig_error_code_to_str((UChar* )s, r);
fprintf(stderr, "ERROR: %s\n", s);
onig_region_free(region, 1 /* 1:free self, 0:free contents only */);
onig_free(reg);
onig_end();
return -1;
}

Expand Down
6 changes: 3 additions & 3 deletions src/gb18030.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
gb18030.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
* Copyright (c) 2005-2018 KUBO Takehiro <kubo AT jiubao DOT org>
* Copyright (c) 2005-2019 KUBO Takehiro <kubo AT jiubao DOT org>
* K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
* All rights reserved.
*
Expand Down Expand Up @@ -67,11 +67,11 @@ gb18030_mbc_enc_len(const UChar* p)
{
/*
 * Return the byte length of the character starting at p, decided from the
 * GB18030_MAP class of the first byte and, when needed, the second byte.
 * NOTE(review): no end pointer is available here, so the second byte is
 * read unconditionally -- callers presumably guarantee it is readable.
 */

/* First byte is not class CM: the character is a single byte. */
if (GB18030_MAP[*p] != CM)
return 1;

/* Otherwise the class of the following byte decides the length. */
p++;
if (GB18030_MAP[*p] == C4)
return 4;
if (GB18030_MAP[*p] == C1)
return 1; /* illegal sequence */

/* Any other second-byte class: two-byte character. */
return 2;
}

Expand Down
11 changes: 9 additions & 2 deletions src/oniguruma.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,9 @@ extern "C" {
#define ONIGURUMA
#define ONIGURUMA_VERSION_MAJOR 6
#define ONIGURUMA_VERSION_MINOR 9
#define ONIGURUMA_VERSION_TEENY 2
#define ONIGURUMA_VERSION_TEENY 3

#define ONIGURUMA_VERSION_INT 60902
#define ONIGURUMA_VERSION_INT 60903

#ifndef P_
#if defined(__STDC__) || defined(_WIN32)
Expand All @@ -52,6 +52,7 @@ extern "C" {
# define PV_(args) args
#endif

#ifndef ONIG_STATIC
#ifndef ONIG_EXTERN
#if defined(_WIN32) && !defined(__GNUC__)
#if defined(ONIGURUMA_EXPORT)
Expand All @@ -65,6 +66,9 @@ extern "C" {
#ifndef ONIG_EXTERN
#define ONIG_EXTERN extern
#endif
#else
#define ONIG_EXTERN extern
#endif

/* PART: character encoding */

Expand Down Expand Up @@ -517,6 +521,7 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax;
#define ONIG_SYN_BACKSLASH_ESCAPE_IN_CC (1U<<21) /* [..\w..] etc.. */
#define ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC (1U<<22)
#define ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC (1U<<23) /* [0-9-a]=[0-9\-a] */
#define ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE_IN_CC (1U<<26)
/* syntax (behavior) warning */
#define ONIG_SYN_WARN_CC_OP_NOT_ESCAPED (1U<<24) /* [,-,] */
#define ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT (1U<<25) /* (?:a*)+ */
Expand Down Expand Up @@ -766,6 +771,8 @@ int onig_init P_((void));
ONIG_EXTERN
int onig_error_code_to_str PV_((OnigUChar* s, int err_code, ...));
ONIG_EXTERN
int onig_is_error_code_needs_param PV_((int code));
ONIG_EXTERN
void onig_set_warn_func P_((OnigWarnFunc f));
ONIG_EXTERN
void onig_set_verb_warn_func P_((OnigWarnFunc f));
Expand Down
156 changes: 93 additions & 63 deletions src/regcomp.c

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions src/regenc.c
Original file line number Diff line number Diff line change
Expand Up @@ -853,6 +853,8 @@ onigenc_mbn_is_mbc_ambiguous(OnigEncoding enc, OnigCaseFoldType flag,
/*
 * Number of bytes needed to store `code` in an encoding whose characters
 * are at most two bytes long.
 *
 * Returns ONIGERR_INVALID_CODE_POINT_VALUE when any bit above the low
 * 16 bits is set, 2 when bits 8..15 are non-zero, and 1 otherwise.
 */
extern int
onigenc_mb2_code_to_mbclen(OnigCodePoint code)
{
  /* Anything that does not fit in 16 bits cannot be encoded. */
  if ((code & (~0xffff)) != 0) {
    return ONIGERR_INVALID_CODE_POINT_VALUE;
  }

  /* A non-zero high byte means the value occupies two bytes. */
  return ((code & 0xff00) != 0) ? 2 : 1;
}
Expand Down
17 changes: 17 additions & 0 deletions src/regerror.c
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,23 @@ static int to_ascii(OnigEncoding enc, UChar *s, UChar *end,
}


/*
 * Tell whether `code` is one of the error codes listed below.
 *
 * Returns 1 for the listed name/group/property errors and 0 for every
 * other value.
 */
extern int
onig_is_error_code_needs_param(int code)
{
  int needs_param = 0;

  switch (code) {
  case ONIGERR_UNDEFINED_NAME_REFERENCE:
  case ONIGERR_UNDEFINED_GROUP_REFERENCE:
  case ONIGERR_MULTIPLEX_DEFINED_NAME:
  case ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL:
  case ONIGERR_INVALID_GROUP_NAME:
  case ONIGERR_INVALID_CHAR_IN_GROUP_NAME:
  case ONIGERR_INVALID_CHAR_PROPERTY_NAME:
    needs_param = 1;
    break;

  default:
    /* every other code: leave needs_param at 0 */
    break;
  }

  return needs_param;
}

/* for ONIG_MAX_ERROR_MESSAGE_LEN */
#define MAX_ERROR_PAR_LEN 30

Expand Down
130 changes: 90 additions & 40 deletions src/regexec.c
Original file line number Diff line number Diff line change
Expand Up @@ -980,6 +980,8 @@ onig_region_copy(OnigRegion* to, OnigRegion* from)
#define STK_CALL_FRAME 0x0400
#define STK_RETURN 0x0500
#define STK_SAVE_VAL 0x0600
#define STK_PREC_READ_START 0x0700
#define STK_PREC_READ_END 0x0800

/* stack type check mask */
#define STK_MASK_POP_USED STK_ALT_FLAG
Expand Down Expand Up @@ -1544,8 +1546,8 @@ stack_double(int is_alloca, char** arg_alloc_base,

#define STACK_PUSH_ALT(pat,s,sprev) STACK_PUSH(STK_ALT,pat,s,sprev)
#define STACK_PUSH_SUPER_ALT(pat,s,sprev) STACK_PUSH(STK_SUPER_ALT,pat,s,sprev)
#define STACK_PUSH_POS(s,sprev) \
STACK_PUSH(STK_TO_VOID_START,(Operation* )0,s,sprev)
#define STACK_PUSH_PREC_READ_START(s,sprev) \
STACK_PUSH(STK_PREC_READ_START,(Operation* )0,s,sprev)
#define STACK_PUSH_ALT_PREC_READ_NOT(pat,s,sprev) \
STACK_PUSH(STK_ALT_PREC_READ_NOT,pat,s,sprev)
#define STACK_PUSH_TO_VOID_START STACK_PUSH_TYPE(STK_TO_VOID_START)
Expand Down Expand Up @@ -1887,6 +1889,27 @@ stack_double(int is_alloca, char** arg_alloc_base,
}\
} while(0)

/*
 * STACK_GET_PREC_READ_START(k)
 *
 * Walk the match stack downward from `stk` and leave `k` pointing at the
 * STK_PREC_READ_START entry that belongs to the current nesting level.
 *
 * While walking:
 *   - every entry for which IS_TO_VOID_TARGET() holds is retyped to
 *     STK_VOID;
 *   - each STK_PREC_READ_END entry raises `level`, and each
 *     STK_PREC_READ_START entry seen at level > 0 lowers it again, so
 *     START/END pairs passed on the way down are stepped over;
 *   - the loop stops at the first STK_PREC_READ_START seen at level 0.
 *
 * STACK_BASE_CHECK() is invoked on every step with this macro's name.
 */
#define STACK_GET_PREC_READ_START(k) do {\
int level = 0;\
k = stk;\
while (1) {\
k--;\
STACK_BASE_CHECK(k, "STACK_GET_PREC_READ_START");\
if (IS_TO_VOID_TARGET(k)) {\
k->type = STK_VOID;\
}\
else if (k->type == STK_PREC_READ_START) {\
if (level == 0) {\
break;\
}\
level--;\
}\
else if (k->type == STK_PREC_READ_END) {\
level++;\
}\
}\
} while(0)

#define STACK_EMPTY_CHECK(isnull,sid,s) do {\
StackType* k = stk;\
while (1) {\
Expand All @@ -1913,7 +1936,7 @@ stack_double(int is_alloca, char** arg_alloc_base,
}\
} while (0)

#ifdef USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT
#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT
#define STACK_EMPTY_CHECK_MEM(isnull,sid,s,reg) do {\
StackType* k = stk;\
while (1) {\
Expand All @@ -1927,9 +1950,10 @@ stack_double(int is_alloca, char** arg_alloc_base,
}\
else {\
UChar* endp;\
int level = 0;\
(isnull) = 1;\
while (k < stk) {\
if (k->type == STK_MEM_START) {\
if (k->type == STK_MEM_START && level == 0) {\
STACK_MEM_START_GET_PREV_END_ADDR(k, reg, endp);\
if (endp == 0) {\
(isnull) = 0; break;\
Expand All @@ -1941,6 +1965,12 @@ stack_double(int is_alloca, char** arg_alloc_base,
(isnull) = -1; /* empty, but position changed */ \
}\
}\
else if (k->type == STK_PREC_READ_START) {\
level++;\
}\
else if (k->type == STK_PREC_READ_END) {\
level--;\
}\
k++;\
}\
break;\
Expand All @@ -1965,10 +1995,11 @@ stack_double(int is_alloca, char** arg_alloc_base,
}\
else {\
UChar* endp;\
int prec_level = 0;\
(isnull) = 1;\
while (k < stk) {\
if (k->type == STK_MEM_START) {\
if (level == 0) {\
if (level == 0 && prec_level == 0) {\
STACK_MEM_START_GET_PREV_END_ADDR(k, reg, endp);\
if (endp == 0) {\
(isnull) = 0; break;\
Expand All @@ -1987,6 +2018,12 @@ stack_double(int is_alloca, char** arg_alloc_base,
else if (k->type == STK_EMPTY_CHECK_END) {\
if (k->zid == (sid)) level--;\
}\
else if (k->type == STK_PREC_READ_START) {\
prec_level++;\
}\
else if (k->type == STK_PREC_READ_END) {\
prec_level--;\
}\
k++;\
}\
break;\
Expand Down Expand Up @@ -2023,7 +2060,7 @@ stack_double(int is_alloca, char** arg_alloc_base,
}\
}\
} while(0)
#endif /* USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT */
#endif /* USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT */

#define STACK_GET_REPEAT(sid, k) do {\
int level = 0;\
Expand Down Expand Up @@ -2968,6 +3005,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
NEXT_OUT;

CASE_OP(CCLASS_MB)
DATA_ENSURE(1);
if (! ONIGENC_IS_MBC_HEAD(encode, s)) goto fail;

cclass_mb:
Expand Down Expand Up @@ -3441,11 +3479,13 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
? STACK_AT(mem_end_stk[mem])->u.mem.pstr
: (UChar* )((void* )mem_end_stk[mem]));
n = (int )(pend - pstart);
DATA_ENSURE(n);
sprev = s;
STRING_CMP(pstart, s, n);
while (sprev + (len = enclen(encode, sprev)) < s)
sprev += len;
if (n != 0) {
DATA_ENSURE(n);
sprev = s;
STRING_CMP(s, pstart, n);
while (sprev + (len = enclen(encode, sprev)) < s)
sprev += len;
}
}
INC_OP;
JUMP_OUT;
Expand All @@ -3468,11 +3508,13 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
? STACK_AT(mem_end_stk[mem])->u.mem.pstr
: (UChar* )((void* )mem_end_stk[mem]));
n = (int )(pend - pstart);
DATA_ENSURE(n);
sprev = s;
STRING_CMP_IC(case_fold_flag, pstart, &s, n);
while (sprev + (len = enclen(encode, sprev)) < s)
sprev += len;
if (n != 0) {
DATA_ENSURE(n);
sprev = s;
STRING_CMP_IC(case_fold_flag, pstart, &s, n);
while (sprev + (len = enclen(encode, sprev)) < s)
sprev += len;
}
}
INC_OP;
JUMP_OUT;
Expand All @@ -3498,15 +3540,16 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
? STACK_AT(mem_end_stk[mem])->u.mem.pstr
: (UChar* )((void* )mem_end_stk[mem]));
n = (int )(pend - pstart);
DATA_ENSURE(n);
sprev = s;
swork = s;
STRING_CMP_VALUE(pstart, swork, n, is_fail);
if (is_fail) continue;
s = swork;
while (sprev + (len = enclen(encode, sprev)) < s)
sprev += len;

if (n != 0) {
DATA_ENSURE(n);
sprev = s;
swork = s;
STRING_CMP_VALUE(swork, pstart, n, is_fail);
if (is_fail) continue;
s = swork;
while (sprev + (len = enclen(encode, sprev)) < s)
sprev += len;
}
break; /* success */
}
if (i == tlen) goto fail;
Expand Down Expand Up @@ -3535,15 +3578,16 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
? STACK_AT(mem_end_stk[mem])->u.mem.pstr
: (UChar* )((void* )mem_end_stk[mem]));
n = (int )(pend - pstart);
DATA_ENSURE(n);
sprev = s;
swork = s;
STRING_CMP_VALUE_IC(case_fold_flag, pstart, &swork, n, is_fail);
if (is_fail) continue;
s = swork;
while (sprev + (len = enclen(encode, sprev)) < s)
sprev += len;

if (n != 0) {
DATA_ENSURE(n);
sprev = s;
swork = s;
STRING_CMP_VALUE_IC(case_fold_flag, pstart, &swork, n, is_fail);
if (is_fail) continue;
s = swork;
while (sprev + (len = enclen(encode, sprev)) < s)
sprev += len;
}
break; /* success */
}
if (i == tlen) goto fail;
Expand All @@ -3560,17 +3604,19 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
int len;
int level;
MemNumType* mems;
UChar* ssave;

n = 0;
backref_with_level:
level = p->backref_general.nest_level;
tlen = p->backref_general.num;
mems = tlen == 1 ? &(p->backref_general.n1) : p->backref_general.ns;

sprev = s;
ssave = s;
if (backref_match_at_nested_level(reg, stk, stk_base, n,
case_fold_flag, level, (int )tlen, mems, &s, end)) {
if (sprev < end) {
if (ssave != s) {
sprev = ssave;
while (sprev + (len = enclen(encode, sprev)) < s)
sprev += len;
}
Expand Down Expand Up @@ -3658,7 +3704,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
}
JUMP_OUT;

#ifdef USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT
#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT
CASE_OP(EMPTY_CHECK_END_MEMST)
{
int is_empty;
Expand All @@ -3683,7 +3729,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
int is_empty;

mem = p->empty_check_end.mem; /* mem: null check id */
#ifdef USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT
#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT
STACK_EMPTY_CHECK_MEM_REC(is_empty, mem, s, reg);
#else
STACK_EMPTY_CHECK_REC(is_empty, mem, s);
Expand Down Expand Up @@ -3851,14 +3897,15 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
goto repeat_inc_ng;

CASE_OP(PREC_READ_START)
STACK_PUSH_POS(s, sprev);
STACK_PUSH_PREC_READ_START(s, sprev);
INC_OP;
JUMP_OUT;

CASE_OP(PREC_READ_END)
STACK_EXEC_TO_VOID(stkp);
STACK_GET_PREC_READ_START(stkp);
s = stkp->u.state.pstr;
sprev = stkp->u.state.pstr_prev;
STACK_PUSH(STK_PREC_READ_END,0,0,0);
INC_OP;
JUMP_OUT;

Expand Down Expand Up @@ -5443,6 +5490,9 @@ onig_builtin_error(OnigCalloutArgs* args, void* user_data ARG_UNUSED)
if (n >= 0) {
n = ONIGERR_INVALID_CALLOUT_BODY;
}
else if (onig_is_error_code_needs_param(n)) {
n = ONIGERR_INVALID_CALLOUT_BODY;
}

return n;
}
Expand Down
6 changes: 3 additions & 3 deletions src/regext.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@

#include "regint.h"

#if 0
static void
conv_ext0be32(const UChar* s, const UChar* end, UChar* conv)
{
Expand Down Expand Up @@ -158,6 +159,7 @@ conv_encoding(OnigEncoding from, OnigEncoding to, const UChar* s, const UChar* e

return ONIGERR_NOT_SUPPORTED_ENCODING_COMBINATION;
}
#endif

extern int
onig_new_deluxe(regex_t** reg, const UChar* pattern, const UChar* pattern_end,
Expand All @@ -169,9 +171,7 @@ onig_new_deluxe(regex_t** reg, const UChar* pattern, const UChar* pattern_end,
if (IS_NOT_NULL(einfo)) einfo->par = (UChar* )NULL;

if (ci->pattern_enc != ci->target_enc) {
r = conv_encoding(ci->pattern_enc, ci->target_enc, pattern, pattern_end,
&cpat, &cpat_end);
if (r != 0) return r;
return ONIGERR_NOT_SUPPORTED_ENCODING_COMBINATION;
}
else {
cpat = (UChar* )pattern;
Expand Down
6 changes: 3 additions & 3 deletions src/regint.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@
#define USE_CALL
#define USE_CALLOUT
#define USE_BACKREF_WITH_LEVEL /* \k<name+n>, \k<name-n> */
#define USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT /* /(?:()|())*\2/ */
#define USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT /* /(?:()|())*\2/ */
#define USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE /* /\n$/ =~ "\n" */
#define USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR
#define USE_RETRY_LIMIT_IN_MATCH
Expand Down Expand Up @@ -348,8 +348,8 @@ typedef unsigned int MemStatusType;
#define DISABLE_CASE_FOLD_MULTI_CHAR(case_fold_flag) \
((case_fold_flag) & ~INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR)

#define REPEAT_INFINITE -1
#define IS_REPEAT_INFINITE(n) ((n) == REPEAT_INFINITE)
#define INFINITE_REPEAT -1
#define IS_INFINITE_REPEAT(n) ((n) == INFINITE_REPEAT)

/* bitset */
#define BITS_PER_BYTE 8
Expand Down
190 changes: 111 additions & 79 deletions src/regparse.c

Large diffs are not rendered by default.

22 changes: 9 additions & 13 deletions src/regparse.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,11 +66,11 @@ enum GimmickType {
#endif
};

enum BodyEmpty {
BODY_IS_NOT_EMPTY = 0,
BODY_IS_EMPTY = 1,
BODY_IS_EMPTY_MEM = 2,
BODY_IS_EMPTY_REC = 3
enum BodyEmptyType {
BODY_IS_NOT_EMPTY = 0,
BODY_IS_EMPTY_POSSIBILITY = 1,
BODY_IS_EMPTY_POSSIBILITY_MEM = 2,
BODY_IS_EMPTY_POSSIBILITY_REC = 3
};

typedef struct {
Expand Down Expand Up @@ -101,7 +101,7 @@ typedef struct {
int lower;
int upper;
int greedy;
enum BodyEmpty empty_info;
enum BodyEmptyType emptiness;
struct _Node* head_exact;
struct _Node* next_head_exact;
int is_refered; /* include called node. don't eliminate even if {0} */
Expand Down Expand Up @@ -252,10 +252,6 @@ typedef struct _Node {
#define NODE_BIT_CALL NODE_TYPE2BIT(NODE_CALL)
#define NODE_BIT_GIMMICK NODE_TYPE2BIT(NODE_GIMMICK)

#define NODE_IS_SIMPLE_TYPE(node) \
((NODE_TYPE2BIT(NODE_TYPE(node)) & \
(NODE_BIT_STRING | NODE_BIT_CCLASS | NODE_BIT_CTYPE | NODE_BIT_BACKREF)) != 0)

#define NODE_TYPE(node) ((node)->u.base.node_type)
#define NODE_SET_TYPE(node, ntype) (node)->u.base.node_type = (ntype)

Expand Down Expand Up @@ -314,7 +310,7 @@ typedef struct _Node {
#define NODE_ST_CLEN_FIXED (1<<2)
#define NODE_ST_MARK1 (1<<3)
#define NODE_ST_MARK2 (1<<4)
#define NODE_ST_STOP_BT_SIMPLE_REPEAT (1<<5)
#define NODE_ST_STRICT_REAL_REPEAT (1<<5)
#define NODE_ST_RECURSION (1<<6)
#define NODE_ST_CALLED (1<<7)
#define NODE_ST_ADDR_FIXED (1<<8)
Expand Down Expand Up @@ -357,8 +353,8 @@ typedef struct _Node {
#define NODE_IS_SUPER(node) ((NODE_STATUS(node) & NODE_ST_SUPER) != 0)
#define NODE_IS_PROHIBIT_RECURSION(node) \
((NODE_STATUS(node) & NODE_ST_PROHIBIT_RECURSION) != 0)
#define NODE_IS_STOP_BT_SIMPLE_REPEAT(node) \
((NODE_STATUS(node) & NODE_ST_STOP_BT_SIMPLE_REPEAT) != 0)
#define NODE_IS_STRICT_REAL_REPEAT(node) \
((NODE_STATUS(node) & NODE_ST_STRICT_REAL_REPEAT) != 0)

#define NODE_BODY(node) ((node)->u.base.body)
#define NODE_QUANT_BODY(node) ((node)->body)
Expand Down
35 changes: 31 additions & 4 deletions src/utf16_be.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
utf16_be.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
* Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
* Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
Expand Down Expand Up @@ -103,7 +103,25 @@ utf16be_mbc_enc_len(const UChar* p)
/*
 * Check that the byte range [s, end) is a well-formed UTF-16BE string.
 *
 * Each character length comes from utf16be_mbc_enc_len().  The string is
 * rejected (FALSE) when:
 *   - a character would extend past `end` (truncated input),
 *   - a 4-byte character is not followed at offset 2 by a byte for which
 *     UTF16_IS_SURROGATE_SECOND() holds,
 *   - a 2-byte character itself starts with such a byte (a second
 *     surrogate with no preceding first surrogate).
 * Returns TRUE when the whole range is consumed without any of the above.
 *
 * The surrogate checks are performed directly here; no early generic
 * length-only check is done first, since that would bypass them.
 */
static int
is_valid_mbc_string(const UChar* s, const UChar* end)
{
  while (s < end) {
    int len = utf16be_mbc_enc_len(s);

    /* Reject truncated characters before touching any trailing byte and
       before advancing, so `s` never moves past `end`. */
    if (end - s < len)
      return FALSE;

    if (len == 4) {
      /* s+2 is inside the buffer thanks to the length check above. */
      if (! UTF16_IS_SURROGATE_SECOND(*(s+2)))
        return FALSE;
    }
    else if (UTF16_IS_SURROGATE_SECOND(*s)) {
      return FALSE;
    }

    s += len;
  }

  /* The loop only advances by fully contained characters, so s == end. */
  return TRUE;
}

static int
Expand Down Expand Up @@ -146,7 +164,15 @@ utf16be_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED)
/*
 * Number of bytes needed to encode `code` in UTF-16BE.
 *
 * Returns ONIGERR_INVALID_CODE_POINT_VALUE for values above 0x10ffff,
 * 4 for values above 0xffff, and 2 otherwise.
 *
 * The range check must come first: returning the 2/4 length up front
 * would leave the out-of-range rejection unreachable.
 */
static int
utf16be_code_to_mbclen(OnigCodePoint code)
{
  if (code > 0x10ffff)
    return ONIGERR_INVALID_CODE_POINT_VALUE;

  return (code > 0xffff) ? 4 : 2;
}

static int
Expand Down Expand Up @@ -243,7 +269,8 @@ utf16be_left_adjust_char_head(const UChar* start, const UChar* s)
s--;
}

if (UTF16_IS_SURROGATE_SECOND(*s) && s > start + 1)
if (UTF16_IS_SURROGATE_SECOND(*s) && s > start + 1 &&
UTF16_IS_SURROGATE_FIRST(*(s-2)))
s -= 2;

return (UChar* )s;
Expand Down
26 changes: 22 additions & 4 deletions src/utf16_le.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
utf16_le.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
* Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
* Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
Expand Down Expand Up @@ -95,7 +95,15 @@ static const int EncLen_UTF16[] = {
/*
 * Number of bytes needed to encode `code` in UTF-16LE.
 *
 * Returns ONIGERR_INVALID_CODE_POINT_VALUE for values above 0x10ffff,
 * 4 for values above 0xffff, and 2 otherwise.
 *
 * The range check must come first: returning the 2/4 length up front
 * would leave the out-of-range rejection unreachable.
 */
static int
utf16le_code_to_mbclen(OnigCodePoint code)
{
  if (code > 0x10ffff)
    return ONIGERR_INVALID_CODE_POINT_VALUE;

  return (code > 0xffff) ? 4 : 2;
}

static int
Expand All @@ -110,7 +118,16 @@ is_valid_mbc_string(const UChar* p, const UChar* end)
const UChar* end1 = end - 1;

while (p < end1) {
p += utf16le_mbc_enc_len(p);
int len = utf16le_mbc_enc_len(p);
if (len == 4) {
if (p + 3 < end && ! UTF16_IS_SURROGATE_SECOND(*(p + 3)))
return FALSE;
}
else
if (UTF16_IS_SURROGATE_SECOND(*(p + 1)))
return FALSE;

p += len;
}

if (p != end)
Expand Down Expand Up @@ -252,7 +269,8 @@ utf16le_left_adjust_char_head(const UChar* start, const UChar* s)
s--;
}

if (UTF16_IS_SURROGATE_SECOND(*(s+1)) && s > start + 1)
if (UTF16_IS_SURROGATE_SECOND(*(s+1)) && s > start + 1 &&
UTF16_IS_SURROGATE_FIRST(*(s-1)))
s -= 2;

return (UChar* )s;
Expand Down
13 changes: 13 additions & 0 deletions test/test_utf8.c
Original file line number Diff line number Diff line change
Expand Up @@ -1202,10 +1202,23 @@ extern int main(int argc, char* argv[])
x2("a{3,2}b", "aab", 0, 3);
x2("a{3,2}?", "", 0, 0); /* == (?:a{3,2})?*/
x2("a{2,3}+a", "aaa", 0, 3); /* == (?:a{2,3})+*/
x2("[\\x{0}-\\x{7fffffff}]", "a", 0, 1);
x2("[\\x{7f}-\\x{7fffffff}]", "\xe5\xae\xb6", 0, 3);

n(" \xfd", ""); /* https://bugs.php.net/bug.php?id=77370 */
/* can't use \xfc00.. because compiler error: hex escape sequence out of range */
n("()0\\xfc00000\\xfc00000\\xfc00000\xfc", ""); /* https://bugs.php.net/bug.php?id=77371 */
x2("000||0\xfa", "0", 0, 0); /* https://bugs.php.net/bug.php?id=77381 */
e("(?i)000000000000000000000\xf0", "", ONIGERR_INVALID_CODE_POINT_VALUE); /* https://bugs.php.net/bug.php?id=77382 */
n("0000\\\xf5", "0"); /* https://bugs.php.net/bug.php?id=77385 */
n("(?i)FFF00000000000000000\xfd", ""); /* https://bugs.php.net/bug.php?id=77394 */


x2("\\p{Common}", "\xe3\x8b\xbf", 0, 3); /* U+32FF */
x2("\\p{In_Enclosed_CJK_Letters_and_Months}", "\xe3\x8b\xbf", 0, 3); /* U+32FF */

e("\\x{7fffffff}", "", ONIGERR_TOO_BIG_WIDE_CHAR_VALUE);
e("[\\x{7fffffff}]", "", ONIGERR_INVALID_CODE_POINT_VALUE);
e("\\u040", "@", ONIGERR_INVALID_CODE_POINT_VALUE);
e("(?<abc>\\g<abc>)", "zzzz", ONIGERR_NEVER_ENDING_RECURSION);
e("(?<=(?>abc))", "abc", ONIGERR_INVALID_LOOK_BEHIND_PATTERN);
Expand Down
15 changes: 0 additions & 15 deletions test/testu.c
Original file line number Diff line number Diff line change
Expand Up @@ -116,28 +116,13 @@ static void xx(char* pattern, char* str, int from, int to, int mem, int not)

#else
regex_t* reg;
OnigCompileInfo ci;
OnigErrorInfo einfo;

uconv(pattern, cpat, ulen(pattern));
uconv(str, cstr, ulen(str));

#if 0
r = onig_new(&reg, (UChar* )pattern, (UChar* )(pattern + ulen(pattern)),
ONIG_OPTION_DEFAULT, ENC, ONIG_SYNTAX_DEFAULT, &einfo);
#else
ci.num_of_elements = 5;
ci.pattern_enc = ENC;
ci.target_enc = ENC;
ci.syntax = ONIG_SYNTAX_DEFAULT;
ci.option = ONIG_OPTION_DEFAULT;
ci.case_fold_flag = ONIGENC_CASE_FOLD_DEFAULT;

r = onig_new_deluxe(&reg, (UChar* )pattern,
(UChar* )(pattern + ulen(pattern)),
&ci, &einfo);
#endif

if (r) {
char s[ONIG_MAX_ERROR_MESSAGE_LEN];
onig_error_code_to_str((UChar* )s, r, &einfo);
Expand Down