Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use Aho-Corasick for alts of literals during regexp AST compilation #221

Open
wants to merge 23 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
510c688
Pass in an existing fsm for `ast_compile()`.
katef Jul 6, 2020
b732ae2
Pass in an existing fsm for `trie_to_fsm()`.
katef Jul 6, 2020
9a58888
Pass in an existing fsm for `re_comp()` and `re_strings()`.
katef Jul 7, 2020
07f8a67
Regenerated for `re_comp()`.
katef Jul 7, 2020
a1924a4
Catch-all for when `-DNDEBUG` is absent.
katef Jul 7, 2020
58a5443
Clarification.
katef Jul 7, 2020
97c9cde
No need for `re_strings_build_new()`; we can construct an fsm inside …
katef Jul 7, 2020
31c7d57
Missing free.
katef Jul 7, 2020
fd67a02
Oops... the start state here should have been exposed when passing in…
katef Jul 7, 2020
469f77d
Whitespace.
katef Jul 7, 2020
4023754
A bugfix; `fsm_unionxy()` may modify the start state due to `fsm_merg…
katef Jul 14, 2020
bb9124c
An interface to add a trie entry by walking an array of AST nodes.
katef Jul 15, 2020
83c3eaa
No need to set an end state when the caller already does this.
katef Jul 16, 2020
5e027ef
Add `re_strings_build_into()`, to pass in a pre-existing end state.
katef Jul 16, 2020
b418b9f
Handling for accepting states which are not leaf nodes in the trie.
katef Jul 18, 2020
96c1055
A first attempt at using Aho Corasick for suitable alt nodes.
katef Jul 18, 2020
b74e7d8
Typo.
katef Jul 19, 2020
0666f88
Perhaps sensible thresholds?
katef Jul 19, 2020
0d380c9
Sprinkle const.
katef Aug 1, 2020
fc8a967
Move out `ast_compile_altlist()`.
katef Aug 1, 2020
076e0ca
Special case for using A-C for an anchored list of alts at the root n…
katef Aug 2, 2020
2d03655
Recurr for special top-level handling of group nodes.
katef Aug 2, 2020
69523b2
Some tests for top-level Aho Corasick behaviour.
katef Aug 4, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ SUBDIR += tests/minimise
SUBDIR += tests/native
SUBDIR += tests/pcre
SUBDIR += tests/pcre-classes
SUBDIR += tests/pcre-ac
SUBDIR += tests/pcre-anchor
SUBDIR += tests/pcre-repeat
SUBDIR += tests/pred
Expand Down
3 changes: 2 additions & 1 deletion examples/bm/libfsm.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include <fsm/fsm.h>
#include <fsm/options.h>
#include <fsm/print.h>

#include <re/re.h>

int
Expand Down Expand Up @@ -61,7 +62,7 @@ main(int argc, char *argv[])
opt.io = FSM_IO_STR;

p = argv[0];
fsm = re_comp(RE_PCRE, fsm_sgetc, &p, &opt, flags, &e);
fsm = re_comp_new(RE_PCRE, fsm_sgetc, &p, &opt, flags, &e);
if (fsm == NULL) {
re_perror(RE_LITERAL, &e, NULL, s);
return 1;
Expand Down
16 changes: 13 additions & 3 deletions examples/words/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ int main(int argc, char *argv[]) {
fsm_state_t rs;
struct fsm *r;

r = re_comp(native ? RE_NATIVE : RE_LITERAL, fsm_sgetc, &p, &opt, 0, &e);
r = re_comp_new(native ? RE_NATIVE : RE_LITERAL, fsm_sgetc, &p, &opt, 0, &e);
if (r == NULL) {
re_perror(native ? RE_NATIVE : RE_LITERAL, &e, NULL, s);
return 1;
Expand Down Expand Up @@ -161,19 +161,29 @@ int main(int argc, char *argv[]) {

if (ahocorasick) {
struct timespec pre, post;
fsm_state_t start;

if (-1 == clock_gettime(CLOCK_MONOTONIC, &pre)) {
perror("clock_gettime");
exit(EXIT_FAILURE);
}

fsm = re_strings_build(g,
&opt, unanchored ? 0 : (RE_STRINGS_ANCHOR_LEFT | RE_STRINGS_ANCHOR_RIGHT));
fsm = fsm_new(&opt);
if (fsm == NULL) {
perror("fsm_new");
exit(EXIT_FAILURE);
}

if (!re_strings_build(fsm, &start, g,
unanchored ? 0 : (RE_STRINGS_ANCHOR_LEFT | RE_STRINGS_ANCHOR_RIGHT)))
{
perror("re_strings_builder_build");
fsm_free(fsm);
exit(EXIT_FAILURE);
}

fsm_setstart(fsm, start);

if (-1 == clock_gettime(CLOCK_MONOTONIC, &post)) {
perror("clock_gettime");
exit(EXIT_FAILURE);
Expand Down
19 changes: 16 additions & 3 deletions include/re/re.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,9 +127,11 @@ re_getchar_fun(void *opaque);

/*
* Compile a regexp of the given dialect.
* States are populated into the given fsm. The start state is written out
* through the *start argument.
*
* Returns NULL on error. If non-NULL, the *err struct is populated with the
* type and 0-indexed byte offset of the error.
* Returns 1 on success, or 0 on error. If non-NULL, the *err struct is
* populated with the type and 0-indexed byte offset of the error.
*
* libfsm provides getc callbacks suitable for use with re_comp; see <fsm/fsm.h>.
* For example:
Expand All @@ -144,8 +146,19 @@ re_getchar_fun(void *opaque);
* There's nothing special about libfsm's implementation of these; they could
* equally well be user defined.
*/
int
re_comp(struct fsm *fsm, fsm_state_t *start, enum re_dialect dialect,
re_getchar_fun *f, void *opaque,
const struct fsm_options *opt,
enum re_flags flags, struct re_err *err);

/*
* A convenience to construct a new fsm.
* Returns NULL on error.
* See re_comp() for details.
*/
struct fsm *
re_comp(enum re_dialect dialect,
re_comp_new(enum re_dialect dialect,
re_getchar_fun *f, void *opaque,
const struct fsm_options *opt,
enum re_flags flags, struct re_err *err);
Expand Down
15 changes: 12 additions & 3 deletions include/re/strings.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@ enum re_strings_flags {
RE_STRINGS_AC_AUTOMATON = 1 << 2
};

/*
* A convenience to iterate over an array of strings, and call
* re_strings_add_str() for each.
*/
struct fsm *
re_strings(const struct fsm_options *opt, const char *a[], size_t n,
enum re_strings_flags flags);
Expand All @@ -47,9 +51,14 @@ re_strings_add_raw(struct re_strings *g, const void *p, size_t n);
int
re_strings_add_str(struct re_strings *g, const char *s);

struct fsm *
re_strings_build(struct re_strings *g,
const struct fsm_options *opt, enum re_strings_flags flags);
/*
 * Build the strings collected in g into the caller-supplied fsm.
 * The start state is written out through *start; it is left to the
 * caller to pass it to fsm_setstart() if so desired.
 *
 * have_end and end allow a pre-existing end state to be passed in.
 * NOTE(review): presumably end is only consulted when have_end is
 * non-zero - confirm against the implementation.
 *
 * Returns 0 on error.
 * NOTE(review): presumably 1 on success, mirroring re_comp() - confirm.
 */
int
re_strings_build_into(struct fsm *fsm, fsm_state_t *start,
int have_end, fsm_state_t end,
struct re_strings *g, enum re_strings_flags flags);

/*
 * As for re_strings_build_into(), but without a pre-existing end state.
 *
 * Returns 0 on error; callers test the result with `!`.
 */
int
re_strings_build(struct fsm *fsm, fsm_state_t *start,
struct re_strings *g, enum re_strings_flags flags);

#endif

2 changes: 1 addition & 1 deletion src/libre/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ CFLAGS.${src} += -I src # XXX: for internal.h
DFLAGS.${src} += -I src # XXX: for internal.h
.endfor

.for src in ${SRC:Msrc/libre/ast.c} ${SRC:Msrc/libre/ast_analysis.c} ${SRC:Msrc/libre/ast_compile.c} ${SRC:Msrc/libre/re.c}
.for src in ${SRC:Msrc/libre/ast.c} ${SRC:Msrc/libre/ast_analysis.c} ${SRC:Msrc/libre/ast_compile.c} ${SRC:Msrc/libre/re.c} ${SRC:Msrc/libre/ac.c}
CFLAGS.${src} += -std=c99 # XXX: for ast.h
DFLAGS.${src} += -std=c99 # XXX: for ast.h
.endfor
Expand Down
91 changes: 83 additions & 8 deletions src/libre/ac.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
Expand All @@ -14,7 +15,10 @@

#include <fsm/fsm.h>

#include <re/re.h>

#include "ac.h"
#include "ast.h"

enum { POOL_BLOCK_SIZE = 256 };

Expand Down Expand Up @@ -162,8 +166,53 @@ trie_add_word(struct trie_graph *g, const char *w, size_t n)
return g;
}

/*
 * Internal convenience: add one word to the trie, taking its bytes
 * straight from an array of n AST nodes rather than from a string.
 *
 * Each a[i] must be an AST_EXPR_LITERAL; its u.literal.c member gives
 * the i'th byte of the word. Trie states missing along the path are
 * allocated by newstate(). The state reached after the last byte is
 * flagged as an output, and g->depth is raised to n when this word is
 * longer than the depth recorded so far.
 *
 * Returns g on success, or NULL if a state could not be allocated.
 */
struct trie_graph *
trie_add_concat(struct trie_graph *g, const struct ast_expr **a, size_t n)
{
	struct trie_state *cur;
	size_t pos;

	assert(g != NULL);
	assert(a != NULL);
	assert(n > 0);

	cur = g->root;

	assert(cur != NULL);

	pos = 0;
	while (pos < n) {
		const struct ast_expr *node = a[pos];
		int sym;

		assert(node->type == AST_EXPR_LITERAL);

		/* index by unsigned value so high bytes don't go negative */
		sym = (unsigned char) node->u.literal.c;

		if (cur->children[sym] == NULL) {
			struct trie_state *fresh;

			fresh = newstate(g);
			if (fresh == NULL) {
				return NULL;
			}

			cur->children[sym] = fresh;
		}

		cur = cur->children[sym];
		pos++;
	}

	/* the end of the word accepts, even if it is not a leaf */
	cur->output = 1;

	if (n > g->depth) {
		g->depth = n;
	}

	return g;
}

int
trie_add_failure_edges(struct trie_graph *g) {
trie_add_failure_edges(struct trie_graph *g)
{
struct trie_state **q;
size_t top,bot;
int sym;
Expand Down Expand Up @@ -267,6 +316,22 @@ find_next_state(struct trie_state *s, int sym)
return nx;
}

/*
 * Report whether a trie state has any outgoing child edge.
 *
 * Scans the 256 child slots in order and stops at the first one which
 * is populated. Returns 1 if such a slot exists, 0 otherwise.
 */
static int
has_child(const struct trie_state *ts)
{
	int sym;

	assert(ts != NULL);

	sym = 0;
	while (sym < 256 && ts->children[sym] == NULL) {
		sym++;
	}

	/* stopping short of 256 means we found a non-NULL child */
	return sym < 256;
}

static int
trie_to_fsm_state(struct trie_state *ts, struct fsm *fsm,
int have_end, fsm_state_t single_end,
Expand All @@ -278,7 +343,7 @@ trie_to_fsm_state(struct trie_state *ts, struct fsm *fsm,
assert(fsm != NULL);
assert(q != NULL);

if (ts->output && have_end) {
if (ts->output && have_end && !has_child(ts)) {
*q = single_end;
return 1;
}
Expand Down Expand Up @@ -314,24 +379,34 @@ trie_to_fsm_state(struct trie_state *ts, struct fsm *fsm,
}

if (ts->output) {
fsm_setend(fsm, st, 1);
if (!have_end) {
fsm_setend(fsm, st, 1);
} else {
/*
* What would usually be an end state in the middle of the trie
* needs an epsilon to hook it up to the single exit state.
*
* We can't set this as an end state, because the single_end
* itself might not actually accept - for example in the middle of
* recursive NFA construction when walking the regexp AST.
*/
fsm_addedge_epsilon(fsm, st, single_end);
}
}

*q = st;
return 1;
}

struct fsm *
trie_to_fsm(struct fsm *fsm, struct trie_graph *g, int have_end, fsm_state_t end)
trie_to_fsm(struct fsm *fsm, fsm_state_t *start, struct trie_graph *g, int have_end, fsm_state_t end)
{
fsm_state_t start;
assert(start != NULL);

if (!trie_to_fsm_state(g->root, fsm, have_end, end, &start)) {
if (!trie_to_fsm_state(g->root, fsm, have_end, end, start)) {
return NULL;
}

fsm_setstart(fsm, start);

return fsm;
}

Expand Down
6 changes: 5 additions & 1 deletion src/libre/ac.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ struct fsm;
struct fsm_state;
struct fsm_options;

struct ast_expr;
struct trie_graph;

struct trie_graph *
Expand All @@ -22,11 +23,14 @@ trie_free(struct trie_graph *g);
struct trie_graph *
trie_add_word(struct trie_graph *g, const char *w, size_t n);

struct trie_graph *
trie_add_concat(struct trie_graph *g, const struct ast_expr **a, size_t n);

int
trie_add_failure_edges(struct trie_graph *g);

struct fsm *
trie_to_fsm(struct fsm *fsm, struct trie_graph *g,
trie_to_fsm(struct fsm *fsm, fsm_state_t *start, struct trie_graph *g,
int have_end, fsm_state_t end);

void
Expand Down
3 changes: 3 additions & 0 deletions src/libre/ast.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
#include <stdlib.h>
#include <limits.h>

#include <fsm/fsm.h>

#include <re/re.h>

#include "class.h"
Expand Down Expand Up @@ -190,6 +192,7 @@ ast_endpoint_cmp(const struct ast_endpoint *a, const struct ast_endpoint *b)

default:
assert(!"unreached");
abort();
}
}

Expand Down
4 changes: 4 additions & 0 deletions src/libre/ast.h
Original file line number Diff line number Diff line change
Expand Up @@ -266,4 +266,8 @@ re_parse(enum re_dialect dialect, int (*getc)(void *opaque), void *opaque,
const struct fsm_options *opt,
enum re_flags flags, struct re_err *err, int *unsatisfiable);

struct re_strings;
int
re_strings_add_concat(struct re_strings *g, const struct ast_expr **a, size_t n);

#endif
2 changes: 2 additions & 0 deletions src/libre/ast_analysis.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
#include <stdlib.h>
#include <stdio.h>

#include <fsm/fsm.h>

#include <re/re.h>

#include "class.h"
Expand Down