Skip to content

Commit

Permalink
re_strings: add support for endids.
Browse files Browse the repository at this point in the history
This adds an extra parameter to `re_strings_add_str` and
`re_strings_add_raw` that (if non-NULL) will associate a single endid
with the string being added. When `re_strings_build` constructs the
DFA, it will produce a separate end state for each string that has an endid.

This needs further testing with multiple overlapping patterns. When
multiple literal strings appear in the input, only the latest match
will be reported.
  • Loading branch information
silentbicycle committed Apr 29, 2024
1 parent fdd18f4 commit 3d4beb1
Show file tree
Hide file tree
Showing 7 changed files with 39 additions and 11 deletions.
6 changes: 6 additions & 0 deletions include/fsm/fsm.h
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,12 @@ fsm_setend(struct fsm *fsm, fsm_state_t state, int end);
int
fsm_setendid(struct fsm *fsm, fsm_end_id_t id);

/* Associate a numeric ID with a specific end state in an fsm.
 * Unlike fsm_setendid(), which takes no state argument, the ID is
 * attached to end_state only.
 * Returns 1 on success, 0 on error (allocation failure). */
int
fsm_setendidstate(struct fsm *fsm, fsm_state_t end_state, fsm_end_id_t id);

/* Get the end IDs associated with an end state, if any.
* If id_buf has enough cells to store all the end IDs (according
* to id_buf_count) then they are written into id_buf[] and
Expand Down
6 changes: 3 additions & 3 deletions include/re/strings.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#ifndef RE_STRINGS_H
#define RE_STRINGS_H

struct fsm;
#include <fsm/fsm.h>
struct fsm_options;

struct re_strings;
Expand Down Expand Up @@ -42,10 +42,10 @@ void
re_strings_free(struct re_strings *g);

int
re_strings_add_raw(struct re_strings *g, const void *p, size_t n);
re_strings_add_raw(struct re_strings *g, const void *p, size_t n, const fsm_end_id_t *endid);

int
re_strings_add_str(struct re_strings *g, const char *s);
re_strings_add_str(struct re_strings *g, const char *s, const fsm_end_id_t *endid);

struct fsm *
re_strings_build(struct re_strings *g,
Expand Down
10 changes: 10 additions & 0 deletions src/libfsm/endids.c
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,16 @@ fsm_setendid(struct fsm *fsm, fsm_end_id_t id)
return 1;
}

/* Attach the end ID `id` to the single state `end_state` of `fsm`.
 *
 * This is a thin wrapper around fsm_endid_set(). Returns 0 only when
 * that call reports FSM_ENDID_SET_ERROR_ALLOC_FAIL; every other
 * result is reported to the caller as success (1). */
int
fsm_setendidstate(struct fsm *fsm, fsm_state_t end_state, fsm_end_id_t id)
{
	const enum fsm_endid_set_res set_result =
	    fsm_endid_set(fsm, end_state, id);

	/* Allocation failure is the only outcome treated as an error. */
	return set_result != FSM_ENDID_SET_ERROR_ALLOC_FAIL;
}

enum fsm_getendids_res
fsm_getendids(const struct fsm *fsm, fsm_state_t end_state,
size_t id_buf_count, fsm_end_id_t *id_buf,
Expand Down
1 change: 1 addition & 0 deletions src/libfsm/libfsm.syms
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ fsm_getendids
fsm_setendid
fsm_mapendids
fsm_increndids
fsm_setendidstate

fsm_countedges
fsm_countstates
Expand Down
12 changes: 10 additions & 2 deletions src/libre/ac.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

#include "ac.h"

/* Sentinel stored in trie_state.endid when a word was added without an
 * end ID (NULL endid pointer). A caller-supplied ID equal to this value
 * is indistinguishable from "no ID". */
#define ENDID_NONE ((fsm_end_id_t)-1)
enum { POOL_BLOCK_SIZE = 256 };

struct trie_state {
Expand All @@ -25,6 +26,7 @@ struct trie_state {
unsigned int index;
unsigned int output:1;
unsigned int have_st:1;
fsm_end_id_t endid; /* or ENDID_NONE */
};

struct trie_pool {
Expand Down Expand Up @@ -126,7 +128,7 @@ trie_create(void)
}

struct trie_graph *
trie_add_word(struct trie_graph *g, const char *w, size_t n)
trie_add_word(struct trie_graph *g, const char *w, size_t n, const fsm_end_id_t *endid)
{
struct trie_state *st;
size_t i;
Expand Down Expand Up @@ -159,6 +161,7 @@ trie_add_word(struct trie_graph *g, const char *w, size_t n)
g->depth = n;
}

st->endid = (endid == NULL ? ENDID_NONE : *endid);
return g;
}

Expand Down Expand Up @@ -278,7 +281,7 @@ trie_to_fsm_state(struct trie_state *ts, struct fsm *fsm,
assert(fsm != NULL);
assert(q != NULL);

if (ts->output && have_end) {
if (ts->output && have_end && ts->endid == ENDID_NONE) {
*q = single_end;
return 1;
}
Expand Down Expand Up @@ -315,6 +318,11 @@ trie_to_fsm_state(struct trie_state *ts, struct fsm *fsm,

if (ts->output) {
fsm_setend(fsm, st, 1);
if (ts->endid != ENDID_NONE) {
if (!fsm_setendidstate(fsm, st, ts->endid)) {
return 0;
}
}
}

*q = st;
Expand Down
5 changes: 4 additions & 1 deletion src/libre/ac.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
#ifndef AC_H
#define AC_H

#include "fsm/fsm.h"

struct fsm;
struct fsm_state;
struct fsm_options;
Expand All @@ -20,7 +22,8 @@ void
trie_free(struct trie_graph *g);

struct trie_graph *
trie_add_word(struct trie_graph *g, const char *w, size_t n);
trie_add_word(struct trie_graph *g, const char *w, size_t n,
const fsm_end_id_t *endid);

int
trie_add_failure_edges(struct trie_graph *g);
Expand Down
10 changes: 5 additions & 5 deletions src/libre/re_strings.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ re_strings(const struct fsm_options *opt, const char *a[], size_t n,
}

for (i = 0; i < n; i++) {
if (!re_strings_add_str(g, a[i])) {
if (!re_strings_add_str(g, a[i], NULL)) {
goto error;
}
}
Expand Down Expand Up @@ -64,20 +64,20 @@ re_strings_free(struct re_strings *g)
}

int
re_strings_add_raw(struct re_strings *g, const void *p, size_t n)
re_strings_add_raw(struct re_strings *g, const void *p, size_t n, const fsm_end_id_t *endid)
{
assert(p != NULL);
assert(n > 0);

return trie_add_word((struct trie_graph *) g, p, n) != NULL;
return trie_add_word((struct trie_graph *) g, p, n, endid) != NULL;
}

int
re_strings_add_str(struct re_strings *g, const char *s)
re_strings_add_str(struct re_strings *g, const char *s, const fsm_end_id_t *endid)
{
assert(s != NULL);

return re_strings_add_raw(g, s, strlen(s));
return re_strings_add_raw(g, s, strlen(s), endid);
}

struct fsm *
Expand Down

0 comments on commit 3d4beb1

Please sign in to comment.