Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add new utils::strfind() function and update mini-regex code #2632

Merged
merged 5 commits into from Feb 26, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/src/Developer_utils.rst
Expand Up @@ -104,6 +104,9 @@ and parsing files or arguments.
.. doxygenfunction:: strmatch
:project: progguide

.. doxygenfunction:: strfind
:project: progguide

.. doxygenfunction:: is_integer
:project: progguide

Expand Down
2 changes: 1 addition & 1 deletion doc/src/Packages_details.rst
Expand Up @@ -367,7 +367,7 @@ KIM package

**Contents:**

This package contains a command with a set of subcommands that serve as a
This package contains a command with a set of sub-commands that serve as a
wrapper on the
`Open Knowledgebase of Interatomic Models (OpenKIM) <https://openkim.org>`_
repository of interatomic models (IMs) enabling compatible ones to be used in
Expand Down
1 change: 1 addition & 0 deletions doc/utils/sphinx-config/false_positives.txt
Expand Up @@ -2367,6 +2367,7 @@ parmin
Parrinello
Partay
Particuology
Pascuet
pastewka
Pastewka
pathangle
Expand Down
149 changes: 113 additions & 36 deletions src/utils.cpp
Expand Up @@ -69,6 +69,10 @@ extern "C"
/** Match text against a (simplified) regular expression
* (regexp will be compiled automatically). */
static int re_match(const char *text, const char *pattern);

/** Find the substring that matches a (simplified) regular expression
* (regexp will be compiled automatically). */
static int re_find(const char *text, const char *pattern, int *matchlen);
}

////////////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -104,6 +108,21 @@ bool utils::strmatch(const std::string &text, const std::string &pattern)
return (pos >= 0);
}

/** Companion to utils::strmatch(): the arguments and the matching logic
 * are the same, but instead of a boolean the sub-string of \c text that
 * matches the regex \c pattern is returned. Only a single match is
 * reported. An empty string is returned when nothing matches or when
 * the match has zero length.
 * This can be used as a more flexible alternative to strstr().
 */
std::string utils::strfind(const std::string &text, const std::string &pattern)
{
  int length = 0;
  const int start = re_find(text.c_str(), pattern.c_str(), &length);

  // no match at all, or a match without any characters in it
  if ((start < 0) || (length <= 0)) return std::string();

  return text.substr(start, length);
}

/** This function simplifies the repetitive task of outputting some
* message to both the screen and/or the log file. In combination
* with using fmt::format(), which returns the formatted text
Expand Down Expand Up @@ -1258,16 +1277,26 @@ static void do_merge(int *idx, int *buf, int llo, int lhi, int rlo, int rhi,
/* ------------------------------------------------------------------ */

extern "C" {

/* Typedef'd pointer to get abstract datatype. */
typedef struct regex_t *re_t;

/* Compile regex string pattern to a regex_t-array. */
static re_t re_compile(const char *pattern);


/* Find matches of the compiled pattern inside text. */
static int re_matchp(const char *text, re_t pattern);
static int re_matchp(const char *text, re_t pattern, int *matchlen);

int re_match(const char *text, const char *pattern)
{
int dummy;
return re_matchp(text, re_compile(pattern), &dummy);
}

int re_find(const char *text, const char *pattern, int *matchlen)
{
return re_matchp(text, re_compile(pattern), matchlen);
}

/* Definitions: */

Expand All @@ -1285,14 +1314,14 @@ extern "C" {
union {
unsigned char ch; /* the character itself */
unsigned char *ccl; /* OR a pointer to characters in class */
};
} u;
} regex_t;

/* Private function declarations: */
static int matchpattern(regex_t *pattern, const char *text);
static int matchpattern(regex_t *pattern, const char *text, int *matchlen);
static int matchcharclass(char c, const char *str);
static int matchstar(regex_t p, regex_t *pattern, const char *text);
static int matchplus(regex_t p, regex_t *pattern, const char *text);
static int matchstar(regex_t p, regex_t *pattern, const char *text, int *matchlen);
static int matchplus(regex_t p, regex_t *pattern, const char *text, int *matchlen);
static int matchone(regex_t p, char c);
static int matchdigit(char c);
static int matchint(char c);
Expand All @@ -1301,26 +1330,23 @@ extern "C" {
static int matchwhitespace(char c);
static int matchmetachar(char c, const char *str);
static int matchrange(char c, const char *str);
static int matchdot(char c);
static int ismetachar(char c);

/* Semi-public functions: */
int re_match(const char *text, const char *pattern)
{
return re_matchp(text, re_compile(pattern));
}

int re_matchp(const char *text, re_t pattern)
int re_matchp(const char *text, re_t pattern, int *matchlen)
{
*matchlen = 0;
if (pattern != 0) {
if (pattern[0].type == BEGIN) {
return ((matchpattern(&pattern[1], text)) ? 0 : -1);
return ((matchpattern(&pattern[1], text, matchlen)) ? 0 : -1);
} else {
int idx = -1;

do {
idx += 1;

if (matchpattern(pattern, text)) {
if (matchpattern(pattern, text, matchlen)) {
if (text[0] == '\0')
return -1;

Expand Down Expand Up @@ -1380,7 +1406,7 @@ extern "C" {
/* Escaped character, e.g. '.' or '$' */
default: {
re_compiled[j].type = CHAR;
re_compiled[j].ch = pattern[i];
re_compiled[j].u.ch = pattern[i];
} break;
}
}
Expand All @@ -1396,6 +1422,10 @@ extern "C" {
if (pattern[i+1] == '^') {
re_compiled[j].type = INV_CHAR_CLASS;
i += 1; /* Increment i to avoid including '^' in the char-buffer */
if (pattern[i+1] == 0) /* incomplete pattern, missing non-zero char after '^' */
{
return 0;
}
} else {
re_compiled[j].type = CHAR_CLASS;
}
Expand All @@ -1407,6 +1437,10 @@ extern "C" {
if (ccl_bufidx >= MAX_CHAR_CLASS_LEN - 1) {
return 0;
}
if (pattern[i+1] == 0) /* incomplete pattern, missing non-zero char after '\\' */
{
return 0;
}
ccl_buf[ccl_bufidx++] = pattern[i++];
} else if (ccl_bufidx >= MAX_CHAR_CLASS_LEN) {
return 0;
Expand All @@ -1419,15 +1453,22 @@ extern "C" {
}
/* Null-terminate string end */
ccl_buf[ccl_bufidx++] = 0;
re_compiled[j].ccl = &ccl_buf[buf_begin];
re_compiled[j].u.ccl = &ccl_buf[buf_begin];
} break;

/* Other characters: */
default: {
default:
{
re_compiled[j].type = CHAR;
re_compiled[j].ch = c;
re_compiled[j].u.ch = c;
} break;
}
/* no buffer-out-of-bounds access on invalid patterns - see https://github.com/kokke/tiny-regex-c/commit/1a279e04014b70b0695fba559a7c05d55e6ee90b */
if (pattern[i] == 0)
{
return 0;
}

i += 1;
j += 1;
}
Expand Down Expand Up @@ -1477,6 +1518,16 @@ extern "C" {
&& ((c >= str[0]) && (c <= str[2])));
}

/* Decide whether the '.' wildcard accepts character c.
 * When RE_DOT_MATCHES_NEWLINE is defined as 1 every character is
 * accepted; otherwise line feed and carriage return are rejected.
 * Returns 1 on a match and 0 otherwise. */
static int matchdot(char c)
{
#if defined(RE_DOT_MATCHES_NEWLINE) && (RE_DOT_MATCHES_NEWLINE == 1)
  (void)c;
  return 1;
#else
  switch (c) {
    case '\n':
    case '\r':
      return 0;
    default:
      return 1;
  }
#endif
}

static int ismetachar(char c)
{
return ((c == 's') || (c == 'S')
Expand Down Expand Up @@ -1530,9 +1581,9 @@ extern "C" {
static int matchone(regex_t p, char c)
{
switch (p.type) {
case DOT: return 1;
case CHAR_CLASS: return matchcharclass(c, (const char *)p.ccl);
case INV_CHAR_CLASS: return !matchcharclass(c, (const char *)p.ccl);
case DOT: return matchdot(c);
case CHAR_CLASS: return matchcharclass(c, (const char *)p.u.ccl);
case INV_CHAR_CLASS: return !matchcharclass(c, (const char *)p.u.ccl);
case DIGIT: return matchdigit(c);
case NOT_DIGIT: return !matchdigit(c);
case INTEGER: return matchint(c);
Expand All @@ -1543,57 +1594,83 @@ extern "C" {
case NOT_ALPHA: return !matchalphanum(c);
case WHITESPACE: return matchwhitespace(c);
case NOT_WHITESPACE: return !matchwhitespace(c);
default: return (p.ch == c);
default: return (p.u.ch == c);
}
}

static int matchstar(regex_t p, regex_t *pattern, const char *text)
static int matchstar(regex_t p, regex_t *pattern, const char *text, int *matchlen)
{
do {
if (matchpattern(pattern, text))
int prelen = *matchlen;
const char *prepos = text;
while ((text[0] != '\0') && matchone(p, *text))
{
text++;
(*matchlen)++;
}
while (text >= prepos)
{
if (matchpattern(pattern, text--, matchlen))
return 1;
(*matchlen)--;
}
while ((text[0] != '\0') && matchone(p, *text++));

*matchlen = prelen;
return 0;
}

static int matchplus(regex_t p, regex_t *pattern, const char *text)
static int matchplus(regex_t p, regex_t *pattern, const char *text, int *matchlen)
{
while ((text[0] != '\0') && matchone(p, *text++)) {
if (matchpattern(pattern, text))
const char *prepos = text;
while ((text[0] != '\0') && matchone(p, *text))
{
text++;
(*matchlen)++;
}
while (text > prepos)
{
if (matchpattern(pattern, text--, matchlen))
return 1;
(*matchlen)--;
}
return 0;
}

static int matchquestion(regex_t p, regex_t *pattern, const char *text)
static int matchquestion(regex_t p, regex_t *pattern, const char *text, int *matchlen)
{
if (p.type == UNUSED)
return 1;
if (matchpattern(pattern, text))
if (matchpattern(pattern, text, matchlen))
return 1;
if (*text && matchone(p, *text++))
return matchpattern(pattern, text);
{
if (matchpattern(pattern, text, matchlen))
{
(*matchlen)++;
return 1;
}
}
return 0;
}

/* Iterative matching */
static int matchpattern(regex_t *pattern, const char *text)
static int matchpattern(regex_t *pattern, const char *text, int *matchlen)
{
int pre = *matchlen;
do {
if ((pattern[0].type == UNUSED) || (pattern[1].type == QUESTIONMARK)) {
return matchquestion(pattern[0], &pattern[2], text);
return matchquestion(pattern[0], &pattern[2], text, matchlen);
} else if (pattern[1].type == STAR) {
return matchstar(pattern[0], &pattern[2], text);
return matchstar(pattern[0], &pattern[2], text, matchlen);
} else if (pattern[1].type == PLUS) {
return matchplus(pattern[0], &pattern[2], text);
return matchplus(pattern[0], &pattern[2], text, matchlen);
} else if ((pattern[0].type == END) && pattern[1].type == UNUSED) {
return (text[0] == '\0');
}
(*matchlen)++;
}
while ((text[0] != '\0') && matchone(*pattern++, *text++));

*matchlen = pre;
return 0;
}

Expand Down
8 changes: 8 additions & 0 deletions src/utils.h
Expand Up @@ -37,6 +37,14 @@ namespace LAMMPS_NS {

bool strmatch(const std::string &text, const std::string &pattern);

/** Find sub-string that matches a simplified regex pattern
*
* \param text the text to be matched against the pattern
* \param pattern the search pattern, which may contain regexp markers
* \return the sub-string that matches the pattern or an empty string */

std::string strfind(const std::string &text, const std::string &pattern);

/** Send message to screen and logfile, if available
*
* \param lmp pointer to LAMMPS class instance
Expand Down