Skip to content

Commit

Permalink
ure: Convert byte offsets to Unicode indices when necessary.
Browse files (browse the repository at this point in the history)
... and add a test. Closes: adafruit#9202.
  • Loading branch information
jepler committed Sep 5, 2022
1 parent 0b26efe commit 0605004
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 0 deletions.
16 changes: 16 additions & 0 deletions extmod/modure.c
Expand Up @@ -33,6 +33,10 @@
#include "py/objstr.h"
#include "py/stackctrl.h"

#if MICROPY_PY_BUILTINS_STR_UNICODE
#include "py/unicode.h"
#endif

#if MICROPY_PY_URE

#define re1_5_stack_chk() MP_STACK_CHECK()
Expand Down Expand Up @@ -121,6 +125,18 @@ STATIC void match_span_helper(size_t n_args, const mp_obj_t *args, mp_obj_t span
e = self->caps[no * 2 + 1] - begin;
}

#if MICROPY_PY_BUILTINS_STR_UNICODE
if(mp_obj_get_type(self->str) == &mp_type_str) {
const byte *begin = (const byte *)mp_obj_str_get_str(self->str);
if (s != -1) {
s = utf8_ptr_to_index(begin, begin+s);
}
if (e != -1) {
e = utf8_ptr_to_index(begin, begin+e);
}
}
#endif

span[0] = mp_obj_new_int(s);
span[1] = mp_obj_new_int(e);
}
Expand Down
3 changes: 3 additions & 0 deletions tests/extmod/ure_span.py
Expand Up @@ -34,6 +34,9 @@ def print_spans(match):
m = re.match(r"([0-9]*)(([a-z]*)([0-9]*))", "1234hello567")
print_spans(m)

m = re.match(r"([0-9]*)(([a-z]*)([0-9]*))", "1234\u2764567")
print_spans(m)

# optional span that matches
print_spans(re.match(r"(a)?b(c)", "abc"))

Expand Down

0 comments on commit 0605004

Please sign in to comment.