Skip to content

Commit

Permalink
Edit Book: Fix replacement of hyphenated words in the spell checker not working
Browse files Browse the repository at this point in the history
  • Loading branch information
kovidgoyal committed Jul 12, 2014
1 parent f01c2e9 commit 446e7a9
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 17 deletions.
25 changes: 15 additions & 10 deletions src/calibre/utils/icu.c
Original file line number Diff line number Diff line change
Expand Up @@ -603,7 +603,7 @@ icu_BreakIterator_index(icu_BreakIterator *self, PyObject *args, PyObject *kwarg
#endif

UChar *buf = NULL;
int32_t prev = 0, p = 0, sz = 0, tsz = 0, ans = -1;
int32_t prev = 0, p = 0, sz = 0, ans = -1;
PyObject *token = NULL;

if (!PyArg_ParseTuple(args, "O", &token)) return NULL;
Expand All @@ -617,21 +617,26 @@ icu_BreakIterator_index(icu_BreakIterator *self, PyObject *args, PyObject *kwarg
prev = p; p = ubrk_next(self->break_iterator);
if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE)
continue; // We are not at the start of a word
tsz = (p == UBRK_DONE) ? self->text_len - prev : p - prev;
if (sz == tsz && memcmp(self->text + prev, buf, sz * sizeof(UChar)) == 0) {
#ifdef PY_UNICODE_WIDE
ans = u_countChar32(self->text, prev);
#else
ans = prev;
#endif
break;
if (self->text_len >= prev + sz && memcmp(self->text + prev, buf, sz * sizeof(UChar)) == 0) {
// Needle is present at text[prev:] we have to check if it is followed by a non-hyphen boundary
if(
ubrk_isBoundary(self->break_iterator, prev + sz) &&
(self->text_len == prev + sz || (self->text[prev + sz] != 0x2d && self->text[prev + sz] != 0x2010))
) {
ans = prev; break; // Found word surrounded by non-hyphen boundaries
}
if (p != UBRK_DONE) ubrk_isBoundary(self->break_iterator, p); // Reset the iterator to its position before the call to ubrk_isBoundary
}
}
#ifdef Py_UNICODE_WIDE
if (ans > 0) ans = u_countChar32(self->text, ans);
#endif
Py_END_ALLOW_THREADS;


end:
free(buf);
return Py_BuildValue("i", ans);
return Py_BuildValue("l", (long int)ans);

} // }}}

Expand Down
25 changes: 18 additions & 7 deletions src/calibre/utils/icu_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,13 +156,24 @@ def test_break_iterator(self):
self.ae(split(u'I I\'m'), ['I', "I'm"])
self.ae(split(u'out-of-the-box'), ['out-of-the-box'])
self.ae(split(u'-one two-'), ['one', 'two'])
self.ae(split_into_words_and_positions('one \U0001f431 three'), [(0, 3), (6 if sys.maxunicode >= 0x10ffff else 7, 5)])
self.ae(0, index_of('i', 'i'))
self.ae(4, index_of('i', 'six i'))
self.ae(-1, index_of('i', ''))
self.ae(-1, index_of('', ''))
self.ae(-1, index_of('', 'i'))
self.ae(-1, index_of('i', 'six clicks'))
self.ae(split_into_words_and_positions('one \U0001f431 three'), [(0, 3), (7 if icu.is_narrow_build else 6, 5)])
for needle, haystack, pos in (
('word', 'a word b', 2),
('word', 'a word', 2),
('one-two', 'a one-two punch', 2),
('one-two', 'one-two punch', 0),
('one-two', 'one-two', 0),
('one', 'one-two one', 8),
('one-two', 'one-two-three one-two', 14),
('one', 'onet one', 5),
('i', 'i', 0),
('i', 'six i', 4),
('i', '', -1), ('', '', -1), ('', 'i', -1),
('i', 'six clicks', -1),
('i', '\U0001f431 i', (3 if icu.is_narrow_build else 2)),
):
fpos = index_of(needle, haystack)
self.ae(pos, fpos, 'Failed to find index of %r in %r (%d != %d)' % (needle, haystack, pos, fpos))

class TestRunner(unittest.main):

Expand Down

0 comments on commit 446e7a9

Please sign in to comment.