Skip to content

Commit

Permalink
Switch internal representation to UTF-8
Browse files Browse the repository at this point in the history
  • Loading branch information
Bodigrim committed Aug 21, 2021
1 parent d615c99 commit 1369cd3
Show file tree
Hide file tree
Showing 25 changed files with 387 additions and 574 deletions.
228 changes: 34 additions & 194 deletions cbits/cbits.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
int _hs_text_memcmp(const void *a, size_t aoff, const void *b, size_t boff,
size_t n)
{
return memcmp(a + (aoff<<1), b + (boff<<1), n<<1);
return memcmp(a + aoff, b + boff, n);
}

#define UTF8_ACCEPT 0
Expand Down Expand Up @@ -61,60 +61,24 @@ decode(uint32_t *state, uint32_t* codep, uint32_t byte) {
return *state = utf8d[256 + *state + type];
}

/*
 * The ISO 8859-1 (aka Latin-1) code points correspond exactly to the first 256 Unicode
 * code points, so a Latin-1 encoded bytestring can be converted trivially to
 * a UTF-16 array.
 */
void
_hs_text_decode_latin1(uint16_t *dest, const uint8_t *src,
size_t
_hs_text_decode_latin1(uint8_t *dest, const uint8_t *src,
const uint8_t *srcend)
{
const uint8_t *dest0 = dest;
const uint8_t *p = src;

#if defined(__i386__) || defined(__x86_64__)
/* This optimization works on little-endian systems by using
(aligned) 32-bit loads instead of 8-bit loads.
*/

/* consume unaligned prefix */
while (p != srcend && (uintptr_t)p & 0x3)
*dest++ = *p++;

#if defined(__x86_64__)
/* All the intrinsics used here are from SSE2,
* so every x86_64 CPU supports them.
*/
const __m128i zeros = _mm_set1_epi32(0);
while (p < srcend - 7) {
/* Load 8 bytes of ASCII data */
const __m128i ascii = _mm_cvtsi64_si128(*((const uint64_t *)p));
/* Interleave with zeros */
const __m128i utf16 = _mm_unpacklo_epi8(ascii, zeros);
/* Store the resulting 16 bytes into destination */
_mm_storeu_si128((__m128i *)dest, utf16);

dest += 8;
p += 8;
}
#else
/* iterate over 32-bit aligned loads */
while (p < srcend - 3) {
const uint32_t w = *((const uint32_t *)p);

*dest++ = w & 0xff;
*dest++ = (w >> 8) & 0xff;
*dest++ = (w >> 16) & 0xff;
*dest++ = (w >> 24) & 0xff;

p += 4;
while (p != srcend){
uint8_t codepoint = *p++;
if(codepoint < 0x80){
*dest++ = (uint8_t)codepoint;
} else {
*dest++ = (uint8_t) (0xC0 + (codepoint >> 6));
*dest++ = (uint8_t) (0x80 + (codepoint & 0x3F));
}
}
#endif
#endif

/* handle unaligned suffix */
while (p != srcend)
*dest++ = *p++;
return (dest - dest0);
}

/*
Expand Down Expand Up @@ -146,82 +110,45 @@ _hs_text_decode_latin1(uint16_t *dest, const uint8_t *src,
*/
#if defined(__GNUC__) || defined(__clang__)
static inline uint8_t const *
_hs_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
_hs_text_decode_utf8_int(uint8_t *const dest, size_t *destoff,
const uint8_t **src, const uint8_t *srcend,
uint32_t *codepoint0, uint32_t *state0)
__attribute((always_inline));
#endif

static inline uint8_t const *
_hs_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
_hs_text_decode_utf8_int(uint8_t *const dest, size_t *destoff,
const uint8_t **src, const uint8_t *srcend,
uint32_t *codepoint0, uint32_t *state0)
{
uint16_t *d = dest + *destoff;
uint8_t *d = dest + *destoff;
const uint8_t *s = *src, *last = *src;
uint32_t state = *state0;
uint32_t codepoint = *codepoint0;

while (s < srcend) {
#if defined(__i386__) || defined(__x86_64__)
/*
* This code will only work on a little-endian system that
* supports unaligned loads.
*
* It gives a substantial speed win on data that is purely or
* partly ASCII (e.g. HTML), at only a slight cost on purely
* non-ASCII text.
*/

if (state == UTF8_ACCEPT) {
#if defined(__x86_64__)
const __m128i zeros = _mm_set1_epi32(0);
while (s < srcend - 8) {
const uint64_t hopefully_eight_ascii_chars = *((uint64_t *) s);
if ((hopefully_eight_ascii_chars & 0x8080808080808080LL) != 0LL)
break;
s += 8;

/* Load 8 bytes of ASCII data */
const __m128i eight_ascii_chars = _mm_cvtsi64_si128(hopefully_eight_ascii_chars);
/* Interleave with zeros */
const __m128i eight_utf16_chars = _mm_unpacklo_epi8(eight_ascii_chars, zeros);
/* Store the resulting 16 bytes into destination */
_mm_storeu_si128((__m128i *)d, eight_utf16_chars);
d += 8;
}
#else
while (s < srcend - 4) {
codepoint = *((uint32_t *) s);
if ((codepoint & 0x80808080) != 0)
break;
s += 4;
/*
* Tried 32-bit stores here, but the extra bit-twiddling
* slowed the code down.
*/
*d++ = (uint16_t) (codepoint & 0xff);
*d++ = (uint16_t) ((codepoint >> 8) & 0xff);
*d++ = (uint16_t) ((codepoint >> 16) & 0xff);
*d++ = (uint16_t) ((codepoint >> 24) & 0xff);
}
#endif
last = s;
} /* end if (state == UTF8_ACCEPT) */
#endif

if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) {
if (state != UTF8_REJECT)
continue;
continue;
break;
}

if (codepoint <= 0xffff)
*d++ = (uint16_t) codepoint;
else {
*d++ = (uint16_t) (0xD7C0 + (codepoint >> 10));
*d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF));
if(codepoint < 0x80){
*d++ = (uint8_t) codepoint;
} else if(codepoint < 0x800){
*d++ = (uint8_t) (0xC0 + (codepoint >> 6));
*d++ = (uint8_t) (0x80 + (codepoint & 0x3F));
} else if(codepoint < 0x10000){
*d++ = (uint8_t) (0xE0 + (codepoint >> 12));
*d++ = (uint8_t) (0x80 + ((codepoint >> 6) & 0x3F));
*d++ = (uint8_t) (0x80 + (codepoint & 0x3F));
} else {
*d++ = (uint8_t) (0xF0 + (codepoint >> 18));
*d++ = (uint8_t) (0x80 + ((codepoint >> 12) & 0x3F));
*d++ = (uint8_t) (0x80 + ((codepoint >> 6) & 0x3F));
*d++ = (uint8_t) (0x80 + (codepoint & 0x3F));
}

last = s;
}

Expand All @@ -234,7 +161,7 @@ _hs_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
}

uint8_t const *
_hs_text_decode_utf8_state(uint16_t *const dest, size_t *destoff,
_hs_text_decode_utf8_state(uint8_t *const dest, size_t *destoff,
const uint8_t **src,
const uint8_t *srcend,
uint32_t *codepoint0, uint32_t *state0)
Expand All @@ -248,7 +175,7 @@ _hs_text_decode_utf8_state(uint16_t *const dest, size_t *destoff,
* Helper to decode buffer and discard final decoder state
*/
const uint8_t *
_hs_text_decode_utf8(uint16_t *const dest, size_t *destoff,
_hs_text_decode_utf8(uint8_t *const dest, size_t *destoff,
const uint8_t *src, const uint8_t *const srcend)
{
uint32_t codepoint;
Expand All @@ -257,90 +184,3 @@ _hs_text_decode_utf8(uint16_t *const dest, size_t *destoff,
&codepoint, &state);
return src;
}

/*
 * Encode a slice of 16-bit (UTF-16) code units as UTF-8 bytes.
 *
 * destp:  in/out. On entry *destp is where the first output byte is
 *         written; on return *destp points one past the last byte written.
 * src:    base of the array of 16-bit code units.
 * srcoff: index of the first unit to encode (counted in units, not bytes).
 * srclen: number of units to encode.
 *
 * No capacity check is made on the output buffer. The longest encoding
 * emitted below is 3 bytes for a single unit and 4 bytes for a surrogate
 * pair (two units), so at most 3 bytes are produced per input unit;
 * presumably the caller sizes the buffer accordingly -- TODO confirm.
 */
void
_hs_text_encode_utf8(uint8_t **destp, const uint16_t *src, size_t srcoff,
size_t srclen)
{
const uint16_t *srcend;
uint8_t *dest = *destp;

src += srcoff;
srcend = src + srclen;

/* Fast path for runs of ASCII; the scalar loop below jumps back here
   after every ASCII unit it emits. */
ascii:
#if defined(__x86_64__)
/* Process 8 code units (128 bits) per iteration. */
while (srcend - src >= 8) {
union { uint64_t halves[2]; __m128i whole; } eight_chars;
eight_chars.whole = _mm_loadu_si128((__m128i *) src);

/* halves[0] holds the first four units (x86_64 is little-endian).
   A unit is ASCII iff none of its bits in the mask 0xFF80 are set. */
const uint64_t w = eight_chars.halves[0];
if (w & 0xFF80FF80FF80FF80ULL) {
/* At least one of the first four units is non-ASCII. Emit the
   leading ASCII units (at most three: if units 0-2 are all ASCII,
   the offender must be unit 3) and leave the rest to the scalar loop.
   The stores truncate to uint8_t, which is lossless for units < 0x80. */
if (!(w & 0x000000000000FF80ULL)) {
*dest++ = w & 0xFFFF;
src++;
if (!(w & 0x00000000FF800000ULL)) {
*dest++ = (w >> 16) & 0xFFFF;
src++;
if (!(w & 0x0000FF8000000000ULL)) {
*dest++ = (w >> 32) & 0xFFFF;
src++;
}
}
}
break;
}

/* First four units are ASCII but the last four are not all ASCII:
   emit nothing here and let the scalar loop take over. */
if (eight_chars.halves[1] & 0xFF80FF80FF80FF80ULL) {
break;
}

/* All eight units are ASCII: narrow each 16-bit unit to one byte and
   store the resulting 8 bytes with a single 64-bit store. */
const __m128i eight_ascii_chars = _mm_packus_epi16(eight_chars.whole, eight_chars.whole);
_mm_storel_epi64((__m128i *)dest, eight_ascii_chars);

dest += 8;
src += 8;
}
#endif

#if defined(__i386__)
/* Process two code units per iteration through one 32-bit load. */
while (srcend - src >= 2) {
uint32_t w = *((uint32_t *) src);

/* Stop at the first pair that contains a non-ASCII unit. */
if (w & 0xFF80FF80)
break;
/* Low half is the first unit, high half the second; both are < 0x80
   here, so truncating to uint8_t keeps the full value. */
*dest++ = w & 0xFFFF;
*dest++ = w >> 16;
src += 2;
}
#endif

/* Scalar path: encode one code unit (or one surrogate pair) at a time. */
while (src < srcend) {
uint16_t w = *src++;

if (w <= 0x7F) {
/* 1-byte sequence. */
*dest++ = w;
/* An ASCII byte is likely to begin a run of ASCII bytes.
Falling back into the fast path really helps performance. */
goto ascii;
}
else if (w <= 0x7FF) {
/* 2-byte sequence: 110xxxxx 10xxxxxx. */
*dest++ = (w >> 6) | 0xC0;
*dest++ = (w & 0x3f) | 0x80;
}
else if (w < 0xD800 || w > 0xDBFF) {
/* 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx. Every unit that is not
   a high surrogate lands here, including an unpaired low surrogate
   (0xDC00-0xDFFF), which is encoded as-is. */
*dest++ = (w >> 12) | 0xE0;
*dest++ = ((w >> 6) & 0x3F) | 0x80;
*dest++ = (w & 0x3F) | 0x80;
} else {
/* High surrogate: combine it with the following unit into a code point
   >= 0x10000 and emit a 4-byte sequence.
   NOTE(review): the following unit is read without checking
   src < srcend or that it is a low surrogate -- presumably the input
   is guaranteed to be well-formed UTF-16; confirm against callers. */
uint32_t c = ((((uint32_t) w) - 0xD800) << 10) +
(((uint32_t) *src++) - 0xDC00) + 0x10000;
*dest++ = (c >> 18) | 0xF0;
*dest++ = ((c >> 12) & 0x3F) | 0x80;
*dest++ = ((c >> 6) & 0x3F) | 0x80;
*dest++ = (c & 0x3F) | 0x80;
}
}

/* Report the final write position back to the caller. */
*destp = dest;
}
37 changes: 15 additions & 22 deletions src/Data/Text.hs
Original file line number Diff line number Diff line change
Expand Up @@ -227,10 +227,8 @@ import Data.Text.Internal.Private (span_)
import Data.Text.Internal (Text(..), empty, firstf, mul, safe, text)
import Data.Text.Show (singleton, unpack, unpackCString#)
import qualified Prelude as P
import Data.Text.Unsafe (Iter(..), iter, iter_, lengthWord16, reverseIter,
import Data.Text.Unsafe (Iter(..), iter, iter_, lengthWord8, reverseIter,
reverseIter_, unsafeHead, unsafeTail)
import Data.Text.Internal.Unsafe.Char (unsafeChr)
import qualified Data.Text.Internal.Encoding.Utf16 as U16
import Data.Text.Internal.Search (indices)
#if defined(__HADDOCK__)
import Data.ByteString (ByteString)
Expand Down Expand Up @@ -487,12 +485,9 @@ second f (a, b) = (a, f b)
-- | /O(1)/ Returns the last character of a 'Text', which must be
-- non-empty.
last :: Text -> Char
last (Text arr off len)
| len <= 0 = emptyError "last"
| n < 0xDC00 || n > 0xDFFF = unsafeChr n
| otherwise = U16.chr2 n0 n
where n = A.unsafeIndex arr (off+len-1)
n0 = A.unsafeIndex arr (off+len-2)
last t@(Text _ _ len)
| len <= 0 = emptyError "last"
| otherwise = let Iter c _ = reverseIter t (len - 1) in c
{-# INLINE [1] last #-}

-- | /O(1)/ Returns all characters after the head of a 'Text', which
Expand All @@ -507,24 +502,21 @@ tail t@(Text arr off len)
-- | /O(1)/ Returns all but the last character of a 'Text', which must
-- be non-empty.
init :: Text -> Text
init (Text arr off len) | len <= 0 = emptyError "init"
| n >= 0xDC00 && n <= 0xDFFF = text arr off (len-2)
| otherwise = text arr off (len-1)
where
n = A.unsafeIndex arr (off+len-1)
init t@(Text arr off len)
| len <= 0 = emptyError "init"
| otherwise = text arr off (len + reverseIter_ t (len - 1))
{-# INLINE [1] init #-}

-- | /O(1)/ Returns all but the last character and the last character of a
-- 'Text', or 'Nothing' if empty.
--
-- @since 1.2.3.0
unsnoc :: Text -> Maybe (Text, Char)
unsnoc (Text arr off len)
| len <= 0 = Nothing
| n < 0xDC00 || n > 0xDFFF = Just (text arr off (len-1), unsafeChr n)
| otherwise = Just (text arr off (len-2), U16.chr2 n0 n)
where n = A.unsafeIndex arr (off+len-1)
n0 = A.unsafeIndex arr (off+len-2)
unsnoc t@(Text arr off len)
| len <= 0 = Nothing
| otherwise = Just (text arr off (len + d), c)
where
Iter c d = reverseIter t (len - 1)
{-# INLINE [1] unsnoc #-}

-- | /O(1)/ Tests whether a 'Text' is empty or not.
Expand Down Expand Up @@ -911,7 +903,7 @@ concat ts = case ts' of
_ -> Text (A.run go) 0 len
where
ts' = L.filter (not . null) ts
len = sumP "concat" $ L.map lengthWord16 ts'
len = sumP "concat" $ L.map lengthWord8 ts'
go :: ST s (A.MArray s)
go = do
arr <- A.new len
Expand Down Expand Up @@ -1569,9 +1561,10 @@ words t@(Text arr off len) = loop 0 0
| n >= len = if start == n
then []
else [Text arr (start+off) (n-start)]
-- A space character in UTF-8 takes from 1 byte (e.g. U+0009) up to 3 bytes (e.g. U+3000).
| isSpace c =
if start == n
then loop (start+1) (start+1)
then loop (n+d) (n+d)
else Text arr (start+off) (n-start) : loop (n+d) (n+d)
| otherwise = loop start (n+d)
where Iter c d = iter t n
Expand Down
Loading

0 comments on commit 1369cd3

Please sign in to comment.