Switch internal representation to UTF-8

haskell · Aug 21, 2021 · 1369cd3 · 1369cd3
1 parent d615c99
commit 1369cd3
Show file tree

Hide file tree

Showing 25 changed files with 387 additions and 574 deletions.
diff --git a/cbits/cbits.c b/cbits/cbits.c
@@ -19,7 +19,7 @@
 int _hs_text_memcmp(const void *a, size_t aoff, const void *b, size_t boff,
                     size_t n)
 {
-  return memcmp(a + (aoff<<1), b + (boff<<1), n<<1);
+  return memcmp(a + aoff, b + boff, n); /* offsets and length are passed through unscaled (the old code doubled each with <<1); adding to a void * is a compiler extension, e.g. GNU C treats it as byte-sized */
 }
 
 #define UTF8_ACCEPT 0
@@ -61,60 +61,24 @@ decode(uint32_t *state, uint32_t* codep, uint32_t byte) {
   return *state = utf8d[256 + *state + type];
 }
 
-/*
- * The ISO 8859-1 (aka latin-1) code points correspond exactly to the first 256 unicode
- * code-points, therefore we can trivially convert from a latin-1 encoded bytestring to
- * an UTF16 array
- */
-void
-_hs_text_decode_latin1(uint16_t *dest, const uint8_t *src,
+size_t /* return value: number of bytes written to dest (see the final return) */
+_hs_text_decode_latin1(uint8_t *dest, const uint8_t *src,
                        const uint8_t *srcend)
 {
+  const uint8_t *dest0 = dest; /* start of the output, kept to compute the written length */
   const uint8_t *p = src;
 
-#if defined(__i386__) || defined(__x86_64__)
-  /* This optimization works on a little-endian systems by using
-     (aligned) 32-bit loads instead of 8-bit loads
-   */
-
-  /* consume unaligned prefix */
-  while (p != srcend && (uintptr_t)p & 0x3)
-    *dest++ = *p++;
-
-#if defined(__x86_64__)
-  /* All the intrinsics used here are from SSE2,
-   * so every x86_64 CPU supports them.
-   */
-  const __m128i zeros = _mm_set1_epi32(0);
-  while (p < srcend - 7) {
-    /* Load 8 bytes of ASCII data */
-    const __m128i ascii = _mm_cvtsi64_si128(*((const uint64_t *)p));
-    /* Interleave with zeros */
-    const __m128i utf16 = _mm_unpacklo_epi8(ascii, zeros);
-    /* Store the resulting 16 bytes into destination */
-    _mm_storeu_si128((__m128i *)dest, utf16);
-
-    dest += 8;
-    p += 8;
-  }
-#else
-  /* iterate over 32-bit aligned loads */
-  while (p < srcend - 3) {
-    const uint32_t w = *((const uint32_t *)p);
-
-    *dest++ =  w        & 0xff;
-    *dest++ = (w >> 8)  & 0xff;
-    *dest++ = (w >> 16) & 0xff;
-    *dest++ = (w >> 24) & 0xff;
-
-    p += 4;
+  while (p != srcend){ /* one input byte per iteration, over [src, srcend) */
+    uint8_t codepoint = *p++;
+    if(codepoint < 0x80){ /* values below 0x80 are copied through as a single byte */
+      *dest++ = (uint8_t)codepoint;
+    } else { /* 0x80..0xFF become two bytes (the UTF-8 two-byte form): 0xC0 + top two bits, then 0x80 + low six bits */
+      *dest++ = (uint8_t) (0xC0 + (codepoint >> 6));
+      *dest++ = (uint8_t) (0x80 + (codepoint & 0x3F));
+    }
   }
-#endif
-#endif
 
-  /* handle unaligned suffix */
-  while (p != srcend)
-    *dest++ = *p++;
+  return (dest - dest0); /* no bounds check on dest above: the output can be up to twice the input length */
 }
 
 /*
@@ -146,82 +110,45 @@ _hs_text_decode_latin1(uint16_t *dest, const uint8_t *src,
  */
 #if defined(__GNUC__) || defined(__clang__)
 static inline uint8_t const *
-_hs_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
+_hs_text_decode_utf8_int(uint8_t *const dest, size_t *destoff,
 			 const uint8_t **src, const uint8_t *srcend,
 			 uint32_t *codepoint0, uint32_t *state0)
   __attribute((always_inline));
 #endif
 
 static inline uint8_t const *
-_hs_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
+_hs_text_decode_utf8_int(uint8_t *const dest, size_t *destoff,
 			 const uint8_t **src, const uint8_t *srcend,
 			 uint32_t *codepoint0, uint32_t *state0)
 {
-  uint16_t *d = dest + *destoff;
+  uint8_t *d = dest + *destoff;
   const uint8_t *s = *src, *last = *src;
   uint32_t state = *state0;
   uint32_t codepoint = *codepoint0;
 
   while (s < srcend) {
-#if defined(__i386__) || defined(__x86_64__)
-    /*
-     * This code will only work on a little-endian system that
-     * supports unaligned loads.
-     *
-     * It gives a substantial speed win on data that is purely or
-     * partly ASCII (e.g. HTML), at only a slight cost on purely
-     * non-ASCII text.
-     */
-
-    if (state == UTF8_ACCEPT) {
-#if defined(__x86_64__)
-      const __m128i zeros = _mm_set1_epi32(0);
-      while (s < srcend - 8) {
-        const uint64_t hopefully_eight_ascii_chars = *((uint64_t *) s);
-        if ((hopefully_eight_ascii_chars & 0x8080808080808080LL) != 0LL)
-          break;
-        s += 8;
-
-        /* Load 8 bytes of ASCII data */
-        const __m128i eight_ascii_chars = _mm_cvtsi64_si128(hopefully_eight_ascii_chars);
-        /* Interleave with zeros */
-        const __m128i eight_utf16_chars = _mm_unpacklo_epi8(eight_ascii_chars, zeros);
-        /* Store the resulting 16 bytes into destination */
-        _mm_storeu_si128((__m128i *)d, eight_utf16_chars);
-        d += 8;
-      }
-#else
-      while (s < srcend - 4) {
-        codepoint = *((uint32_t *) s);
-        if ((codepoint & 0x80808080) != 0)
-          break;
-        s += 4;
-        /*
-         * Tried 32-bit stores here, but the extra bit-twiddling
-         * slowed the code down.
-         */
-        *d++ = (uint16_t) (codepoint & 0xff);
-        *d++ = (uint16_t) ((codepoint >> 8) & 0xff);
-        *d++ = (uint16_t) ((codepoint >> 16) & 0xff);
-        *d++ = (uint16_t) ((codepoint >> 24) & 0xff);
-      }
-#endif
-      last = s;
-    } /* end if (state == UTF8_ACCEPT) */
-#endif
-
     if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) {
       if (state != UTF8_REJECT)
-	continue;
+	      continue;
       break;
     }
 
-    if (codepoint <= 0xffff)
-      *d++ = (uint16_t) codepoint;
-    else {
-      *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10));
-      *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF));
+    if(codepoint < 0x80){
+      *d++ = (uint8_t) codepoint;
+    } else if(codepoint < 0x800){
+      *d++ = (uint8_t) (0xC0 + (codepoint >> 6));
+      *d++ = (uint8_t) (0x80 + (codepoint & 0x3F));
+    } else if(codepoint < 0x10000){
+      *d++ = (uint8_t) (0xE0 + (codepoint >> 12));
+      *d++ = (uint8_t) (0x80 + ((codepoint >> 6) & 0x3F));
+      *d++ = (uint8_t) (0x80 + (codepoint & 0x3F));
+    } else {
+      *d++ = (uint8_t) (0xF0 + (codepoint >> 18));
+      *d++ = (uint8_t) (0x80 + ((codepoint >> 12) & 0x3F));
+      *d++ = (uint8_t) (0x80 + ((codepoint >> 6) & 0x3F));
+      *d++ = (uint8_t) (0x80 + (codepoint & 0x3F));
     }
+
     last = s;
   }
 
@@ -234,7 +161,7 @@ _hs_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
 }
 
 uint8_t const *
-_hs_text_decode_utf8_state(uint16_t *const dest, size_t *destoff,
+_hs_text_decode_utf8_state(uint8_t *const dest, size_t *destoff,
                            const uint8_t **src,
                            const uint8_t *srcend,
                            uint32_t *codepoint0, uint32_t *state0)
@@ -248,7 +175,7 @@ _hs_text_decode_utf8_state(uint16_t *const dest, size_t *destoff,
  * Helper to decode buffer and discard final decoder state
  */
 const uint8_t *
-_hs_text_decode_utf8(uint16_t *const dest, size_t *destoff,
+_hs_text_decode_utf8(uint8_t *const dest, size_t *destoff,
                      const uint8_t *src, const uint8_t *const srcend)
 {
   uint32_t codepoint;
@@ -257,90 +184,3 @@ _hs_text_decode_utf8(uint16_t *const dest, size_t *destoff,
                           &codepoint, &state);
   return src;
 }
-
-void
-_hs_text_encode_utf8(uint8_t **destp, const uint16_t *src, size_t srcoff,
-		     size_t srclen)
-{
-  const uint16_t *srcend;
-  uint8_t *dest = *destp;
-
-  src += srcoff;
-  srcend = src + srclen;
-
- ascii:
-#if defined(__x86_64__)
-  while (srcend - src >= 8) {
-    union { uint64_t halves[2]; __m128i whole; } eight_chars;
-    eight_chars.whole = _mm_loadu_si128((__m128i *) src);
-
-    const uint64_t w = eight_chars.halves[0];
-    if (w & 0xFF80FF80FF80FF80ULL) {
-      if (!(w & 0x000000000000FF80ULL)) {
-        *dest++ = w & 0xFFFF;
-        src++;
-        if (!(w & 0x00000000FF800000ULL)) {
-          *dest++ = (w >> 16) & 0xFFFF;
-          src++;
-          if (!(w & 0x0000FF8000000000ULL)) {
-            *dest++ = (w >> 32) & 0xFFFF;
-            src++;
-          }
-        }
-      }
-      break;
-    }
-
-    if (eight_chars.halves[1] & 0xFF80FF80FF80FF80ULL) {
-      break;
-    }
-
-    const __m128i eight_ascii_chars = _mm_packus_epi16(eight_chars.whole, eight_chars.whole);
-    _mm_storel_epi64((__m128i *)dest, eight_ascii_chars);
-
-    dest += 8;
-    src += 8;
-  }
-#endif
-
-#if defined(__i386__)
-  while (srcend - src >= 2) {
-    uint32_t w = *((uint32_t *) src);
-
-    if (w & 0xFF80FF80)
-      break;
-    *dest++ = w & 0xFFFF;
-    *dest++ = w >> 16;
-    src += 2;
-  }
-#endif
-
-  while (src < srcend) {
-    uint16_t w = *src++;
-
-    if (w <= 0x7F) {
-      *dest++ = w;
-      /* An ASCII byte is likely to begin a run of ASCII bytes.
-	 Falling back into the fast path really helps performance. */
-      goto ascii;
-    }
-    else if (w <= 0x7FF) {
-      *dest++ = (w >> 6) | 0xC0;
-      *dest++ = (w & 0x3f) | 0x80;
-    }
-    else if (w < 0xD800 || w > 0xDBFF) {
-      *dest++ = (w >> 12) | 0xE0;
-      *dest++ = ((w >> 6) & 0x3F) | 0x80;
-      *dest++ = (w & 0x3F) | 0x80;
-    } else {
-      uint32_t c = ((((uint32_t) w) - 0xD800) << 10) +
-	(((uint32_t) *src++) - 0xDC00) + 0x10000;
-      *dest++ = (c >> 18) | 0xF0;
-      *dest++ = ((c >> 12) & 0x3F) | 0x80;
-      *dest++ = ((c >> 6) & 0x3F) | 0x80;
-      *dest++ = (c & 0x3F) | 0x80;
-    }
-  }
-
-  *destp = dest;
-}
diff --git a/src/Data/Text.hs b/src/Data/Text.hs
@@ -227,10 +227,8 @@ import Data.Text.Internal.Private (span_)
 import Data.Text.Internal (Text(..), empty, firstf, mul, safe, text)
 import Data.Text.Show (singleton, unpack, unpackCString#)
 import qualified Prelude as P
-import Data.Text.Unsafe (Iter(..), iter, iter_, lengthWord16, reverseIter,
+import Data.Text.Unsafe (Iter(..), iter, iter_, lengthWord8, reverseIter,
                          reverseIter_, unsafeHead, unsafeTail)
-import Data.Text.Internal.Unsafe.Char (unsafeChr)
-import qualified Data.Text.Internal.Encoding.Utf16 as U16
 import Data.Text.Internal.Search (indices)
 #if defined(__HADDOCK__)
 import Data.ByteString (ByteString)
@@ -487,12 +485,9 @@ second f (a, b) = (a, f b)
 -- | /O(1)/ Returns the last character of a 'Text', which must be
 -- non-empty.
 last :: Text -> Char
-last (Text arr off len)
-    | len <= 0                 = emptyError "last"
-    | n < 0xDC00 || n > 0xDFFF = unsafeChr n
-    | otherwise                = U16.chr2 n0 n
-    where n  = A.unsafeIndex arr (off+len-1)
-          n0 = A.unsafeIndex arr (off+len-2)
+last t@(Text _ _ len)
+    | len <= 0  = emptyError "last" -- an empty 'Text' has no last character
+    | otherwise = let Iter c _ = reverseIter t (len - 1) in c -- only the character is kept; presumably reverseIter decodes backwards from the final code unit at index len-1 (TODO confirm against Data.Text.Unsafe)
 {-# INLINE [1] last #-}
 
 -- | /O(1)/ Returns all characters after the head of a 'Text', which
@@ -507,24 +502,21 @@ tail t@(Text arr off len)
 -- | /O(1)/ Returns all but the last character of a 'Text', which must
 -- be non-empty.
 init :: Text -> Text
-init (Text arr off len) | len <= 0                   = emptyError "init"
-                        | n >= 0xDC00 && n <= 0xDFFF = text arr off (len-2)
-                        | otherwise                  = text arr off (len-1)
-    where
-      n = A.unsafeIndex arr (off+len-1)
+init t@(Text arr off len)
+    | len <= 0  = emptyError "init" -- an empty 'Text' has no initial segment to return
+    | otherwise = text arr off (len + reverseIter_ t (len - 1)) -- same array and offset, new length; reverseIter_ presumably yields the negated width of the last character, since it is added to len (TODO confirm)
 {-# INLINE [1] init #-}
 
 -- | /O(1)/ Returns all but the last character and the last character of a
 -- 'Text', or 'Nothing' if empty.
 --
 -- @since 1.2.3.0
 unsnoc :: Text -> Maybe (Text, Char)
-unsnoc (Text arr off len)
-    | len <= 0                 = Nothing
-    | n < 0xDC00 || n > 0xDFFF = Just (text arr off (len-1), unsafeChr n)
-    | otherwise                = Just (text arr off (len-2), U16.chr2 n0 n)
-    where n  = A.unsafeIndex arr (off+len-1)
-          n0 = A.unsafeIndex arr (off+len-2)
+unsnoc t@(Text arr off len)
+    | len <= 0  = Nothing -- nothing to split off an empty 'Text'
+    | otherwise = Just (text arr off (len + d), c) -- prefix shares arr and off; d is added to len, so it is presumably a negative width (TODO confirm)
+        where
+            Iter c d = reverseIter t (len - 1) -- one backwards step supplies both the last character c and the length adjustment d
 {-# INLINE [1] unsnoc #-}
 
 -- | /O(1)/ Tests whether a 'Text' is empty or not.
@@ -911,7 +903,7 @@ concat ts = case ts' of
               _ -> Text (A.run go) 0 len
   where
     ts' = L.filter (not . null) ts
-    len = sumP "concat" $ L.map lengthWord16 ts'
+    len = sumP "concat" $ L.map lengthWord8 ts'
     go :: ST s (A.MArray s)
     go = do
       arr <- A.new len
@@ -1569,9 +1561,10 @@ words t@(Text arr off len) = loop 0 0
         | n >= len = if start == n
                      then []
                      else [Text arr (start+off) (n-start)]
+        -- Whitespace characters in UTF-8 take from 1 byte (e.g. U+0009) up to 3 bytes (e.g. U+3000), so the loop steps by the decoded width d rather than by 1.
         | isSpace c =
             if start == n
-            then loop (start+1) (start+1)
+            then loop (n+d) (n+d)
             else Text arr (start+off) (n-start) : loop (n+d) (n+d)
         | otherwise = loop start (n+d)
         where Iter c d = iter t n