From 87755a002b607a43960f6d2d1a9a7b0d59fb0bc0 Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Fri, 21 May 2021 01:54:41 +0100 Subject: [PATCH 01/38] Switch internal representation to UTF-8 --- README.markdown | 2 +- cbits/cbits.c | 232 +++--------------- cbits/utils.c | 11 + src/Data/Text.hs | 50 ++-- src/Data/Text/Array.hs | 38 +-- src/Data/Text/Encoding.hs | 139 ++++------- src/Data/Text/Foreign.hs | 105 ++++---- src/Data/Text/Internal.hs | 32 +-- src/Data/Text/Internal/Builder.hs | 4 +- src/Data/Text/Internal/Encoding/Fusion.hs | 6 +- src/Data/Text/Internal/Encoding/Utf8.hs | 23 +- src/Data/Text/Internal/Fusion.hs | 81 +++--- src/Data/Text/Internal/Fusion/Common.hs | 35 ++- src/Data/Text/Internal/Fusion/Size.hs | 12 +- src/Data/Text/Internal/Fusion/Types.hs | 2 +- .../Text/Internal/Lazy/Encoding/Fusion.hs | 10 +- src/Data/Text/Internal/Lazy/Fusion.hs | 4 +- src/Data/Text/Internal/Lazy/Search.hs | 18 +- src/Data/Text/Internal/Search.hs | 10 +- src/Data/Text/Internal/Unsafe/Char.hs | 47 ++-- src/Data/Text/Lazy.hs | 30 +-- src/Data/Text/Lazy/Builder/Int.hs | 9 +- src/Data/Text/Show.hs | 4 +- src/Data/Text/Unsafe.hs | 101 ++++---- tests/Tests/Properties/LowLevel.hs | 14 +- tests/Tests/QuickCheckUtils.hs | 4 +- tests/Tests/Regressions.hs | 6 +- text.cabal | 16 +- 28 files changed, 428 insertions(+), 617 deletions(-) create mode 100644 cbits/utils.c diff --git a/README.markdown b/README.markdown index cf355297..8c27cb0e 100644 --- a/README.markdown +++ b/README.markdown @@ -29,4 +29,4 @@ based on the stream fusion framework developed by Roman Leshchinskiy, Duncan Coutts, and Don Stewart. The core library was fleshed out, debugged, and tested by Bryan -O'Sullivan , and he is the current maintainer. +O'Sullivan. Transition from UTF-16 to UTF-8 is by Andrew Lelechenko. diff --git a/cbits/cbits.c b/cbits/cbits.c index 1d8322ae..33bab908 100644 --- a/cbits/cbits.c +++ b/cbits/cbits.c @@ -16,12 +16,6 @@ #include "text_cbits.h" -int _hs_text_memcmp(const void *a, size_t aoff, const void *b, size_t boff, - size_t n) -{ - return memcmp(a + (aoff<<1), b + (boff<<1), n<<1); -} - #define UTF8_ACCEPT 0 #define UTF8_REJECT 12 @@ -61,60 +55,24 @@ decode(uint32_t *state, uint32_t* codep, uint32_t byte) { return *state = utf8d[256 + *state + type]; } -/* - * The ISO 8859-1 (aka latin-1) code points correspond exactly to the first 256 unicode - * code-points, therefore we can trivially convert from a latin-1 encoded bytestring to - * an UTF16 array - */ -void -_hs_text_decode_latin1(uint16_t *dest, const uint8_t *src, +size_t +_hs_text_decode_latin1(uint8_t *dest, const uint8_t *src, const uint8_t *srcend) { + const uint8_t *dest0 = dest; const uint8_t *p = src; -#if defined(__i386__) || defined(__x86_64__) - /* This optimization works on a little-endian systems by using - (aligned) 32-bit loads instead of 8-bit loads - */ - - /* consume unaligned prefix */ - while (p != srcend && (uintptr_t)p & 0x3) - *dest++ = *p++; - -#if defined(__x86_64__) - /* All the intrinsics used here are from SSE2, - * so every x86_64 CPU supports them. 
- */ - const __m128i zeros = _mm_set1_epi32(0); - while (p < srcend - 7) { - /* Load 8 bytes of ASCII data */ - const __m128i ascii = _mm_cvtsi64_si128(*((const uint64_t *)p)); - /* Interleave with zeros */ - const __m128i utf16 = _mm_unpacklo_epi8(ascii, zeros); - /* Store the resulting 16 bytes into destination */ - _mm_storeu_si128((__m128i *)dest, utf16); - - dest += 8; - p += 8; - } -#else - /* iterate over 32-bit aligned loads */ - while (p < srcend - 3) { - const uint32_t w = *((const uint32_t *)p); - - *dest++ = w & 0xff; - *dest++ = (w >> 8) & 0xff; - *dest++ = (w >> 16) & 0xff; - *dest++ = (w >> 24) & 0xff; - - p += 4; + while (p != srcend){ + uint8_t codepoint = *p++; + if(codepoint < 0x80){ + *dest++ = (uint8_t)codepoint; + } else { + *dest++ = (uint8_t) (0xC0 + (codepoint >> 6)); + *dest++ = (uint8_t) (0x80 + (codepoint & 0x3F)); + } } -#endif -#endif - /* handle unaligned suffix */ - while (p != srcend) - *dest++ = *p++; + return (dest - dest0); } /* @@ -146,82 +104,45 @@ _hs_text_decode_latin1(uint16_t *dest, const uint8_t *src, */ #if defined(__GNUC__) || defined(__clang__) static inline uint8_t const * -_hs_text_decode_utf8_int(uint16_t *const dest, size_t *destoff, +_hs_text_decode_utf8_int(uint8_t *const dest, size_t *destoff, const uint8_t **src, const uint8_t *srcend, uint32_t *codepoint0, uint32_t *state0) __attribute((always_inline)); #endif static inline uint8_t const * -_hs_text_decode_utf8_int(uint16_t *const dest, size_t *destoff, +_hs_text_decode_utf8_int(uint8_t *const dest, size_t *destoff, const uint8_t **src, const uint8_t *srcend, uint32_t *codepoint0, uint32_t *state0) { - uint16_t *d = dest + *destoff; + uint8_t *d = dest + *destoff; const uint8_t *s = *src, *last = *src; uint32_t state = *state0; uint32_t codepoint = *codepoint0; while (s < srcend) { -#if defined(__i386__) || defined(__x86_64__) - /* - * This code will only work on a little-endian system that - * supports unaligned loads. - * - * It gives a substantial speed win on data that is purely or - * partly ASCII (e.g. HTML), at only a slight cost on purely - * non-ASCII text. - */ - - if (state == UTF8_ACCEPT) { -#if defined(__x86_64__) - const __m128i zeros = _mm_set1_epi32(0); - while (s < srcend - 8) { - const uint64_t hopefully_eight_ascii_chars = *((uint64_t *) s); - if ((hopefully_eight_ascii_chars & 0x8080808080808080LL) != 0LL) - break; - s += 8; - - /* Load 8 bytes of ASCII data */ - const __m128i eight_ascii_chars = _mm_cvtsi64_si128(hopefully_eight_ascii_chars); - /* Interleave with zeros */ - const __m128i eight_utf16_chars = _mm_unpacklo_epi8(eight_ascii_chars, zeros); - /* Store the resulting 16 bytes into destination */ - _mm_storeu_si128((__m128i *)d, eight_utf16_chars); - d += 8; - } -#else - while (s < srcend - 4) { - codepoint = *((uint32_t *) s); - if ((codepoint & 0x80808080) != 0) - break; - s += 4; - /* - * Tried 32-bit stores here, but the extra bit-twiddling - * slowed the code down. 
- */ - *d++ = (uint16_t) (codepoint & 0xff); - *d++ = (uint16_t) ((codepoint >> 8) & 0xff); - *d++ = (uint16_t) ((codepoint >> 16) & 0xff); - *d++ = (uint16_t) ((codepoint >> 24) & 0xff); - } -#endif - last = s; - } /* end if (state == UTF8_ACCEPT) */ -#endif - if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) { if (state != UTF8_REJECT) - continue; + continue; break; } - if (codepoint <= 0xffff) - *d++ = (uint16_t) codepoint; - else { - *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10)); - *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF)); + if(codepoint < 0x80){ + *d++ = (uint8_t) codepoint; + } else if(codepoint < 0x800){ + *d++ = (uint8_t) (0xC0 + (codepoint >> 6)); + *d++ = (uint8_t) (0x80 + (codepoint & 0x3F)); + } else if(codepoint < 0x10000){ + *d++ = (uint8_t) (0xE0 + (codepoint >> 12)); + *d++ = (uint8_t) (0x80 + ((codepoint >> 6) & 0x3F)); + *d++ = (uint8_t) (0x80 + (codepoint & 0x3F)); + } else { + *d++ = (uint8_t) (0xF0 + (codepoint >> 18)); + *d++ = (uint8_t) (0x80 + ((codepoint >> 12) & 0x3F)); + *d++ = (uint8_t) (0x80 + ((codepoint >> 6) & 0x3F)); + *d++ = (uint8_t) (0x80 + (codepoint & 0x3F)); } + last = s; } @@ -234,7 +155,7 @@ _hs_text_decode_utf8_int(uint16_t *const dest, size_t *destoff, } uint8_t const * -_hs_text_decode_utf8_state(uint16_t *const dest, size_t *destoff, +_hs_text_decode_utf8_state(uint8_t *const dest, size_t *destoff, const uint8_t **src, const uint8_t *srcend, uint32_t *codepoint0, uint32_t *state0) @@ -248,7 +169,7 @@ _hs_text_decode_utf8_state(uint16_t *const dest, size_t *destoff, * Helper to decode buffer and discard final decoder state */ const uint8_t * -_hs_text_decode_utf8(uint16_t *const dest, size_t *destoff, +_hs_text_decode_utf8(uint8_t *const dest, size_t *destoff, const uint8_t *src, const uint8_t *const srcend) { uint32_t codepoint; @@ -257,90 +178,3 @@ _hs_text_decode_utf8(uint16_t *const dest, size_t *destoff, &codepoint, &state); return src; } - -void -_hs_text_encode_utf8(uint8_t **destp, const uint16_t *src, size_t srcoff, - size_t srclen) -{ - const uint16_t *srcend; - uint8_t *dest = *destp; - - src += srcoff; - srcend = src + srclen; - - ascii: -#if defined(__x86_64__) - while (srcend - src >= 8) { - union { uint64_t halves[2]; __m128i whole; } eight_chars; - eight_chars.whole = _mm_loadu_si128((__m128i *) src); - - const uint64_t w = eight_chars.halves[0]; - if (w & 0xFF80FF80FF80FF80ULL) { - if (!(w & 0x000000000000FF80ULL)) { - *dest++ = w & 0xFFFF; - src++; - if (!(w & 0x00000000FF800000ULL)) { - *dest++ = (w >> 16) & 0xFFFF; - src++; - if (!(w & 0x0000FF8000000000ULL)) { - *dest++ = (w >> 32) & 0xFFFF; - src++; - } - } - } - break; - } - - if (eight_chars.halves[1] & 0xFF80FF80FF80FF80ULL) { - break; - } - - const __m128i eight_ascii_chars = _mm_packus_epi16(eight_chars.whole, eight_chars.whole); - _mm_storel_epi64((__m128i *)dest, eight_ascii_chars); - - dest += 8; - src += 8; - } -#endif - -#if defined(__i386__) - while (srcend - src >= 2) { - uint32_t w = *((uint32_t *) src); - - if (w & 0xFF80FF80) - break; - *dest++ = w & 0xFFFF; - *dest++ = w >> 16; - src += 2; - } -#endif - - while (src < srcend) { - uint16_t w = *src++; - - if (w <= 0x7F) { - *dest++ = w; - /* An ASCII byte is likely to begin a run of ASCII bytes. - Falling back into the fast path really helps performance. 
*/ - goto ascii; - } - else if (w <= 0x7FF) { - *dest++ = (w >> 6) | 0xC0; - *dest++ = (w & 0x3f) | 0x80; - } - else if (w < 0xD800 || w > 0xDBFF) { - *dest++ = (w >> 12) | 0xE0; - *dest++ = ((w >> 6) & 0x3F) | 0x80; - *dest++ = (w & 0x3F) | 0x80; - } else { - uint32_t c = ((((uint32_t) w) - 0xD800) << 10) + - (((uint32_t) *src++) - 0xDC00) + 0x10000; - *dest++ = (c >> 18) | 0xF0; - *dest++ = ((c >> 12) & 0x3F) | 0x80; - *dest++ = ((c >> 6) & 0x3F) | 0x80; - *dest++ = (c & 0x3F) | 0x80; - } - } - - *destp = dest; -} diff --git a/cbits/utils.c b/cbits/utils.c new file mode 100644 index 00000000..2baa78c7 --- /dev/null +++ b/cbits/utils.c @@ -0,0 +1,11 @@ +/* + * Copyright (c) 2021 Andrew Lelechenko + */ + +#include +#include + +int _hs_text_memcmp(const void *arr1, size_t off1, const void *arr2, size_t off2, size_t len) +{ + return memcmp(arr1 + off1, arr2 + off2, len); +} diff --git a/src/Data/Text.hs b/src/Data/Text.hs index 9bac1b9c..b035ed71 100644 --- a/src/Data/Text.hs +++ b/src/Data/Text.hs @@ -9,6 +9,7 @@ -- Copyright : (c) 2009, 2010, 2011, 2012 Bryan O'Sullivan, -- (c) 2009 Duncan Coutts, -- (c) 2008, 2009 Tom Harper +-- (c) 2021 Andrew Lelechenko -- -- License : BSD-style -- Maintainer : bos@serpentine.com @@ -227,10 +228,8 @@ import Data.Text.Internal.Private (span_) import Data.Text.Internal (Text(..), empty, firstf, mul, safe, text) import Data.Text.Show (singleton, unpack, unpackCString#) import qualified Prelude as P -import Data.Text.Unsafe (Iter(..), iter, iter_, lengthWord16, reverseIter, +import Data.Text.Unsafe (Iter(..), iter, iter_, lengthWord8, reverseIter, reverseIter_, unsafeHead, unsafeTail) -import Data.Text.Internal.Unsafe.Char (unsafeChr) -import qualified Data.Text.Internal.Encoding.Utf16 as U16 import Data.Text.Internal.Search (indices) #if defined(__HADDOCK__) import Data.ByteString (ByteString) @@ -291,7 +290,8 @@ import Text.Printf (PrintfArg, formatArg, formatString) -- points -- () -- as 'Char' values, including code points from this invalid range. --- This means that there are some 'Char' values that are not valid +-- This means that there are some 'Char' values +-- (corresponding to 'Data.Char.Surrogate' category) that are not valid -- Unicode scalar values, and the functions in this module must handle -- those cases. -- @@ -300,12 +300,7 @@ import Text.Printf (PrintfArg, formatArg, formatString) -- that are not valid Unicode scalar values with the replacement -- character \"�\" (U+FFFD). Functions that perform this -- inspection and replacement are documented with the phrase --- \"Performs replacement on invalid scalar values\". --- --- (One reason for this policy of replacement is that internally, a --- 'Text' value is represented as packed UTF-16 data. Values in the --- range U+D800 through U+DFFF are used by UTF-16 to denote surrogate --- code points, and so cannot be represented. The functions replace +-- \"Performs replacement on invalid scalar values\". The functions replace -- invalid scalar values, instead of dropping them, as a security -- measure. For details, see -- .) @@ -487,12 +482,9 @@ second f (a, b) = (a, f b) -- | /O(1)/ Returns the last character of a 'Text', which must be -- non-empty. 
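-- A rough standalone sketch (not part of this patch) of the backward step that
-- reverseIter performs and that the rewritten last, init and unsnoc below rely
-- on: starting from the last byte, continuation bytes (0x80..0xBF) are skipped
-- until a leader byte is reached. The sketch works on a plain list of valid
-- UTF-8 bytes rather than the library's arrays, and assumes a non-empty input
-- (matching the precondition of last); lastLeaderIndex is a hypothetical name.

import Data.Word (Word8)

-- Index of the leader byte of the final code point. Valid UTF-8 has at most
-- three continuation bytes per code point, so the loop takes O(1) steps.
lastLeaderIndex :: [Word8] -> Int
lastLeaderIndex bytes = go (length bytes - 1)
  where
    go i | i > 0 && isContinuation (bytes !! i) = go (i - 1)
         | otherwise                            = i
    isContinuation w = w >= 0x80 && w < 0xC0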
last :: Text -> Char -last (Text arr off len) - | len <= 0 = emptyError "last" - | n < 0xDC00 || n > 0xDFFF = unsafeChr n - | otherwise = U16.chr2 n0 n - where n = A.unsafeIndex arr (off+len-1) - n0 = A.unsafeIndex arr (off+len-2) +last t@(Text _ _ len) + | len <= 0 = emptyError "last" + | otherwise = let Iter c _ = reverseIter t (len - 1) in c {-# INLINE [1] last #-} -- | /O(1)/ Returns all characters after the head of a 'Text', which @@ -507,11 +499,9 @@ tail t@(Text arr off len) -- | /O(1)/ Returns all but the last character of a 'Text', which must -- be non-empty. init :: Text -> Text -init (Text arr off len) | len <= 0 = emptyError "init" - | n >= 0xDC00 && n <= 0xDFFF = text arr off (len-2) - | otherwise = text arr off (len-1) - where - n = A.unsafeIndex arr (off+len-1) +init t@(Text arr off len) + | len <= 0 = emptyError "init" + | otherwise = text arr off (len + reverseIter_ t (len - 1)) {-# INLINE [1] init #-} -- | /O(1)/ Returns all but the last character and the last character of a @@ -519,12 +509,11 @@ init (Text arr off len) | len <= 0 = emptyError "init" -- -- @since 1.2.3.0 unsnoc :: Text -> Maybe (Text, Char) -unsnoc (Text arr off len) - | len <= 0 = Nothing - | n < 0xDC00 || n > 0xDFFF = Just (text arr off (len-1), unsafeChr n) - | otherwise = Just (text arr off (len-2), U16.chr2 n0 n) - where n = A.unsafeIndex arr (off+len-1) - n0 = A.unsafeIndex arr (off+len-2) +unsnoc t@(Text arr off len) + | len <= 0 = Nothing + | otherwise = Just (text arr off (len + d), c) + where + Iter c d = reverseIter t (len - 1) {-# INLINE [1] unsnoc #-} -- | /O(1)/ Tests whether a 'Text' is empty or not. @@ -911,7 +900,7 @@ concat ts = case ts' of _ -> Text (A.run go) 0 len where ts' = L.filter (not . null) ts - len = sumP "concat" $ L.map lengthWord16 ts' + len = sumP "concat" $ L.map lengthWord8 ts' go :: ST s (A.MArray s) go = do arr <- A.new len @@ -1263,7 +1252,7 @@ groupBy p = loop where Iter c d = iter t 0 n = d + findAIndexOrEnd (not . p c) (Text arr (off+d) (len-d)) --- | Returns the /array/ index (in units of 'Word16') at which a +-- | Returns the /array/ index (in units of 'Word8') at which a -- character may be found. This is /not/ the same as the logical -- index returned by e.g. 'findIndex'. findAIndexOrEnd :: (Char -> Bool) -> Text -> Int @@ -1569,9 +1558,10 @@ words t@(Text arr off len) = loop 0 0 | n >= len = if start == n then [] else [Text arr (start+off) (n-start)] + -- Spaces in UTF-8 can take from 1 byte for 0x09 and up to 3 bytes for 0x3000. | isSpace c = if start == n - then loop (start+1) (start+1) + then loop (n+d) (n+d) else Text arr (start+off) (n-start) : loop (n+d) (n+d) | otherwise = loop start (n+d) where Iter c d = iter t n diff --git a/src/Data/Text/Array.hs b/src/Data/Text/Array.hs index c406c983..6566f40a 100644 --- a/src/Data/Text/Array.hs +++ b/src/Data/Text/Array.hs @@ -44,14 +44,14 @@ module Data.Text.Array import Control.Exception (assert) import GHC.Stack (HasCallStack) #endif -import Data.Bits ((.&.), xor, shiftL, shiftR) +import Data.Bits ((.&.), xor, shiftR) #if !MIN_VERSION_base(4,11,0) import Data.Text.Internal.Unsafe (inlinePerformIO) -#endif import Foreign.C.Types (CInt(..)) +#endif import GHC.Exts hiding (toList) import GHC.ST (ST(..), runST) -import GHC.Word (Word16(..)) +import GHC.Word (Word8(..)) import Prelude hiding (length, read) -- | Immutable array type. @@ -88,7 +88,7 @@ unsafeFreeze MArray{..} = ST $ \s1# -> -- | Indicate how many bytes would be used for an array of the given -- size. 
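-- With Word8 elements an Array is simply a buffer of UTF-8 code units. As a
-- small sketch (not part of this patch) using primitives from this module:
-- U+3000, the ideographic space mentioned in the Data.Text.words comment above,
-- takes the three code units 0xE3 0x80 0x80. ideographicSpace and
-- ideographicSpaceBytes are hypothetical names.

import qualified Data.Text.Array as A
import Data.Word (Word8)

-- Write the three-byte encoding of U+3000 into a fresh array and read it back.
ideographicSpace :: A.Array
ideographicSpace = A.run (do
  marr <- A.new 3
  A.unsafeWrite marr 0 0xE3
  A.unsafeWrite marr 1 0x80
  A.unsafeWrite marr 2 0x80
  return marr)

ideographicSpaceBytes :: [Word8]
ideographicSpaceBytes = A.toList ideographicSpace 0 3  -- [0xE3, 0x80, 0x80]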
bytesInArray :: Int -> Int -bytesInArray n = n `shiftL` 1 +bytesInArray n = n {-# INLINE bytesInArray #-} -- | Unchecked read of an immutable array. May return garbage or @@ -97,15 +97,15 @@ unsafeIndex :: #if defined(ASSERTS) HasCallStack => #endif - Array -> Int -> Word16 + Array -> Int -> Word8 unsafeIndex a@Array{..} i@(I# i#) = #if defined(ASSERTS) - let word16len = I# (sizeofByteArray# aBA) `quot` 2 in - if i < 0 || i >= word16len - then error ("Data.Text.Array.unsafeIndex: bounds error, offset " ++ show i ++ ", length " ++ show word16len) + let word8len = I# (sizeofByteArray# aBA) in + if i < 0 || i >= word8len + then error ("Data.Text.Array.unsafeIndex: bounds error, offset " ++ show i ++ ", length " ++ show word8len) else #endif - case indexWord16Array# aBA i# of r# -> (W16# r#) + case indexWord8Array# aBA i# of r# -> (W8# r#) {-# INLINE unsafeIndex #-} #if defined(ASSERTS) @@ -130,17 +130,17 @@ unsafeWrite :: #if defined(ASSERTS) HasCallStack => #endif - MArray s -> Int -> Word16 -> ST s () -unsafeWrite ma@MArray{..} i@(I# i#) (W16# e#) = + MArray s -> Int -> Word8 -> ST s () +unsafeWrite ma@MArray{..} i@(I# i#) (W8# e#) = #if defined(ASSERTS) - checkBoundsM ma (i * 2) 2 >> + checkBoundsM ma i 1 >> #endif - (ST $ \s1# -> case writeWord16Array# maBA i# e# s1# of + (ST $ \s1# -> case writeWord8Array# maBA i# e# s1# of s2# -> (# s2#, () #)) {-# INLINE unsafeWrite #-} -- | Convert an immutable array to a list. -toList :: Array -> Int -> Int -> [Word16] +toList :: Array -> Int -> Int -> [Word8] toList ary off len = loop 0 where loop i | i < len = unsafeIndex ary (off+i) : loop (i+1) | otherwise = [] @@ -176,10 +176,10 @@ copyM dst@(MArray dst#) dstOff@(I# dstOff#) src@(MArray src#) srcOff@(I# srcOff# #if defined(ASSERTS) srcLen <- getSizeofMArray src dstLen <- getSizeofMArray dst - assert (srcOff + count <= srcLen `quot` 2) . - assert (dstOff + count <= dstLen `quot` 2) . + assert (srcOff + count <= srcLen) . + assert (dstOff + count <= dstLen) . 
#endif - ST $ \s1# -> case copyMutableByteArray# src# (2# *# srcOff#) dst# (2# *# dstOff#) (2# *# count#) s1# of + ST $ \s1# -> case copyMutableByteArray# src# srcOff# dst# dstOff# count# s1# of s2# -> (# s2#, () #) {-# INLINE copyM #-} @@ -194,7 +194,7 @@ copyI :: MArray s -- ^ Destination copyI (MArray dst#) dstOff@(I# dstOff#) (Array src#) (I# srcOff#) top@(I# top#) | dstOff >= top = return () | otherwise = ST $ \s1# -> - case copyByteArray# src# (2# *# srcOff#) dst# (2# *# dstOff#) (2# *# (top# -# dstOff#)) s1# of + case copyByteArray# src# srcOff# dst# dstOff# (top# -# dstOff#) s1# of s2# -> (# s2#, () #) {-# INLINE copyI #-} @@ -209,7 +209,7 @@ equal :: Array -- ^ First equal (Array src1#) (I# off1#) (Array src2#) (I# off2#) (I# count#) = i == 0 where #if MIN_VERSION_base(4,11,0) - i = I# (compareByteArrays# src1# (2# *# off1#) src2# (2# *# off2#) (2# *# count#)) + i = I# (compareByteArrays# src1# off1# src2# off2# count#) #else i = fromIntegral (inlinePerformIO (memcmp src1# off1# src2# off2# count#)) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index d44e73ff..186e12b4 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -7,6 +7,7 @@ -- Copyright : (c) 2009, 2010, 2011 Bryan O'Sullivan, -- (c) 2009 Duncan Coutts, -- (c) 2008, 2009 Tom Harper +-- (c) 2021 Andrew Lelechenko -- -- License : BSD-style -- Maintainer : bos@serpentine.com @@ -63,30 +64,29 @@ import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) import Control.Exception (evaluate, try, throwIO, ErrorCall(ErrorCall)) import Control.Monad.ST (runST) -import Data.Bits ((.&.), shiftR) import Data.ByteString as B -import qualified Data.ByteString.Internal as B +import qualified Data.ByteString.Short.Internal as SBS import Data.Foldable (traverse_) import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode) import Data.Text.Internal (Text(..), safe, text) import Data.Text.Internal.Private (runText) import Data.Text.Internal.Unsafe (unsafeWithForeignPtr) -import Data.Text.Internal.Unsafe.Char (ord, unsafeWrite) +import Data.Text.Internal.Unsafe.Char (unsafeWrite) import Data.Text.Show () import Data.Text.Unsafe (unsafeDupablePerformIO) -import Data.Word (Word8, Word16, Word32) -import Foreign.C.Types (CSize(CSize)) +import Data.Word (Word8, Word32) +import Foreign.C.Types (CSize) import Foreign.Marshal.Utils (with) import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr) import Foreign.Storable (Storable, peek, poke) -import GHC.Base (ByteArray#, MutableByteArray#) +import GHC.Base (MutableByteArray#) import qualified Data.ByteString.Builder as B import qualified Data.ByteString.Builder.Internal as B hiding (empty, append) import qualified Data.ByteString.Builder.Prim as BP import qualified Data.ByteString.Builder.Prim.Internal as BP +import Data.Text.Internal.Encoding.Utf8 (utf8LengthByLeader) import qualified Data.Text.Array as A import qualified Data.Text.Internal.Encoding.Fusion as E -import qualified Data.Text.Internal.Encoding.Utf16 as U16 import qualified Data.Text.Internal.Fusion as F import Data.Text.Internal.ByteStringCompat #if defined(ASSERTS) @@ -124,12 +124,12 @@ decodeLatin1 :: #endif ByteString -> Text decodeLatin1 bs = withBS bs aux where - aux fp len = text a 0 len + aux fp len = text a 0 actualLen where - a = A.run (A.new len >>= unsafeIOToST . go) - go dest = unsafeWithForeignPtr fp $ \ptr -> do - c_decode_latin1 (A.maBA dest) ptr (ptr `plusPtr` len) - return dest + (a, actualLen) = A.run2 (A.new (2 * len) >>= unsafeIOToST . 
go) + go dest = unsafeWithForeignPtr fp $ \src -> do + destLen <- c_decode_latin1 (A.maBA dest) src (src `plusPtr` len) + return (dest, destLen) -- | Decode a 'ByteString' containing UTF-8 encoded text. -- @@ -161,6 +161,8 @@ decodeUtf8With onErr bs = withBS bs aux case onErr desc (Just x) of Nothing -> loop $ curPtr' `plusPtr` 1 Just c + -- TODO This is problematic, because even BMP replacement characters + -- can take longer than one UTF8 code unit (which is byte). | c > '\xFFFF' -> throwUnsupportedReplChar | otherwise -> do destOff <- peek destOffPtr @@ -170,43 +172,14 @@ decodeUtf8With onErr bs = withBS bs aux poke destOffPtr (destOff + intToCSize w) loop $ curPtr' `plusPtr` 1 loop ptr - (unsafeIOToST . go) =<< A.new len + -- TODO (len * 2 + 100) assumes that invalid input is asymptotically rare. + -- This is incorrect in general, but for now we just want to pass tests. + (unsafeIOToST . go) =<< A.new (len * 2 + 100) where desc = "Data.Text.Internal.Encoding.decodeUtf8: Invalid UTF-8 stream" throwUnsupportedReplChar = throwIO $ ErrorCall "decodeUtf8With: non-BMP replacement characters not supported" - -- TODO: The code currently assumes that the transcoded UTF-16 - -- stream is at most twice as long (in bytes) as the input UTF-8 - -- stream. To justify this assumption one has to assume that the - -- error handler replacement character also satisfies this - -- invariant, by emitting at most one UTF16 code unit. - -- - -- One easy way to support the full range of code-points for - -- replacement characters in the error handler is to simply change - -- the (over-)allocation to `A.new (2*len)` and then shrink back the - -- `ByteArray#` to the real size (recent GHCs have a cheap - -- `ByteArray#` resize-primop for that which allow the GC to reclaim - -- the overallocation). However, this would require 4 times as much - -- (temporary) storage as the original UTF-8 required. - -- - -- Another strategy would be to optimistically assume that - -- replacement characters are within the BMP, and if the case of a - -- non-BMP replacement occurs reallocate the target buffer (or throw - -- an exception, and fallback to a pessimistic codepath, like e.g. - -- `decodeUtf8With onErr bs = F.unstream (E.streamUtf8 onErr bs)`) - -- - -- Alternatively, `OnDecodeError` could become a datastructure which - -- statically encodes the replacement-character range, - -- e.g. something isomorphic to - -- - -- Either (... -> Maybe Word16) (... -> Maybe Char) - -- - -- And allow to statically switch between the BMP/non-BMP - -- replacement-character codepaths. There's multiple ways to address - -- this with different tradeoffs; but ideally we should optimise for - -- the optimistic/error-free case. -{- INLINE[0] decodeUtf8With #-} -- $stream -- @@ -304,14 +277,15 @@ streamDecodeUtf8With :: streamDecodeUtf8With onErr = decodeChunk B.empty 0 0 where -- We create a slightly larger than necessary buffer to accommodate a - -- potential surrogate pair started in the last buffer (@undecoded0@), or + -- potential code point started in the last buffer (@undecoded0@), or -- replacement characters for each byte in @undecoded0@ if the -- sequence turns out to be invalid. There can be up to three bytes there, - -- hence we allocate @len+3@ 16-bit words. + -- hence we allocate @len+3@ bytes. decodeChunk :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding decodeChunk undecoded0 codepoint0 state0 bs = withBS bs aux where - aux fp len = runST $ (unsafeIOToST . 
decodeChunkToBuffer) =<< A.new (len+3) + -- TODO Replace (+100) with something sensible. + aux fp len = runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+100) where decodeChunkToBuffer :: A.MArray s -> IO Decoding decodeChunkToBuffer dest = unsafeWithForeignPtr fp $ \ptr -> @@ -342,7 +316,7 @@ streamDecodeUtf8With onErr = decodeChunk B.empty 0 0 -- the previous chunk, we invalidate the bytes from -- @undecoded0@ and retry decoding the current chunk from -- the initial state. - traverse_ skipByte (B.unpack undecoded0 ) + traverse_ skipByte (B.unpack undecoded0) loop lastPtr else do peek lastPtr >>= skipByte @@ -436,50 +410,33 @@ encodeUtf8BuilderEscaped be = goPartial !iendTmp = go i0 op0 where go !i !op - | i < iendTmp = case A.unsafeIndex arr i of - w | w <= 0x7F -> do - BP.runB be (word16ToWord8 w) op >>= go (i + 1) - | w <= 0x7FF -> do - poke8 @Word16 0 $ (w `shiftR` 6) + 0xC0 - poke8 @Word16 1 $ (w .&. 0x3f) + 0x80 - go (i + 1) (op `plusPtr` 2) - | 0xD800 <= w && w <= 0xDBFF -> do - let c = ord $ U16.chr2 w (A.unsafeIndex arr (i+1)) - poke8 @Int 0 $ (c `shiftR` 18) + 0xF0 - poke8 @Int 1 $ ((c `shiftR` 12) .&. 0x3F) + 0x80 - poke8 @Int 2 $ ((c `shiftR` 6) .&. 0x3F) + 0x80 - poke8 @Int 3 $ (c .&. 0x3F) + 0x80 - go (i + 2) (op `plusPtr` 4) - | otherwise -> do - poke8 @Word16 0 $ (w `shiftR` 12) + 0xE0 - poke8 @Word16 1 $ ((w `shiftR` 6) .&. 0x3F) + 0x80 - poke8 @Word16 2 $ (w .&. 0x3F) + 0x80 - go (i + 1) (op `plusPtr` 3) - | otherwise = - outerLoop i (B.BufferRange op ope) + | i < iendTmp = case utf8LengthByLeader w of + 1 -> do + BP.runB be w op >>= go (i + 1) + 2 -> do + poke (op `plusPtr` 0) w + poke (op `plusPtr` 1) (A.unsafeIndex arr (i+1)) + go (i + 2) (op `plusPtr` 2) + 3 -> do + poke (op `plusPtr` 0) w + poke (op `plusPtr` 1) (A.unsafeIndex arr (i+1)) + poke (op `plusPtr` 2) (A.unsafeIndex arr (i+2)) + go (i + 3) (op `plusPtr` 3) + _ -> do + poke (op `plusPtr` 0) w + poke (op `plusPtr` 1) (A.unsafeIndex arr (i+1)) + poke (op `plusPtr` 2) (A.unsafeIndex arr (i+2)) + poke (op `plusPtr` 3) (A.unsafeIndex arr (i+3)) + go (i + 4) (op `plusPtr` 4) + | otherwise = outerLoop i (B.BufferRange op ope) where - -- Take care, a is either Word16 or Int above - poke8 :: Integral a => Int -> a -> IO () - poke8 j v = poke (op `plusPtr` j) (fromIntegral v :: Word8) + w = A.unsafeIndex arr i -- | Encode text using UTF-8 encoding. encodeUtf8 :: Text -> ByteString -encodeUtf8 (Text arr off len) +encodeUtf8 (Text (A.Array arr) off len) | len == 0 = B.empty - | otherwise = unsafeDupablePerformIO $ do - fp <- B.mallocByteString (len*3) -- see https://github.com/haskell/text/issues/194 for why len*3 is enough - unsafeWithForeignPtr fp $ \ptr -> - with ptr $ \destPtr -> do - c_encode_utf8 destPtr (A.aBA arr) (intToCSize off) (intToCSize len) - newDest <- peek destPtr - let utf8len = newDest `minusPtr` ptr - if utf8len >= len `shiftR` 1 - then return (mkBS fp utf8len) - else do - fp' <- B.mallocByteString utf8len - unsafeWithForeignPtr fp' $ \ptr' -> do - B.memcpy ptr' ptr utf8len - return (mkBS fp' utf8len) + | otherwise = B.take len $ B.drop off $ SBS.fromShort $ SBS.SBS arr -- | Decode text from little endian UTF-16 encoding. 
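-- A usage sketch (not part of this patch) of the incremental decoder whose
-- buffer sizing is adjusted above: chunks are pushed through the continuation
-- returned by streamDecodeUtf8, and a code point split across a chunk boundary
-- is carried over rather than reported as an error. decodeChunks is a
-- hypothetical helper; invalid bytes still raise a UnicodeException here
-- (streamDecodeUtf8With lets callers customise that).

import qualified Data.ByteString as B
import qualified Data.Text as T
import Data.Text.Encoding (Decoding(..), streamDecodeUtf8)

-- Decode a list of byte chunks incrementally, collecting the decoded pieces.
decodeChunks :: [B.ByteString] -> [T.Text]
decodeChunks = go (streamDecodeUtf8 B.empty)
  where
    go (Some t _undecoded k) (c:cs) = t : go (k c) cs
    go (Some t _undecoded _) []     = [t]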
decodeUtf16LEWith :: OnDecodeError -> ByteString -> Text @@ -563,9 +520,6 @@ cSizeToInt = fromIntegral intToCSize :: Int -> CSize intToCSize = fromIntegral -word16ToWord8 :: Word16 -> Word8 -word16ToWord8 = fromIntegral - foreign import ccall unsafe "_hs_text_decode_utf8" c_decode_utf8 :: MutableByteArray# s -> Ptr CSize -> Ptr Word8 -> Ptr Word8 -> IO (Ptr Word8) @@ -576,7 +530,4 @@ foreign import ccall unsafe "_hs_text_decode_utf8_state" c_decode_utf8_with_stat -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8) foreign import ccall unsafe "_hs_text_decode_latin1" c_decode_latin1 - :: MutableByteArray# s -> Ptr Word8 -> Ptr Word8 -> IO () - -foreign import ccall unsafe "_hs_text_encode_utf8" c_encode_utf8 - :: Ptr (Ptr Word8) -> ByteArray# -> CSize -> CSize -> IO () + :: MutableByteArray# s -> Ptr Word8 -> Ptr Word8 -> IO Int diff --git a/src/Data/Text/Foreign.hs b/src/Data/Text/Foreign.hs index 65805b2f..87742011 100644 --- a/src/Data/Text/Foreign.hs +++ b/src/Data/Text/Foreign.hs @@ -14,7 +14,7 @@ module Data.Text.Foreign ( -- * Interoperability with native code -- $interop - I16 + I8 -- * Safe conversion functions , fromPtr , useAsPtr @@ -23,12 +23,12 @@ module Data.Text.Foreign , peekCStringLen , withCStringLen -- * Unsafe conversion code - , lengthWord16 + , lengthWord8 , unsafeCopyToPtr -- * Low-level manipulation -- $lowlevel - , dropWord16 - , takeWord16 + , dropWord8 + , takeWord8 ) where #if defined(ASSERTS) @@ -39,8 +39,8 @@ import Data.ByteString.Unsafe (unsafePackCStringLen, unsafeUseAsCStringLen) import Data.Text.Encoding (decodeUtf8, encodeUtf8) import Data.Text.Internal (Text(..), empty) import Data.Text.Internal.Unsafe (unsafeWithForeignPtr) -import Data.Text.Unsafe (lengthWord16) -import Data.Word (Word16) +import Data.Text.Unsafe (lengthWord8) +import Data.Word (Word8) import Foreign.C.String (CStringLen) import Foreign.ForeignPtr (ForeignPtr, mallocForeignPtrArray) import Foreign.Marshal.Alloc (allocaBytes) @@ -54,24 +54,22 @@ import qualified Data.Text.Array as A -- to have a fixed address in the Haskell heap. All communication with -- native code must thus occur by copying data back and forth. -- --- The 'Text' type's internal representation is UTF-16, using the --- platform's native endianness. This makes copied data suitable for --- use with native libraries that use a similar representation, such --- as ICU. To interoperate with native libraries that use different --- internal representations, such as UTF-8 or UTF-32, consider using +-- The 'Text' type's internal representation is UTF-8. +-- To interoperate with native libraries that use different +-- internal representations, such as UTF-16 or UTF-32, consider using -- the functions in the 'Data.Text.Encoding' module. --- | A type representing a number of UTF-16 code units. -newtype I16 = I16 Int +-- | A type representing a number of UTF-8 code units. +newtype I8 = I8 Int deriving (Bounded, Enum, Eq, Integral, Num, Ord, Read, Real, Show) --- | /O(n)/ Create a new 'Text' from a 'Ptr' 'Word16' by copying the +-- | /O(n)/ Create a new 'Text' from a 'Ptr' 'Word8' by copying the -- contents of the array. 
-fromPtr :: Ptr Word16 -- ^ source array - -> I16 -- ^ length of source array (in 'Word16' units) +fromPtr :: Ptr Word8 -- ^ source array + -> I8 -- ^ length of source array (in 'Word8' units) -> IO Text -fromPtr _ (I16 0) = return empty -fromPtr ptr (I16 len) = +fromPtr _ (I8 0) = return empty +fromPtr ptr (I8 len) = #if defined(ASSERTS) assert (len > 0) $ #endif @@ -83,72 +81,77 @@ fromPtr ptr (I16 len) = loop !p !i | i == len = return marr | otherwise = do A.unsafeWrite marr i =<< unsafeIOToST (peek p) - loop (p `plusPtr` 2) (i + 1) + loop (p `plusPtr` 1) (i + 1) -- $lowlevel -- --- Foreign functions that use UTF-16 internally may return indices in --- units of 'Word16' instead of characters. These functions may +-- Foreign functions that use UTF-8 internally may return indices in +-- units of 'Word8' instead of characters. These functions may -- safely be used with such indices, as they will adjust offsets if -- necessary to preserve the validity of a Unicode string. --- | /O(1)/ Return the prefix of the 'Text' of @n@ 'Word16' units in +-- | /O(1)/ Return the prefix of the 'Text' of @n@ 'Word8' units in -- length. -- --- If @n@ would cause the 'Text' to end inside a surrogate pair, the --- end of the prefix will be advanced by one additional 'Word16' unit +-- If @n@ would cause the 'Text' to end inside a code point, the +-- end of the prefix will be advanced by several additional 'Word8' units -- to maintain its validity. -takeWord16 :: I16 -> Text -> Text -takeWord16 (I16 n) t@(Text arr off len) - | n <= 0 = empty - | n >= len || m >= len = t - | otherwise = Text arr off m - where - m | w < 0xD800 || w > 0xDBFF = n - | otherwise = n+1 - w = A.unsafeIndex arr (off+n-1) +takeWord8 :: I8 -> Text -> Text +takeWord8 = (fst .) . splitAtWord8 --- | /O(1)/ Return the suffix of the 'Text', with @n@ 'Word16' units +-- | /O(1)/ Return the suffix of the 'Text', with @n@ 'Word8' units -- dropped from its beginning. -- --- If @n@ would cause the 'Text' to begin inside a surrogate pair, the --- beginning of the suffix will be advanced by one additional 'Word16' +-- If @n@ would cause the 'Text' to begin inside a code point, the +-- beginning of the suffix will be advanced by several additional 'Word8' -- unit to maintain its validity. -dropWord16 :: I16 -> Text -> Text -dropWord16 (I16 n) t@(Text arr off len) - | n <= 0 = t - | n >= len || m >= len = empty - | otherwise = Text arr (off+m) (len-m) +dropWord8 :: I8 -> Text -> Text +dropWord8 = (snd .) . splitAtWord8 + +splitAtWord8 :: I8 -> Text -> (Text, Text) +splitAtWord8 (I8 n) t@(Text arr off len) + | n <= 0 = (empty, t) + | n >= len || m >= len = (t, empty) + | otherwise = (Text arr off m, Text arr (off+m) (len-m)) where - m | w < 0xD800 || w > 0xDBFF = n - | otherwise = n+1 - w = A.unsafeIndex arr (off+n-1) + m | w0 < 0x80 = n -- last char is ASCII + | w0 >= 0xF0 = n+3 -- last char starts 4-byte sequence + | w0 >= 0xE0 = n+2 -- last char starts 3-byte sequence + | w0 >= 0xC0 = n+1 -- last char starts 2-byte sequence + | w1 >= 0xF0 = n+2 -- pre-last char starts 4-byte sequence + | w1 >= 0xE0 = n+1 -- pre-last char starts 3-byte sequence + | w1 >= 0xC0 = n -- pre-last char starts 2-byte sequence + | w2 >= 0xF0 = n+1 -- pre-pre-last char starts 4-byte sequence + | otherwise = n -- pre-pre-last char starts 3-byte sequence + w0 = A.unsafeIndex arr (off+n-1) + w1 = A.unsafeIndex arr (off+n-2) + w2 = A.unsafeIndex arr (off+n-3) -- | /O(n)/ Copy a 'Text' to an array. The array is assumed to be big -- enough to hold the contents of the entire 'Text'. 
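-- A small usage sketch (not part of this patch) of the byte-based interface in
-- this module (fromPtr above, useAsPtr below): useAsPtr hands native code a
-- UTF-8 buffer whose I8 length counts Word8 units, and fromPtr rebuilds a Text
-- from such a buffer. roundTrip is a hypothetical helper.

import qualified Data.Text as T
import Data.Text.Foreign (fromPtr, useAsPtr)

-- Copy a Text into a temporary UTF-8 buffer and read it straight back;
-- fromPtr has exactly the shape the useAsPtr callback expects.
roundTrip :: T.Text -> IO T.Text
roundTrip t = useAsPtr t fromPtr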
-unsafeCopyToPtr :: Text -> Ptr Word16 -> IO () +unsafeCopyToPtr :: Text -> Ptr Word8 -> IO () unsafeCopyToPtr (Text arr off len) ptr = loop ptr off where end = off + len loop !p !i | i == end = return () | otherwise = do poke p (A.unsafeIndex arr i) - loop (p `plusPtr` 2) (i + 1) + loop (p `plusPtr` 1) (i + 1) -- | /O(n)/ Perform an action on a temporary, mutable copy of a -- 'Text'. The copy is freed as soon as the action returns. -useAsPtr :: Text -> (Ptr Word16 -> I16 -> IO a) -> IO a +useAsPtr :: Text -> (Ptr Word8 -> I8 -> IO a) -> IO a useAsPtr t@(Text _arr _off len) action = - allocaBytes (len * 2) $ \buf -> do + allocaBytes len $ \buf -> do unsafeCopyToPtr t buf - action (castPtr buf) (I16 len) + action (castPtr buf) (I8 len) -- | /O(n)/ Make a mutable copy of a 'Text'. -asForeignPtr :: Text -> IO (ForeignPtr Word16, I16) +asForeignPtr :: Text -> IO (ForeignPtr Word8, I8) asForeignPtr t@(Text _arr _off len) = do fp <- mallocForeignPtrArray len unsafeWithForeignPtr fp $ unsafeCopyToPtr t - return (fp, I16 len) + return (fp, I8 len) -- | /O(n)/ Decode a C string with explicit length, which is assumed -- to have been encoded as UTF-8. If decoding fails, a diff --git a/src/Data/Text/Internal.hs b/src/Data/Text/Internal.hs index 36fa36e0..b5a1d443 100644 --- a/src/Data/Text/Internal.hs +++ b/src/Data/Text/Internal.hs @@ -55,9 +55,9 @@ import qualified Data.Text.Array as A -- | A space efficient, packed, unboxed Unicode text type. data Text = Text - {-# UNPACK #-} !A.Array -- payload (Word16 elements) - {-# UNPACK #-} !Int -- offset (units of Word16, not Char) - {-# UNPACK #-} !Int -- length (units of Word16, not Char) + {-# UNPACK #-} !A.Array -- bytearray encoded as UTF-8 + {-# UNPACK #-} !Int -- offset in bytes (not in Char!), pointing to a start of UTF-8 sequence + {-# UNPACK #-} !Int -- length in bytes (not in Char!), pointing to an end of UTF-8 sequence deriving (Typeable) -- | Smart constructor. @@ -65,13 +65,16 @@ text_ :: #if defined(ASSERTS) HasCallStack => #endif - A.Array -> Int -> Int -> Text + A.Array -- ^ bytearray encoded as UTF-8 + -> Int -- ^ offset in bytes (not in Char!), pointing to a start of UTF-8 sequence + -> Int -- ^ length in bytes (not in Char!), pointing to an end of UTF-8 sequence + -> Text text_ arr off len = #if defined(ASSERTS) let c = A.unsafeIndex arr off in assert (len >= 0) . assert (off >= 0) . - assert (len == 0 || c < 0xDC00 || c > 0xDFFF) $ + assert (len == 0 || c < 0x80 || c >= 0xC0) $ #endif Text arr off len {-# INLINE text_ #-} @@ -92,7 +95,10 @@ text :: #if defined(ASSERTS) HasCallStack => #endif - A.Array -> Int -> Int -> Text + A.Array -- ^ bytearray encoded as UTF-8 + -> Int -- ^ offset in bytes (not in Char!), pointing to a start of UTF-8 sequence + -> Int -- ^ length in bytes (not in Char!), pointing to an end of UTF-8 sequence + -> Text text arr off len | len == 0 = empty | otherwise = text_ arr off len {-# INLINE text #-} @@ -109,7 +115,7 @@ showText (Text arr off len) = -- | Map a 'Char' to a 'Text'-safe value. -- --- UTF-16 surrogate code points are not included in the set of Unicode +-- Unicode 'Data.Char.Surrogate' code points are not included in the set of Unicode -- scalar values, but are unfortunately admitted as valid 'Char' -- values by Haskell. They cannot be represented in a 'Text'. This -- function remaps those code points to the Unicode replacement @@ -191,19 +197,17 @@ int64ToInt32 = fromIntegral -- $internals -- --- Internally, the 'Text' type is represented as an array of 'Word16' --- UTF-16 code units. 
The offset and length fields in the constructor +-- Internally, the 'Text' type is represented as an array of 'Word8' +-- UTF-8 code units. The offset and length fields in the constructor -- are in these units, /not/ units of 'Char'. -- -- Invariants that all functions must maintain: -- --- * Since the 'Text' type uses UTF-16 internally, it cannot represent +-- * Since the 'Text' type uses UTF-8 internally, it cannot represent -- characters in the reserved surrogate code point range U+D800 to -- U+DFFF. To maintain this invariant, the 'safe' function maps -- 'Char' values in this range to the replacement character (U+FFFD, -- \'�\'). -- --- * A leading (or \"high\") surrogate code unit (0xD800–0xDBFF) must --- always be followed by a trailing (or \"low\") surrogate code unit --- (0xDC00-0xDFFF). A trailing surrogate code unit must always be --- preceded by a leading surrogate code unit. +-- * Offset and length must point to a valid UTF-8 sequence of bytes. +-- Violation of this may cause memory access violation and divergence. diff --git a/src/Data/Text/Internal/Builder.hs b/src/Data/Text/Internal/Builder.hs index 3da66c42..9d181276 100644 --- a/src/Data/Text/Internal/Builder.hs +++ b/src/Data/Text/Internal/Builder.hs @@ -140,7 +140,7 @@ singleton :: HasCallStack => #endif Char -> Builder -singleton c = writeAtMost 2 $ \ marr o -> unsafeWrite marr o (safe c) +singleton c = writeAtMost 4 $ \ marr o -> unsafeWrite marr o (safe c) {-# INLINE singleton #-} ------------------------------------------------------------------------ @@ -185,7 +185,7 @@ fromString :: String -> Builder fromString str = Builder $ \k (Buffer p0 o0 u0 l0) -> let loop !marr !o !u !l [] = k (Buffer marr o u l) loop marr o u l s@(c:cs) - | l <= 1 = do + | l <= 3 = do arr <- A.unsafeFreeze marr let !t = Text arr o u marr' <- A.new chunkSize diff --git a/src/Data/Text/Internal/Encoding/Fusion.hs b/src/Data/Text/Internal/Encoding/Fusion.hs index b42315e0..aa8f0d02 100644 --- a/src/Data/Text/Internal/Encoding/Fusion.hs +++ b/src/Data/Text/Internal/Encoding/Fusion.hs @@ -43,7 +43,7 @@ import Data.Text.Internal.Fusion (Step(..), Stream(..)) import Data.Text.Internal.Fusion.Size import Data.Text.Encoding.Error import Data.Text.Internal.Encoding.Fusion.Common -import Data.Text.Internal.Unsafe.Char (unsafeChr, unsafeChr8, unsafeChr32) +import Data.Text.Internal.Unsafe.Char (unsafeChr8, unsafeChr16, unsafeChr32) import Data.Text.Internal.Unsafe (unsafeWithForeignPtr) import Data.Word (Word8, Word16, Word32) import Foreign.ForeignPtr (ForeignPtr) @@ -99,7 +99,7 @@ streamUtf16LE onErr bs = Stream next 0 (maxSize (l `shiftR` 1)) {-# INLINE next #-} next i | i >= l = Done - | i+1 < l && U16.validate1 x1 = Yield (unsafeChr x1) (i+2) + | i+1 < l && U16.validate1 x1 = Yield (unsafeChr16 x1) (i+2) | i+3 < l && U16.validate2 x1 x2 = Yield (U16.chr2 x1 x2) (i+4) | otherwise = decodeError "streamUtf16LE" "UTF-16LE" onErr Nothing (i+1) where @@ -117,7 +117,7 @@ streamUtf16BE onErr bs = Stream next 0 (maxSize (l `shiftR` 1)) {-# INLINE next #-} next i | i >= l = Done - | i+1 < l && U16.validate1 x1 = Yield (unsafeChr x1) (i+2) + | i+1 < l && U16.validate1 x1 = Yield (unsafeChr16 x1) (i+2) | i+3 < l && U16.validate2 x1 x2 = Yield (U16.chr2 x1 x2) (i+4) | otherwise = decodeError "streamUtf16BE" "UTF-16BE" onErr Nothing (i+1) where diff --git a/src/Data/Text/Internal/Encoding/Utf8.hs b/src/Data/Text/Internal/Encoding/Utf8.hs index f0a04fa7..fa69b7d9 100644 --- a/src/Data/Text/Internal/Encoding/Utf8.hs +++ b/src/Data/Text/Internal/Encoding/Utf8.hs 
@@ -5,6 +5,7 @@ -- Copyright : (c) 2008, 2009 Tom Harper, -- (c) 2009, 2010 Bryan O'Sullivan, -- (c) 2009 Duncan Coutts +-- (c) 2021 Andrew Lelechenko -- -- License : BSD-style -- Maintainer : bos@serpentine.com @@ -17,9 +18,10 @@ -- -- Basic UTF-8 validation and character manipulation. module Data.Text.Internal.Encoding.Utf8 - ( + ( utf8Length + , utf8LengthByLeader -- Decomposition - ord2 + , ord2 , ord3 , ord4 -- Construction @@ -34,7 +36,7 @@ module Data.Text.Internal.Encoding.Utf8 ) where import Data.Bits ((.&.), shiftR) -import Data.Text.Internal.Unsafe.Char (ord) +import Data.Char (ord) import GHC.Exts import GHC.Word (Word8(..)) @@ -52,6 +54,21 @@ between :: Word8 -- ^ byte to check between x y z = x >= y && x <= z {-# INLINE between #-} +-- TODO make branchless by looking into Word64 by clz (ord c) +utf8Length :: Char -> Int +utf8Length c + | ord c < 0x80 = 1 + | ord c < 0x800 = 2 + | ord c < 0x10000 = 3 + | otherwise = 4 + +utf8LengthByLeader :: Word8 -> Int +utf8LengthByLeader w + | w < 0x80 = 1 + | w < 0xE0 = 2 + | w < 0xF0 = 3 + | otherwise = 4 + ord2 :: Char -> (Word8,Word8) ord2 c = -- ord2 is used only in test suite to construct a deliberately invalid ByteString, diff --git a/src/Data/Text/Internal/Fusion.hs b/src/Data/Text/Internal/Fusion.hs index a96d2a17..01b781a1 100644 --- a/src/Data/Text/Internal/Fusion.hs +++ b/src/Data/Text/Internal/Fusion.hs @@ -50,19 +50,18 @@ module Data.Text.Internal.Fusion ) where import Prelude (Bool(..), Char, Maybe(..), Monad(..), Int, - Num(..), Ord(..), ($), (&&), - fromIntegral, otherwise) -import Data.Bits ((.&.), shiftL, shiftR) + Num(..), Ord(..), ($), + otherwise) +import Data.Bits (shiftL, shiftR) import Data.Text.Internal (Text(..)) import Data.Text.Internal.Private (runText) -import Data.Text.Internal.Unsafe.Char (ord, unsafeChr, unsafeWrite) +import Data.Text.Internal.Unsafe.Char (unsafeChr8, unsafeWrite) import qualified Data.Text.Array as A import qualified Data.Text.Internal.Fusion.Common as S import Data.Text.Internal.Fusion.Types import Data.Text.Internal.Fusion.Size import qualified Data.Text.Internal as I -import qualified Data.Text.Internal.Encoding.Utf16 as U16 -import Data.Word (Word16) +import qualified Data.Text.Internal.Encoding.Utf8 as U8 #if defined(ASSERTS) import GHC.Stack (HasCallStack) @@ -82,16 +81,24 @@ stream :: HasCallStack => #endif Text -> Stream Char -stream (Text arr off len) = Stream next off (betweenSize (len `shiftR` 1) len) +stream (Text arr off len) = Stream next off (betweenSize (len `shiftR` 2) len) where !end = off+len next !i - | i >= end = Done - | n >= 0xD800 && n <= 0xDBFF = Yield (U16.chr2 n n2) (i + 2) - | otherwise = Yield (unsafeChr n) (i + 1) + | i >= end = Done + | otherwise = Yield chr (i + l) where - n = A.unsafeIndex arr i - n2 = A.unsafeIndex arr (i + 1) + n0 = A.unsafeIndex arr i + n1 = A.unsafeIndex arr (i + 1) + n2 = A.unsafeIndex arr (i + 2) + n3 = A.unsafeIndex arr (i + 3) + + l = U8.utf8LengthByLeader n0 + chr = case l of + 1 -> unsafeChr8 n0 + 2 -> U8.chr2 n0 n1 + 3 -> U8.chr3 n0 n1 n2 + _ -> U8.chr4 n0 n1 n2 n3 {-# INLINE [0] stream #-} -- | /O(n)/ Converts 'Text' into a 'Stream' 'Char', but iterates @@ -101,16 +108,20 @@ stream (Text arr off len) = Stream next off (betweenSize (len `shiftR` 1) len) -- -- @'unstream' . 
'reverseStream' = 'Data.Text.reverse' @ reverseStream :: Text -> Stream Char -reverseStream (Text arr off len) = Stream next (off+len-1) (betweenSize (len `shiftR` 1) len) +reverseStream (Text arr off len) = Stream next (off+len-1) (betweenSize (len `shiftR` 2) len) where {-# INLINE next #-} next !i - | i < off = Done - | n >= 0xDC00 && n <= 0xDFFF = Yield (U16.chr2 n2 n) (i - 2) - | otherwise = Yield (unsafeChr n) (i - 1) + | i < off = Done + | n0 < 0x80 = Yield (unsafeChr8 n0) (i - 1) + | n1 >= 0xC0 = Yield (U8.chr2 n1 n0) (i - 2) + | n2 >= 0xC0 = Yield (U8.chr3 n2 n1 n0) (i - 3) + | otherwise = Yield (U8.chr4 n3 n2 n1 n0) (i - 4) where - n = A.unsafeIndex arr i - n2 = A.unsafeIndex arr (i - 1) + n0 = A.unsafeIndex arr i + n1 = A.unsafeIndex arr (i - 1) + n2 = A.unsafeIndex arr (i - 2) + n3 = A.unsafeIndex arr (i - 3) {-# INLINE [0] reverseStream #-} -- | /O(n)/ Convert 'Stream' 'Char' into a 'Text'. @@ -123,10 +134,10 @@ reverseStream (Text arr off len) = Stream next (off+len-1) (betweenSize (len `sh unstream :: Stream Char -> Text unstream (Stream next0 s0 len) = runText $ \done -> do -- Before encoding each char we perform a buffer realloc check assuming - -- worst case encoding size of two 16-bit units for the char. Just add an + -- worst case encoding size of four 8-bit units for the char. Just add an -- extra space to the buffer so that we do not end up reallocating even when -- all the chars are encoded as single unit. - let mlen = upperBound 4 len + 1 + let mlen = upperBound 4 len + 3 arr0 <- A.new mlen let outer !arr !maxi = encode where @@ -137,7 +148,7 @@ unstream (Stream next0 s0 len) = runText $ \done -> do Skip si' -> encode si' di Yield c si' -- simply check for the worst case - | maxi < di + 1 -> realloc si di + | maxi < di + 3 -> realloc si di | otherwise -> do n <- unsafeWrite arr di c encode si' (di + n) @@ -192,22 +203,12 @@ reverse (Stream next s len0) let newLen = len `shiftL` 1 marr' <- A.new newLen A.copyM marr' (newLen-len) marr 0 len - write s1 (len+i) newLen marr' - | otherwise -> write s1 i len marr - where n = ord x - least | n < 0x10000 = 0 - | otherwise = 1 - m = n - 0x10000 - lo = intToWord16 $ (m `shiftR` 10) + 0xD800 - hi = intToWord16 $ (m .&. 
0x3FF) + 0xDC00 - write t j l mar - | n < 0x10000 = do - A.unsafeWrite mar j (intToWord16 n) - loop t (j-1) l mar - | otherwise = do - A.unsafeWrite mar (j-1) lo - A.unsafeWrite mar j hi - loop t (j-2) l mar + _ <- unsafeWrite marr' (len + i - least) x + loop s1 (len + i - least - 1) newLen marr' + | otherwise -> do + _ <- unsafeWrite marr (i - least) x + loop s1 (i - least - 1) len marr + where least = U8.utf8Length x - 1 {-# INLINE [0] reverse #-} -- | /O(n)/ Perform the equivalent of 'scanr' over a list, only with @@ -304,9 +305,5 @@ mapAccumL f z0 (Stream next0 s0 len) = (nz, I.text na 0 nl) | otherwise -> do d <- unsafeWrite arr i c loop z' s' (i+d) where (z',c) = f z x - j | ord c < 0x10000 = i - | otherwise = i + 1 + j = i + U8.utf8Length c - 1 {-# INLINE [0] mapAccumL #-} - -intToWord16 :: Int -> Word16 -intToWord16 = fromIntegral diff --git a/src/Data/Text/Internal/Fusion/Common.hs b/src/Data/Text/Internal/Fusion/Common.hs index d9f83498..dc19cf1b 100644 --- a/src/Data/Text/Internal/Fusion/Common.hs +++ b/src/Data/Text/Internal/Fusion/Common.hs @@ -124,15 +124,17 @@ import Prelude (Bool(..), Char, Eq(..), Int, Integral, Maybe(..), (&&), fromIntegral, otherwise) import qualified Data.List as L import qualified Prelude as P -import Data.Bits (shiftL) import Data.Char (isLetter, isSpace) import Data.Int (Int64) +import Data.Text.Internal.Encoding.Utf8 (chr2, chr3, chr4, utf8LengthByLeader) import Data.Text.Internal.Fusion.Types import Data.Text.Internal.Fusion.CaseMapping (foldMapping, lowerMapping, titleMapping, upperMapping) import Data.Text.Internal.Fusion.Size -import GHC.Prim (Addr#, chr#, indexCharOffAddr#, ord#) -import GHC.Types (Char(..), Int(..)) +import GHC.Prim (Addr#, indexWord8OffAddr#) +import GHC.Types (Int(..)) +import Data.Text.Internal.Unsafe.Char (unsafeChr8) +import GHC.Word -- | /O(1)/ Convert a character into a 'Stream' -- @@ -185,23 +187,16 @@ streamCString# addr = Stream step 0 unknownSize where step !i | b == 0 = Done - | b <= 0x7f = Yield (C# b#) (i+1) - | b <= 0xdf = let !c = chr $ ((b-0xc0) `shiftL` 6) + next 1 - in Yield c (i+2) - | b <= 0xef = let !c = chr $ ((b-0xe0) `shiftL` 12) + - (next 1 `shiftL` 6) + - next 2 - in Yield c (i+3) - | otherwise = let !c = chr $ ((b-0xf0) `shiftL` 18) + - (next 1 `shiftL` 12) + - (next 2 `shiftL` 6) + - next 3 - in Yield c (i+4) - where b = I# (ord# b#) - next n = I# (ord# (at# (i+n))) - 0x80 - !b# = at# i - at# (I# i#) = indexCharOffAddr# addr i# - chr (I# i#) = C# (chr# i#) + | otherwise = Yield chr (i + l) + where b = at# i + l = utf8LengthByLeader b + next n = at# (i+n) + chr = case l of + 1 -> unsafeChr8 b + 2 -> chr2 b (next 1) + 3 -> chr3 b (next 1) (next 2) + _ -> chr4 b (next 1) (next 2) (next 3) + at# (I# i#) = W8# (indexWord8OffAddr# addr i#) {-# INLINE [0] streamCString# #-} -- ---------------------------------------------------------------------------- diff --git a/src/Data/Text/Internal/Fusion/Size.hs b/src/Data/Text/Internal/Fusion/Size.hs index 50118c97..d6420555 100644 --- a/src/Data/Text/Internal/Fusion/Size.hs +++ b/src/Data/Text/Internal/Fusion/Size.hs @@ -37,13 +37,13 @@ module Data.Text.Internal.Fusion.Size , isEmpty ) where -import Data.Char (ord) +import Data.Text.Internal.Encoding.Utf8 (utf8Length) import Data.Text.Internal (mul) #if defined(ASSERTS) import Control.Exception (assert) #endif --- | A size in UTF-16 code units. +-- | A size in UTF-8 code units (which is bytes). data Size = Between {-# UNPACK #-} !Int {-# UNPACK #-} !Int -- ^ Lower and upper bounds on size. 
| Unknown -- ^ Unknown size. deriving (Eq, Show) @@ -55,9 +55,7 @@ exactly _ = Nothing -- | The 'Size' of the given code point. charSize :: Char -> Size -charSize c - | ord c < 0x10000 = exactSize 1 - | otherwise = exactSize 2 +charSize = exactSize . utf8Length -- | The 'Size' of @n@ code points. codePointsSize :: Int -> Size @@ -65,7 +63,7 @@ codePointsSize n = #if defined(ASSERTS) assert (n >= 0) #endif - Between n (2*n) + Between n (4*n) {-# INLINE codePointsSize #-} exactSize :: Int -> Size @@ -160,7 +158,7 @@ upperBound _ (Between _ n) = n upperBound k _ = k {-# INLINE upperBound #-} --- | Compute the maximum size from a size hint, if possible. +-- | Compute the minimum size from a size hint, if possible. lowerBound :: Int -> Size -> Int lowerBound _ (Between n _) = n lowerBound k _ = k diff --git a/src/Data/Text/Internal/Fusion/Types.hs b/src/Data/Text/Internal/Fusion/Types.hs index df05773d..b97784f8 100644 --- a/src/Data/Text/Internal/Fusion/Types.hs +++ b/src/Data/Text/Internal/Fusion/Types.hs @@ -75,7 +75,7 @@ instance (Ord a) => Ord (Stream a) where -- unstreaming functions must be able to cope with the hint being too -- small or too large. -- --- The size hint tries to track the UTF-16 code units in a stream, +-- The size hint tries to track the UTF-8 code units in a stream, -- but often counts the number of code points instead. It can easily -- undercount if, for instance, a transformed stream contains astral -- plane code points (those above 0x10000). diff --git a/src/Data/Text/Internal/Lazy/Encoding/Fusion.hs b/src/Data/Text/Internal/Lazy/Encoding/Fusion.hs index dba6db34..69149779 100644 --- a/src/Data/Text/Internal/Lazy/Encoding/Fusion.hs +++ b/src/Data/Text/Internal/Lazy/Encoding/Fusion.hs @@ -41,7 +41,7 @@ import Data.Text.Internal.Encoding.Fusion.Common import Data.Text.Encoding.Error import Data.Text.Internal.Fusion (Step(..), Stream(..)) import Data.Text.Internal.Fusion.Size -import Data.Text.Internal.Unsafe.Char (unsafeChr, unsafeChr8, unsafeChr32) +import Data.Text.Internal.Unsafe.Char (unsafeChr8, unsafeChr16, unsafeChr32) import Data.Text.Internal.Unsafe (unsafeWithForeignPtr) import Data.Word (Word8, Word16, Word32) import qualified Data.Text.Internal.Encoding.Utf8 as U8 @@ -112,7 +112,7 @@ streamUtf16LE onErr bs0 = Stream next (T bs0 S0 0) unknownSize where next (T bs@(Chunk ps _) S0 i) | i + 1 < len && U16.validate1 x1 = - Yield (unsafeChr x1) (T bs S0 (i+2)) + Yield (unsafeChr16 x1) (T bs S0 (i+2)) | i + 3 < len && U16.validate2 x1 x2 = Yield (U16.chr2 x1 x2) (T bs S0 (i+4)) where len = B.length ps @@ -123,7 +123,7 @@ streamUtf16LE onErr bs0 = Stream next (T bs0 S0 0) unknownSize next st@(T bs s i) = case s of S2 w1 w2 | U16.validate1 (c w1 w2) -> - Yield (unsafeChr (c w1 w2)) es + Yield (unsafeChr16 (c w1 w2)) es S4 w1 w2 w3 w4 | U16.validate2 (c w1 w2) (c w3 w4) -> Yield (U16.chr2 (c w1 w2) (c w3 w4)) es _ -> consume st @@ -152,7 +152,7 @@ streamUtf16BE onErr bs0 = Stream next (T bs0 S0 0) unknownSize where next (T bs@(Chunk ps _) S0 i) | i + 1 < len && U16.validate1 x1 = - Yield (unsafeChr x1) (T bs S0 (i+2)) + Yield (unsafeChr16 x1) (T bs S0 (i+2)) | i + 3 < len && U16.validate2 x1 x2 = Yield (U16.chr2 x1 x2) (T bs S0 (i+4)) where len = B.length ps @@ -163,7 +163,7 @@ streamUtf16BE onErr bs0 = Stream next (T bs0 S0 0) unknownSize next st@(T bs s i) = case s of S2 w1 w2 | U16.validate1 (c w1 w2) -> - Yield (unsafeChr (c w1 w2)) es + Yield (unsafeChr16 (c w1 w2)) es S4 w1 w2 w3 w4 | U16.validate2 (c w1 w2) (c w3 w4) -> Yield (U16.chr2 (c w1 w2) (c w3 w4)) es 
_ -> consume st diff --git a/src/Data/Text/Internal/Lazy/Fusion.hs b/src/Data/Text/Internal/Lazy/Fusion.hs index c9b71c86..867d0ac4 100644 --- a/src/Data/Text/Internal/Lazy/Fusion.hs +++ b/src/Data/Text/Internal/Lazy/Fusion.hs @@ -79,8 +79,8 @@ unstreamChunks !chunkSize (Stream next s0 len0) where unknownLength = 4 where inner marr !len s !i - | i + 1 >= chunkSize = finish marr i s - | i + 1 >= len = {-# SCC "unstreamChunks/resize" #-} do + | i + 3 >= chunkSize = finish marr i s + | i + 3 >= len = {-# SCC "unstreamChunks/resize" #-} do let newLen = min (len `shiftL` 1) chunkSize marr' <- A.new newLen A.copyM marr' 0 marr 0 len diff --git a/src/Data/Text/Internal/Lazy/Search.hs b/src/Data/Text/Internal/Lazy/Search.hs index 72930482..78450cbf 100644 --- a/src/Data/Text/Internal/Lazy/Search.hs +++ b/src/Data/Text/Internal/Lazy/Search.hs @@ -25,7 +25,7 @@ module Data.Text.Internal.Lazy.Search import Data.Bits (unsafeShiftL) import qualified Data.Text.Array as A import Data.Int (Int64) -import Data.Word (Word16, Word64) +import Data.Word (Word8, Word64) import qualified Data.Text.Internal as T import Data.Text.Internal.Fusion.Types (PairS(..)) import Data.Text.Internal.Lazy (Text(..), foldlChunks) @@ -75,8 +75,8 @@ indices needle@(Chunk n ns) _haystack@(Chunk k ks) where fin _ (T.Text farr foff flen) = A.unsafeIndex farr (foff+flen-1) (mask :: Word64) :*: skip = buildTable n ns 0 0 0 (nlen-2) - swizzle :: Word16 -> Word64 - swizzle w = 1 `unsafeShiftL` (word16ToInt w .&. 0x3f) + swizzle :: Word8 -> Word64 + swizzle w = 1 `unsafeShiftL` (word8ToInt w .&. 0x3f) buildTable (T.Text xarr xoff xlen) xs = go where @@ -105,7 +105,7 @@ indices _ _ = [] -- | Fast index into a partly unpacked 'Text'. We take into account -- the possibility that the caller might try to access one element -- past the end. -index :: T.Text -> Text -> Int64 -> Word16 +index :: T.Text -> Text -> Int64 -> Word8 index (T.Text arr off len) xs !i | j < len = A.unsafeIndex arr (off+j) | otherwise = case xs of @@ -117,8 +117,8 @@ index (T.Text arr off len) xs !i Chunk c cs -> index c cs (i-intToInt64 len) where j = int64ToInt i --- | A variant of 'indices' that scans linearly for a single 'Word16'. -indicesOne :: Word16 -> Int64 -> T.Text -> Text -> [Int64] +-- | A variant of 'indices' that scans linearly for a single 'Word8'. +indicesOne :: Word8 -> Int64 -> T.Text -> Text -> [Int64] indicesOne c = chunk where chunk :: Int64 -> T.Text -> Text -> [Int64] @@ -131,7 +131,7 @@ indicesOne c = chunk | otherwise = go (h+1) where on = A.unsafeIndex oarr (ooff+h) --- | The number of 'Word16' values in a 'Text'. +-- | The number of 'Word8' values in a 'Text'. 
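-- A standalone sketch (not part of this patch) of the 64-bit filter that
-- swizzle builds for the searchers above: every needle byte sets the bit
-- indexed by its low six bits, so a haystack byte whose bit is clear provably
-- does not occur in the needle and the current window can be skipped.
-- needleMask and mayOccur are hypothetical names.

import Data.Bits ((.&.), (.|.), testBit, unsafeShiftL)
import Data.Word (Word64, Word8)

-- Build the occurrence mask for a needle given as a list of bytes.
needleMask :: [Word8] -> Word64
needleMask = foldr (\w acc -> acc .|. (1 `unsafeShiftL` (fromIntegral w .&. 0x3f))) 0

-- False: the byte certainly does not occur in the needle.
-- True: it may occur (several distinct bytes can share a bit).
mayOccur :: Word64 -> Word8 -> Bool
mayOccur mask w = testBit mask (fromIntegral w .&. 0x3f)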
wordLength :: Text -> Int64 wordLength = foldlChunks sumLength 0 where @@ -147,5 +147,5 @@ intToInt64 = fromIntegral int64ToInt :: Int64 -> Int int64ToInt = fromIntegral -word16ToInt :: Word16 -> Int -word16ToInt = fromIntegral +word8ToInt :: Word8 -> Int +word8ToInt = fromIntegral diff --git a/src/Data/Text/Internal/Search.hs b/src/Data/Text/Internal/Search.hs index 0aaab2ab..5688917f 100644 --- a/src/Data/Text/Internal/Search.hs +++ b/src/Data/Text/Internal/Search.hs @@ -32,7 +32,7 @@ module Data.Text.Internal.Search ) where import qualified Data.Text.Array as A -import Data.Word (Word64, Word16) +import Data.Word (Word64, Word8) import Data.Text.Internal (Text(..)) import Data.Bits ((.|.), (.&.), unsafeShiftL) @@ -67,8 +67,8 @@ indices _needle@(Text narr noff nlen) _haystack@(Text harr hoff hlen) skp' | c == z = nlen - i - 2 | otherwise = skp - swizzle :: Word16 -> Word64 - swizzle k = 1 `unsafeShiftL` (word16ToInt k .&. 0x3f) + swizzle :: Word8 -> Word64 + swizzle k = 1 `unsafeShiftL` (word8ToInt k .&. 0x3f) scan !i | i > ldiff = [] @@ -90,5 +90,5 @@ indices _needle@(Text narr noff nlen) _haystack@(Text harr hoff hlen) | otherwise = loop (i+1) {-# INLINE indices #-} -word16ToInt :: Word16 -> Int -word16ToInt = fromIntegral +word8ToInt :: Word8 -> Int +word8ToInt = fromIntegral diff --git a/src/Data/Text/Internal/Unsafe/Char.hs b/src/Data/Text/Internal/Unsafe/Char.hs index 7982e276..3f3372c8 100644 --- a/src/Data/Text/Internal/Unsafe/Char.hs +++ b/src/Data/Text/Internal/Unsafe/Char.hs @@ -19,14 +19,14 @@ module Data.Text.Internal.Unsafe.Char ( ord - , unsafeChr + , unsafeChr16 , unsafeChr8 , unsafeChr32 , unsafeWrite ) where import Control.Monad.ST (ST) -import Data.Bits ((.&.), shiftR) +import Data.Text.Internal.Encoding.Utf8 import GHC.Exts (Char(..), Int(..), chr#, ord#, word2Int#) import GHC.Word (Word8(..), Word16(..), Word32(..)) import qualified Data.Text.Array as A @@ -39,9 +39,9 @@ ord :: Char -> Int ord (C# c#) = I# (ord# c#) {-# INLINE ord #-} -unsafeChr :: Word16 -> Char -unsafeChr (W16# w#) = C# (chr# (word2Int# (word16ToWord# w#))) -{-# INLINE unsafeChr #-} +unsafeChr16 :: Word16 -> Char +unsafeChr16 (W16# w#) = C# (chr# (word2Int# (word16ToWord# w#))) +{-# INLINE unsafeChr16 #-} unsafeChr8 :: Word8 -> Char unsafeChr8 (W8# w#) = C# (chr# (word2Int# (word8ToWord# w#))) @@ -52,25 +52,36 @@ unsafeChr32 (W32# w#) = C# (chr# (word2Int# (word32ToWord# w#))) {-# INLINE unsafeChr32 #-} -- | Write a character into the array at the given offset. Returns --- the number of 'Word16's written. +-- the number of 'Word8's written. unsafeWrite :: #if defined(ASSERTS) HasCallStack => #endif A.MArray s -> Int -> Char -> ST s Int -unsafeWrite marr i c - | n < 0x10000 = do - A.unsafeWrite marr i (intToWord16 n) +unsafeWrite marr i c = case utf8Length c of + 1 -> do + let n0 = intToWord8 (ord c) + A.unsafeWrite marr i n0 return 1 - | otherwise = do - A.unsafeWrite marr i lo - A.unsafeWrite marr (i+1) hi + 2 -> do + let (n0, n1) = ord2 c + A.unsafeWrite marr i n0 + A.unsafeWrite marr (i+1) n1 return 2 - where n = ord c - m = n - 0x10000 - lo = intToWord16 $ (m `shiftR` 10) + 0xD800 - hi = intToWord16 $ (m .&. 
0x3FF) + 0xDC00 + 3 -> do + let (n0, n1, n2) = ord3 c + A.unsafeWrite marr i n0 + A.unsafeWrite marr (i+1) n1 + A.unsafeWrite marr (i+2) n2 + return 3 + _ -> do + let (n0, n1, n2, n3) = ord4 c + A.unsafeWrite marr i n0 + A.unsafeWrite marr (i+1) n1 + A.unsafeWrite marr (i+2) n2 + A.unsafeWrite marr (i+3) n3 + return 4 {-# INLINE unsafeWrite #-} -intToWord16 :: Int -> Word16 -intToWord16 = fromIntegral +intToWord8 :: Int -> Word8 +intToWord8 = fromIntegral diff --git a/src/Data/Text/Lazy.hs b/src/Data/Text/Lazy.hs index 8f67b653..bb716286 100644 --- a/src/Data/Text/Lazy.hs +++ b/src/Data/Text/Lazy.hs @@ -253,7 +253,8 @@ import GHC.Stack (HasCallStack) -- points -- () -- as 'Char' values, including code points from this invalid range. --- This means that there are some 'Char' values that are not valid +-- This means that there are some 'Char' values +-- (corresponding to 'Data.Char.Surrogate' category) that are not valid -- Unicode scalar values, and the functions in this module must handle -- those cases. -- @@ -262,12 +263,7 @@ import GHC.Stack (HasCallStack) -- that are not valid Unicode scalar values with the replacement -- character \"�\" (U+FFFD). Functions that perform this -- inspection and replacement are documented with the phrase --- \"Performs replacement on invalid scalar values\". --- --- (One reason for this policy of replacement is that internally, a --- 'Text' value is represented as packed UTF-16 data. Values in the --- range U+D800 through U+DFFF are used by UTF-16 to denote surrogate --- code points, and so cannot be represented. The functions replace +-- \"Performs replacement on invalid scalar values\". The functions replace -- invalid scalar values, instead of dropping them, as a security -- measure. For details, see -- .) @@ -283,13 +279,13 @@ equal Empty _ = False equal _ Empty = False equal (Chunk a as) (Chunk b bs) = case compare lenA lenB of - LT -> a == (T.takeWord16 lenA b) && - as `equal` Chunk (T.dropWord16 lenA b) bs + LT -> a == (T.takeWord8 lenA b) && + as `equal` Chunk (T.dropWord8 lenA b) bs EQ -> a == b && as `equal` bs - GT -> T.takeWord16 lenB a == b && - Chunk (T.dropWord16 lenB a) as `equal` bs - where lenA = T.lengthWord16 a - lenB = T.lengthWord16 b + GT -> T.takeWord8 lenB a == b && + Chunk (T.dropWord8 lenB a) as `equal` bs + where lenA = T.lengthWord8 a + lenB = T.lengthWord8 b instance Eq Text where (==) = equal @@ -1040,9 +1036,9 @@ dropEnd n t0 T.dropEnd (int64ToInt m) t : ts where l = intToInt64 (T.length t) --- | /O(n)/ 'dropWords' @n@ returns the suffix with @n@ 'Word16' +-- | /O(n)/ 'dropWords' @n@ returns the suffix with @n@ 'Word8' -- values dropped, or the empty 'Text' if @n@ is greater than the --- number of 'Word16' values present. +-- number of 'Word8' values present. dropWords :: Int64 -> Text -> Text dropWords i t0 | i <= 0 = t0 @@ -1082,7 +1078,7 @@ takeWhileEnd :: (Char -> Bool) -> Text -> Text takeWhileEnd p = takeChunk empty . L.reverse . toChunks where takeChunk acc [] = acc takeChunk acc (t:ts) - | T.lengthWord16 t' < T.lengthWord16 t + | T.lengthWord8 t' < T.lengthWord8 t = chunk t' acc | otherwise = takeChunk (Chunk t' acc) ts where t' = T.takeWhileEnd p t @@ -1164,7 +1160,7 @@ splitAt = loop where len = intToInt64 (T.length t) -- | /O(n)/ 'splitAtWord' @n t@ returns a strict pair whose first --- element is a prefix of @t@ whose chunks contain @n@ 'Word16' +-- element is a prefix of @t@ whose chunks contain @n@ 'Word8' -- values, and whose second is the remainder of the string. 
splitAtWord :: Int64 -> Text -> PairS Text Text splitAtWord _ Empty = empty :*: empty diff --git a/src/Data/Text/Lazy/Builder/Int.hs b/src/Data/Text/Lazy/Builder/Int.hs index a6a79d64..f861d1e5 100644 --- a/src/Data/Text/Lazy/Builder/Int.hs +++ b/src/Data/Text/Lazy/Builder/Int.hs @@ -102,15 +102,15 @@ posDecimal marr off0 ds v0 = go (off0 + ds - 1) v0 let i = fromIntegral i0; j = i + i unsafeWrite marr off $ get (j + 1) unsafeWrite marr (off - 1) $ get j - get = word8ToWord16 . B.unsafeIndex digits + get = B.unsafeIndex digits -minus, zero :: Word16 +minus, zero :: Word8 {-# INLINE minus #-} {-# INLINE zero #-} minus = 45 zero = 48 -i2w :: (Integral a) => a -> Word16 +i2w :: (Integral a) => a -> Word8 {-# INLINE i2w #-} i2w v = zero + fromIntegral v @@ -242,6 +242,3 @@ integer base i | otherwise = loop (d-1) q <> hexDigit r where q = n `quotInt` base r = n `remInt` base - -word8ToWord16 :: Word8 -> Word16 -word8ToWord16 = fromIntegral diff --git a/src/Data/Text/Show.hs b/src/Data/Text/Show.hs index 8d7ceb0e..9baeaeee 100644 --- a/src/Data/Text/Show.hs +++ b/src/Data/Text/Show.hs @@ -21,6 +21,7 @@ module Data.Text.Show import Control.Monad.ST (ST) import Data.Text.Internal (Text(..), empty_, safe) +import Data.Text.Internal.Encoding.Utf8 (utf8Length) import Data.Text.Internal.Fusion (stream, unstream) import Data.Text.Internal.Unsafe.Char (unsafeWrite) import GHC.Prim (Addr#) @@ -95,7 +96,6 @@ singleton_ c = Text (A.run x) 0 len x = do arr <- A.new len _ <- unsafeWrite arr 0 d return arr - len | d < '\x10000' = 1 - | otherwise = 2 + len = utf8Length d d = safe c {-# NOINLINE singleton_ #-} diff --git a/src/Data/Text/Unsafe.hs b/src/Data/Text/Unsafe.hs index 88727074..64cc83c4 100644 --- a/src/Data/Text/Unsafe.hs +++ b/src/Data/Text/Unsafe.hs @@ -20,19 +20,19 @@ module Data.Text.Unsafe , reverseIter_ , unsafeHead , unsafeTail - , lengthWord16 - , takeWord16 - , dropWord16 + , lengthWord8 + , takeWord8 + , dropWord8 ) where #if defined(ASSERTS) import Control.Exception (assert) import GHC.Stack (HasCallStack) #endif -import Data.Text.Internal.Encoding.Utf16 (chr2) +import Data.Text.Internal.Encoding.Utf8 (chr2, chr3, chr4, utf8LengthByLeader) import Data.Text.Internal (Text(..)) import Data.Text.Internal.Unsafe (inlineInterleaveST, inlinePerformIO) -import Data.Text.Internal.Unsafe.Char (unsafeChr) +import Data.Text.Internal.Unsafe.Char (unsafeChr8) import qualified Data.Text.Array as A import GHC.IO (unsafeDupablePerformIO) @@ -40,11 +40,15 @@ import GHC.IO (unsafeDupablePerformIO) -- omits the check for the empty case, so there is an obligation on -- the programmer to provide a proof that the 'Text' is non-empty. unsafeHead :: Text -> Char -unsafeHead (Text arr off _len) - | m < 0xD800 || m > 0xDBFF = unsafeChr m - | otherwise = chr2 m n - where m = A.unsafeIndex arr off - n = A.unsafeIndex arr (off+1) +unsafeHead (Text arr off _len) = case utf8LengthByLeader m0 of + 1 -> unsafeChr8 m0 + 2 -> chr2 m0 m1 + 3 -> chr3 m0 m1 m2 + _ -> chr4 m0 m1 m2 m3 + where m0 = A.unsafeIndex arr off + m1 = A.unsafeIndex arr (off+1) + m2 = A.unsafeIndex arr (off+2) + m3 = A.unsafeIndex arr (off+3) {-# INLINE unsafeHead #-} -- | /O(1)/ A variant of 'tail' for non-empty 'Text'. 
'unsafeTail' @@ -60,8 +64,9 @@ unsafeTail t@(Text arr off len) = {-# INLINE unsafeTail #-} data Iter = Iter {-# UNPACK #-} !Char {-# UNPACK #-} !Int + deriving (Show) --- | /O(1)/ Iterate (unsafely) one step forwards through a UTF-16 +-- | /O(1)/ Iterate (unsafely) one step forwards through a UTF-8 -- array, returning the current character and the delta to add to give -- the next offset to iterate at. iter :: @@ -69,61 +74,73 @@ iter :: HasCallStack => #endif Text -> Int -> Iter -iter (Text arr off _len) i - | m < 0xD800 || m > 0xDBFF = Iter (unsafeChr m) 1 - | otherwise = Iter (chr2 m n) 2 - where m = A.unsafeIndex arr j - n = A.unsafeIndex arr k +iter (Text arr off _len) i = Iter chr l + where m0 = A.unsafeIndex arr j + m1 = A.unsafeIndex arr (j+1) + m2 = A.unsafeIndex arr (j+2) + m3 = A.unsafeIndex arr (j+3) j = off + i - k = j + 1 + l = utf8LengthByLeader m0 + chr = case l of + 1 -> unsafeChr8 m0 + 2 -> chr2 m0 m1 + 3 -> chr3 m0 m1 m2 + _ -> chr4 m0 m1 m2 m3 {-# INLINE iter #-} --- | /O(1)/ Iterate one step through a UTF-16 array, returning the +-- | /O(1)/ Iterate one step through a UTF-8 array, returning the -- delta to add to give the next offset to iterate at. iter_ :: Text -> Int -> Int -iter_ (Text arr off _len) i | m < 0xD800 || m > 0xDBFF = 1 - | otherwise = 2 +iter_ (Text arr off _len) i = utf8LengthByLeader m where m = A.unsafeIndex arr (off+i) {-# INLINE iter_ #-} --- | /O(1)/ Iterate one step backwards through a UTF-16 array, +-- | /O(1)/ Iterate one step backwards through a UTF-8 array, -- returning the current character and the delta to add (i.e. a -- negative number) to give the next offset to iterate at. reverseIter :: Text -> Int -> Iter reverseIter (Text arr off _len) i - | m < 0xDC00 || m > 0xDFFF = Iter (unsafeChr m) (-1) - | otherwise = Iter (chr2 n m) (-2) - where m = A.unsafeIndex arr j - n = A.unsafeIndex arr k + | m0 < 0x80 = Iter (unsafeChr8 m0) (-1) + | m1 >= 0xC0 = Iter (chr2 m1 m0) (-2) + | m2 >= 0xC0 = Iter (chr3 m2 m1 m0) (-3) + | otherwise = Iter (chr4 m3 m2 m1 m0) (-4) + where m0 = A.unsafeIndex arr j + m1 = A.unsafeIndex arr (j-1) + m2 = A.unsafeIndex arr (j-2) + m3 = A.unsafeIndex arr (j-3) j = off + i - k = j - 1 {-# INLINE reverseIter #-} --- | /O(1)/ Iterate one step backwards through a UTF-16 array, +-- | /O(1)/ Iterate one step backwards through a UTF-8 array, -- returning the delta to add (i.e. a negative number) to give the -- next offset to iterate at. -- -- @since 1.1.1.0 reverseIter_ :: Text -> Int -> Int reverseIter_ (Text arr off _len) i - | m < 0xDC00 || m > 0xDFFF = -1 - | otherwise = -2 - where m = A.unsafeIndex arr (off+i) + | m0 < 0x80 = -1 + | m1 >= 0xC0 = -2 + | m2 >= 0xC0 = -3 + | otherwise = -4 + where m0 = A.unsafeIndex arr j + m1 = A.unsafeIndex arr (j-1) + m2 = A.unsafeIndex arr (j-2) + j = off + i {-# INLINE reverseIter_ #-} --- | /O(1)/ Return the length of a 'Text' in units of 'Word16'. This +-- | /O(1)/ Return the length of a 'Text' in units of 'Word8'. This -- is useful for sizing a target array appropriately before using -- 'unsafeCopyToPtr'. -lengthWord16 :: Text -> Int -lengthWord16 (Text _arr _off len) = len -{-# INLINE lengthWord16 #-} +lengthWord8 :: Text -> Int +lengthWord8 (Text _arr _off len) = len +{-# INLINE lengthWord8 #-} --- | /O(1)/ Unchecked take of 'k' 'Word16's from the front of a 'Text'. -takeWord16 :: Int -> Text -> Text -takeWord16 k (Text arr off _len) = Text arr off k -{-# INLINE takeWord16 #-} +-- | /O(1)/ Unchecked take of 'k' 'Word8's from the front of a 'Text'. 
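Because these slicing primitives count 'Word8' units rather than characters, the caller is responsible for passing offsets that fall on character boundaries. A minimal illustrative check (not part of the API), relying on the same fact used by 'reverseIter' above, namely that continuation bytes have the form 10xxxxxx:

import Data.Bits ((.&.))
import Data.Word (Word8)

-- A byte may begin a character unless it is a continuation byte 10xxxxxx.
isBoundaryByte :: Word8 -> Bool
isBoundaryByte w = w .&. 0xC0 /= 0x80

main :: IO ()
main = print (map isBoundaryByte [0x61, 0xC3, 0xA9])  -- [True,True,False]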
+takeWord8 :: Int -> Text -> Text +takeWord8 k (Text arr off _len) = Text arr off k +{-# INLINE takeWord8 #-} --- | /O(1)/ Unchecked drop of 'k' 'Word16's from the front of a 'Text'. -dropWord16 :: Int -> Text -> Text -dropWord16 k (Text arr off len) = Text arr (off+k) (len-k) -{-# INLINE dropWord16 #-} +-- | /O(1)/ Unchecked drop of 'k' 'Word8's from the front of a 'Text'. +dropWord8 :: Int -> Text -> Text +dropWord8 k (Text arr off len) = Text arr (off+k) (len-k) +{-# INLINE dropWord8 #-} diff --git a/tests/Tests/Properties/LowLevel.hs b/tests/Tests/Properties/LowLevel.hs index 1972fd36..79aecd05 100644 --- a/tests/Tests/Properties/LowLevel.hs +++ b/tests/Tests/Properties/LowLevel.hs @@ -10,7 +10,7 @@ import Control.Exception as E (SomeException, catch, evaluate) import Data.Int (Int32, Int64) import Data.Text.Foreign import Data.Text.Internal (mul, mul32, mul64) -import Data.Word (Word16, Word32) +import Data.Word (Word8, Word16, Word32) import System.IO.Unsafe (unsafePerformIO) import Test.Tasty (TestTree, testGroup) import Test.Tasty.QuickCheck (testProperty) @@ -46,9 +46,9 @@ t_mul a b = mulRef a b === eval mul a b -- Misc. -t_dropWord16 m t = dropWord16 m t `T.isSuffixOf` t -t_takeWord16 m t = takeWord16 m t `T.isPrefixOf` t -t_take_drop_16 (Small n) t = T.append (takeWord16 n t) (dropWord16 n t) === t +t_dropWord8 m t = dropWord8 m t `T.isSuffixOf` t +t_takeWord8 m t = takeWord8 m t `T.isPrefixOf` t +t_take_drop_8 (Small n) t = T.append (takeWord8 n t) (dropWord8 n t) === t t_use_from t = ioProperty $ (==t) <$> useAsPtr t fromPtr t_copy t = T.copy t === t @@ -80,9 +80,9 @@ testLowLevel = ], testGroup "misc" [ - testProperty "t_dropWord16" t_dropWord16, - testProperty "t_takeWord16" t_takeWord16, - testProperty "t_take_drop_16" t_take_drop_16, + testProperty "t_dropWord8" t_dropWord8, + testProperty "t_takeWord8" t_takeWord8, + testProperty "t_take_drop_8" t_take_drop_8, testProperty "t_use_from" t_use_from, testProperty "t_copy" t_copy ], diff --git a/tests/Tests/QuickCheckUtils.hs b/tests/Tests/QuickCheckUtils.hs index e6e1a60d..94790fa0 100644 --- a/tests/Tests/QuickCheckUtils.hs +++ b/tests/Tests/QuickCheckUtils.hs @@ -32,7 +32,7 @@ import Control.Arrow ((***)) import Control.DeepSeq (NFData (..), deepseq) import Control.Exception (bracket) import Data.Char (isSpace) -import Data.Text.Foreign (I16) +import Data.Text.Foreign (I8) import Data.Text.Lazy.Builder.RealFloat (FPFormat(..)) import Data.Word (Word8, Word16) import Test.QuickCheck hiding (Fixed(..), Small (..), (.&.)) @@ -51,7 +51,7 @@ import qualified System.IO as IO genWord8 :: Gen Word8 genWord8 = chooseAny -instance Arbitrary I16 where +instance Arbitrary I8 where arbitrary = arbitrarySizedIntegral shrink = shrinkIntegral diff --git a/tests/Tests/Regressions.hs b/tests/Tests/Regressions.hs index 157d0e89..90de08bf 100644 --- a/tests/Tests/Regressions.hs +++ b/tests/Tests/Regressions.hs @@ -80,10 +80,10 @@ mapAccumL_resize = do let f a _ = (a, '\65536') count = 5 val = T.mapAccumL f (0::Int) (T.replicate count "a") - assertEqual "mapAccumL should correctly fill buffers for two-word results" + assertEqual "mapAccumL should correctly fill buffers for four-byte results" (0, T.replicate count "\65536") val - assertEqual "mapAccumL should correctly size buffers for two-word results" - (count * 2) (T.lengthWord16 (snd val)) + assertEqual "mapAccumL should correctly size buffers for four-byte results" + (count * 4) (T.lengthWord8 (snd val)) -- See GitHub #197 t197 :: IO () diff --git a/text.cabal b/text.cabal index 
2535129c..ccf86a6d 100644 --- a/text.cabal +++ b/text.cabal @@ -8,7 +8,7 @@ synopsis: An efficient packed Unicode text type. description: . An efficient packed, immutable Unicode text type (both strict and - lazy), with a powerful loop fusion optimization framework. + lazy). . The 'Text' type represents Unicode character strings, in a time and space-efficient manner. This package provides text processing @@ -37,23 +37,12 @@ description: the [text-icu package](https://hackage.haskell.org/package/text-icu) based on the well-respected and liberally licensed [ICU library](http://site.icu-project.org/). - . - == Internal Representation: UTF-16 vs. UTF-8 - . - Currently the @text@ library uses UTF-16 as its internal representation - which is [neither a fixed-width nor always the most dense representation](http://utf8everywhere.org/) - for Unicode text. We're currently investigating the feasibility - of [changing Text's internal representation to UTF-8](https://github.com/text-utf8) - and if you need such a 'Text' type right now you might be interested in using the spin-off - packages and - . - license: BSD2 license-file: LICENSE author: Bryan O'Sullivan maintainer: Haskell Text Team , Core Libraries Committee -copyright: 2009-2011 Bryan O'Sullivan, 2008-2009 Tom Harper +copyright: 2009-2011 Bryan O'Sullivan, 2008-2009 Tom Harper, 2021 Andrew Lelechenko category: Data, Text build-type: Simple tested-with: GHC==9.0.1, @@ -76,6 +65,7 @@ flag developer library c-sources: cbits/cbits.c + cbits/utils.c include-dirs: include hs-source-dirs: src From d3beb9436e5eee3e52628f84d002c425cf010639 Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Tue, 24 Aug 2021 22:11:52 +0100 Subject: [PATCH 02/38] Rename constructors in Data.Array to highlight compatibility issues in downstream packages --- src/Data/Text/Array.hs | 36 ++++++++++++++++-------------------- src/Data/Text/Encoding.hs | 24 ++++++++++++------------ tests/Tests/Regressions.hs | 11 ++++++----- 3 files changed, 34 insertions(+), 37 deletions(-) diff --git a/src/Data/Text/Array.hs b/src/Data/Text/Array.hs index 6566f40a..c9bff89a 100644 --- a/src/Data/Text/Array.hs +++ b/src/Data/Text/Array.hs @@ -55,14 +55,10 @@ import GHC.Word (Word8(..)) import Prelude hiding (length, read) -- | Immutable array type. --- --- The 'Array' constructor is exposed since @text-1.1.1.3@ -data Array = Array { aBA :: ByteArray# } +data Array = ByteArray ByteArray# -- | Mutable array type, for use in the ST monad. --- --- The 'MArray' constructor is exposed since @text-1.1.1.3@ -data MArray s = MArray { maBA :: MutableByteArray# s } +data MArray s = MutableByteArray (MutableByteArray# s) -- | Create an uninitialized mutable array. new :: forall s. Int -> ST s (MArray s) @@ -70,7 +66,7 @@ new n | n < 0 || n .&. highBit /= 0 = array_size_error | otherwise = ST $ \s1# -> case newByteArray# len# s1# of - (# s2#, marr# #) -> (# s2#, MArray marr# #) + (# s2#, marr# #) -> (# s2#, MutableByteArray marr# #) where !(I# len#) = bytesInArray n highBit = maxBound `xor` (maxBound `shiftR` 1) {-# INLINE new #-} @@ -80,9 +76,9 @@ array_size_error = error "Data.Text.Array.new: size overflow" -- | Freeze a mutable array. Do not mutate the 'MArray' afterwards! 
unsafeFreeze :: MArray s -> ST s Array -unsafeFreeze MArray{..} = ST $ \s1# -> - case unsafeFreezeByteArray# maBA s1# of - (# s2#, ba# #) -> (# s2#, Array ba# #) +unsafeFreeze (MutableByteArray marr) = ST $ \s1# -> + case unsafeFreezeByteArray# marr s1# of + (# s2#, ba# #) -> (# s2#, ByteArray ba# #) {-# INLINE unsafeFreeze #-} -- | Indicate how many bytes would be used for an array of the given @@ -98,22 +94,22 @@ unsafeIndex :: HasCallStack => #endif Array -> Int -> Word8 -unsafeIndex a@Array{..} i@(I# i#) = +unsafeIndex (ByteArray arr) i@(I# i#) = #if defined(ASSERTS) - let word8len = I# (sizeofByteArray# aBA) in + let word8len = I# (sizeofByteArray# arr) in if i < 0 || i >= word8len then error ("Data.Text.Array.unsafeIndex: bounds error, offset " ++ show i ++ ", length " ++ show word8len) else #endif - case indexWord8Array# aBA i# of r# -> (W8# r#) + case indexWord8Array# arr i# of r# -> (W8# r#) {-# INLINE unsafeIndex #-} #if defined(ASSERTS) -- sizeofMutableByteArray# is deprecated, because it is unsafe in the presence of -- shrinkMutableByteArray# and resizeMutableByteArray#. getSizeofMArray :: MArray s -> ST s Int -getSizeofMArray ma@MArray{..} = ST $ \s0# -> - case getSizeofMutableByteArray# maBA s0# of +getSizeofMArray (MutableByteArray marr) = ST $ \s0# -> + case getSizeofMutableByteArray# marr s0# of (# s1#, word8len# #) -> (# s1#, I# word8len# #) checkBoundsM :: HasCallStack => MArray s -> Int -> Int -> ST s () @@ -131,11 +127,11 @@ unsafeWrite :: HasCallStack => #endif MArray s -> Int -> Word8 -> ST s () -unsafeWrite ma@MArray{..} i@(I# i#) (W8# e#) = +unsafeWrite ma@(MutableByteArray marr) i@(I# i#) (W8# e#) = #if defined(ASSERTS) checkBoundsM ma i 1 >> #endif - (ST $ \s1# -> case writeWord8Array# maBA i# e# s1# of + (ST $ \s1# -> case writeWord8Array# marr i# e# s1# of s2# -> (# s2#, () #)) {-# INLINE unsafeWrite #-} @@ -170,7 +166,7 @@ copyM :: MArray s -- ^ Destination -> Int -- ^ Source offset -> Int -- ^ Count -> ST s () -copyM dst@(MArray dst#) dstOff@(I# dstOff#) src@(MArray src#) srcOff@(I# srcOff#) count@(I# count#) +copyM dst@(MutableByteArray dst#) dstOff@(I# dstOff#) src@(MutableByteArray src#) srcOff@(I# srcOff#) count@(I# count#) | I# count# <= 0 = return () | otherwise = do #if defined(ASSERTS) @@ -191,7 +187,7 @@ copyI :: MArray s -- ^ Destination -> Int -- ^ First offset in destination /not/ to -- copy (i.e. /not/ length) -> ST s () -copyI (MArray dst#) dstOff@(I# dstOff#) (Array src#) (I# srcOff#) top@(I# top#) +copyI (MutableByteArray dst#) dstOff@(I# dstOff#) (ByteArray src#) (I# srcOff#) top@(I# top#) | dstOff >= top = return () | otherwise = ST $ \s1# -> case copyByteArray# src# srcOff# dst# dstOff# (top# -# dstOff#) s1# of @@ -206,7 +202,7 @@ equal :: Array -- ^ First -> Int -- ^ Offset into second -> Int -- ^ Count -> Bool -equal (Array src1#) (I# off1#) (Array src2#) (I# off2#) (I# count#) = i == 0 +equal (ByteArray src1#) (I# off1#) (ByteArray src2#) (I# off2#) (I# count#) = i == 0 where #if MIN_VERSION_base(4,11,0) i = I# (compareByteArrays# src1# off1# src2# off2# count#) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index 186e12b4..6a42b52b 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -127,9 +127,9 @@ decodeLatin1 bs = withBS bs aux where aux fp len = text a 0 actualLen where (a, actualLen) = A.run2 (A.new (2 * len) >>= unsafeIOToST . 
go) - go dest = unsafeWithForeignPtr fp $ \src -> do - destLen <- c_decode_latin1 (A.maBA dest) src (src `plusPtr` len) - return (dest, destLen) + go (A.MutableByteArray dest) = unsafeWithForeignPtr fp $ \src -> do + destLen <- c_decode_latin1 dest src (src `plusPtr` len) + return (A.MutableByteArray dest, destLen) -- | Decode a 'ByteString' containing UTF-8 encoded text. -- @@ -147,15 +147,15 @@ decodeUtf8With :: decodeUtf8With onErr bs = withBS bs aux where aux fp len = runText $ \done -> do - let go dest = unsafeWithForeignPtr fp $ \ptr -> + let go (A.MutableByteArray dest) = unsafeWithForeignPtr fp $ \ptr -> with (0::CSize) $ \destOffPtr -> do let end = ptr `plusPtr` len loop curPtr = do - curPtr' <- c_decode_utf8 (A.maBA dest) destOffPtr curPtr end + curPtr' <- c_decode_utf8 dest destOffPtr curPtr end if curPtr' == end then do n <- peek destOffPtr - unsafeSTToIO (done dest (cSizeToInt n)) + unsafeSTToIO (done (A.MutableByteArray dest) (cSizeToInt n)) else do x <- peek curPtr' case onErr desc (Just x) of @@ -167,7 +167,7 @@ decodeUtf8With onErr bs = withBS bs aux | otherwise -> do destOff <- peek destOffPtr w <- unsafeSTToIO $ - unsafeWrite dest (cSizeToInt destOff) + unsafeWrite (A.MutableByteArray dest) (cSizeToInt destOff) (safe c) poke destOffPtr (destOff + intToCSize w) loop $ curPtr' `plusPtr` 1 @@ -288,7 +288,7 @@ streamDecodeUtf8With onErr = decodeChunk B.empty 0 0 aux fp len = runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+100) where decodeChunkToBuffer :: A.MArray s -> IO Decoding - decodeChunkToBuffer dest = unsafeWithForeignPtr fp $ \ptr -> + decodeChunkToBuffer (A.MutableByteArray dest) = unsafeWithForeignPtr fp $ \ptr -> with (0::CSize) $ \destOffPtr -> with codepoint0 $ \codepointPtr -> with state0 $ \statePtr -> @@ -297,7 +297,7 @@ streamDecodeUtf8With onErr = decodeChunk B.empty 0 0 loop curPtr = do prevState <- peek statePtr poke curPtrPtr curPtr - lastPtr <- c_decode_utf8_with_state (A.maBA dest) destOffPtr + lastPtr <- c_decode_utf8_with_state dest destOffPtr curPtrPtr end codepointPtr statePtr state <- peek statePtr case state of @@ -309,7 +309,7 @@ streamDecodeUtf8With onErr = decodeChunk B.empty 0 0 Just c -> do destOff <- peek destOffPtr w <- unsafeSTToIO $ - unsafeWrite dest (cSizeToInt destOff) (safe c) + unsafeWrite (A.MutableByteArray dest) (cSizeToInt destOff) (safe c) poke destOffPtr (destOff + intToCSize w) if ptr == lastPtr && prevState /= UTF8_ACCEPT then do -- If we can't complete the sequence @undecoded0@ from @@ -327,7 +327,7 @@ streamDecodeUtf8With onErr = decodeChunk B.empty 0 0 n <- peek destOffPtr codepoint <- peek codepointPtr chunkText <- unsafeSTToIO $ do - arr <- A.unsafeFreeze dest + arr <- A.unsafeFreeze (A.MutableByteArray dest) return $! text arr 0 (cSizeToInt n) let left = lastPtr `minusPtr` ptr !undecoded = case state of @@ -434,7 +434,7 @@ encodeUtf8BuilderEscaped be = -- | Encode text using UTF-8 encoding. encodeUtf8 :: Text -> ByteString -encodeUtf8 (Text (A.Array arr) off len) +encodeUtf8 (Text (A.ByteArray arr) off len) | len == 0 = B.empty | otherwise = B.take len $ B.drop off $ SBS.fromShort $ SBS.SBS arr diff --git a/tests/Tests/Regressions.hs b/tests/Tests/Regressions.hs index 90de08bf..cf011680 100644 --- a/tests/Tests/Regressions.hs +++ b/tests/Tests/Regressions.hs @@ -1,5 +1,6 @@ -- | Regression tests for specific bugs. 
-- +{-# LANGUAGE BangPatterns #-} {-# LANGUAGE MagicHash #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE ScopedTypeVariables #-} @@ -127,15 +128,15 @@ t280_singleton = t301 :: IO () t301 = do assertEqual "The length of the array remains the same despite slicing" - (I# (sizeofByteArray# (TA.aBA originalArr))) - (I# (sizeofByteArray# (TA.aBA newArr))) + (I# (sizeofByteArray# originalArr)) + (I# (sizeofByteArray# newArr)) assertEqual "The new array still contains the original value" - (T.Text newArr originalOff originalLen) + (T.Text (TA.ByteArray newArr) originalOff originalLen) original where - original@(T.Text originalArr originalOff originalLen) = T.pack "1234567890" - T.Text newArr _off _len = T.take 1 $ T.drop 1 original + !original@(T.Text (TA.ByteArray originalArr) originalOff originalLen) = T.pack "1234567890" + !(T.Text (TA.ByteArray newArr) _off _len) = T.take 1 $ T.drop 1 original t330 :: IO () t330 = do From fd8cf068c0ae8230d15ac3c057b34e74965513ec Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Sat, 22 May 2021 19:25:27 +0100 Subject: [PATCH 03/38] Make utf8Length branchless --- src/Data/Text/Internal/Encoding/Utf8.hs | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/Data/Text/Internal/Encoding/Utf8.hs b/src/Data/Text/Internal/Encoding/Utf8.hs index fa69b7d9..e31c2f7a 100644 --- a/src/Data/Text/Internal/Encoding/Utf8.hs +++ b/src/Data/Text/Internal/Encoding/Utf8.hs @@ -35,7 +35,7 @@ module Data.Text.Internal.Encoding.Utf8 , validate4 ) where -import Data.Bits ((.&.), shiftR) +import Data.Bits (Bits(..)) import Data.Char (ord) import GHC.Exts import GHC.Word (Word8(..)) @@ -54,13 +54,15 @@ between :: Word8 -- ^ byte to check between x y z = x >= y && x <= z {-# INLINE between #-} --- TODO make branchless by looking into Word64 by clz (ord c) +-- This is a branchless version of +-- utf8Length c +-- | ord c < 0x80 = 1 +-- | ord c < 0x800 = 2 +-- | ord c < 0x10000 = 3 +-- | otherwise = 4 utf8Length :: Char -> Int -utf8Length c - | ord c < 0x80 = 1 - | ord c < 0x800 = 2 - | ord c < 0x10000 = 3 - | otherwise = 4 +utf8Length (C# c) = I# ((1# +# geChar# c (chr# 0x80#)) +# (geChar# c (chr# 0x800#) +# geChar# c (chr# 0x10000#))) +{-# INLINE utf8Length #-} utf8LengthByLeader :: Word8 -> Int utf8LengthByLeader w From fd65ba26178ff86bb2f2d226dd271b355b4ee160 Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Wed, 14 Jul 2021 22:40:48 +0100 Subject: [PATCH 04/38] Enable asserts in ord{2,3,4} and inline --- src/Data/Text/Internal/Encoding/Utf8.hs | 45 ++++++++++++++++++------- tests/Tests/Properties/Transcoding.hs | 18 +++++----- 2 files changed, 41 insertions(+), 22 deletions(-) diff --git a/src/Data/Text/Internal/Encoding/Utf8.hs b/src/Data/Text/Internal/Encoding/Utf8.hs index e31c2f7a..836b32bf 100644 --- a/src/Data/Text/Internal/Encoding/Utf8.hs +++ b/src/Data/Text/Internal/Encoding/Utf8.hs @@ -35,6 +35,10 @@ module Data.Text.Internal.Encoding.Utf8 , validate4 ) where +#if defined(ASSERTS) +import Control.Exception (assert) +import GHC.Stack (HasCallStack) +#endif import Data.Bits (Bits(..)) import Data.Char (ord) import GHC.Exts @@ -71,34 +75,48 @@ utf8LengthByLeader w | w < 0xF0 = 3 | otherwise = 4 -ord2 :: Char -> (Word8,Word8) +ord2 :: +#if defined(ASSERTS) + HasCallStack => +#endif + Char -> (Word8,Word8) ord2 c = - -- ord2 is used only in test suite to construct a deliberately invalid ByteString, - -- actually violating the assertion, so it is commented out - -- assert (n >= 0x80 && n <= 0x07ff) +#if defined(ASSERTS) + assert (n >= 0x80 && n <= 
0x07ff) +#endif (x1,x2) where n = ord c x1 = intToWord8 $ (n `shiftR` 6) + 0xC0 x2 = intToWord8 $ (n .&. 0x3F) + 0x80 +{-# INLINE ord2 #-} -ord3 :: Char -> (Word8,Word8,Word8) +ord3 :: +#if defined(ASSERTS) + HasCallStack => +#endif + Char -> (Word8,Word8,Word8) ord3 c = - -- ord3 is used only in test suite to construct a deliberately invalid ByteString, - -- actually violating the assertion, so it is commented out - -- assert (n >= 0x0800 && n <= 0xffff) +#if defined(ASSERTS) + assert (n >= 0x0800 && n <= 0xffff) +#endif (x1,x2,x3) where n = ord c x1 = intToWord8 $ (n `shiftR` 12) + 0xE0 x2 = intToWord8 $ ((n `shiftR` 6) .&. 0x3F) + 0x80 x3 = intToWord8 $ (n .&. 0x3F) + 0x80 +{-# INLINE ord3 #-} -ord4 :: Char -> (Word8,Word8,Word8,Word8) +ord4 :: +#if defined(ASSERTS) + HasCallStack => +#endif + Char -> (Word8,Word8,Word8,Word8) ord4 c = - -- ord4 is used only in test suite to construct a deliberately invalid ByteString, - -- actually violating the assertion, so it is commented out - -- assert (n >= 0x10000) +#if defined(ASSERTS) + assert (n >= 0x10000) +#endif (x1,x2,x3,x4) where n = ord c @@ -106,6 +124,7 @@ ord4 c = x2 = intToWord8 $ ((n `shiftR` 12) .&. 0x3F) + 0x80 x3 = intToWord8 $ ((n `shiftR` 6) .&. 0x3F) + 0x80 x4 = intToWord8 $ (n .&. 0x3F) + 0x80 +{-# INLINE ord4 #-} chr2 :: Word8 -> Word8 -> Char chr2 (W8# x1#) (W8# x2#) = C# (chr# (z1# +# z2#)) @@ -127,7 +146,7 @@ chr3 (W8# x1#) (W8# x2#) (W8# x3#) = C# (chr# (z1# +# z2# +# z3#)) !z3# = y3# -# 0x80# {-# INLINE chr3 #-} -chr4 :: Word8 -> Word8 -> Word8 -> Word8 -> Char +chr4 :: Word8 -> Word8 -> Word8 -> Word8 -> Char chr4 (W8# x1#) (W8# x2#) (W8# x3#) (W8# x4#) = C# (chr# (z1# +# z2# +# z3# +# z4#)) where diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs index f0f5d95b..8d4607f0 100644 --- a/tests/Tests/Properties/Transcoding.hs +++ b/tests/Tests/Properties/Transcoding.hs @@ -7,7 +7,7 @@ module Tests.Properties.Transcoding ) where import Control.Applicative ((<$>), (<*>)) -import Data.Bits ((.&.)) +import Data.Bits ((.&.), shiftR) import Data.Char (chr, ord) import Data.Text.Encoding.Error (UnicodeException) import Data.Text.Internal.Encoding.Utf8 (ord2, ord3, ord4) @@ -141,21 +141,21 @@ genInvalidUTF8 = B.pack <$> oneof [ -- short 4-byte sequence , (:) <$> choose (0xF0, 0xF4) <*> upTo 2 contByte -- overlong encoding - , do k <- choose (0,0xFFFF) - let c = chr k + , do k <- choose (0 :: Int, 0xFFFF) case k of - _ | k < 0x80 -> oneof [ let (w,x) = ord2 c in return [w,x] - , let (w,x,y) = ord3 c in return [w,x,y] - , let (w,x,y,z) = ord4 c in return [w,x,y,z] ] - | k < 0x7FF -> oneof [ let (w,x,y) = ord3 c in return [w,x,y] - , let (w,x,y,z) = ord4 c in return [w,x,y,z] ] - | otherwise -> let (w,x,y,z) = ord4 c in return [w,x,y,z] + _ | k < 0x80 -> elements [ord2_ k, ord3_ k, ord4_ k] + | k < 0x7FF -> elements [ord3_ k, ord4_ k] + | otherwise -> return (ord4_ k) ] where contByte = (0x80 +) <$> choose (0, 0x3f) upTo n gen = do k <- choose (0,n) vectorOf k gen + -- Data.Text.Internal.Encoding.Utf8.ord{2,3,4} withous sanity checks + ord2_ n = map fromIntegral [(n `shiftR` 6) + 0xC0, (n .&. 0x3F) + 0x80] + ord3_ n = map fromIntegral [(n `shiftR` 12) + 0xE0, ((n `shiftR` 6) .&. 0x3F) + 0x80, (n .&. 0x3F) + 0x80] + ord4_ n = map fromIntegral [(n `shiftR` 18) + 0xF0, ((n `shiftR` 12) .&. 0x3F) + 0x80, ((n `shiftR` 6) .&. 0x3F) + 0x80, (n .&. 
0x3F) + 0x80] decodeLL :: BL.ByteString -> TL.Text decodeLL = EL.decodeUtf8With E.lenientDecode From 0df0173ee21af4625c2b21f1f2cb063e6a664571 Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Sat, 22 May 2021 19:51:47 +0100 Subject: [PATCH 05/38] Use a primitive to resize MutableByteArray --- src/Data/Text/Array.hs | 8 ++++++++ src/Data/Text/Internal/Fusion.hs | 6 ++---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/Data/Text/Array.hs b/src/Data/Text/Array.hs index c9bff89a..e136fd39 100644 --- a/src/Data/Text/Array.hs +++ b/src/Data/Text/Array.hs @@ -27,6 +27,7 @@ module Data.Text.Array Array(..) , MArray(..) -- * Functions + , resizeM , copyM , copyI , empty @@ -41,6 +42,7 @@ module Data.Text.Array ) where #if defined(ASSERTS) +-- TODO employ resizeMutableByteArray# instead of cropping Text import Control.Exception (assert) import GHC.Stack (HasCallStack) #endif @@ -159,6 +161,12 @@ run2 k = runST (do return (arr,b)) {-# INLINE run2 #-} +resizeM :: MArray s -> Int -> ST s (MArray s) +resizeM (MutableByteArray ma) i@(I# i#) = ST $ \s1# -> + case resizeMutableByteArray# ma i# s1# of + (# s2#, newArr #) -> (# s2#, MutableByteArray newArr #) +{-# INLINE resizeM #-} + -- | Copy some elements of a mutable array. copyM :: MArray s -- ^ Destination -> Int -- ^ Destination offset diff --git a/src/Data/Text/Internal/Fusion.hs b/src/Data/Text/Internal/Fusion.hs index 01b781a1..1ff8672d 100644 --- a/src/Data/Text/Internal/Fusion.hs +++ b/src/Data/Text/Internal/Fusion.hs @@ -157,8 +157,7 @@ unstream (Stream next0 s0 len) = runText $ \done -> do {-# NOINLINE realloc #-} realloc !si !di = do let newlen = (maxi + 1) * 2 - arr' <- A.new newlen - A.copyM arr' 0 arr 0 di + arr' <- A.resizeM arr newlen outer arr' (newlen - 1) si di outer arr0 (mlen - 1) s0 0 @@ -299,8 +298,7 @@ mapAccumL f z0 (Stream next0 s0 len) = (nz, I.text na 0 nl) Yield x s' | j >= top -> {-# SCC "mapAccumL/resize" #-} do let top' = (top + 1) `shiftL` 1 - arr' <- A.new top' - A.copyM arr' 0 arr 0 top + arr' <- A.resizeM arr top' outer arr' top' z s i | otherwise -> do d <- unsafeWrite arr i c loop z' s' (i+d) From c10dd8f49adc730c9ba18b9147fa18b84d75fc97 Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Sat, 22 May 2021 20:44:26 +0100 Subject: [PATCH 06/38] Implement utf8LengthByLeader via bit magic --- src/Data/Text/Internal/Encoding/Utf8.hs | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/src/Data/Text/Internal/Encoding/Utf8.hs b/src/Data/Text/Internal/Encoding/Utf8.hs index 836b32bf..9ee0c1c9 100644 --- a/src/Data/Text/Internal/Encoding/Utf8.hs +++ b/src/Data/Text/Internal/Encoding/Utf8.hs @@ -39,7 +39,7 @@ module Data.Text.Internal.Encoding.Utf8 import Control.Exception (assert) import GHC.Stack (HasCallStack) #endif -import Data.Bits (Bits(..)) +import Data.Bits (Bits(..), FiniteBits(..)) import Data.Char (ord) import GHC.Exts import GHC.Word (Word8(..)) @@ -68,12 +68,21 @@ utf8Length :: Char -> Int utf8Length (C# c) = I# ((1# +# geChar# c (chr# 0x80#)) +# (geChar# c (chr# 0x800#) +# geChar# c (chr# 0x10000#))) {-# INLINE utf8Length #-} +-- This is a branchless version of +-- utf8LengthByLeader w +-- | w < 0x80 = 1 +-- | w < 0xE0 = 2 +-- | w < 0xF0 = 3 +-- | otherwise = 4 +-- +-- c `xor` I# (c# <=# 0#) is a branchless equivalent of c `max` 1. +-- It is crucial to write c# <=# 0# and not c# ==# 0#, otherwise +-- GHC is tempted to "optimize" by introduction of branches. 
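As a cross-check of the comment above, the same trick can be written with the boxed Data.Bits API instead of unboxed primitives. The standalone Haskell sketch below (naiveLen and branchlessLen are illustrative names, not library functions) compares the branchless formula against the straightforward definition for every byte that can legally begin a UTF-8 sequence:

import Data.Bits (complement, countLeadingZeros, xor)
import Data.Word (Word8)

-- Straightforward, branchy definition of the leading-byte length.
naiveLen :: Word8 -> Int
naiveLen w
  | w < 0x80  = 1
  | w < 0xE0  = 2
  | w < 0xF0  = 3
  | otherwise = 4

-- Branchless variant: countLeadingZeros (complement w) yields 0, 2, 3, 4
-- for ASCII and for 2-, 3-, 4-byte leaders; xor-ing with (c <= 0) maps 0
-- to 1 and leaves the other values unchanged, just like `max` 1.
branchlessLen :: Word8 -> Int
branchlessLen w = c `xor` fromEnum (c <= 0)
  where c = countLeadingZeros (complement w)

-- Continuation bytes 0x80..0xBF never begin a character, so the two
-- definitions only need to agree on ASCII and on leaders 0xC2..0xF4.
main :: IO ()
main = print $ and
  [ naiveLen w == branchlessLen w
  | w <- [0x00 .. 0x7F] ++ [0xC2 .. 0xF4] ]  -- True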
utf8LengthByLeader :: Word8 -> Int -utf8LengthByLeader w - | w < 0x80 = 1 - | w < 0xE0 = 2 - | w < 0xF0 = 3 - | otherwise = 4 +utf8LengthByLeader w = c `xor` I# (c# <=# 0#) + where + !c@(I# c#) = countLeadingZeros (complement w) +{-# INLINE utf8LengthByLeader #-} ord2 :: #if defined(ASSERTS) From 250babfaa1b931c8794714899bb8db6f3aad3ea6 Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Sun, 23 May 2021 17:38:48 +0100 Subject: [PATCH 07/38] Make copyI/copyM/new branchless --- src/Data/Text/Array.hs | 52 ++++++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/src/Data/Text/Array.hs b/src/Data/Text/Array.hs index e136fd39..fd2110f8 100644 --- a/src/Data/Text/Array.hs +++ b/src/Data/Text/Array.hs @@ -42,11 +42,8 @@ module Data.Text.Array ) where #if defined(ASSERTS) --- TODO employ resizeMutableByteArray# instead of cropping Text -import Control.Exception (assert) import GHC.Stack (HasCallStack) #endif -import Data.Bits ((.&.), xor, shiftR) #if !MIN_VERSION_base(4,11,0) import Data.Text.Internal.Unsafe (inlinePerformIO) import Foreign.C.Types (CInt(..)) @@ -64,18 +61,15 @@ data MArray s = MutableByteArray (MutableByteArray# s) -- | Create an uninitialized mutable array. new :: forall s. Int -> ST s (MArray s) -new n - | n < 0 || n .&. highBit /= 0 = array_size_error +new (I# len#) +#if defined(ASSERTS) + | I# len# < 0 = error "Data.Text.Array.new: size overflow" +#endif | otherwise = ST $ \s1# -> - case newByteArray# len# s1# of - (# s2#, marr# #) -> (# s2#, MutableByteArray marr# #) - where !(I# len#) = bytesInArray n - highBit = maxBound `xor` (maxBound `shiftR` 1) + case newByteArray# len# s1# of + (# s2#, marr# #) -> (# s2#, MutableByteArray marr# #) {-# INLINE new #-} -array_size_error :: a -array_size_error = error "Data.Text.Array.new: size overflow" - -- | Freeze a mutable array. Do not mutate the 'MArray' afterwards! unsafeFreeze :: MArray s -> ST s Array unsafeFreeze (MutableByteArray marr) = ST $ \s1# -> @@ -83,12 +77,6 @@ unsafeFreeze (MutableByteArray marr) = ST $ \s1# -> (# s2#, ba# #) -> (# s2#, ByteArray ba# #) {-# INLINE unsafeFreeze #-} --- | Indicate how many bytes would be used for an array of the given --- size. -bytesInArray :: Int -> Int -bytesInArray n = n -{-# INLINE bytesInArray #-} - -- | Unchecked read of an immutable array. May return garbage or -- crash on an out-of-bounds access. unsafeIndex :: @@ -175,16 +163,23 @@ copyM :: MArray s -- ^ Destination -> Int -- ^ Count -> ST s () copyM dst@(MutableByteArray dst#) dstOff@(I# dstOff#) src@(MutableByteArray src#) srcOff@(I# srcOff#) count@(I# count#) - | I# count# <= 0 = return () +#if defined(ASSERTS) + | count < 0 = error $ + "copyM: count must be >= 0, but got " ++ show count +#endif | otherwise = do #if defined(ASSERTS) srcLen <- getSizeofMArray src dstLen <- getSizeofMArray dst - assert (srcOff + count <= srcLen) . - assert (dstOff + count <= dstLen) . + if srcOff + count > srcLen + then error "copyM: source is too short" + else return () + if dstOff + count > dstLen + then error "copyM: destination is too short" + else return () #endif - ST $ \s1# -> case copyMutableByteArray# src# srcOff# dst# dstOff# count# s1# of - s2# -> (# s2#, () #) + ST $ \s1# -> case copyMutableByteArray# src# srcOff# dst# dstOff# count# s1# of + s2# -> (# s2#, () #) {-# INLINE copyM #-} -- | Copy some elements of an immutable array. @@ -196,10 +191,13 @@ copyI :: MArray s -- ^ Destination -- copy (i.e. 
/not/ length) -> ST s () copyI (MutableByteArray dst#) dstOff@(I# dstOff#) (ByteArray src#) (I# srcOff#) top@(I# top#) - | dstOff >= top = return () - | otherwise = ST $ \s1# -> - case copyByteArray# src# srcOff# dst# dstOff# (top# -# dstOff#) s1# of - s2# -> (# s2#, () #) +#if defined(ASSERTS) + | top < dstOff = error $ + "copyI: top must be >= dstOff, but " ++ show top ++ " < " ++ show dstOff +#endif + | otherwise = ST $ \s1# -> + case copyByteArray# src# srcOff# dst# dstOff# (top# -# dstOff#) s1# of + s2# -> (# s2#, () #) {-# INLINE copyI #-} -- | Compare portions of two arrays for equality. No bounds checking From 8476760428de318a529d6ecdd2a27535c6fa5f95 Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Sun, 23 May 2021 18:24:06 +0100 Subject: [PATCH 08/38] Change semantics of copyI: pass length, not end offset --- src/Data/Text.hs | 17 ++++++++--------- src/Data/Text/Array.hs | 13 ++++++------- src/Data/Text/Internal/Builder.hs | 2 +- 3 files changed, 15 insertions(+), 17 deletions(-) diff --git a/src/Data/Text.hs b/src/Data/Text.hs index b035ed71..4a50bd4e 100644 --- a/src/Data/Text.hs +++ b/src/Data/Text.hs @@ -455,8 +455,8 @@ append a@(Text arr1 off1 len1) b@(Text arr2 off2 len2) x :: ST s (A.MArray s) x = do arr <- A.new len - A.copyI arr 0 arr1 off1 len1 - A.copyI arr len1 arr2 off2 len + A.copyI len1 arr 0 arr1 off1 + A.copyI len2 arr len1 arr2 off2 return arr {-# NOINLINE append #-} @@ -689,10 +689,10 @@ replace needle@(Text _ _ neeLen) let loop (i:is) o d = do let d0 = d + i - o d1 = d0 + repLen - A.copyI marr d hayArr (hayOff+o) d0 - A.copyI marr d0 repArr repOff d1 + A.copyI (i - o) marr d hayArr (hayOff+o) + A.copyI repLen marr d0 repArr repOff loop is (i + neeLen) d1 - loop [] o d = A.copyI marr d hayArr (hayOff+o) len + loop [] o d = A.copyI (len - d) marr d hayArr (hayOff+o) loop ixs 0 0 return marr @@ -904,8 +904,7 @@ concat ts = case ts' of go :: ST s (A.MArray s) go = do arr <- A.new len - let step i (Text a o l) = - let !j = i + l in A.copyI arr i a o j >> return j + let step i (Text a o l) = A.copyI l arr i a o >> return (i + l) foldM step 0 ts' >> return arr -- | /O(n)/ Map a function over a 'Text' that results in a 'Text', and @@ -1017,7 +1016,7 @@ replicate n t@(Text a o l) x :: ST s (A.MArray s) x = do arr <- A.new len - A.copyI arr 0 a o l + A.copyI l arr 0 a o let loop !l1 = let rest = len - l1 in if rest <= l1 then A.copyM arr l1 arr 0 rest >> return arr @@ -1760,7 +1759,7 @@ copy (Text arr off len) = Text (A.run go) 0 len go :: ST s (A.MArray s) go = do marr <- A.new len - A.copyI marr 0 arr off len + A.copyI len marr 0 arr off return marr diff --git a/src/Data/Text/Array.hs b/src/Data/Text/Array.hs index fd2110f8..459945ce 100644 --- a/src/Data/Text/Array.hs +++ b/src/Data/Text/Array.hs @@ -183,20 +183,19 @@ copyM dst@(MutableByteArray dst#) dstOff@(I# dstOff#) src@(MutableByteArray src# {-# INLINE copyM #-} -- | Copy some elements of an immutable array. -copyI :: MArray s -- ^ Destination +copyI :: Int -- ^ Count + -> MArray s -- ^ Destination -> Int -- ^ Destination offset -> Array -- ^ Source -> Int -- ^ Source offset - -> Int -- ^ First offset in destination /not/ to - -- copy (i.e. 
/not/ length) -> ST s () -copyI (MutableByteArray dst#) dstOff@(I# dstOff#) (ByteArray src#) (I# srcOff#) top@(I# top#) +copyI count@(I# count#) (MutableByteArray dst#) dstOff@(I# dstOff#) (ByteArray src#) (I# srcOff#) #if defined(ASSERTS) - | top < dstOff = error $ - "copyI: top must be >= dstOff, but " ++ show top ++ " < " ++ show dstOff + | count < 0 = error $ + "copyI: count must be >= 0, but got " ++ show count #endif | otherwise = ST $ \s1# -> - case copyByteArray# src# srcOff# dst# dstOff# (top# -# dstOff#) s1# of + case copyByteArray# src# srcOff# dst# dstOff# count# s1# of s2# -> (# s2#, () #) {-# INLINE copyI #-} diff --git a/src/Data/Text/Internal/Builder.hs b/src/Data/Text/Internal/Builder.hs index 9d181276..0951e488 100644 --- a/src/Data/Text/Internal/Builder.hs +++ b/src/Data/Text/Internal/Builder.hs @@ -168,7 +168,7 @@ copyLimit = 128 fromText :: S.Text -> Builder fromText t@(Text arr off l) | S.null t = empty - | l <= copyLimit = writeN l $ \marr o -> A.copyI marr o arr off (l+o) + | l <= copyLimit = writeN l $ \marr o -> A.copyI l marr o arr off | otherwise = flush `append` mapBuilder (t :) {-# INLINE [1] fromText #-} From 22ce9c7a2f5806781fbfa91ff838407df11c4635 Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Sun, 23 May 2021 18:56:17 +0100 Subject: [PATCH 09/38] Shrink mutable arrays whenever possible --- src/Data/Text/Array.hs | 18 ++++++++++++++++++ src/Data/Text/Encoding.hs | 4 +++- src/Data/Text/Internal/Builder.hs | 1 + src/Data/Text/Internal/Lazy/Fusion.hs | 1 + src/Data/Text/Internal/Private.hs | 1 + 5 files changed, 24 insertions(+), 1 deletion(-) diff --git a/src/Data/Text/Array.hs b/src/Data/Text/Array.hs index 459945ce..00427750 100644 --- a/src/Data/Text/Array.hs +++ b/src/Data/Text/Array.hs @@ -28,6 +28,7 @@ module Data.Text.Array , MArray(..) -- * Functions , resizeM + , shrinkM , copyM , copyI , empty @@ -155,6 +156,23 @@ resizeM (MutableByteArray ma) i@(I# i#) = ST $ \s1# -> (# s2#, newArr #) -> (# s2#, MutableByteArray newArr #) {-# INLINE resizeM #-} +shrinkM :: +#if defined(ASSERTS) + HasCallStack => +#endif + MArray s -> Int -> ST s () +shrinkM (MutableByteArray marr) i@(I# newSize) = do +#if defined(ASSERTS) + oldSize <- getSizeofMArray (MutableByteArray marr) + if I# newSize > oldSize + then error $ "shrinkM: shrink cannot grow " ++ show oldSize ++ " to " ++ show (I# newSize) + else return () +#endif + ST $ \s1# -> + case shrinkMutableByteArray# marr newSize s1# of + s2# -> (# s2#, () #) +{-# INLINE shrinkM #-} + -- | Copy some elements of a mutable array. copyM :: MArray s -- ^ Destination -> Int -- ^ Destination offset diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index 6a42b52b..49aced62 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -327,8 +327,10 @@ streamDecodeUtf8With onErr = decodeChunk B.empty 0 0 n <- peek destOffPtr codepoint <- peek codepointPtr chunkText <- unsafeSTToIO $ do + let l = cSizeToInt n + A.shrinkM (A.MutableByteArray dest) l arr <- A.unsafeFreeze (A.MutableByteArray dest) - return $! text arr 0 (cSizeToInt n) + return $! 
text arr 0 l let left = lastPtr `minusPtr` ptr !undecoded = case state of UTF8_ACCEPT -> B.empty diff --git a/src/Data/Text/Internal/Builder.hs b/src/Data/Text/Internal/Builder.hs index 0951e488..590c6528 100644 --- a/src/Data/Text/Internal/Builder.hs +++ b/src/Data/Text/Internal/Builder.hs @@ -186,6 +186,7 @@ fromString str = Builder $ \k (Buffer p0 o0 u0 l0) -> let loop !marr !o !u !l [] = k (Buffer marr o u l) loop marr o u l s@(c:cs) | l <= 3 = do + A.shrinkM marr (o + u) arr <- A.unsafeFreeze marr let !t = Text arr o u marr' <- A.new chunkSize diff --git a/src/Data/Text/Internal/Lazy/Fusion.hs b/src/Data/Text/Internal/Lazy/Fusion.hs index 867d0ac4..b13f6574 100644 --- a/src/Data/Text/Internal/Lazy/Fusion.hs +++ b/src/Data/Text/Internal/Lazy/Fusion.hs @@ -93,6 +93,7 @@ unstreamChunks !chunkSize (Stream next s0 len0) Yield x s' -> do d <- unsafeWrite marr i x inner marr len s' (i+d) finish marr len s' = do + A.shrinkM marr len arr <- A.unsafeFreeze marr return (I.Text arr 0 len `Chunk` outer s') {-# INLINE [0] unstreamChunks #-} diff --git a/src/Data/Text/Internal/Private.hs b/src/Data/Text/Internal/Private.hs index e953da19..b150fed2 100644 --- a/src/Data/Text/Internal/Private.hs +++ b/src/Data/Text/Internal/Private.hs @@ -40,6 +40,7 @@ runText :: #endif (forall s. (A.MArray s -> Int -> ST s Text) -> ST s Text) -> Text runText act = runST (act $ \ !marr !len -> do + A.shrinkM marr len arr <- A.unsafeFreeze marr return $! text arr 0 len) {-# INLINE runText #-} From c99cc6ba28c6fcc12cd0fd2e820940c4e3480bb7 Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Tue, 22 Jun 2021 19:00:21 +0100 Subject: [PATCH 10/38] Speed up words --- src/Data/Text.hs | 42 +++++++++++++++++++++++++++++++++--------- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/src/Data/Text.hs b/src/Data/Text.hs index 4a50bd4e..7a1e20e4 100644 --- a/src/Data/Text.hs +++ b/src/Data/Text.hs @@ -208,7 +208,7 @@ import Control.DeepSeq (NFData(rnf)) import Control.Exception (assert) import GHC.Stack (HasCallStack) #endif -import Data.Bits (shiftL) +import Data.Bits (shiftL, (.&.)) import Data.Char (isSpace) import Data.Data (Data(gfoldl, toConstr, gunfold, dataTypeOf), constrIndex, Constr, mkConstr, DataType, mkDataType, Fixity(Prefix)) @@ -220,6 +220,7 @@ import Data.Binary (Binary(get, put)) import Data.Monoid (Monoid(..)) import Data.Semigroup (Semigroup(..)) import Data.String (IsString(..)) +import Data.Text.Internal.Encoding.Utf8 (chr3, utf8LengthByLeader) import qualified Data.Text.Internal.Fusion as S import qualified Data.Text.Internal.Fusion.Common as S import Data.Text.Encoding (decodeUtf8', encodeUtf8) @@ -236,6 +237,7 @@ import Data.ByteString (ByteString) import qualified Data.Text.Lazy as L import Data.Int (Int64) #endif +import Data.Word (Word8) import GHC.Base (eqInt, neInt, gtInt, geInt, ltInt, leInt) import qualified GHC.Exts as Exts import qualified Language.Haskell.TH.Lib as TH @@ -1551,21 +1553,43 @@ zipWith f t1 t2 = unstream (S.zipWith g (stream t1) (stream t2)) -- | /O(n)/ Breaks a 'Text' up into a list of words, delimited by 'Char's -- representing white space. words :: Text -> [Text] -words t@(Text arr off len) = loop 0 0 +words (Text arr off len) = loop 0 0 where loop !start !n | n >= len = if start == n then [] - else [Text arr (start+off) (n-start)] - -- Spaces in UTF-8 can take from 1 byte for 0x09 and up to 3 bytes for 0x3000. 
- | isSpace c = + else [Text arr (start + off) (n - start)] + -- Spaces in UTF-8 take either 1 byte for 0x09..0x0D + 0x20 + | isAsciiSpace w0 = if start == n - then loop (n+d) (n+d) - else Text arr (start+off) (n-start) : loop (n+d) (n+d) - | otherwise = loop start (n+d) - where Iter c d = iter t n + then loop (n + 1) (n + 1) + else Text arr (start + off) (n - start) : loop (n + 1) (n + 1) + | w0 < 0x80 = loop start (n + 1) + -- or 2 bytes for 0xA0 + | w0 == 0xC2, w1 == 0xA0 = + if start == n + then loop (n + 2) (n + 2) + else Text arr (start + off) (n - start) : loop (n + 2) (n + 2) + | w0 < 0xE0 = loop start (n + 2) + -- or 3 bytes for 0x1680 + 0x2000..0x200A + 0x2028..0x2029 + 0x202F + 0x205F + 0x3000 + | w0 == 0xE1 && w1 == 0x9A && w2 == 0x80 + || w0 == 0xE2 && (w1 == 0x80 && isSpace (chr3 w0 w1 w2) || w1 == 0x81 && w2 == 0x9F) + || w0 == 0xE3 && w1 == 0x80 && w2 == 0x80 = + if start == n + then loop (n + 3) (n + 3) + else Text arr (start + off) (n - start) : loop (n + 3) (n + 3) + | otherwise = loop start (n + utf8LengthByLeader w0) + where + w0 = A.unsafeIndex arr (off + n) + w1 = A.unsafeIndex arr (off + n + 1) + w2 = A.unsafeIndex arr (off + n + 2) {-# INLINE words #-} +-- Adapted from Data.ByteString.Internal.isSpaceWord8 +isAsciiSpace :: Word8 -> Bool +isAsciiSpace w = w .&. 0x50 == 0 && w < 0x80 && (w == 0x20 || w - 0x09 < 5) +{-# INLINE isAsciiSpace #-} + -- | /O(n)/ Breaks a 'Text' up into a list of 'Text's at -- newline 'Char's. The resulting strings do not contain newlines. lines :: Text -> [Text] From de17480ee8093cecc552ba64e4e55e3c42127416 Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Sat, 22 May 2021 23:58:54 +0100 Subject: [PATCH 11/38] Use AVX/SSE instructions for length/take/drop --- cbits/measure_off.c | 158 ++++++++++++++++++++++++++++++++++++++++++ src/Data/Text.hs | 50 +++++++++---- src/Data/Text/Lazy.hs | 27 +++++--- text.cabal | 5 ++ 4 files changed, 217 insertions(+), 23 deletions(-) create mode 100644 cbits/measure_off.c diff --git a/cbits/measure_off.c b/cbits/measure_off.c new file mode 100644 index 00000000..f10f6460 --- /dev/null +++ b/cbits/measure_off.c @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2021 Andrew Lelechenko + */ + +#include +#include +#include +#ifdef __x86_64__ +#include +#include +#include +#include +#endif +#include + +#ifndef __STDC_NO_ATOMICS__ +#include +#endif + +bool has_avx512_vl_bw() { +#ifdef __x86_64__ + uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0; + __get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx); + // https://en.wikipedia.org/wiki/CPUID#EAX=7,_ECX=0:_Extended_Features + const bool has_avx512_bw = ebx & (1 << 30); + const bool has_avx512_vl = ebx & (1 << 31); + // printf("cpuid=%d=cpuid\n", has_avx512_bw && has_avx512_vl); + return has_avx512_bw && has_avx512_vl; +#else + return false; +#endif +} + +/* + measure_off_naive / measure_off_avx / measure_off_sse + take a UTF-8 sequence between src and srcend, and a number of characters cnt. + If the sequence is long enough to contain cnt characters, then return how many bytes + remained unconsumed. Otherwise, if the sequence is shorter, return + negated count of lacking characters. Cf. _hs_text_measure_off below. 
+*/ + +inline const ssize_t measure_off_naive(const uint8_t *src, const uint8_t *srcend, size_t cnt) +{ + // Count leading bytes in 8 byte sequence + while (src < srcend - 7){ + uint64_t w64; + memcpy(&w64, src, sizeof(uint64_t)); + size_t leads = __builtin_popcountll(((w64 << 1) | ~w64) & 0x8080808080808080ULL); + if (cnt < leads) break; + cnt-= leads; + src+= 8; + } + + // Skip until next leading byte + while (src < srcend){ + uint8_t w8 = *src; + if ((int8_t)w8 >= -0x40) break; + src++; + } + + // Finish up with tail + while (src < srcend && cnt > 0){ + uint8_t leadByte = *src++; + cnt--; + src+= (leadByte >= 0xc0) + (leadByte >= 0xe0) + (leadByte >= 0xf0); + } + + return cnt == 0 ? (ssize_t)(srcend - src) : (ssize_t)(- cnt); +} + +#ifdef __x86_64__ +__attribute__((target("avx512vl,avx512bw"))) +const ssize_t measure_off_avx(const uint8_t *src, const uint8_t *srcend, size_t cnt) +{ + while (src < srcend - 63){ + __m512i w512 = _mm512_loadu_si512((__m512i *)src); + // Which bytes are either < 128 or >= 192? + uint64_t mask = _mm512_cmpgt_epi8_mask(w512, _mm512_set1_epi8(0xBF)); + size_t leads = __builtin_popcountll(mask); + if (cnt < leads) break; + cnt-= leads; + src+= 64; + } + + // Cannot proceed to measure_off_sse, because of AVX-SSE transition penalties + // https://software.intel.com/content/www/us/en/develop/articles/avoiding-avx-sse-transition-penalties.html + + if (src < srcend - 31){ + __m256i w256 = _mm256_loadu_si256((__m256i *)src); + uint32_t mask = _mm256_cmpgt_epi8_mask(w256, _mm256_set1_epi8(0xBF)); + size_t leads = __builtin_popcountl(mask); + if (cnt >= leads){ + cnt-= leads; + src+= 32; + } + } + + if (src < srcend - 15){ + __m128i w128 = _mm_maskz_loadu_epi16(0xFF, (__m128i *)src); // not _mm_loadu_si128; and GCC does not have _mm_loadu_epi16 + uint16_t mask = _mm_cmpgt_epi8_mask(w128, _mm_set1_epi8(0xBF)); // not _mm_movemask_epi8 + size_t leads = __builtin_popcountl(mask); + if (cnt >= leads){ + cnt-= leads; + src+= 16; + } + } + + return measure_off_naive(src, srcend, cnt); +} +#endif + +const ssize_t measure_off_sse(const uint8_t *src, const uint8_t *srcend, size_t cnt) +{ +#ifdef __x86_64__ + while (src < srcend - 15){ + __m128i w128 = _mm_loadu_si128((__m128i *)src); + // Which bytes are either < 128 or >= 192? + uint16_t mask = _mm_movemask_epi8(_mm_cmpgt_epi8(w128, _mm_set1_epi8(0xBF))); + size_t leads = __builtin_popcount(mask); + if (cnt < leads) break; + cnt-= leads; + src+= 16; + } +#endif + + return measure_off_naive(src, srcend, cnt); +} + +typedef const ssize_t (*measure_off_t) (const uint8_t*, const uint8_t*, size_t); + +/* + _hs_text_measure_off takes a UTF-8 encoded buffer, specified by (src, off, len), + and a number of code points (aka characters) cnt. If the buffer is long enough + to contain cnt characters, then _hs_text_measure_off returns a non-negative number, + measuring their size in code units (aka bytes). If the buffer is shorter, + _hs_text_measure_off returns a non-positive number, which is a negated total count + of characters available in the buffer. If len = 0 or cnt = 0, this function returns 0 + as well. + + This scheme allows us to implement both take/drop and length with the same C function. + + The input buffer (src, off, len) must be a valid UTF-8 sequence, + this condition is not checked. 
+*/ +ssize_t _hs_text_measure_off(const uint8_t *src, size_t off, size_t len, size_t cnt) { + static _Atomic measure_off_t s_impl = (measure_off_t)NULL; + measure_off_t impl = atomic_load_explicit(&s_impl, memory_order_relaxed); + if (!impl) { +#ifdef __x86_64__ + impl = has_avx512_vl_bw() ? measure_off_avx : measure_off_sse; +#else + impl = measure_off_sse; +#endif + atomic_store_explicit(&s_impl, impl, memory_order_relaxed); + } + ssize_t ret = (*impl)(src + off, src + off + len, cnt); + return ret >= 0 ? ((ssize_t)len - ret) : (- (cnt + ret)); +} diff --git a/src/Data/Text.hs b/src/Data/Text.hs index 7a1e20e4..ea7ebad4 100644 --- a/src/Data/Text.hs +++ b/src/Data/Text.hs @@ -1,6 +1,7 @@ {-# LANGUAGE BangPatterns, CPP, MagicHash, Rank2Types, UnboxedTuples, TypeFamilies #-} {-# LANGUAGE TemplateHaskellQuotes #-} {-# LANGUAGE Trustworthy #-} +{-# LANGUAGE UnliftedFFITypes #-} {-# OPTIONS_GHC -fno-warn-orphans #-} @@ -196,13 +197,15 @@ module Data.Text -- * Low level operations , copy , unpackCString# + + , measureOff ) where import Prelude (Char, Bool(..), Int, Maybe(..), String, Eq(..), Ord(..), Ordering(..), (++), Read(..), (&&), (||), (+), (-), (.), ($), ($!), (>>), - not, return, otherwise, quot) + not, return, otherwise, quot, IO) import Control.DeepSeq (NFData(rnf)) #if defined(ASSERTS) import Control.Exception (assert) @@ -230,7 +233,7 @@ import Data.Text.Internal (Text(..), empty, firstf, mul, safe, text) import Data.Text.Show (singleton, unpack, unpackCString#) import qualified Prelude as P import Data.Text.Unsafe (Iter(..), iter, iter_, lengthWord8, reverseIter, - reverseIter_, unsafeHead, unsafeTail) + reverseIter_, unsafeHead, unsafeTail, unsafeDupablePerformIO) import Data.Text.Internal.Search (indices) #if defined(__HADDOCK__) import Data.ByteString (ByteString) @@ -238,11 +241,13 @@ import qualified Data.Text.Lazy as L import Data.Int (Int64) #endif import Data.Word (Word8) -import GHC.Base (eqInt, neInt, gtInt, geInt, ltInt, leInt) +import Foreign.C.Types +import GHC.Base (eqInt, neInt, gtInt, geInt, ltInt, leInt, ByteArray#) import qualified GHC.Exts as Exts import qualified Language.Haskell.TH.Lib as TH import qualified Language.Haskell.TH.Syntax as TH import Text.Printf (PrintfArg, formatArg, formatString) +import System.Posix.Types (CSsize(..)) -- $setup -- >>> import Data.Text @@ -538,7 +543,7 @@ length :: HasCallStack => #endif Text -> Int -length t = S.length (stream t) +length = P.negate . measureOff P.maxBound {-# INLINE [1] length #-} -- length needs to be phased after the compareN/length rules otherwise -- it may inline before the rules have an opportunity to fire. @@ -1069,15 +1074,25 @@ take :: Int -> Text -> Text take n t@(Text arr off len) | n <= 0 = empty | n >= len = t - | otherwise = text arr off (iterN n t) + | otherwise = let m = measureOff n t in if m >= 0 then text arr off m else t {-# INLINE [1] take #-} -iterN :: Int -> Text -> Int -iterN n t@(Text _arr _off len) = loop 0 0 - where loop !i !cnt - | i >= len || cnt >= n = i - | otherwise = loop (i+d) (cnt+1) - where d = iter_ t i +-- | /O(n)/ If @t@ is long enough to contain @n@ characters, 'measureOff' @n@ @t@ +-- returns a non-negative number, measuring their size in 'Word8'. Otherwise, +-- if @t@ is shorter, return a non-positive number, which is a negated total count +-- of 'Char' available in @t@. If @t@ is empty or @n = 0@, return 0. +-- +-- This function is used to implement 'take', 'drop', 'splitAt' and 'length' +-- and is useful on its own in streaming and parsing libraries. 
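A worked sketch of this contract (illustrative only, not part of the patch; byte counts assume UTF-8, where '€' occupies three bytes):

    measureOff 2 (pack "a€c")  ==  4    -- 'a' is 1 byte, '€' is 3 bytes
    measureOff 9 (pack "a€c")  == -3    -- only 3 characters are available
    length t == P.negate (measureOff P.maxBound t)   -- how the new 'length' uses it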
+measureOff :: Int -> Text -> Int +measureOff !n (Text (A.ByteArray arr) off len) = if len == 0 then 0 else + cSsizeToInt $ unsafeDupablePerformIO $ + c_measure_off arr (intToCSize off) (intToCSize len) (intToCSize n) + +-- | The input buffer (arr :: ByteArray#, off :: CSize, len :: CSize) +-- must specify a valid UTF-8 sequence, this condition is not checked. +foreign import ccall unsafe "_hs_text_measure_off" c_measure_off + :: ByteArray# -> CSize -> CSize -> CSize -> IO CSsize -- | /O(n)/ 'takeEnd' @n@ @t@ returns the suffix remaining after -- taking @n@ characters from the end of @t@. @@ -1110,8 +1125,8 @@ drop :: Int -> Text -> Text drop n t@(Text arr off len) | n <= 0 = t | n >= len = empty - | otherwise = text arr (off+i) (len-i) - where i = iterN n t + | otherwise = if m >= 0 then text arr (off+m) (len-m) else mempty + where m = measureOff n t {-# INLINE [1] drop #-} -- | /O(n)/ 'dropEnd' @n@ @t@ returns the prefix remaining after @@ -1219,8 +1234,8 @@ splitAt :: Int -> Text -> (Text, Text) splitAt n t@(Text arr off len) | n <= 0 = (empty, t) | n >= len = (t, empty) - | otherwise = let k = iterN n t - in (text arr off k, text arr (off+k) (len-k)) + | otherwise = let m = measureOff n t in + if m >= 0 then (text arr off m, text arr (off+m) (len-m)) else (t, mempty) -- | /O(n)/ 'span', applied to a predicate @p@ and text @t@, returns -- a pair whose first element is the longest prefix (possibly empty) @@ -1786,6 +1801,11 @@ copy (Text arr off len) = Text (A.run go) 0 len A.copyI len marr 0 arr off return marr +intToCSize :: Int -> CSize +intToCSize = P.fromIntegral + +cSsizeToInt :: CSsize -> Int +cSsizeToInt = P.fromIntegral ------------------------------------------------- -- NOTE: the named chunk below used by doctest; diff --git a/src/Data/Text/Lazy.hs b/src/Data/Text/Lazy.hs index bb716286..1ed1d217 100644 --- a/src/Data/Text/Lazy.hs +++ b/src/Data/Text/Lazy.hs @@ -203,6 +203,7 @@ import Prelude (Char, Bool(..), Maybe(..), String, error, flip, fmap, fromIntegral, not, otherwise, quot) import qualified Prelude as P import Control.DeepSeq (NFData(..)) +import Data.Bits (finiteBitSize) import Data.Int (Int64) import qualified Data.List as L import Data.Char (isSpace) @@ -972,10 +973,15 @@ take i t0 = take' i t0 take' :: Int64 -> Text -> Text take' 0 _ = Empty take' _ Empty = Empty - take' n (Chunk t ts) - | n < len = Chunk (T.take (int64ToInt n) t) Empty - | otherwise = Chunk t (take' (n - len) ts) - where len = intToInt64 (T.length t) + take' n (Chunk t@(T.Text arr off _) ts) + | finiteBitSize (0 :: P.Int) == 64, m <- T.measureOff (int64ToInt n) t = + if m >= 0 + then fromStrict (T.Text arr off m) + else Chunk t (take' (n + intToInt64 m) ts) + + | n < l = Chunk (T.take (int64ToInt n) t) Empty + | otherwise = Chunk t (take' (n - l) ts) + where l = intToInt64 (T.length t) {-# INLINE [1] take #-} -- | /O(n)/ 'takeEnd' @n@ @t@ returns the suffix remaining after @@ -1009,10 +1015,15 @@ drop i t0 drop' :: Int64 -> Text -> Text drop' 0 ts = ts drop' _ Empty = Empty - drop' n (Chunk t ts) - | n < len = Chunk (T.drop (int64ToInt n) t) ts - | otherwise = drop' (n - len) ts - where len = intToInt64 (T.length t) + drop' n (Chunk t@(T.Text arr off len) ts) + | finiteBitSize (0 :: P.Int) == 64, m <- T.measureOff (int64ToInt n) t = + if m >= 0 + then chunk (T.Text arr (off + m) (len - m)) ts + else drop' (n + intToInt64 m) ts + + | n < l = Chunk (T.drop (int64ToInt n) t) ts + | otherwise = drop' (n - l) ts + where l = intToInt64 (T.length t) {-# INLINE [1] drop #-} -- | /O(n)/ 'dropEnd' @n@ @t@ 
returns the prefix remaining after diff --git a/text.cabal b/text.cabal index ccf86a6d..e0c0ce69 100644 --- a/text.cabal +++ b/text.cabal @@ -65,6 +65,7 @@ flag developer library c-sources: cbits/cbits.c + cbits/measure_off.c cbits/utils.c include-dirs: include hs-source-dirs: src @@ -131,6 +132,10 @@ library ghc-options: -fno-ignore-asserts cpp-options: -DASSERTS + -- https://gitlab.haskell.org/ghc/ghc/-/issues/19900 + if os(windows) + extra-libraries: gcc_s + default-language: Haskell2010 default-extensions: NondecreasingIndentation From b41d1419f62f11f1f492aa55d814fc1f02abda81 Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Wed, 23 Jun 2021 00:14:58 +0100 Subject: [PATCH 12/38] More rewrite rules --- src/Data/Text.hs | 45 ++++++++++++++++++++++++++++++++++++------- src/Data/Text/Lazy.hs | 31 ++++++++++++++++++++++++++--- 2 files changed, 66 insertions(+), 10 deletions(-) diff --git a/src/Data/Text.hs b/src/Data/Text.hs index ea7ebad4..bba6c13e 100644 --- a/src/Data/Text.hs +++ b/src/Data/Text.hs @@ -437,8 +437,8 @@ pack = unstream . S.map safe . S.streamList -- copying a new array. Performs replacement on -- invalid scalar values. cons :: Char -> Text -> Text -cons c t = unstream (S.cons (safe c) (stream t)) -{-# INLINE cons #-} +cons c = unstream . S.cons (safe c) . stream +{-# INLINE [1] cons #-} infixr 5 `cons` @@ -548,6 +548,27 @@ length = P.negate . measureOff P.maxBound -- length needs to be phased after the compareN/length rules otherwise -- it may inline before the rules have an opportunity to fire. +{-# RULES +"TEXT length/filter -> S.length/S.filter" forall p t. + length (filter p t) = S.length (S.filter p (stream t)) +"TEXT length/unstream -> S.length" forall t. + length (unstream t) = S.length t +"TEXT length/pack -> P.length" forall t. + length (pack t) = P.length t +"TEXT length/map -> length" forall f t. + length (map f t) = length t +"TEXT length/zipWith -> length" forall f t1 t2. + length (zipWith f t1 t2) = min (length t1) (length t2) +"TEXT length/replicate -> n" forall n t. + length (replicate n t) = mul (max 0 n) (length t) +"TEXT length/cons -> length+1" forall c t. + length (cons c t) = 1 + length t +"TEXT length/intersperse -> 2*length-1" forall c t. + length (intersperse c t) = max 0 (mul 2 (length t) - 1) +"TEXT length/intercalate -> n*length" forall s ts. + length (intercalate s ts) = let lenS = length s in max 0 (P.sum (P.map (\t -> length t + lenS) ts) - lenS) + #-} + -- | /O(n)/ Compare the count of characters in a 'Text' to a number. -- -- This function gives the same answer as comparing against the result @@ -608,6 +629,11 @@ map :: (Char -> Char) -> Text -> Text map f t = unstream (S.map (safe . f) (stream t)) {-# INLINE [1] map #-} +{-# RULES +"TEXT map/map -> map" forall f g t. + map f (map g t) = map (f . safe . g) t +#-} + -- | /O(n)/ The 'intercalate' function takes a 'Text' and a list of -- 'Text's and concatenates the list after interspersing the first -- argument between each element of the list. @@ -618,7 +644,7 @@ map f t = unstream (S.map (safe . f) (stream t)) -- "WeNI!seekNI!theNI!HolyNI!Grail" intercalate :: Text -> [Text] -> Text intercalate t = concat . L.intersperse t -{-# INLINE intercalate #-} +{-# INLINE [1] intercalate #-} -- | /O(n)/ The 'intersperse' function takes a character and places it -- between the characters of a 'Text'. @@ -631,7 +657,7 @@ intercalate t = concat . L.intersperse t -- Performs replacement on invalid scalar values. 
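Demoting these combinators from INLINE to INLINE [1] keeps them recognisable long enough for the length rules above to fire. A quick sanity check of the intercalate rule (illustrative arithmetic, not part of the patch):

    -- length (intercalate ", " ["a", "bb"]) == length "a, bb" == 5
    -- right-hand side: lenS = 2; max 0 (sum [1 + 2, 2 + 2] - 2) == 5
    -- for an empty list, max 0 clamps sum [] - lenS back to 0 == length (intercalate ", " [])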
intersperse :: Char -> Text -> Text intersperse c t = unstream (S.intersperse (safe c) (stream t)) -{-# INLINE intersperse #-} +{-# INLINE [1] intersperse #-} -- | /O(n)/ Reverse the characters of a string. -- @@ -1412,8 +1438,13 @@ partition p t = (filter p t, filter (not . p) t) -- returns a 'Text' containing those characters that satisfy the -- predicate. filter :: (Char -> Bool) -> Text -> Text -filter p t = unstream (S.filter p (stream t)) -{-# INLINE filter #-} +filter p = unstream . S.filter p . stream +{-# INLINE [1] filter #-} + +{-# RULES +"TEXT filter/filter -> filter" forall p q t. + filter p (filter q t) = filter (\c -> p c && q c) t +#-} -- | /O(n+m)/ Find the first instance of @needle@ (which must be -- non-'null') in @haystack@. The first element of the returned tuple @@ -1563,7 +1594,7 @@ zip a b = S.unstreamList $ S.zipWith (,) (stream a) (stream b) zipWith :: (Char -> Char -> Char) -> Text -> Text -> Text zipWith f t1 t2 = unstream (S.zipWith g (stream t1) (stream t2)) where g a b = safe (f a b) -{-# INLINE zipWith #-} +{-# INLINE [1] zipWith #-} -- | /O(n)/ Breaks a 'Text' up into a list of words, delimited by 'Char's -- representing white space. diff --git a/src/Data/Text/Lazy.hs b/src/Data/Text/Lazy.hs index 1ed1d217..0f4e3258 100644 --- a/src/Data/Text/Lazy.hs +++ b/src/Data/Text/Lazy.hs @@ -540,6 +540,21 @@ length = foldlChunks go 0 go l t = l + intToInt64 (T.length t) {-# INLINE [1] length #-} +{-# RULES +"TEXT length/map -> length" forall f t. + length (map f t) = length t +"TEXT length/zipWith -> length" forall f t1 t2. + length (zipWith f t1 t2) = min (length t1) (length t2) +"TEXT length/replicate -> n" forall n t. + length (replicate n t) = max 0 n P.* length t +"TEXT length/cons -> length+1" forall c t. + length (cons c t) = 1 + length t +"TEXT length/intersperse -> 2*length-1" forall c t. + length (intersperse c t) = max 0 (2 P.* length t - 1) +"TEXT length/intercalate -> n*length" forall s ts. + length (intercalate s ts) = let lenS = length s in max 0 (P.sum (P.map (\t -> length t + lenS) ts) - lenS) + #-} + -- | /O(n)/ Compare the count of characters in a 'Text' to a number. -- -- This function gives the same answer as comparing against the result @@ -560,19 +575,24 @@ map :: (Char -> Char) -> Text -> Text map f t = unstream (S.map (safe . f) (stream t)) {-# INLINE [1] map #-} +{-# RULES +"TEXT map/map -> map" forall f g t. + map f (map g t) = map (f . safe . g) t +#-} + -- | /O(n)/ The 'intercalate' function takes a 'Text' and a list of -- 'Text's and concatenates the list after interspersing the first -- argument between each element of the list. intercalate :: Text -> [Text] -> Text intercalate t = concat . L.intersperse t -{-# INLINE intercalate #-} +{-# INLINE [1] intercalate #-} -- | /O(n)/ The 'intersperse' function takes a character and places it -- between the characters of a 'Text'. Performs -- replacement on invalid scalar values. intersperse :: Char -> Text -> Text intersperse c t = unstream (S.intersperse (safe c) (stream t)) -{-# INLINE intersperse #-} +{-# INLINE [1] intersperse #-} -- | /O(n)/ Left-justify a string to the given length, using the -- specified fill character on the right. Performs @@ -1547,7 +1567,12 @@ stripSuffix p t = reverse `fmap` stripPrefix (reverse p) (reverse t) -- predicate. filter :: (Char -> Bool) -> Text -> Text filter p t = unstream (S.filter p (stream t)) -{-# INLINE filter #-} +{-# INLINE [1] filter #-} + +{-# RULES +"TEXT filter/filter -> filter" forall p q t. 
+ filter p (filter q t) = filter (\c -> p c && q c) t +#-} -- | /O(n)/ The 'find' function takes a predicate and a 'Text', and -- returns the first element in matching the predicate, or 'Nothing' From b1602444c0b940734db53e0ca5de20d0fb8b6787 Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Fri, 25 Jun 2021 19:41:49 +0100 Subject: [PATCH 13/38] Speed up encodeUtf8 for strict and lazy Text --- src/Data/Text/Array.hs | 12 ++++++++++++ src/Data/Text/Encoding.hs | 16 ++++++++++++---- src/Data/Text/Lazy/Encoding.hs | 13 +------------ 3 files changed, 25 insertions(+), 16 deletions(-) diff --git a/src/Data/Text/Array.hs b/src/Data/Text/Array.hs index 00427750..5a20e4d4 100644 --- a/src/Data/Text/Array.hs +++ b/src/Data/Text/Array.hs @@ -39,6 +39,7 @@ module Data.Text.Array , unsafeFreeze , unsafeIndex , new + , newPinned , unsafeWrite ) where @@ -71,6 +72,17 @@ new (I# len#) (# s2#, marr# #) -> (# s2#, MutableByteArray marr# #) {-# INLINE new #-} +-- | Create an uninitialized mutable pinned array. +newPinned :: forall s. Int -> ST s (MArray s) +newPinned (I# len#) +#if defined(ASSERTS) + | I# len# < 0 = error "Data.Text.Array.newPinned: size overflow" +#endif + | otherwise = ST $ \s1# -> + case newPinnedByteArray# len# s1# of + (# s2#, marr# #) -> (# s2#, MutableByteArray marr# #) +{-# INLINE newPinned #-} + -- | Freeze a mutable array. Do not mutate the 'MArray' afterwards! unsafeFreeze :: MArray s -> ST s Array unsafeFreeze (MutableByteArray marr) = ST $ \s1# -> diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index 49aced62..0312cd73 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -65,7 +65,7 @@ import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) import Control.Exception (evaluate, try, throwIO, ErrorCall(ErrorCall)) import Control.Monad.ST (runST) import Data.ByteString as B -import qualified Data.ByteString.Short.Internal as SBS +import qualified Data.ByteString.Internal as B import Data.Foldable (traverse_) import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode) import Data.Text.Internal (Text(..), safe, text) @@ -79,7 +79,8 @@ import Foreign.C.Types (CSize) import Foreign.Marshal.Utils (with) import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr) import Foreign.Storable (Storable, peek, poke) -import GHC.Base (MutableByteArray#) +import GHC.Exts (MutableByteArray#, byteArrayContents#, unsafeCoerce#) +import GHC.ForeignPtr (ForeignPtr(..), ForeignPtrContents(PlainPtr)) import qualified Data.ByteString.Builder as B import qualified Data.ByteString.Builder.Internal as B hiding (empty, append) import qualified Data.ByteString.Builder.Prim as BP @@ -436,9 +437,16 @@ encodeUtf8BuilderEscaped be = -- | Encode text using UTF-8 encoding. encodeUtf8 :: Text -> ByteString -encodeUtf8 (Text (A.ByteArray arr) off len) +encodeUtf8 (Text arr off len) | len == 0 = B.empty - | otherwise = B.take len $ B.drop off $ SBS.fromShort $ SBS.SBS arr + -- It would be easier to use Data.ByteString.Short.fromShort and slice later, + -- but this is undesirable when len is significantly smaller than length arr. + | otherwise = unsafeDupablePerformIO $ do + marr@(A.MutableByteArray mba) <- unsafeSTToIO $ A.newPinned len + unsafeSTToIO $ A.copyI len marr 0 arr off + let fp = ForeignPtr (byteArrayContents# (unsafeCoerce# mba)) + (PlainPtr mba) + pure $ B.fromForeignPtr fp 0 len -- | Decode text from little endian UTF-16 encoding. 
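A rough property of the new strict encodeUtf8 (illustrative, not part of the patch): the result is a fresh, pinned copy of exactly the slice, so its length matches the UTF-8 byte count of the text even when that text shares a much larger array.

    -- with Data.ByteString as B and lengthWord8 from Data.Text.Unsafe:
    --   B.length (encodeUtf8 t) == lengthWord8 t
    -- e.g. for t = drop 1 (pack "abcdef"), which still shares the six-byte array,
    -- encodeUtf8 t copies and pins only the five bytes of "bcdef".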
decodeUtf16LEWith :: OnDecodeError -> ByteString -> Text diff --git a/src/Data/Text/Lazy/Encoding.hs b/src/Data/Text/Lazy/Encoding.hs index d10e074d..a82ba8c2 100644 --- a/src/Data/Text/Lazy/Encoding.hs +++ b/src/Data/Text/Lazy/Encoding.hs @@ -55,7 +55,6 @@ import Data.Text.Internal.Lazy (Text(..), chunk, empty, foldrChunks) import Data.Word (Word8) import qualified Data.ByteString as S import qualified Data.ByteString.Builder as B -import qualified Data.ByteString.Builder.Extra as B (safeStrategy, toLazyByteStringWith) import qualified Data.ByteString.Builder.Prim as BP import qualified Data.ByteString.Lazy as B import qualified Data.ByteString.Lazy.Internal as B @@ -139,17 +138,7 @@ decodeUtf8' bs = unsafeDupablePerformIO $ do -- | Encode text using UTF-8 encoding. encodeUtf8 :: Text -> B.ByteString -encodeUtf8 Empty = B.empty -encodeUtf8 lt@(Chunk t _) = - B.toLazyByteStringWith strategy B.empty $ encodeUtf8Builder lt - where - -- To improve our small string performance, we use a strategy that - -- allocates a buffer that is guaranteed to be large enough for the - -- encoding of the first chunk, but not larger than the default - -- B.smallChunkSize. We clamp the firstChunkSize to ensure that we don't - -- generate too large buffers which hamper streaming. - firstChunkSize = min B.smallChunkSize (4 * (T.length t + 1)) - strategy = B.safeStrategy firstChunkSize B.defaultChunkSize +encodeUtf8 = foldrChunks (B.Chunk . TE.encodeUtf8) B.Empty -- | Encode text to a ByteString 'B.Builder' using UTF-8 encoding. -- From 8b5bc09c159a457c23b3124bc6a545faae434742 Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Fri, 25 Jun 2021 21:41:56 +0100 Subject: [PATCH 14/38] Speed up reverse --- cbits/reverse.c | 42 ++++++++++++++++++++++++++++++++++++++++++ src/Data/Text.hs | 14 ++++++++++++-- text.cabal | 1 + 3 files changed, 55 insertions(+), 2 deletions(-) create mode 100644 cbits/reverse.c diff --git a/cbits/reverse.c b/cbits/reverse.c new file mode 100644 index 00000000..97c66f12 --- /dev/null +++ b/cbits/reverse.c @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2021 Andrew Lelechenko + */ + +#include +#include + +/* + _hs_text_reverse takes a UTF-8 encoded buffer, specified by (src0, off, len), + and reverses it, writing output starting from dst0. + + The input buffer (src0, off, len) must be a valid UTF-8 sequence, + this condition is not checked. 
+*/ +void _hs_text_reverse(uint8_t *dst0, const uint8_t *src0, size_t off, size_t len) +{ + const uint8_t *src = src0 + off; + const uint8_t *srcend = src + len; + uint8_t *dst = dst0 + len - 1; + + while (src < srcend){ + uint8_t leadByte = *src++; + if (leadByte < 0x80){ + *dst-- = leadByte; + } else if (leadByte < 0xe0){ + *(dst-1) = leadByte; + *dst = *src++; + dst-=2; + } else if (leadByte < 0xf0){ + *(dst-2) = leadByte; + *(dst-1) = *src++; + *dst = *src++; + dst-=3; + } else { + *(dst-3) = leadByte; + *(dst-2) = *src++; + *(dst-1) = *src++; + *dst = *src++; + dst-=4; + } + } +} diff --git a/src/Data/Text.hs b/src/Data/Text.hs index bba6c13e..8c57d894 100644 --- a/src/Data/Text.hs +++ b/src/Data/Text.hs @@ -216,7 +216,8 @@ import Data.Char (isSpace) import Data.Data (Data(gfoldl, toConstr, gunfold, dataTypeOf), constrIndex, Constr, mkConstr, DataType, mkDataType, Fixity(Prefix)) import Control.Monad (foldM) -import Control.Monad.ST (ST) +import Control.Monad.ST (ST, runST) +import Control.Monad.ST.Unsafe (unsafeIOToST) import qualified Data.Text.Array as A import qualified Data.List as L import Data.Binary (Binary(get, put)) @@ -670,9 +671,18 @@ reverse :: HasCallStack => #endif Text -> Text -reverse t = S.reverse (stream t) +reverse (Text (A.ByteArray ba) off len) = runST $ do + marr@(A.MutableByteArray mba) <- A.new len + unsafeIOToST $ c_reverse mba ba (intToCSize off) (intToCSize len) + brr <- A.unsafeFreeze marr + return $ Text brr 0 len {-# INLINE reverse #-} +-- | The input buffer (src :: ByteArray#, off :: CSize, len :: CSize) +-- must specify a valid UTF-8 sequence, this condition is not checked. +foreign import ccall unsafe "_hs_text_reverse" c_reverse + :: Exts.MutableByteArray# s -> ByteArray# -> CSize -> CSize -> IO () + -- | /O(m+n)/ Replace every non-overlapping occurrence of @needle@ in -- @haystack@ with @replacement@. 
-- diff --git a/text.cabal b/text.cabal index e0c0ce69..4e634af6 100644 --- a/text.cabal +++ b/text.cabal @@ -66,6 +66,7 @@ flag developer library c-sources: cbits/cbits.c cbits/measure_off.c + cbits/reverse.c cbits/utils.c include-dirs: include hs-source-dirs: src From b78ece16b3fb6147d2b3558106257ea15815548a Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Fri, 25 Jun 2021 23:30:02 +0100 Subject: [PATCH 15/38] Improve replicateChar --- src/Data/Text.hs | 44 ++++++++++++++++++++++++++---------------- src/Data/Text/Array.hs | 20 ++++++++++++++++++- 2 files changed, 46 insertions(+), 18 deletions(-) diff --git a/src/Data/Text.hs b/src/Data/Text.hs index 8c57d894..8f16567c 100644 --- a/src/Data/Text.hs +++ b/src/Data/Text.hs @@ -211,8 +211,8 @@ import Control.DeepSeq (NFData(rnf)) import Control.Exception (assert) import GHC.Stack (HasCallStack) #endif -import Data.Bits (shiftL, (.&.)) -import Data.Char (isSpace) +import Data.Bits ((.&.)) +import Data.Char (isSpace, isAscii, ord) import Data.Data (Data(gfoldl, toConstr, gunfold, dataTypeOf), constrIndex, Constr, mkConstr, DataType, mkDataType, Fixity(Prefix)) import Control.Monad (foldM) @@ -224,13 +224,14 @@ import Data.Binary (Binary(get, put)) import Data.Monoid (Monoid(..)) import Data.Semigroup (Semigroup(..)) import Data.String (IsString(..)) -import Data.Text.Internal.Encoding.Utf8 (chr3, utf8LengthByLeader) +import Data.Text.Internal.Encoding.Utf8 (chr3, utf8Length, utf8LengthByLeader) import qualified Data.Text.Internal.Fusion as S import qualified Data.Text.Internal.Fusion.Common as S import Data.Text.Encoding (decodeUtf8', encodeUtf8) import Data.Text.Internal.Fusion (stream, reverseStream, unstream) import Data.Text.Internal.Private (span_) import Data.Text.Internal (Text(..), empty, firstf, mul, safe, text) +import Data.Text.Internal.Unsafe.Char (unsafeWrite) import Data.Text.Show (singleton, unpack, unpackCString#) import qualified Prelude as P import Data.Text.Unsafe (Iter(..), iter, iter_, lengthWord8, reverseIter, @@ -1053,21 +1054,15 @@ replicate n t@(Text a o l) | n <= 0 || l <= 0 = empty | n == 1 = t | isSingleton t = replicateChar n (unsafeHead t) - | otherwise = Text (A.run x) 0 len - where - len = l `mul` n -- TODO: detect overflows - x :: ST s (A.MArray s) - x = do - arr <- A.new len - A.copyI l arr 0 a o - let loop !l1 = - let rest = len - l1 in - if rest <= l1 then A.copyM arr l1 arr 0 rest >> return arr - else A.copyM arr l1 arr 0 l1 >> loop (l1 `shiftL` 1) - loop l + | otherwise = runST $ do + let totalLen = n `mul` l + marr <- A.new totalLen + A.copyI l marr 0 a o + A.tile marr l + arr <- A.unsafeFreeze marr + return $ Text arr 0 totalLen {-# INLINE [1] replicate #-} - {-# RULES "TEXT replicate/singleton -> replicateChar" [~1] forall n c. replicate n (singleton c) = replicateChar n c @@ -1076,7 +1071,22 @@ replicate n t@(Text a o l) -- | /O(n)/ 'replicateChar' @n@ @c@ is a 'Text' of length @n@ with @c@ the -- value of every element. replicateChar :: Int -> Char -> Text -replicateChar n c = unstream (S.replicateCharI n (safe c)) +replicateChar !len !c' + | len <= 0 = empty + | isAscii c = runST $ do + marr <- A.newFilled len (ord c) + arr <- A.unsafeFreeze marr + return $ Text arr 0 len + | otherwise = runST $ do + let cLen = utf8Length c + totalLen = cLen P.* len + marr <- A.new totalLen + _ <- unsafeWrite marr 0 c + A.tile marr cLen + arr <- A.unsafeFreeze marr + return $ Text arr 0 totalLen + where + c = safe c' {-# INLINE replicateChar #-} -- | /O(n)/, where @n@ is the length of the result. 
The 'unfoldr' diff --git a/src/Data/Text/Array.hs b/src/Data/Text/Array.hs index 5a20e4d4..80d93dea 100644 --- a/src/Data/Text/Array.hs +++ b/src/Data/Text/Array.hs @@ -40,7 +40,9 @@ module Data.Text.Array , unsafeIndex , new , newPinned + , newFilled , unsafeWrite + , tile ) where #if defined(ASSERTS) @@ -83,6 +85,22 @@ newPinned (I# len#) (# s2#, marr# #) -> (# s2#, MutableByteArray marr# #) {-# INLINE newPinned #-} +newFilled :: Int -> Int -> ST s (MArray s) +newFilled (I# len#) (I# c#) = ST $ \s1# -> + case newByteArray# len# s1# of + (# s2#, marr# #) -> case setByteArray# marr# 0# len# c# s2# of + s3# -> (# s3#, MutableByteArray marr# #) +{-# INLINE newFilled #-} + +tile :: MArray s -> Int -> ST s () +tile marr tileLen = do + totalLen <- getSizeofMArray marr + let go l + | 2 * l > totalLen = copyM marr l marr 0 (totalLen - l) + | otherwise = copyM marr l marr 0 l >> go (2 * l) + go tileLen +{-# INLINE tile #-} + -- | Freeze a mutable array. Do not mutate the 'MArray' afterwards! unsafeFreeze :: MArray s -> ST s Array unsafeFreeze (MutableByteArray marr) = ST $ \s1# -> @@ -107,7 +125,6 @@ unsafeIndex (ByteArray arr) i@(I# i#) = case indexWord8Array# arr i# of r# -> (W8# r#) {-# INLINE unsafeIndex #-} -#if defined(ASSERTS) -- sizeofMutableByteArray# is deprecated, because it is unsafe in the presence of -- shrinkMutableByteArray# and resizeMutableByteArray#. getSizeofMArray :: MArray s -> ST s Int @@ -115,6 +132,7 @@ getSizeofMArray (MutableByteArray marr) = ST $ \s0# -> case getSizeofMutableByteArray# marr s0# of (# s1#, word8len# #) -> (# s1#, I# word8len# #) +#if defined(ASSERTS) checkBoundsM :: HasCallStack => MArray s -> Int -> Int -> ST s () checkBoundsM ma i elSize = do len <- getSizeofMArray ma From 4970b72477ccf96137e7e1c5567ef8a279fe590a Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Wed, 14 Jul 2021 22:24:28 +0100 Subject: [PATCH 16/38] Define iterArray --- src/Data/Text/Unsafe.hs | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/src/Data/Text/Unsafe.hs b/src/Data/Text/Unsafe.hs index 64cc83c4..4832c5d1 100644 --- a/src/Data/Text/Unsafe.hs +++ b/src/Data/Text/Unsafe.hs @@ -15,8 +15,10 @@ module Data.Text.Unsafe , unsafeDupablePerformIO , Iter(..) , iter + , iterArray , iter_ , reverseIter + , reverseIterArray , reverseIter_ , unsafeHead , unsafeTail @@ -74,19 +76,22 @@ iter :: HasCallStack => #endif Text -> Int -> Iter -iter (Text arr off _len) i = Iter chr l +iter (Text arr off _len) i = iterArray arr (off + i) +{-# INLINE iter #-} + +iterArray :: A.Array -> Int -> Iter +iterArray arr j = Iter chr l where m0 = A.unsafeIndex arr j m1 = A.unsafeIndex arr (j+1) m2 = A.unsafeIndex arr (j+2) m3 = A.unsafeIndex arr (j+3) - j = off + i l = utf8LengthByLeader m0 chr = case l of 1 -> unsafeChr8 m0 2 -> chr2 m0 m1 3 -> chr3 m0 m1 m2 _ -> chr4 m0 m1 m2 m3 -{-# INLINE iter #-} +{-# INLINE iterArray #-} -- | /O(1)/ Iterate one step through a UTF-8 array, returning the -- delta to add to give the next offset to iterate at. @@ -98,8 +103,12 @@ iter_ (Text arr off _len) i = utf8LengthByLeader m -- | /O(1)/ Iterate one step backwards through a UTF-8 array, -- returning the current character and the delta to add (i.e. a -- negative number) to give the next offset to iterate at. 
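To make the Iter convention concrete (a sketch, not part of the patch; '€' occupies the three bytes 0xE2 0x82 0xAC):

    -- for an array arr holding exactly those three bytes:
    --   iterArray arr 0        == Iter '€' 3      -- decode forwards from the leading byte
    --   reverseIterArray arr 2 == Iter '€' (-3)   -- decode backwards from the final byte
    -- 'iter' and 'reverseIter' are now thin wrappers that add the text's offset and
    -- delegate to these array-level workers.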
-reverseIter :: Text -> Int -> Iter -reverseIter (Text arr off _len) i +reverseIter :: Text -> Int -> Iter +reverseIter (Text arr off _len) i = reverseIterArray arr (off + i) +{-# INLINE reverseIter #-} + +reverseIterArray :: A.Array -> Int -> Iter +reverseIterArray arr j | m0 < 0x80 = Iter (unsafeChr8 m0) (-1) | m1 >= 0xC0 = Iter (chr2 m1 m0) (-2) | m2 >= 0xC0 = Iter (chr3 m2 m1 m0) (-3) @@ -108,8 +117,7 @@ reverseIter (Text arr off _len) i m1 = A.unsafeIndex arr (j-1) m2 = A.unsafeIndex arr (j-2) m3 = A.unsafeIndex arr (j-3) - j = off + i -{-# INLINE reverseIter #-} +{-# INLINE reverseIterArray #-} -- | /O(1)/ Iterate one step backwards through a UTF-8 array, -- returning the delta to add (i.e. a negative number) to give the From 172a2ae24c7758b4f9e6bc3a2cbedd3a4ee53f65 Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Wed, 14 Jul 2021 22:23:54 +0100 Subject: [PATCH 17/38] Implement mapAccum{L,R} --- src/Data/Text.hs | 66 +++++++++++++++++++++++++++++++++++------- src/Data/Text/Array.hs | 1 + 2 files changed, 56 insertions(+), 11 deletions(-) diff --git a/src/Data/Text.hs b/src/Data/Text.hs index 8f16567c..37c9f702 100644 --- a/src/Data/Text.hs +++ b/src/Data/Text.hs @@ -2,6 +2,7 @@ {-# LANGUAGE TemplateHaskellQuotes #-} {-# LANGUAGE Trustworthy #-} {-# LANGUAGE UnliftedFFITypes #-} +{-# LANGUAGE ScopedTypeVariables #-} {-# OPTIONS_GHC -fno-warn-orphans #-} @@ -235,7 +236,7 @@ import Data.Text.Internal.Unsafe.Char (unsafeWrite) import Data.Text.Show (singleton, unpack, unpackCString#) import qualified Prelude as P import Data.Text.Unsafe (Iter(..), iter, iter_, lengthWord8, reverseIter, - reverseIter_, unsafeHead, unsafeTail, unsafeDupablePerformIO) + reverseIter_, unsafeHead, unsafeTail, unsafeDupablePerformIO, iterArray, reverseIterArray) import Data.Text.Internal.Search (indices) #if defined(__HADDOCK__) import Data.ByteString (ByteString) @@ -484,10 +485,6 @@ uncons t@(Text arr off len) in (c, text arr (off+d) (len-d)) {-# INLINE [1] uncons #-} --- | Lifted from Control.Arrow and specialized. -second :: (b -> c) -> (a,b) -> (a,c) -second f (a, b) = (a, f b) - -- | /O(1)/ Returns the last character of a 'Text', which must be -- non-empty. last :: Text -> Char @@ -1028,9 +1025,30 @@ scanr1 f t | null t = empty -- function to each element of a 'Text', passing an accumulating -- parameter from left to right, and returns a final 'Text'. Performs -- replacement on invalid scalar values. -mapAccumL :: (a -> Char -> (a,Char)) -> a -> Text -> (a, Text) -mapAccumL f z0 = S.mapAccumL g z0 . stream - where g a b = second safe (f a b) +mapAccumL :: forall a. (a -> Char -> (a, Char)) -> a -> Text -> (a, Text) +mapAccumL f z0 = go + where + go (Text src o l) = runST $ do + marr <- A.new (l + 4) + outer marr (l + 4) o 0 z0 + where + outer :: forall s. A.MArray s -> Int -> Int -> Int -> a -> ST s (a, Text) + outer !dst !dstLen = inner + where + inner !srcOff !dstOff !z + | srcOff >= l + o = do + A.shrinkM dst dstOff + arr <- A.unsafeFreeze dst + return (z, Text arr 0 dstOff) + | dstOff + 4 > dstLen = do + let !dstLen' = dstLen + (l + o) - srcOff + 4 + dst' <- A.resizeM dst dstLen' + outer dst' dstLen' srcOff dstOff z + | otherwise = do + let !(Iter c d) = iterArray src srcOff + (z', c') = f z c + d' <- unsafeWrite dst dstOff (safe c') + inner (srcOff + d) (dstOff + d') z' {-# INLINE mapAccumL #-} -- | The 'mapAccumR' function behaves like a combination of 'map' and @@ -1039,9 +1057,35 @@ mapAccumL f z0 = S.mapAccumL g z0 . 
stream -- returning a final value of this accumulator together with the new -- 'Text'. -- Performs replacement on invalid scalar values. -mapAccumR :: (a -> Char -> (a,Char)) -> a -> Text -> (a, Text) -mapAccumR f z0 = second reverse . S.mapAccumL g z0 . reverseStream - where g a b = second safe (f a b) +mapAccumR :: forall a. (a -> Char -> (a, Char)) -> a -> Text -> (a, Text) +mapAccumR f z0 = go + where + go (Text src o l) = runST $ do + marr <- A.new (l + 4) + outer marr (l + o - 1) (l + 4 - 1) z0 + where + outer :: forall s. A.MArray s -> Int -> Int -> a -> ST s (a, Text) + outer !dst = inner + where + inner !srcOff !dstOff !z + | srcOff < o = do + dstLen <- A.getSizeofMArray dst + arr <- A.unsafeFreeze dst + return (z, Text arr (dstOff + 1) (dstLen - dstOff - 1)) + | dstOff < 3 = do + dstLen <- A.getSizeofMArray dst + let !dstLen' = dstLen + (srcOff - o) + 4 + dst' <- A.new dstLen' + A.copyM dst' (dstLen' - dstLen) dst 0 dstLen + outer dst' srcOff (dstOff + dstLen' - dstLen) z + | otherwise = do + let !(Iter c d) = reverseIterArray src (srcOff) + (z', c') = f z c + c'' = safe c' + !d' = utf8Length c'' + dstOff' = dstOff - d' + _ <- unsafeWrite dst (dstOff' + 1) c'' + inner (srcOff + d) dstOff' z' {-# INLINE mapAccumR #-} -- ----------------------------------------------------------------------------- diff --git a/src/Data/Text/Array.hs b/src/Data/Text/Array.hs index 80d93dea..ec7cee88 100644 --- a/src/Data/Text/Array.hs +++ b/src/Data/Text/Array.hs @@ -43,6 +43,7 @@ module Data.Text.Array , newFilled , unsafeWrite , tile + , getSizeofMArray ) where #if defined(ASSERTS) From dac5ab748305211bb222395722489bf6fc88e657 Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Sat, 17 Jul 2021 17:53:58 +0100 Subject: [PATCH 18/38] Implement filter --- src/Data/Text.hs | 66 +++++++++++++++++++++++++++++++++++++++++-- src/Data/Text/Lazy.hs | 2 +- 2 files changed, 64 insertions(+), 4 deletions(-) diff --git a/src/Data/Text.hs b/src/Data/Text.hs index 37c9f702..dd7fb4f8 100644 --- a/src/Data/Text.hs +++ b/src/Data/Text.hs @@ -225,14 +225,14 @@ import Data.Binary (Binary(get, put)) import Data.Monoid (Monoid(..)) import Data.Semigroup (Semigroup(..)) import Data.String (IsString(..)) -import Data.Text.Internal.Encoding.Utf8 (chr3, utf8Length, utf8LengthByLeader) +import Data.Text.Internal.Encoding.Utf8 (utf8Length, utf8LengthByLeader, chr2, chr3, chr4) import qualified Data.Text.Internal.Fusion as S import qualified Data.Text.Internal.Fusion.Common as S import Data.Text.Encoding (decodeUtf8', encodeUtf8) import Data.Text.Internal.Fusion (stream, reverseStream, unstream) import Data.Text.Internal.Private (span_) import Data.Text.Internal (Text(..), empty, firstf, mul, safe, text) -import Data.Text.Internal.Unsafe.Char (unsafeWrite) +import Data.Text.Internal.Unsafe.Char (unsafeWrite, unsafeChr8) import Data.Text.Show (singleton, unpack, unpackCString#) import qualified Prelude as P import Data.Text.Unsafe (Iter(..), iter, iter_, lengthWord8, reverseIter, @@ -1502,7 +1502,67 @@ partition p t = (filter p t, filter (not . p) t) -- returns a 'Text' containing those characters that satisfy the -- predicate. filter :: (Char -> Bool) -> Text -> Text -filter p = unstream . S.filter p . stream +filter p = go + where + go (Text src o l) = runST $ do + -- It's tempting to allocate l elements at once and avoid resizing. + -- However, this can be unacceptable in scenarios where a huge array + -- is filtered with a rare predicate, resulting in a much shorter buffer. 
+ let !dstLen = min l 64 + dst <- A.new dstLen + outer dst dstLen o 0 + where + outer :: forall s. A.MArray s -> Int -> Int -> Int -> ST s Text + outer !dst !dstLen = inner + where + inner !srcOff !dstOff + | srcOff >= o + l = do + A.shrinkM dst dstOff + arr <- A.unsafeFreeze dst + return (Text arr 0 dstOff) + | dstOff + 4 > dstLen = do + -- Double size of the buffer, unless it becomes longer than + -- source string. Ensure to extend it by least 4 bytes. + let !dstLen' = dstLen + max 4 (min (l + o - srcOff) dstLen) + dst' <- A.resizeM dst dstLen' + outer dst' dstLen' srcOff dstOff + -- In case of success, filter writes exactly the same character + -- it just read (this is not a case for map, for example). + -- We leverage this fact below: no need to decode Char back into UTF8, + -- just copy bytes from input. + | otherwise = do + let m0 = A.unsafeIndex src srcOff + m1 = A.unsafeIndex src (srcOff + 1) + m2 = A.unsafeIndex src (srcOff + 2) + m3 = A.unsafeIndex src (srcOff + 3) + !d = utf8LengthByLeader m0 + case d of + 1 -> do + let !c = unsafeChr8 m0 + if not (p c) then inner (srcOff + 1) dstOff else do + A.unsafeWrite dst dstOff m0 + inner (srcOff + 1) (dstOff + 1) + 2 -> do + let !c = chr2 m0 m1 + if not (p c) then inner (srcOff + 2) dstOff else do + A.unsafeWrite dst dstOff m0 + A.unsafeWrite dst (dstOff + 1) m1 + inner (srcOff + 2) (dstOff + 2) + 3 -> do + let !c = chr3 m0 m1 m2 + if not (p c) then inner (srcOff + 3) dstOff else do + A.unsafeWrite dst dstOff m0 + A.unsafeWrite dst (dstOff + 1) m1 + A.unsafeWrite dst (dstOff + 2) m2 + inner (srcOff + 3) (dstOff + 3) + _ -> do + let !c = chr4 m0 m1 m2 m3 + if not (p c) then inner (srcOff + 4) dstOff else do + A.unsafeWrite dst dstOff m0 + A.unsafeWrite dst (dstOff + 1) m1 + A.unsafeWrite dst (dstOff + 2) m2 + A.unsafeWrite dst (dstOff + 3) m3 + inner (srcOff + 4) (dstOff + 4) {-# INLINE [1] filter #-} {-# RULES diff --git a/src/Data/Text/Lazy.hs b/src/Data/Text/Lazy.hs index 0f4e3258..c326a32b 100644 --- a/src/Data/Text/Lazy.hs +++ b/src/Data/Text/Lazy.hs @@ -1566,7 +1566,7 @@ stripSuffix p t = reverse `fmap` stripPrefix (reverse p) (reverse t) -- returns a 'Text' containing those characters that satisfy the -- predicate. filter :: (Char -> Bool) -> Text -> Text -filter p t = unstream (S.filter p (stream t)) +filter p = foldrChunks (chunk . T.filter p) Empty {-# INLINE [1] filter #-} {-# RULES From d3e772a30e7f040b57764906cf03684c5158d788 Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Sun, 18 Jul 2021 20:56:40 +0100 Subject: [PATCH 19/38] Experiment with case conversions --- scripts/CaseFolding.hs | 21 +- scripts/CaseMapping.hs | 11 +- scripts/SpecialCasing.hs | 21 +- src/Data/Text/Internal/Fusion/CaseMapping.hs | 6984 ++++++++++++++++-- src/Data/Text/Internal/Fusion/Common.hs | 52 +- src/Data/Text/Internal/Fusion/Types.hs | 3 +- 6 files changed, 6313 insertions(+), 779 deletions(-) diff --git a/scripts/CaseFolding.hs b/scripts/CaseFolding.hs index 11d180ca..84a61f4e 100644 --- a/scripts/CaseFolding.hs +++ b/scripts/CaseFolding.hs @@ -11,6 +11,7 @@ module CaseFolding ) where import Arsec +import Data.Bits data Fold = Fold { code :: Char @@ -34,13 +35,19 @@ parseCF :: FilePath -> IO (Either ParseError CaseFolding) parseCF name = parse entries name <$> readFile name mapCF :: CaseFolding -> [String] -mapCF (CF _ ms) = typ ++ (map nice . filter p $ ms) ++ [last] +mapCF (CF _ ms) = typ ++ map printUnusual ms' ++ map printUsual usual ++ [last] where - typ = ["foldMapping :: forall s. 
Char -> s -> Step (CC s) Char" - ,"{-# NOINLINE foldMapping #-}"] - last = "foldMapping c s = Yield (toLower c) (CC s '\\0' '\\0')" - nice c = "-- " ++ name c ++ "\n" ++ - "foldMapping " ++ showC (code c) ++ " s = Yield " ++ x ++ " (CC s " ++ y ++ " " ++ z ++ ")" - where [x,y,z] = (map showC . take 3) (mapping c ++ repeat '\0') + ms' = filter p ms p f = status f `elem` "CF" && mapping f /= [toLower (code f)] + unusual = map code ms' + usual = filter (\c -> toLower c /= c && c `notElem` unusual) [minBound..maxBound] + + typ = ["foldMapping :: Char# -> _ {- unboxed Int64 -}" + ,"{-# NOINLINE foldMapping #-}" + ,"foldMapping = \\case"] + last = " _ -> unI64 0" + printUnusual c = " -- " ++ name c ++ "\n" ++ + " " ++ showC (code c) ++ "# -> unI64 " ++ show (ord x + (ord y `shiftL` 21) + (ord z `shiftL` 42)) + where x:y:z:_ = mapping c ++ repeat '\0' + printUsual c = " " ++ showC c ++ "# -> unI64 " ++ show (ord (toLower c)) diff --git a/scripts/CaseMapping.hs b/scripts/CaseMapping.hs index 4af11d90..b9d8e7e5 100644 --- a/scripts/CaseMapping.hs +++ b/scripts/CaseMapping.hs @@ -22,14 +22,17 @@ main = do let comments = map ("--" ++) $ take 2 (cfComments cfs) ++ take 2 (scComments scs) mapM_ (hPutStrLn h) $ - ["{-# LANGUAGE Rank2Types #-}" - ,"-- AUTOMATICALLY GENERATED - DO NOT EDIT" + ["-- AUTOMATICALLY GENERATED - DO NOT EDIT" ,"-- Generated by scripts/CaseMapping.hs"] ++ comments ++ ["" + ,"{-# LANGUAGE LambdaCase, MagicHash, PartialTypeSignatures #-}" + ,"{-# OPTIONS_GHC -Wno-partial-type-signatures #-}" ,"module Data.Text.Internal.Fusion.CaseMapping where" - ,"import Data.Char" - ,"import Data.Text.Internal.Fusion.Types" + ,"import GHC.Int" + ,"import GHC.Exts" + ,"unI64 :: Int64 -> _ {- unboxed Int64 -}" + ,"unI64 (I64# n) = n" ,""] mapM_ (hPutStrLn h) (mapSC "upper" upper toUpper scs) mapM_ (hPutStrLn h) (mapSC "lower" lower toLower scs) diff --git a/scripts/SpecialCasing.hs b/scripts/SpecialCasing.hs index 03be3d3b..099110b5 100644 --- a/scripts/SpecialCasing.hs +++ b/scripts/SpecialCasing.hs @@ -11,6 +11,7 @@ module SpecialCasing ) where import Arsec +import Data.Bits data SpecialCasing = SC { scComments :: [Comment], scCasing :: [Case] } deriving (Show) @@ -40,17 +41,23 @@ parseSC name = parse entries name <$> readFile name mapSC :: String -> (Case -> String) -> (Char -> Char) -> SpecialCasing -> [String] mapSC which access twiddle (SC _ ms) = - typ ++ (map nice . filter p $ ms) ++ [last] + typ ++ map printUnusual ms' ++ map printUsual usual ++ [last] where - typ = [which ++ "Mapping :: forall s. Char -> s -> Step (CC s) Char" - ,"{-# NOINLINE " ++ which ++ "Mapping #-}"] - last = which ++ "Mapping c s = Yield (to" ++ ucFirst which ++ " c) (CC s '\\0' '\\0')" - nice c = "-- " ++ name c ++ "\n" ++ - which ++ "Mapping " ++ showC (code c) ++ " s = Yield " ++ x ++ " (CC s " ++ y ++ " " ++ z ++ ")" - where [x,y,z] = (map showC . 
take 3) (access c ++ repeat '\0') + ms' = filter p ms p c = [k] /= a && a /= [twiddle k] && null (conditions c) where a = access c k = code c + unusual = map code ms' + usual = filter (\c -> twiddle c /= c && c `notElem` unusual) [minBound..maxBound] + + typ = [which ++ "Mapping :: Char# -> _ {- unboxed Int64 -}" + ,"{-# NOINLINE " ++ which ++ "Mapping #-}" + ,which ++ "Mapping = \\case"] + last = " _ -> unI64 0" + printUnusual c = " -- " ++ name c ++ "\n" ++ + " " ++ showC (code c) ++ "# -> unI64 " ++ show (ord x + (ord y `shiftL` 21) + (ord z `shiftL` 42)) + where x:y:z:_ = access c ++ repeat '\0' + printUsual c = " " ++ showC c ++ "# -> unI64 " ++ show (ord (twiddle c)) ucFirst (c:cs) = toUpper c : cs ucFirst [] = [] diff --git a/src/Data/Text/Internal/Fusion/CaseMapping.hs b/src/Data/Text/Internal/Fusion/CaseMapping.hs index f9fc4228..bc8691e9 100644 --- a/src/Data/Text/Internal/Fusion/CaseMapping.hs +++ b/src/Data/Text/Internal/Fusion/CaseMapping.hs @@ -1,4 +1,3 @@ -{-# LANGUAGE Rank2Types #-} -- AUTOMATICALLY GENERATED - DO NOT EDIT -- Generated by scripts/CaseMapping.hs -- CaseFolding-13.0.0.txt @@ -6,751 +5,6250 @@ -- SpecialCasing-13.0.0.txt -- Date: 2019-09-08, 23:31:24 GMT +{-# LANGUAGE LambdaCase, MagicHash, PartialTypeSignatures #-} +{-# OPTIONS_GHC -Wno-partial-type-signatures #-} module Data.Text.Internal.Fusion.CaseMapping where -import Data.Char -import Data.Text.Internal.Fusion.Types +import GHC.Int +import GHC.Exts +unI64 :: Int64 -> _ {- unboxed Int64 -} +unI64 (I64# n) = n -upperMapping :: forall s. Char -> s -> Step (CC s) Char +upperMapping :: Char# -> _ {- unboxed Int64 -} {-# NOINLINE upperMapping #-} --- LATIN SMALL LETTER SHARP S -upperMapping '\x00df' s = Yield '\x0053' (CC s '\x0053' '\x0000') --- LATIN SMALL LIGATURE FF -upperMapping '\xfb00' s = Yield '\x0046' (CC s '\x0046' '\x0000') --- LATIN SMALL LIGATURE FI -upperMapping '\xfb01' s = Yield '\x0046' (CC s '\x0049' '\x0000') --- LATIN SMALL LIGATURE FL -upperMapping '\xfb02' s = Yield '\x0046' (CC s '\x004c' '\x0000') --- LATIN SMALL LIGATURE FFI -upperMapping '\xfb03' s = Yield '\x0046' (CC s '\x0046' '\x0049') --- LATIN SMALL LIGATURE FFL -upperMapping '\xfb04' s = Yield '\x0046' (CC s '\x0046' '\x004c') --- LATIN SMALL LIGATURE LONG S T -upperMapping '\xfb05' s = Yield '\x0053' (CC s '\x0054' '\x0000') --- LATIN SMALL LIGATURE ST -upperMapping '\xfb06' s = Yield '\x0053' (CC s '\x0054' '\x0000') --- ARMENIAN SMALL LIGATURE ECH YIWN -upperMapping '\x0587' s = Yield '\x0535' (CC s '\x0552' '\x0000') --- ARMENIAN SMALL LIGATURE MEN NOW -upperMapping '\xfb13' s = Yield '\x0544' (CC s '\x0546' '\x0000') --- ARMENIAN SMALL LIGATURE MEN ECH -upperMapping '\xfb14' s = Yield '\x0544' (CC s '\x0535' '\x0000') --- ARMENIAN SMALL LIGATURE MEN INI -upperMapping '\xfb15' s = Yield '\x0544' (CC s '\x053b' '\x0000') --- ARMENIAN SMALL LIGATURE VEW NOW -upperMapping '\xfb16' s = Yield '\x054e' (CC s '\x0546' '\x0000') --- ARMENIAN SMALL LIGATURE MEN XEH -upperMapping '\xfb17' s = Yield '\x0544' (CC s '\x053d' '\x0000') --- LATIN SMALL LETTER N PRECEDED BY APOSTROPHE -upperMapping '\x0149' s = Yield '\x02bc' (CC s '\x004e' '\x0000') --- GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS -upperMapping '\x0390' s = Yield '\x0399' (CC s '\x0308' '\x0301') --- GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS -upperMapping '\x03b0' s = Yield '\x03a5' (CC s '\x0308' '\x0301') --- LATIN SMALL LETTER J WITH CARON -upperMapping '\x01f0' s = Yield '\x004a' (CC s '\x030c' '\x0000') --- LATIN SMALL LETTER H WITH LINE BELOW 
-upperMapping '\x1e96' s = Yield '\x0048' (CC s '\x0331' '\x0000') --- LATIN SMALL LETTER T WITH DIAERESIS -upperMapping '\x1e97' s = Yield '\x0054' (CC s '\x0308' '\x0000') --- LATIN SMALL LETTER W WITH RING ABOVE -upperMapping '\x1e98' s = Yield '\x0057' (CC s '\x030a' '\x0000') --- LATIN SMALL LETTER Y WITH RING ABOVE -upperMapping '\x1e99' s = Yield '\x0059' (CC s '\x030a' '\x0000') --- LATIN SMALL LETTER A WITH RIGHT HALF RING -upperMapping '\x1e9a' s = Yield '\x0041' (CC s '\x02be' '\x0000') --- GREEK SMALL LETTER UPSILON WITH PSILI -upperMapping '\x1f50' s = Yield '\x03a5' (CC s '\x0313' '\x0000') --- GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA -upperMapping '\x1f52' s = Yield '\x03a5' (CC s '\x0313' '\x0300') --- GREEK SMALL LETTER UPSILON WITH PSILI AND OXIA -upperMapping '\x1f54' s = Yield '\x03a5' (CC s '\x0313' '\x0301') --- GREEK SMALL LETTER UPSILON WITH PSILI AND PERISPOMENI -upperMapping '\x1f56' s = Yield '\x03a5' (CC s '\x0313' '\x0342') --- GREEK SMALL LETTER ALPHA WITH PERISPOMENI -upperMapping '\x1fb6' s = Yield '\x0391' (CC s '\x0342' '\x0000') --- GREEK SMALL LETTER ETA WITH PERISPOMENI -upperMapping '\x1fc6' s = Yield '\x0397' (CC s '\x0342' '\x0000') --- GREEK SMALL LETTER IOTA WITH DIALYTIKA AND VARIA -upperMapping '\x1fd2' s = Yield '\x0399' (CC s '\x0308' '\x0300') --- GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA -upperMapping '\x1fd3' s = Yield '\x0399' (CC s '\x0308' '\x0301') --- GREEK SMALL LETTER IOTA WITH PERISPOMENI -upperMapping '\x1fd6' s = Yield '\x0399' (CC s '\x0342' '\x0000') --- GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI -upperMapping '\x1fd7' s = Yield '\x0399' (CC s '\x0308' '\x0342') --- GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND VARIA -upperMapping '\x1fe2' s = Yield '\x03a5' (CC s '\x0308' '\x0300') --- GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA -upperMapping '\x1fe3' s = Yield '\x03a5' (CC s '\x0308' '\x0301') --- GREEK SMALL LETTER RHO WITH PSILI -upperMapping '\x1fe4' s = Yield '\x03a1' (CC s '\x0313' '\x0000') --- GREEK SMALL LETTER UPSILON WITH PERISPOMENI -upperMapping '\x1fe6' s = Yield '\x03a5' (CC s '\x0342' '\x0000') --- GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND PERISPOMENI -upperMapping '\x1fe7' s = Yield '\x03a5' (CC s '\x0308' '\x0342') --- GREEK SMALL LETTER OMEGA WITH PERISPOMENI -upperMapping '\x1ff6' s = Yield '\x03a9' (CC s '\x0342' '\x0000') --- GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI -upperMapping '\x1f80' s = Yield '\x1f08' (CC s '\x0399' '\x0000') --- GREEK SMALL LETTER ALPHA WITH DASIA AND YPOGEGRAMMENI -upperMapping '\x1f81' s = Yield '\x1f09' (CC s '\x0399' '\x0000') --- GREEK SMALL LETTER ALPHA WITH PSILI AND VARIA AND YPOGEGRAMMENI -upperMapping '\x1f82' s = Yield '\x1f0a' (CC s '\x0399' '\x0000') --- GREEK SMALL LETTER ALPHA WITH DASIA AND VARIA AND YPOGEGRAMMENI -upperMapping '\x1f83' s = Yield '\x1f0b' (CC s '\x0399' '\x0000') --- GREEK SMALL LETTER ALPHA WITH PSILI AND OXIA AND YPOGEGRAMMENI -upperMapping '\x1f84' s = Yield '\x1f0c' (CC s '\x0399' '\x0000') --- GREEK SMALL LETTER ALPHA WITH DASIA AND OXIA AND YPOGEGRAMMENI -upperMapping '\x1f85' s = Yield '\x1f0d' (CC s '\x0399' '\x0000') --- GREEK SMALL LETTER ALPHA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI -upperMapping '\x1f86' s = Yield '\x1f0e' (CC s '\x0399' '\x0000') --- GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI -upperMapping '\x1f87' s = Yield '\x1f0f' (CC s '\x0399' '\x0000') --- GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI -upperMapping '\x1f88' s = Yield '\x1f08' 
(CC s '\x0399' '\x0000') --- GREEK CAPITAL LETTER ALPHA WITH DASIA AND PROSGEGRAMMENI -upperMapping '\x1f89' s = Yield '\x1f09' (CC s '\x0399' '\x0000') --- GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA AND PROSGEGRAMMENI -upperMapping '\x1f8a' s = Yield '\x1f0a' (CC s '\x0399' '\x0000') --- GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA AND PROSGEGRAMMENI -upperMapping '\x1f8b' s = Yield '\x1f0b' (CC s '\x0399' '\x0000') --- GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA AND PROSGEGRAMMENI -upperMapping '\x1f8c' s = Yield '\x1f0c' (CC s '\x0399' '\x0000') --- GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA AND PROSGEGRAMMENI -upperMapping '\x1f8d' s = Yield '\x1f0d' (CC s '\x0399' '\x0000') --- GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI -upperMapping '\x1f8e' s = Yield '\x1f0e' (CC s '\x0399' '\x0000') --- GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI -upperMapping '\x1f8f' s = Yield '\x1f0f' (CC s '\x0399' '\x0000') --- GREEK SMALL LETTER ETA WITH PSILI AND YPOGEGRAMMENI -upperMapping '\x1f90' s = Yield '\x1f28' (CC s '\x0399' '\x0000') --- GREEK SMALL LETTER ETA WITH DASIA AND YPOGEGRAMMENI -upperMapping '\x1f91' s = Yield '\x1f29' (CC s '\x0399' '\x0000') --- GREEK SMALL LETTER ETA WITH PSILI AND VARIA AND YPOGEGRAMMENI -upperMapping '\x1f92' s = Yield '\x1f2a' (CC s '\x0399' '\x0000') --- GREEK SMALL LETTER ETA WITH DASIA AND VARIA AND YPOGEGRAMMENI -upperMapping '\x1f93' s = Yield '\x1f2b' (CC s '\x0399' '\x0000') --- GREEK SMALL LETTER ETA WITH PSILI AND OXIA AND YPOGEGRAMMENI -upperMapping '\x1f94' s = Yield '\x1f2c' (CC s '\x0399' '\x0000') --- GREEK SMALL LETTER ETA WITH DASIA AND OXIA AND YPOGEGRAMMENI -upperMapping '\x1f95' s = Yield '\x1f2d' (CC s '\x0399' '\x0000') --- GREEK SMALL LETTER ETA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI -upperMapping '\x1f96' s = Yield '\x1f2e' (CC s '\x0399' '\x0000') --- GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI -upperMapping '\x1f97' s = Yield '\x1f2f' (CC s '\x0399' '\x0000') --- GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI -upperMapping '\x1f98' s = Yield '\x1f28' (CC s '\x0399' '\x0000') --- GREEK CAPITAL LETTER ETA WITH DASIA AND PROSGEGRAMMENI -upperMapping '\x1f99' s = Yield '\x1f29' (CC s '\x0399' '\x0000') --- GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA AND PROSGEGRAMMENI -upperMapping '\x1f9a' s = Yield '\x1f2a' (CC s '\x0399' '\x0000') --- GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA AND PROSGEGRAMMENI -upperMapping '\x1f9b' s = Yield '\x1f2b' (CC s '\x0399' '\x0000') --- GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA AND PROSGEGRAMMENI -upperMapping '\x1f9c' s = Yield '\x1f2c' (CC s '\x0399' '\x0000') --- GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI -upperMapping '\x1f9d' s = Yield '\x1f2d' (CC s '\x0399' '\x0000') --- GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI -upperMapping '\x1f9e' s = Yield '\x1f2e' (CC s '\x0399' '\x0000') --- GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI -upperMapping '\x1f9f' s = Yield '\x1f2f' (CC s '\x0399' '\x0000') --- GREEK SMALL LETTER OMEGA WITH PSILI AND YPOGEGRAMMENI -upperMapping '\x1fa0' s = Yield '\x1f68' (CC s '\x0399' '\x0000') --- GREEK SMALL LETTER OMEGA WITH DASIA AND YPOGEGRAMMENI -upperMapping '\x1fa1' s = Yield '\x1f69' (CC s '\x0399' '\x0000') --- GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA AND YPOGEGRAMMENI -upperMapping '\x1fa2' s = Yield '\x1f6a' (CC s '\x0399' '\x0000') --- GREEK SMALL LETTER OMEGA WITH DASIA AND 
VARIA AND YPOGEGRAMMENI -upperMapping '\x1fa3' s = Yield '\x1f6b' (CC s '\x0399' '\x0000') --- GREEK SMALL LETTER OMEGA WITH PSILI AND OXIA AND YPOGEGRAMMENI -upperMapping '\x1fa4' s = Yield '\x1f6c' (CC s '\x0399' '\x0000') --- GREEK SMALL LETTER OMEGA WITH DASIA AND OXIA AND YPOGEGRAMMENI -upperMapping '\x1fa5' s = Yield '\x1f6d' (CC s '\x0399' '\x0000') --- GREEK SMALL LETTER OMEGA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI -upperMapping '\x1fa6' s = Yield '\x1f6e' (CC s '\x0399' '\x0000') --- GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI -upperMapping '\x1fa7' s = Yield '\x1f6f' (CC s '\x0399' '\x0000') --- GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI -upperMapping '\x1fa8' s = Yield '\x1f68' (CC s '\x0399' '\x0000') --- GREEK CAPITAL LETTER OMEGA WITH DASIA AND PROSGEGRAMMENI -upperMapping '\x1fa9' s = Yield '\x1f69' (CC s '\x0399' '\x0000') --- GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA AND PROSGEGRAMMENI -upperMapping '\x1faa' s = Yield '\x1f6a' (CC s '\x0399' '\x0000') --- GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA AND PROSGEGRAMMENI -upperMapping '\x1fab' s = Yield '\x1f6b' (CC s '\x0399' '\x0000') --- GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA AND PROSGEGRAMMENI -upperMapping '\x1fac' s = Yield '\x1f6c' (CC s '\x0399' '\x0000') --- GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA AND PROSGEGRAMMENI -upperMapping '\x1fad' s = Yield '\x1f6d' (CC s '\x0399' '\x0000') --- GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI -upperMapping '\x1fae' s = Yield '\x1f6e' (CC s '\x0399' '\x0000') --- GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI -upperMapping '\x1faf' s = Yield '\x1f6f' (CC s '\x0399' '\x0000') --- GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI -upperMapping '\x1fb3' s = Yield '\x0391' (CC s '\x0399' '\x0000') --- GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI -upperMapping '\x1fbc' s = Yield '\x0391' (CC s '\x0399' '\x0000') --- GREEK SMALL LETTER ETA WITH YPOGEGRAMMENI -upperMapping '\x1fc3' s = Yield '\x0397' (CC s '\x0399' '\x0000') --- GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI -upperMapping '\x1fcc' s = Yield '\x0397' (CC s '\x0399' '\x0000') --- GREEK SMALL LETTER OMEGA WITH YPOGEGRAMMENI -upperMapping '\x1ff3' s = Yield '\x03a9' (CC s '\x0399' '\x0000') --- GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI -upperMapping '\x1ffc' s = Yield '\x03a9' (CC s '\x0399' '\x0000') --- GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI -upperMapping '\x1fb2' s = Yield '\x1fba' (CC s '\x0399' '\x0000') --- GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI -upperMapping '\x1fb4' s = Yield '\x0386' (CC s '\x0399' '\x0000') --- GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI -upperMapping '\x1fc2' s = Yield '\x1fca' (CC s '\x0399' '\x0000') --- GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI -upperMapping '\x1fc4' s = Yield '\x0389' (CC s '\x0399' '\x0000') --- GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI -upperMapping '\x1ff2' s = Yield '\x1ffa' (CC s '\x0399' '\x0000') --- GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI -upperMapping '\x1ff4' s = Yield '\x038f' (CC s '\x0399' '\x0000') --- GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI -upperMapping '\x1fb7' s = Yield '\x0391' (CC s '\x0342' '\x0399') --- GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI -upperMapping '\x1fc7' s = Yield '\x0397' (CC s '\x0342' '\x0399') --- GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI -upperMapping '\x1ff7' s = Yield '\x03a9' (CC s '\x0342' 
'\x0399') -upperMapping c s = Yield (toUpper c) (CC s '\0' '\0') -lowerMapping :: forall s. Char -> s -> Step (CC s) Char +upperMapping = \case + -- LATIN SMALL LETTER SHARP S + '\x00df'# -> unI64 174063699 + -- LATIN SMALL LIGATURE FF + '\xfb00'# -> unI64 146800710 + -- LATIN SMALL LIGATURE FI + '\xfb01'# -> unI64 153092166 + -- LATIN SMALL LIGATURE FL + '\xfb02'# -> unI64 159383622 + -- LATIN SMALL LIGATURE FFI + '\xfb03'# -> unI64 321057542111302 + -- LATIN SMALL LIGATURE FFL + '\xfb04'# -> unI64 334251681644614 + -- LATIN SMALL LIGATURE LONG S T + '\xfb05'# -> unI64 176160851 + -- LATIN SMALL LIGATURE ST + '\xfb06'# -> unI64 176160851 + -- ARMENIAN SMALL LIGATURE ECH YIWN + '\x0587'# -> unI64 2856322357 + -- ARMENIAN SMALL LIGATURE MEN NOW + '\xfb13'# -> unI64 2831156548 + -- ARMENIAN SMALL LIGATURE MEN ECH + '\xfb14'# -> unI64 2795504964 + -- ARMENIAN SMALL LIGATURE MEN INI + '\xfb15'# -> unI64 2808087876 + -- ARMENIAN SMALL LIGATURE VEW NOW + '\xfb16'# -> unI64 2831156558 + -- ARMENIAN SMALL LIGATURE MEN XEH + '\xfb17'# -> unI64 2812282180 + -- LATIN SMALL LETTER N PRECEDED BY APOSTROPHE + '\x0149'# -> unI64 163578556 + -- GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS + '\x0390'# -> unI64 3382099394429849 + -- GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS + '\x03b0'# -> unI64 3382099394429861 + -- LATIN SMALL LETTER J WITH CARON + '\x01f0'# -> unI64 1635778634 + -- LATIN SMALL LETTER H WITH LINE BELOW + '\x1e96'# -> unI64 1713373256 + -- LATIN SMALL LETTER T WITH DIAERESIS + '\x1e97'# -> unI64 1627390036 + -- LATIN SMALL LETTER W WITH RING ABOVE + '\x1e98'# -> unI64 1631584343 + -- LATIN SMALL LETTER Y WITH RING ABOVE + '\x1e99'# -> unI64 1631584345 + -- LATIN SMALL LETTER A WITH RIGHT HALF RING + '\x1e9a'# -> unI64 1472200769 + -- GREEK SMALL LETTER UPSILON WITH PSILI + '\x1f50'# -> unI64 1650459557 + -- GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA + '\x1f52'# -> unI64 3377701370987429 + -- GREEK SMALL LETTER UPSILON WITH PSILI AND OXIA + '\x1f54'# -> unI64 3382099417498533 + -- GREEK SMALL LETTER UPSILON WITH PSILI AND PERISPOMENI + '\x1f56'# -> unI64 3667972440720293 + -- GREEK SMALL LETTER ALPHA WITH PERISPOMENI + '\x1fb6'# -> unI64 1749025681 + -- GREEK SMALL LETTER ETA WITH PERISPOMENI + '\x1fc6'# -> unI64 1749025687 + -- GREEK SMALL LETTER IOTA WITH DIALYTIKA AND VARIA + '\x1fd2'# -> unI64 3377701347918745 + -- GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA + '\x1fd3'# -> unI64 3382099394429849 + -- GREEK SMALL LETTER IOTA WITH PERISPOMENI + '\x1fd6'# -> unI64 1749025689 + -- GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI + '\x1fd7'# -> unI64 3667972417651609 + -- GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND VARIA + '\x1fe2'# -> unI64 3377701347918757 + -- GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA + '\x1fe3'# -> unI64 3382099394429861 + -- GREEK SMALL LETTER RHO WITH PSILI + '\x1fe4'# -> unI64 1650459553 + -- GREEK SMALL LETTER UPSILON WITH PERISPOMENI + '\x1fe6'# -> unI64 1749025701 + -- GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND PERISPOMENI + '\x1fe7'# -> unI64 3667972417651621 + -- GREEK SMALL LETTER OMEGA WITH PERISPOMENI + '\x1ff6'# -> unI64 1749025705 + -- GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI + '\x1f80'# -> unI64 1931484936 + -- GREEK SMALL LETTER ALPHA WITH DASIA AND YPOGEGRAMMENI + '\x1f81'# -> unI64 1931484937 + -- GREEK SMALL LETTER ALPHA WITH PSILI AND VARIA AND YPOGEGRAMMENI + '\x1f82'# -> unI64 1931484938 + -- GREEK SMALL LETTER ALPHA WITH DASIA AND VARIA AND YPOGEGRAMMENI + '\x1f83'# -> unI64 1931484939 + -- 
GREEK SMALL LETTER ALPHA WITH PSILI AND OXIA AND YPOGEGRAMMENI + '\x1f84'# -> unI64 1931484940 + -- GREEK SMALL LETTER ALPHA WITH DASIA AND OXIA AND YPOGEGRAMMENI + '\x1f85'# -> unI64 1931484941 + -- GREEK SMALL LETTER ALPHA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI + '\x1f86'# -> unI64 1931484942 + -- GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI + '\x1f87'# -> unI64 1931484943 + -- GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI + '\x1f88'# -> unI64 1931484936 + -- GREEK CAPITAL LETTER ALPHA WITH DASIA AND PROSGEGRAMMENI + '\x1f89'# -> unI64 1931484937 + -- GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA AND PROSGEGRAMMENI + '\x1f8a'# -> unI64 1931484938 + -- GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA AND PROSGEGRAMMENI + '\x1f8b'# -> unI64 1931484939 + -- GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA AND PROSGEGRAMMENI + '\x1f8c'# -> unI64 1931484940 + -- GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA AND PROSGEGRAMMENI + '\x1f8d'# -> unI64 1931484941 + -- GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI + '\x1f8e'# -> unI64 1931484942 + -- GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI + '\x1f8f'# -> unI64 1931484943 + -- GREEK SMALL LETTER ETA WITH PSILI AND YPOGEGRAMMENI + '\x1f90'# -> unI64 1931484968 + -- GREEK SMALL LETTER ETA WITH DASIA AND YPOGEGRAMMENI + '\x1f91'# -> unI64 1931484969 + -- GREEK SMALL LETTER ETA WITH PSILI AND VARIA AND YPOGEGRAMMENI + '\x1f92'# -> unI64 1931484970 + -- GREEK SMALL LETTER ETA WITH DASIA AND VARIA AND YPOGEGRAMMENI + '\x1f93'# -> unI64 1931484971 + -- GREEK SMALL LETTER ETA WITH PSILI AND OXIA AND YPOGEGRAMMENI + '\x1f94'# -> unI64 1931484972 + -- GREEK SMALL LETTER ETA WITH DASIA AND OXIA AND YPOGEGRAMMENI + '\x1f95'# -> unI64 1931484973 + -- GREEK SMALL LETTER ETA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI + '\x1f96'# -> unI64 1931484974 + -- GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI + '\x1f97'# -> unI64 1931484975 + -- GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI + '\x1f98'# -> unI64 1931484968 + -- GREEK CAPITAL LETTER ETA WITH DASIA AND PROSGEGRAMMENI + '\x1f99'# -> unI64 1931484969 + -- GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA AND PROSGEGRAMMENI + '\x1f9a'# -> unI64 1931484970 + -- GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA AND PROSGEGRAMMENI + '\x1f9b'# -> unI64 1931484971 + -- GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA AND PROSGEGRAMMENI + '\x1f9c'# -> unI64 1931484972 + -- GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI + '\x1f9d'# -> unI64 1931484973 + -- GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI + '\x1f9e'# -> unI64 1931484974 + -- GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI + '\x1f9f'# -> unI64 1931484975 + -- GREEK SMALL LETTER OMEGA WITH PSILI AND YPOGEGRAMMENI + '\x1fa0'# -> unI64 1931485032 + -- GREEK SMALL LETTER OMEGA WITH DASIA AND YPOGEGRAMMENI + '\x1fa1'# -> unI64 1931485033 + -- GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA AND YPOGEGRAMMENI + '\x1fa2'# -> unI64 1931485034 + -- GREEK SMALL LETTER OMEGA WITH DASIA AND VARIA AND YPOGEGRAMMENI + '\x1fa3'# -> unI64 1931485035 + -- GREEK SMALL LETTER OMEGA WITH PSILI AND OXIA AND YPOGEGRAMMENI + '\x1fa4'# -> unI64 1931485036 + -- GREEK SMALL LETTER OMEGA WITH DASIA AND OXIA AND YPOGEGRAMMENI + '\x1fa5'# -> unI64 1931485037 + -- GREEK SMALL LETTER OMEGA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI + '\x1fa6'# -> unI64 1931485038 + -- GREEK SMALL LETTER OMEGA WITH DASIA AND 
PERISPOMENI AND YPOGEGRAMMENI + '\x1fa7'# -> unI64 1931485039 + -- GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI + '\x1fa8'# -> unI64 1931485032 + -- GREEK CAPITAL LETTER OMEGA WITH DASIA AND PROSGEGRAMMENI + '\x1fa9'# -> unI64 1931485033 + -- GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA AND PROSGEGRAMMENI + '\x1faa'# -> unI64 1931485034 + -- GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA AND PROSGEGRAMMENI + '\x1fab'# -> unI64 1931485035 + -- GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA AND PROSGEGRAMMENI + '\x1fac'# -> unI64 1931485036 + -- GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA AND PROSGEGRAMMENI + '\x1fad'# -> unI64 1931485037 + -- GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI + '\x1fae'# -> unI64 1931485038 + -- GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI + '\x1faf'# -> unI64 1931485039 + -- GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI + '\x1fb3'# -> unI64 1931477905 + -- GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI + '\x1fbc'# -> unI64 1931477905 + -- GREEK SMALL LETTER ETA WITH YPOGEGRAMMENI + '\x1fc3'# -> unI64 1931477911 + -- GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI + '\x1fcc'# -> unI64 1931477911 + -- GREEK SMALL LETTER OMEGA WITH YPOGEGRAMMENI + '\x1ff3'# -> unI64 1931477929 + -- GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI + '\x1ffc'# -> unI64 1931477929 + -- GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI + '\x1fb2'# -> unI64 1931485114 + -- GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI + '\x1fb4'# -> unI64 1931477894 + -- GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI + '\x1fc2'# -> unI64 1931485130 + -- GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI + '\x1fc4'# -> unI64 1931477897 + -- GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI + '\x1ff2'# -> unI64 1931485178 + -- GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI + '\x1ff4'# -> unI64 1931477903 + -- GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI + '\x1fb7'# -> unI64 4050602585752465 + -- GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI + '\x1fc7'# -> unI64 4050602585752471 + -- GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI + '\x1ff7'# -> unI64 4050602585752489 + '\x0061'# -> unI64 65 + '\x0062'# -> unI64 66 + '\x0063'# -> unI64 67 + '\x0064'# -> unI64 68 + '\x0065'# -> unI64 69 + '\x0066'# -> unI64 70 + '\x0067'# -> unI64 71 + '\x0068'# -> unI64 72 + '\x0069'# -> unI64 73 + '\x006a'# -> unI64 74 + '\x006b'# -> unI64 75 + '\x006c'# -> unI64 76 + '\x006d'# -> unI64 77 + '\x006e'# -> unI64 78 + '\x006f'# -> unI64 79 + '\x0070'# -> unI64 80 + '\x0071'# -> unI64 81 + '\x0072'# -> unI64 82 + '\x0073'# -> unI64 83 + '\x0074'# -> unI64 84 + '\x0075'# -> unI64 85 + '\x0076'# -> unI64 86 + '\x0077'# -> unI64 87 + '\x0078'# -> unI64 88 + '\x0079'# -> unI64 89 + '\x007a'# -> unI64 90 + '\x00b5'# -> unI64 924 + '\x00e0'# -> unI64 192 + '\x00e1'# -> unI64 193 + '\x00e2'# -> unI64 194 + '\x00e3'# -> unI64 195 + '\x00e4'# -> unI64 196 + '\x00e5'# -> unI64 197 + '\x00e6'# -> unI64 198 + '\x00e7'# -> unI64 199 + '\x00e8'# -> unI64 200 + '\x00e9'# -> unI64 201 + '\x00ea'# -> unI64 202 + '\x00eb'# -> unI64 203 + '\x00ec'# -> unI64 204 + '\x00ed'# -> unI64 205 + '\x00ee'# -> unI64 206 + '\x00ef'# -> unI64 207 + '\x00f0'# -> unI64 208 + '\x00f1'# -> unI64 209 + '\x00f2'# -> unI64 210 + '\x00f3'# -> unI64 211 + '\x00f4'# -> unI64 212 + '\x00f5'# -> unI64 213 + '\x00f6'# -> unI64 214 + '\x00f8'# -> unI64 216 + '\x00f9'# -> unI64 217 + '\x00fa'# -> unI64 218 + '\x00fb'# -> unI64 219 + 
'\x00fc'# -> unI64 220 + '\x00fd'# -> unI64 221 + '\x00fe'# -> unI64 222 + '\x00ff'# -> unI64 376 + '\x0101'# -> unI64 256 + '\x0103'# -> unI64 258 + '\x0105'# -> unI64 260 + '\x0107'# -> unI64 262 + '\x0109'# -> unI64 264 + '\x010b'# -> unI64 266 + '\x010d'# -> unI64 268 + '\x010f'# -> unI64 270 + '\x0111'# -> unI64 272 + '\x0113'# -> unI64 274 + '\x0115'# -> unI64 276 + '\x0117'# -> unI64 278 + '\x0119'# -> unI64 280 + '\x011b'# -> unI64 282 + '\x011d'# -> unI64 284 + '\x011f'# -> unI64 286 + '\x0121'# -> unI64 288 + '\x0123'# -> unI64 290 + '\x0125'# -> unI64 292 + '\x0127'# -> unI64 294 + '\x0129'# -> unI64 296 + '\x012b'# -> unI64 298 + '\x012d'# -> unI64 300 + '\x012f'# -> unI64 302 + '\x0131'# -> unI64 73 + '\x0133'# -> unI64 306 + '\x0135'# -> unI64 308 + '\x0137'# -> unI64 310 + '\x013a'# -> unI64 313 + '\x013c'# -> unI64 315 + '\x013e'# -> unI64 317 + '\x0140'# -> unI64 319 + '\x0142'# -> unI64 321 + '\x0144'# -> unI64 323 + '\x0146'# -> unI64 325 + '\x0148'# -> unI64 327 + '\x014b'# -> unI64 330 + '\x014d'# -> unI64 332 + '\x014f'# -> unI64 334 + '\x0151'# -> unI64 336 + '\x0153'# -> unI64 338 + '\x0155'# -> unI64 340 + '\x0157'# -> unI64 342 + '\x0159'# -> unI64 344 + '\x015b'# -> unI64 346 + '\x015d'# -> unI64 348 + '\x015f'# -> unI64 350 + '\x0161'# -> unI64 352 + '\x0163'# -> unI64 354 + '\x0165'# -> unI64 356 + '\x0167'# -> unI64 358 + '\x0169'# -> unI64 360 + '\x016b'# -> unI64 362 + '\x016d'# -> unI64 364 + '\x016f'# -> unI64 366 + '\x0171'# -> unI64 368 + '\x0173'# -> unI64 370 + '\x0175'# -> unI64 372 + '\x0177'# -> unI64 374 + '\x017a'# -> unI64 377 + '\x017c'# -> unI64 379 + '\x017e'# -> unI64 381 + '\x017f'# -> unI64 83 + '\x0180'# -> unI64 579 + '\x0183'# -> unI64 386 + '\x0185'# -> unI64 388 + '\x0188'# -> unI64 391 + '\x018c'# -> unI64 395 + '\x0192'# -> unI64 401 + '\x0195'# -> unI64 502 + '\x0199'# -> unI64 408 + '\x019a'# -> unI64 573 + '\x019e'# -> unI64 544 + '\x01a1'# -> unI64 416 + '\x01a3'# -> unI64 418 + '\x01a5'# -> unI64 420 + '\x01a8'# -> unI64 423 + '\x01ad'# -> unI64 428 + '\x01b0'# -> unI64 431 + '\x01b4'# -> unI64 435 + '\x01b6'# -> unI64 437 + '\x01b9'# -> unI64 440 + '\x01bd'# -> unI64 444 + '\x01bf'# -> unI64 503 + '\x01c5'# -> unI64 452 + '\x01c6'# -> unI64 452 + '\x01c8'# -> unI64 455 + '\x01c9'# -> unI64 455 + '\x01cb'# -> unI64 458 + '\x01cc'# -> unI64 458 + '\x01ce'# -> unI64 461 + '\x01d0'# -> unI64 463 + '\x01d2'# -> unI64 465 + '\x01d4'# -> unI64 467 + '\x01d6'# -> unI64 469 + '\x01d8'# -> unI64 471 + '\x01da'# -> unI64 473 + '\x01dc'# -> unI64 475 + '\x01dd'# -> unI64 398 + '\x01df'# -> unI64 478 + '\x01e1'# -> unI64 480 + '\x01e3'# -> unI64 482 + '\x01e5'# -> unI64 484 + '\x01e7'# -> unI64 486 + '\x01e9'# -> unI64 488 + '\x01eb'# -> unI64 490 + '\x01ed'# -> unI64 492 + '\x01ef'# -> unI64 494 + '\x01f2'# -> unI64 497 + '\x01f3'# -> unI64 497 + '\x01f5'# -> unI64 500 + '\x01f9'# -> unI64 504 + '\x01fb'# -> unI64 506 + '\x01fd'# -> unI64 508 + '\x01ff'# -> unI64 510 + '\x0201'# -> unI64 512 + '\x0203'# -> unI64 514 + '\x0205'# -> unI64 516 + '\x0207'# -> unI64 518 + '\x0209'# -> unI64 520 + '\x020b'# -> unI64 522 + '\x020d'# -> unI64 524 + '\x020f'# -> unI64 526 + '\x0211'# -> unI64 528 + '\x0213'# -> unI64 530 + '\x0215'# -> unI64 532 + '\x0217'# -> unI64 534 + '\x0219'# -> unI64 536 + '\x021b'# -> unI64 538 + '\x021d'# -> unI64 540 + '\x021f'# -> unI64 542 + '\x0223'# -> unI64 546 + '\x0225'# -> unI64 548 + '\x0227'# -> unI64 550 + '\x0229'# -> unI64 552 + '\x022b'# -> unI64 554 + '\x022d'# -> unI64 556 + '\x022f'# -> unI64 558 + 
'\x0231'# -> unI64 560 + '\x0233'# -> unI64 562 + '\x023c'# -> unI64 571 + '\x023f'# -> unI64 11390 + '\x0240'# -> unI64 11391 + '\x0242'# -> unI64 577 + '\x0247'# -> unI64 582 + '\x0249'# -> unI64 584 + '\x024b'# -> unI64 586 + '\x024d'# -> unI64 588 + '\x024f'# -> unI64 590 + '\x0250'# -> unI64 11375 + '\x0251'# -> unI64 11373 + '\x0252'# -> unI64 11376 + '\x0253'# -> unI64 385 + '\x0254'# -> unI64 390 + '\x0256'# -> unI64 393 + '\x0257'# -> unI64 394 + '\x0259'# -> unI64 399 + '\x025b'# -> unI64 400 + '\x025c'# -> unI64 42923 + '\x0260'# -> unI64 403 + '\x0261'# -> unI64 42924 + '\x0263'# -> unI64 404 + '\x0265'# -> unI64 42893 + '\x0266'# -> unI64 42922 + '\x0268'# -> unI64 407 + '\x0269'# -> unI64 406 + '\x026a'# -> unI64 42926 + '\x026b'# -> unI64 11362 + '\x026c'# -> unI64 42925 + '\x026f'# -> unI64 412 + '\x0271'# -> unI64 11374 + '\x0272'# -> unI64 413 + '\x0275'# -> unI64 415 + '\x027d'# -> unI64 11364 + '\x0280'# -> unI64 422 + '\x0282'# -> unI64 42949 + '\x0283'# -> unI64 425 + '\x0287'# -> unI64 42929 + '\x0288'# -> unI64 430 + '\x0289'# -> unI64 580 + '\x028a'# -> unI64 433 + '\x028b'# -> unI64 434 + '\x028c'# -> unI64 581 + '\x0292'# -> unI64 439 + '\x029d'# -> unI64 42930 + '\x029e'# -> unI64 42928 + '\x0345'# -> unI64 921 + '\x0371'# -> unI64 880 + '\x0373'# -> unI64 882 + '\x0377'# -> unI64 886 + '\x037b'# -> unI64 1021 + '\x037c'# -> unI64 1022 + '\x037d'# -> unI64 1023 + '\x03ac'# -> unI64 902 + '\x03ad'# -> unI64 904 + '\x03ae'# -> unI64 905 + '\x03af'# -> unI64 906 + '\x03b1'# -> unI64 913 + '\x03b2'# -> unI64 914 + '\x03b3'# -> unI64 915 + '\x03b4'# -> unI64 916 + '\x03b5'# -> unI64 917 + '\x03b6'# -> unI64 918 + '\x03b7'# -> unI64 919 + '\x03b8'# -> unI64 920 + '\x03b9'# -> unI64 921 + '\x03ba'# -> unI64 922 + '\x03bb'# -> unI64 923 + '\x03bc'# -> unI64 924 + '\x03bd'# -> unI64 925 + '\x03be'# -> unI64 926 + '\x03bf'# -> unI64 927 + '\x03c0'# -> unI64 928 + '\x03c1'# -> unI64 929 + '\x03c2'# -> unI64 931 + '\x03c3'# -> unI64 931 + '\x03c4'# -> unI64 932 + '\x03c5'# -> unI64 933 + '\x03c6'# -> unI64 934 + '\x03c7'# -> unI64 935 + '\x03c8'# -> unI64 936 + '\x03c9'# -> unI64 937 + '\x03ca'# -> unI64 938 + '\x03cb'# -> unI64 939 + '\x03cc'# -> unI64 908 + '\x03cd'# -> unI64 910 + '\x03ce'# -> unI64 911 + '\x03d0'# -> unI64 914 + '\x03d1'# -> unI64 920 + '\x03d5'# -> unI64 934 + '\x03d6'# -> unI64 928 + '\x03d7'# -> unI64 975 + '\x03d9'# -> unI64 984 + '\x03db'# -> unI64 986 + '\x03dd'# -> unI64 988 + '\x03df'# -> unI64 990 + '\x03e1'# -> unI64 992 + '\x03e3'# -> unI64 994 + '\x03e5'# -> unI64 996 + '\x03e7'# -> unI64 998 + '\x03e9'# -> unI64 1000 + '\x03eb'# -> unI64 1002 + '\x03ed'# -> unI64 1004 + '\x03ef'# -> unI64 1006 + '\x03f0'# -> unI64 922 + '\x03f1'# -> unI64 929 + '\x03f2'# -> unI64 1017 + '\x03f3'# -> unI64 895 + '\x03f5'# -> unI64 917 + '\x03f8'# -> unI64 1015 + '\x03fb'# -> unI64 1018 + '\x0430'# -> unI64 1040 + '\x0431'# -> unI64 1041 + '\x0432'# -> unI64 1042 + '\x0433'# -> unI64 1043 + '\x0434'# -> unI64 1044 + '\x0435'# -> unI64 1045 + '\x0436'# -> unI64 1046 + '\x0437'# -> unI64 1047 + '\x0438'# -> unI64 1048 + '\x0439'# -> unI64 1049 + '\x043a'# -> unI64 1050 + '\x043b'# -> unI64 1051 + '\x043c'# -> unI64 1052 + '\x043d'# -> unI64 1053 + '\x043e'# -> unI64 1054 + '\x043f'# -> unI64 1055 + '\x0440'# -> unI64 1056 + '\x0441'# -> unI64 1057 + '\x0442'# -> unI64 1058 + '\x0443'# -> unI64 1059 + '\x0444'# -> unI64 1060 + '\x0445'# -> unI64 1061 + '\x0446'# -> unI64 1062 + '\x0447'# -> unI64 1063 + '\x0448'# -> unI64 1064 + '\x0449'# -> unI64 1065 + 
'\x044a'# -> unI64 1066 + '\x044b'# -> unI64 1067 + '\x044c'# -> unI64 1068 + '\x044d'# -> unI64 1069 + '\x044e'# -> unI64 1070 + '\x044f'# -> unI64 1071 + '\x0450'# -> unI64 1024 + '\x0451'# -> unI64 1025 + '\x0452'# -> unI64 1026 + '\x0453'# -> unI64 1027 + '\x0454'# -> unI64 1028 + '\x0455'# -> unI64 1029 + '\x0456'# -> unI64 1030 + '\x0457'# -> unI64 1031 + '\x0458'# -> unI64 1032 + '\x0459'# -> unI64 1033 + '\x045a'# -> unI64 1034 + '\x045b'# -> unI64 1035 + '\x045c'# -> unI64 1036 + '\x045d'# -> unI64 1037 + '\x045e'# -> unI64 1038 + '\x045f'# -> unI64 1039 + '\x0461'# -> unI64 1120 + '\x0463'# -> unI64 1122 + '\x0465'# -> unI64 1124 + '\x0467'# -> unI64 1126 + '\x0469'# -> unI64 1128 + '\x046b'# -> unI64 1130 + '\x046d'# -> unI64 1132 + '\x046f'# -> unI64 1134 + '\x0471'# -> unI64 1136 + '\x0473'# -> unI64 1138 + '\x0475'# -> unI64 1140 + '\x0477'# -> unI64 1142 + '\x0479'# -> unI64 1144 + '\x047b'# -> unI64 1146 + '\x047d'# -> unI64 1148 + '\x047f'# -> unI64 1150 + '\x0481'# -> unI64 1152 + '\x048b'# -> unI64 1162 + '\x048d'# -> unI64 1164 + '\x048f'# -> unI64 1166 + '\x0491'# -> unI64 1168 + '\x0493'# -> unI64 1170 + '\x0495'# -> unI64 1172 + '\x0497'# -> unI64 1174 + '\x0499'# -> unI64 1176 + '\x049b'# -> unI64 1178 + '\x049d'# -> unI64 1180 + '\x049f'# -> unI64 1182 + '\x04a1'# -> unI64 1184 + '\x04a3'# -> unI64 1186 + '\x04a5'# -> unI64 1188 + '\x04a7'# -> unI64 1190 + '\x04a9'# -> unI64 1192 + '\x04ab'# -> unI64 1194 + '\x04ad'# -> unI64 1196 + '\x04af'# -> unI64 1198 + '\x04b1'# -> unI64 1200 + '\x04b3'# -> unI64 1202 + '\x04b5'# -> unI64 1204 + '\x04b7'# -> unI64 1206 + '\x04b9'# -> unI64 1208 + '\x04bb'# -> unI64 1210 + '\x04bd'# -> unI64 1212 + '\x04bf'# -> unI64 1214 + '\x04c2'# -> unI64 1217 + '\x04c4'# -> unI64 1219 + '\x04c6'# -> unI64 1221 + '\x04c8'# -> unI64 1223 + '\x04ca'# -> unI64 1225 + '\x04cc'# -> unI64 1227 + '\x04ce'# -> unI64 1229 + '\x04cf'# -> unI64 1216 + '\x04d1'# -> unI64 1232 + '\x04d3'# -> unI64 1234 + '\x04d5'# -> unI64 1236 + '\x04d7'# -> unI64 1238 + '\x04d9'# -> unI64 1240 + '\x04db'# -> unI64 1242 + '\x04dd'# -> unI64 1244 + '\x04df'# -> unI64 1246 + '\x04e1'# -> unI64 1248 + '\x04e3'# -> unI64 1250 + '\x04e5'# -> unI64 1252 + '\x04e7'# -> unI64 1254 + '\x04e9'# -> unI64 1256 + '\x04eb'# -> unI64 1258 + '\x04ed'# -> unI64 1260 + '\x04ef'# -> unI64 1262 + '\x04f1'# -> unI64 1264 + '\x04f3'# -> unI64 1266 + '\x04f5'# -> unI64 1268 + '\x04f7'# -> unI64 1270 + '\x04f9'# -> unI64 1272 + '\x04fb'# -> unI64 1274 + '\x04fd'# -> unI64 1276 + '\x04ff'# -> unI64 1278 + '\x0501'# -> unI64 1280 + '\x0503'# -> unI64 1282 + '\x0505'# -> unI64 1284 + '\x0507'# -> unI64 1286 + '\x0509'# -> unI64 1288 + '\x050b'# -> unI64 1290 + '\x050d'# -> unI64 1292 + '\x050f'# -> unI64 1294 + '\x0511'# -> unI64 1296 + '\x0513'# -> unI64 1298 + '\x0515'# -> unI64 1300 + '\x0517'# -> unI64 1302 + '\x0519'# -> unI64 1304 + '\x051b'# -> unI64 1306 + '\x051d'# -> unI64 1308 + '\x051f'# -> unI64 1310 + '\x0521'# -> unI64 1312 + '\x0523'# -> unI64 1314 + '\x0525'# -> unI64 1316 + '\x0527'# -> unI64 1318 + '\x0529'# -> unI64 1320 + '\x052b'# -> unI64 1322 + '\x052d'# -> unI64 1324 + '\x052f'# -> unI64 1326 + '\x0561'# -> unI64 1329 + '\x0562'# -> unI64 1330 + '\x0563'# -> unI64 1331 + '\x0564'# -> unI64 1332 + '\x0565'# -> unI64 1333 + '\x0566'# -> unI64 1334 + '\x0567'# -> unI64 1335 + '\x0568'# -> unI64 1336 + '\x0569'# -> unI64 1337 + '\x056a'# -> unI64 1338 + '\x056b'# -> unI64 1339 + '\x056c'# -> unI64 1340 + '\x056d'# -> unI64 1341 + '\x056e'# -> unI64 1342 + '\x056f'# -> unI64 
1343 + '\x0570'# -> unI64 1344 + '\x0571'# -> unI64 1345 + '\x0572'# -> unI64 1346 + '\x0573'# -> unI64 1347 + '\x0574'# -> unI64 1348 + '\x0575'# -> unI64 1349 + '\x0576'# -> unI64 1350 + '\x0577'# -> unI64 1351 + '\x0578'# -> unI64 1352 + '\x0579'# -> unI64 1353 + '\x057a'# -> unI64 1354 + '\x057b'# -> unI64 1355 + '\x057c'# -> unI64 1356 + '\x057d'# -> unI64 1357 + '\x057e'# -> unI64 1358 + '\x057f'# -> unI64 1359 + '\x0580'# -> unI64 1360 + '\x0581'# -> unI64 1361 + '\x0582'# -> unI64 1362 + '\x0583'# -> unI64 1363 + '\x0584'# -> unI64 1364 + '\x0585'# -> unI64 1365 + '\x0586'# -> unI64 1366 + '\x10d0'# -> unI64 7312 + '\x10d1'# -> unI64 7313 + '\x10d2'# -> unI64 7314 + '\x10d3'# -> unI64 7315 + '\x10d4'# -> unI64 7316 + '\x10d5'# -> unI64 7317 + '\x10d6'# -> unI64 7318 + '\x10d7'# -> unI64 7319 + '\x10d8'# -> unI64 7320 + '\x10d9'# -> unI64 7321 + '\x10da'# -> unI64 7322 + '\x10db'# -> unI64 7323 + '\x10dc'# -> unI64 7324 + '\x10dd'# -> unI64 7325 + '\x10de'# -> unI64 7326 + '\x10df'# -> unI64 7327 + '\x10e0'# -> unI64 7328 + '\x10e1'# -> unI64 7329 + '\x10e2'# -> unI64 7330 + '\x10e3'# -> unI64 7331 + '\x10e4'# -> unI64 7332 + '\x10e5'# -> unI64 7333 + '\x10e6'# -> unI64 7334 + '\x10e7'# -> unI64 7335 + '\x10e8'# -> unI64 7336 + '\x10e9'# -> unI64 7337 + '\x10ea'# -> unI64 7338 + '\x10eb'# -> unI64 7339 + '\x10ec'# -> unI64 7340 + '\x10ed'# -> unI64 7341 + '\x10ee'# -> unI64 7342 + '\x10ef'# -> unI64 7343 + '\x10f0'# -> unI64 7344 + '\x10f1'# -> unI64 7345 + '\x10f2'# -> unI64 7346 + '\x10f3'# -> unI64 7347 + '\x10f4'# -> unI64 7348 + '\x10f5'# -> unI64 7349 + '\x10f6'# -> unI64 7350 + '\x10f7'# -> unI64 7351 + '\x10f8'# -> unI64 7352 + '\x10f9'# -> unI64 7353 + '\x10fa'# -> unI64 7354 + '\x10fd'# -> unI64 7357 + '\x10fe'# -> unI64 7358 + '\x10ff'# -> unI64 7359 + '\x13f8'# -> unI64 5104 + '\x13f9'# -> unI64 5105 + '\x13fa'# -> unI64 5106 + '\x13fb'# -> unI64 5107 + '\x13fc'# -> unI64 5108 + '\x13fd'# -> unI64 5109 + '\x1c80'# -> unI64 1042 + '\x1c81'# -> unI64 1044 + '\x1c82'# -> unI64 1054 + '\x1c83'# -> unI64 1057 + '\x1c84'# -> unI64 1058 + '\x1c85'# -> unI64 1058 + '\x1c86'# -> unI64 1066 + '\x1c87'# -> unI64 1122 + '\x1c88'# -> unI64 42570 + '\x1d79'# -> unI64 42877 + '\x1d7d'# -> unI64 11363 + '\x1d8e'# -> unI64 42950 + '\x1e01'# -> unI64 7680 + '\x1e03'# -> unI64 7682 + '\x1e05'# -> unI64 7684 + '\x1e07'# -> unI64 7686 + '\x1e09'# -> unI64 7688 + '\x1e0b'# -> unI64 7690 + '\x1e0d'# -> unI64 7692 + '\x1e0f'# -> unI64 7694 + '\x1e11'# -> unI64 7696 + '\x1e13'# -> unI64 7698 + '\x1e15'# -> unI64 7700 + '\x1e17'# -> unI64 7702 + '\x1e19'# -> unI64 7704 + '\x1e1b'# -> unI64 7706 + '\x1e1d'# -> unI64 7708 + '\x1e1f'# -> unI64 7710 + '\x1e21'# -> unI64 7712 + '\x1e23'# -> unI64 7714 + '\x1e25'# -> unI64 7716 + '\x1e27'# -> unI64 7718 + '\x1e29'# -> unI64 7720 + '\x1e2b'# -> unI64 7722 + '\x1e2d'# -> unI64 7724 + '\x1e2f'# -> unI64 7726 + '\x1e31'# -> unI64 7728 + '\x1e33'# -> unI64 7730 + '\x1e35'# -> unI64 7732 + '\x1e37'# -> unI64 7734 + '\x1e39'# -> unI64 7736 + '\x1e3b'# -> unI64 7738 + '\x1e3d'# -> unI64 7740 + '\x1e3f'# -> unI64 7742 + '\x1e41'# -> unI64 7744 + '\x1e43'# -> unI64 7746 + '\x1e45'# -> unI64 7748 + '\x1e47'# -> unI64 7750 + '\x1e49'# -> unI64 7752 + '\x1e4b'# -> unI64 7754 + '\x1e4d'# -> unI64 7756 + '\x1e4f'# -> unI64 7758 + '\x1e51'# -> unI64 7760 + '\x1e53'# -> unI64 7762 + '\x1e55'# -> unI64 7764 + '\x1e57'# -> unI64 7766 + '\x1e59'# -> unI64 7768 + '\x1e5b'# -> unI64 7770 + '\x1e5d'# -> unI64 7772 + '\x1e5f'# -> unI64 7774 + '\x1e61'# -> unI64 7776 + 
'\x1e63'# -> unI64 7778 + '\x1e65'# -> unI64 7780 + '\x1e67'# -> unI64 7782 + '\x1e69'# -> unI64 7784 + '\x1e6b'# -> unI64 7786 + '\x1e6d'# -> unI64 7788 + '\x1e6f'# -> unI64 7790 + '\x1e71'# -> unI64 7792 + '\x1e73'# -> unI64 7794 + '\x1e75'# -> unI64 7796 + '\x1e77'# -> unI64 7798 + '\x1e79'# -> unI64 7800 + '\x1e7b'# -> unI64 7802 + '\x1e7d'# -> unI64 7804 + '\x1e7f'# -> unI64 7806 + '\x1e81'# -> unI64 7808 + '\x1e83'# -> unI64 7810 + '\x1e85'# -> unI64 7812 + '\x1e87'# -> unI64 7814 + '\x1e89'# -> unI64 7816 + '\x1e8b'# -> unI64 7818 + '\x1e8d'# -> unI64 7820 + '\x1e8f'# -> unI64 7822 + '\x1e91'# -> unI64 7824 + '\x1e93'# -> unI64 7826 + '\x1e95'# -> unI64 7828 + '\x1e9b'# -> unI64 7776 + '\x1ea1'# -> unI64 7840 + '\x1ea3'# -> unI64 7842 + '\x1ea5'# -> unI64 7844 + '\x1ea7'# -> unI64 7846 + '\x1ea9'# -> unI64 7848 + '\x1eab'# -> unI64 7850 + '\x1ead'# -> unI64 7852 + '\x1eaf'# -> unI64 7854 + '\x1eb1'# -> unI64 7856 + '\x1eb3'# -> unI64 7858 + '\x1eb5'# -> unI64 7860 + '\x1eb7'# -> unI64 7862 + '\x1eb9'# -> unI64 7864 + '\x1ebb'# -> unI64 7866 + '\x1ebd'# -> unI64 7868 + '\x1ebf'# -> unI64 7870 + '\x1ec1'# -> unI64 7872 + '\x1ec3'# -> unI64 7874 + '\x1ec5'# -> unI64 7876 + '\x1ec7'# -> unI64 7878 + '\x1ec9'# -> unI64 7880 + '\x1ecb'# -> unI64 7882 + '\x1ecd'# -> unI64 7884 + '\x1ecf'# -> unI64 7886 + '\x1ed1'# -> unI64 7888 + '\x1ed3'# -> unI64 7890 + '\x1ed5'# -> unI64 7892 + '\x1ed7'# -> unI64 7894 + '\x1ed9'# -> unI64 7896 + '\x1edb'# -> unI64 7898 + '\x1edd'# -> unI64 7900 + '\x1edf'# -> unI64 7902 + '\x1ee1'# -> unI64 7904 + '\x1ee3'# -> unI64 7906 + '\x1ee5'# -> unI64 7908 + '\x1ee7'# -> unI64 7910 + '\x1ee9'# -> unI64 7912 + '\x1eeb'# -> unI64 7914 + '\x1eed'# -> unI64 7916 + '\x1eef'# -> unI64 7918 + '\x1ef1'# -> unI64 7920 + '\x1ef3'# -> unI64 7922 + '\x1ef5'# -> unI64 7924 + '\x1ef7'# -> unI64 7926 + '\x1ef9'# -> unI64 7928 + '\x1efb'# -> unI64 7930 + '\x1efd'# -> unI64 7932 + '\x1eff'# -> unI64 7934 + '\x1f00'# -> unI64 7944 + '\x1f01'# -> unI64 7945 + '\x1f02'# -> unI64 7946 + '\x1f03'# -> unI64 7947 + '\x1f04'# -> unI64 7948 + '\x1f05'# -> unI64 7949 + '\x1f06'# -> unI64 7950 + '\x1f07'# -> unI64 7951 + '\x1f10'# -> unI64 7960 + '\x1f11'# -> unI64 7961 + '\x1f12'# -> unI64 7962 + '\x1f13'# -> unI64 7963 + '\x1f14'# -> unI64 7964 + '\x1f15'# -> unI64 7965 + '\x1f20'# -> unI64 7976 + '\x1f21'# -> unI64 7977 + '\x1f22'# -> unI64 7978 + '\x1f23'# -> unI64 7979 + '\x1f24'# -> unI64 7980 + '\x1f25'# -> unI64 7981 + '\x1f26'# -> unI64 7982 + '\x1f27'# -> unI64 7983 + '\x1f30'# -> unI64 7992 + '\x1f31'# -> unI64 7993 + '\x1f32'# -> unI64 7994 + '\x1f33'# -> unI64 7995 + '\x1f34'# -> unI64 7996 + '\x1f35'# -> unI64 7997 + '\x1f36'# -> unI64 7998 + '\x1f37'# -> unI64 7999 + '\x1f40'# -> unI64 8008 + '\x1f41'# -> unI64 8009 + '\x1f42'# -> unI64 8010 + '\x1f43'# -> unI64 8011 + '\x1f44'# -> unI64 8012 + '\x1f45'# -> unI64 8013 + '\x1f51'# -> unI64 8025 + '\x1f53'# -> unI64 8027 + '\x1f55'# -> unI64 8029 + '\x1f57'# -> unI64 8031 + '\x1f60'# -> unI64 8040 + '\x1f61'# -> unI64 8041 + '\x1f62'# -> unI64 8042 + '\x1f63'# -> unI64 8043 + '\x1f64'# -> unI64 8044 + '\x1f65'# -> unI64 8045 + '\x1f66'# -> unI64 8046 + '\x1f67'# -> unI64 8047 + '\x1f70'# -> unI64 8122 + '\x1f71'# -> unI64 8123 + '\x1f72'# -> unI64 8136 + '\x1f73'# -> unI64 8137 + '\x1f74'# -> unI64 8138 + '\x1f75'# -> unI64 8139 + '\x1f76'# -> unI64 8154 + '\x1f77'# -> unI64 8155 + '\x1f78'# -> unI64 8184 + '\x1f79'# -> unI64 8185 + '\x1f7a'# -> unI64 8170 + '\x1f7b'# -> unI64 8171 + '\x1f7c'# -> unI64 8186 + '\x1f7d'# -> unI64 
8187 + '\x1fb0'# -> unI64 8120 + '\x1fb1'# -> unI64 8121 + '\x1fbe'# -> unI64 921 + '\x1fd0'# -> unI64 8152 + '\x1fd1'# -> unI64 8153 + '\x1fe0'# -> unI64 8168 + '\x1fe1'# -> unI64 8169 + '\x1fe5'# -> unI64 8172 + '\x214e'# -> unI64 8498 + '\x2170'# -> unI64 8544 + '\x2171'# -> unI64 8545 + '\x2172'# -> unI64 8546 + '\x2173'# -> unI64 8547 + '\x2174'# -> unI64 8548 + '\x2175'# -> unI64 8549 + '\x2176'# -> unI64 8550 + '\x2177'# -> unI64 8551 + '\x2178'# -> unI64 8552 + '\x2179'# -> unI64 8553 + '\x217a'# -> unI64 8554 + '\x217b'# -> unI64 8555 + '\x217c'# -> unI64 8556 + '\x217d'# -> unI64 8557 + '\x217e'# -> unI64 8558 + '\x217f'# -> unI64 8559 + '\x2184'# -> unI64 8579 + '\x24d0'# -> unI64 9398 + '\x24d1'# -> unI64 9399 + '\x24d2'# -> unI64 9400 + '\x24d3'# -> unI64 9401 + '\x24d4'# -> unI64 9402 + '\x24d5'# -> unI64 9403 + '\x24d6'# -> unI64 9404 + '\x24d7'# -> unI64 9405 + '\x24d8'# -> unI64 9406 + '\x24d9'# -> unI64 9407 + '\x24da'# -> unI64 9408 + '\x24db'# -> unI64 9409 + '\x24dc'# -> unI64 9410 + '\x24dd'# -> unI64 9411 + '\x24de'# -> unI64 9412 + '\x24df'# -> unI64 9413 + '\x24e0'# -> unI64 9414 + '\x24e1'# -> unI64 9415 + '\x24e2'# -> unI64 9416 + '\x24e3'# -> unI64 9417 + '\x24e4'# -> unI64 9418 + '\x24e5'# -> unI64 9419 + '\x24e6'# -> unI64 9420 + '\x24e7'# -> unI64 9421 + '\x24e8'# -> unI64 9422 + '\x24e9'# -> unI64 9423 + '\x2c30'# -> unI64 11264 + '\x2c31'# -> unI64 11265 + '\x2c32'# -> unI64 11266 + '\x2c33'# -> unI64 11267 + '\x2c34'# -> unI64 11268 + '\x2c35'# -> unI64 11269 + '\x2c36'# -> unI64 11270 + '\x2c37'# -> unI64 11271 + '\x2c38'# -> unI64 11272 + '\x2c39'# -> unI64 11273 + '\x2c3a'# -> unI64 11274 + '\x2c3b'# -> unI64 11275 + '\x2c3c'# -> unI64 11276 + '\x2c3d'# -> unI64 11277 + '\x2c3e'# -> unI64 11278 + '\x2c3f'# -> unI64 11279 + '\x2c40'# -> unI64 11280 + '\x2c41'# -> unI64 11281 + '\x2c42'# -> unI64 11282 + '\x2c43'# -> unI64 11283 + '\x2c44'# -> unI64 11284 + '\x2c45'# -> unI64 11285 + '\x2c46'# -> unI64 11286 + '\x2c47'# -> unI64 11287 + '\x2c48'# -> unI64 11288 + '\x2c49'# -> unI64 11289 + '\x2c4a'# -> unI64 11290 + '\x2c4b'# -> unI64 11291 + '\x2c4c'# -> unI64 11292 + '\x2c4d'# -> unI64 11293 + '\x2c4e'# -> unI64 11294 + '\x2c4f'# -> unI64 11295 + '\x2c50'# -> unI64 11296 + '\x2c51'# -> unI64 11297 + '\x2c52'# -> unI64 11298 + '\x2c53'# -> unI64 11299 + '\x2c54'# -> unI64 11300 + '\x2c55'# -> unI64 11301 + '\x2c56'# -> unI64 11302 + '\x2c57'# -> unI64 11303 + '\x2c58'# -> unI64 11304 + '\x2c59'# -> unI64 11305 + '\x2c5a'# -> unI64 11306 + '\x2c5b'# -> unI64 11307 + '\x2c5c'# -> unI64 11308 + '\x2c5d'# -> unI64 11309 + '\x2c5e'# -> unI64 11310 + '\x2c61'# -> unI64 11360 + '\x2c65'# -> unI64 570 + '\x2c66'# -> unI64 574 + '\x2c68'# -> unI64 11367 + '\x2c6a'# -> unI64 11369 + '\x2c6c'# -> unI64 11371 + '\x2c73'# -> unI64 11378 + '\x2c76'# -> unI64 11381 + '\x2c81'# -> unI64 11392 + '\x2c83'# -> unI64 11394 + '\x2c85'# -> unI64 11396 + '\x2c87'# -> unI64 11398 + '\x2c89'# -> unI64 11400 + '\x2c8b'# -> unI64 11402 + '\x2c8d'# -> unI64 11404 + '\x2c8f'# -> unI64 11406 + '\x2c91'# -> unI64 11408 + '\x2c93'# -> unI64 11410 + '\x2c95'# -> unI64 11412 + '\x2c97'# -> unI64 11414 + '\x2c99'# -> unI64 11416 + '\x2c9b'# -> unI64 11418 + '\x2c9d'# -> unI64 11420 + '\x2c9f'# -> unI64 11422 + '\x2ca1'# -> unI64 11424 + '\x2ca3'# -> unI64 11426 + '\x2ca5'# -> unI64 11428 + '\x2ca7'# -> unI64 11430 + '\x2ca9'# -> unI64 11432 + '\x2cab'# -> unI64 11434 + '\x2cad'# -> unI64 11436 + '\x2caf'# -> unI64 11438 + '\x2cb1'# -> unI64 11440 + '\x2cb3'# -> unI64 11442 + '\x2cb5'# -> 
unI64 11444 + '\x2cb7'# -> unI64 11446 + '\x2cb9'# -> unI64 11448 + '\x2cbb'# -> unI64 11450 + '\x2cbd'# -> unI64 11452 + '\x2cbf'# -> unI64 11454 + '\x2cc1'# -> unI64 11456 + '\x2cc3'# -> unI64 11458 + '\x2cc5'# -> unI64 11460 + '\x2cc7'# -> unI64 11462 + '\x2cc9'# -> unI64 11464 + '\x2ccb'# -> unI64 11466 + '\x2ccd'# -> unI64 11468 + '\x2ccf'# -> unI64 11470 + '\x2cd1'# -> unI64 11472 + '\x2cd3'# -> unI64 11474 + '\x2cd5'# -> unI64 11476 + '\x2cd7'# -> unI64 11478 + '\x2cd9'# -> unI64 11480 + '\x2cdb'# -> unI64 11482 + '\x2cdd'# -> unI64 11484 + '\x2cdf'# -> unI64 11486 + '\x2ce1'# -> unI64 11488 + '\x2ce3'# -> unI64 11490 + '\x2cec'# -> unI64 11499 + '\x2cee'# -> unI64 11501 + '\x2cf3'# -> unI64 11506 + '\x2d00'# -> unI64 4256 + '\x2d01'# -> unI64 4257 + '\x2d02'# -> unI64 4258 + '\x2d03'# -> unI64 4259 + '\x2d04'# -> unI64 4260 + '\x2d05'# -> unI64 4261 + '\x2d06'# -> unI64 4262 + '\x2d07'# -> unI64 4263 + '\x2d08'# -> unI64 4264 + '\x2d09'# -> unI64 4265 + '\x2d0a'# -> unI64 4266 + '\x2d0b'# -> unI64 4267 + '\x2d0c'# -> unI64 4268 + '\x2d0d'# -> unI64 4269 + '\x2d0e'# -> unI64 4270 + '\x2d0f'# -> unI64 4271 + '\x2d10'# -> unI64 4272 + '\x2d11'# -> unI64 4273 + '\x2d12'# -> unI64 4274 + '\x2d13'# -> unI64 4275 + '\x2d14'# -> unI64 4276 + '\x2d15'# -> unI64 4277 + '\x2d16'# -> unI64 4278 + '\x2d17'# -> unI64 4279 + '\x2d18'# -> unI64 4280 + '\x2d19'# -> unI64 4281 + '\x2d1a'# -> unI64 4282 + '\x2d1b'# -> unI64 4283 + '\x2d1c'# -> unI64 4284 + '\x2d1d'# -> unI64 4285 + '\x2d1e'# -> unI64 4286 + '\x2d1f'# -> unI64 4287 + '\x2d20'# -> unI64 4288 + '\x2d21'# -> unI64 4289 + '\x2d22'# -> unI64 4290 + '\x2d23'# -> unI64 4291 + '\x2d24'# -> unI64 4292 + '\x2d25'# -> unI64 4293 + '\x2d27'# -> unI64 4295 + '\x2d2d'# -> unI64 4301 + '\xa641'# -> unI64 42560 + '\xa643'# -> unI64 42562 + '\xa645'# -> unI64 42564 + '\xa647'# -> unI64 42566 + '\xa649'# -> unI64 42568 + '\xa64b'# -> unI64 42570 + '\xa64d'# -> unI64 42572 + '\xa64f'# -> unI64 42574 + '\xa651'# -> unI64 42576 + '\xa653'# -> unI64 42578 + '\xa655'# -> unI64 42580 + '\xa657'# -> unI64 42582 + '\xa659'# -> unI64 42584 + '\xa65b'# -> unI64 42586 + '\xa65d'# -> unI64 42588 + '\xa65f'# -> unI64 42590 + '\xa661'# -> unI64 42592 + '\xa663'# -> unI64 42594 + '\xa665'# -> unI64 42596 + '\xa667'# -> unI64 42598 + '\xa669'# -> unI64 42600 + '\xa66b'# -> unI64 42602 + '\xa66d'# -> unI64 42604 + '\xa681'# -> unI64 42624 + '\xa683'# -> unI64 42626 + '\xa685'# -> unI64 42628 + '\xa687'# -> unI64 42630 + '\xa689'# -> unI64 42632 + '\xa68b'# -> unI64 42634 + '\xa68d'# -> unI64 42636 + '\xa68f'# -> unI64 42638 + '\xa691'# -> unI64 42640 + '\xa693'# -> unI64 42642 + '\xa695'# -> unI64 42644 + '\xa697'# -> unI64 42646 + '\xa699'# -> unI64 42648 + '\xa69b'# -> unI64 42650 + '\xa723'# -> unI64 42786 + '\xa725'# -> unI64 42788 + '\xa727'# -> unI64 42790 + '\xa729'# -> unI64 42792 + '\xa72b'# -> unI64 42794 + '\xa72d'# -> unI64 42796 + '\xa72f'# -> unI64 42798 + '\xa733'# -> unI64 42802 + '\xa735'# -> unI64 42804 + '\xa737'# -> unI64 42806 + '\xa739'# -> unI64 42808 + '\xa73b'# -> unI64 42810 + '\xa73d'# -> unI64 42812 + '\xa73f'# -> unI64 42814 + '\xa741'# -> unI64 42816 + '\xa743'# -> unI64 42818 + '\xa745'# -> unI64 42820 + '\xa747'# -> unI64 42822 + '\xa749'# -> unI64 42824 + '\xa74b'# -> unI64 42826 + '\xa74d'# -> unI64 42828 + '\xa74f'# -> unI64 42830 + '\xa751'# -> unI64 42832 + '\xa753'# -> unI64 42834 + '\xa755'# -> unI64 42836 + '\xa757'# -> unI64 42838 + '\xa759'# -> unI64 42840 + '\xa75b'# -> unI64 42842 + '\xa75d'# -> unI64 42844 + '\xa75f'# -> 
unI64 42846 + '\xa761'# -> unI64 42848 + '\xa763'# -> unI64 42850 + '\xa765'# -> unI64 42852 + '\xa767'# -> unI64 42854 + '\xa769'# -> unI64 42856 + '\xa76b'# -> unI64 42858 + '\xa76d'# -> unI64 42860 + '\xa76f'# -> unI64 42862 + '\xa77a'# -> unI64 42873 + '\xa77c'# -> unI64 42875 + '\xa77f'# -> unI64 42878 + '\xa781'# -> unI64 42880 + '\xa783'# -> unI64 42882 + '\xa785'# -> unI64 42884 + '\xa787'# -> unI64 42886 + '\xa78c'# -> unI64 42891 + '\xa791'# -> unI64 42896 + '\xa793'# -> unI64 42898 + '\xa794'# -> unI64 42948 + '\xa797'# -> unI64 42902 + '\xa799'# -> unI64 42904 + '\xa79b'# -> unI64 42906 + '\xa79d'# -> unI64 42908 + '\xa79f'# -> unI64 42910 + '\xa7a1'# -> unI64 42912 + '\xa7a3'# -> unI64 42914 + '\xa7a5'# -> unI64 42916 + '\xa7a7'# -> unI64 42918 + '\xa7a9'# -> unI64 42920 + '\xa7b5'# -> unI64 42932 + '\xa7b7'# -> unI64 42934 + '\xa7b9'# -> unI64 42936 + '\xa7bb'# -> unI64 42938 + '\xa7bd'# -> unI64 42940 + '\xa7bf'# -> unI64 42942 + '\xa7c3'# -> unI64 42946 + '\xab53'# -> unI64 42931 + '\xab70'# -> unI64 5024 + '\xab71'# -> unI64 5025 + '\xab72'# -> unI64 5026 + '\xab73'# -> unI64 5027 + '\xab74'# -> unI64 5028 + '\xab75'# -> unI64 5029 + '\xab76'# -> unI64 5030 + '\xab77'# -> unI64 5031 + '\xab78'# -> unI64 5032 + '\xab79'# -> unI64 5033 + '\xab7a'# -> unI64 5034 + '\xab7b'# -> unI64 5035 + '\xab7c'# -> unI64 5036 + '\xab7d'# -> unI64 5037 + '\xab7e'# -> unI64 5038 + '\xab7f'# -> unI64 5039 + '\xab80'# -> unI64 5040 + '\xab81'# -> unI64 5041 + '\xab82'# -> unI64 5042 + '\xab83'# -> unI64 5043 + '\xab84'# -> unI64 5044 + '\xab85'# -> unI64 5045 + '\xab86'# -> unI64 5046 + '\xab87'# -> unI64 5047 + '\xab88'# -> unI64 5048 + '\xab89'# -> unI64 5049 + '\xab8a'# -> unI64 5050 + '\xab8b'# -> unI64 5051 + '\xab8c'# -> unI64 5052 + '\xab8d'# -> unI64 5053 + '\xab8e'# -> unI64 5054 + '\xab8f'# -> unI64 5055 + '\xab90'# -> unI64 5056 + '\xab91'# -> unI64 5057 + '\xab92'# -> unI64 5058 + '\xab93'# -> unI64 5059 + '\xab94'# -> unI64 5060 + '\xab95'# -> unI64 5061 + '\xab96'# -> unI64 5062 + '\xab97'# -> unI64 5063 + '\xab98'# -> unI64 5064 + '\xab99'# -> unI64 5065 + '\xab9a'# -> unI64 5066 + '\xab9b'# -> unI64 5067 + '\xab9c'# -> unI64 5068 + '\xab9d'# -> unI64 5069 + '\xab9e'# -> unI64 5070 + '\xab9f'# -> unI64 5071 + '\xaba0'# -> unI64 5072 + '\xaba1'# -> unI64 5073 + '\xaba2'# -> unI64 5074 + '\xaba3'# -> unI64 5075 + '\xaba4'# -> unI64 5076 + '\xaba5'# -> unI64 5077 + '\xaba6'# -> unI64 5078 + '\xaba7'# -> unI64 5079 + '\xaba8'# -> unI64 5080 + '\xaba9'# -> unI64 5081 + '\xabaa'# -> unI64 5082 + '\xabab'# -> unI64 5083 + '\xabac'# -> unI64 5084 + '\xabad'# -> unI64 5085 + '\xabae'# -> unI64 5086 + '\xabaf'# -> unI64 5087 + '\xabb0'# -> unI64 5088 + '\xabb1'# -> unI64 5089 + '\xabb2'# -> unI64 5090 + '\xabb3'# -> unI64 5091 + '\xabb4'# -> unI64 5092 + '\xabb5'# -> unI64 5093 + '\xabb6'# -> unI64 5094 + '\xabb7'# -> unI64 5095 + '\xabb8'# -> unI64 5096 + '\xabb9'# -> unI64 5097 + '\xabba'# -> unI64 5098 + '\xabbb'# -> unI64 5099 + '\xabbc'# -> unI64 5100 + '\xabbd'# -> unI64 5101 + '\xabbe'# -> unI64 5102 + '\xabbf'# -> unI64 5103 + '\xff41'# -> unI64 65313 + '\xff42'# -> unI64 65314 + '\xff43'# -> unI64 65315 + '\xff44'# -> unI64 65316 + '\xff45'# -> unI64 65317 + '\xff46'# -> unI64 65318 + '\xff47'# -> unI64 65319 + '\xff48'# -> unI64 65320 + '\xff49'# -> unI64 65321 + '\xff4a'# -> unI64 65322 + '\xff4b'# -> unI64 65323 + '\xff4c'# -> unI64 65324 + '\xff4d'# -> unI64 65325 + '\xff4e'# -> unI64 65326 + '\xff4f'# -> unI64 65327 + '\xff50'# -> unI64 65328 + '\xff51'# -> unI64 65329 + 
'\xff52'# -> unI64 65330 + '\xff53'# -> unI64 65331 + '\xff54'# -> unI64 65332 + '\xff55'# -> unI64 65333 + '\xff56'# -> unI64 65334 + '\xff57'# -> unI64 65335 + '\xff58'# -> unI64 65336 + '\xff59'# -> unI64 65337 + '\xff5a'# -> unI64 65338 + '\x10428'# -> unI64 66560 + '\x10429'# -> unI64 66561 + '\x1042a'# -> unI64 66562 + '\x1042b'# -> unI64 66563 + '\x1042c'# -> unI64 66564 + '\x1042d'# -> unI64 66565 + '\x1042e'# -> unI64 66566 + '\x1042f'# -> unI64 66567 + '\x10430'# -> unI64 66568 + '\x10431'# -> unI64 66569 + '\x10432'# -> unI64 66570 + '\x10433'# -> unI64 66571 + '\x10434'# -> unI64 66572 + '\x10435'# -> unI64 66573 + '\x10436'# -> unI64 66574 + '\x10437'# -> unI64 66575 + '\x10438'# -> unI64 66576 + '\x10439'# -> unI64 66577 + '\x1043a'# -> unI64 66578 + '\x1043b'# -> unI64 66579 + '\x1043c'# -> unI64 66580 + '\x1043d'# -> unI64 66581 + '\x1043e'# -> unI64 66582 + '\x1043f'# -> unI64 66583 + '\x10440'# -> unI64 66584 + '\x10441'# -> unI64 66585 + '\x10442'# -> unI64 66586 + '\x10443'# -> unI64 66587 + '\x10444'# -> unI64 66588 + '\x10445'# -> unI64 66589 + '\x10446'# -> unI64 66590 + '\x10447'# -> unI64 66591 + '\x10448'# -> unI64 66592 + '\x10449'# -> unI64 66593 + '\x1044a'# -> unI64 66594 + '\x1044b'# -> unI64 66595 + '\x1044c'# -> unI64 66596 + '\x1044d'# -> unI64 66597 + '\x1044e'# -> unI64 66598 + '\x1044f'# -> unI64 66599 + '\x104d8'# -> unI64 66736 + '\x104d9'# -> unI64 66737 + '\x104da'# -> unI64 66738 + '\x104db'# -> unI64 66739 + '\x104dc'# -> unI64 66740 + '\x104dd'# -> unI64 66741 + '\x104de'# -> unI64 66742 + '\x104df'# -> unI64 66743 + '\x104e0'# -> unI64 66744 + '\x104e1'# -> unI64 66745 + '\x104e2'# -> unI64 66746 + '\x104e3'# -> unI64 66747 + '\x104e4'# -> unI64 66748 + '\x104e5'# -> unI64 66749 + '\x104e6'# -> unI64 66750 + '\x104e7'# -> unI64 66751 + '\x104e8'# -> unI64 66752 + '\x104e9'# -> unI64 66753 + '\x104ea'# -> unI64 66754 + '\x104eb'# -> unI64 66755 + '\x104ec'# -> unI64 66756 + '\x104ed'# -> unI64 66757 + '\x104ee'# -> unI64 66758 + '\x104ef'# -> unI64 66759 + '\x104f0'# -> unI64 66760 + '\x104f1'# -> unI64 66761 + '\x104f2'# -> unI64 66762 + '\x104f3'# -> unI64 66763 + '\x104f4'# -> unI64 66764 + '\x104f5'# -> unI64 66765 + '\x104f6'# -> unI64 66766 + '\x104f7'# -> unI64 66767 + '\x104f8'# -> unI64 66768 + '\x104f9'# -> unI64 66769 + '\x104fa'# -> unI64 66770 + '\x104fb'# -> unI64 66771 + '\x10cc0'# -> unI64 68736 + '\x10cc1'# -> unI64 68737 + '\x10cc2'# -> unI64 68738 + '\x10cc3'# -> unI64 68739 + '\x10cc4'# -> unI64 68740 + '\x10cc5'# -> unI64 68741 + '\x10cc6'# -> unI64 68742 + '\x10cc7'# -> unI64 68743 + '\x10cc8'# -> unI64 68744 + '\x10cc9'# -> unI64 68745 + '\x10cca'# -> unI64 68746 + '\x10ccb'# -> unI64 68747 + '\x10ccc'# -> unI64 68748 + '\x10ccd'# -> unI64 68749 + '\x10cce'# -> unI64 68750 + '\x10ccf'# -> unI64 68751 + '\x10cd0'# -> unI64 68752 + '\x10cd1'# -> unI64 68753 + '\x10cd2'# -> unI64 68754 + '\x10cd3'# -> unI64 68755 + '\x10cd4'# -> unI64 68756 + '\x10cd5'# -> unI64 68757 + '\x10cd6'# -> unI64 68758 + '\x10cd7'# -> unI64 68759 + '\x10cd8'# -> unI64 68760 + '\x10cd9'# -> unI64 68761 + '\x10cda'# -> unI64 68762 + '\x10cdb'# -> unI64 68763 + '\x10cdc'# -> unI64 68764 + '\x10cdd'# -> unI64 68765 + '\x10cde'# -> unI64 68766 + '\x10cdf'# -> unI64 68767 + '\x10ce0'# -> unI64 68768 + '\x10ce1'# -> unI64 68769 + '\x10ce2'# -> unI64 68770 + '\x10ce3'# -> unI64 68771 + '\x10ce4'# -> unI64 68772 + '\x10ce5'# -> unI64 68773 + '\x10ce6'# -> unI64 68774 + '\x10ce7'# -> unI64 68775 + '\x10ce8'# -> unI64 68776 + '\x10ce9'# -> unI64 68777 + 
'\x10cea'# -> unI64 68778 + '\x10ceb'# -> unI64 68779 + '\x10cec'# -> unI64 68780 + '\x10ced'# -> unI64 68781 + '\x10cee'# -> unI64 68782 + '\x10cef'# -> unI64 68783 + '\x10cf0'# -> unI64 68784 + '\x10cf1'# -> unI64 68785 + '\x10cf2'# -> unI64 68786 + '\x118c0'# -> unI64 71840 + '\x118c1'# -> unI64 71841 + '\x118c2'# -> unI64 71842 + '\x118c3'# -> unI64 71843 + '\x118c4'# -> unI64 71844 + '\x118c5'# -> unI64 71845 + '\x118c6'# -> unI64 71846 + '\x118c7'# -> unI64 71847 + '\x118c8'# -> unI64 71848 + '\x118c9'# -> unI64 71849 + '\x118ca'# -> unI64 71850 + '\x118cb'# -> unI64 71851 + '\x118cc'# -> unI64 71852 + '\x118cd'# -> unI64 71853 + '\x118ce'# -> unI64 71854 + '\x118cf'# -> unI64 71855 + '\x118d0'# -> unI64 71856 + '\x118d1'# -> unI64 71857 + '\x118d2'# -> unI64 71858 + '\x118d3'# -> unI64 71859 + '\x118d4'# -> unI64 71860 + '\x118d5'# -> unI64 71861 + '\x118d6'# -> unI64 71862 + '\x118d7'# -> unI64 71863 + '\x118d8'# -> unI64 71864 + '\x118d9'# -> unI64 71865 + '\x118da'# -> unI64 71866 + '\x118db'# -> unI64 71867 + '\x118dc'# -> unI64 71868 + '\x118dd'# -> unI64 71869 + '\x118de'# -> unI64 71870 + '\x118df'# -> unI64 71871 + '\x16e60'# -> unI64 93760 + '\x16e61'# -> unI64 93761 + '\x16e62'# -> unI64 93762 + '\x16e63'# -> unI64 93763 + '\x16e64'# -> unI64 93764 + '\x16e65'# -> unI64 93765 + '\x16e66'# -> unI64 93766 + '\x16e67'# -> unI64 93767 + '\x16e68'# -> unI64 93768 + '\x16e69'# -> unI64 93769 + '\x16e6a'# -> unI64 93770 + '\x16e6b'# -> unI64 93771 + '\x16e6c'# -> unI64 93772 + '\x16e6d'# -> unI64 93773 + '\x16e6e'# -> unI64 93774 + '\x16e6f'# -> unI64 93775 + '\x16e70'# -> unI64 93776 + '\x16e71'# -> unI64 93777 + '\x16e72'# -> unI64 93778 + '\x16e73'# -> unI64 93779 + '\x16e74'# -> unI64 93780 + '\x16e75'# -> unI64 93781 + '\x16e76'# -> unI64 93782 + '\x16e77'# -> unI64 93783 + '\x16e78'# -> unI64 93784 + '\x16e79'# -> unI64 93785 + '\x16e7a'# -> unI64 93786 + '\x16e7b'# -> unI64 93787 + '\x16e7c'# -> unI64 93788 + '\x16e7d'# -> unI64 93789 + '\x16e7e'# -> unI64 93790 + '\x16e7f'# -> unI64 93791 + '\x1e922'# -> unI64 125184 + '\x1e923'# -> unI64 125185 + '\x1e924'# -> unI64 125186 + '\x1e925'# -> unI64 125187 + '\x1e926'# -> unI64 125188 + '\x1e927'# -> unI64 125189 + '\x1e928'# -> unI64 125190 + '\x1e929'# -> unI64 125191 + '\x1e92a'# -> unI64 125192 + '\x1e92b'# -> unI64 125193 + '\x1e92c'# -> unI64 125194 + '\x1e92d'# -> unI64 125195 + '\x1e92e'# -> unI64 125196 + '\x1e92f'# -> unI64 125197 + '\x1e930'# -> unI64 125198 + '\x1e931'# -> unI64 125199 + '\x1e932'# -> unI64 125200 + '\x1e933'# -> unI64 125201 + '\x1e934'# -> unI64 125202 + '\x1e935'# -> unI64 125203 + '\x1e936'# -> unI64 125204 + '\x1e937'# -> unI64 125205 + '\x1e938'# -> unI64 125206 + '\x1e939'# -> unI64 125207 + '\x1e93a'# -> unI64 125208 + '\x1e93b'# -> unI64 125209 + '\x1e93c'# -> unI64 125210 + '\x1e93d'# -> unI64 125211 + '\x1e93e'# -> unI64 125212 + '\x1e93f'# -> unI64 125213 + '\x1e940'# -> unI64 125214 + '\x1e941'# -> unI64 125215 + '\x1e942'# -> unI64 125216 + '\x1e943'# -> unI64 125217 + _ -> unI64 0 +lowerMapping :: Char# -> _ {- unboxed Int64 -} {-# NOINLINE lowerMapping #-} --- LATIN CAPITAL LETTER I WITH DOT ABOVE -lowerMapping '\x0130' s = Yield '\x0069' (CC s '\x0307' '\x0000') -lowerMapping c s = Yield (toLower c) (CC s '\0' '\0') -titleMapping :: forall s. 
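{- Editorial note (not part of this patch): the packed encoding used by
   upperMapping/lowerMapping above appears to store up to three Unicode code
   points, 21 bits each, in a single Int64 — the first code point in bits
   0-20, the second in bits 21-41, the third in bits 42-62 — with a zero
   field meaning "no further code points" and the catch-all `unI64 0`
   meaning "no special mapping".  This is inferred from the table values
   themselves, e.g. 174063699 for '\x00df' unpacks to "SS", and
   1931484936 for '\x1f80' unpacks to U+1F08 followed by U+0399.
   The decodeMapping sketch below is illustration only; the name and the
   standalone main are hypothetical and exist solely to show how such a
   packed value unfolds back into characters. -}

import Data.Bits (shiftR, (.&.))
import Data.Char (chr)
import Data.Int (Int64)

-- Unfold a packed Int64 into its (at most three) 21-bit code points,
-- dropping trailing zero fields.
decodeMapping :: Int64 -> String
decodeMapping n =
  [ chr (fromIntegral cp)
  | k <- [0, 21, 42]
  , let cp = (n `shiftR` k) .&. 0x1FFFFF
  , cp /= 0
  ]

main :: IO ()
main = do
  putStrLn (decodeMapping 174063699)        -- "SS", the uppercasing of U+00DF
  print    (decodeMapping 1931484936)       -- code points U+1F08, U+0399
  print    (decodeMapping 3382099394429849) -- code points U+0399, U+0308, U+0301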
Char -> s -> Step (CC s) Char +lowerMapping = \case + -- LATIN CAPITAL LETTER I WITH DOT ABOVE + '\x0130'# -> unI64 1625292905 + '\x0041'# -> unI64 97 + '\x0042'# -> unI64 98 + '\x0043'# -> unI64 99 + '\x0044'# -> unI64 100 + '\x0045'# -> unI64 101 + '\x0046'# -> unI64 102 + '\x0047'# -> unI64 103 + '\x0048'# -> unI64 104 + '\x0049'# -> unI64 105 + '\x004a'# -> unI64 106 + '\x004b'# -> unI64 107 + '\x004c'# -> unI64 108 + '\x004d'# -> unI64 109 + '\x004e'# -> unI64 110 + '\x004f'# -> unI64 111 + '\x0050'# -> unI64 112 + '\x0051'# -> unI64 113 + '\x0052'# -> unI64 114 + '\x0053'# -> unI64 115 + '\x0054'# -> unI64 116 + '\x0055'# -> unI64 117 + '\x0056'# -> unI64 118 + '\x0057'# -> unI64 119 + '\x0058'# -> unI64 120 + '\x0059'# -> unI64 121 + '\x005a'# -> unI64 122 + '\x00c0'# -> unI64 224 + '\x00c1'# -> unI64 225 + '\x00c2'# -> unI64 226 + '\x00c3'# -> unI64 227 + '\x00c4'# -> unI64 228 + '\x00c5'# -> unI64 229 + '\x00c6'# -> unI64 230 + '\x00c7'# -> unI64 231 + '\x00c8'# -> unI64 232 + '\x00c9'# -> unI64 233 + '\x00ca'# -> unI64 234 + '\x00cb'# -> unI64 235 + '\x00cc'# -> unI64 236 + '\x00cd'# -> unI64 237 + '\x00ce'# -> unI64 238 + '\x00cf'# -> unI64 239 + '\x00d0'# -> unI64 240 + '\x00d1'# -> unI64 241 + '\x00d2'# -> unI64 242 + '\x00d3'# -> unI64 243 + '\x00d4'# -> unI64 244 + '\x00d5'# -> unI64 245 + '\x00d6'# -> unI64 246 + '\x00d8'# -> unI64 248 + '\x00d9'# -> unI64 249 + '\x00da'# -> unI64 250 + '\x00db'# -> unI64 251 + '\x00dc'# -> unI64 252 + '\x00dd'# -> unI64 253 + '\x00de'# -> unI64 254 + '\x0100'# -> unI64 257 + '\x0102'# -> unI64 259 + '\x0104'# -> unI64 261 + '\x0106'# -> unI64 263 + '\x0108'# -> unI64 265 + '\x010a'# -> unI64 267 + '\x010c'# -> unI64 269 + '\x010e'# -> unI64 271 + '\x0110'# -> unI64 273 + '\x0112'# -> unI64 275 + '\x0114'# -> unI64 277 + '\x0116'# -> unI64 279 + '\x0118'# -> unI64 281 + '\x011a'# -> unI64 283 + '\x011c'# -> unI64 285 + '\x011e'# -> unI64 287 + '\x0120'# -> unI64 289 + '\x0122'# -> unI64 291 + '\x0124'# -> unI64 293 + '\x0126'# -> unI64 295 + '\x0128'# -> unI64 297 + '\x012a'# -> unI64 299 + '\x012c'# -> unI64 301 + '\x012e'# -> unI64 303 + '\x0132'# -> unI64 307 + '\x0134'# -> unI64 309 + '\x0136'# -> unI64 311 + '\x0139'# -> unI64 314 + '\x013b'# -> unI64 316 + '\x013d'# -> unI64 318 + '\x013f'# -> unI64 320 + '\x0141'# -> unI64 322 + '\x0143'# -> unI64 324 + '\x0145'# -> unI64 326 + '\x0147'# -> unI64 328 + '\x014a'# -> unI64 331 + '\x014c'# -> unI64 333 + '\x014e'# -> unI64 335 + '\x0150'# -> unI64 337 + '\x0152'# -> unI64 339 + '\x0154'# -> unI64 341 + '\x0156'# -> unI64 343 + '\x0158'# -> unI64 345 + '\x015a'# -> unI64 347 + '\x015c'# -> unI64 349 + '\x015e'# -> unI64 351 + '\x0160'# -> unI64 353 + '\x0162'# -> unI64 355 + '\x0164'# -> unI64 357 + '\x0166'# -> unI64 359 + '\x0168'# -> unI64 361 + '\x016a'# -> unI64 363 + '\x016c'# -> unI64 365 + '\x016e'# -> unI64 367 + '\x0170'# -> unI64 369 + '\x0172'# -> unI64 371 + '\x0174'# -> unI64 373 + '\x0176'# -> unI64 375 + '\x0178'# -> unI64 255 + '\x0179'# -> unI64 378 + '\x017b'# -> unI64 380 + '\x017d'# -> unI64 382 + '\x0181'# -> unI64 595 + '\x0182'# -> unI64 387 + '\x0184'# -> unI64 389 + '\x0186'# -> unI64 596 + '\x0187'# -> unI64 392 + '\x0189'# -> unI64 598 + '\x018a'# -> unI64 599 + '\x018b'# -> unI64 396 + '\x018e'# -> unI64 477 + '\x018f'# -> unI64 601 + '\x0190'# -> unI64 603 + '\x0191'# -> unI64 402 + '\x0193'# -> unI64 608 + '\x0194'# -> unI64 611 + '\x0196'# -> unI64 617 + '\x0197'# -> unI64 616 + '\x0198'# -> unI64 409 + '\x019c'# -> unI64 623 + '\x019d'# -> unI64 626 + 
'\x019f'# -> unI64 629 + '\x01a0'# -> unI64 417 + '\x01a2'# -> unI64 419 + '\x01a4'# -> unI64 421 + '\x01a6'# -> unI64 640 + '\x01a7'# -> unI64 424 + '\x01a9'# -> unI64 643 + '\x01ac'# -> unI64 429 + '\x01ae'# -> unI64 648 + '\x01af'# -> unI64 432 + '\x01b1'# -> unI64 650 + '\x01b2'# -> unI64 651 + '\x01b3'# -> unI64 436 + '\x01b5'# -> unI64 438 + '\x01b7'# -> unI64 658 + '\x01b8'# -> unI64 441 + '\x01bc'# -> unI64 445 + '\x01c4'# -> unI64 454 + '\x01c5'# -> unI64 454 + '\x01c7'# -> unI64 457 + '\x01c8'# -> unI64 457 + '\x01ca'# -> unI64 460 + '\x01cb'# -> unI64 460 + '\x01cd'# -> unI64 462 + '\x01cf'# -> unI64 464 + '\x01d1'# -> unI64 466 + '\x01d3'# -> unI64 468 + '\x01d5'# -> unI64 470 + '\x01d7'# -> unI64 472 + '\x01d9'# -> unI64 474 + '\x01db'# -> unI64 476 + '\x01de'# -> unI64 479 + '\x01e0'# -> unI64 481 + '\x01e2'# -> unI64 483 + '\x01e4'# -> unI64 485 + '\x01e6'# -> unI64 487 + '\x01e8'# -> unI64 489 + '\x01ea'# -> unI64 491 + '\x01ec'# -> unI64 493 + '\x01ee'# -> unI64 495 + '\x01f1'# -> unI64 499 + '\x01f2'# -> unI64 499 + '\x01f4'# -> unI64 501 + '\x01f6'# -> unI64 405 + '\x01f7'# -> unI64 447 + '\x01f8'# -> unI64 505 + '\x01fa'# -> unI64 507 + '\x01fc'# -> unI64 509 + '\x01fe'# -> unI64 511 + '\x0200'# -> unI64 513 + '\x0202'# -> unI64 515 + '\x0204'# -> unI64 517 + '\x0206'# -> unI64 519 + '\x0208'# -> unI64 521 + '\x020a'# -> unI64 523 + '\x020c'# -> unI64 525 + '\x020e'# -> unI64 527 + '\x0210'# -> unI64 529 + '\x0212'# -> unI64 531 + '\x0214'# -> unI64 533 + '\x0216'# -> unI64 535 + '\x0218'# -> unI64 537 + '\x021a'# -> unI64 539 + '\x021c'# -> unI64 541 + '\x021e'# -> unI64 543 + '\x0220'# -> unI64 414 + '\x0222'# -> unI64 547 + '\x0224'# -> unI64 549 + '\x0226'# -> unI64 551 + '\x0228'# -> unI64 553 + '\x022a'# -> unI64 555 + '\x022c'# -> unI64 557 + '\x022e'# -> unI64 559 + '\x0230'# -> unI64 561 + '\x0232'# -> unI64 563 + '\x023a'# -> unI64 11365 + '\x023b'# -> unI64 572 + '\x023d'# -> unI64 410 + '\x023e'# -> unI64 11366 + '\x0241'# -> unI64 578 + '\x0243'# -> unI64 384 + '\x0244'# -> unI64 649 + '\x0245'# -> unI64 652 + '\x0246'# -> unI64 583 + '\x0248'# -> unI64 585 + '\x024a'# -> unI64 587 + '\x024c'# -> unI64 589 + '\x024e'# -> unI64 591 + '\x0370'# -> unI64 881 + '\x0372'# -> unI64 883 + '\x0376'# -> unI64 887 + '\x037f'# -> unI64 1011 + '\x0386'# -> unI64 940 + '\x0388'# -> unI64 941 + '\x0389'# -> unI64 942 + '\x038a'# -> unI64 943 + '\x038c'# -> unI64 972 + '\x038e'# -> unI64 973 + '\x038f'# -> unI64 974 + '\x0391'# -> unI64 945 + '\x0392'# -> unI64 946 + '\x0393'# -> unI64 947 + '\x0394'# -> unI64 948 + '\x0395'# -> unI64 949 + '\x0396'# -> unI64 950 + '\x0397'# -> unI64 951 + '\x0398'# -> unI64 952 + '\x0399'# -> unI64 953 + '\x039a'# -> unI64 954 + '\x039b'# -> unI64 955 + '\x039c'# -> unI64 956 + '\x039d'# -> unI64 957 + '\x039e'# -> unI64 958 + '\x039f'# -> unI64 959 + '\x03a0'# -> unI64 960 + '\x03a1'# -> unI64 961 + '\x03a3'# -> unI64 963 + '\x03a4'# -> unI64 964 + '\x03a5'# -> unI64 965 + '\x03a6'# -> unI64 966 + '\x03a7'# -> unI64 967 + '\x03a8'# -> unI64 968 + '\x03a9'# -> unI64 969 + '\x03aa'# -> unI64 970 + '\x03ab'# -> unI64 971 + '\x03cf'# -> unI64 983 + '\x03d8'# -> unI64 985 + '\x03da'# -> unI64 987 + '\x03dc'# -> unI64 989 + '\x03de'# -> unI64 991 + '\x03e0'# -> unI64 993 + '\x03e2'# -> unI64 995 + '\x03e4'# -> unI64 997 + '\x03e6'# -> unI64 999 + '\x03e8'# -> unI64 1001 + '\x03ea'# -> unI64 1003 + '\x03ec'# -> unI64 1005 + '\x03ee'# -> unI64 1007 + '\x03f4'# -> unI64 952 + '\x03f7'# -> unI64 1016 + '\x03f9'# -> unI64 1010 + '\x03fa'# -> unI64 
1019 + '\x03fd'# -> unI64 891 + '\x03fe'# -> unI64 892 + '\x03ff'# -> unI64 893 + '\x0400'# -> unI64 1104 + '\x0401'# -> unI64 1105 + '\x0402'# -> unI64 1106 + '\x0403'# -> unI64 1107 + '\x0404'# -> unI64 1108 + '\x0405'# -> unI64 1109 + '\x0406'# -> unI64 1110 + '\x0407'# -> unI64 1111 + '\x0408'# -> unI64 1112 + '\x0409'# -> unI64 1113 + '\x040a'# -> unI64 1114 + '\x040b'# -> unI64 1115 + '\x040c'# -> unI64 1116 + '\x040d'# -> unI64 1117 + '\x040e'# -> unI64 1118 + '\x040f'# -> unI64 1119 + '\x0410'# -> unI64 1072 + '\x0411'# -> unI64 1073 + '\x0412'# -> unI64 1074 + '\x0413'# -> unI64 1075 + '\x0414'# -> unI64 1076 + '\x0415'# -> unI64 1077 + '\x0416'# -> unI64 1078 + '\x0417'# -> unI64 1079 + '\x0418'# -> unI64 1080 + '\x0419'# -> unI64 1081 + '\x041a'# -> unI64 1082 + '\x041b'# -> unI64 1083 + '\x041c'# -> unI64 1084 + '\x041d'# -> unI64 1085 + '\x041e'# -> unI64 1086 + '\x041f'# -> unI64 1087 + '\x0420'# -> unI64 1088 + '\x0421'# -> unI64 1089 + '\x0422'# -> unI64 1090 + '\x0423'# -> unI64 1091 + '\x0424'# -> unI64 1092 + '\x0425'# -> unI64 1093 + '\x0426'# -> unI64 1094 + '\x0427'# -> unI64 1095 + '\x0428'# -> unI64 1096 + '\x0429'# -> unI64 1097 + '\x042a'# -> unI64 1098 + '\x042b'# -> unI64 1099 + '\x042c'# -> unI64 1100 + '\x042d'# -> unI64 1101 + '\x042e'# -> unI64 1102 + '\x042f'# -> unI64 1103 + '\x0460'# -> unI64 1121 + '\x0462'# -> unI64 1123 + '\x0464'# -> unI64 1125 + '\x0466'# -> unI64 1127 + '\x0468'# -> unI64 1129 + '\x046a'# -> unI64 1131 + '\x046c'# -> unI64 1133 + '\x046e'# -> unI64 1135 + '\x0470'# -> unI64 1137 + '\x0472'# -> unI64 1139 + '\x0474'# -> unI64 1141 + '\x0476'# -> unI64 1143 + '\x0478'# -> unI64 1145 + '\x047a'# -> unI64 1147 + '\x047c'# -> unI64 1149 + '\x047e'# -> unI64 1151 + '\x0480'# -> unI64 1153 + '\x048a'# -> unI64 1163 + '\x048c'# -> unI64 1165 + '\x048e'# -> unI64 1167 + '\x0490'# -> unI64 1169 + '\x0492'# -> unI64 1171 + '\x0494'# -> unI64 1173 + '\x0496'# -> unI64 1175 + '\x0498'# -> unI64 1177 + '\x049a'# -> unI64 1179 + '\x049c'# -> unI64 1181 + '\x049e'# -> unI64 1183 + '\x04a0'# -> unI64 1185 + '\x04a2'# -> unI64 1187 + '\x04a4'# -> unI64 1189 + '\x04a6'# -> unI64 1191 + '\x04a8'# -> unI64 1193 + '\x04aa'# -> unI64 1195 + '\x04ac'# -> unI64 1197 + '\x04ae'# -> unI64 1199 + '\x04b0'# -> unI64 1201 + '\x04b2'# -> unI64 1203 + '\x04b4'# -> unI64 1205 + '\x04b6'# -> unI64 1207 + '\x04b8'# -> unI64 1209 + '\x04ba'# -> unI64 1211 + '\x04bc'# -> unI64 1213 + '\x04be'# -> unI64 1215 + '\x04c0'# -> unI64 1231 + '\x04c1'# -> unI64 1218 + '\x04c3'# -> unI64 1220 + '\x04c5'# -> unI64 1222 + '\x04c7'# -> unI64 1224 + '\x04c9'# -> unI64 1226 + '\x04cb'# -> unI64 1228 + '\x04cd'# -> unI64 1230 + '\x04d0'# -> unI64 1233 + '\x04d2'# -> unI64 1235 + '\x04d4'# -> unI64 1237 + '\x04d6'# -> unI64 1239 + '\x04d8'# -> unI64 1241 + '\x04da'# -> unI64 1243 + '\x04dc'# -> unI64 1245 + '\x04de'# -> unI64 1247 + '\x04e0'# -> unI64 1249 + '\x04e2'# -> unI64 1251 + '\x04e4'# -> unI64 1253 + '\x04e6'# -> unI64 1255 + '\x04e8'# -> unI64 1257 + '\x04ea'# -> unI64 1259 + '\x04ec'# -> unI64 1261 + '\x04ee'# -> unI64 1263 + '\x04f0'# -> unI64 1265 + '\x04f2'# -> unI64 1267 + '\x04f4'# -> unI64 1269 + '\x04f6'# -> unI64 1271 + '\x04f8'# -> unI64 1273 + '\x04fa'# -> unI64 1275 + '\x04fc'# -> unI64 1277 + '\x04fe'# -> unI64 1279 + '\x0500'# -> unI64 1281 + '\x0502'# -> unI64 1283 + '\x0504'# -> unI64 1285 + '\x0506'# -> unI64 1287 + '\x0508'# -> unI64 1289 + '\x050a'# -> unI64 1291 + '\x050c'# -> unI64 1293 + '\x050e'# -> unI64 1295 + '\x0510'# -> unI64 1297 + '\x0512'# -> 
unI64 1299 + '\x0514'# -> unI64 1301 + '\x0516'# -> unI64 1303 + '\x0518'# -> unI64 1305 + '\x051a'# -> unI64 1307 + '\x051c'# -> unI64 1309 + '\x051e'# -> unI64 1311 + '\x0520'# -> unI64 1313 + '\x0522'# -> unI64 1315 + '\x0524'# -> unI64 1317 + '\x0526'# -> unI64 1319 + '\x0528'# -> unI64 1321 + '\x052a'# -> unI64 1323 + '\x052c'# -> unI64 1325 + '\x052e'# -> unI64 1327 + '\x0531'# -> unI64 1377 + '\x0532'# -> unI64 1378 + '\x0533'# -> unI64 1379 + '\x0534'# -> unI64 1380 + '\x0535'# -> unI64 1381 + '\x0536'# -> unI64 1382 + '\x0537'# -> unI64 1383 + '\x0538'# -> unI64 1384 + '\x0539'# -> unI64 1385 + '\x053a'# -> unI64 1386 + '\x053b'# -> unI64 1387 + '\x053c'# -> unI64 1388 + '\x053d'# -> unI64 1389 + '\x053e'# -> unI64 1390 + '\x053f'# -> unI64 1391 + '\x0540'# -> unI64 1392 + '\x0541'# -> unI64 1393 + '\x0542'# -> unI64 1394 + '\x0543'# -> unI64 1395 + '\x0544'# -> unI64 1396 + '\x0545'# -> unI64 1397 + '\x0546'# -> unI64 1398 + '\x0547'# -> unI64 1399 + '\x0548'# -> unI64 1400 + '\x0549'# -> unI64 1401 + '\x054a'# -> unI64 1402 + '\x054b'# -> unI64 1403 + '\x054c'# -> unI64 1404 + '\x054d'# -> unI64 1405 + '\x054e'# -> unI64 1406 + '\x054f'# -> unI64 1407 + '\x0550'# -> unI64 1408 + '\x0551'# -> unI64 1409 + '\x0552'# -> unI64 1410 + '\x0553'# -> unI64 1411 + '\x0554'# -> unI64 1412 + '\x0555'# -> unI64 1413 + '\x0556'# -> unI64 1414 + '\x10a0'# -> unI64 11520 + '\x10a1'# -> unI64 11521 + '\x10a2'# -> unI64 11522 + '\x10a3'# -> unI64 11523 + '\x10a4'# -> unI64 11524 + '\x10a5'# -> unI64 11525 + '\x10a6'# -> unI64 11526 + '\x10a7'# -> unI64 11527 + '\x10a8'# -> unI64 11528 + '\x10a9'# -> unI64 11529 + '\x10aa'# -> unI64 11530 + '\x10ab'# -> unI64 11531 + '\x10ac'# -> unI64 11532 + '\x10ad'# -> unI64 11533 + '\x10ae'# -> unI64 11534 + '\x10af'# -> unI64 11535 + '\x10b0'# -> unI64 11536 + '\x10b1'# -> unI64 11537 + '\x10b2'# -> unI64 11538 + '\x10b3'# -> unI64 11539 + '\x10b4'# -> unI64 11540 + '\x10b5'# -> unI64 11541 + '\x10b6'# -> unI64 11542 + '\x10b7'# -> unI64 11543 + '\x10b8'# -> unI64 11544 + '\x10b9'# -> unI64 11545 + '\x10ba'# -> unI64 11546 + '\x10bb'# -> unI64 11547 + '\x10bc'# -> unI64 11548 + '\x10bd'# -> unI64 11549 + '\x10be'# -> unI64 11550 + '\x10bf'# -> unI64 11551 + '\x10c0'# -> unI64 11552 + '\x10c1'# -> unI64 11553 + '\x10c2'# -> unI64 11554 + '\x10c3'# -> unI64 11555 + '\x10c4'# -> unI64 11556 + '\x10c5'# -> unI64 11557 + '\x10c7'# -> unI64 11559 + '\x10cd'# -> unI64 11565 + '\x13a0'# -> unI64 43888 + '\x13a1'# -> unI64 43889 + '\x13a2'# -> unI64 43890 + '\x13a3'# -> unI64 43891 + '\x13a4'# -> unI64 43892 + '\x13a5'# -> unI64 43893 + '\x13a6'# -> unI64 43894 + '\x13a7'# -> unI64 43895 + '\x13a8'# -> unI64 43896 + '\x13a9'# -> unI64 43897 + '\x13aa'# -> unI64 43898 + '\x13ab'# -> unI64 43899 + '\x13ac'# -> unI64 43900 + '\x13ad'# -> unI64 43901 + '\x13ae'# -> unI64 43902 + '\x13af'# -> unI64 43903 + '\x13b0'# -> unI64 43904 + '\x13b1'# -> unI64 43905 + '\x13b2'# -> unI64 43906 + '\x13b3'# -> unI64 43907 + '\x13b4'# -> unI64 43908 + '\x13b5'# -> unI64 43909 + '\x13b6'# -> unI64 43910 + '\x13b7'# -> unI64 43911 + '\x13b8'# -> unI64 43912 + '\x13b9'# -> unI64 43913 + '\x13ba'# -> unI64 43914 + '\x13bb'# -> unI64 43915 + '\x13bc'# -> unI64 43916 + '\x13bd'# -> unI64 43917 + '\x13be'# -> unI64 43918 + '\x13bf'# -> unI64 43919 + '\x13c0'# -> unI64 43920 + '\x13c1'# -> unI64 43921 + '\x13c2'# -> unI64 43922 + '\x13c3'# -> unI64 43923 + '\x13c4'# -> unI64 43924 + '\x13c5'# -> unI64 43925 + '\x13c6'# -> unI64 43926 + '\x13c7'# -> unI64 43927 + '\x13c8'# -> unI64 43928 + 
'\x13c9'# -> unI64 43929 + '\x13ca'# -> unI64 43930 + '\x13cb'# -> unI64 43931 + '\x13cc'# -> unI64 43932 + '\x13cd'# -> unI64 43933 + '\x13ce'# -> unI64 43934 + '\x13cf'# -> unI64 43935 + '\x13d0'# -> unI64 43936 + '\x13d1'# -> unI64 43937 + '\x13d2'# -> unI64 43938 + '\x13d3'# -> unI64 43939 + '\x13d4'# -> unI64 43940 + '\x13d5'# -> unI64 43941 + '\x13d6'# -> unI64 43942 + '\x13d7'# -> unI64 43943 + '\x13d8'# -> unI64 43944 + '\x13d9'# -> unI64 43945 + '\x13da'# -> unI64 43946 + '\x13db'# -> unI64 43947 + '\x13dc'# -> unI64 43948 + '\x13dd'# -> unI64 43949 + '\x13de'# -> unI64 43950 + '\x13df'# -> unI64 43951 + '\x13e0'# -> unI64 43952 + '\x13e1'# -> unI64 43953 + '\x13e2'# -> unI64 43954 + '\x13e3'# -> unI64 43955 + '\x13e4'# -> unI64 43956 + '\x13e5'# -> unI64 43957 + '\x13e6'# -> unI64 43958 + '\x13e7'# -> unI64 43959 + '\x13e8'# -> unI64 43960 + '\x13e9'# -> unI64 43961 + '\x13ea'# -> unI64 43962 + '\x13eb'# -> unI64 43963 + '\x13ec'# -> unI64 43964 + '\x13ed'# -> unI64 43965 + '\x13ee'# -> unI64 43966 + '\x13ef'# -> unI64 43967 + '\x13f0'# -> unI64 5112 + '\x13f1'# -> unI64 5113 + '\x13f2'# -> unI64 5114 + '\x13f3'# -> unI64 5115 + '\x13f4'# -> unI64 5116 + '\x13f5'# -> unI64 5117 + '\x1c90'# -> unI64 4304 + '\x1c91'# -> unI64 4305 + '\x1c92'# -> unI64 4306 + '\x1c93'# -> unI64 4307 + '\x1c94'# -> unI64 4308 + '\x1c95'# -> unI64 4309 + '\x1c96'# -> unI64 4310 + '\x1c97'# -> unI64 4311 + '\x1c98'# -> unI64 4312 + '\x1c99'# -> unI64 4313 + '\x1c9a'# -> unI64 4314 + '\x1c9b'# -> unI64 4315 + '\x1c9c'# -> unI64 4316 + '\x1c9d'# -> unI64 4317 + '\x1c9e'# -> unI64 4318 + '\x1c9f'# -> unI64 4319 + '\x1ca0'# -> unI64 4320 + '\x1ca1'# -> unI64 4321 + '\x1ca2'# -> unI64 4322 + '\x1ca3'# -> unI64 4323 + '\x1ca4'# -> unI64 4324 + '\x1ca5'# -> unI64 4325 + '\x1ca6'# -> unI64 4326 + '\x1ca7'# -> unI64 4327 + '\x1ca8'# -> unI64 4328 + '\x1ca9'# -> unI64 4329 + '\x1caa'# -> unI64 4330 + '\x1cab'# -> unI64 4331 + '\x1cac'# -> unI64 4332 + '\x1cad'# -> unI64 4333 + '\x1cae'# -> unI64 4334 + '\x1caf'# -> unI64 4335 + '\x1cb0'# -> unI64 4336 + '\x1cb1'# -> unI64 4337 + '\x1cb2'# -> unI64 4338 + '\x1cb3'# -> unI64 4339 + '\x1cb4'# -> unI64 4340 + '\x1cb5'# -> unI64 4341 + '\x1cb6'# -> unI64 4342 + '\x1cb7'# -> unI64 4343 + '\x1cb8'# -> unI64 4344 + '\x1cb9'# -> unI64 4345 + '\x1cba'# -> unI64 4346 + '\x1cbd'# -> unI64 4349 + '\x1cbe'# -> unI64 4350 + '\x1cbf'# -> unI64 4351 + '\x1e00'# -> unI64 7681 + '\x1e02'# -> unI64 7683 + '\x1e04'# -> unI64 7685 + '\x1e06'# -> unI64 7687 + '\x1e08'# -> unI64 7689 + '\x1e0a'# -> unI64 7691 + '\x1e0c'# -> unI64 7693 + '\x1e0e'# -> unI64 7695 + '\x1e10'# -> unI64 7697 + '\x1e12'# -> unI64 7699 + '\x1e14'# -> unI64 7701 + '\x1e16'# -> unI64 7703 + '\x1e18'# -> unI64 7705 + '\x1e1a'# -> unI64 7707 + '\x1e1c'# -> unI64 7709 + '\x1e1e'# -> unI64 7711 + '\x1e20'# -> unI64 7713 + '\x1e22'# -> unI64 7715 + '\x1e24'# -> unI64 7717 + '\x1e26'# -> unI64 7719 + '\x1e28'# -> unI64 7721 + '\x1e2a'# -> unI64 7723 + '\x1e2c'# -> unI64 7725 + '\x1e2e'# -> unI64 7727 + '\x1e30'# -> unI64 7729 + '\x1e32'# -> unI64 7731 + '\x1e34'# -> unI64 7733 + '\x1e36'# -> unI64 7735 + '\x1e38'# -> unI64 7737 + '\x1e3a'# -> unI64 7739 + '\x1e3c'# -> unI64 7741 + '\x1e3e'# -> unI64 7743 + '\x1e40'# -> unI64 7745 + '\x1e42'# -> unI64 7747 + '\x1e44'# -> unI64 7749 + '\x1e46'# -> unI64 7751 + '\x1e48'# -> unI64 7753 + '\x1e4a'# -> unI64 7755 + '\x1e4c'# -> unI64 7757 + '\x1e4e'# -> unI64 7759 + '\x1e50'# -> unI64 7761 + '\x1e52'# -> unI64 7763 + '\x1e54'# -> unI64 7765 + '\x1e56'# -> unI64 7767 + 
'\x1e58'# -> unI64 7769 + '\x1e5a'# -> unI64 7771 + '\x1e5c'# -> unI64 7773 + '\x1e5e'# -> unI64 7775 + '\x1e60'# -> unI64 7777 + '\x1e62'# -> unI64 7779 + '\x1e64'# -> unI64 7781 + '\x1e66'# -> unI64 7783 + '\x1e68'# -> unI64 7785 + '\x1e6a'# -> unI64 7787 + '\x1e6c'# -> unI64 7789 + '\x1e6e'# -> unI64 7791 + '\x1e70'# -> unI64 7793 + '\x1e72'# -> unI64 7795 + '\x1e74'# -> unI64 7797 + '\x1e76'# -> unI64 7799 + '\x1e78'# -> unI64 7801 + '\x1e7a'# -> unI64 7803 + '\x1e7c'# -> unI64 7805 + '\x1e7e'# -> unI64 7807 + '\x1e80'# -> unI64 7809 + '\x1e82'# -> unI64 7811 + '\x1e84'# -> unI64 7813 + '\x1e86'# -> unI64 7815 + '\x1e88'# -> unI64 7817 + '\x1e8a'# -> unI64 7819 + '\x1e8c'# -> unI64 7821 + '\x1e8e'# -> unI64 7823 + '\x1e90'# -> unI64 7825 + '\x1e92'# -> unI64 7827 + '\x1e94'# -> unI64 7829 + '\x1e9e'# -> unI64 223 + '\x1ea0'# -> unI64 7841 + '\x1ea2'# -> unI64 7843 + '\x1ea4'# -> unI64 7845 + '\x1ea6'# -> unI64 7847 + '\x1ea8'# -> unI64 7849 + '\x1eaa'# -> unI64 7851 + '\x1eac'# -> unI64 7853 + '\x1eae'# -> unI64 7855 + '\x1eb0'# -> unI64 7857 + '\x1eb2'# -> unI64 7859 + '\x1eb4'# -> unI64 7861 + '\x1eb6'# -> unI64 7863 + '\x1eb8'# -> unI64 7865 + '\x1eba'# -> unI64 7867 + '\x1ebc'# -> unI64 7869 + '\x1ebe'# -> unI64 7871 + '\x1ec0'# -> unI64 7873 + '\x1ec2'# -> unI64 7875 + '\x1ec4'# -> unI64 7877 + '\x1ec6'# -> unI64 7879 + '\x1ec8'# -> unI64 7881 + '\x1eca'# -> unI64 7883 + '\x1ecc'# -> unI64 7885 + '\x1ece'# -> unI64 7887 + '\x1ed0'# -> unI64 7889 + '\x1ed2'# -> unI64 7891 + '\x1ed4'# -> unI64 7893 + '\x1ed6'# -> unI64 7895 + '\x1ed8'# -> unI64 7897 + '\x1eda'# -> unI64 7899 + '\x1edc'# -> unI64 7901 + '\x1ede'# -> unI64 7903 + '\x1ee0'# -> unI64 7905 + '\x1ee2'# -> unI64 7907 + '\x1ee4'# -> unI64 7909 + '\x1ee6'# -> unI64 7911 + '\x1ee8'# -> unI64 7913 + '\x1eea'# -> unI64 7915 + '\x1eec'# -> unI64 7917 + '\x1eee'# -> unI64 7919 + '\x1ef0'# -> unI64 7921 + '\x1ef2'# -> unI64 7923 + '\x1ef4'# -> unI64 7925 + '\x1ef6'# -> unI64 7927 + '\x1ef8'# -> unI64 7929 + '\x1efa'# -> unI64 7931 + '\x1efc'# -> unI64 7933 + '\x1efe'# -> unI64 7935 + '\x1f08'# -> unI64 7936 + '\x1f09'# -> unI64 7937 + '\x1f0a'# -> unI64 7938 + '\x1f0b'# -> unI64 7939 + '\x1f0c'# -> unI64 7940 + '\x1f0d'# -> unI64 7941 + '\x1f0e'# -> unI64 7942 + '\x1f0f'# -> unI64 7943 + '\x1f18'# -> unI64 7952 + '\x1f19'# -> unI64 7953 + '\x1f1a'# -> unI64 7954 + '\x1f1b'# -> unI64 7955 + '\x1f1c'# -> unI64 7956 + '\x1f1d'# -> unI64 7957 + '\x1f28'# -> unI64 7968 + '\x1f29'# -> unI64 7969 + '\x1f2a'# -> unI64 7970 + '\x1f2b'# -> unI64 7971 + '\x1f2c'# -> unI64 7972 + '\x1f2d'# -> unI64 7973 + '\x1f2e'# -> unI64 7974 + '\x1f2f'# -> unI64 7975 + '\x1f38'# -> unI64 7984 + '\x1f39'# -> unI64 7985 + '\x1f3a'# -> unI64 7986 + '\x1f3b'# -> unI64 7987 + '\x1f3c'# -> unI64 7988 + '\x1f3d'# -> unI64 7989 + '\x1f3e'# -> unI64 7990 + '\x1f3f'# -> unI64 7991 + '\x1f48'# -> unI64 8000 + '\x1f49'# -> unI64 8001 + '\x1f4a'# -> unI64 8002 + '\x1f4b'# -> unI64 8003 + '\x1f4c'# -> unI64 8004 + '\x1f4d'# -> unI64 8005 + '\x1f59'# -> unI64 8017 + '\x1f5b'# -> unI64 8019 + '\x1f5d'# -> unI64 8021 + '\x1f5f'# -> unI64 8023 + '\x1f68'# -> unI64 8032 + '\x1f69'# -> unI64 8033 + '\x1f6a'# -> unI64 8034 + '\x1f6b'# -> unI64 8035 + '\x1f6c'# -> unI64 8036 + '\x1f6d'# -> unI64 8037 + '\x1f6e'# -> unI64 8038 + '\x1f6f'# -> unI64 8039 + '\x1f88'# -> unI64 8064 + '\x1f89'# -> unI64 8065 + '\x1f8a'# -> unI64 8066 + '\x1f8b'# -> unI64 8067 + '\x1f8c'# -> unI64 8068 + '\x1f8d'# -> unI64 8069 + '\x1f8e'# -> unI64 8070 + '\x1f8f'# -> unI64 8071 + '\x1f98'# -> unI64 
8080 + '\x1f99'# -> unI64 8081 + '\x1f9a'# -> unI64 8082 + '\x1f9b'# -> unI64 8083 + '\x1f9c'# -> unI64 8084 + '\x1f9d'# -> unI64 8085 + '\x1f9e'# -> unI64 8086 + '\x1f9f'# -> unI64 8087 + '\x1fa8'# -> unI64 8096 + '\x1fa9'# -> unI64 8097 + '\x1faa'# -> unI64 8098 + '\x1fab'# -> unI64 8099 + '\x1fac'# -> unI64 8100 + '\x1fad'# -> unI64 8101 + '\x1fae'# -> unI64 8102 + '\x1faf'# -> unI64 8103 + '\x1fb8'# -> unI64 8112 + '\x1fb9'# -> unI64 8113 + '\x1fba'# -> unI64 8048 + '\x1fbb'# -> unI64 8049 + '\x1fbc'# -> unI64 8115 + '\x1fc8'# -> unI64 8050 + '\x1fc9'# -> unI64 8051 + '\x1fca'# -> unI64 8052 + '\x1fcb'# -> unI64 8053 + '\x1fcc'# -> unI64 8131 + '\x1fd8'# -> unI64 8144 + '\x1fd9'# -> unI64 8145 + '\x1fda'# -> unI64 8054 + '\x1fdb'# -> unI64 8055 + '\x1fe8'# -> unI64 8160 + '\x1fe9'# -> unI64 8161 + '\x1fea'# -> unI64 8058 + '\x1feb'# -> unI64 8059 + '\x1fec'# -> unI64 8165 + '\x1ff8'# -> unI64 8056 + '\x1ff9'# -> unI64 8057 + '\x1ffa'# -> unI64 8060 + '\x1ffb'# -> unI64 8061 + '\x1ffc'# -> unI64 8179 + '\x2126'# -> unI64 969 + '\x212a'# -> unI64 107 + '\x212b'# -> unI64 229 + '\x2132'# -> unI64 8526 + '\x2160'# -> unI64 8560 + '\x2161'# -> unI64 8561 + '\x2162'# -> unI64 8562 + '\x2163'# -> unI64 8563 + '\x2164'# -> unI64 8564 + '\x2165'# -> unI64 8565 + '\x2166'# -> unI64 8566 + '\x2167'# -> unI64 8567 + '\x2168'# -> unI64 8568 + '\x2169'# -> unI64 8569 + '\x216a'# -> unI64 8570 + '\x216b'# -> unI64 8571 + '\x216c'# -> unI64 8572 + '\x216d'# -> unI64 8573 + '\x216e'# -> unI64 8574 + '\x216f'# -> unI64 8575 + '\x2183'# -> unI64 8580 + '\x24b6'# -> unI64 9424 + '\x24b7'# -> unI64 9425 + '\x24b8'# -> unI64 9426 + '\x24b9'# -> unI64 9427 + '\x24ba'# -> unI64 9428 + '\x24bb'# -> unI64 9429 + '\x24bc'# -> unI64 9430 + '\x24bd'# -> unI64 9431 + '\x24be'# -> unI64 9432 + '\x24bf'# -> unI64 9433 + '\x24c0'# -> unI64 9434 + '\x24c1'# -> unI64 9435 + '\x24c2'# -> unI64 9436 + '\x24c3'# -> unI64 9437 + '\x24c4'# -> unI64 9438 + '\x24c5'# -> unI64 9439 + '\x24c6'# -> unI64 9440 + '\x24c7'# -> unI64 9441 + '\x24c8'# -> unI64 9442 + '\x24c9'# -> unI64 9443 + '\x24ca'# -> unI64 9444 + '\x24cb'# -> unI64 9445 + '\x24cc'# -> unI64 9446 + '\x24cd'# -> unI64 9447 + '\x24ce'# -> unI64 9448 + '\x24cf'# -> unI64 9449 + '\x2c00'# -> unI64 11312 + '\x2c01'# -> unI64 11313 + '\x2c02'# -> unI64 11314 + '\x2c03'# -> unI64 11315 + '\x2c04'# -> unI64 11316 + '\x2c05'# -> unI64 11317 + '\x2c06'# -> unI64 11318 + '\x2c07'# -> unI64 11319 + '\x2c08'# -> unI64 11320 + '\x2c09'# -> unI64 11321 + '\x2c0a'# -> unI64 11322 + '\x2c0b'# -> unI64 11323 + '\x2c0c'# -> unI64 11324 + '\x2c0d'# -> unI64 11325 + '\x2c0e'# -> unI64 11326 + '\x2c0f'# -> unI64 11327 + '\x2c10'# -> unI64 11328 + '\x2c11'# -> unI64 11329 + '\x2c12'# -> unI64 11330 + '\x2c13'# -> unI64 11331 + '\x2c14'# -> unI64 11332 + '\x2c15'# -> unI64 11333 + '\x2c16'# -> unI64 11334 + '\x2c17'# -> unI64 11335 + '\x2c18'# -> unI64 11336 + '\x2c19'# -> unI64 11337 + '\x2c1a'# -> unI64 11338 + '\x2c1b'# -> unI64 11339 + '\x2c1c'# -> unI64 11340 + '\x2c1d'# -> unI64 11341 + '\x2c1e'# -> unI64 11342 + '\x2c1f'# -> unI64 11343 + '\x2c20'# -> unI64 11344 + '\x2c21'# -> unI64 11345 + '\x2c22'# -> unI64 11346 + '\x2c23'# -> unI64 11347 + '\x2c24'# -> unI64 11348 + '\x2c25'# -> unI64 11349 + '\x2c26'# -> unI64 11350 + '\x2c27'# -> unI64 11351 + '\x2c28'# -> unI64 11352 + '\x2c29'# -> unI64 11353 + '\x2c2a'# -> unI64 11354 + '\x2c2b'# -> unI64 11355 + '\x2c2c'# -> unI64 11356 + '\x2c2d'# -> unI64 11357 + '\x2c2e'# -> unI64 11358 + '\x2c60'# -> unI64 11361 + '\x2c62'# -> unI64 
619 + '\x2c63'# -> unI64 7549 + '\x2c64'# -> unI64 637 + '\x2c67'# -> unI64 11368 + '\x2c69'# -> unI64 11370 + '\x2c6b'# -> unI64 11372 + '\x2c6d'# -> unI64 593 + '\x2c6e'# -> unI64 625 + '\x2c6f'# -> unI64 592 + '\x2c70'# -> unI64 594 + '\x2c72'# -> unI64 11379 + '\x2c75'# -> unI64 11382 + '\x2c7e'# -> unI64 575 + '\x2c7f'# -> unI64 576 + '\x2c80'# -> unI64 11393 + '\x2c82'# -> unI64 11395 + '\x2c84'# -> unI64 11397 + '\x2c86'# -> unI64 11399 + '\x2c88'# -> unI64 11401 + '\x2c8a'# -> unI64 11403 + '\x2c8c'# -> unI64 11405 + '\x2c8e'# -> unI64 11407 + '\x2c90'# -> unI64 11409 + '\x2c92'# -> unI64 11411 + '\x2c94'# -> unI64 11413 + '\x2c96'# -> unI64 11415 + '\x2c98'# -> unI64 11417 + '\x2c9a'# -> unI64 11419 + '\x2c9c'# -> unI64 11421 + '\x2c9e'# -> unI64 11423 + '\x2ca0'# -> unI64 11425 + '\x2ca2'# -> unI64 11427 + '\x2ca4'# -> unI64 11429 + '\x2ca6'# -> unI64 11431 + '\x2ca8'# -> unI64 11433 + '\x2caa'# -> unI64 11435 + '\x2cac'# -> unI64 11437 + '\x2cae'# -> unI64 11439 + '\x2cb0'# -> unI64 11441 + '\x2cb2'# -> unI64 11443 + '\x2cb4'# -> unI64 11445 + '\x2cb6'# -> unI64 11447 + '\x2cb8'# -> unI64 11449 + '\x2cba'# -> unI64 11451 + '\x2cbc'# -> unI64 11453 + '\x2cbe'# -> unI64 11455 + '\x2cc0'# -> unI64 11457 + '\x2cc2'# -> unI64 11459 + '\x2cc4'# -> unI64 11461 + '\x2cc6'# -> unI64 11463 + '\x2cc8'# -> unI64 11465 + '\x2cca'# -> unI64 11467 + '\x2ccc'# -> unI64 11469 + '\x2cce'# -> unI64 11471 + '\x2cd0'# -> unI64 11473 + '\x2cd2'# -> unI64 11475 + '\x2cd4'# -> unI64 11477 + '\x2cd6'# -> unI64 11479 + '\x2cd8'# -> unI64 11481 + '\x2cda'# -> unI64 11483 + '\x2cdc'# -> unI64 11485 + '\x2cde'# -> unI64 11487 + '\x2ce0'# -> unI64 11489 + '\x2ce2'# -> unI64 11491 + '\x2ceb'# -> unI64 11500 + '\x2ced'# -> unI64 11502 + '\x2cf2'# -> unI64 11507 + '\xa640'# -> unI64 42561 + '\xa642'# -> unI64 42563 + '\xa644'# -> unI64 42565 + '\xa646'# -> unI64 42567 + '\xa648'# -> unI64 42569 + '\xa64a'# -> unI64 42571 + '\xa64c'# -> unI64 42573 + '\xa64e'# -> unI64 42575 + '\xa650'# -> unI64 42577 + '\xa652'# -> unI64 42579 + '\xa654'# -> unI64 42581 + '\xa656'# -> unI64 42583 + '\xa658'# -> unI64 42585 + '\xa65a'# -> unI64 42587 + '\xa65c'# -> unI64 42589 + '\xa65e'# -> unI64 42591 + '\xa660'# -> unI64 42593 + '\xa662'# -> unI64 42595 + '\xa664'# -> unI64 42597 + '\xa666'# -> unI64 42599 + '\xa668'# -> unI64 42601 + '\xa66a'# -> unI64 42603 + '\xa66c'# -> unI64 42605 + '\xa680'# -> unI64 42625 + '\xa682'# -> unI64 42627 + '\xa684'# -> unI64 42629 + '\xa686'# -> unI64 42631 + '\xa688'# -> unI64 42633 + '\xa68a'# -> unI64 42635 + '\xa68c'# -> unI64 42637 + '\xa68e'# -> unI64 42639 + '\xa690'# -> unI64 42641 + '\xa692'# -> unI64 42643 + '\xa694'# -> unI64 42645 + '\xa696'# -> unI64 42647 + '\xa698'# -> unI64 42649 + '\xa69a'# -> unI64 42651 + '\xa722'# -> unI64 42787 + '\xa724'# -> unI64 42789 + '\xa726'# -> unI64 42791 + '\xa728'# -> unI64 42793 + '\xa72a'# -> unI64 42795 + '\xa72c'# -> unI64 42797 + '\xa72e'# -> unI64 42799 + '\xa732'# -> unI64 42803 + '\xa734'# -> unI64 42805 + '\xa736'# -> unI64 42807 + '\xa738'# -> unI64 42809 + '\xa73a'# -> unI64 42811 + '\xa73c'# -> unI64 42813 + '\xa73e'# -> unI64 42815 + '\xa740'# -> unI64 42817 + '\xa742'# -> unI64 42819 + '\xa744'# -> unI64 42821 + '\xa746'# -> unI64 42823 + '\xa748'# -> unI64 42825 + '\xa74a'# -> unI64 42827 + '\xa74c'# -> unI64 42829 + '\xa74e'# -> unI64 42831 + '\xa750'# -> unI64 42833 + '\xa752'# -> unI64 42835 + '\xa754'# -> unI64 42837 + '\xa756'# -> unI64 42839 + '\xa758'# -> unI64 42841 + '\xa75a'# -> unI64 42843 + '\xa75c'# -> unI64 42845 + 
'\xa75e'# -> unI64 42847 + '\xa760'# -> unI64 42849 + '\xa762'# -> unI64 42851 + '\xa764'# -> unI64 42853 + '\xa766'# -> unI64 42855 + '\xa768'# -> unI64 42857 + '\xa76a'# -> unI64 42859 + '\xa76c'# -> unI64 42861 + '\xa76e'# -> unI64 42863 + '\xa779'# -> unI64 42874 + '\xa77b'# -> unI64 42876 + '\xa77d'# -> unI64 7545 + '\xa77e'# -> unI64 42879 + '\xa780'# -> unI64 42881 + '\xa782'# -> unI64 42883 + '\xa784'# -> unI64 42885 + '\xa786'# -> unI64 42887 + '\xa78b'# -> unI64 42892 + '\xa78d'# -> unI64 613 + '\xa790'# -> unI64 42897 + '\xa792'# -> unI64 42899 + '\xa796'# -> unI64 42903 + '\xa798'# -> unI64 42905 + '\xa79a'# -> unI64 42907 + '\xa79c'# -> unI64 42909 + '\xa79e'# -> unI64 42911 + '\xa7a0'# -> unI64 42913 + '\xa7a2'# -> unI64 42915 + '\xa7a4'# -> unI64 42917 + '\xa7a6'# -> unI64 42919 + '\xa7a8'# -> unI64 42921 + '\xa7aa'# -> unI64 614 + '\xa7ab'# -> unI64 604 + '\xa7ac'# -> unI64 609 + '\xa7ad'# -> unI64 620 + '\xa7ae'# -> unI64 618 + '\xa7b0'# -> unI64 670 + '\xa7b1'# -> unI64 647 + '\xa7b2'# -> unI64 669 + '\xa7b3'# -> unI64 43859 + '\xa7b4'# -> unI64 42933 + '\xa7b6'# -> unI64 42935 + '\xa7b8'# -> unI64 42937 + '\xa7ba'# -> unI64 42939 + '\xa7bc'# -> unI64 42941 + '\xa7be'# -> unI64 42943 + '\xa7c2'# -> unI64 42947 + '\xa7c4'# -> unI64 42900 + '\xa7c5'# -> unI64 642 + '\xa7c6'# -> unI64 7566 + '\xff21'# -> unI64 65345 + '\xff22'# -> unI64 65346 + '\xff23'# -> unI64 65347 + '\xff24'# -> unI64 65348 + '\xff25'# -> unI64 65349 + '\xff26'# -> unI64 65350 + '\xff27'# -> unI64 65351 + '\xff28'# -> unI64 65352 + '\xff29'# -> unI64 65353 + '\xff2a'# -> unI64 65354 + '\xff2b'# -> unI64 65355 + '\xff2c'# -> unI64 65356 + '\xff2d'# -> unI64 65357 + '\xff2e'# -> unI64 65358 + '\xff2f'# -> unI64 65359 + '\xff30'# -> unI64 65360 + '\xff31'# -> unI64 65361 + '\xff32'# -> unI64 65362 + '\xff33'# -> unI64 65363 + '\xff34'# -> unI64 65364 + '\xff35'# -> unI64 65365 + '\xff36'# -> unI64 65366 + '\xff37'# -> unI64 65367 + '\xff38'# -> unI64 65368 + '\xff39'# -> unI64 65369 + '\xff3a'# -> unI64 65370 + '\x10400'# -> unI64 66600 + '\x10401'# -> unI64 66601 + '\x10402'# -> unI64 66602 + '\x10403'# -> unI64 66603 + '\x10404'# -> unI64 66604 + '\x10405'# -> unI64 66605 + '\x10406'# -> unI64 66606 + '\x10407'# -> unI64 66607 + '\x10408'# -> unI64 66608 + '\x10409'# -> unI64 66609 + '\x1040a'# -> unI64 66610 + '\x1040b'# -> unI64 66611 + '\x1040c'# -> unI64 66612 + '\x1040d'# -> unI64 66613 + '\x1040e'# -> unI64 66614 + '\x1040f'# -> unI64 66615 + '\x10410'# -> unI64 66616 + '\x10411'# -> unI64 66617 + '\x10412'# -> unI64 66618 + '\x10413'# -> unI64 66619 + '\x10414'# -> unI64 66620 + '\x10415'# -> unI64 66621 + '\x10416'# -> unI64 66622 + '\x10417'# -> unI64 66623 + '\x10418'# -> unI64 66624 + '\x10419'# -> unI64 66625 + '\x1041a'# -> unI64 66626 + '\x1041b'# -> unI64 66627 + '\x1041c'# -> unI64 66628 + '\x1041d'# -> unI64 66629 + '\x1041e'# -> unI64 66630 + '\x1041f'# -> unI64 66631 + '\x10420'# -> unI64 66632 + '\x10421'# -> unI64 66633 + '\x10422'# -> unI64 66634 + '\x10423'# -> unI64 66635 + '\x10424'# -> unI64 66636 + '\x10425'# -> unI64 66637 + '\x10426'# -> unI64 66638 + '\x10427'# -> unI64 66639 + '\x104b0'# -> unI64 66776 + '\x104b1'# -> unI64 66777 + '\x104b2'# -> unI64 66778 + '\x104b3'# -> unI64 66779 + '\x104b4'# -> unI64 66780 + '\x104b5'# -> unI64 66781 + '\x104b6'# -> unI64 66782 + '\x104b7'# -> unI64 66783 + '\x104b8'# -> unI64 66784 + '\x104b9'# -> unI64 66785 + '\x104ba'# -> unI64 66786 + '\x104bb'# -> unI64 66787 + '\x104bc'# -> unI64 66788 + '\x104bd'# -> unI64 66789 + '\x104be'# 
-> unI64 66790 + '\x104bf'# -> unI64 66791 + '\x104c0'# -> unI64 66792 + '\x104c1'# -> unI64 66793 + '\x104c2'# -> unI64 66794 + '\x104c3'# -> unI64 66795 + '\x104c4'# -> unI64 66796 + '\x104c5'# -> unI64 66797 + '\x104c6'# -> unI64 66798 + '\x104c7'# -> unI64 66799 + '\x104c8'# -> unI64 66800 + '\x104c9'# -> unI64 66801 + '\x104ca'# -> unI64 66802 + '\x104cb'# -> unI64 66803 + '\x104cc'# -> unI64 66804 + '\x104cd'# -> unI64 66805 + '\x104ce'# -> unI64 66806 + '\x104cf'# -> unI64 66807 + '\x104d0'# -> unI64 66808 + '\x104d1'# -> unI64 66809 + '\x104d2'# -> unI64 66810 + '\x104d3'# -> unI64 66811 + '\x10c80'# -> unI64 68800 + '\x10c81'# -> unI64 68801 + '\x10c82'# -> unI64 68802 + '\x10c83'# -> unI64 68803 + '\x10c84'# -> unI64 68804 + '\x10c85'# -> unI64 68805 + '\x10c86'# -> unI64 68806 + '\x10c87'# -> unI64 68807 + '\x10c88'# -> unI64 68808 + '\x10c89'# -> unI64 68809 + '\x10c8a'# -> unI64 68810 + '\x10c8b'# -> unI64 68811 + '\x10c8c'# -> unI64 68812 + '\x10c8d'# -> unI64 68813 + '\x10c8e'# -> unI64 68814 + '\x10c8f'# -> unI64 68815 + '\x10c90'# -> unI64 68816 + '\x10c91'# -> unI64 68817 + '\x10c92'# -> unI64 68818 + '\x10c93'# -> unI64 68819 + '\x10c94'# -> unI64 68820 + '\x10c95'# -> unI64 68821 + '\x10c96'# -> unI64 68822 + '\x10c97'# -> unI64 68823 + '\x10c98'# -> unI64 68824 + '\x10c99'# -> unI64 68825 + '\x10c9a'# -> unI64 68826 + '\x10c9b'# -> unI64 68827 + '\x10c9c'# -> unI64 68828 + '\x10c9d'# -> unI64 68829 + '\x10c9e'# -> unI64 68830 + '\x10c9f'# -> unI64 68831 + '\x10ca0'# -> unI64 68832 + '\x10ca1'# -> unI64 68833 + '\x10ca2'# -> unI64 68834 + '\x10ca3'# -> unI64 68835 + '\x10ca4'# -> unI64 68836 + '\x10ca5'# -> unI64 68837 + '\x10ca6'# -> unI64 68838 + '\x10ca7'# -> unI64 68839 + '\x10ca8'# -> unI64 68840 + '\x10ca9'# -> unI64 68841 + '\x10caa'# -> unI64 68842 + '\x10cab'# -> unI64 68843 + '\x10cac'# -> unI64 68844 + '\x10cad'# -> unI64 68845 + '\x10cae'# -> unI64 68846 + '\x10caf'# -> unI64 68847 + '\x10cb0'# -> unI64 68848 + '\x10cb1'# -> unI64 68849 + '\x10cb2'# -> unI64 68850 + '\x118a0'# -> unI64 71872 + '\x118a1'# -> unI64 71873 + '\x118a2'# -> unI64 71874 + '\x118a3'# -> unI64 71875 + '\x118a4'# -> unI64 71876 + '\x118a5'# -> unI64 71877 + '\x118a6'# -> unI64 71878 + '\x118a7'# -> unI64 71879 + '\x118a8'# -> unI64 71880 + '\x118a9'# -> unI64 71881 + '\x118aa'# -> unI64 71882 + '\x118ab'# -> unI64 71883 + '\x118ac'# -> unI64 71884 + '\x118ad'# -> unI64 71885 + '\x118ae'# -> unI64 71886 + '\x118af'# -> unI64 71887 + '\x118b0'# -> unI64 71888 + '\x118b1'# -> unI64 71889 + '\x118b2'# -> unI64 71890 + '\x118b3'# -> unI64 71891 + '\x118b4'# -> unI64 71892 + '\x118b5'# -> unI64 71893 + '\x118b6'# -> unI64 71894 + '\x118b7'# -> unI64 71895 + '\x118b8'# -> unI64 71896 + '\x118b9'# -> unI64 71897 + '\x118ba'# -> unI64 71898 + '\x118bb'# -> unI64 71899 + '\x118bc'# -> unI64 71900 + '\x118bd'# -> unI64 71901 + '\x118be'# -> unI64 71902 + '\x118bf'# -> unI64 71903 + '\x16e40'# -> unI64 93792 + '\x16e41'# -> unI64 93793 + '\x16e42'# -> unI64 93794 + '\x16e43'# -> unI64 93795 + '\x16e44'# -> unI64 93796 + '\x16e45'# -> unI64 93797 + '\x16e46'# -> unI64 93798 + '\x16e47'# -> unI64 93799 + '\x16e48'# -> unI64 93800 + '\x16e49'# -> unI64 93801 + '\x16e4a'# -> unI64 93802 + '\x16e4b'# -> unI64 93803 + '\x16e4c'# -> unI64 93804 + '\x16e4d'# -> unI64 93805 + '\x16e4e'# -> unI64 93806 + '\x16e4f'# -> unI64 93807 + '\x16e50'# -> unI64 93808 + '\x16e51'# -> unI64 93809 + '\x16e52'# -> unI64 93810 + '\x16e53'# -> unI64 93811 + '\x16e54'# -> unI64 93812 + '\x16e55'# -> unI64 93813 + 
'\x16e56'# -> unI64 93814 + '\x16e57'# -> unI64 93815 + '\x16e58'# -> unI64 93816 + '\x16e59'# -> unI64 93817 + '\x16e5a'# -> unI64 93818 + '\x16e5b'# -> unI64 93819 + '\x16e5c'# -> unI64 93820 + '\x16e5d'# -> unI64 93821 + '\x16e5e'# -> unI64 93822 + '\x16e5f'# -> unI64 93823 + '\x1e900'# -> unI64 125218 + '\x1e901'# -> unI64 125219 + '\x1e902'# -> unI64 125220 + '\x1e903'# -> unI64 125221 + '\x1e904'# -> unI64 125222 + '\x1e905'# -> unI64 125223 + '\x1e906'# -> unI64 125224 + '\x1e907'# -> unI64 125225 + '\x1e908'# -> unI64 125226 + '\x1e909'# -> unI64 125227 + '\x1e90a'# -> unI64 125228 + '\x1e90b'# -> unI64 125229 + '\x1e90c'# -> unI64 125230 + '\x1e90d'# -> unI64 125231 + '\x1e90e'# -> unI64 125232 + '\x1e90f'# -> unI64 125233 + '\x1e910'# -> unI64 125234 + '\x1e911'# -> unI64 125235 + '\x1e912'# -> unI64 125236 + '\x1e913'# -> unI64 125237 + '\x1e914'# -> unI64 125238 + '\x1e915'# -> unI64 125239 + '\x1e916'# -> unI64 125240 + '\x1e917'# -> unI64 125241 + '\x1e918'# -> unI64 125242 + '\x1e919'# -> unI64 125243 + '\x1e91a'# -> unI64 125244 + '\x1e91b'# -> unI64 125245 + '\x1e91c'# -> unI64 125246 + '\x1e91d'# -> unI64 125247 + '\x1e91e'# -> unI64 125248 + '\x1e91f'# -> unI64 125249 + '\x1e920'# -> unI64 125250 + '\x1e921'# -> unI64 125251 + _ -> unI64 0 +titleMapping :: Char# -> _ {- unboxed Int64 -} {-# NOINLINE titleMapping #-} --- LATIN SMALL LETTER SHARP S -titleMapping '\x00df' s = Yield '\x0053' (CC s '\x0073' '\x0000') --- LATIN SMALL LIGATURE FF -titleMapping '\xfb00' s = Yield '\x0046' (CC s '\x0066' '\x0000') --- LATIN SMALL LIGATURE FI -titleMapping '\xfb01' s = Yield '\x0046' (CC s '\x0069' '\x0000') --- LATIN SMALL LIGATURE FL -titleMapping '\xfb02' s = Yield '\x0046' (CC s '\x006c' '\x0000') --- LATIN SMALL LIGATURE FFI -titleMapping '\xfb03' s = Yield '\x0046' (CC s '\x0066' '\x0069') --- LATIN SMALL LIGATURE FFL -titleMapping '\xfb04' s = Yield '\x0046' (CC s '\x0066' '\x006c') --- LATIN SMALL LIGATURE LONG S T -titleMapping '\xfb05' s = Yield '\x0053' (CC s '\x0074' '\x0000') --- LATIN SMALL LIGATURE ST -titleMapping '\xfb06' s = Yield '\x0053' (CC s '\x0074' '\x0000') --- ARMENIAN SMALL LIGATURE ECH YIWN -titleMapping '\x0587' s = Yield '\x0535' (CC s '\x0582' '\x0000') --- ARMENIAN SMALL LIGATURE MEN NOW -titleMapping '\xfb13' s = Yield '\x0544' (CC s '\x0576' '\x0000') --- ARMENIAN SMALL LIGATURE MEN ECH -titleMapping '\xfb14' s = Yield '\x0544' (CC s '\x0565' '\x0000') --- ARMENIAN SMALL LIGATURE MEN INI -titleMapping '\xfb15' s = Yield '\x0544' (CC s '\x056b' '\x0000') --- ARMENIAN SMALL LIGATURE VEW NOW -titleMapping '\xfb16' s = Yield '\x054e' (CC s '\x0576' '\x0000') --- ARMENIAN SMALL LIGATURE MEN XEH -titleMapping '\xfb17' s = Yield '\x0544' (CC s '\x056d' '\x0000') --- LATIN SMALL LETTER N PRECEDED BY APOSTROPHE -titleMapping '\x0149' s = Yield '\x02bc' (CC s '\x004e' '\x0000') --- GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS -titleMapping '\x0390' s = Yield '\x0399' (CC s '\x0308' '\x0301') --- GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS -titleMapping '\x03b0' s = Yield '\x03a5' (CC s '\x0308' '\x0301') --- LATIN SMALL LETTER J WITH CARON -titleMapping '\x01f0' s = Yield '\x004a' (CC s '\x030c' '\x0000') --- LATIN SMALL LETTER H WITH LINE BELOW -titleMapping '\x1e96' s = Yield '\x0048' (CC s '\x0331' '\x0000') --- LATIN SMALL LETTER T WITH DIAERESIS -titleMapping '\x1e97' s = Yield '\x0054' (CC s '\x0308' '\x0000') --- LATIN SMALL LETTER W WITH RING ABOVE -titleMapping '\x1e98' s = Yield '\x0057' (CC s '\x030a' '\x0000') --- LATIN SMALL LETTER 
Y WITH RING ABOVE -titleMapping '\x1e99' s = Yield '\x0059' (CC s '\x030a' '\x0000') --- LATIN SMALL LETTER A WITH RIGHT HALF RING -titleMapping '\x1e9a' s = Yield '\x0041' (CC s '\x02be' '\x0000') --- GREEK SMALL LETTER UPSILON WITH PSILI -titleMapping '\x1f50' s = Yield '\x03a5' (CC s '\x0313' '\x0000') --- GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA -titleMapping '\x1f52' s = Yield '\x03a5' (CC s '\x0313' '\x0300') --- GREEK SMALL LETTER UPSILON WITH PSILI AND OXIA -titleMapping '\x1f54' s = Yield '\x03a5' (CC s '\x0313' '\x0301') --- GREEK SMALL LETTER UPSILON WITH PSILI AND PERISPOMENI -titleMapping '\x1f56' s = Yield '\x03a5' (CC s '\x0313' '\x0342') --- GREEK SMALL LETTER ALPHA WITH PERISPOMENI -titleMapping '\x1fb6' s = Yield '\x0391' (CC s '\x0342' '\x0000') --- GREEK SMALL LETTER ETA WITH PERISPOMENI -titleMapping '\x1fc6' s = Yield '\x0397' (CC s '\x0342' '\x0000') --- GREEK SMALL LETTER IOTA WITH DIALYTIKA AND VARIA -titleMapping '\x1fd2' s = Yield '\x0399' (CC s '\x0308' '\x0300') --- GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA -titleMapping '\x1fd3' s = Yield '\x0399' (CC s '\x0308' '\x0301') --- GREEK SMALL LETTER IOTA WITH PERISPOMENI -titleMapping '\x1fd6' s = Yield '\x0399' (CC s '\x0342' '\x0000') --- GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI -titleMapping '\x1fd7' s = Yield '\x0399' (CC s '\x0308' '\x0342') --- GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND VARIA -titleMapping '\x1fe2' s = Yield '\x03a5' (CC s '\x0308' '\x0300') --- GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA -titleMapping '\x1fe3' s = Yield '\x03a5' (CC s '\x0308' '\x0301') --- GREEK SMALL LETTER RHO WITH PSILI -titleMapping '\x1fe4' s = Yield '\x03a1' (CC s '\x0313' '\x0000') --- GREEK SMALL LETTER UPSILON WITH PERISPOMENI -titleMapping '\x1fe6' s = Yield '\x03a5' (CC s '\x0342' '\x0000') --- GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND PERISPOMENI -titleMapping '\x1fe7' s = Yield '\x03a5' (CC s '\x0308' '\x0342') --- GREEK SMALL LETTER OMEGA WITH PERISPOMENI -titleMapping '\x1ff6' s = Yield '\x03a9' (CC s '\x0342' '\x0000') --- GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI -titleMapping '\x1fb2' s = Yield '\x1fba' (CC s '\x0345' '\x0000') --- GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI -titleMapping '\x1fb4' s = Yield '\x0386' (CC s '\x0345' '\x0000') --- GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI -titleMapping '\x1fc2' s = Yield '\x1fca' (CC s '\x0345' '\x0000') --- GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI -titleMapping '\x1fc4' s = Yield '\x0389' (CC s '\x0345' '\x0000') --- GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI -titleMapping '\x1ff2' s = Yield '\x1ffa' (CC s '\x0345' '\x0000') --- GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI -titleMapping '\x1ff4' s = Yield '\x038f' (CC s '\x0345' '\x0000') --- GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI -titleMapping '\x1fb7' s = Yield '\x0391' (CC s '\x0342' '\x0345') --- GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI -titleMapping '\x1fc7' s = Yield '\x0397' (CC s '\x0342' '\x0345') --- GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI -titleMapping '\x1ff7' s = Yield '\x03a9' (CC s '\x0342' '\x0345') -titleMapping c s = Yield (toTitle c) (CC s '\0' '\0') -foldMapping :: forall s. 
Char -> s -> Step (CC s) Char +titleMapping = \case + -- LATIN SMALL LETTER SHARP S + '\x00df'# -> unI64 241172563 + -- LATIN SMALL LIGATURE FF + '\xfb00'# -> unI64 213909574 + -- LATIN SMALL LIGATURE FI + '\xfb01'# -> unI64 220201030 + -- LATIN SMALL LIGATURE FL + '\xfb02'# -> unI64 226492486 + -- LATIN SMALL LIGATURE FFI + '\xfb03'# -> unI64 461795097575494 + -- LATIN SMALL LIGATURE FFL + '\xfb04'# -> unI64 474989237108806 + -- LATIN SMALL LIGATURE LONG S T + '\xfb05'# -> unI64 243269715 + -- LATIN SMALL LIGATURE ST + '\xfb06'# -> unI64 243269715 + -- ARMENIAN SMALL LIGATURE ECH YIWN + '\x0587'# -> unI64 2956985653 + -- ARMENIAN SMALL LIGATURE MEN NOW + '\xfb13'# -> unI64 2931819844 + -- ARMENIAN SMALL LIGATURE MEN ECH + '\xfb14'# -> unI64 2896168260 + -- ARMENIAN SMALL LIGATURE MEN INI + '\xfb15'# -> unI64 2908751172 + -- ARMENIAN SMALL LIGATURE VEW NOW + '\xfb16'# -> unI64 2931819854 + -- ARMENIAN SMALL LIGATURE MEN XEH + '\xfb17'# -> unI64 2912945476 + -- LATIN SMALL LETTER N PRECEDED BY APOSTROPHE + '\x0149'# -> unI64 163578556 + -- GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS + '\x0390'# -> unI64 3382099394429849 + -- GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS + '\x03b0'# -> unI64 3382099394429861 + -- LATIN SMALL LETTER J WITH CARON + '\x01f0'# -> unI64 1635778634 + -- LATIN SMALL LETTER H WITH LINE BELOW + '\x1e96'# -> unI64 1713373256 + -- LATIN SMALL LETTER T WITH DIAERESIS + '\x1e97'# -> unI64 1627390036 + -- LATIN SMALL LETTER W WITH RING ABOVE + '\x1e98'# -> unI64 1631584343 + -- LATIN SMALL LETTER Y WITH RING ABOVE + '\x1e99'# -> unI64 1631584345 + -- LATIN SMALL LETTER A WITH RIGHT HALF RING + '\x1e9a'# -> unI64 1472200769 + -- GREEK SMALL LETTER UPSILON WITH PSILI + '\x1f50'# -> unI64 1650459557 + -- GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA + '\x1f52'# -> unI64 3377701370987429 + -- GREEK SMALL LETTER UPSILON WITH PSILI AND OXIA + '\x1f54'# -> unI64 3382099417498533 + -- GREEK SMALL LETTER UPSILON WITH PSILI AND PERISPOMENI + '\x1f56'# -> unI64 3667972440720293 + -- GREEK SMALL LETTER ALPHA WITH PERISPOMENI + '\x1fb6'# -> unI64 1749025681 + -- GREEK SMALL LETTER ETA WITH PERISPOMENI + '\x1fc6'# -> unI64 1749025687 + -- GREEK SMALL LETTER IOTA WITH DIALYTIKA AND VARIA + '\x1fd2'# -> unI64 3377701347918745 + -- GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA + '\x1fd3'# -> unI64 3382099394429849 + -- GREEK SMALL LETTER IOTA WITH PERISPOMENI + '\x1fd6'# -> unI64 1749025689 + -- GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI + '\x1fd7'# -> unI64 3667972417651609 + -- GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND VARIA + '\x1fe2'# -> unI64 3377701347918757 + -- GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA + '\x1fe3'# -> unI64 3382099394429861 + -- GREEK SMALL LETTER RHO WITH PSILI + '\x1fe4'# -> unI64 1650459553 + -- GREEK SMALL LETTER UPSILON WITH PERISPOMENI + '\x1fe6'# -> unI64 1749025701 + -- GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND PERISPOMENI + '\x1fe7'# -> unI64 3667972417651621 + -- GREEK SMALL LETTER OMEGA WITH PERISPOMENI + '\x1ff6'# -> unI64 1749025705 + -- GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI + '\x1fb2'# -> unI64 1755324346 + -- GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI + '\x1fb4'# -> unI64 1755317126 + -- GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI + '\x1fc2'# -> unI64 1755324362 + -- GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI + '\x1fc4'# -> unI64 1755317129 + -- GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI + '\x1ff2'# -> unI64 1755324410 + -- GREEK SMALL LETTER OMEGA WITH 
OXIA AND YPOGEGRAMMENI + '\x1ff4'# -> unI64 1755317135 + -- GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI + '\x1fb7'# -> unI64 3681166678819729 + -- GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI + '\x1fc7'# -> unI64 3681166678819735 + -- GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI + '\x1ff7'# -> unI64 3681166678819753 + '\x0061'# -> unI64 65 + '\x0062'# -> unI64 66 + '\x0063'# -> unI64 67 + '\x0064'# -> unI64 68 + '\x0065'# -> unI64 69 + '\x0066'# -> unI64 70 + '\x0067'# -> unI64 71 + '\x0068'# -> unI64 72 + '\x0069'# -> unI64 73 + '\x006a'# -> unI64 74 + '\x006b'# -> unI64 75 + '\x006c'# -> unI64 76 + '\x006d'# -> unI64 77 + '\x006e'# -> unI64 78 + '\x006f'# -> unI64 79 + '\x0070'# -> unI64 80 + '\x0071'# -> unI64 81 + '\x0072'# -> unI64 82 + '\x0073'# -> unI64 83 + '\x0074'# -> unI64 84 + '\x0075'# -> unI64 85 + '\x0076'# -> unI64 86 + '\x0077'# -> unI64 87 + '\x0078'# -> unI64 88 + '\x0079'# -> unI64 89 + '\x007a'# -> unI64 90 + '\x00b5'# -> unI64 924 + '\x00e0'# -> unI64 192 + '\x00e1'# -> unI64 193 + '\x00e2'# -> unI64 194 + '\x00e3'# -> unI64 195 + '\x00e4'# -> unI64 196 + '\x00e5'# -> unI64 197 + '\x00e6'# -> unI64 198 + '\x00e7'# -> unI64 199 + '\x00e8'# -> unI64 200 + '\x00e9'# -> unI64 201 + '\x00ea'# -> unI64 202 + '\x00eb'# -> unI64 203 + '\x00ec'# -> unI64 204 + '\x00ed'# -> unI64 205 + '\x00ee'# -> unI64 206 + '\x00ef'# -> unI64 207 + '\x00f0'# -> unI64 208 + '\x00f1'# -> unI64 209 + '\x00f2'# -> unI64 210 + '\x00f3'# -> unI64 211 + '\x00f4'# -> unI64 212 + '\x00f5'# -> unI64 213 + '\x00f6'# -> unI64 214 + '\x00f8'# -> unI64 216 + '\x00f9'# -> unI64 217 + '\x00fa'# -> unI64 218 + '\x00fb'# -> unI64 219 + '\x00fc'# -> unI64 220 + '\x00fd'# -> unI64 221 + '\x00fe'# -> unI64 222 + '\x00ff'# -> unI64 376 + '\x0101'# -> unI64 256 + '\x0103'# -> unI64 258 + '\x0105'# -> unI64 260 + '\x0107'# -> unI64 262 + '\x0109'# -> unI64 264 + '\x010b'# -> unI64 266 + '\x010d'# -> unI64 268 + '\x010f'# -> unI64 270 + '\x0111'# -> unI64 272 + '\x0113'# -> unI64 274 + '\x0115'# -> unI64 276 + '\x0117'# -> unI64 278 + '\x0119'# -> unI64 280 + '\x011b'# -> unI64 282 + '\x011d'# -> unI64 284 + '\x011f'# -> unI64 286 + '\x0121'# -> unI64 288 + '\x0123'# -> unI64 290 + '\x0125'# -> unI64 292 + '\x0127'# -> unI64 294 + '\x0129'# -> unI64 296 + '\x012b'# -> unI64 298 + '\x012d'# -> unI64 300 + '\x012f'# -> unI64 302 + '\x0131'# -> unI64 73 + '\x0133'# -> unI64 306 + '\x0135'# -> unI64 308 + '\x0137'# -> unI64 310 + '\x013a'# -> unI64 313 + '\x013c'# -> unI64 315 + '\x013e'# -> unI64 317 + '\x0140'# -> unI64 319 + '\x0142'# -> unI64 321 + '\x0144'# -> unI64 323 + '\x0146'# -> unI64 325 + '\x0148'# -> unI64 327 + '\x014b'# -> unI64 330 + '\x014d'# -> unI64 332 + '\x014f'# -> unI64 334 + '\x0151'# -> unI64 336 + '\x0153'# -> unI64 338 + '\x0155'# -> unI64 340 + '\x0157'# -> unI64 342 + '\x0159'# -> unI64 344 + '\x015b'# -> unI64 346 + '\x015d'# -> unI64 348 + '\x015f'# -> unI64 350 + '\x0161'# -> unI64 352 + '\x0163'# -> unI64 354 + '\x0165'# -> unI64 356 + '\x0167'# -> unI64 358 + '\x0169'# -> unI64 360 + '\x016b'# -> unI64 362 + '\x016d'# -> unI64 364 + '\x016f'# -> unI64 366 + '\x0171'# -> unI64 368 + '\x0173'# -> unI64 370 + '\x0175'# -> unI64 372 + '\x0177'# -> unI64 374 + '\x017a'# -> unI64 377 + '\x017c'# -> unI64 379 + '\x017e'# -> unI64 381 + '\x017f'# -> unI64 83 + '\x0180'# -> unI64 579 + '\x0183'# -> unI64 386 + '\x0185'# -> unI64 388 + '\x0188'# -> unI64 391 + '\x018c'# -> unI64 395 + '\x0192'# -> unI64 401 + '\x0195'# -> unI64 502 + '\x0199'# -> unI64 
408 + '\x019a'# -> unI64 573 + '\x019e'# -> unI64 544 + '\x01a1'# -> unI64 416 + '\x01a3'# -> unI64 418 + '\x01a5'# -> unI64 420 + '\x01a8'# -> unI64 423 + '\x01ad'# -> unI64 428 + '\x01b0'# -> unI64 431 + '\x01b4'# -> unI64 435 + '\x01b6'# -> unI64 437 + '\x01b9'# -> unI64 440 + '\x01bd'# -> unI64 444 + '\x01bf'# -> unI64 503 + '\x01c4'# -> unI64 453 + '\x01c6'# -> unI64 453 + '\x01c7'# -> unI64 456 + '\x01c9'# -> unI64 456 + '\x01ca'# -> unI64 459 + '\x01cc'# -> unI64 459 + '\x01ce'# -> unI64 461 + '\x01d0'# -> unI64 463 + '\x01d2'# -> unI64 465 + '\x01d4'# -> unI64 467 + '\x01d6'# -> unI64 469 + '\x01d8'# -> unI64 471 + '\x01da'# -> unI64 473 + '\x01dc'# -> unI64 475 + '\x01dd'# -> unI64 398 + '\x01df'# -> unI64 478 + '\x01e1'# -> unI64 480 + '\x01e3'# -> unI64 482 + '\x01e5'# -> unI64 484 + '\x01e7'# -> unI64 486 + '\x01e9'# -> unI64 488 + '\x01eb'# -> unI64 490 + '\x01ed'# -> unI64 492 + '\x01ef'# -> unI64 494 + '\x01f1'# -> unI64 498 + '\x01f3'# -> unI64 498 + '\x01f5'# -> unI64 500 + '\x01f9'# -> unI64 504 + '\x01fb'# -> unI64 506 + '\x01fd'# -> unI64 508 + '\x01ff'# -> unI64 510 + '\x0201'# -> unI64 512 + '\x0203'# -> unI64 514 + '\x0205'# -> unI64 516 + '\x0207'# -> unI64 518 + '\x0209'# -> unI64 520 + '\x020b'# -> unI64 522 + '\x020d'# -> unI64 524 + '\x020f'# -> unI64 526 + '\x0211'# -> unI64 528 + '\x0213'# -> unI64 530 + '\x0215'# -> unI64 532 + '\x0217'# -> unI64 534 + '\x0219'# -> unI64 536 + '\x021b'# -> unI64 538 + '\x021d'# -> unI64 540 + '\x021f'# -> unI64 542 + '\x0223'# -> unI64 546 + '\x0225'# -> unI64 548 + '\x0227'# -> unI64 550 + '\x0229'# -> unI64 552 + '\x022b'# -> unI64 554 + '\x022d'# -> unI64 556 + '\x022f'# -> unI64 558 + '\x0231'# -> unI64 560 + '\x0233'# -> unI64 562 + '\x023c'# -> unI64 571 + '\x023f'# -> unI64 11390 + '\x0240'# -> unI64 11391 + '\x0242'# -> unI64 577 + '\x0247'# -> unI64 582 + '\x0249'# -> unI64 584 + '\x024b'# -> unI64 586 + '\x024d'# -> unI64 588 + '\x024f'# -> unI64 590 + '\x0250'# -> unI64 11375 + '\x0251'# -> unI64 11373 + '\x0252'# -> unI64 11376 + '\x0253'# -> unI64 385 + '\x0254'# -> unI64 390 + '\x0256'# -> unI64 393 + '\x0257'# -> unI64 394 + '\x0259'# -> unI64 399 + '\x025b'# -> unI64 400 + '\x025c'# -> unI64 42923 + '\x0260'# -> unI64 403 + '\x0261'# -> unI64 42924 + '\x0263'# -> unI64 404 + '\x0265'# -> unI64 42893 + '\x0266'# -> unI64 42922 + '\x0268'# -> unI64 407 + '\x0269'# -> unI64 406 + '\x026a'# -> unI64 42926 + '\x026b'# -> unI64 11362 + '\x026c'# -> unI64 42925 + '\x026f'# -> unI64 412 + '\x0271'# -> unI64 11374 + '\x0272'# -> unI64 413 + '\x0275'# -> unI64 415 + '\x027d'# -> unI64 11364 + '\x0280'# -> unI64 422 + '\x0282'# -> unI64 42949 + '\x0283'# -> unI64 425 + '\x0287'# -> unI64 42929 + '\x0288'# -> unI64 430 + '\x0289'# -> unI64 580 + '\x028a'# -> unI64 433 + '\x028b'# -> unI64 434 + '\x028c'# -> unI64 581 + '\x0292'# -> unI64 439 + '\x029d'# -> unI64 42930 + '\x029e'# -> unI64 42928 + '\x0345'# -> unI64 921 + '\x0371'# -> unI64 880 + '\x0373'# -> unI64 882 + '\x0377'# -> unI64 886 + '\x037b'# -> unI64 1021 + '\x037c'# -> unI64 1022 + '\x037d'# -> unI64 1023 + '\x03ac'# -> unI64 902 + '\x03ad'# -> unI64 904 + '\x03ae'# -> unI64 905 + '\x03af'# -> unI64 906 + '\x03b1'# -> unI64 913 + '\x03b2'# -> unI64 914 + '\x03b3'# -> unI64 915 + '\x03b4'# -> unI64 916 + '\x03b5'# -> unI64 917 + '\x03b6'# -> unI64 918 + '\x03b7'# -> unI64 919 + '\x03b8'# -> unI64 920 + '\x03b9'# -> unI64 921 + '\x03ba'# -> unI64 922 + '\x03bb'# -> unI64 923 + '\x03bc'# -> unI64 924 + '\x03bd'# -> unI64 925 + '\x03be'# -> unI64 926 + '\x03bf'# 
-> unI64 927 + '\x03c0'# -> unI64 928 + '\x03c1'# -> unI64 929 + '\x03c2'# -> unI64 931 + '\x03c3'# -> unI64 931 + '\x03c4'# -> unI64 932 + '\x03c5'# -> unI64 933 + '\x03c6'# -> unI64 934 + '\x03c7'# -> unI64 935 + '\x03c8'# -> unI64 936 + '\x03c9'# -> unI64 937 + '\x03ca'# -> unI64 938 + '\x03cb'# -> unI64 939 + '\x03cc'# -> unI64 908 + '\x03cd'# -> unI64 910 + '\x03ce'# -> unI64 911 + '\x03d0'# -> unI64 914 + '\x03d1'# -> unI64 920 + '\x03d5'# -> unI64 934 + '\x03d6'# -> unI64 928 + '\x03d7'# -> unI64 975 + '\x03d9'# -> unI64 984 + '\x03db'# -> unI64 986 + '\x03dd'# -> unI64 988 + '\x03df'# -> unI64 990 + '\x03e1'# -> unI64 992 + '\x03e3'# -> unI64 994 + '\x03e5'# -> unI64 996 + '\x03e7'# -> unI64 998 + '\x03e9'# -> unI64 1000 + '\x03eb'# -> unI64 1002 + '\x03ed'# -> unI64 1004 + '\x03ef'# -> unI64 1006 + '\x03f0'# -> unI64 922 + '\x03f1'# -> unI64 929 + '\x03f2'# -> unI64 1017 + '\x03f3'# -> unI64 895 + '\x03f5'# -> unI64 917 + '\x03f8'# -> unI64 1015 + '\x03fb'# -> unI64 1018 + '\x0430'# -> unI64 1040 + '\x0431'# -> unI64 1041 + '\x0432'# -> unI64 1042 + '\x0433'# -> unI64 1043 + '\x0434'# -> unI64 1044 + '\x0435'# -> unI64 1045 + '\x0436'# -> unI64 1046 + '\x0437'# -> unI64 1047 + '\x0438'# -> unI64 1048 + '\x0439'# -> unI64 1049 + '\x043a'# -> unI64 1050 + '\x043b'# -> unI64 1051 + '\x043c'# -> unI64 1052 + '\x043d'# -> unI64 1053 + '\x043e'# -> unI64 1054 + '\x043f'# -> unI64 1055 + '\x0440'# -> unI64 1056 + '\x0441'# -> unI64 1057 + '\x0442'# -> unI64 1058 + '\x0443'# -> unI64 1059 + '\x0444'# -> unI64 1060 + '\x0445'# -> unI64 1061 + '\x0446'# -> unI64 1062 + '\x0447'# -> unI64 1063 + '\x0448'# -> unI64 1064 + '\x0449'# -> unI64 1065 + '\x044a'# -> unI64 1066 + '\x044b'# -> unI64 1067 + '\x044c'# -> unI64 1068 + '\x044d'# -> unI64 1069 + '\x044e'# -> unI64 1070 + '\x044f'# -> unI64 1071 + '\x0450'# -> unI64 1024 + '\x0451'# -> unI64 1025 + '\x0452'# -> unI64 1026 + '\x0453'# -> unI64 1027 + '\x0454'# -> unI64 1028 + '\x0455'# -> unI64 1029 + '\x0456'# -> unI64 1030 + '\x0457'# -> unI64 1031 + '\x0458'# -> unI64 1032 + '\x0459'# -> unI64 1033 + '\x045a'# -> unI64 1034 + '\x045b'# -> unI64 1035 + '\x045c'# -> unI64 1036 + '\x045d'# -> unI64 1037 + '\x045e'# -> unI64 1038 + '\x045f'# -> unI64 1039 + '\x0461'# -> unI64 1120 + '\x0463'# -> unI64 1122 + '\x0465'# -> unI64 1124 + '\x0467'# -> unI64 1126 + '\x0469'# -> unI64 1128 + '\x046b'# -> unI64 1130 + '\x046d'# -> unI64 1132 + '\x046f'# -> unI64 1134 + '\x0471'# -> unI64 1136 + '\x0473'# -> unI64 1138 + '\x0475'# -> unI64 1140 + '\x0477'# -> unI64 1142 + '\x0479'# -> unI64 1144 + '\x047b'# -> unI64 1146 + '\x047d'# -> unI64 1148 + '\x047f'# -> unI64 1150 + '\x0481'# -> unI64 1152 + '\x048b'# -> unI64 1162 + '\x048d'# -> unI64 1164 + '\x048f'# -> unI64 1166 + '\x0491'# -> unI64 1168 + '\x0493'# -> unI64 1170 + '\x0495'# -> unI64 1172 + '\x0497'# -> unI64 1174 + '\x0499'# -> unI64 1176 + '\x049b'# -> unI64 1178 + '\x049d'# -> unI64 1180 + '\x049f'# -> unI64 1182 + '\x04a1'# -> unI64 1184 + '\x04a3'# -> unI64 1186 + '\x04a5'# -> unI64 1188 + '\x04a7'# -> unI64 1190 + '\x04a9'# -> unI64 1192 + '\x04ab'# -> unI64 1194 + '\x04ad'# -> unI64 1196 + '\x04af'# -> unI64 1198 + '\x04b1'# -> unI64 1200 + '\x04b3'# -> unI64 1202 + '\x04b5'# -> unI64 1204 + '\x04b7'# -> unI64 1206 + '\x04b9'# -> unI64 1208 + '\x04bb'# -> unI64 1210 + '\x04bd'# -> unI64 1212 + '\x04bf'# -> unI64 1214 + '\x04c2'# -> unI64 1217 + '\x04c4'# -> unI64 1219 + '\x04c6'# -> unI64 1221 + '\x04c8'# -> unI64 1223 + '\x04ca'# -> unI64 1225 + '\x04cc'# -> unI64 1227 + '\x04ce'# 
-> unI64 1229 + '\x04cf'# -> unI64 1216 + '\x04d1'# -> unI64 1232 + '\x04d3'# -> unI64 1234 + '\x04d5'# -> unI64 1236 + '\x04d7'# -> unI64 1238 + '\x04d9'# -> unI64 1240 + '\x04db'# -> unI64 1242 + '\x04dd'# -> unI64 1244 + '\x04df'# -> unI64 1246 + '\x04e1'# -> unI64 1248 + '\x04e3'# -> unI64 1250 + '\x04e5'# -> unI64 1252 + '\x04e7'# -> unI64 1254 + '\x04e9'# -> unI64 1256 + '\x04eb'# -> unI64 1258 + '\x04ed'# -> unI64 1260 + '\x04ef'# -> unI64 1262 + '\x04f1'# -> unI64 1264 + '\x04f3'# -> unI64 1266 + '\x04f5'# -> unI64 1268 + '\x04f7'# -> unI64 1270 + '\x04f9'# -> unI64 1272 + '\x04fb'# -> unI64 1274 + '\x04fd'# -> unI64 1276 + '\x04ff'# -> unI64 1278 + '\x0501'# -> unI64 1280 + '\x0503'# -> unI64 1282 + '\x0505'# -> unI64 1284 + '\x0507'# -> unI64 1286 + '\x0509'# -> unI64 1288 + '\x050b'# -> unI64 1290 + '\x050d'# -> unI64 1292 + '\x050f'# -> unI64 1294 + '\x0511'# -> unI64 1296 + '\x0513'# -> unI64 1298 + '\x0515'# -> unI64 1300 + '\x0517'# -> unI64 1302 + '\x0519'# -> unI64 1304 + '\x051b'# -> unI64 1306 + '\x051d'# -> unI64 1308 + '\x051f'# -> unI64 1310 + '\x0521'# -> unI64 1312 + '\x0523'# -> unI64 1314 + '\x0525'# -> unI64 1316 + '\x0527'# -> unI64 1318 + '\x0529'# -> unI64 1320 + '\x052b'# -> unI64 1322 + '\x052d'# -> unI64 1324 + '\x052f'# -> unI64 1326 + '\x0561'# -> unI64 1329 + '\x0562'# -> unI64 1330 + '\x0563'# -> unI64 1331 + '\x0564'# -> unI64 1332 + '\x0565'# -> unI64 1333 + '\x0566'# -> unI64 1334 + '\x0567'# -> unI64 1335 + '\x0568'# -> unI64 1336 + '\x0569'# -> unI64 1337 + '\x056a'# -> unI64 1338 + '\x056b'# -> unI64 1339 + '\x056c'# -> unI64 1340 + '\x056d'# -> unI64 1341 + '\x056e'# -> unI64 1342 + '\x056f'# -> unI64 1343 + '\x0570'# -> unI64 1344 + '\x0571'# -> unI64 1345 + '\x0572'# -> unI64 1346 + '\x0573'# -> unI64 1347 + '\x0574'# -> unI64 1348 + '\x0575'# -> unI64 1349 + '\x0576'# -> unI64 1350 + '\x0577'# -> unI64 1351 + '\x0578'# -> unI64 1352 + '\x0579'# -> unI64 1353 + '\x057a'# -> unI64 1354 + '\x057b'# -> unI64 1355 + '\x057c'# -> unI64 1356 + '\x057d'# -> unI64 1357 + '\x057e'# -> unI64 1358 + '\x057f'# -> unI64 1359 + '\x0580'# -> unI64 1360 + '\x0581'# -> unI64 1361 + '\x0582'# -> unI64 1362 + '\x0583'# -> unI64 1363 + '\x0584'# -> unI64 1364 + '\x0585'# -> unI64 1365 + '\x0586'# -> unI64 1366 + '\x13f8'# -> unI64 5104 + '\x13f9'# -> unI64 5105 + '\x13fa'# -> unI64 5106 + '\x13fb'# -> unI64 5107 + '\x13fc'# -> unI64 5108 + '\x13fd'# -> unI64 5109 + '\x1c80'# -> unI64 1042 + '\x1c81'# -> unI64 1044 + '\x1c82'# -> unI64 1054 + '\x1c83'# -> unI64 1057 + '\x1c84'# -> unI64 1058 + '\x1c85'# -> unI64 1058 + '\x1c86'# -> unI64 1066 + '\x1c87'# -> unI64 1122 + '\x1c88'# -> unI64 42570 + '\x1d79'# -> unI64 42877 + '\x1d7d'# -> unI64 11363 + '\x1d8e'# -> unI64 42950 + '\x1e01'# -> unI64 7680 + '\x1e03'# -> unI64 7682 + '\x1e05'# -> unI64 7684 + '\x1e07'# -> unI64 7686 + '\x1e09'# -> unI64 7688 + '\x1e0b'# -> unI64 7690 + '\x1e0d'# -> unI64 7692 + '\x1e0f'# -> unI64 7694 + '\x1e11'# -> unI64 7696 + '\x1e13'# -> unI64 7698 + '\x1e15'# -> unI64 7700 + '\x1e17'# -> unI64 7702 + '\x1e19'# -> unI64 7704 + '\x1e1b'# -> unI64 7706 + '\x1e1d'# -> unI64 7708 + '\x1e1f'# -> unI64 7710 + '\x1e21'# -> unI64 7712 + '\x1e23'# -> unI64 7714 + '\x1e25'# -> unI64 7716 + '\x1e27'# -> unI64 7718 + '\x1e29'# -> unI64 7720 + '\x1e2b'# -> unI64 7722 + '\x1e2d'# -> unI64 7724 + '\x1e2f'# -> unI64 7726 + '\x1e31'# -> unI64 7728 + '\x1e33'# -> unI64 7730 + '\x1e35'# -> unI64 7732 + '\x1e37'# -> unI64 7734 + '\x1e39'# -> unI64 7736 + '\x1e3b'# -> unI64 7738 + '\x1e3d'# -> unI64 7740 
+ '\x1e3f'# -> unI64 7742 + '\x1e41'# -> unI64 7744 + '\x1e43'# -> unI64 7746 + '\x1e45'# -> unI64 7748 + '\x1e47'# -> unI64 7750 + '\x1e49'# -> unI64 7752 + '\x1e4b'# -> unI64 7754 + '\x1e4d'# -> unI64 7756 + '\x1e4f'# -> unI64 7758 + '\x1e51'# -> unI64 7760 + '\x1e53'# -> unI64 7762 + '\x1e55'# -> unI64 7764 + '\x1e57'# -> unI64 7766 + '\x1e59'# -> unI64 7768 + '\x1e5b'# -> unI64 7770 + '\x1e5d'# -> unI64 7772 + '\x1e5f'# -> unI64 7774 + '\x1e61'# -> unI64 7776 + '\x1e63'# -> unI64 7778 + '\x1e65'# -> unI64 7780 + '\x1e67'# -> unI64 7782 + '\x1e69'# -> unI64 7784 + '\x1e6b'# -> unI64 7786 + '\x1e6d'# -> unI64 7788 + '\x1e6f'# -> unI64 7790 + '\x1e71'# -> unI64 7792 + '\x1e73'# -> unI64 7794 + '\x1e75'# -> unI64 7796 + '\x1e77'# -> unI64 7798 + '\x1e79'# -> unI64 7800 + '\x1e7b'# -> unI64 7802 + '\x1e7d'# -> unI64 7804 + '\x1e7f'# -> unI64 7806 + '\x1e81'# -> unI64 7808 + '\x1e83'# -> unI64 7810 + '\x1e85'# -> unI64 7812 + '\x1e87'# -> unI64 7814 + '\x1e89'# -> unI64 7816 + '\x1e8b'# -> unI64 7818 + '\x1e8d'# -> unI64 7820 + '\x1e8f'# -> unI64 7822 + '\x1e91'# -> unI64 7824 + '\x1e93'# -> unI64 7826 + '\x1e95'# -> unI64 7828 + '\x1e9b'# -> unI64 7776 + '\x1ea1'# -> unI64 7840 + '\x1ea3'# -> unI64 7842 + '\x1ea5'# -> unI64 7844 + '\x1ea7'# -> unI64 7846 + '\x1ea9'# -> unI64 7848 + '\x1eab'# -> unI64 7850 + '\x1ead'# -> unI64 7852 + '\x1eaf'# -> unI64 7854 + '\x1eb1'# -> unI64 7856 + '\x1eb3'# -> unI64 7858 + '\x1eb5'# -> unI64 7860 + '\x1eb7'# -> unI64 7862 + '\x1eb9'# -> unI64 7864 + '\x1ebb'# -> unI64 7866 + '\x1ebd'# -> unI64 7868 + '\x1ebf'# -> unI64 7870 + '\x1ec1'# -> unI64 7872 + '\x1ec3'# -> unI64 7874 + '\x1ec5'# -> unI64 7876 + '\x1ec7'# -> unI64 7878 + '\x1ec9'# -> unI64 7880 + '\x1ecb'# -> unI64 7882 + '\x1ecd'# -> unI64 7884 + '\x1ecf'# -> unI64 7886 + '\x1ed1'# -> unI64 7888 + '\x1ed3'# -> unI64 7890 + '\x1ed5'# -> unI64 7892 + '\x1ed7'# -> unI64 7894 + '\x1ed9'# -> unI64 7896 + '\x1edb'# -> unI64 7898 + '\x1edd'# -> unI64 7900 + '\x1edf'# -> unI64 7902 + '\x1ee1'# -> unI64 7904 + '\x1ee3'# -> unI64 7906 + '\x1ee5'# -> unI64 7908 + '\x1ee7'# -> unI64 7910 + '\x1ee9'# -> unI64 7912 + '\x1eeb'# -> unI64 7914 + '\x1eed'# -> unI64 7916 + '\x1eef'# -> unI64 7918 + '\x1ef1'# -> unI64 7920 + '\x1ef3'# -> unI64 7922 + '\x1ef5'# -> unI64 7924 + '\x1ef7'# -> unI64 7926 + '\x1ef9'# -> unI64 7928 + '\x1efb'# -> unI64 7930 + '\x1efd'# -> unI64 7932 + '\x1eff'# -> unI64 7934 + '\x1f00'# -> unI64 7944 + '\x1f01'# -> unI64 7945 + '\x1f02'# -> unI64 7946 + '\x1f03'# -> unI64 7947 + '\x1f04'# -> unI64 7948 + '\x1f05'# -> unI64 7949 + '\x1f06'# -> unI64 7950 + '\x1f07'# -> unI64 7951 + '\x1f10'# -> unI64 7960 + '\x1f11'# -> unI64 7961 + '\x1f12'# -> unI64 7962 + '\x1f13'# -> unI64 7963 + '\x1f14'# -> unI64 7964 + '\x1f15'# -> unI64 7965 + '\x1f20'# -> unI64 7976 + '\x1f21'# -> unI64 7977 + '\x1f22'# -> unI64 7978 + '\x1f23'# -> unI64 7979 + '\x1f24'# -> unI64 7980 + '\x1f25'# -> unI64 7981 + '\x1f26'# -> unI64 7982 + '\x1f27'# -> unI64 7983 + '\x1f30'# -> unI64 7992 + '\x1f31'# -> unI64 7993 + '\x1f32'# -> unI64 7994 + '\x1f33'# -> unI64 7995 + '\x1f34'# -> unI64 7996 + '\x1f35'# -> unI64 7997 + '\x1f36'# -> unI64 7998 + '\x1f37'# -> unI64 7999 + '\x1f40'# -> unI64 8008 + '\x1f41'# -> unI64 8009 + '\x1f42'# -> unI64 8010 + '\x1f43'# -> unI64 8011 + '\x1f44'# -> unI64 8012 + '\x1f45'# -> unI64 8013 + '\x1f51'# -> unI64 8025 + '\x1f53'# -> unI64 8027 + '\x1f55'# -> unI64 8029 + '\x1f57'# -> unI64 8031 + '\x1f60'# -> unI64 8040 + '\x1f61'# -> unI64 8041 + '\x1f62'# -> unI64 8042 + '\x1f63'# -> 
unI64 8043 + '\x1f64'# -> unI64 8044 + '\x1f65'# -> unI64 8045 + '\x1f66'# -> unI64 8046 + '\x1f67'# -> unI64 8047 + '\x1f70'# -> unI64 8122 + '\x1f71'# -> unI64 8123 + '\x1f72'# -> unI64 8136 + '\x1f73'# -> unI64 8137 + '\x1f74'# -> unI64 8138 + '\x1f75'# -> unI64 8139 + '\x1f76'# -> unI64 8154 + '\x1f77'# -> unI64 8155 + '\x1f78'# -> unI64 8184 + '\x1f79'# -> unI64 8185 + '\x1f7a'# -> unI64 8170 + '\x1f7b'# -> unI64 8171 + '\x1f7c'# -> unI64 8186 + '\x1f7d'# -> unI64 8187 + '\x1f80'# -> unI64 8072 + '\x1f81'# -> unI64 8073 + '\x1f82'# -> unI64 8074 + '\x1f83'# -> unI64 8075 + '\x1f84'# -> unI64 8076 + '\x1f85'# -> unI64 8077 + '\x1f86'# -> unI64 8078 + '\x1f87'# -> unI64 8079 + '\x1f90'# -> unI64 8088 + '\x1f91'# -> unI64 8089 + '\x1f92'# -> unI64 8090 + '\x1f93'# -> unI64 8091 + '\x1f94'# -> unI64 8092 + '\x1f95'# -> unI64 8093 + '\x1f96'# -> unI64 8094 + '\x1f97'# -> unI64 8095 + '\x1fa0'# -> unI64 8104 + '\x1fa1'# -> unI64 8105 + '\x1fa2'# -> unI64 8106 + '\x1fa3'# -> unI64 8107 + '\x1fa4'# -> unI64 8108 + '\x1fa5'# -> unI64 8109 + '\x1fa6'# -> unI64 8110 + '\x1fa7'# -> unI64 8111 + '\x1fb0'# -> unI64 8120 + '\x1fb1'# -> unI64 8121 + '\x1fb3'# -> unI64 8124 + '\x1fbe'# -> unI64 921 + '\x1fc3'# -> unI64 8140 + '\x1fd0'# -> unI64 8152 + '\x1fd1'# -> unI64 8153 + '\x1fe0'# -> unI64 8168 + '\x1fe1'# -> unI64 8169 + '\x1fe5'# -> unI64 8172 + '\x1ff3'# -> unI64 8188 + '\x214e'# -> unI64 8498 + '\x2170'# -> unI64 8544 + '\x2171'# -> unI64 8545 + '\x2172'# -> unI64 8546 + '\x2173'# -> unI64 8547 + '\x2174'# -> unI64 8548 + '\x2175'# -> unI64 8549 + '\x2176'# -> unI64 8550 + '\x2177'# -> unI64 8551 + '\x2178'# -> unI64 8552 + '\x2179'# -> unI64 8553 + '\x217a'# -> unI64 8554 + '\x217b'# -> unI64 8555 + '\x217c'# -> unI64 8556 + '\x217d'# -> unI64 8557 + '\x217e'# -> unI64 8558 + '\x217f'# -> unI64 8559 + '\x2184'# -> unI64 8579 + '\x24d0'# -> unI64 9398 + '\x24d1'# -> unI64 9399 + '\x24d2'# -> unI64 9400 + '\x24d3'# -> unI64 9401 + '\x24d4'# -> unI64 9402 + '\x24d5'# -> unI64 9403 + '\x24d6'# -> unI64 9404 + '\x24d7'# -> unI64 9405 + '\x24d8'# -> unI64 9406 + '\x24d9'# -> unI64 9407 + '\x24da'# -> unI64 9408 + '\x24db'# -> unI64 9409 + '\x24dc'# -> unI64 9410 + '\x24dd'# -> unI64 9411 + '\x24de'# -> unI64 9412 + '\x24df'# -> unI64 9413 + '\x24e0'# -> unI64 9414 + '\x24e1'# -> unI64 9415 + '\x24e2'# -> unI64 9416 + '\x24e3'# -> unI64 9417 + '\x24e4'# -> unI64 9418 + '\x24e5'# -> unI64 9419 + '\x24e6'# -> unI64 9420 + '\x24e7'# -> unI64 9421 + '\x24e8'# -> unI64 9422 + '\x24e9'# -> unI64 9423 + '\x2c30'# -> unI64 11264 + '\x2c31'# -> unI64 11265 + '\x2c32'# -> unI64 11266 + '\x2c33'# -> unI64 11267 + '\x2c34'# -> unI64 11268 + '\x2c35'# -> unI64 11269 + '\x2c36'# -> unI64 11270 + '\x2c37'# -> unI64 11271 + '\x2c38'# -> unI64 11272 + '\x2c39'# -> unI64 11273 + '\x2c3a'# -> unI64 11274 + '\x2c3b'# -> unI64 11275 + '\x2c3c'# -> unI64 11276 + '\x2c3d'# -> unI64 11277 + '\x2c3e'# -> unI64 11278 + '\x2c3f'# -> unI64 11279 + '\x2c40'# -> unI64 11280 + '\x2c41'# -> unI64 11281 + '\x2c42'# -> unI64 11282 + '\x2c43'# -> unI64 11283 + '\x2c44'# -> unI64 11284 + '\x2c45'# -> unI64 11285 + '\x2c46'# -> unI64 11286 + '\x2c47'# -> unI64 11287 + '\x2c48'# -> unI64 11288 + '\x2c49'# -> unI64 11289 + '\x2c4a'# -> unI64 11290 + '\x2c4b'# -> unI64 11291 + '\x2c4c'# -> unI64 11292 + '\x2c4d'# -> unI64 11293 + '\x2c4e'# -> unI64 11294 + '\x2c4f'# -> unI64 11295 + '\x2c50'# -> unI64 11296 + '\x2c51'# -> unI64 11297 + '\x2c52'# -> unI64 11298 + '\x2c53'# -> unI64 11299 + '\x2c54'# -> unI64 11300 + '\x2c55'# -> unI64 
11301 + '\x2c56'# -> unI64 11302 + '\x2c57'# -> unI64 11303 + '\x2c58'# -> unI64 11304 + '\x2c59'# -> unI64 11305 + '\x2c5a'# -> unI64 11306 + '\x2c5b'# -> unI64 11307 + '\x2c5c'# -> unI64 11308 + '\x2c5d'# -> unI64 11309 + '\x2c5e'# -> unI64 11310 + '\x2c61'# -> unI64 11360 + '\x2c65'# -> unI64 570 + '\x2c66'# -> unI64 574 + '\x2c68'# -> unI64 11367 + '\x2c6a'# -> unI64 11369 + '\x2c6c'# -> unI64 11371 + '\x2c73'# -> unI64 11378 + '\x2c76'# -> unI64 11381 + '\x2c81'# -> unI64 11392 + '\x2c83'# -> unI64 11394 + '\x2c85'# -> unI64 11396 + '\x2c87'# -> unI64 11398 + '\x2c89'# -> unI64 11400 + '\x2c8b'# -> unI64 11402 + '\x2c8d'# -> unI64 11404 + '\x2c8f'# -> unI64 11406 + '\x2c91'# -> unI64 11408 + '\x2c93'# -> unI64 11410 + '\x2c95'# -> unI64 11412 + '\x2c97'# -> unI64 11414 + '\x2c99'# -> unI64 11416 + '\x2c9b'# -> unI64 11418 + '\x2c9d'# -> unI64 11420 + '\x2c9f'# -> unI64 11422 + '\x2ca1'# -> unI64 11424 + '\x2ca3'# -> unI64 11426 + '\x2ca5'# -> unI64 11428 + '\x2ca7'# -> unI64 11430 + '\x2ca9'# -> unI64 11432 + '\x2cab'# -> unI64 11434 + '\x2cad'# -> unI64 11436 + '\x2caf'# -> unI64 11438 + '\x2cb1'# -> unI64 11440 + '\x2cb3'# -> unI64 11442 + '\x2cb5'# -> unI64 11444 + '\x2cb7'# -> unI64 11446 + '\x2cb9'# -> unI64 11448 + '\x2cbb'# -> unI64 11450 + '\x2cbd'# -> unI64 11452 + '\x2cbf'# -> unI64 11454 + '\x2cc1'# -> unI64 11456 + '\x2cc3'# -> unI64 11458 + '\x2cc5'# -> unI64 11460 + '\x2cc7'# -> unI64 11462 + '\x2cc9'# -> unI64 11464 + '\x2ccb'# -> unI64 11466 + '\x2ccd'# -> unI64 11468 + '\x2ccf'# -> unI64 11470 + '\x2cd1'# -> unI64 11472 + '\x2cd3'# -> unI64 11474 + '\x2cd5'# -> unI64 11476 + '\x2cd7'# -> unI64 11478 + '\x2cd9'# -> unI64 11480 + '\x2cdb'# -> unI64 11482 + '\x2cdd'# -> unI64 11484 + '\x2cdf'# -> unI64 11486 + '\x2ce1'# -> unI64 11488 + '\x2ce3'# -> unI64 11490 + '\x2cec'# -> unI64 11499 + '\x2cee'# -> unI64 11501 + '\x2cf3'# -> unI64 11506 + '\x2d00'# -> unI64 4256 + '\x2d01'# -> unI64 4257 + '\x2d02'# -> unI64 4258 + '\x2d03'# -> unI64 4259 + '\x2d04'# -> unI64 4260 + '\x2d05'# -> unI64 4261 + '\x2d06'# -> unI64 4262 + '\x2d07'# -> unI64 4263 + '\x2d08'# -> unI64 4264 + '\x2d09'# -> unI64 4265 + '\x2d0a'# -> unI64 4266 + '\x2d0b'# -> unI64 4267 + '\x2d0c'# -> unI64 4268 + '\x2d0d'# -> unI64 4269 + '\x2d0e'# -> unI64 4270 + '\x2d0f'# -> unI64 4271 + '\x2d10'# -> unI64 4272 + '\x2d11'# -> unI64 4273 + '\x2d12'# -> unI64 4274 + '\x2d13'# -> unI64 4275 + '\x2d14'# -> unI64 4276 + '\x2d15'# -> unI64 4277 + '\x2d16'# -> unI64 4278 + '\x2d17'# -> unI64 4279 + '\x2d18'# -> unI64 4280 + '\x2d19'# -> unI64 4281 + '\x2d1a'# -> unI64 4282 + '\x2d1b'# -> unI64 4283 + '\x2d1c'# -> unI64 4284 + '\x2d1d'# -> unI64 4285 + '\x2d1e'# -> unI64 4286 + '\x2d1f'# -> unI64 4287 + '\x2d20'# -> unI64 4288 + '\x2d21'# -> unI64 4289 + '\x2d22'# -> unI64 4290 + '\x2d23'# -> unI64 4291 + '\x2d24'# -> unI64 4292 + '\x2d25'# -> unI64 4293 + '\x2d27'# -> unI64 4295 + '\x2d2d'# -> unI64 4301 + '\xa641'# -> unI64 42560 + '\xa643'# -> unI64 42562 + '\xa645'# -> unI64 42564 + '\xa647'# -> unI64 42566 + '\xa649'# -> unI64 42568 + '\xa64b'# -> unI64 42570 + '\xa64d'# -> unI64 42572 + '\xa64f'# -> unI64 42574 + '\xa651'# -> unI64 42576 + '\xa653'# -> unI64 42578 + '\xa655'# -> unI64 42580 + '\xa657'# -> unI64 42582 + '\xa659'# -> unI64 42584 + '\xa65b'# -> unI64 42586 + '\xa65d'# -> unI64 42588 + '\xa65f'# -> unI64 42590 + '\xa661'# -> unI64 42592 + '\xa663'# -> unI64 42594 + '\xa665'# -> unI64 42596 + '\xa667'# -> unI64 42598 + '\xa669'# -> unI64 42600 + '\xa66b'# -> unI64 42602 + '\xa66d'# -> unI64 42604 + 
'\xa681'# -> unI64 42624 + '\xa683'# -> unI64 42626 + '\xa685'# -> unI64 42628 + '\xa687'# -> unI64 42630 + '\xa689'# -> unI64 42632 + '\xa68b'# -> unI64 42634 + '\xa68d'# -> unI64 42636 + '\xa68f'# -> unI64 42638 + '\xa691'# -> unI64 42640 + '\xa693'# -> unI64 42642 + '\xa695'# -> unI64 42644 + '\xa697'# -> unI64 42646 + '\xa699'# -> unI64 42648 + '\xa69b'# -> unI64 42650 + '\xa723'# -> unI64 42786 + '\xa725'# -> unI64 42788 + '\xa727'# -> unI64 42790 + '\xa729'# -> unI64 42792 + '\xa72b'# -> unI64 42794 + '\xa72d'# -> unI64 42796 + '\xa72f'# -> unI64 42798 + '\xa733'# -> unI64 42802 + '\xa735'# -> unI64 42804 + '\xa737'# -> unI64 42806 + '\xa739'# -> unI64 42808 + '\xa73b'# -> unI64 42810 + '\xa73d'# -> unI64 42812 + '\xa73f'# -> unI64 42814 + '\xa741'# -> unI64 42816 + '\xa743'# -> unI64 42818 + '\xa745'# -> unI64 42820 + '\xa747'# -> unI64 42822 + '\xa749'# -> unI64 42824 + '\xa74b'# -> unI64 42826 + '\xa74d'# -> unI64 42828 + '\xa74f'# -> unI64 42830 + '\xa751'# -> unI64 42832 + '\xa753'# -> unI64 42834 + '\xa755'# -> unI64 42836 + '\xa757'# -> unI64 42838 + '\xa759'# -> unI64 42840 + '\xa75b'# -> unI64 42842 + '\xa75d'# -> unI64 42844 + '\xa75f'# -> unI64 42846 + '\xa761'# -> unI64 42848 + '\xa763'# -> unI64 42850 + '\xa765'# -> unI64 42852 + '\xa767'# -> unI64 42854 + '\xa769'# -> unI64 42856 + '\xa76b'# -> unI64 42858 + '\xa76d'# -> unI64 42860 + '\xa76f'# -> unI64 42862 + '\xa77a'# -> unI64 42873 + '\xa77c'# -> unI64 42875 + '\xa77f'# -> unI64 42878 + '\xa781'# -> unI64 42880 + '\xa783'# -> unI64 42882 + '\xa785'# -> unI64 42884 + '\xa787'# -> unI64 42886 + '\xa78c'# -> unI64 42891 + '\xa791'# -> unI64 42896 + '\xa793'# -> unI64 42898 + '\xa794'# -> unI64 42948 + '\xa797'# -> unI64 42902 + '\xa799'# -> unI64 42904 + '\xa79b'# -> unI64 42906 + '\xa79d'# -> unI64 42908 + '\xa79f'# -> unI64 42910 + '\xa7a1'# -> unI64 42912 + '\xa7a3'# -> unI64 42914 + '\xa7a5'# -> unI64 42916 + '\xa7a7'# -> unI64 42918 + '\xa7a9'# -> unI64 42920 + '\xa7b5'# -> unI64 42932 + '\xa7b7'# -> unI64 42934 + '\xa7b9'# -> unI64 42936 + '\xa7bb'# -> unI64 42938 + '\xa7bd'# -> unI64 42940 + '\xa7bf'# -> unI64 42942 + '\xa7c3'# -> unI64 42946 + '\xab53'# -> unI64 42931 + '\xab70'# -> unI64 5024 + '\xab71'# -> unI64 5025 + '\xab72'# -> unI64 5026 + '\xab73'# -> unI64 5027 + '\xab74'# -> unI64 5028 + '\xab75'# -> unI64 5029 + '\xab76'# -> unI64 5030 + '\xab77'# -> unI64 5031 + '\xab78'# -> unI64 5032 + '\xab79'# -> unI64 5033 + '\xab7a'# -> unI64 5034 + '\xab7b'# -> unI64 5035 + '\xab7c'# -> unI64 5036 + '\xab7d'# -> unI64 5037 + '\xab7e'# -> unI64 5038 + '\xab7f'# -> unI64 5039 + '\xab80'# -> unI64 5040 + '\xab81'# -> unI64 5041 + '\xab82'# -> unI64 5042 + '\xab83'# -> unI64 5043 + '\xab84'# -> unI64 5044 + '\xab85'# -> unI64 5045 + '\xab86'# -> unI64 5046 + '\xab87'# -> unI64 5047 + '\xab88'# -> unI64 5048 + '\xab89'# -> unI64 5049 + '\xab8a'# -> unI64 5050 + '\xab8b'# -> unI64 5051 + '\xab8c'# -> unI64 5052 + '\xab8d'# -> unI64 5053 + '\xab8e'# -> unI64 5054 + '\xab8f'# -> unI64 5055 + '\xab90'# -> unI64 5056 + '\xab91'# -> unI64 5057 + '\xab92'# -> unI64 5058 + '\xab93'# -> unI64 5059 + '\xab94'# -> unI64 5060 + '\xab95'# -> unI64 5061 + '\xab96'# -> unI64 5062 + '\xab97'# -> unI64 5063 + '\xab98'# -> unI64 5064 + '\xab99'# -> unI64 5065 + '\xab9a'# -> unI64 5066 + '\xab9b'# -> unI64 5067 + '\xab9c'# -> unI64 5068 + '\xab9d'# -> unI64 5069 + '\xab9e'# -> unI64 5070 + '\xab9f'# -> unI64 5071 + '\xaba0'# -> unI64 5072 + '\xaba1'# -> unI64 5073 + '\xaba2'# -> unI64 5074 + '\xaba3'# -> unI64 5075 + '\xaba4'# -> 
unI64 5076 + '\xaba5'# -> unI64 5077 + '\xaba6'# -> unI64 5078 + '\xaba7'# -> unI64 5079 + '\xaba8'# -> unI64 5080 + '\xaba9'# -> unI64 5081 + '\xabaa'# -> unI64 5082 + '\xabab'# -> unI64 5083 + '\xabac'# -> unI64 5084 + '\xabad'# -> unI64 5085 + '\xabae'# -> unI64 5086 + '\xabaf'# -> unI64 5087 + '\xabb0'# -> unI64 5088 + '\xabb1'# -> unI64 5089 + '\xabb2'# -> unI64 5090 + '\xabb3'# -> unI64 5091 + '\xabb4'# -> unI64 5092 + '\xabb5'# -> unI64 5093 + '\xabb6'# -> unI64 5094 + '\xabb7'# -> unI64 5095 + '\xabb8'# -> unI64 5096 + '\xabb9'# -> unI64 5097 + '\xabba'# -> unI64 5098 + '\xabbb'# -> unI64 5099 + '\xabbc'# -> unI64 5100 + '\xabbd'# -> unI64 5101 + '\xabbe'# -> unI64 5102 + '\xabbf'# -> unI64 5103 + '\xff41'# -> unI64 65313 + '\xff42'# -> unI64 65314 + '\xff43'# -> unI64 65315 + '\xff44'# -> unI64 65316 + '\xff45'# -> unI64 65317 + '\xff46'# -> unI64 65318 + '\xff47'# -> unI64 65319 + '\xff48'# -> unI64 65320 + '\xff49'# -> unI64 65321 + '\xff4a'# -> unI64 65322 + '\xff4b'# -> unI64 65323 + '\xff4c'# -> unI64 65324 + '\xff4d'# -> unI64 65325 + '\xff4e'# -> unI64 65326 + '\xff4f'# -> unI64 65327 + '\xff50'# -> unI64 65328 + '\xff51'# -> unI64 65329 + '\xff52'# -> unI64 65330 + '\xff53'# -> unI64 65331 + '\xff54'# -> unI64 65332 + '\xff55'# -> unI64 65333 + '\xff56'# -> unI64 65334 + '\xff57'# -> unI64 65335 + '\xff58'# -> unI64 65336 + '\xff59'# -> unI64 65337 + '\xff5a'# -> unI64 65338 + '\x10428'# -> unI64 66560 + '\x10429'# -> unI64 66561 + '\x1042a'# -> unI64 66562 + '\x1042b'# -> unI64 66563 + '\x1042c'# -> unI64 66564 + '\x1042d'# -> unI64 66565 + '\x1042e'# -> unI64 66566 + '\x1042f'# -> unI64 66567 + '\x10430'# -> unI64 66568 + '\x10431'# -> unI64 66569 + '\x10432'# -> unI64 66570 + '\x10433'# -> unI64 66571 + '\x10434'# -> unI64 66572 + '\x10435'# -> unI64 66573 + '\x10436'# -> unI64 66574 + '\x10437'# -> unI64 66575 + '\x10438'# -> unI64 66576 + '\x10439'# -> unI64 66577 + '\x1043a'# -> unI64 66578 + '\x1043b'# -> unI64 66579 + '\x1043c'# -> unI64 66580 + '\x1043d'# -> unI64 66581 + '\x1043e'# -> unI64 66582 + '\x1043f'# -> unI64 66583 + '\x10440'# -> unI64 66584 + '\x10441'# -> unI64 66585 + '\x10442'# -> unI64 66586 + '\x10443'# -> unI64 66587 + '\x10444'# -> unI64 66588 + '\x10445'# -> unI64 66589 + '\x10446'# -> unI64 66590 + '\x10447'# -> unI64 66591 + '\x10448'# -> unI64 66592 + '\x10449'# -> unI64 66593 + '\x1044a'# -> unI64 66594 + '\x1044b'# -> unI64 66595 + '\x1044c'# -> unI64 66596 + '\x1044d'# -> unI64 66597 + '\x1044e'# -> unI64 66598 + '\x1044f'# -> unI64 66599 + '\x104d8'# -> unI64 66736 + '\x104d9'# -> unI64 66737 + '\x104da'# -> unI64 66738 + '\x104db'# -> unI64 66739 + '\x104dc'# -> unI64 66740 + '\x104dd'# -> unI64 66741 + '\x104de'# -> unI64 66742 + '\x104df'# -> unI64 66743 + '\x104e0'# -> unI64 66744 + '\x104e1'# -> unI64 66745 + '\x104e2'# -> unI64 66746 + '\x104e3'# -> unI64 66747 + '\x104e4'# -> unI64 66748 + '\x104e5'# -> unI64 66749 + '\x104e6'# -> unI64 66750 + '\x104e7'# -> unI64 66751 + '\x104e8'# -> unI64 66752 + '\x104e9'# -> unI64 66753 + '\x104ea'# -> unI64 66754 + '\x104eb'# -> unI64 66755 + '\x104ec'# -> unI64 66756 + '\x104ed'# -> unI64 66757 + '\x104ee'# -> unI64 66758 + '\x104ef'# -> unI64 66759 + '\x104f0'# -> unI64 66760 + '\x104f1'# -> unI64 66761 + '\x104f2'# -> unI64 66762 + '\x104f3'# -> unI64 66763 + '\x104f4'# -> unI64 66764 + '\x104f5'# -> unI64 66765 + '\x104f6'# -> unI64 66766 + '\x104f7'# -> unI64 66767 + '\x104f8'# -> unI64 66768 + '\x104f9'# -> unI64 66769 + '\x104fa'# -> unI64 66770 + '\x104fb'# -> unI64 66771 + 
'\x10cc0'# -> unI64 68736 + '\x10cc1'# -> unI64 68737 + '\x10cc2'# -> unI64 68738 + '\x10cc3'# -> unI64 68739 + '\x10cc4'# -> unI64 68740 + '\x10cc5'# -> unI64 68741 + '\x10cc6'# -> unI64 68742 + '\x10cc7'# -> unI64 68743 + '\x10cc8'# -> unI64 68744 + '\x10cc9'# -> unI64 68745 + '\x10cca'# -> unI64 68746 + '\x10ccb'# -> unI64 68747 + '\x10ccc'# -> unI64 68748 + '\x10ccd'# -> unI64 68749 + '\x10cce'# -> unI64 68750 + '\x10ccf'# -> unI64 68751 + '\x10cd0'# -> unI64 68752 + '\x10cd1'# -> unI64 68753 + '\x10cd2'# -> unI64 68754 + '\x10cd3'# -> unI64 68755 + '\x10cd4'# -> unI64 68756 + '\x10cd5'# -> unI64 68757 + '\x10cd6'# -> unI64 68758 + '\x10cd7'# -> unI64 68759 + '\x10cd8'# -> unI64 68760 + '\x10cd9'# -> unI64 68761 + '\x10cda'# -> unI64 68762 + '\x10cdb'# -> unI64 68763 + '\x10cdc'# -> unI64 68764 + '\x10cdd'# -> unI64 68765 + '\x10cde'# -> unI64 68766 + '\x10cdf'# -> unI64 68767 + '\x10ce0'# -> unI64 68768 + '\x10ce1'# -> unI64 68769 + '\x10ce2'# -> unI64 68770 + '\x10ce3'# -> unI64 68771 + '\x10ce4'# -> unI64 68772 + '\x10ce5'# -> unI64 68773 + '\x10ce6'# -> unI64 68774 + '\x10ce7'# -> unI64 68775 + '\x10ce8'# -> unI64 68776 + '\x10ce9'# -> unI64 68777 + '\x10cea'# -> unI64 68778 + '\x10ceb'# -> unI64 68779 + '\x10cec'# -> unI64 68780 + '\x10ced'# -> unI64 68781 + '\x10cee'# -> unI64 68782 + '\x10cef'# -> unI64 68783 + '\x10cf0'# -> unI64 68784 + '\x10cf1'# -> unI64 68785 + '\x10cf2'# -> unI64 68786 + '\x118c0'# -> unI64 71840 + '\x118c1'# -> unI64 71841 + '\x118c2'# -> unI64 71842 + '\x118c3'# -> unI64 71843 + '\x118c4'# -> unI64 71844 + '\x118c5'# -> unI64 71845 + '\x118c6'# -> unI64 71846 + '\x118c7'# -> unI64 71847 + '\x118c8'# -> unI64 71848 + '\x118c9'# -> unI64 71849 + '\x118ca'# -> unI64 71850 + '\x118cb'# -> unI64 71851 + '\x118cc'# -> unI64 71852 + '\x118cd'# -> unI64 71853 + '\x118ce'# -> unI64 71854 + '\x118cf'# -> unI64 71855 + '\x118d0'# -> unI64 71856 + '\x118d1'# -> unI64 71857 + '\x118d2'# -> unI64 71858 + '\x118d3'# -> unI64 71859 + '\x118d4'# -> unI64 71860 + '\x118d5'# -> unI64 71861 + '\x118d6'# -> unI64 71862 + '\x118d7'# -> unI64 71863 + '\x118d8'# -> unI64 71864 + '\x118d9'# -> unI64 71865 + '\x118da'# -> unI64 71866 + '\x118db'# -> unI64 71867 + '\x118dc'# -> unI64 71868 + '\x118dd'# -> unI64 71869 + '\x118de'# -> unI64 71870 + '\x118df'# -> unI64 71871 + '\x16e60'# -> unI64 93760 + '\x16e61'# -> unI64 93761 + '\x16e62'# -> unI64 93762 + '\x16e63'# -> unI64 93763 + '\x16e64'# -> unI64 93764 + '\x16e65'# -> unI64 93765 + '\x16e66'# -> unI64 93766 + '\x16e67'# -> unI64 93767 + '\x16e68'# -> unI64 93768 + '\x16e69'# -> unI64 93769 + '\x16e6a'# -> unI64 93770 + '\x16e6b'# -> unI64 93771 + '\x16e6c'# -> unI64 93772 + '\x16e6d'# -> unI64 93773 + '\x16e6e'# -> unI64 93774 + '\x16e6f'# -> unI64 93775 + '\x16e70'# -> unI64 93776 + '\x16e71'# -> unI64 93777 + '\x16e72'# -> unI64 93778 + '\x16e73'# -> unI64 93779 + '\x16e74'# -> unI64 93780 + '\x16e75'# -> unI64 93781 + '\x16e76'# -> unI64 93782 + '\x16e77'# -> unI64 93783 + '\x16e78'# -> unI64 93784 + '\x16e79'# -> unI64 93785 + '\x16e7a'# -> unI64 93786 + '\x16e7b'# -> unI64 93787 + '\x16e7c'# -> unI64 93788 + '\x16e7d'# -> unI64 93789 + '\x16e7e'# -> unI64 93790 + '\x16e7f'# -> unI64 93791 + '\x1e922'# -> unI64 125184 + '\x1e923'# -> unI64 125185 + '\x1e924'# -> unI64 125186 + '\x1e925'# -> unI64 125187 + '\x1e926'# -> unI64 125188 + '\x1e927'# -> unI64 125189 + '\x1e928'# -> unI64 125190 + '\x1e929'# -> unI64 125191 + '\x1e92a'# -> unI64 125192 + '\x1e92b'# -> unI64 125193 + '\x1e92c'# -> unI64 125194 + '\x1e92d'# -> 
unI64 125195 + '\x1e92e'# -> unI64 125196 + '\x1e92f'# -> unI64 125197 + '\x1e930'# -> unI64 125198 + '\x1e931'# -> unI64 125199 + '\x1e932'# -> unI64 125200 + '\x1e933'# -> unI64 125201 + '\x1e934'# -> unI64 125202 + '\x1e935'# -> unI64 125203 + '\x1e936'# -> unI64 125204 + '\x1e937'# -> unI64 125205 + '\x1e938'# -> unI64 125206 + '\x1e939'# -> unI64 125207 + '\x1e93a'# -> unI64 125208 + '\x1e93b'# -> unI64 125209 + '\x1e93c'# -> unI64 125210 + '\x1e93d'# -> unI64 125211 + '\x1e93e'# -> unI64 125212 + '\x1e93f'# -> unI64 125213 + '\x1e940'# -> unI64 125214 + '\x1e941'# -> unI64 125215 + '\x1e942'# -> unI64 125216 + '\x1e943'# -> unI64 125217 + _ -> unI64 0 +foldMapping :: Char# -> _ {- unboxed Int64 -} {-# NOINLINE foldMapping #-} --- MICRO SIGN -foldMapping '\x00b5' s = Yield '\x03bc' (CC s '\x0000' '\x0000') --- LATIN SMALL LETTER SHARP S -foldMapping '\x00df' s = Yield '\x0073' (CC s '\x0073' '\x0000') --- LATIN CAPITAL LETTER I WITH DOT ABOVE -foldMapping '\x0130' s = Yield '\x0069' (CC s '\x0307' '\x0000') --- LATIN SMALL LETTER N PRECEDED BY APOSTROPHE -foldMapping '\x0149' s = Yield '\x02bc' (CC s '\x006e' '\x0000') --- LATIN SMALL LETTER LONG S -foldMapping '\x017f' s = Yield '\x0073' (CC s '\x0000' '\x0000') --- LATIN SMALL LETTER J WITH CARON -foldMapping '\x01f0' s = Yield '\x006a' (CC s '\x030c' '\x0000') --- COMBINING GREEK YPOGEGRAMMENI -foldMapping '\x0345' s = Yield '\x03b9' (CC s '\x0000' '\x0000') --- GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS -foldMapping '\x0390' s = Yield '\x03b9' (CC s '\x0308' '\x0301') --- GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS -foldMapping '\x03b0' s = Yield '\x03c5' (CC s '\x0308' '\x0301') --- GREEK SMALL LETTER FINAL SIGMA -foldMapping '\x03c2' s = Yield '\x03c3' (CC s '\x0000' '\x0000') --- GREEK BETA SYMBOL -foldMapping '\x03d0' s = Yield '\x03b2' (CC s '\x0000' '\x0000') --- GREEK THETA SYMBOL -foldMapping '\x03d1' s = Yield '\x03b8' (CC s '\x0000' '\x0000') --- GREEK PHI SYMBOL -foldMapping '\x03d5' s = Yield '\x03c6' (CC s '\x0000' '\x0000') --- GREEK PI SYMBOL -foldMapping '\x03d6' s = Yield '\x03c0' (CC s '\x0000' '\x0000') --- GREEK KAPPA SYMBOL -foldMapping '\x03f0' s = Yield '\x03ba' (CC s '\x0000' '\x0000') --- GREEK RHO SYMBOL -foldMapping '\x03f1' s = Yield '\x03c1' (CC s '\x0000' '\x0000') --- GREEK LUNATE EPSILON SYMBOL -foldMapping '\x03f5' s = Yield '\x03b5' (CC s '\x0000' '\x0000') --- ARMENIAN SMALL LIGATURE ECH YIWN -foldMapping '\x0587' s = Yield '\x0565' (CC s '\x0582' '\x0000') --- CHEROKEE SMALL LETTER YE -foldMapping '\x13f8' s = Yield '\x13f0' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER YI -foldMapping '\x13f9' s = Yield '\x13f1' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER YO -foldMapping '\x13fa' s = Yield '\x13f2' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER YU -foldMapping '\x13fb' s = Yield '\x13f3' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER YV -foldMapping '\x13fc' s = Yield '\x13f4' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER MV -foldMapping '\x13fd' s = Yield '\x13f5' (CC s '\x0000' '\x0000') --- CYRILLIC SMALL LETTER ROUNDED VE -foldMapping '\x1c80' s = Yield '\x0432' (CC s '\x0000' '\x0000') --- CYRILLIC SMALL LETTER LONG-LEGGED DE -foldMapping '\x1c81' s = Yield '\x0434' (CC s '\x0000' '\x0000') --- CYRILLIC SMALL LETTER NARROW O -foldMapping '\x1c82' s = Yield '\x043e' (CC s '\x0000' '\x0000') --- CYRILLIC SMALL LETTER WIDE ES -foldMapping '\x1c83' s = Yield '\x0441' (CC s '\x0000' '\x0000') --- CYRILLIC SMALL LETTER TALL TE -foldMapping '\x1c84' s = 
Yield '\x0442' (CC s '\x0000' '\x0000') --- CYRILLIC SMALL LETTER THREE-LEGGED TE -foldMapping '\x1c85' s = Yield '\x0442' (CC s '\x0000' '\x0000') --- CYRILLIC SMALL LETTER TALL HARD SIGN -foldMapping '\x1c86' s = Yield '\x044a' (CC s '\x0000' '\x0000') --- CYRILLIC SMALL LETTER TALL YAT -foldMapping '\x1c87' s = Yield '\x0463' (CC s '\x0000' '\x0000') --- CYRILLIC SMALL LETTER UNBLENDED UK -foldMapping '\x1c88' s = Yield '\xa64b' (CC s '\x0000' '\x0000') --- LATIN SMALL LETTER H WITH LINE BELOW -foldMapping '\x1e96' s = Yield '\x0068' (CC s '\x0331' '\x0000') --- LATIN SMALL LETTER T WITH DIAERESIS -foldMapping '\x1e97' s = Yield '\x0074' (CC s '\x0308' '\x0000') --- LATIN SMALL LETTER W WITH RING ABOVE -foldMapping '\x1e98' s = Yield '\x0077' (CC s '\x030a' '\x0000') --- LATIN SMALL LETTER Y WITH RING ABOVE -foldMapping '\x1e99' s = Yield '\x0079' (CC s '\x030a' '\x0000') --- LATIN SMALL LETTER A WITH RIGHT HALF RING -foldMapping '\x1e9a' s = Yield '\x0061' (CC s '\x02be' '\x0000') --- LATIN SMALL LETTER LONG S WITH DOT ABOVE -foldMapping '\x1e9b' s = Yield '\x1e61' (CC s '\x0000' '\x0000') --- LATIN CAPITAL LETTER SHARP S -foldMapping '\x1e9e' s = Yield '\x0073' (CC s '\x0073' '\x0000') --- GREEK SMALL LETTER UPSILON WITH PSILI -foldMapping '\x1f50' s = Yield '\x03c5' (CC s '\x0313' '\x0000') --- GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA -foldMapping '\x1f52' s = Yield '\x03c5' (CC s '\x0313' '\x0300') --- GREEK SMALL LETTER UPSILON WITH PSILI AND OXIA -foldMapping '\x1f54' s = Yield '\x03c5' (CC s '\x0313' '\x0301') --- GREEK SMALL LETTER UPSILON WITH PSILI AND PERISPOMENI -foldMapping '\x1f56' s = Yield '\x03c5' (CC s '\x0313' '\x0342') --- GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI -foldMapping '\x1f80' s = Yield '\x1f00' (CC s '\x03b9' '\x0000') --- GREEK SMALL LETTER ALPHA WITH DASIA AND YPOGEGRAMMENI -foldMapping '\x1f81' s = Yield '\x1f01' (CC s '\x03b9' '\x0000') --- GREEK SMALL LETTER ALPHA WITH PSILI AND VARIA AND YPOGEGRAMMENI -foldMapping '\x1f82' s = Yield '\x1f02' (CC s '\x03b9' '\x0000') --- GREEK SMALL LETTER ALPHA WITH DASIA AND VARIA AND YPOGEGRAMMENI -foldMapping '\x1f83' s = Yield '\x1f03' (CC s '\x03b9' '\x0000') --- GREEK SMALL LETTER ALPHA WITH PSILI AND OXIA AND YPOGEGRAMMENI -foldMapping '\x1f84' s = Yield '\x1f04' (CC s '\x03b9' '\x0000') --- GREEK SMALL LETTER ALPHA WITH DASIA AND OXIA AND YPOGEGRAMMENI -foldMapping '\x1f85' s = Yield '\x1f05' (CC s '\x03b9' '\x0000') --- GREEK SMALL LETTER ALPHA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI -foldMapping '\x1f86' s = Yield '\x1f06' (CC s '\x03b9' '\x0000') --- GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI -foldMapping '\x1f87' s = Yield '\x1f07' (CC s '\x03b9' '\x0000') --- GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI -foldMapping '\x1f88' s = Yield '\x1f00' (CC s '\x03b9' '\x0000') --- GREEK CAPITAL LETTER ALPHA WITH DASIA AND PROSGEGRAMMENI -foldMapping '\x1f89' s = Yield '\x1f01' (CC s '\x03b9' '\x0000') --- GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA AND PROSGEGRAMMENI -foldMapping '\x1f8a' s = Yield '\x1f02' (CC s '\x03b9' '\x0000') --- GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA AND PROSGEGRAMMENI -foldMapping '\x1f8b' s = Yield '\x1f03' (CC s '\x03b9' '\x0000') --- GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA AND PROSGEGRAMMENI -foldMapping '\x1f8c' s = Yield '\x1f04' (CC s '\x03b9' '\x0000') --- GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA AND PROSGEGRAMMENI -foldMapping '\x1f8d' s = Yield '\x1f05' (CC s '\x03b9' '\x0000') --- GREEK 
CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI -foldMapping '\x1f8e' s = Yield '\x1f06' (CC s '\x03b9' '\x0000') --- GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI -foldMapping '\x1f8f' s = Yield '\x1f07' (CC s '\x03b9' '\x0000') --- GREEK SMALL LETTER ETA WITH PSILI AND YPOGEGRAMMENI -foldMapping '\x1f90' s = Yield '\x1f20' (CC s '\x03b9' '\x0000') --- GREEK SMALL LETTER ETA WITH DASIA AND YPOGEGRAMMENI -foldMapping '\x1f91' s = Yield '\x1f21' (CC s '\x03b9' '\x0000') --- GREEK SMALL LETTER ETA WITH PSILI AND VARIA AND YPOGEGRAMMENI -foldMapping '\x1f92' s = Yield '\x1f22' (CC s '\x03b9' '\x0000') --- GREEK SMALL LETTER ETA WITH DASIA AND VARIA AND YPOGEGRAMMENI -foldMapping '\x1f93' s = Yield '\x1f23' (CC s '\x03b9' '\x0000') --- GREEK SMALL LETTER ETA WITH PSILI AND OXIA AND YPOGEGRAMMENI -foldMapping '\x1f94' s = Yield '\x1f24' (CC s '\x03b9' '\x0000') --- GREEK SMALL LETTER ETA WITH DASIA AND OXIA AND YPOGEGRAMMENI -foldMapping '\x1f95' s = Yield '\x1f25' (CC s '\x03b9' '\x0000') --- GREEK SMALL LETTER ETA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI -foldMapping '\x1f96' s = Yield '\x1f26' (CC s '\x03b9' '\x0000') --- GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI -foldMapping '\x1f97' s = Yield '\x1f27' (CC s '\x03b9' '\x0000') --- GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI -foldMapping '\x1f98' s = Yield '\x1f20' (CC s '\x03b9' '\x0000') --- GREEK CAPITAL LETTER ETA WITH DASIA AND PROSGEGRAMMENI -foldMapping '\x1f99' s = Yield '\x1f21' (CC s '\x03b9' '\x0000') --- GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA AND PROSGEGRAMMENI -foldMapping '\x1f9a' s = Yield '\x1f22' (CC s '\x03b9' '\x0000') --- GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA AND PROSGEGRAMMENI -foldMapping '\x1f9b' s = Yield '\x1f23' (CC s '\x03b9' '\x0000') --- GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA AND PROSGEGRAMMENI -foldMapping '\x1f9c' s = Yield '\x1f24' (CC s '\x03b9' '\x0000') --- GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI -foldMapping '\x1f9d' s = Yield '\x1f25' (CC s '\x03b9' '\x0000') --- GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI -foldMapping '\x1f9e' s = Yield '\x1f26' (CC s '\x03b9' '\x0000') --- GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI -foldMapping '\x1f9f' s = Yield '\x1f27' (CC s '\x03b9' '\x0000') --- GREEK SMALL LETTER OMEGA WITH PSILI AND YPOGEGRAMMENI -foldMapping '\x1fa0' s = Yield '\x1f60' (CC s '\x03b9' '\x0000') --- GREEK SMALL LETTER OMEGA WITH DASIA AND YPOGEGRAMMENI -foldMapping '\x1fa1' s = Yield '\x1f61' (CC s '\x03b9' '\x0000') --- GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA AND YPOGEGRAMMENI -foldMapping '\x1fa2' s = Yield '\x1f62' (CC s '\x03b9' '\x0000') --- GREEK SMALL LETTER OMEGA WITH DASIA AND VARIA AND YPOGEGRAMMENI -foldMapping '\x1fa3' s = Yield '\x1f63' (CC s '\x03b9' '\x0000') --- GREEK SMALL LETTER OMEGA WITH PSILI AND OXIA AND YPOGEGRAMMENI -foldMapping '\x1fa4' s = Yield '\x1f64' (CC s '\x03b9' '\x0000') --- GREEK SMALL LETTER OMEGA WITH DASIA AND OXIA AND YPOGEGRAMMENI -foldMapping '\x1fa5' s = Yield '\x1f65' (CC s '\x03b9' '\x0000') --- GREEK SMALL LETTER OMEGA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI -foldMapping '\x1fa6' s = Yield '\x1f66' (CC s '\x03b9' '\x0000') --- GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI -foldMapping '\x1fa7' s = Yield '\x1f67' (CC s '\x03b9' '\x0000') --- GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI -foldMapping '\x1fa8' s = Yield 
'\x1f60' (CC s '\x03b9' '\x0000') --- GREEK CAPITAL LETTER OMEGA WITH DASIA AND PROSGEGRAMMENI -foldMapping '\x1fa9' s = Yield '\x1f61' (CC s '\x03b9' '\x0000') --- GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA AND PROSGEGRAMMENI -foldMapping '\x1faa' s = Yield '\x1f62' (CC s '\x03b9' '\x0000') --- GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA AND PROSGEGRAMMENI -foldMapping '\x1fab' s = Yield '\x1f63' (CC s '\x03b9' '\x0000') --- GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA AND PROSGEGRAMMENI -foldMapping '\x1fac' s = Yield '\x1f64' (CC s '\x03b9' '\x0000') --- GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA AND PROSGEGRAMMENI -foldMapping '\x1fad' s = Yield '\x1f65' (CC s '\x03b9' '\x0000') --- GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI -foldMapping '\x1fae' s = Yield '\x1f66' (CC s '\x03b9' '\x0000') --- GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI -foldMapping '\x1faf' s = Yield '\x1f67' (CC s '\x03b9' '\x0000') --- GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI -foldMapping '\x1fb2' s = Yield '\x1f70' (CC s '\x03b9' '\x0000') --- GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI -foldMapping '\x1fb3' s = Yield '\x03b1' (CC s '\x03b9' '\x0000') --- GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI -foldMapping '\x1fb4' s = Yield '\x03ac' (CC s '\x03b9' '\x0000') --- GREEK SMALL LETTER ALPHA WITH PERISPOMENI -foldMapping '\x1fb6' s = Yield '\x03b1' (CC s '\x0342' '\x0000') --- GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI -foldMapping '\x1fb7' s = Yield '\x03b1' (CC s '\x0342' '\x03b9') --- GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI -foldMapping '\x1fbc' s = Yield '\x03b1' (CC s '\x03b9' '\x0000') --- GREEK PROSGEGRAMMENI -foldMapping '\x1fbe' s = Yield '\x03b9' (CC s '\x0000' '\x0000') --- GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI -foldMapping '\x1fc2' s = Yield '\x1f74' (CC s '\x03b9' '\x0000') --- GREEK SMALL LETTER ETA WITH YPOGEGRAMMENI -foldMapping '\x1fc3' s = Yield '\x03b7' (CC s '\x03b9' '\x0000') --- GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI -foldMapping '\x1fc4' s = Yield '\x03ae' (CC s '\x03b9' '\x0000') --- GREEK SMALL LETTER ETA WITH PERISPOMENI -foldMapping '\x1fc6' s = Yield '\x03b7' (CC s '\x0342' '\x0000') --- GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI -foldMapping '\x1fc7' s = Yield '\x03b7' (CC s '\x0342' '\x03b9') --- GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI -foldMapping '\x1fcc' s = Yield '\x03b7' (CC s '\x03b9' '\x0000') --- GREEK SMALL LETTER IOTA WITH DIALYTIKA AND VARIA -foldMapping '\x1fd2' s = Yield '\x03b9' (CC s '\x0308' '\x0300') --- GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA -foldMapping '\x1fd3' s = Yield '\x03b9' (CC s '\x0308' '\x0301') --- GREEK SMALL LETTER IOTA WITH PERISPOMENI -foldMapping '\x1fd6' s = Yield '\x03b9' (CC s '\x0342' '\x0000') --- GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI -foldMapping '\x1fd7' s = Yield '\x03b9' (CC s '\x0308' '\x0342') --- GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND VARIA -foldMapping '\x1fe2' s = Yield '\x03c5' (CC s '\x0308' '\x0300') --- GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA -foldMapping '\x1fe3' s = Yield '\x03c5' (CC s '\x0308' '\x0301') --- GREEK SMALL LETTER RHO WITH PSILI -foldMapping '\x1fe4' s = Yield '\x03c1' (CC s '\x0313' '\x0000') --- GREEK SMALL LETTER UPSILON WITH PERISPOMENI -foldMapping '\x1fe6' s = Yield '\x03c5' (CC s '\x0342' '\x0000') --- GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND PERISPOMENI -foldMapping '\x1fe7' s = Yield '\x03c5' (CC s 
'\x0308' '\x0342') --- GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI -foldMapping '\x1ff2' s = Yield '\x1f7c' (CC s '\x03b9' '\x0000') --- GREEK SMALL LETTER OMEGA WITH YPOGEGRAMMENI -foldMapping '\x1ff3' s = Yield '\x03c9' (CC s '\x03b9' '\x0000') --- GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI -foldMapping '\x1ff4' s = Yield '\x03ce' (CC s '\x03b9' '\x0000') --- GREEK SMALL LETTER OMEGA WITH PERISPOMENI -foldMapping '\x1ff6' s = Yield '\x03c9' (CC s '\x0342' '\x0000') --- GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI -foldMapping '\x1ff7' s = Yield '\x03c9' (CC s '\x0342' '\x03b9') --- GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI -foldMapping '\x1ffc' s = Yield '\x03c9' (CC s '\x03b9' '\x0000') --- LATIN CAPITAL LETTER D WITH SHORT STROKE OVERLAY -foldMapping '\xa7c7' s = Yield '\xa7c8' (CC s '\x0000' '\x0000') --- LATIN CAPITAL LETTER S WITH SHORT STROKE OVERLAY -foldMapping '\xa7c9' s = Yield '\xa7ca' (CC s '\x0000' '\x0000') --- LATIN CAPITAL LETTER REVERSED HALF H -foldMapping '\xa7f5' s = Yield '\xa7f6' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER A -foldMapping '\xab70' s = Yield '\x13a0' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER E -foldMapping '\xab71' s = Yield '\x13a1' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER I -foldMapping '\xab72' s = Yield '\x13a2' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER O -foldMapping '\xab73' s = Yield '\x13a3' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER U -foldMapping '\xab74' s = Yield '\x13a4' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER V -foldMapping '\xab75' s = Yield '\x13a5' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER GA -foldMapping '\xab76' s = Yield '\x13a6' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER KA -foldMapping '\xab77' s = Yield '\x13a7' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER GE -foldMapping '\xab78' s = Yield '\x13a8' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER GI -foldMapping '\xab79' s = Yield '\x13a9' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER GO -foldMapping '\xab7a' s = Yield '\x13aa' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER GU -foldMapping '\xab7b' s = Yield '\x13ab' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER GV -foldMapping '\xab7c' s = Yield '\x13ac' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER HA -foldMapping '\xab7d' s = Yield '\x13ad' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER HE -foldMapping '\xab7e' s = Yield '\x13ae' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER HI -foldMapping '\xab7f' s = Yield '\x13af' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER HO -foldMapping '\xab80' s = Yield '\x13b0' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER HU -foldMapping '\xab81' s = Yield '\x13b1' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER HV -foldMapping '\xab82' s = Yield '\x13b2' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER LA -foldMapping '\xab83' s = Yield '\x13b3' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER LE -foldMapping '\xab84' s = Yield '\x13b4' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER LI -foldMapping '\xab85' s = Yield '\x13b5' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER LO -foldMapping '\xab86' s = Yield '\x13b6' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER LU -foldMapping '\xab87' s = Yield '\x13b7' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER LV -foldMapping '\xab88' s = Yield '\x13b8' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER MA -foldMapping '\xab89' s = Yield '\x13b9' (CC s '\x0000' '\x0000') --- 
CHEROKEE SMALL LETTER ME -foldMapping '\xab8a' s = Yield '\x13ba' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER MI -foldMapping '\xab8b' s = Yield '\x13bb' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER MO -foldMapping '\xab8c' s = Yield '\x13bc' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER MU -foldMapping '\xab8d' s = Yield '\x13bd' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER NA -foldMapping '\xab8e' s = Yield '\x13be' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER HNA -foldMapping '\xab8f' s = Yield '\x13bf' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER NAH -foldMapping '\xab90' s = Yield '\x13c0' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER NE -foldMapping '\xab91' s = Yield '\x13c1' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER NI -foldMapping '\xab92' s = Yield '\x13c2' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER NO -foldMapping '\xab93' s = Yield '\x13c3' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER NU -foldMapping '\xab94' s = Yield '\x13c4' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER NV -foldMapping '\xab95' s = Yield '\x13c5' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER QUA -foldMapping '\xab96' s = Yield '\x13c6' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER QUE -foldMapping '\xab97' s = Yield '\x13c7' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER QUI -foldMapping '\xab98' s = Yield '\x13c8' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER QUO -foldMapping '\xab99' s = Yield '\x13c9' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER QUU -foldMapping '\xab9a' s = Yield '\x13ca' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER QUV -foldMapping '\xab9b' s = Yield '\x13cb' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER SA -foldMapping '\xab9c' s = Yield '\x13cc' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER S -foldMapping '\xab9d' s = Yield '\x13cd' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER SE -foldMapping '\xab9e' s = Yield '\x13ce' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER SI -foldMapping '\xab9f' s = Yield '\x13cf' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER SO -foldMapping '\xaba0' s = Yield '\x13d0' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER SU -foldMapping '\xaba1' s = Yield '\x13d1' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER SV -foldMapping '\xaba2' s = Yield '\x13d2' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER DA -foldMapping '\xaba3' s = Yield '\x13d3' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER TA -foldMapping '\xaba4' s = Yield '\x13d4' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER DE -foldMapping '\xaba5' s = Yield '\x13d5' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER TE -foldMapping '\xaba6' s = Yield '\x13d6' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER DI -foldMapping '\xaba7' s = Yield '\x13d7' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER TI -foldMapping '\xaba8' s = Yield '\x13d8' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER DO -foldMapping '\xaba9' s = Yield '\x13d9' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER DU -foldMapping '\xabaa' s = Yield '\x13da' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER DV -foldMapping '\xabab' s = Yield '\x13db' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER DLA -foldMapping '\xabac' s = Yield '\x13dc' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER TLA -foldMapping '\xabad' s = Yield '\x13dd' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER TLE -foldMapping '\xabae' s = Yield '\x13de' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER TLI 
-foldMapping '\xabaf' s = Yield '\x13df' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER TLO -foldMapping '\xabb0' s = Yield '\x13e0' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER TLU -foldMapping '\xabb1' s = Yield '\x13e1' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER TLV -foldMapping '\xabb2' s = Yield '\x13e2' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER TSA -foldMapping '\xabb3' s = Yield '\x13e3' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER TSE -foldMapping '\xabb4' s = Yield '\x13e4' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER TSI -foldMapping '\xabb5' s = Yield '\x13e5' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER TSO -foldMapping '\xabb6' s = Yield '\x13e6' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER TSU -foldMapping '\xabb7' s = Yield '\x13e7' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER TSV -foldMapping '\xabb8' s = Yield '\x13e8' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER WA -foldMapping '\xabb9' s = Yield '\x13e9' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER WE -foldMapping '\xabba' s = Yield '\x13ea' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER WI -foldMapping '\xabbb' s = Yield '\x13eb' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER WO -foldMapping '\xabbc' s = Yield '\x13ec' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER WU -foldMapping '\xabbd' s = Yield '\x13ed' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER WV -foldMapping '\xabbe' s = Yield '\x13ee' (CC s '\x0000' '\x0000') --- CHEROKEE SMALL LETTER YA -foldMapping '\xabbf' s = Yield '\x13ef' (CC s '\x0000' '\x0000') --- LATIN SMALL LIGATURE FF -foldMapping '\xfb00' s = Yield '\x0066' (CC s '\x0066' '\x0000') --- LATIN SMALL LIGATURE FI -foldMapping '\xfb01' s = Yield '\x0066' (CC s '\x0069' '\x0000') --- LATIN SMALL LIGATURE FL -foldMapping '\xfb02' s = Yield '\x0066' (CC s '\x006c' '\x0000') --- LATIN SMALL LIGATURE FFI -foldMapping '\xfb03' s = Yield '\x0066' (CC s '\x0066' '\x0069') --- LATIN SMALL LIGATURE FFL -foldMapping '\xfb04' s = Yield '\x0066' (CC s '\x0066' '\x006c') --- LATIN SMALL LIGATURE LONG S T -foldMapping '\xfb05' s = Yield '\x0073' (CC s '\x0074' '\x0000') --- LATIN SMALL LIGATURE ST -foldMapping '\xfb06' s = Yield '\x0073' (CC s '\x0074' '\x0000') --- ARMENIAN SMALL LIGATURE MEN NOW -foldMapping '\xfb13' s = Yield '\x0574' (CC s '\x0576' '\x0000') --- ARMENIAN SMALL LIGATURE MEN ECH -foldMapping '\xfb14' s = Yield '\x0574' (CC s '\x0565' '\x0000') --- ARMENIAN SMALL LIGATURE MEN INI -foldMapping '\xfb15' s = Yield '\x0574' (CC s '\x056b' '\x0000') --- ARMENIAN SMALL LIGATURE VEW NOW -foldMapping '\xfb16' s = Yield '\x057e' (CC s '\x0576' '\x0000') --- ARMENIAN SMALL LIGATURE MEN XEH -foldMapping '\xfb17' s = Yield '\x0574' (CC s '\x056d' '\x0000') -foldMapping c s = Yield (toLower c) (CC s '\0' '\0') +foldMapping = \case + -- MICRO SIGN + '\x00b5'# -> unI64 956 + -- LATIN SMALL LETTER SHARP S + '\x00df'# -> unI64 241172595 + -- LATIN CAPITAL LETTER I WITH DOT ABOVE + '\x0130'# -> unI64 1625292905 + -- LATIN SMALL LETTER N PRECEDED BY APOSTROPHE + '\x0149'# -> unI64 230687420 + -- LATIN SMALL LETTER LONG S + '\x017f'# -> unI64 115 + -- LATIN SMALL LETTER J WITH CARON + '\x01f0'# -> unI64 1635778666 + -- COMBINING GREEK YPOGEGRAMMENI + '\x0345'# -> unI64 953 + -- GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS + '\x0390'# -> unI64 3382099394429881 + -- GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS + '\x03b0'# -> unI64 3382099394429893 + -- GREEK SMALL LETTER FINAL SIGMA + '\x03c2'# -> unI64 963 + -- GREEK BETA 
SYMBOL + '\x03d0'# -> unI64 946 + -- GREEK THETA SYMBOL + '\x03d1'# -> unI64 952 + -- GREEK PHI SYMBOL + '\x03d5'# -> unI64 966 + -- GREEK PI SYMBOL + '\x03d6'# -> unI64 960 + -- GREEK KAPPA SYMBOL + '\x03f0'# -> unI64 954 + -- GREEK RHO SYMBOL + '\x03f1'# -> unI64 961 + -- GREEK LUNATE EPSILON SYMBOL + '\x03f5'# -> unI64 949 + -- ARMENIAN SMALL LIGATURE ECH YIWN + '\x0587'# -> unI64 2956985701 + -- CHEROKEE SMALL LETTER YE + '\x13f8'# -> unI64 5104 + -- CHEROKEE SMALL LETTER YI + '\x13f9'# -> unI64 5105 + -- CHEROKEE SMALL LETTER YO + '\x13fa'# -> unI64 5106 + -- CHEROKEE SMALL LETTER YU + '\x13fb'# -> unI64 5107 + -- CHEROKEE SMALL LETTER YV + '\x13fc'# -> unI64 5108 + -- CHEROKEE SMALL LETTER MV + '\x13fd'# -> unI64 5109 + -- CYRILLIC SMALL LETTER ROUNDED VE + '\x1c80'# -> unI64 1074 + -- CYRILLIC SMALL LETTER LONG-LEGGED DE + '\x1c81'# -> unI64 1076 + -- CYRILLIC SMALL LETTER NARROW O + '\x1c82'# -> unI64 1086 + -- CYRILLIC SMALL LETTER WIDE ES + '\x1c83'# -> unI64 1089 + -- CYRILLIC SMALL LETTER TALL TE + '\x1c84'# -> unI64 1090 + -- CYRILLIC SMALL LETTER THREE-LEGGED TE + '\x1c85'# -> unI64 1090 + -- CYRILLIC SMALL LETTER TALL HARD SIGN + '\x1c86'# -> unI64 1098 + -- CYRILLIC SMALL LETTER TALL YAT + '\x1c87'# -> unI64 1123 + -- CYRILLIC SMALL LETTER UNBLENDED UK + '\x1c88'# -> unI64 42571 + -- LATIN SMALL LETTER H WITH LINE BELOW + '\x1e96'# -> unI64 1713373288 + -- LATIN SMALL LETTER T WITH DIAERESIS + '\x1e97'# -> unI64 1627390068 + -- LATIN SMALL LETTER W WITH RING ABOVE + '\x1e98'# -> unI64 1631584375 + -- LATIN SMALL LETTER Y WITH RING ABOVE + '\x1e99'# -> unI64 1631584377 + -- LATIN SMALL LETTER A WITH RIGHT HALF RING + '\x1e9a'# -> unI64 1472200801 + -- LATIN SMALL LETTER LONG S WITH DOT ABOVE + '\x1e9b'# -> unI64 7777 + -- LATIN CAPITAL LETTER SHARP S + '\x1e9e'# -> unI64 241172595 + -- GREEK SMALL LETTER UPSILON WITH PSILI + '\x1f50'# -> unI64 1650459589 + -- GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA + '\x1f52'# -> unI64 3377701370987461 + -- GREEK SMALL LETTER UPSILON WITH PSILI AND OXIA + '\x1f54'# -> unI64 3382099417498565 + -- GREEK SMALL LETTER UPSILON WITH PSILI AND PERISPOMENI + '\x1f56'# -> unI64 3667972440720325 + -- GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI + '\x1f80'# -> unI64 1998593792 + -- GREEK SMALL LETTER ALPHA WITH DASIA AND YPOGEGRAMMENI + '\x1f81'# -> unI64 1998593793 + -- GREEK SMALL LETTER ALPHA WITH PSILI AND VARIA AND YPOGEGRAMMENI + '\x1f82'# -> unI64 1998593794 + -- GREEK SMALL LETTER ALPHA WITH DASIA AND VARIA AND YPOGEGRAMMENI + '\x1f83'# -> unI64 1998593795 + -- GREEK SMALL LETTER ALPHA WITH PSILI AND OXIA AND YPOGEGRAMMENI + '\x1f84'# -> unI64 1998593796 + -- GREEK SMALL LETTER ALPHA WITH DASIA AND OXIA AND YPOGEGRAMMENI + '\x1f85'# -> unI64 1998593797 + -- GREEK SMALL LETTER ALPHA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI + '\x1f86'# -> unI64 1998593798 + -- GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI + '\x1f87'# -> unI64 1998593799 + -- GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI + '\x1f88'# -> unI64 1998593792 + -- GREEK CAPITAL LETTER ALPHA WITH DASIA AND PROSGEGRAMMENI + '\x1f89'# -> unI64 1998593793 + -- GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA AND PROSGEGRAMMENI + '\x1f8a'# -> unI64 1998593794 + -- GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA AND PROSGEGRAMMENI + '\x1f8b'# -> unI64 1998593795 + -- GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA AND PROSGEGRAMMENI + '\x1f8c'# -> unI64 1998593796 + -- GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA AND PROSGEGRAMMENI + 
'\x1f8d'# -> unI64 1998593797 + -- GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI + '\x1f8e'# -> unI64 1998593798 + -- GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI + '\x1f8f'# -> unI64 1998593799 + -- GREEK SMALL LETTER ETA WITH PSILI AND YPOGEGRAMMENI + '\x1f90'# -> unI64 1998593824 + -- GREEK SMALL LETTER ETA WITH DASIA AND YPOGEGRAMMENI + '\x1f91'# -> unI64 1998593825 + -- GREEK SMALL LETTER ETA WITH PSILI AND VARIA AND YPOGEGRAMMENI + '\x1f92'# -> unI64 1998593826 + -- GREEK SMALL LETTER ETA WITH DASIA AND VARIA AND YPOGEGRAMMENI + '\x1f93'# -> unI64 1998593827 + -- GREEK SMALL LETTER ETA WITH PSILI AND OXIA AND YPOGEGRAMMENI + '\x1f94'# -> unI64 1998593828 + -- GREEK SMALL LETTER ETA WITH DASIA AND OXIA AND YPOGEGRAMMENI + '\x1f95'# -> unI64 1998593829 + -- GREEK SMALL LETTER ETA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI + '\x1f96'# -> unI64 1998593830 + -- GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI + '\x1f97'# -> unI64 1998593831 + -- GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI + '\x1f98'# -> unI64 1998593824 + -- GREEK CAPITAL LETTER ETA WITH DASIA AND PROSGEGRAMMENI + '\x1f99'# -> unI64 1998593825 + -- GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA AND PROSGEGRAMMENI + '\x1f9a'# -> unI64 1998593826 + -- GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA AND PROSGEGRAMMENI + '\x1f9b'# -> unI64 1998593827 + -- GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA AND PROSGEGRAMMENI + '\x1f9c'# -> unI64 1998593828 + -- GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI + '\x1f9d'# -> unI64 1998593829 + -- GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI + '\x1f9e'# -> unI64 1998593830 + -- GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI + '\x1f9f'# -> unI64 1998593831 + -- GREEK SMALL LETTER OMEGA WITH PSILI AND YPOGEGRAMMENI + '\x1fa0'# -> unI64 1998593888 + -- GREEK SMALL LETTER OMEGA WITH DASIA AND YPOGEGRAMMENI + '\x1fa1'# -> unI64 1998593889 + -- GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA AND YPOGEGRAMMENI + '\x1fa2'# -> unI64 1998593890 + -- GREEK SMALL LETTER OMEGA WITH DASIA AND VARIA AND YPOGEGRAMMENI + '\x1fa3'# -> unI64 1998593891 + -- GREEK SMALL LETTER OMEGA WITH PSILI AND OXIA AND YPOGEGRAMMENI + '\x1fa4'# -> unI64 1998593892 + -- GREEK SMALL LETTER OMEGA WITH DASIA AND OXIA AND YPOGEGRAMMENI + '\x1fa5'# -> unI64 1998593893 + -- GREEK SMALL LETTER OMEGA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI + '\x1fa6'# -> unI64 1998593894 + -- GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI + '\x1fa7'# -> unI64 1998593895 + -- GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI + '\x1fa8'# -> unI64 1998593888 + -- GREEK CAPITAL LETTER OMEGA WITH DASIA AND PROSGEGRAMMENI + '\x1fa9'# -> unI64 1998593889 + -- GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA AND PROSGEGRAMMENI + '\x1faa'# -> unI64 1998593890 + -- GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA AND PROSGEGRAMMENI + '\x1fab'# -> unI64 1998593891 + -- GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA AND PROSGEGRAMMENI + '\x1fac'# -> unI64 1998593892 + -- GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA AND PROSGEGRAMMENI + '\x1fad'# -> unI64 1998593893 + -- GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI + '\x1fae'# -> unI64 1998593894 + -- GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI + '\x1faf'# -> unI64 1998593895 + -- GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI + '\x1fb2'# -> unI64 1998593904 + -- 
GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI + '\x1fb3'# -> unI64 1998586801 + -- GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI + '\x1fb4'# -> unI64 1998586796 + -- GREEK SMALL LETTER ALPHA WITH PERISPOMENI + '\x1fb6'# -> unI64 1749025713 + -- GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI + '\x1fb7'# -> unI64 4191340074107825 + -- GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI + '\x1fbc'# -> unI64 1998586801 + -- GREEK PROSGEGRAMMENI + '\x1fbe'# -> unI64 953 + -- GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI + '\x1fc2'# -> unI64 1998593908 + -- GREEK SMALL LETTER ETA WITH YPOGEGRAMMENI + '\x1fc3'# -> unI64 1998586807 + -- GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI + '\x1fc4'# -> unI64 1998586798 + -- GREEK SMALL LETTER ETA WITH PERISPOMENI + '\x1fc6'# -> unI64 1749025719 + -- GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI + '\x1fc7'# -> unI64 4191340074107831 + -- GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI + '\x1fcc'# -> unI64 1998586807 + -- GREEK SMALL LETTER IOTA WITH DIALYTIKA AND VARIA + '\x1fd2'# -> unI64 3377701347918777 + -- GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA + '\x1fd3'# -> unI64 3382099394429881 + -- GREEK SMALL LETTER IOTA WITH PERISPOMENI + '\x1fd6'# -> unI64 1749025721 + -- GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI + '\x1fd7'# -> unI64 3667972417651641 + -- GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND VARIA + '\x1fe2'# -> unI64 3377701347918789 + -- GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA + '\x1fe3'# -> unI64 3382099394429893 + -- GREEK SMALL LETTER RHO WITH PSILI + '\x1fe4'# -> unI64 1650459585 + -- GREEK SMALL LETTER UPSILON WITH PERISPOMENI + '\x1fe6'# -> unI64 1749025733 + -- GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND PERISPOMENI + '\x1fe7'# -> unI64 3667972417651653 + -- GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI + '\x1ff2'# -> unI64 1998593916 + -- GREEK SMALL LETTER OMEGA WITH YPOGEGRAMMENI + '\x1ff3'# -> unI64 1998586825 + -- GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI + '\x1ff4'# -> unI64 1998586830 + -- GREEK SMALL LETTER OMEGA WITH PERISPOMENI + '\x1ff6'# -> unI64 1749025737 + -- GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI + '\x1ff7'# -> unI64 4191340074107849 + -- GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI + '\x1ffc'# -> unI64 1998586825 + -- LATIN CAPITAL LETTER D WITH SHORT STROKE OVERLAY + '\xa7c7'# -> unI64 42952 + -- LATIN CAPITAL LETTER S WITH SHORT STROKE OVERLAY + '\xa7c9'# -> unI64 42954 + -- LATIN CAPITAL LETTER REVERSED HALF H + '\xa7f5'# -> unI64 42998 + -- CHEROKEE SMALL LETTER A + '\xab70'# -> unI64 5024 + -- CHEROKEE SMALL LETTER E + '\xab71'# -> unI64 5025 + -- CHEROKEE SMALL LETTER I + '\xab72'# -> unI64 5026 + -- CHEROKEE SMALL LETTER O + '\xab73'# -> unI64 5027 + -- CHEROKEE SMALL LETTER U + '\xab74'# -> unI64 5028 + -- CHEROKEE SMALL LETTER V + '\xab75'# -> unI64 5029 + -- CHEROKEE SMALL LETTER GA + '\xab76'# -> unI64 5030 + -- CHEROKEE SMALL LETTER KA + '\xab77'# -> unI64 5031 + -- CHEROKEE SMALL LETTER GE + '\xab78'# -> unI64 5032 + -- CHEROKEE SMALL LETTER GI + '\xab79'# -> unI64 5033 + -- CHEROKEE SMALL LETTER GO + '\xab7a'# -> unI64 5034 + -- CHEROKEE SMALL LETTER GU + '\xab7b'# -> unI64 5035 + -- CHEROKEE SMALL LETTER GV + '\xab7c'# -> unI64 5036 + -- CHEROKEE SMALL LETTER HA + '\xab7d'# -> unI64 5037 + -- CHEROKEE SMALL LETTER HE + '\xab7e'# -> unI64 5038 + -- CHEROKEE SMALL LETTER HI + '\xab7f'# -> unI64 5039 + -- CHEROKEE SMALL LETTER HO + '\xab80'# -> unI64 5040 + -- CHEROKEE SMALL LETTER HU + '\xab81'# -> 
unI64 5041 + -- CHEROKEE SMALL LETTER HV + '\xab82'# -> unI64 5042 + -- CHEROKEE SMALL LETTER LA + '\xab83'# -> unI64 5043 + -- CHEROKEE SMALL LETTER LE + '\xab84'# -> unI64 5044 + -- CHEROKEE SMALL LETTER LI + '\xab85'# -> unI64 5045 + -- CHEROKEE SMALL LETTER LO + '\xab86'# -> unI64 5046 + -- CHEROKEE SMALL LETTER LU + '\xab87'# -> unI64 5047 + -- CHEROKEE SMALL LETTER LV + '\xab88'# -> unI64 5048 + -- CHEROKEE SMALL LETTER MA + '\xab89'# -> unI64 5049 + -- CHEROKEE SMALL LETTER ME + '\xab8a'# -> unI64 5050 + -- CHEROKEE SMALL LETTER MI + '\xab8b'# -> unI64 5051 + -- CHEROKEE SMALL LETTER MO + '\xab8c'# -> unI64 5052 + -- CHEROKEE SMALL LETTER MU + '\xab8d'# -> unI64 5053 + -- CHEROKEE SMALL LETTER NA + '\xab8e'# -> unI64 5054 + -- CHEROKEE SMALL LETTER HNA + '\xab8f'# -> unI64 5055 + -- CHEROKEE SMALL LETTER NAH + '\xab90'# -> unI64 5056 + -- CHEROKEE SMALL LETTER NE + '\xab91'# -> unI64 5057 + -- CHEROKEE SMALL LETTER NI + '\xab92'# -> unI64 5058 + -- CHEROKEE SMALL LETTER NO + '\xab93'# -> unI64 5059 + -- CHEROKEE SMALL LETTER NU + '\xab94'# -> unI64 5060 + -- CHEROKEE SMALL LETTER NV + '\xab95'# -> unI64 5061 + -- CHEROKEE SMALL LETTER QUA + '\xab96'# -> unI64 5062 + -- CHEROKEE SMALL LETTER QUE + '\xab97'# -> unI64 5063 + -- CHEROKEE SMALL LETTER QUI + '\xab98'# -> unI64 5064 + -- CHEROKEE SMALL LETTER QUO + '\xab99'# -> unI64 5065 + -- CHEROKEE SMALL LETTER QUU + '\xab9a'# -> unI64 5066 + -- CHEROKEE SMALL LETTER QUV + '\xab9b'# -> unI64 5067 + -- CHEROKEE SMALL LETTER SA + '\xab9c'# -> unI64 5068 + -- CHEROKEE SMALL LETTER S + '\xab9d'# -> unI64 5069 + -- CHEROKEE SMALL LETTER SE + '\xab9e'# -> unI64 5070 + -- CHEROKEE SMALL LETTER SI + '\xab9f'# -> unI64 5071 + -- CHEROKEE SMALL LETTER SO + '\xaba0'# -> unI64 5072 + -- CHEROKEE SMALL LETTER SU + '\xaba1'# -> unI64 5073 + -- CHEROKEE SMALL LETTER SV + '\xaba2'# -> unI64 5074 + -- CHEROKEE SMALL LETTER DA + '\xaba3'# -> unI64 5075 + -- CHEROKEE SMALL LETTER TA + '\xaba4'# -> unI64 5076 + -- CHEROKEE SMALL LETTER DE + '\xaba5'# -> unI64 5077 + -- CHEROKEE SMALL LETTER TE + '\xaba6'# -> unI64 5078 + -- CHEROKEE SMALL LETTER DI + '\xaba7'# -> unI64 5079 + -- CHEROKEE SMALL LETTER TI + '\xaba8'# -> unI64 5080 + -- CHEROKEE SMALL LETTER DO + '\xaba9'# -> unI64 5081 + -- CHEROKEE SMALL LETTER DU + '\xabaa'# -> unI64 5082 + -- CHEROKEE SMALL LETTER DV + '\xabab'# -> unI64 5083 + -- CHEROKEE SMALL LETTER DLA + '\xabac'# -> unI64 5084 + -- CHEROKEE SMALL LETTER TLA + '\xabad'# -> unI64 5085 + -- CHEROKEE SMALL LETTER TLE + '\xabae'# -> unI64 5086 + -- CHEROKEE SMALL LETTER TLI + '\xabaf'# -> unI64 5087 + -- CHEROKEE SMALL LETTER TLO + '\xabb0'# -> unI64 5088 + -- CHEROKEE SMALL LETTER TLU + '\xabb1'# -> unI64 5089 + -- CHEROKEE SMALL LETTER TLV + '\xabb2'# -> unI64 5090 + -- CHEROKEE SMALL LETTER TSA + '\xabb3'# -> unI64 5091 + -- CHEROKEE SMALL LETTER TSE + '\xabb4'# -> unI64 5092 + -- CHEROKEE SMALL LETTER TSI + '\xabb5'# -> unI64 5093 + -- CHEROKEE SMALL LETTER TSO + '\xabb6'# -> unI64 5094 + -- CHEROKEE SMALL LETTER TSU + '\xabb7'# -> unI64 5095 + -- CHEROKEE SMALL LETTER TSV + '\xabb8'# -> unI64 5096 + -- CHEROKEE SMALL LETTER WA + '\xabb9'# -> unI64 5097 + -- CHEROKEE SMALL LETTER WE + '\xabba'# -> unI64 5098 + -- CHEROKEE SMALL LETTER WI + '\xabbb'# -> unI64 5099 + -- CHEROKEE SMALL LETTER WO + '\xabbc'# -> unI64 5100 + -- CHEROKEE SMALL LETTER WU + '\xabbd'# -> unI64 5101 + -- CHEROKEE SMALL LETTER WV + '\xabbe'# -> unI64 5102 + -- CHEROKEE SMALL LETTER YA + '\xabbf'# -> unI64 5103 + -- LATIN SMALL LIGATURE FF + '\xfb00'# -> unI64 
213909606 + -- LATIN SMALL LIGATURE FI + '\xfb01'# -> unI64 220201062 + -- LATIN SMALL LIGATURE FL + '\xfb02'# -> unI64 226492518 + -- LATIN SMALL LIGATURE FFI + '\xfb03'# -> unI64 461795097575526 + -- LATIN SMALL LIGATURE FFL + '\xfb04'# -> unI64 474989237108838 + -- LATIN SMALL LIGATURE LONG S T + '\xfb05'# -> unI64 243269747 + -- LATIN SMALL LIGATURE ST + '\xfb06'# -> unI64 243269747 + -- ARMENIAN SMALL LIGATURE MEN NOW + '\xfb13'# -> unI64 2931819892 + -- ARMENIAN SMALL LIGATURE MEN ECH + '\xfb14'# -> unI64 2896168308 + -- ARMENIAN SMALL LIGATURE MEN INI + '\xfb15'# -> unI64 2908751220 + -- ARMENIAN SMALL LIGATURE VEW NOW + '\xfb16'# -> unI64 2931819902 + -- ARMENIAN SMALL LIGATURE MEN XEH + '\xfb17'# -> unI64 2912945524 + '\x0041'# -> unI64 97 + '\x0042'# -> unI64 98 + '\x0043'# -> unI64 99 + '\x0044'# -> unI64 100 + '\x0045'# -> unI64 101 + '\x0046'# -> unI64 102 + '\x0047'# -> unI64 103 + '\x0048'# -> unI64 104 + '\x0049'# -> unI64 105 + '\x004a'# -> unI64 106 + '\x004b'# -> unI64 107 + '\x004c'# -> unI64 108 + '\x004d'# -> unI64 109 + '\x004e'# -> unI64 110 + '\x004f'# -> unI64 111 + '\x0050'# -> unI64 112 + '\x0051'# -> unI64 113 + '\x0052'# -> unI64 114 + '\x0053'# -> unI64 115 + '\x0054'# -> unI64 116 + '\x0055'# -> unI64 117 + '\x0056'# -> unI64 118 + '\x0057'# -> unI64 119 + '\x0058'# -> unI64 120 + '\x0059'# -> unI64 121 + '\x005a'# -> unI64 122 + '\x00c0'# -> unI64 224 + '\x00c1'# -> unI64 225 + '\x00c2'# -> unI64 226 + '\x00c3'# -> unI64 227 + '\x00c4'# -> unI64 228 + '\x00c5'# -> unI64 229 + '\x00c6'# -> unI64 230 + '\x00c7'# -> unI64 231 + '\x00c8'# -> unI64 232 + '\x00c9'# -> unI64 233 + '\x00ca'# -> unI64 234 + '\x00cb'# -> unI64 235 + '\x00cc'# -> unI64 236 + '\x00cd'# -> unI64 237 + '\x00ce'# -> unI64 238 + '\x00cf'# -> unI64 239 + '\x00d0'# -> unI64 240 + '\x00d1'# -> unI64 241 + '\x00d2'# -> unI64 242 + '\x00d3'# -> unI64 243 + '\x00d4'# -> unI64 244 + '\x00d5'# -> unI64 245 + '\x00d6'# -> unI64 246 + '\x00d8'# -> unI64 248 + '\x00d9'# -> unI64 249 + '\x00da'# -> unI64 250 + '\x00db'# -> unI64 251 + '\x00dc'# -> unI64 252 + '\x00dd'# -> unI64 253 + '\x00de'# -> unI64 254 + '\x0100'# -> unI64 257 + '\x0102'# -> unI64 259 + '\x0104'# -> unI64 261 + '\x0106'# -> unI64 263 + '\x0108'# -> unI64 265 + '\x010a'# -> unI64 267 + '\x010c'# -> unI64 269 + '\x010e'# -> unI64 271 + '\x0110'# -> unI64 273 + '\x0112'# -> unI64 275 + '\x0114'# -> unI64 277 + '\x0116'# -> unI64 279 + '\x0118'# -> unI64 281 + '\x011a'# -> unI64 283 + '\x011c'# -> unI64 285 + '\x011e'# -> unI64 287 + '\x0120'# -> unI64 289 + '\x0122'# -> unI64 291 + '\x0124'# -> unI64 293 + '\x0126'# -> unI64 295 + '\x0128'# -> unI64 297 + '\x012a'# -> unI64 299 + '\x012c'# -> unI64 301 + '\x012e'# -> unI64 303 + '\x0132'# -> unI64 307 + '\x0134'# -> unI64 309 + '\x0136'# -> unI64 311 + '\x0139'# -> unI64 314 + '\x013b'# -> unI64 316 + '\x013d'# -> unI64 318 + '\x013f'# -> unI64 320 + '\x0141'# -> unI64 322 + '\x0143'# -> unI64 324 + '\x0145'# -> unI64 326 + '\x0147'# -> unI64 328 + '\x014a'# -> unI64 331 + '\x014c'# -> unI64 333 + '\x014e'# -> unI64 335 + '\x0150'# -> unI64 337 + '\x0152'# -> unI64 339 + '\x0154'# -> unI64 341 + '\x0156'# -> unI64 343 + '\x0158'# -> unI64 345 + '\x015a'# -> unI64 347 + '\x015c'# -> unI64 349 + '\x015e'# -> unI64 351 + '\x0160'# -> unI64 353 + '\x0162'# -> unI64 355 + '\x0164'# -> unI64 357 + '\x0166'# -> unI64 359 + '\x0168'# -> unI64 361 + '\x016a'# -> unI64 363 + '\x016c'# -> unI64 365 + '\x016e'# -> unI64 367 + '\x0170'# -> unI64 369 + '\x0172'# -> unI64 371 + '\x0174'# -> unI64 
373 + '\x0176'# -> unI64 375 + '\x0178'# -> unI64 255 + '\x0179'# -> unI64 378 + '\x017b'# -> unI64 380 + '\x017d'# -> unI64 382 + '\x0181'# -> unI64 595 + '\x0182'# -> unI64 387 + '\x0184'# -> unI64 389 + '\x0186'# -> unI64 596 + '\x0187'# -> unI64 392 + '\x0189'# -> unI64 598 + '\x018a'# -> unI64 599 + '\x018b'# -> unI64 396 + '\x018e'# -> unI64 477 + '\x018f'# -> unI64 601 + '\x0190'# -> unI64 603 + '\x0191'# -> unI64 402 + '\x0193'# -> unI64 608 + '\x0194'# -> unI64 611 + '\x0196'# -> unI64 617 + '\x0197'# -> unI64 616 + '\x0198'# -> unI64 409 + '\x019c'# -> unI64 623 + '\x019d'# -> unI64 626 + '\x019f'# -> unI64 629 + '\x01a0'# -> unI64 417 + '\x01a2'# -> unI64 419 + '\x01a4'# -> unI64 421 + '\x01a6'# -> unI64 640 + '\x01a7'# -> unI64 424 + '\x01a9'# -> unI64 643 + '\x01ac'# -> unI64 429 + '\x01ae'# -> unI64 648 + '\x01af'# -> unI64 432 + '\x01b1'# -> unI64 650 + '\x01b2'# -> unI64 651 + '\x01b3'# -> unI64 436 + '\x01b5'# -> unI64 438 + '\x01b7'# -> unI64 658 + '\x01b8'# -> unI64 441 + '\x01bc'# -> unI64 445 + '\x01c4'# -> unI64 454 + '\x01c5'# -> unI64 454 + '\x01c7'# -> unI64 457 + '\x01c8'# -> unI64 457 + '\x01ca'# -> unI64 460 + '\x01cb'# -> unI64 460 + '\x01cd'# -> unI64 462 + '\x01cf'# -> unI64 464 + '\x01d1'# -> unI64 466 + '\x01d3'# -> unI64 468 + '\x01d5'# -> unI64 470 + '\x01d7'# -> unI64 472 + '\x01d9'# -> unI64 474 + '\x01db'# -> unI64 476 + '\x01de'# -> unI64 479 + '\x01e0'# -> unI64 481 + '\x01e2'# -> unI64 483 + '\x01e4'# -> unI64 485 + '\x01e6'# -> unI64 487 + '\x01e8'# -> unI64 489 + '\x01ea'# -> unI64 491 + '\x01ec'# -> unI64 493 + '\x01ee'# -> unI64 495 + '\x01f1'# -> unI64 499 + '\x01f2'# -> unI64 499 + '\x01f4'# -> unI64 501 + '\x01f6'# -> unI64 405 + '\x01f7'# -> unI64 447 + '\x01f8'# -> unI64 505 + '\x01fa'# -> unI64 507 + '\x01fc'# -> unI64 509 + '\x01fe'# -> unI64 511 + '\x0200'# -> unI64 513 + '\x0202'# -> unI64 515 + '\x0204'# -> unI64 517 + '\x0206'# -> unI64 519 + '\x0208'# -> unI64 521 + '\x020a'# -> unI64 523 + '\x020c'# -> unI64 525 + '\x020e'# -> unI64 527 + '\x0210'# -> unI64 529 + '\x0212'# -> unI64 531 + '\x0214'# -> unI64 533 + '\x0216'# -> unI64 535 + '\x0218'# -> unI64 537 + '\x021a'# -> unI64 539 + '\x021c'# -> unI64 541 + '\x021e'# -> unI64 543 + '\x0220'# -> unI64 414 + '\x0222'# -> unI64 547 + '\x0224'# -> unI64 549 + '\x0226'# -> unI64 551 + '\x0228'# -> unI64 553 + '\x022a'# -> unI64 555 + '\x022c'# -> unI64 557 + '\x022e'# -> unI64 559 + '\x0230'# -> unI64 561 + '\x0232'# -> unI64 563 + '\x023a'# -> unI64 11365 + '\x023b'# -> unI64 572 + '\x023d'# -> unI64 410 + '\x023e'# -> unI64 11366 + '\x0241'# -> unI64 578 + '\x0243'# -> unI64 384 + '\x0244'# -> unI64 649 + '\x0245'# -> unI64 652 + '\x0246'# -> unI64 583 + '\x0248'# -> unI64 585 + '\x024a'# -> unI64 587 + '\x024c'# -> unI64 589 + '\x024e'# -> unI64 591 + '\x0370'# -> unI64 881 + '\x0372'# -> unI64 883 + '\x0376'# -> unI64 887 + '\x037f'# -> unI64 1011 + '\x0386'# -> unI64 940 + '\x0388'# -> unI64 941 + '\x0389'# -> unI64 942 + '\x038a'# -> unI64 943 + '\x038c'# -> unI64 972 + '\x038e'# -> unI64 973 + '\x038f'# -> unI64 974 + '\x0391'# -> unI64 945 + '\x0392'# -> unI64 946 + '\x0393'# -> unI64 947 + '\x0394'# -> unI64 948 + '\x0395'# -> unI64 949 + '\x0396'# -> unI64 950 + '\x0397'# -> unI64 951 + '\x0398'# -> unI64 952 + '\x0399'# -> unI64 953 + '\x039a'# -> unI64 954 + '\x039b'# -> unI64 955 + '\x039c'# -> unI64 956 + '\x039d'# -> unI64 957 + '\x039e'# -> unI64 958 + '\x039f'# -> unI64 959 + '\x03a0'# -> unI64 960 + '\x03a1'# -> unI64 961 + '\x03a3'# -> unI64 963 + '\x03a4'# -> unI64 
964 + '\x03a5'# -> unI64 965 + '\x03a6'# -> unI64 966 + '\x03a7'# -> unI64 967 + '\x03a8'# -> unI64 968 + '\x03a9'# -> unI64 969 + '\x03aa'# -> unI64 970 + '\x03ab'# -> unI64 971 + '\x03cf'# -> unI64 983 + '\x03d8'# -> unI64 985 + '\x03da'# -> unI64 987 + '\x03dc'# -> unI64 989 + '\x03de'# -> unI64 991 + '\x03e0'# -> unI64 993 + '\x03e2'# -> unI64 995 + '\x03e4'# -> unI64 997 + '\x03e6'# -> unI64 999 + '\x03e8'# -> unI64 1001 + '\x03ea'# -> unI64 1003 + '\x03ec'# -> unI64 1005 + '\x03ee'# -> unI64 1007 + '\x03f4'# -> unI64 952 + '\x03f7'# -> unI64 1016 + '\x03f9'# -> unI64 1010 + '\x03fa'# -> unI64 1019 + '\x03fd'# -> unI64 891 + '\x03fe'# -> unI64 892 + '\x03ff'# -> unI64 893 + '\x0400'# -> unI64 1104 + '\x0401'# -> unI64 1105 + '\x0402'# -> unI64 1106 + '\x0403'# -> unI64 1107 + '\x0404'# -> unI64 1108 + '\x0405'# -> unI64 1109 + '\x0406'# -> unI64 1110 + '\x0407'# -> unI64 1111 + '\x0408'# -> unI64 1112 + '\x0409'# -> unI64 1113 + '\x040a'# -> unI64 1114 + '\x040b'# -> unI64 1115 + '\x040c'# -> unI64 1116 + '\x040d'# -> unI64 1117 + '\x040e'# -> unI64 1118 + '\x040f'# -> unI64 1119 + '\x0410'# -> unI64 1072 + '\x0411'# -> unI64 1073 + '\x0412'# -> unI64 1074 + '\x0413'# -> unI64 1075 + '\x0414'# -> unI64 1076 + '\x0415'# -> unI64 1077 + '\x0416'# -> unI64 1078 + '\x0417'# -> unI64 1079 + '\x0418'# -> unI64 1080 + '\x0419'# -> unI64 1081 + '\x041a'# -> unI64 1082 + '\x041b'# -> unI64 1083 + '\x041c'# -> unI64 1084 + '\x041d'# -> unI64 1085 + '\x041e'# -> unI64 1086 + '\x041f'# -> unI64 1087 + '\x0420'# -> unI64 1088 + '\x0421'# -> unI64 1089 + '\x0422'# -> unI64 1090 + '\x0423'# -> unI64 1091 + '\x0424'# -> unI64 1092 + '\x0425'# -> unI64 1093 + '\x0426'# -> unI64 1094 + '\x0427'# -> unI64 1095 + '\x0428'# -> unI64 1096 + '\x0429'# -> unI64 1097 + '\x042a'# -> unI64 1098 + '\x042b'# -> unI64 1099 + '\x042c'# -> unI64 1100 + '\x042d'# -> unI64 1101 + '\x042e'# -> unI64 1102 + '\x042f'# -> unI64 1103 + '\x0460'# -> unI64 1121 + '\x0462'# -> unI64 1123 + '\x0464'# -> unI64 1125 + '\x0466'# -> unI64 1127 + '\x0468'# -> unI64 1129 + '\x046a'# -> unI64 1131 + '\x046c'# -> unI64 1133 + '\x046e'# -> unI64 1135 + '\x0470'# -> unI64 1137 + '\x0472'# -> unI64 1139 + '\x0474'# -> unI64 1141 + '\x0476'# -> unI64 1143 + '\x0478'# -> unI64 1145 + '\x047a'# -> unI64 1147 + '\x047c'# -> unI64 1149 + '\x047e'# -> unI64 1151 + '\x0480'# -> unI64 1153 + '\x048a'# -> unI64 1163 + '\x048c'# -> unI64 1165 + '\x048e'# -> unI64 1167 + '\x0490'# -> unI64 1169 + '\x0492'# -> unI64 1171 + '\x0494'# -> unI64 1173 + '\x0496'# -> unI64 1175 + '\x0498'# -> unI64 1177 + '\x049a'# -> unI64 1179 + '\x049c'# -> unI64 1181 + '\x049e'# -> unI64 1183 + '\x04a0'# -> unI64 1185 + '\x04a2'# -> unI64 1187 + '\x04a4'# -> unI64 1189 + '\x04a6'# -> unI64 1191 + '\x04a8'# -> unI64 1193 + '\x04aa'# -> unI64 1195 + '\x04ac'# -> unI64 1197 + '\x04ae'# -> unI64 1199 + '\x04b0'# -> unI64 1201 + '\x04b2'# -> unI64 1203 + '\x04b4'# -> unI64 1205 + '\x04b6'# -> unI64 1207 + '\x04b8'# -> unI64 1209 + '\x04ba'# -> unI64 1211 + '\x04bc'# -> unI64 1213 + '\x04be'# -> unI64 1215 + '\x04c0'# -> unI64 1231 + '\x04c1'# -> unI64 1218 + '\x04c3'# -> unI64 1220 + '\x04c5'# -> unI64 1222 + '\x04c7'# -> unI64 1224 + '\x04c9'# -> unI64 1226 + '\x04cb'# -> unI64 1228 + '\x04cd'# -> unI64 1230 + '\x04d0'# -> unI64 1233 + '\x04d2'# -> unI64 1235 + '\x04d4'# -> unI64 1237 + '\x04d6'# -> unI64 1239 + '\x04d8'# -> unI64 1241 + '\x04da'# -> unI64 1243 + '\x04dc'# -> unI64 1245 + '\x04de'# -> unI64 1247 + '\x04e0'# -> unI64 1249 + '\x04e2'# -> unI64 1251 + 
'\x04e4'# -> unI64 1253 + '\x04e6'# -> unI64 1255 + '\x04e8'# -> unI64 1257 + '\x04ea'# -> unI64 1259 + '\x04ec'# -> unI64 1261 + '\x04ee'# -> unI64 1263 + '\x04f0'# -> unI64 1265 + '\x04f2'# -> unI64 1267 + '\x04f4'# -> unI64 1269 + '\x04f6'# -> unI64 1271 + '\x04f8'# -> unI64 1273 + '\x04fa'# -> unI64 1275 + '\x04fc'# -> unI64 1277 + '\x04fe'# -> unI64 1279 + '\x0500'# -> unI64 1281 + '\x0502'# -> unI64 1283 + '\x0504'# -> unI64 1285 + '\x0506'# -> unI64 1287 + '\x0508'# -> unI64 1289 + '\x050a'# -> unI64 1291 + '\x050c'# -> unI64 1293 + '\x050e'# -> unI64 1295 + '\x0510'# -> unI64 1297 + '\x0512'# -> unI64 1299 + '\x0514'# -> unI64 1301 + '\x0516'# -> unI64 1303 + '\x0518'# -> unI64 1305 + '\x051a'# -> unI64 1307 + '\x051c'# -> unI64 1309 + '\x051e'# -> unI64 1311 + '\x0520'# -> unI64 1313 + '\x0522'# -> unI64 1315 + '\x0524'# -> unI64 1317 + '\x0526'# -> unI64 1319 + '\x0528'# -> unI64 1321 + '\x052a'# -> unI64 1323 + '\x052c'# -> unI64 1325 + '\x052e'# -> unI64 1327 + '\x0531'# -> unI64 1377 + '\x0532'# -> unI64 1378 + '\x0533'# -> unI64 1379 + '\x0534'# -> unI64 1380 + '\x0535'# -> unI64 1381 + '\x0536'# -> unI64 1382 + '\x0537'# -> unI64 1383 + '\x0538'# -> unI64 1384 + '\x0539'# -> unI64 1385 + '\x053a'# -> unI64 1386 + '\x053b'# -> unI64 1387 + '\x053c'# -> unI64 1388 + '\x053d'# -> unI64 1389 + '\x053e'# -> unI64 1390 + '\x053f'# -> unI64 1391 + '\x0540'# -> unI64 1392 + '\x0541'# -> unI64 1393 + '\x0542'# -> unI64 1394 + '\x0543'# -> unI64 1395 + '\x0544'# -> unI64 1396 + '\x0545'# -> unI64 1397 + '\x0546'# -> unI64 1398 + '\x0547'# -> unI64 1399 + '\x0548'# -> unI64 1400 + '\x0549'# -> unI64 1401 + '\x054a'# -> unI64 1402 + '\x054b'# -> unI64 1403 + '\x054c'# -> unI64 1404 + '\x054d'# -> unI64 1405 + '\x054e'# -> unI64 1406 + '\x054f'# -> unI64 1407 + '\x0550'# -> unI64 1408 + '\x0551'# -> unI64 1409 + '\x0552'# -> unI64 1410 + '\x0553'# -> unI64 1411 + '\x0554'# -> unI64 1412 + '\x0555'# -> unI64 1413 + '\x0556'# -> unI64 1414 + '\x10a0'# -> unI64 11520 + '\x10a1'# -> unI64 11521 + '\x10a2'# -> unI64 11522 + '\x10a3'# -> unI64 11523 + '\x10a4'# -> unI64 11524 + '\x10a5'# -> unI64 11525 + '\x10a6'# -> unI64 11526 + '\x10a7'# -> unI64 11527 + '\x10a8'# -> unI64 11528 + '\x10a9'# -> unI64 11529 + '\x10aa'# -> unI64 11530 + '\x10ab'# -> unI64 11531 + '\x10ac'# -> unI64 11532 + '\x10ad'# -> unI64 11533 + '\x10ae'# -> unI64 11534 + '\x10af'# -> unI64 11535 + '\x10b0'# -> unI64 11536 + '\x10b1'# -> unI64 11537 + '\x10b2'# -> unI64 11538 + '\x10b3'# -> unI64 11539 + '\x10b4'# -> unI64 11540 + '\x10b5'# -> unI64 11541 + '\x10b6'# -> unI64 11542 + '\x10b7'# -> unI64 11543 + '\x10b8'# -> unI64 11544 + '\x10b9'# -> unI64 11545 + '\x10ba'# -> unI64 11546 + '\x10bb'# -> unI64 11547 + '\x10bc'# -> unI64 11548 + '\x10bd'# -> unI64 11549 + '\x10be'# -> unI64 11550 + '\x10bf'# -> unI64 11551 + '\x10c0'# -> unI64 11552 + '\x10c1'# -> unI64 11553 + '\x10c2'# -> unI64 11554 + '\x10c3'# -> unI64 11555 + '\x10c4'# -> unI64 11556 + '\x10c5'# -> unI64 11557 + '\x10c7'# -> unI64 11559 + '\x10cd'# -> unI64 11565 + '\x13a0'# -> unI64 43888 + '\x13a1'# -> unI64 43889 + '\x13a2'# -> unI64 43890 + '\x13a3'# -> unI64 43891 + '\x13a4'# -> unI64 43892 + '\x13a5'# -> unI64 43893 + '\x13a6'# -> unI64 43894 + '\x13a7'# -> unI64 43895 + '\x13a8'# -> unI64 43896 + '\x13a9'# -> unI64 43897 + '\x13aa'# -> unI64 43898 + '\x13ab'# -> unI64 43899 + '\x13ac'# -> unI64 43900 + '\x13ad'# -> unI64 43901 + '\x13ae'# -> unI64 43902 + '\x13af'# -> unI64 43903 + '\x13b0'# -> unI64 43904 + '\x13b1'# -> unI64 43905 + '\x13b2'# -> 
unI64 43906 + '\x13b3'# -> unI64 43907 + '\x13b4'# -> unI64 43908 + '\x13b5'# -> unI64 43909 + '\x13b6'# -> unI64 43910 + '\x13b7'# -> unI64 43911 + '\x13b8'# -> unI64 43912 + '\x13b9'# -> unI64 43913 + '\x13ba'# -> unI64 43914 + '\x13bb'# -> unI64 43915 + '\x13bc'# -> unI64 43916 + '\x13bd'# -> unI64 43917 + '\x13be'# -> unI64 43918 + '\x13bf'# -> unI64 43919 + '\x13c0'# -> unI64 43920 + '\x13c1'# -> unI64 43921 + '\x13c2'# -> unI64 43922 + '\x13c3'# -> unI64 43923 + '\x13c4'# -> unI64 43924 + '\x13c5'# -> unI64 43925 + '\x13c6'# -> unI64 43926 + '\x13c7'# -> unI64 43927 + '\x13c8'# -> unI64 43928 + '\x13c9'# -> unI64 43929 + '\x13ca'# -> unI64 43930 + '\x13cb'# -> unI64 43931 + '\x13cc'# -> unI64 43932 + '\x13cd'# -> unI64 43933 + '\x13ce'# -> unI64 43934 + '\x13cf'# -> unI64 43935 + '\x13d0'# -> unI64 43936 + '\x13d1'# -> unI64 43937 + '\x13d2'# -> unI64 43938 + '\x13d3'# -> unI64 43939 + '\x13d4'# -> unI64 43940 + '\x13d5'# -> unI64 43941 + '\x13d6'# -> unI64 43942 + '\x13d7'# -> unI64 43943 + '\x13d8'# -> unI64 43944 + '\x13d9'# -> unI64 43945 + '\x13da'# -> unI64 43946 + '\x13db'# -> unI64 43947 + '\x13dc'# -> unI64 43948 + '\x13dd'# -> unI64 43949 + '\x13de'# -> unI64 43950 + '\x13df'# -> unI64 43951 + '\x13e0'# -> unI64 43952 + '\x13e1'# -> unI64 43953 + '\x13e2'# -> unI64 43954 + '\x13e3'# -> unI64 43955 + '\x13e4'# -> unI64 43956 + '\x13e5'# -> unI64 43957 + '\x13e6'# -> unI64 43958 + '\x13e7'# -> unI64 43959 + '\x13e8'# -> unI64 43960 + '\x13e9'# -> unI64 43961 + '\x13ea'# -> unI64 43962 + '\x13eb'# -> unI64 43963 + '\x13ec'# -> unI64 43964 + '\x13ed'# -> unI64 43965 + '\x13ee'# -> unI64 43966 + '\x13ef'# -> unI64 43967 + '\x13f0'# -> unI64 5112 + '\x13f1'# -> unI64 5113 + '\x13f2'# -> unI64 5114 + '\x13f3'# -> unI64 5115 + '\x13f4'# -> unI64 5116 + '\x13f5'# -> unI64 5117 + '\x1c90'# -> unI64 4304 + '\x1c91'# -> unI64 4305 + '\x1c92'# -> unI64 4306 + '\x1c93'# -> unI64 4307 + '\x1c94'# -> unI64 4308 + '\x1c95'# -> unI64 4309 + '\x1c96'# -> unI64 4310 + '\x1c97'# -> unI64 4311 + '\x1c98'# -> unI64 4312 + '\x1c99'# -> unI64 4313 + '\x1c9a'# -> unI64 4314 + '\x1c9b'# -> unI64 4315 + '\x1c9c'# -> unI64 4316 + '\x1c9d'# -> unI64 4317 + '\x1c9e'# -> unI64 4318 + '\x1c9f'# -> unI64 4319 + '\x1ca0'# -> unI64 4320 + '\x1ca1'# -> unI64 4321 + '\x1ca2'# -> unI64 4322 + '\x1ca3'# -> unI64 4323 + '\x1ca4'# -> unI64 4324 + '\x1ca5'# -> unI64 4325 + '\x1ca6'# -> unI64 4326 + '\x1ca7'# -> unI64 4327 + '\x1ca8'# -> unI64 4328 + '\x1ca9'# -> unI64 4329 + '\x1caa'# -> unI64 4330 + '\x1cab'# -> unI64 4331 + '\x1cac'# -> unI64 4332 + '\x1cad'# -> unI64 4333 + '\x1cae'# -> unI64 4334 + '\x1caf'# -> unI64 4335 + '\x1cb0'# -> unI64 4336 + '\x1cb1'# -> unI64 4337 + '\x1cb2'# -> unI64 4338 + '\x1cb3'# -> unI64 4339 + '\x1cb4'# -> unI64 4340 + '\x1cb5'# -> unI64 4341 + '\x1cb6'# -> unI64 4342 + '\x1cb7'# -> unI64 4343 + '\x1cb8'# -> unI64 4344 + '\x1cb9'# -> unI64 4345 + '\x1cba'# -> unI64 4346 + '\x1cbd'# -> unI64 4349 + '\x1cbe'# -> unI64 4350 + '\x1cbf'# -> unI64 4351 + '\x1e00'# -> unI64 7681 + '\x1e02'# -> unI64 7683 + '\x1e04'# -> unI64 7685 + '\x1e06'# -> unI64 7687 + '\x1e08'# -> unI64 7689 + '\x1e0a'# -> unI64 7691 + '\x1e0c'# -> unI64 7693 + '\x1e0e'# -> unI64 7695 + '\x1e10'# -> unI64 7697 + '\x1e12'# -> unI64 7699 + '\x1e14'# -> unI64 7701 + '\x1e16'# -> unI64 7703 + '\x1e18'# -> unI64 7705 + '\x1e1a'# -> unI64 7707 + '\x1e1c'# -> unI64 7709 + '\x1e1e'# -> unI64 7711 + '\x1e20'# -> unI64 7713 + '\x1e22'# -> unI64 7715 + '\x1e24'# -> unI64 7717 + '\x1e26'# -> unI64 7719 + '\x1e28'# -> unI64 
7721 + '\x1e2a'# -> unI64 7723 + '\x1e2c'# -> unI64 7725 + '\x1e2e'# -> unI64 7727 + '\x1e30'# -> unI64 7729 + '\x1e32'# -> unI64 7731 + '\x1e34'# -> unI64 7733 + '\x1e36'# -> unI64 7735 + '\x1e38'# -> unI64 7737 + '\x1e3a'# -> unI64 7739 + '\x1e3c'# -> unI64 7741 + '\x1e3e'# -> unI64 7743 + '\x1e40'# -> unI64 7745 + '\x1e42'# -> unI64 7747 + '\x1e44'# -> unI64 7749 + '\x1e46'# -> unI64 7751 + '\x1e48'# -> unI64 7753 + '\x1e4a'# -> unI64 7755 + '\x1e4c'# -> unI64 7757 + '\x1e4e'# -> unI64 7759 + '\x1e50'# -> unI64 7761 + '\x1e52'# -> unI64 7763 + '\x1e54'# -> unI64 7765 + '\x1e56'# -> unI64 7767 + '\x1e58'# -> unI64 7769 + '\x1e5a'# -> unI64 7771 + '\x1e5c'# -> unI64 7773 + '\x1e5e'# -> unI64 7775 + '\x1e60'# -> unI64 7777 + '\x1e62'# -> unI64 7779 + '\x1e64'# -> unI64 7781 + '\x1e66'# -> unI64 7783 + '\x1e68'# -> unI64 7785 + '\x1e6a'# -> unI64 7787 + '\x1e6c'# -> unI64 7789 + '\x1e6e'# -> unI64 7791 + '\x1e70'# -> unI64 7793 + '\x1e72'# -> unI64 7795 + '\x1e74'# -> unI64 7797 + '\x1e76'# -> unI64 7799 + '\x1e78'# -> unI64 7801 + '\x1e7a'# -> unI64 7803 + '\x1e7c'# -> unI64 7805 + '\x1e7e'# -> unI64 7807 + '\x1e80'# -> unI64 7809 + '\x1e82'# -> unI64 7811 + '\x1e84'# -> unI64 7813 + '\x1e86'# -> unI64 7815 + '\x1e88'# -> unI64 7817 + '\x1e8a'# -> unI64 7819 + '\x1e8c'# -> unI64 7821 + '\x1e8e'# -> unI64 7823 + '\x1e90'# -> unI64 7825 + '\x1e92'# -> unI64 7827 + '\x1e94'# -> unI64 7829 + '\x1ea0'# -> unI64 7841 + '\x1ea2'# -> unI64 7843 + '\x1ea4'# -> unI64 7845 + '\x1ea6'# -> unI64 7847 + '\x1ea8'# -> unI64 7849 + '\x1eaa'# -> unI64 7851 + '\x1eac'# -> unI64 7853 + '\x1eae'# -> unI64 7855 + '\x1eb0'# -> unI64 7857 + '\x1eb2'# -> unI64 7859 + '\x1eb4'# -> unI64 7861 + '\x1eb6'# -> unI64 7863 + '\x1eb8'# -> unI64 7865 + '\x1eba'# -> unI64 7867 + '\x1ebc'# -> unI64 7869 + '\x1ebe'# -> unI64 7871 + '\x1ec0'# -> unI64 7873 + '\x1ec2'# -> unI64 7875 + '\x1ec4'# -> unI64 7877 + '\x1ec6'# -> unI64 7879 + '\x1ec8'# -> unI64 7881 + '\x1eca'# -> unI64 7883 + '\x1ecc'# -> unI64 7885 + '\x1ece'# -> unI64 7887 + '\x1ed0'# -> unI64 7889 + '\x1ed2'# -> unI64 7891 + '\x1ed4'# -> unI64 7893 + '\x1ed6'# -> unI64 7895 + '\x1ed8'# -> unI64 7897 + '\x1eda'# -> unI64 7899 + '\x1edc'# -> unI64 7901 + '\x1ede'# -> unI64 7903 + '\x1ee0'# -> unI64 7905 + '\x1ee2'# -> unI64 7907 + '\x1ee4'# -> unI64 7909 + '\x1ee6'# -> unI64 7911 + '\x1ee8'# -> unI64 7913 + '\x1eea'# -> unI64 7915 + '\x1eec'# -> unI64 7917 + '\x1eee'# -> unI64 7919 + '\x1ef0'# -> unI64 7921 + '\x1ef2'# -> unI64 7923 + '\x1ef4'# -> unI64 7925 + '\x1ef6'# -> unI64 7927 + '\x1ef8'# -> unI64 7929 + '\x1efa'# -> unI64 7931 + '\x1efc'# -> unI64 7933 + '\x1efe'# -> unI64 7935 + '\x1f08'# -> unI64 7936 + '\x1f09'# -> unI64 7937 + '\x1f0a'# -> unI64 7938 + '\x1f0b'# -> unI64 7939 + '\x1f0c'# -> unI64 7940 + '\x1f0d'# -> unI64 7941 + '\x1f0e'# -> unI64 7942 + '\x1f0f'# -> unI64 7943 + '\x1f18'# -> unI64 7952 + '\x1f19'# -> unI64 7953 + '\x1f1a'# -> unI64 7954 + '\x1f1b'# -> unI64 7955 + '\x1f1c'# -> unI64 7956 + '\x1f1d'# -> unI64 7957 + '\x1f28'# -> unI64 7968 + '\x1f29'# -> unI64 7969 + '\x1f2a'# -> unI64 7970 + '\x1f2b'# -> unI64 7971 + '\x1f2c'# -> unI64 7972 + '\x1f2d'# -> unI64 7973 + '\x1f2e'# -> unI64 7974 + '\x1f2f'# -> unI64 7975 + '\x1f38'# -> unI64 7984 + '\x1f39'# -> unI64 7985 + '\x1f3a'# -> unI64 7986 + '\x1f3b'# -> unI64 7987 + '\x1f3c'# -> unI64 7988 + '\x1f3d'# -> unI64 7989 + '\x1f3e'# -> unI64 7990 + '\x1f3f'# -> unI64 7991 + '\x1f48'# -> unI64 8000 + '\x1f49'# -> unI64 8001 + '\x1f4a'# -> unI64 8002 + '\x1f4b'# -> unI64 8003 + '\x1f4c'# 
-> unI64 8004 + '\x1f4d'# -> unI64 8005 + '\x1f59'# -> unI64 8017 + '\x1f5b'# -> unI64 8019 + '\x1f5d'# -> unI64 8021 + '\x1f5f'# -> unI64 8023 + '\x1f68'# -> unI64 8032 + '\x1f69'# -> unI64 8033 + '\x1f6a'# -> unI64 8034 + '\x1f6b'# -> unI64 8035 + '\x1f6c'# -> unI64 8036 + '\x1f6d'# -> unI64 8037 + '\x1f6e'# -> unI64 8038 + '\x1f6f'# -> unI64 8039 + '\x1fb8'# -> unI64 8112 + '\x1fb9'# -> unI64 8113 + '\x1fba'# -> unI64 8048 + '\x1fbb'# -> unI64 8049 + '\x1fc8'# -> unI64 8050 + '\x1fc9'# -> unI64 8051 + '\x1fca'# -> unI64 8052 + '\x1fcb'# -> unI64 8053 + '\x1fd8'# -> unI64 8144 + '\x1fd9'# -> unI64 8145 + '\x1fda'# -> unI64 8054 + '\x1fdb'# -> unI64 8055 + '\x1fe8'# -> unI64 8160 + '\x1fe9'# -> unI64 8161 + '\x1fea'# -> unI64 8058 + '\x1feb'# -> unI64 8059 + '\x1fec'# -> unI64 8165 + '\x1ff8'# -> unI64 8056 + '\x1ff9'# -> unI64 8057 + '\x1ffa'# -> unI64 8060 + '\x1ffb'# -> unI64 8061 + '\x2126'# -> unI64 969 + '\x212a'# -> unI64 107 + '\x212b'# -> unI64 229 + '\x2132'# -> unI64 8526 + '\x2160'# -> unI64 8560 + '\x2161'# -> unI64 8561 + '\x2162'# -> unI64 8562 + '\x2163'# -> unI64 8563 + '\x2164'# -> unI64 8564 + '\x2165'# -> unI64 8565 + '\x2166'# -> unI64 8566 + '\x2167'# -> unI64 8567 + '\x2168'# -> unI64 8568 + '\x2169'# -> unI64 8569 + '\x216a'# -> unI64 8570 + '\x216b'# -> unI64 8571 + '\x216c'# -> unI64 8572 + '\x216d'# -> unI64 8573 + '\x216e'# -> unI64 8574 + '\x216f'# -> unI64 8575 + '\x2183'# -> unI64 8580 + '\x24b6'# -> unI64 9424 + '\x24b7'# -> unI64 9425 + '\x24b8'# -> unI64 9426 + '\x24b9'# -> unI64 9427 + '\x24ba'# -> unI64 9428 + '\x24bb'# -> unI64 9429 + '\x24bc'# -> unI64 9430 + '\x24bd'# -> unI64 9431 + '\x24be'# -> unI64 9432 + '\x24bf'# -> unI64 9433 + '\x24c0'# -> unI64 9434 + '\x24c1'# -> unI64 9435 + '\x24c2'# -> unI64 9436 + '\x24c3'# -> unI64 9437 + '\x24c4'# -> unI64 9438 + '\x24c5'# -> unI64 9439 + '\x24c6'# -> unI64 9440 + '\x24c7'# -> unI64 9441 + '\x24c8'# -> unI64 9442 + '\x24c9'# -> unI64 9443 + '\x24ca'# -> unI64 9444 + '\x24cb'# -> unI64 9445 + '\x24cc'# -> unI64 9446 + '\x24cd'# -> unI64 9447 + '\x24ce'# -> unI64 9448 + '\x24cf'# -> unI64 9449 + '\x2c00'# -> unI64 11312 + '\x2c01'# -> unI64 11313 + '\x2c02'# -> unI64 11314 + '\x2c03'# -> unI64 11315 + '\x2c04'# -> unI64 11316 + '\x2c05'# -> unI64 11317 + '\x2c06'# -> unI64 11318 + '\x2c07'# -> unI64 11319 + '\x2c08'# -> unI64 11320 + '\x2c09'# -> unI64 11321 + '\x2c0a'# -> unI64 11322 + '\x2c0b'# -> unI64 11323 + '\x2c0c'# -> unI64 11324 + '\x2c0d'# -> unI64 11325 + '\x2c0e'# -> unI64 11326 + '\x2c0f'# -> unI64 11327 + '\x2c10'# -> unI64 11328 + '\x2c11'# -> unI64 11329 + '\x2c12'# -> unI64 11330 + '\x2c13'# -> unI64 11331 + '\x2c14'# -> unI64 11332 + '\x2c15'# -> unI64 11333 + '\x2c16'# -> unI64 11334 + '\x2c17'# -> unI64 11335 + '\x2c18'# -> unI64 11336 + '\x2c19'# -> unI64 11337 + '\x2c1a'# -> unI64 11338 + '\x2c1b'# -> unI64 11339 + '\x2c1c'# -> unI64 11340 + '\x2c1d'# -> unI64 11341 + '\x2c1e'# -> unI64 11342 + '\x2c1f'# -> unI64 11343 + '\x2c20'# -> unI64 11344 + '\x2c21'# -> unI64 11345 + '\x2c22'# -> unI64 11346 + '\x2c23'# -> unI64 11347 + '\x2c24'# -> unI64 11348 + '\x2c25'# -> unI64 11349 + '\x2c26'# -> unI64 11350 + '\x2c27'# -> unI64 11351 + '\x2c28'# -> unI64 11352 + '\x2c29'# -> unI64 11353 + '\x2c2a'# -> unI64 11354 + '\x2c2b'# -> unI64 11355 + '\x2c2c'# -> unI64 11356 + '\x2c2d'# -> unI64 11357 + '\x2c2e'# -> unI64 11358 + '\x2c60'# -> unI64 11361 + '\x2c62'# -> unI64 619 + '\x2c63'# -> unI64 7549 + '\x2c64'# -> unI64 637 + '\x2c67'# -> unI64 11368 + '\x2c69'# -> unI64 11370 + '\x2c6b'# 
-> unI64 11372 + '\x2c6d'# -> unI64 593 + '\x2c6e'# -> unI64 625 + '\x2c6f'# -> unI64 592 + '\x2c70'# -> unI64 594 + '\x2c72'# -> unI64 11379 + '\x2c75'# -> unI64 11382 + '\x2c7e'# -> unI64 575 + '\x2c7f'# -> unI64 576 + '\x2c80'# -> unI64 11393 + '\x2c82'# -> unI64 11395 + '\x2c84'# -> unI64 11397 + '\x2c86'# -> unI64 11399 + '\x2c88'# -> unI64 11401 + '\x2c8a'# -> unI64 11403 + '\x2c8c'# -> unI64 11405 + '\x2c8e'# -> unI64 11407 + '\x2c90'# -> unI64 11409 + '\x2c92'# -> unI64 11411 + '\x2c94'# -> unI64 11413 + '\x2c96'# -> unI64 11415 + '\x2c98'# -> unI64 11417 + '\x2c9a'# -> unI64 11419 + '\x2c9c'# -> unI64 11421 + '\x2c9e'# -> unI64 11423 + '\x2ca0'# -> unI64 11425 + '\x2ca2'# -> unI64 11427 + '\x2ca4'# -> unI64 11429 + '\x2ca6'# -> unI64 11431 + '\x2ca8'# -> unI64 11433 + '\x2caa'# -> unI64 11435 + '\x2cac'# -> unI64 11437 + '\x2cae'# -> unI64 11439 + '\x2cb0'# -> unI64 11441 + '\x2cb2'# -> unI64 11443 + '\x2cb4'# -> unI64 11445 + '\x2cb6'# -> unI64 11447 + '\x2cb8'# -> unI64 11449 + '\x2cba'# -> unI64 11451 + '\x2cbc'# -> unI64 11453 + '\x2cbe'# -> unI64 11455 + '\x2cc0'# -> unI64 11457 + '\x2cc2'# -> unI64 11459 + '\x2cc4'# -> unI64 11461 + '\x2cc6'# -> unI64 11463 + '\x2cc8'# -> unI64 11465 + '\x2cca'# -> unI64 11467 + '\x2ccc'# -> unI64 11469 + '\x2cce'# -> unI64 11471 + '\x2cd0'# -> unI64 11473 + '\x2cd2'# -> unI64 11475 + '\x2cd4'# -> unI64 11477 + '\x2cd6'# -> unI64 11479 + '\x2cd8'# -> unI64 11481 + '\x2cda'# -> unI64 11483 + '\x2cdc'# -> unI64 11485 + '\x2cde'# -> unI64 11487 + '\x2ce0'# -> unI64 11489 + '\x2ce2'# -> unI64 11491 + '\x2ceb'# -> unI64 11500 + '\x2ced'# -> unI64 11502 + '\x2cf2'# -> unI64 11507 + '\xa640'# -> unI64 42561 + '\xa642'# -> unI64 42563 + '\xa644'# -> unI64 42565 + '\xa646'# -> unI64 42567 + '\xa648'# -> unI64 42569 + '\xa64a'# -> unI64 42571 + '\xa64c'# -> unI64 42573 + '\xa64e'# -> unI64 42575 + '\xa650'# -> unI64 42577 + '\xa652'# -> unI64 42579 + '\xa654'# -> unI64 42581 + '\xa656'# -> unI64 42583 + '\xa658'# -> unI64 42585 + '\xa65a'# -> unI64 42587 + '\xa65c'# -> unI64 42589 + '\xa65e'# -> unI64 42591 + '\xa660'# -> unI64 42593 + '\xa662'# -> unI64 42595 + '\xa664'# -> unI64 42597 + '\xa666'# -> unI64 42599 + '\xa668'# -> unI64 42601 + '\xa66a'# -> unI64 42603 + '\xa66c'# -> unI64 42605 + '\xa680'# -> unI64 42625 + '\xa682'# -> unI64 42627 + '\xa684'# -> unI64 42629 + '\xa686'# -> unI64 42631 + '\xa688'# -> unI64 42633 + '\xa68a'# -> unI64 42635 + '\xa68c'# -> unI64 42637 + '\xa68e'# -> unI64 42639 + '\xa690'# -> unI64 42641 + '\xa692'# -> unI64 42643 + '\xa694'# -> unI64 42645 + '\xa696'# -> unI64 42647 + '\xa698'# -> unI64 42649 + '\xa69a'# -> unI64 42651 + '\xa722'# -> unI64 42787 + '\xa724'# -> unI64 42789 + '\xa726'# -> unI64 42791 + '\xa728'# -> unI64 42793 + '\xa72a'# -> unI64 42795 + '\xa72c'# -> unI64 42797 + '\xa72e'# -> unI64 42799 + '\xa732'# -> unI64 42803 + '\xa734'# -> unI64 42805 + '\xa736'# -> unI64 42807 + '\xa738'# -> unI64 42809 + '\xa73a'# -> unI64 42811 + '\xa73c'# -> unI64 42813 + '\xa73e'# -> unI64 42815 + '\xa740'# -> unI64 42817 + '\xa742'# -> unI64 42819 + '\xa744'# -> unI64 42821 + '\xa746'# -> unI64 42823 + '\xa748'# -> unI64 42825 + '\xa74a'# -> unI64 42827 + '\xa74c'# -> unI64 42829 + '\xa74e'# -> unI64 42831 + '\xa750'# -> unI64 42833 + '\xa752'# -> unI64 42835 + '\xa754'# -> unI64 42837 + '\xa756'# -> unI64 42839 + '\xa758'# -> unI64 42841 + '\xa75a'# -> unI64 42843 + '\xa75c'# -> unI64 42845 + '\xa75e'# -> unI64 42847 + '\xa760'# -> unI64 42849 + '\xa762'# -> unI64 42851 + '\xa764'# -> unI64 42853 + '\xa766'# -> 
unI64 42855 + '\xa768'# -> unI64 42857 + '\xa76a'# -> unI64 42859 + '\xa76c'# -> unI64 42861 + '\xa76e'# -> unI64 42863 + '\xa779'# -> unI64 42874 + '\xa77b'# -> unI64 42876 + '\xa77d'# -> unI64 7545 + '\xa77e'# -> unI64 42879 + '\xa780'# -> unI64 42881 + '\xa782'# -> unI64 42883 + '\xa784'# -> unI64 42885 + '\xa786'# -> unI64 42887 + '\xa78b'# -> unI64 42892 + '\xa78d'# -> unI64 613 + '\xa790'# -> unI64 42897 + '\xa792'# -> unI64 42899 + '\xa796'# -> unI64 42903 + '\xa798'# -> unI64 42905 + '\xa79a'# -> unI64 42907 + '\xa79c'# -> unI64 42909 + '\xa79e'# -> unI64 42911 + '\xa7a0'# -> unI64 42913 + '\xa7a2'# -> unI64 42915 + '\xa7a4'# -> unI64 42917 + '\xa7a6'# -> unI64 42919 + '\xa7a8'# -> unI64 42921 + '\xa7aa'# -> unI64 614 + '\xa7ab'# -> unI64 604 + '\xa7ac'# -> unI64 609 + '\xa7ad'# -> unI64 620 + '\xa7ae'# -> unI64 618 + '\xa7b0'# -> unI64 670 + '\xa7b1'# -> unI64 647 + '\xa7b2'# -> unI64 669 + '\xa7b3'# -> unI64 43859 + '\xa7b4'# -> unI64 42933 + '\xa7b6'# -> unI64 42935 + '\xa7b8'# -> unI64 42937 + '\xa7ba'# -> unI64 42939 + '\xa7bc'# -> unI64 42941 + '\xa7be'# -> unI64 42943 + '\xa7c2'# -> unI64 42947 + '\xa7c4'# -> unI64 42900 + '\xa7c5'# -> unI64 642 + '\xa7c6'# -> unI64 7566 + '\xff21'# -> unI64 65345 + '\xff22'# -> unI64 65346 + '\xff23'# -> unI64 65347 + '\xff24'# -> unI64 65348 + '\xff25'# -> unI64 65349 + '\xff26'# -> unI64 65350 + '\xff27'# -> unI64 65351 + '\xff28'# -> unI64 65352 + '\xff29'# -> unI64 65353 + '\xff2a'# -> unI64 65354 + '\xff2b'# -> unI64 65355 + '\xff2c'# -> unI64 65356 + '\xff2d'# -> unI64 65357 + '\xff2e'# -> unI64 65358 + '\xff2f'# -> unI64 65359 + '\xff30'# -> unI64 65360 + '\xff31'# -> unI64 65361 + '\xff32'# -> unI64 65362 + '\xff33'# -> unI64 65363 + '\xff34'# -> unI64 65364 + '\xff35'# -> unI64 65365 + '\xff36'# -> unI64 65366 + '\xff37'# -> unI64 65367 + '\xff38'# -> unI64 65368 + '\xff39'# -> unI64 65369 + '\xff3a'# -> unI64 65370 + '\x10400'# -> unI64 66600 + '\x10401'# -> unI64 66601 + '\x10402'# -> unI64 66602 + '\x10403'# -> unI64 66603 + '\x10404'# -> unI64 66604 + '\x10405'# -> unI64 66605 + '\x10406'# -> unI64 66606 + '\x10407'# -> unI64 66607 + '\x10408'# -> unI64 66608 + '\x10409'# -> unI64 66609 + '\x1040a'# -> unI64 66610 + '\x1040b'# -> unI64 66611 + '\x1040c'# -> unI64 66612 + '\x1040d'# -> unI64 66613 + '\x1040e'# -> unI64 66614 + '\x1040f'# -> unI64 66615 + '\x10410'# -> unI64 66616 + '\x10411'# -> unI64 66617 + '\x10412'# -> unI64 66618 + '\x10413'# -> unI64 66619 + '\x10414'# -> unI64 66620 + '\x10415'# -> unI64 66621 + '\x10416'# -> unI64 66622 + '\x10417'# -> unI64 66623 + '\x10418'# -> unI64 66624 + '\x10419'# -> unI64 66625 + '\x1041a'# -> unI64 66626 + '\x1041b'# -> unI64 66627 + '\x1041c'# -> unI64 66628 + '\x1041d'# -> unI64 66629 + '\x1041e'# -> unI64 66630 + '\x1041f'# -> unI64 66631 + '\x10420'# -> unI64 66632 + '\x10421'# -> unI64 66633 + '\x10422'# -> unI64 66634 + '\x10423'# -> unI64 66635 + '\x10424'# -> unI64 66636 + '\x10425'# -> unI64 66637 + '\x10426'# -> unI64 66638 + '\x10427'# -> unI64 66639 + '\x104b0'# -> unI64 66776 + '\x104b1'# -> unI64 66777 + '\x104b2'# -> unI64 66778 + '\x104b3'# -> unI64 66779 + '\x104b4'# -> unI64 66780 + '\x104b5'# -> unI64 66781 + '\x104b6'# -> unI64 66782 + '\x104b7'# -> unI64 66783 + '\x104b8'# -> unI64 66784 + '\x104b9'# -> unI64 66785 + '\x104ba'# -> unI64 66786 + '\x104bb'# -> unI64 66787 + '\x104bc'# -> unI64 66788 + '\x104bd'# -> unI64 66789 + '\x104be'# -> unI64 66790 + '\x104bf'# -> unI64 66791 + '\x104c0'# -> unI64 66792 + '\x104c1'# -> unI64 66793 + '\x104c2'# -> unI64 
66794 + '\x104c3'# -> unI64 66795 + '\x104c4'# -> unI64 66796 + '\x104c5'# -> unI64 66797 + '\x104c6'# -> unI64 66798 + '\x104c7'# -> unI64 66799 + '\x104c8'# -> unI64 66800 + '\x104c9'# -> unI64 66801 + '\x104ca'# -> unI64 66802 + '\x104cb'# -> unI64 66803 + '\x104cc'# -> unI64 66804 + '\x104cd'# -> unI64 66805 + '\x104ce'# -> unI64 66806 + '\x104cf'# -> unI64 66807 + '\x104d0'# -> unI64 66808 + '\x104d1'# -> unI64 66809 + '\x104d2'# -> unI64 66810 + '\x104d3'# -> unI64 66811 + '\x10c80'# -> unI64 68800 + '\x10c81'# -> unI64 68801 + '\x10c82'# -> unI64 68802 + '\x10c83'# -> unI64 68803 + '\x10c84'# -> unI64 68804 + '\x10c85'# -> unI64 68805 + '\x10c86'# -> unI64 68806 + '\x10c87'# -> unI64 68807 + '\x10c88'# -> unI64 68808 + '\x10c89'# -> unI64 68809 + '\x10c8a'# -> unI64 68810 + '\x10c8b'# -> unI64 68811 + '\x10c8c'# -> unI64 68812 + '\x10c8d'# -> unI64 68813 + '\x10c8e'# -> unI64 68814 + '\x10c8f'# -> unI64 68815 + '\x10c90'# -> unI64 68816 + '\x10c91'# -> unI64 68817 + '\x10c92'# -> unI64 68818 + '\x10c93'# -> unI64 68819 + '\x10c94'# -> unI64 68820 + '\x10c95'# -> unI64 68821 + '\x10c96'# -> unI64 68822 + '\x10c97'# -> unI64 68823 + '\x10c98'# -> unI64 68824 + '\x10c99'# -> unI64 68825 + '\x10c9a'# -> unI64 68826 + '\x10c9b'# -> unI64 68827 + '\x10c9c'# -> unI64 68828 + '\x10c9d'# -> unI64 68829 + '\x10c9e'# -> unI64 68830 + '\x10c9f'# -> unI64 68831 + '\x10ca0'# -> unI64 68832 + '\x10ca1'# -> unI64 68833 + '\x10ca2'# -> unI64 68834 + '\x10ca3'# -> unI64 68835 + '\x10ca4'# -> unI64 68836 + '\x10ca5'# -> unI64 68837 + '\x10ca6'# -> unI64 68838 + '\x10ca7'# -> unI64 68839 + '\x10ca8'# -> unI64 68840 + '\x10ca9'# -> unI64 68841 + '\x10caa'# -> unI64 68842 + '\x10cab'# -> unI64 68843 + '\x10cac'# -> unI64 68844 + '\x10cad'# -> unI64 68845 + '\x10cae'# -> unI64 68846 + '\x10caf'# -> unI64 68847 + '\x10cb0'# -> unI64 68848 + '\x10cb1'# -> unI64 68849 + '\x10cb2'# -> unI64 68850 + '\x118a0'# -> unI64 71872 + '\x118a1'# -> unI64 71873 + '\x118a2'# -> unI64 71874 + '\x118a3'# -> unI64 71875 + '\x118a4'# -> unI64 71876 + '\x118a5'# -> unI64 71877 + '\x118a6'# -> unI64 71878 + '\x118a7'# -> unI64 71879 + '\x118a8'# -> unI64 71880 + '\x118a9'# -> unI64 71881 + '\x118aa'# -> unI64 71882 + '\x118ab'# -> unI64 71883 + '\x118ac'# -> unI64 71884 + '\x118ad'# -> unI64 71885 + '\x118ae'# -> unI64 71886 + '\x118af'# -> unI64 71887 + '\x118b0'# -> unI64 71888 + '\x118b1'# -> unI64 71889 + '\x118b2'# -> unI64 71890 + '\x118b3'# -> unI64 71891 + '\x118b4'# -> unI64 71892 + '\x118b5'# -> unI64 71893 + '\x118b6'# -> unI64 71894 + '\x118b7'# -> unI64 71895 + '\x118b8'# -> unI64 71896 + '\x118b9'# -> unI64 71897 + '\x118ba'# -> unI64 71898 + '\x118bb'# -> unI64 71899 + '\x118bc'# -> unI64 71900 + '\x118bd'# -> unI64 71901 + '\x118be'# -> unI64 71902 + '\x118bf'# -> unI64 71903 + '\x16e40'# -> unI64 93792 + '\x16e41'# -> unI64 93793 + '\x16e42'# -> unI64 93794 + '\x16e43'# -> unI64 93795 + '\x16e44'# -> unI64 93796 + '\x16e45'# -> unI64 93797 + '\x16e46'# -> unI64 93798 + '\x16e47'# -> unI64 93799 + '\x16e48'# -> unI64 93800 + '\x16e49'# -> unI64 93801 + '\x16e4a'# -> unI64 93802 + '\x16e4b'# -> unI64 93803 + '\x16e4c'# -> unI64 93804 + '\x16e4d'# -> unI64 93805 + '\x16e4e'# -> unI64 93806 + '\x16e4f'# -> unI64 93807 + '\x16e50'# -> unI64 93808 + '\x16e51'# -> unI64 93809 + '\x16e52'# -> unI64 93810 + '\x16e53'# -> unI64 93811 + '\x16e54'# -> unI64 93812 + '\x16e55'# -> unI64 93813 + '\x16e56'# -> unI64 93814 + '\x16e57'# -> unI64 93815 + '\x16e58'# -> unI64 93816 + '\x16e59'# -> unI64 93817 + '\x16e5a'# -> 
unI64 93818 + '\x16e5b'# -> unI64 93819 + '\x16e5c'# -> unI64 93820 + '\x16e5d'# -> unI64 93821 + '\x16e5e'# -> unI64 93822 + '\x16e5f'# -> unI64 93823 + '\x1e900'# -> unI64 125218 + '\x1e901'# -> unI64 125219 + '\x1e902'# -> unI64 125220 + '\x1e903'# -> unI64 125221 + '\x1e904'# -> unI64 125222 + '\x1e905'# -> unI64 125223 + '\x1e906'# -> unI64 125224 + '\x1e907'# -> unI64 125225 + '\x1e908'# -> unI64 125226 + '\x1e909'# -> unI64 125227 + '\x1e90a'# -> unI64 125228 + '\x1e90b'# -> unI64 125229 + '\x1e90c'# -> unI64 125230 + '\x1e90d'# -> unI64 125231 + '\x1e90e'# -> unI64 125232 + '\x1e90f'# -> unI64 125233 + '\x1e910'# -> unI64 125234 + '\x1e911'# -> unI64 125235 + '\x1e912'# -> unI64 125236 + '\x1e913'# -> unI64 125237 + '\x1e914'# -> unI64 125238 + '\x1e915'# -> unI64 125239 + '\x1e916'# -> unI64 125240 + '\x1e917'# -> unI64 125241 + '\x1e918'# -> unI64 125242 + '\x1e919'# -> unI64 125243 + '\x1e91a'# -> unI64 125244 + '\x1e91b'# -> unI64 125245 + '\x1e91c'# -> unI64 125246 + '\x1e91d'# -> unI64 125247 + '\x1e91e'# -> unI64 125248 + '\x1e91f'# -> unI64 125249 + '\x1e920'# -> unI64 125250 + '\x1e921'# -> unI64 125251 + _ -> unI64 0 diff --git a/src/Data/Text/Internal/Fusion/Common.hs b/src/Data/Text/Internal/Fusion/Common.hs index dc19cf1b..da0fbbe8 100644 --- a/src/Data/Text/Internal/Fusion/Common.hs +++ b/src/Data/Text/Internal/Fusion/Common.hs @@ -1,4 +1,5 @@ -{-# LANGUAGE BangPatterns, MagicHash, Rank2Types #-} +{-# LANGUAGE BangPatterns, MagicHash, Rank2Types, PartialTypeSignatures #-} +{-# OPTIONS_GHC -Wno-partial-type-signatures #-} -- | -- Module : Data.Text.Internal.Fusion.Common -- Copyright : (c) Bryan O'Sullivan 2009, 2012 @@ -124,13 +125,15 @@ import Prelude (Bool(..), Char, Eq(..), Int, Integral, Maybe(..), (&&), fromIntegral, otherwise) import qualified Data.List as L import qualified Prelude as P +import Data.Bits (shiftL, shiftR, (.&.)) import Data.Char (isLetter, isSpace) -import Data.Int (Int64) +import GHC.Int (Int64(..)) import Data.Text.Internal.Encoding.Utf8 (chr2, chr3, chr4, utf8LengthByLeader) import Data.Text.Internal.Fusion.Types import Data.Text.Internal.Fusion.CaseMapping (foldMapping, lowerMapping, titleMapping, upperMapping) import Data.Text.Internal.Fusion.Size +import GHC.Exts (Char(..), Char#, chr#) import GHC.Prim (Addr#, indexWord8OffAddr#) import GHC.Types (Int(..)) import Data.Text.Internal.Unsafe.Char (unsafeChr8) @@ -478,17 +481,27 @@ intersperse c (Stream next0 s0 len) = Stream next (I1 s0) (len + unknownSize) -- characters. -- | Map a 'Stream' through the given case-mapping function. -caseConvert :: (forall s. Char -> s -> Step (CC s) Char) +caseConvert :: (Char# -> _ {- unboxed Int64 -}) -> Stream Char -> Stream Char caseConvert remap (Stream next0 s0 len) = - Stream next (CC s0 '\0' '\0') (len `unionSize` (3*len)) + Stream next (CC s0 0) (len `unionSize` (3*len)) where - next (CC s '\0' _) = + next (CC s 0) = case next0 s of Done -> Done - Skip s' -> Skip (CC s' '\0' '\0') - Yield c s' -> remap c s' - next (CC s a b) = Yield a (CC s b '\0') + Skip s' -> Skip (CC s' 0) + Yield c@(C# c#) s' -> case I64# (remap c#) of + 0 -> Yield c (CC s' 0) + ab -> let (a, b) = chopOffChar ab in + Yield a (CC s' b) + next (CC s ab) = let (a, b) = chopOffChar ab in Yield a (CC s b) + +chopOffChar :: Int64 -> (Char, Int64) +chopOffChar ab = (chr a, ab `shiftR` 21) + where + chr (I# n) = C# (chr# n) + mask = (1 `shiftL` 21) - 1 + a = fromIntegral $ ab .&. mask -- | /O(n)/ Convert a string to folded case. 
This function is mainly -- useful for performing caseless (or case insensitive) string @@ -556,20 +569,25 @@ toLower = caseConvert lowerMapping -- -- @ 'Data.Text.Internal.unstream' . 'toTitle' . 'Data.Text.Internal.Fusion.stream' = 'Data.Text.toTitle' @ toTitle :: Stream Char -> Stream Char -toTitle (Stream next0 s0 len) = Stream next (CC (False :*: s0) '\0' '\0') (len + unknownSize) +toTitle (Stream next0 s0 len) = Stream next (CC (False :*: s0) 0) (len + unknownSize) where - next (CC (letter :*: s) '\0' _) = + next (CC (letter :*: s) 0) = case next0 s of Done -> Done - Skip s' -> Skip (CC (letter :*: s') '\0' '\0') - Yield c s' - | nonSpace -> if letter - then lowerMapping c (nonSpace :*: s') - else titleMapping c (letter' :*: s') - | otherwise -> Yield c (CC (letter' :*: s') '\0' '\0') + Skip s' -> Skip (CC (letter :*: s') 0) + Yield c@(C# c#) s' + | nonSpace, letter -> case I64# (lowerMapping c#) of + 0 -> Yield c (CC (nonSpace :*: s') 0) + ab -> let (a, b) = chopOffChar ab in + Yield a (CC (nonSpace :*: s') b) + | nonSpace -> case I64# (titleMapping c#) of + 0 -> Yield c (CC (letter' :*: s') 0) + ab -> let (a, b) = chopOffChar ab in + Yield a (CC (letter' :*: s') b) + | otherwise -> Yield c (CC (letter' :*: s') 0) where nonSpace = P.not (isSpace c) letter' = isLetter c - next (CC s a b) = Yield a (CC s b '\0') + next (CC s ab) = let (a, b) = chopOffChar ab in Yield a (CC s b) {-# INLINE [0] toTitle #-} data Justify i s = Just1 !i !s diff --git a/src/Data/Text/Internal/Fusion/Types.hs b/src/Data/Text/Internal/Fusion/Types.hs index b97784f8..e56236bd 100644 --- a/src/Data/Text/Internal/Fusion/Types.hs +++ b/src/Data/Text/Internal/Fusion/Types.hs @@ -29,10 +29,11 @@ module Data.Text.Internal.Fusion.Types ) where import Data.Text.Internal.Fusion.Size +import Data.Int (Int64) import Data.Word (Word8) -- | Specialised tuple for case conversion. -data CC s = CC !s {-# UNPACK #-} !Char {-# UNPACK #-} !Char +data CC s = CC !s {-# UNPACK #-} !Int64 -- | Restreaming state. 
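
-- The sketch below is not part of the patch; it illustrates the packing
-- convention that the new Int64 field of CC and chopOffChar (above, in
-- Data.Text.Internal.Fusion.Common) undo. Up to three case-mapped code
-- points fit in one Int64, 21 bits per code point, least significant
-- first; 0 means "no extra output pending". The packer here is an
-- assumption for illustration only, but it is consistent with the 21-bit
-- mask and shift used by chopOffChar.

import Data.Bits (shiftL, (.|.))
import Data.Char (ord)
import Data.Int (Int64)

packMapping :: [Char] -> Int64
packMapping = foldr (\c acc -> (acc `shiftL` 21) .|. fromIntegral (ord c)) 0

-- For instance, a full upper-case mapping of '\x00DF' to "SS" could be
-- stored as packMapping "SS": chopOffChar first yields 'S' together with
-- an Int64 still holding the second 'S', and then 'S' together with 0.
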
data RS s From ddf3455547829f2e1447286dc0b0b8806fb58446 Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Mon, 2 Aug 2021 02:04:56 +0100 Subject: [PATCH 20/38] Speed up instance Eq for lazy Text --- src/Data/Text/Lazy.hs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/Data/Text/Lazy.hs b/src/Data/Text/Lazy.hs index c326a32b..fb5bf834 100644 --- a/src/Data/Text/Lazy.hs +++ b/src/Data/Text/Lazy.hs @@ -214,6 +214,7 @@ import Data.Monoid (Monoid(..)) import Data.Semigroup (Semigroup(..)) import Data.String (IsString(..)) import qualified Data.Text as T +import qualified Data.Text.Array as A import qualified Data.Text.Internal as T import qualified Data.Text.Internal.Fusion.Common as S import qualified Data.Text.Unsafe as T @@ -278,15 +279,14 @@ equal :: Text -> Text -> Bool equal Empty Empty = True equal Empty _ = False equal _ Empty = False -equal (Chunk a as) (Chunk b bs) = +equal (Chunk (T.Text arrA offA lenA) as) (Chunk (T.Text arrB offB lenB) bs) = case compare lenA lenB of - LT -> a == (T.takeWord8 lenA b) && - as `equal` Chunk (T.dropWord8 lenA b) bs - EQ -> a == b && as `equal` bs - GT -> T.takeWord8 lenB a == b && - Chunk (T.dropWord8 lenB a) as `equal` bs - where lenA = T.lengthWord8 a - lenB = T.lengthWord8 b + LT -> A.equal arrA offA arrB offB lenA && + as `equal` Chunk (T.Text arrB (offB + lenA) (lenB - lenA)) bs + EQ -> A.equal arrA offA arrB offB lenA && + as `equal` bs + GT -> A.equal arrA offA arrB offB lenB && + Chunk (T.Text arrA (offA + lenB) (lenA - lenB)) as `equal` bs instance Eq Text where (==) = equal From 81b1b50652791e33c069e9c52f620384dbed8765 Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Mon, 2 Aug 2021 19:10:57 +0100 Subject: [PATCH 21/38] Move equal into Data.Text.Internal.Lazy --- src/Data/Text/Internal/Lazy.hs | 16 ++++++++++++++++ src/Data/Text/Lazy.hs | 16 +--------------- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/src/Data/Text/Internal/Lazy.hs b/src/Data/Text/Internal/Lazy.hs index dc45787f..928e5cfc 100644 --- a/src/Data/Text/Internal/Lazy.hs +++ b/src/Data/Text/Internal/Lazy.hs @@ -36,12 +36,15 @@ module Data.Text.Internal.Lazy , defaultChunkSize , smallChunkSize , chunkOverhead + + , equal ) where import Data.Bits (shiftL) import Data.Text () import Data.Typeable (Typeable) import Foreign.Storable (sizeOf) +import qualified Data.Text.Array as A import qualified Data.Text.Internal as T data Text = Empty @@ -117,3 +120,16 @@ smallChunkSize = 128 - chunkOverhead chunkOverhead :: Int chunkOverhead = sizeOf (undefined :: Int) `shiftL` 1 {-# INLINE chunkOverhead #-} + +equal :: Text -> Text -> Bool +equal Empty Empty = True +equal Empty _ = False +equal _ Empty = False +equal (Chunk (T.Text arrA offA lenA) as) (Chunk (T.Text arrB offB lenB) bs) = + case compare lenA lenB of + LT -> A.equal arrA offA arrB offB lenA && + as `equal` Chunk (T.Text arrB (offB + lenA) (lenB - lenA)) bs + EQ -> A.equal arrA offA arrB offB lenA && + as `equal` bs + GT -> A.equal arrA offA arrB offB lenB && + Chunk (T.Text arrA (offA + lenB) (lenA - lenB)) as `equal` bs diff --git a/src/Data/Text/Lazy.hs b/src/Data/Text/Lazy.hs index fb5bf834..03f8de49 100644 --- a/src/Data/Text/Lazy.hs +++ b/src/Data/Text/Lazy.hs @@ -214,7 +214,6 @@ import Data.Monoid (Monoid(..)) import Data.Semigroup (Semigroup(..)) import Data.String (IsString(..)) import qualified Data.Text as T -import qualified Data.Text.Array as A import qualified Data.Text.Internal as T import qualified Data.Text.Internal.Fusion.Common as S import qualified 
Data.Text.Unsafe as T @@ -222,7 +221,7 @@ import qualified Data.Text.Internal.Lazy.Fusion as S import Data.Text.Internal.Fusion.Types (PairS(..)) import Data.Text.Internal.Lazy.Fusion (stream, unstream) import Data.Text.Internal.Lazy (Text(..), chunk, empty, foldlChunks, - foldrChunks, smallChunkSize) + foldrChunks, smallChunkSize, equal) import Data.Text.Internal (firstf, safe, text) import Data.Text.Lazy.Encoding (decodeUtf8', encodeUtf8) import Data.Text.Internal.Lazy.Search (indices) @@ -275,19 +274,6 @@ import GHC.Stack (HasCallStack) -- >>> import qualified Data.Text as T -- >>> :seti -XOverloadedStrings -equal :: Text -> Text -> Bool -equal Empty Empty = True -equal Empty _ = False -equal _ Empty = False -equal (Chunk (T.Text arrA offA lenA) as) (Chunk (T.Text arrB offB lenB) bs) = - case compare lenA lenB of - LT -> A.equal arrA offA arrB offB lenA && - as `equal` Chunk (T.Text arrB (offB + lenA) (lenB - lenA)) bs - EQ -> A.equal arrA offA arrB offB lenA && - as `equal` bs - GT -> A.equal arrA offA arrB offB lenB && - Chunk (T.Text arrA (offA + lenB) (lenA - lenB)) as `equal` bs - instance Eq Text where (==) = equal {-# INLINE (==) #-} From 1ab411bb5d5e8a94bec204bd1bf59acce97dcd9d Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Mon, 2 Aug 2021 19:11:33 +0100 Subject: [PATCH 22/38] Speed up replicate for lazy Text --- src/Data/Text/Lazy.hs | 48 +++++++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/src/Data/Text/Lazy.hs b/src/Data/Text/Lazy.hs index 03f8de49..f17e2254 100644 --- a/src/Data/Text/Lazy.hs +++ b/src/Data/Text/Lazy.hs @@ -2,6 +2,7 @@ {-# LANGUAGE BangPatterns, MagicHash, CPP, TypeFamilies #-} {-# LANGUAGE Trustworthy #-} {-# LANGUAGE TemplateHaskellQuotes #-} +{-# LANGUAGE LambdaCase #-} -- | -- Module : Data.Text.Lazy @@ -199,7 +200,7 @@ module Data.Text.Lazy import Prelude (Char, Bool(..), Maybe(..), String, Eq(..), Ord(..), Ordering(..), Read(..), Show(..), - (&&), (||), (+), (-), (.), ($), (++), + (&&), (+), (-), (.), ($), (++), error, flip, fmap, fromIntegral, not, otherwise, quot) import qualified Prelude as P import Control.DeepSeq (NFData(..)) @@ -221,7 +222,7 @@ import qualified Data.Text.Internal.Lazy.Fusion as S import Data.Text.Internal.Fusion.Types (PairS(..)) import Data.Text.Internal.Lazy.Fusion (stream, unstream) import Data.Text.Internal.Lazy (Text(..), chunk, empty, foldlChunks, - foldrChunks, smallChunkSize, equal) + foldrChunks, smallChunkSize, defaultChunkSize, equal) import Data.Text.Internal (firstf, safe, text) import Data.Text.Lazy.Encoding (decodeUtf8', encodeUtf8) import Data.Text.Internal.Lazy.Search (indices) @@ -591,7 +592,7 @@ intersperse c t = unstream (S.intersperse (safe c) (stream t)) justifyLeft :: Int64 -> Char -> Text -> Text justifyLeft k c t | len >= k = t - | otherwise = t `append` replicateChar (k-len) c + | otherwise = t `append` replicateChunk (k-len) (T.singleton c) where len = length t {-# INLINE [1] justifyLeft #-} @@ -606,7 +607,7 @@ justifyLeft k c t justifyRight :: Int64 -> Char -> Text -> Text justifyRight k c t | len >= k = t - | otherwise = replicateChar (k-len) c `append` t + | otherwise = replicateChunk (k-len) (T.singleton c) `append` t where len = length t {-# INLINE justifyRight #-} @@ -620,7 +621,7 @@ justifyRight k c t center :: Int64 -> Char -> Text -> Text center k c t | len >= k = t - | otherwise = replicateChar l c `append` t `append` replicateChar r c + | otherwise = replicateChunk l (T.singleton c) `append` t `append` replicateChunk r (T.singleton c) where 
len = length t d = k - len r = d `quot` 2 @@ -910,14 +911,28 @@ repeat c = let t = Chunk (T.replicate smallChunkSize (T.singleton c)) t -- | /O(n*m)/ 'replicate' @n@ @t@ is a 'Text' consisting of the input -- @t@ repeated @n@ times. replicate :: Int64 -> Text -> Text -replicate n t - | null t || n <= 0 = empty - | isSingleton t = replicateChar n (head t) - | otherwise = concat (rep 0) - where rep !i | i >= n = [] - | otherwise = t : rep (i+1) +replicate n + | n <= 0 = P.const Empty + | otherwise = \case + Empty -> Empty + Chunk t Empty -> replicateChunk n t + t -> concat (rep n) + where + rep 0 = [] + rep i = t : rep (i - 1) {-# INLINE [1] replicate #-} +replicateChunk :: Int64 -> T.Text -> Text +replicateChunk !n !t@(T.Text _ _ len) + | n <= 0 = Empty + | otherwise = Chunk headChunk $ P.foldr Chunk Empty (L.genericReplicate q normalChunk) + where + perChunk = defaultChunkSize `quot` len + normalChunk = T.replicate perChunk t + (q, r) = n `P.quotRem` intToInt64 perChunk + headChunk = T.replicate (int64ToInt r) t +{-# INLINE replicateChunk #-} + -- | 'cycle' ties a finite, non-empty 'Text' into a circular one, or -- equivalently, the infinite repetition of the original 'Text'. -- @@ -937,17 +952,6 @@ iterate :: (Char -> Char) -> Char -> Text iterate f c = let t c' = Chunk (T.singleton c') (t (f c')) in t c --- | /O(n)/ 'replicateChar' @n@ @c@ is a 'Text' of length @n@ with @c@ the --- value of every element. -replicateChar :: Int64 -> Char -> Text -replicateChar n c = unstream (S.replicateCharI n (safe c)) -{-# INLINE replicateChar #-} - -{-# RULES -"LAZY TEXT replicate/singleton -> replicateChar" [~1] forall n c. - replicate n (singleton c) = replicateChar n c - #-} - -- | /O(n)/, where @n@ is the length of the result. The 'unfoldr' -- function is analogous to the List 'L.unfoldr'. 'unfoldr' builds a -- 'Text' from a seed value. The function takes the element and From c68f8381ac617c44b0af3a9fd757eb68194e54e7 Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Mon, 30 Aug 2021 22:23:59 +0100 Subject: [PATCH 23/38] Add memchr to utils.c --- cbits/utils.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cbits/utils.c b/cbits/utils.c index 2baa78c7..0a2230c8 100644 --- a/cbits/utils.c +++ b/cbits/utils.c @@ -2,10 +2,18 @@ * Copyright (c) 2021 Andrew Lelechenko */ +#include #include #include +#include int _hs_text_memcmp(const void *arr1, size_t off1, const void *arr2, size_t off2, size_t len) { return memcmp(arr1 + off1, arr2 + off2, len); } + +ssize_t _hs_text_memchr(const void *arr, size_t off, size_t len, uint8_t byte) +{ + const void *ptr = memchr(arr + off, byte, len); + return ptr == NULL ? -1 : ptr - (arr + off); +} From 92efd57f11647d7fd4504d513dadae6878ed4c44 Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Mon, 2 Aug 2021 19:11:49 +0100 Subject: [PATCH 24/38] Speed up strict and lazy search --- src/Data/Text.hs | 6 +- src/Data/Text/Internal/Lazy/Search.hs | 135 ++++++++++++-------------- src/Data/Text/Internal/Search.hs | 77 +++++++++------ src/Data/Text/Lazy.hs | 4 +- 4 files changed, 114 insertions(+), 108 deletions(-) diff --git a/src/Data/Text.hs b/src/Data/Text.hs index dd7fb4f8..7a6ad12c 100644 --- a/src/Data/Text.hs +++ b/src/Data/Text.hs @@ -1684,10 +1684,10 @@ findIndex p t = S.findIndex p (stream t) -- In (unlikely) bad cases, this function's time complexity degrades -- towards /O(n*m)/. 
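
-- A small check, not part of the patch, of the semantics the point-free
-- rewrite of count below keeps, written against this module's count,
-- indices and pack: matches are reported left to right and never overlap,
-- so counting "aa" inside "aaaa" yields 2 (positions 0 and 2), not 3.
countNonOverlapping :: Bool
countNonOverlapping =
  count (pack "aa") (pack "aaaa") == 2
    && indices (pack "aa") (pack "aaaa") == [0, 2]
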
count :: Text -> Text -> Int -count pat src +count pat | null pat = emptyError "count" - | isSingleton pat = countChar (unsafeHead pat) src - | otherwise = L.length (indices pat src) + | isSingleton pat = countChar (unsafeHead pat) + | otherwise = L.length . indices pat {-# INLINE [1] count #-} {-# RULES diff --git a/src/Data/Text/Internal/Lazy/Search.hs b/src/Data/Text/Internal/Lazy/Search.hs index 78450cbf..1cf8831b 100644 --- a/src/Data/Text/Internal/Lazy/Search.hs +++ b/src/Data/Text/Internal/Lazy/Search.hs @@ -1,4 +1,6 @@ {-# LANGUAGE BangPatterns, ScopedTypeVariables #-} +{-# LANGUAGE MagicHash #-} +{-# LANGUAGE UnliftedFFITypes #-} -- | -- Module : Data.Text.Lazy.Search @@ -27,9 +29,14 @@ import qualified Data.Text.Array as A import Data.Int (Int64) import Data.Word (Word8, Word64) import qualified Data.Text.Internal as T +import qualified Data.Text as T (concat) import Data.Text.Internal.Fusion.Types (PairS(..)) -import Data.Text.Internal.Lazy (Text(..), foldlChunks) +import Data.Text.Internal.Lazy (Text(..), foldrChunks, equal) +import Data.Text.Unsafe (unsafeDupablePerformIO) import Data.Bits ((.|.), (.&.)) +import Foreign.C.Types +import GHC.Exts (ByteArray#) +import System.Posix.Types (CSsize(..)) -- | /O(n+m)/ Find the offsets of all non-overlapping indices of -- @needle@ within @haystack@. @@ -42,110 +49,90 @@ import Data.Bits ((.|.), (.&.)) indices :: Text -- ^ Substring to search for (@needle@) -> Text -- ^ Text to search in (@haystack@) -> [Int64] -indices needle@(Chunk n ns) _haystack@(Chunk k ks) - | nlen <= 0 = [] - | nlen == 1 = indicesOne (nindex 0) 0 k ks - | otherwise = advance k ks 0 0 +indices needle + | nlen <= 0 = const [] + | nlen == 1 = indicesOne (A.unsafeIndex narr noff) 0 + | otherwise = advance 0 0 where - advance x@(T.Text _ _ l) xs = scan - where - scan !g !i - | i >= m = case xs of - Empty -> [] - Chunk y ys -> advance y ys g (i-m) + T.Text narr noff nlen = T.concat (foldrChunks (:) [] needle) + + advance !_ !_ Empty = [] + advance !(g :: Int64) !(i :: Int) xxs@(Chunk x@(T.Text xarr@(A.ByteArray xarr#) xoff l) xs) + | i >= l = advance g (i - l) xs | lackingHay (i + nlen) x xs = [] - | c == z && candidateMatch 0 = g : scan (g+nlen) (i+nlen) - | otherwise = scan (g+delta) (i+delta) + | c == z && candidateMatch = g : advance (g + intToInt64 nlen) (i + nlen) xxs + | otherwise = advance (g + intToInt64 delta) (i + delta) xxs where - m = intToInt64 l - c = hindex (i + nlast) + c = index xxs (i + nlast) delta | nextInPattern = nlen + 1 | c == z = skip + 1 - | otherwise = 1 - nextInPattern = mask .&. swizzle (hindex (i+nlen)) == 0 - candidateMatch !j - | j >= nlast = True - | hindex (i+j) /= nindex j = False - | otherwise = candidateMatch (j+1) - hindex = index x xs - nlen = wordLength needle + | l >= i + nlen = case unsafeDupablePerformIO $ + memchr xarr# (intToCSize (xoff + i + nlen)) (intToCSize (l - i - nlen)) z of + -1 -> max 1 (l - i - nlen) + s -> cSsizeToInt s + 1 + | otherwise = 1 + nextInPattern = mask .&. 
swizzle (index xxs (i + nlen)) == 0 + + candidateMatch + | i + nlen <= l = A.equal narr noff xarr (xoff + i) nlen + | otherwise = A.equal narr noff xarr (xoff + i) (l - i) && + Chunk (T.Text narr (noff + l - i) (nlen - l + i)) Empty `equal` xs + nlast = nlen - 1 - nindex = index n ns - z = foldlChunks fin 0 needle - where fin _ (T.Text farr foff flen) = A.unsafeIndex farr (foff+flen-1) - (mask :: Word64) :*: skip = buildTable n ns 0 0 0 (nlen-2) + z = A.unsafeIndex narr (noff + nlen - 1) + (mask :: Word64) :*: skip = buildTable 0 0 0 (nlen-2) swizzle :: Word8 -> Word64 swizzle w = 1 `unsafeShiftL` (word8ToInt w .&. 0x3f) - buildTable (T.Text xarr xoff xlen) xs = go - where - go !(g::Int64) !i !msk !skp - | i >= xlast = case xs of - Empty -> (msk .|. swizzle z) :*: skp - Chunk y ys -> buildTable y ys g 0 msk' skp' - | otherwise = go (g+1) (i+1) msk' skp' - where c = A.unsafeIndex xarr (xoff+i) + buildTable !g !i !msk !skp + | i >= nlast = (msk .|. swizzle z) :*: skp + | otherwise = buildTable (g+1) (i+1) msk' skp' + where c = A.unsafeIndex narr (noff+i) msk' = msk .|. swizzle c skp' | c == z = nlen - g - 2 | otherwise = skp - xlast = xlen - 1 -- | Check whether an attempt to index into the haystack at the -- given offset would fail. - lackingHay :: Int64 -> T.Text -> Text -> Bool - lackingHay q = go 0 - where - go p (T.Text _ _ l) ps = p' < q && case ps of - Empty -> True - Chunk r rs -> go p' r rs - where p' = p + intToInt64 l -indices _ _ = [] + lackingHay :: Int -> T.Text -> Text -> Bool + lackingHay q (T.Text _ _ l) ps = l < q && case ps of + Empty -> True + Chunk r rs -> lackingHay (q - l) r rs -- | Fast index into a partly unpacked 'Text'. We take into account -- the possibility that the caller might try to access one element -- past the end. -index :: T.Text -> Text -> Int64 -> Word8 -index (T.Text arr off len) xs !i - | j < len = A.unsafeIndex arr (off+j) - | otherwise = case xs of - Empty - -- out of bounds, but legal - | j == len -> 0 - -- should never happen, due to lackingHay above - | otherwise -> emptyError "index" - Chunk c cs -> index c cs (i-intToInt64 len) - where j = int64ToInt i +index :: Text -> Int -> Word8 +index Empty !_ = 0 +index (Chunk (T.Text arr off len) xs) !i + | i < len = A.unsafeIndex arr (off + i) + | otherwise = index xs (i - len) -- | A variant of 'indices' that scans linearly for a single 'Word8'. -indicesOne :: Word8 -> Int64 -> T.Text -> Text -> [Int64] +indicesOne :: Word8 -> Int64 -> Text -> [Int64] indicesOne c = chunk where - chunk :: Int64 -> T.Text -> Text -> [Int64] - chunk !i (T.Text oarr ooff olen) os = go 0 + chunk :: Int64 -> Text -> [Int64] + chunk !_ Empty = [] + chunk !i (Chunk (T.Text oarr ooff olen) os) = go 0 where - go h | h >= olen = case os of - Empty -> [] - Chunk y ys -> chunk (i+intToInt64 olen) y ys + go h | h >= olen = chunk (i+intToInt64 olen) os | on == c = i + intToInt64 h : go (h+1) | otherwise = go (h+1) where on = A.unsafeIndex oarr (ooff+h) --- | The number of 'Word8' values in a 'Text'. -wordLength :: Text -> Int64 -wordLength = foldlChunks sumLength 0 - where - sumLength :: Int64 -> T.Text -> Int64 - sumLength i (T.Text _ _ l) = i + intToInt64 l - -emptyError :: String -> a -emptyError fun = error ("Data.Text.Lazy.Search." 
++ fun ++ ": empty input") - intToInt64 :: Int -> Int64 intToInt64 = fromIntegral -int64ToInt :: Int64 -> Int -int64ToInt = fromIntegral - word8ToInt :: Word8 -> Int word8ToInt = fromIntegral + +intToCSize :: Int -> CSize +intToCSize = fromIntegral + +cSsizeToInt :: CSsize -> Int +cSsizeToInt = fromIntegral + +foreign import ccall unsafe "_hs_text_memchr" memchr + :: ByteArray# -> CSize -> CSize -> Word8 -> IO CSsize diff --git a/src/Data/Text/Internal/Search.hs b/src/Data/Text/Internal/Search.hs index 5688917f..bd16aae5 100644 --- a/src/Data/Text/Internal/Search.hs +++ b/src/Data/Text/Internal/Search.hs @@ -1,4 +1,6 @@ {-# LANGUAGE BangPatterns, ScopedTypeVariables #-} +{-# LANGUAGE MagicHash #-} +{-# LANGUAGE UnliftedFFITypes #-} -- | -- Module : Data.Text.Internal.Search @@ -35,6 +37,10 @@ import qualified Data.Text.Array as A import Data.Word (Word64, Word8) import Data.Text.Internal (Text(..)) import Data.Bits ((.|.), (.&.), unsafeShiftL) +import Data.Text.Unsafe (unsafeDupablePerformIO) +import Foreign.C.Types +import GHC.Exts (ByteArray#) +import System.Posix.Types (CSsize(..)) data T = {-# UNPACK #-} !Word64 :* {-# UNPACK #-} !Int @@ -48,47 +54,60 @@ data T = {-# UNPACK #-} !Word64 :* {-# UNPACK #-} !Int indices :: Text -- ^ Substring to search for (@needle@) -> Text -- ^ Text to search in (@haystack@) -> [Int] -indices _needle@(Text narr noff nlen) _haystack@(Text harr hoff hlen) - | nlen == 1 = scanOne (nindex 0) - | nlen <= 0 || ldiff < 0 = [] - | otherwise = scan 0 +indices (Text narr noff nlen) + | nlen == 1 = scanOne (A.unsafeIndex narr noff) + | nlen <= 0 = const [] + | otherwise = scan where - ldiff = hlen - nlen nlast = nlen - 1 - z = nindex nlast + !z = nindex nlast nindex k = A.unsafeIndex narr (noff+k) - hindex k = A.unsafeIndex harr (hoff+k) - hindex' k | k == hlen = 0 - | otherwise = A.unsafeIndex harr (hoff+k) buildTable !i !msk !skp | i >= nlast = (msk .|. swizzle z) :* skp | otherwise = buildTable (i+1) (msk .|. swizzle c) skp' - where c = nindex i + where !c = nindex i skp' | c == z = nlen - i - 2 | otherwise = skp + !(mask :* skip) = buildTable 0 0 (nlen-2) swizzle :: Word8 -> Word64 - swizzle k = 1 `unsafeShiftL` (word8ToInt k .&. 0x3f) + swizzle !k = 1 `unsafeShiftL` (word8ToInt k .&. 0x3f) - scan !i - | i > ldiff = [] - | c == z && candidateMatch 0 = i : scan (i + nlen) - | otherwise = scan (i + delta) - where c = hindex (i + nlast) - candidateMatch !j - | j >= nlast = True - | hindex (i+j) /= nindex j = False - | otherwise = candidateMatch (j+1) - delta | nextInPattern = nlen + 1 - | c == z = skip + 1 - | otherwise = 1 - where nextInPattern = mask .&. swizzle (hindex' (i+nlen)) == 0 - !(mask :* skip) = buildTable 0 0 (nlen-2) - scanOne c = loop 0 - where loop !i | i >= hlen = [] - | hindex i == c = i : loop (i+1) - | otherwise = loop (i+1) + scan (Text harr@(A.ByteArray harr#) hoff hlen) = loop (hoff + nlen) where + loop !i + | i > hlen + hoff + = [] + | A.unsafeIndex harr (i - 1) == z + = if A.equal narr noff harr (i - nlen) nlen + then i - nlen : loop (i + nlen) + else loop (i + skip + 1) + | i == hlen + hoff + = [] + | mask .&. 
swizzle (A.unsafeIndex harr i) == 0 + = loop (i + nlen + 1) + | otherwise + = case unsafeDupablePerformIO $ memchr harr# (intToCSize i) (intToCSize (hlen + hoff - i)) z of + -1 -> [] + x -> loop (i + cSsizeToInt x + 1) {-# INLINE indices #-} +scanOne :: Word8 -> Text -> [Int] +scanOne c (Text harr hoff hlen) = loop 0 + where + loop !i + | i >= hlen = [] + | A.unsafeIndex harr (hoff+i) == c = i : loop (i+1) + | otherwise = loop (i+1) +{-# INLINE scanOne #-} + word8ToInt :: Word8 -> Int word8ToInt = fromIntegral + +intToCSize :: Int -> CSize +intToCSize = fromIntegral + +cSsizeToInt :: CSsize -> Int +cSsizeToInt = fromIntegral + +foreign import ccall unsafe "_hs_text_memchr" memchr + :: ByteArray# -> CSize -> CSize -> Word8 -> IO CSsize diff --git a/src/Data/Text/Lazy.hs b/src/Data/Text/Lazy.hs index f17e2254..cc2142c7 100644 --- a/src/Data/Text/Lazy.hs +++ b/src/Data/Text/Lazy.hs @@ -1599,9 +1599,9 @@ index t n = S.index (stream t) n -- In (unlikely) bad cases, this function's time complexity degrades -- towards /O(n*m)/. count :: Text -> Text -> Int64 -count pat src +count pat | null pat = emptyError "count" - | otherwise = go 0 (indices pat src) + | otherwise = go 0 . indices pat where go !n [] = n go !n (_:xs) = go (n+1) xs {-# INLINE [1] count #-} From 67e8943d79870dd8944bb7aba293b3e2852074f9 Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Mon, 2 Aug 2021 21:32:48 +0100 Subject: [PATCH 25/38] Employ lexicographical comparison for compare --- src/Data/Text.hs | 16 +++++----------- src/Data/Text/Array.hs | 22 +++++++++++++++++----- src/Data/Text/Lazy.hs | 21 +++++++++------------ 3 files changed, 31 insertions(+), 28 deletions(-) diff --git a/src/Data/Text.hs b/src/Data/Text.hs index 7a6ad12c..137d2f18 100644 --- a/src/Data/Text.hs +++ b/src/Data/Text.hs @@ -411,17 +411,11 @@ textDataType = mkDataType "Data.Text.Text" [packConstr] -- | /O(n)/ Compare two 'Text' values lexicographically. compareText :: Text -> Text -> Ordering -compareText ta@(Text _arrA _offA lenA) tb@(Text _arrB _offB lenB) - | lenA == 0 && lenB == 0 = EQ - | otherwise = go 0 0 - where - go !i !j - | i >= lenA || j >= lenB = compare lenA lenB - | a < b = LT - | a > b = GT - | otherwise = go (i+di) (j+dj) - where Iter a di = iter ta i - Iter b dj = iter tb j +compareText (Text arrA offA lenA) (Text arrB offB lenB) = + A.compare arrA offA arrB offB (min lenA lenB) <> compare lenA lenB +-- This is not a mistake: on contrary to UTF-16 (https://github.com/haskell/text/pull/208), +-- lexicographic ordering of UTF-8 encoded strings matches lexicographic ordering +-- of underlying bytearrays, no decoding is needed. -- ----------------------------------------------------------------------------- -- * Conversion to/from 'Text' diff --git a/src/Data/Text/Array.hs b/src/Data/Text/Array.hs index ec7cee88..ef407c53 100644 --- a/src/Data/Text/Array.hs +++ b/src/Data/Text/Array.hs @@ -33,6 +33,7 @@ module Data.Text.Array , copyI , empty , equal + , compare , run , run2 , toList @@ -56,7 +57,8 @@ import Foreign.C.Types (CInt(..)) import GHC.Exts hiding (toList) import GHC.ST (ST(..), runST) import GHC.Word (Word8(..)) -import Prelude hiding (length, read) +import qualified Prelude +import Prelude hiding (length, read, compare) -- | Immutable array type. data Array = ByteArray ByteArray# @@ -250,13 +252,23 @@ copyI count@(I# count#) (MutableByteArray dst#) dstOff@(I# dstOff#) (ByteArray s -- | Compare portions of two arrays for equality. No bounds checking -- is performed. 
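
-- A property sketch, not part of the patch, of the invariant the new
-- compareText relies on: for UTF-8, byte-wise lexicographic order of the
-- encodings agrees with code-point order of the decoded strings, so the
-- underlying byte arrays can be compared without decoding. This is exactly
-- what does not hold for UTF-16, where code points above U+FFFF are stored
-- as surrogate pairs whose code units sort below U+E000..U+FFFF.

import qualified Data.Text as T
import qualified Data.Text.Encoding as TE

prop_utf8OrderMatchesCodePointOrder :: String -> String -> Bool
prop_utf8OrderMatchesCodePointOrder a b =
  compare (T.pack a) (T.pack b)
    == compare (TE.encodeUtf8 (T.pack a)) (TE.encodeUtf8 (T.pack b))
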
-equal :: Array -- ^ First +equal :: Array -> Int -> Array -> Int -> Int -> Bool +equal src1 off1 src2 off2 count = compareInternal src1 off1 src2 off2 count == 0 +{-# INLINE equal #-} + +-- | Compare portions of two arrays. No bounds checking is performed. +compare :: Array -> Int -> Array -> Int -> Int -> Ordering +compare src1 off1 src2 off2 count = compareInternal src1 off1 src2 off2 count `Prelude.compare` 0 +{-# INLINE compare #-} + +compareInternal + :: Array -- ^ First -> Int -- ^ Offset into first -> Array -- ^ Second -> Int -- ^ Offset into second -> Int -- ^ Count - -> Bool -equal (ByteArray src1#) (I# off1#) (ByteArray src2#) (I# off2#) (I# count#) = i == 0 + -> Int +compareInternal (ByteArray src1#) (I# off1#) (ByteArray src2#) (I# off2#) (I# count#) = i where #if MIN_VERSION_base(4,11,0) i = I# (compareByteArrays# src1# off1# src2# off2# count#) @@ -266,4 +278,4 @@ equal (ByteArray src1#) (I# off1#) (ByteArray src2#) (I# off2#) (I# count#) = i foreign import ccall unsafe "_hs_text_memcmp" memcmp :: ByteArray# -> Int# -> ByteArray# -> Int# -> Int# -> IO CInt #endif -{-# INLINE equal #-} +{-# INLINE compareInternal #-} diff --git a/src/Data/Text/Lazy.hs b/src/Data/Text/Lazy.hs index cc2142c7..49902e26 100644 --- a/src/Data/Text/Lazy.hs +++ b/src/Data/Text/Lazy.hs @@ -215,6 +215,7 @@ import Data.Monoid (Monoid(..)) import Data.Semigroup (Semigroup(..)) import Data.String (IsString(..)) import qualified Data.Text as T +import qualified Data.Text.Array as A import qualified Data.Text.Internal as T import qualified Data.Text.Internal.Fusion.Common as S import qualified Data.Text.Unsafe as T @@ -286,18 +287,14 @@ compareText :: Text -> Text -> Ordering compareText Empty Empty = EQ compareText Empty _ = LT compareText _ Empty = GT -compareText (Chunk a0 as) (Chunk b0 bs) = outer a0 b0 - where - outer ta@(T.Text arrA offA lenA) tb@(T.Text arrB offB lenB) = go 0 0 - where - go !i !j - | i >= lenA = compareText as (chunk (T.Text arrB (offB+j) (lenB-j)) bs) - | j >= lenB = compareText (chunk (T.Text arrA (offA+i) (lenA-i)) as) bs - | a < b = LT - | a > b = GT - | otherwise = go (i+di) (j+dj) - where T.Iter a di = T.iter ta i - T.Iter b dj = T.iter tb j +compareText (Chunk (T.Text arrA offA lenA) as) (Chunk (T.Text arrB offB lenB) bs) = + A.compare arrA offA arrB offB (min lenA lenB) <> case lenA `compare` lenB of + LT -> compareText as (Chunk (T.Text arrB (offB + lenA) (lenB - lenA)) bs) + EQ -> compareText as bs + GT -> compareText (Chunk (T.Text arrA (offA + lenB) (lenA - lenB)) as) bs +-- This is not a mistake: on contrary to UTF-16 (https://github.com/haskell/text/pull/208), +-- lexicographic ordering of UTF-8 encoded strings matches lexicographic ordering +-- of underlying bytearrays, no decoding is needed. instance Show Text where showsPrec p ps r = showsPrec p (unpack ps) r From 6a4e792f074602bd66c1021b4a365886539b37dd Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Mon, 30 Aug 2021 22:24:21 +0100 Subject: [PATCH 26/38] Speed up lines and unlines --- src/Data/Text.hs | 45 +++++++++++++++---------------------------- src/Data/Text/Lazy.hs | 31 +++++++++++++++++++++++------ 2 files changed, 41 insertions(+), 35 deletions(-) diff --git a/src/Data/Text.hs b/src/Data/Text.hs index 137d2f18..acc91140 100644 --- a/src/Data/Text.hs +++ b/src/Data/Text.hs @@ -1754,42 +1754,29 @@ isAsciiSpace :: Word8 -> Bool isAsciiSpace w = w .&. 0x50 == 0 && w < 0x80 && (w == 0x20 || w - 0x09 < 5) {-# INLINE isAsciiSpace #-} --- | /O(n)/ Breaks a 'Text' up into a list of 'Text's at --- newline 'Char's. 
The resulting strings do not contain newlines. +-- | /O(n)/ Breaks a 'Text' up into a list of 'Text's at newline characters +-- @'\\n'@ (LF, line feed). The resulting strings do not contain newlines. +-- +-- 'lines' __does not__ treat @'\\r'@ (CR, carriage return) as a newline character. lines :: Text -> [Text] -lines ps | null ps = [] - | otherwise = h : if null t - then [] - else lines (unsafeTail t) - where (# h,t #) = span_ (/= '\n') ps +lines (Text arr@(A.ByteArray arr#) off len) = go off + where + go !n + | n >= len + off = [] + | delta < 0 = [Text arr n (len + off - n)] + | otherwise = Text arr n delta : go (n + delta + 1) + where + delta = cSsizeToInt $ unsafeDupablePerformIO $ + memchr arr# (intToCSize n) (intToCSize (len + off - n)) 0x0A {-# INLINE lines #-} -{- --- | /O(n)/ Portably breaks a 'Text' up into a list of 'Text's at line --- boundaries. --- --- A line boundary is considered to be either a line feed, a carriage --- return immediately followed by a line feed, or a carriage return. --- This accounts for both Unix and Windows line ending conventions, --- and for the old convention used on Mac OS 9 and earlier. -lines' :: Text -> [Text] -lines' ps | null ps = [] - | otherwise = h : case uncons t of - Nothing -> [] - Just (c,t') - | c == '\n' -> lines t' - | c == '\r' -> case uncons t' of - Just ('\n',t'') -> lines t'' - _ -> lines t' - where (h,t) = span notEOL ps - notEOL c = c /= '\n' && c /= '\r' -{-# INLINE lines' #-} --} +foreign import ccall unsafe "_hs_text_memchr" memchr + :: ByteArray# -> CSize -> CSize -> Word8 -> IO CSsize -- | /O(n)/ Joins lines, after appending a terminating newline to -- each. unlines :: [Text] -> Text -unlines = concat . L.map (`snoc` '\n') +unlines = concat . L.foldr (\t acc -> t : singleton '\n' : acc) [] {-# INLINE unlines #-} -- | /O(n)/ Joins words using single space characters. diff --git a/src/Data/Text/Lazy.hs b/src/Data/Text/Lazy.hs index 49902e26..318206c3 100644 --- a/src/Data/Text/Lazy.hs +++ b/src/Data/Text/Lazy.hs @@ -1411,13 +1411,32 @@ chunksOf k = go | otherwise -> a : go b {-# INLINE chunksOf #-} --- | /O(n)/ Breaks a 'Text' up into a list of 'Text's at --- newline 'Char's. The resulting strings do not contain newlines. +-- | /O(n)/ Breaks a 'Text' up into a list of 'Text's at newline characters +-- @'\\n'@ (LF, line feed). The resulting strings do not contain newlines. +-- +-- 'lines' __does not__ treat @'\\r'@ (CR, carriage return) as a newline character. lines :: Text -> [Text] lines Empty = [] -lines t = let (l,t') = break ((==) '\n') t - in l : if null t' then [] - else lines (tail t') +lines (Chunk c cs) + | hasNlEnd c = P.map fromStrict (T.lines c) ++ lines cs + | otherwise = case T.lines c of + [] -> error "lines: unexpected empty chunk" + l : ls -> go l ls cs + where + go l [] Empty = [fromStrict l] + go l [] (Chunk x xs) = case T.lines x of + [] -> error "lines: unexpected empty chunk" + [xl] + | hasNlEnd x -> chunk l (fromStrict xl) : lines xs + | otherwise -> go (l `T.append` xl) [] xs + xl : yl : yls -> chunk l (fromStrict xl) : + if hasNlEnd x + then P.map fromStrict (yl : yls) ++ lines xs + else go yl yls xs + go l (m : ms) xs = fromStrict l : go m ms xs + +hasNlEnd :: T.Text -> Bool +hasNlEnd (T.Text arr off len) = A.unsafeIndex arr (off + len - 1) == 0x0A -- | /O(n)/ Breaks a 'Text' up into a list of words, delimited by 'Char's -- representing white space. @@ -1428,7 +1447,7 @@ words = L.filter (not . null) . split isSpace -- | /O(n)/ Joins lines, after appending a terminating newline to -- each. 
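
-- A behaviour sketch, not part of the patch, written against this module's
-- lines, unlines and pack, pinning down the semantics the strict and lazy
-- rewrites both keep: only '\n' terminates a line, '\r' is left in place,
-- a trailing newline does not produce an extra empty line, and unlines
-- terminates every element with '\n'.
linesUnlinesExamples :: Bool
linesUnlinesExamples =
  lines (pack "a\nb\r\nc\n") == [pack "a", pack "b\r", pack "c"]
    && lines (pack "no newline") == [pack "no newline"]
    && unlines [pack "a", pack "b"] == pack "a\nb\n"
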
unlines :: [Text] -> Text -unlines = concat . L.map (`snoc` '\n') +unlines = concat . L.foldr (\t acc -> t : singleton '\n' : acc) [] {-# INLINE unlines #-} -- | /O(n)/ Joins words using single space characters. From a5b2deee781c253cc8f102b17410d3b95bd7560a Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Sat, 7 Aug 2021 20:41:07 +0100 Subject: [PATCH 27/38] Speed up strict and lazy reading of numbers --- src/Data/Text/Internal/Private.hs | 13 +++++++ src/Data/Text/Internal/Read.hs | 17 ++++++++-- src/Data/Text/Lazy/Read.hs | 56 ++++++++++++++++++++++--------- src/Data/Text/Read.hs | 38 ++++++++++++--------- 4 files changed, 89 insertions(+), 35 deletions(-) diff --git a/src/Data/Text/Internal/Private.hs b/src/Data/Text/Internal/Private.hs index b150fed2..4de74b66 100644 --- a/src/Data/Text/Internal/Private.hs +++ b/src/Data/Text/Internal/Private.hs @@ -13,12 +13,14 @@ module Data.Text.Internal.Private ( runText , span_ + , spanAscii_ ) where import Control.Monad.ST (ST, runST) import Data.Text.Internal (Text(..), text) import Data.Text.Unsafe (Iter(..), iter) import qualified Data.Text.Array as A +import Data.Word (Word8) #if defined(ASSERTS) import GHC.Stack (HasCallStack) @@ -34,6 +36,17 @@ span_ p t@(Text arr off len) = (# hd,tl #) where Iter c d = iter t i {-# INLINE span_ #-} +-- | For the sake of performance this function does not check +-- that a char is in ASCII range; it is a responsibility of @p@. +spanAscii_ :: (Word8 -> Bool) -> Text -> (# Text, Text #) +spanAscii_ p (Text arr off len) = (# hd, tl #) + where hd = text arr off k + tl = text arr (off + k) (len - k) + !k = loop 0 + loop !i | i < len && p (A.unsafeIndex arr (off + i)) = loop (i + 1) + | otherwise = i +{-# INLINE spanAscii_ #-} + runText :: #if defined(ASSERTS) HasCallStack => diff --git a/src/Data/Text/Internal/Read.hs b/src/Data/Text/Internal/Read.hs index 8f06b8ea..17d63154 100644 --- a/src/Data/Text/Internal/Read.hs +++ b/src/Data/Text/Internal/Read.hs @@ -61,9 +61,20 @@ perhaps def m = P $ \t -> case runP m t of hexDigitToInt :: Char -> Int hexDigitToInt c - | c >= '0' && c <= '9' = ord c - ord '0' - | c >= 'a' && c <= 'f' = ord c - (ord 'a' - 10) - | otherwise = ord c - (ord 'A' - 10) + | to0 < 10 = wordToInt to0 + | toa < 6 = wordToInt toa + 10 + | otherwise = wordToInt toA + 10 + where + ordW = intToWord (ord c) + to0 = ordW - intToWord (ord '0') + toa = ordW - intToWord (ord 'a') + toA = ordW - intToWord (ord 'A') digitToInt :: Char -> Int digitToInt c = ord c - ord '0' + +intToWord :: Int -> Word +intToWord = fromIntegral + +wordToInt :: Word -> Int +wordToInt = fromIntegral diff --git a/src/Data/Text/Lazy/Read.hs b/src/Data/Text/Lazy/Read.hs index f8963458..037844bf 100644 --- a/src/Data/Text/Lazy/Read.hs +++ b/src/Data/Text/Lazy/Read.hs @@ -1,5 +1,7 @@ {-# LANGUAGE OverloadedStrings, CPP #-} -{-# LANGUAGE Safe #-} +{-# LANGUAGE Trustworthy #-} +{-# LANGUAGE LambdaCase #-} +{-# LANGUAGE UnboxedTuples #-} -- | -- Module : Data.Text.Lazy.Read @@ -21,11 +23,15 @@ module Data.Text.Lazy.Read ) where import Control.Monad (liftM) -import Data.Char (isDigit, isHexDigit) +import Data.Char (ord) import Data.Int (Int8, Int16, Int32, Int64) import Data.Ratio ((%)) import Data.Text.Internal.Read +import Data.Text.Array as A import Data.Text.Lazy as T +import Data.Text.Internal.Lazy as T (Text(..)) +import qualified Data.Text.Internal as T (Text(..)) +import qualified Data.Text.Internal.Private as T (spanAscii_) import Data.Word (Word, Word8, Word16, Word32, Word64) -- | Read some text. 
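
-- The digit tests introduced by this patch lean on unsigned wrap-around:
-- the subtraction is performed on Word or Word8, so any byte below '0'
-- wraps to a huge value and fails the "< 10" comparison as well, letting a
-- single comparison cover both bounds. A standalone sketch, not part of
-- the patch:

import Data.Word (Word8)

isAsciiDigitW8 :: Word8 -> Bool
isAsciiDigitW8 w = w - 0x30 < 10

-- 0x2F ('/') wraps to 0xFF and is rejected; 0x30..0x39 ('0'..'9') map to
-- 0..9 and are accepted; 0x3A (':') maps to 10 and is rejected.
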
If the read succeeds, return its value and the @@ -59,7 +65,7 @@ decimal :: Integral a => Reader a decimal txt | T.null h = Left "input does not start with a digit" | otherwise = Right (T.foldl' go 0 h, t) - where (h,t) = T.span isDigit txt + where (# h, t #) = spanAscii_ (\w -> w - ord8 '0' < 10) txt go n d = (n * 10 + fromIntegral (digitToInt d)) -- | Read a hexadecimal integer, consisting of an optional leading @@ -97,7 +103,7 @@ hex :: Integral a => Reader a hex txt | T.null h = Left "input does not start with a hexadecimal digit" | otherwise = Right (T.foldl' go 0 h, t) - where (h,t) = T.span isHexDigit txt + where (# h, t #) = spanAscii_ (\w -> w - ord8 '0' < 10 || w - ord8 'A' < 6 || w - ord8 'a' < 6) txt go n d = (n * 16 + fromIntegral (hexDigitToInt d)) -- | Read an optional leading sign character (@\'-\'@ or @\'+\'@) and @@ -156,26 +162,30 @@ signa :: Num a => Parser a -> Parser a {-# SPECIALIZE signa :: Parser Int64 -> Parser Int64 #-} {-# SPECIALIZE signa :: Parser Integer -> Parser Integer #-} signa p = do - sign <- perhaps '+' $ char (\c -> c == '-' || c == '+') - if sign == '+' then p else negate `liftM` p + sign <- perhaps (ord8 '+') $ charAscii (\c -> c == ord8 '-' || c == ord8 '+') + if sign == ord8 '+' then p else negate `liftM` p -char :: (Char -> Bool) -> Parser Char -char p = P $ \t -> case T.uncons t of - Just (c,t') | p c -> Right (c,t') - _ -> Left "character does not match" +charAscii :: (Word8 -> Bool) -> Parser Word8 +charAscii p = P $ \case + Empty -> Left "character does not match" + -- len is > 0, unless the internal invariant of Text is violated + Chunk (T.Text arr off len) ts -> let c = A.unsafeIndex arr off in + if p c + then Right (c, if len <= 1 then ts else Chunk (T.Text arr (off + 1) (len - 1)) ts) + else Left "character does not match" floaty :: Fractional a => (Integer -> Integer -> Integer -> a) -> Reader a {-# INLINE floaty #-} floaty f = runP $ do - sign <- perhaps '+' $ char (\c -> c == '-' || c == '+') + sign <- perhaps (ord8 '+') $ charAscii (\c -> c == ord8 '-' || c == ord8 '+') real <- P decimal T fraction fracDigits <- perhaps (T 0 0) $ do - _ <- char (=='.') - digits <- P $ \t -> Right (int64ToInt . T.length $ T.takeWhile isDigit t, t) + _ <- charAscii (== ord8 '.') + digits <- P $ \t -> Right (let (# hd, _ #) = spanAscii_ (\w -> w - ord8 '0' < 10) t in int64ToInt (T.length hd), t) n <- P decimal return $ T n digits - let e c = c == 'e' || c == 'E' - power <- perhaps 0 (char e >> signa (P decimal) :: Parser Int) + let e c = c == ord8 'e' || c == ord8 'E' + power <- perhaps 0 (charAscii e >> signa (P decimal) :: Parser Int) let n = if fracDigits == 0 then if power == 0 then fromInteger real @@ -183,9 +193,23 @@ floaty f = runP $ do else if power == 0 then f real fraction (10 ^ fracDigits) else f real fraction (10 ^ fracDigits) * (10 ^^ power) - return $! if sign == '+' + return $! if sign == ord8 '+' then n else -n int64ToInt :: Int64 -> Int int64ToInt = fromIntegral + +ord8 :: Char -> Word8 +ord8 = fromIntegral . ord + +-- | For the sake of performance this function does not check +-- that a char is in ASCII range; it is a responsibility of @p@. 
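-- The ASCII predicates passed to 'spanAscii_' in 'decimal' and 'hex' rely on
-- Word8 wraparound: when w is below the code of '0', the subtraction
-- w - ord8 '0' wraps to a large value, so a single unsigned comparison checks
-- both bounds. An informal sketch of those predicates (names illustrative,
-- not part of the patch):
--
--   isAsciiDigit, isAsciiHexDigit :: Word8 -> Bool
--   isAsciiDigit    w = w - 0x30 < 10                               -- '0'..'9'
--   isAsciiHexDigit w = w - 0x30 < 10 || w - 0x41 < 6 || w - 0x61 < 6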
+spanAscii_ :: (Word8 -> Bool) -> Text -> (# Text, Text #) +spanAscii_ p = loop + where + loop Empty = (# Empty, Empty #) + loop (Chunk t ts) = let (# t', t''@(T.Text _ _ len) #) = T.spanAscii_ p t in + if len == 0 + then let (# ts', ts'' #) = loop ts in (# Chunk t ts', ts'' #) + else (# Chunk t' Empty, Chunk t'' ts #) diff --git a/src/Data/Text/Read.hs b/src/Data/Text/Read.hs index 17557f92..361f773a 100644 --- a/src/Data/Text/Read.hs +++ b/src/Data/Text/Read.hs @@ -21,11 +21,13 @@ module Data.Text.Read ) where import Control.Monad (liftM) -import Data.Char (isDigit, isHexDigit) +import Data.Char (ord) import Data.Int (Int8, Int16, Int32, Int64) import Data.Ratio ((%)) import Data.Text as T -import Data.Text.Internal.Private (span_) +import Data.Text.Internal as T (Text(..)) +import Data.Text.Array as A +import Data.Text.Internal.Private (spanAscii_) import Data.Text.Internal.Read import Data.Word (Word, Word8, Word16, Word32, Word64) @@ -60,7 +62,7 @@ decimal :: Integral a => Reader a decimal txt | T.null h = Left "input does not start with a digit" | otherwise = Right (T.foldl' go 0 h, t) - where (# h,t #) = span_ isDigit txt + where (# h,t #) = spanAscii_ (\w -> w - ord8 '0' < 10) txt go n d = (n * 10 + fromIntegral (digitToInt d)) -- | Read a hexadecimal integer, consisting of an optional leading @@ -107,7 +109,7 @@ hex :: Integral a => Reader a hex txt | T.null h = Left "input does not start with a hexadecimal digit" | otherwise = Right (T.foldl' go 0 h, t) - where (# h,t #) = span_ isHexDigit txt + where (# h,t #) = spanAscii_ (\w -> w - ord8 '0' < 10 || w - ord8 'A' < 6 || w - ord8 'a' < 6) txt go n d = (n * 16 + fromIntegral (hexDigitToInt d)) -- | Read an optional leading sign character (@\'-\'@ or @\'+\'@) and @@ -166,26 +168,27 @@ signa :: Num a => Parser a -> Parser a {-# SPECIALIZE signa :: Parser Int64 -> Parser Int64 #-} {-# SPECIALIZE signa :: Parser Integer -> Parser Integer #-} signa p = do - sign <- perhaps '+' $ char (\c -> c == '-' || c == '+') - if sign == '+' then p else negate `liftM` p + sign <- perhaps (ord8 '+') $ charAscii (\c -> c == ord8 '-' || c == ord8 '+') + if sign == ord8 '+' then p else negate `liftM` p -char :: (Char -> Bool) -> Parser Char -char p = P $ \t -> case T.uncons t of - Just (c,t') | p c -> Right (c,t') - _ -> Left "character does not match" +charAscii :: (Word8 -> Bool) -> Parser Word8 +charAscii p = P $ \(Text arr off len) -> let c = A.unsafeIndex arr off in + if len > 0 && p c + then Right (c, Text arr (off + 1) (len - 1)) + else Left "character does not match" floaty :: Fractional a => (Integer -> Integer -> Integer -> a) -> Reader a {-# INLINE floaty #-} floaty f = runP $ do - sign <- perhaps '+' $ char (\c -> c == '-' || c == '+') + sign <- perhaps (ord8 '+') $ charAscii (\c -> c == ord8 '-' || c == ord8 '+') real <- P decimal T fraction fracDigits <- perhaps (T 0 0) $ do - _ <- char (=='.') - digits <- P $ \t -> Right (T.length $ T.takeWhile isDigit t, t) + _ <- charAscii (== ord8 '.') + digits <- P $ \t -> Right (let (# hd, _ #) = spanAscii_ (\w -> w - ord8 '0' < 10) t in T.length hd, t) n <- P decimal return $ T n digits - let e c = c == 'e' || c == 'E' - power <- perhaps 0 (char e >> signa (P decimal) :: Parser Int) + let e c = c == ord8 'e' || c == ord8 'E' + power <- perhaps 0 (charAscii e >> signa (P decimal) :: Parser Int) let n = if fracDigits == 0 then if power == 0 then fromInteger real @@ -193,6 +196,9 @@ floaty f = runP $ do else if power == 0 then f real fraction (10 ^ fracDigits) else f real fraction (10 ^ fracDigits) * 
(10 ^^ power) - return $! if sign == '+' + return $! if sign == ord8 '+' then n else -n + +ord8 :: Char -> Word8 +ord8 = fromIntegral . ord From 79d9da0f026d5891d1189084e83884e3e23ff8fd Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Wed, 25 Aug 2021 00:02:54 +0100 Subject: [PATCH 28/38] Implement Data.Text.map explicitly --- src/Data/Text.hs | 23 ++++++++++++++++++++++- src/Data/Text/Lazy.hs | 2 +- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/src/Data/Text.hs b/src/Data/Text.hs index acc91140..9deac162 100644 --- a/src/Data/Text.hs +++ b/src/Data/Text.hs @@ -619,7 +619,28 @@ compareLength t n = S.compareLengthI (stream t) n -- -- Performs replacement on invalid scalar values. map :: (Char -> Char) -> Text -> Text -map f t = unstream (S.map (safe . f) (stream t)) +map f = go + where + go (Text src o l) = runST $ do + marr <- A.new (l + 4) + outer marr (l + 4) o 0 + where + outer :: forall s. A.MArray s -> Int -> Int -> Int -> ST s Text + outer !dst !dstLen = inner + where + inner !srcOff !dstOff + | srcOff >= l + o = do + A.shrinkM dst dstOff + arr <- A.unsafeFreeze dst + return (Text arr 0 dstOff) + | dstOff + 4 > dstLen = do + let !dstLen' = dstLen + (l + o) - srcOff + 4 + dst' <- A.resizeM dst dstLen' + outer dst' dstLen' srcOff dstOff + | otherwise = do + let !(Iter c d) = iterArray src srcOff + d' <- unsafeWrite dst dstOff (safe (f c)) + inner (srcOff + d) (dstOff + d') {-# INLINE [1] map #-} {-# RULES diff --git a/src/Data/Text/Lazy.hs b/src/Data/Text/Lazy.hs index 318206c3..ad607361 100644 --- a/src/Data/Text/Lazy.hs +++ b/src/Data/Text/Lazy.hs @@ -556,7 +556,7 @@ compareLength t n = S.compareLengthI (stream t) n -- each element of @t@. Performs replacement on -- invalid scalar values. map :: (Char -> Char) -> Text -> Text -map f t = unstream (S.map (safe . f) (stream t)) +map f = foldrChunks (Chunk . T.map f) Empty {-# INLINE [1] map #-} {-# RULES From 1a701176894123fb1f3b90d73f9ca6e76c3aa810 Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Wed, 25 Aug 2021 23:28:47 +0100 Subject: [PATCH 29/38] Speed up Data.Text.intersperse --- src/Data/Text.hs | 62 +++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 59 insertions(+), 3 deletions(-) diff --git a/src/Data/Text.hs b/src/Data/Text.hs index 9deac162..b7d44466 100644 --- a/src/Data/Text.hs +++ b/src/Data/Text.hs @@ -225,7 +225,7 @@ import Data.Binary (Binary(get, put)) import Data.Monoid (Monoid(..)) import Data.Semigroup (Semigroup(..)) import Data.String (IsString(..)) -import Data.Text.Internal.Encoding.Utf8 (utf8Length, utf8LengthByLeader, chr2, chr3, chr4) +import Data.Text.Internal.Encoding.Utf8 (utf8Length, utf8LengthByLeader, chr2, chr3, chr4, ord2, ord3, ord4) import qualified Data.Text.Internal.Fusion as S import qualified Data.Text.Internal.Fusion.Common as S import Data.Text.Encoding (decodeUtf8', encodeUtf8) @@ -669,8 +669,61 @@ intercalate t = concat . L.intersperse t -- "S.H.I.E.L.D" -- -- Performs replacement on invalid scalar values. 
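-- Sizing intuition for the rewrite below (informal, not part of the patch):
-- for a separator of cLen bytes and an input of l bytes holding (length t)
-- characters, the loop writes l + length t * cLen bytes, and the final
-- 'Text' drops the one separator written after the last character, e.g.
--
--   intersperse '.' "SHIELD"   -- 6 + 6*1 bytes written, result length 11
--   intersperse '\x2026' "ab"  -- the ellipsis is 3 bytes: 2 + 2*3 written, length 5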
-intersperse :: Char -> Text -> Text -intersperse c t = unstream (S.intersperse (safe c) (stream t)) +intersperse :: Char -> Text -> Text +intersperse c t@(Text src o l) = if l == 0 then mempty else runST $ do + let !cLen = utf8Length c + dstLen = l + length t P.* cLen + + dst <- A.new dstLen + + let writeSep = case cLen of + 1 -> \dstOff -> + A.unsafeWrite dst dstOff (ord8 c) + 2 -> let (c0, c1) = ord2 c in \dstOff -> do + A.unsafeWrite dst dstOff c0 + A.unsafeWrite dst (dstOff + 1) c1 + 3 -> let (c0, c1, c2) = ord3 c in \dstOff -> do + A.unsafeWrite dst dstOff c0 + A.unsafeWrite dst (dstOff + 1) c1 + A.unsafeWrite dst (dstOff + 2) c2 + _ -> let (c0, c1, c2, c3) = ord4 c in \dstOff -> do + A.unsafeWrite dst dstOff c0 + A.unsafeWrite dst (dstOff + 1) c1 + A.unsafeWrite dst (dstOff + 2) c2 + A.unsafeWrite dst (dstOff + 3) c3 + let go !srcOff !dstOff = if srcOff >= o + l then return () else do + let m0 = A.unsafeIndex src srcOff + m1 = A.unsafeIndex src (srcOff + 1) + m2 = A.unsafeIndex src (srcOff + 2) + m3 = A.unsafeIndex src (srcOff + 3) + !d = utf8LengthByLeader m0 + case d of + 1 -> do + A.unsafeWrite dst dstOff m0 + writeSep (dstOff + 1) + go (srcOff + 1) (dstOff + 1 + cLen) + 2 -> do + A.unsafeWrite dst dstOff m0 + A.unsafeWrite dst (dstOff + 1) m1 + writeSep (dstOff + 2) + go (srcOff + 2) (dstOff + 2 + cLen) + 3 -> do + A.unsafeWrite dst dstOff m0 + A.unsafeWrite dst (dstOff + 1) m1 + A.unsafeWrite dst (dstOff + 2) m2 + writeSep (dstOff + 3) + go (srcOff + 3) (dstOff + 3 + cLen) + _ -> do + A.unsafeWrite dst dstOff m0 + A.unsafeWrite dst (dstOff + 1) m1 + A.unsafeWrite dst (dstOff + 2) m2 + A.unsafeWrite dst (dstOff + 3) m3 + writeSep (dstOff + 4) + go (srcOff + 4) (dstOff + 4 + cLen) + + go o 0 + arr <- A.unsafeFreeze dst + return (Text arr 0 (dstLen - cLen)) {-# INLINE [1] intersperse #-} -- | /O(n)/ Reverse the characters of a string. @@ -1958,6 +2011,9 @@ copy (Text arr off len) = Text (A.run go) 0 len A.copyI len marr 0 arr off return marr +ord8 :: Char -> Word8 +ord8 = P.fromIntegral . ord + intToCSize :: Int -> CSize intToCSize = P.fromIntegral From adbfc2d17d57cbb4966ecf69d66cdc50490eaad8 Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Mon, 6 Sep 2021 20:33:19 +0100 Subject: [PATCH 30/38] Implement copy from/to pointer via primops --- src/Data/Text/Array.hs | 36 ++++++++++++++++++++++++++++++++++++ src/Data/Text/Foreign.hs | 32 ++++++++------------------------ 2 files changed, 44 insertions(+), 24 deletions(-) diff --git a/src/Data/Text/Array.hs b/src/Data/Text/Array.hs index ef407c53..93ad29f8 100644 --- a/src/Data/Text/Array.hs +++ b/src/Data/Text/Array.hs @@ -31,6 +31,8 @@ module Data.Text.Array , shrinkM , copyM , copyI + , copyFromPointer + , copyToPointer , empty , equal , compare @@ -250,6 +252,40 @@ copyI count@(I# count#) (MutableByteArray dst#) dstOff@(I# dstOff#) (ByteArray s s2# -> (# s2#, () #) {-# INLINE copyI #-} +-- | Copy from pointer. +copyFromPointer + :: MArray s -- ^ Destination + -> Int -- ^ Destination offset + -> Ptr Word8 -- ^ Source + -> Int -- ^ Count + -> ST s () +copyFromPointer (MutableByteArray dst#) dstOff@(I# dstOff#) (Ptr src#) count@(I# count#) +#if defined(ASSERTS) + | count < 0 = error $ + "copyFromPointer: count must be >= 0, but got " ++ show count +#endif + | otherwise = ST $ \s1# -> + case copyAddrToByteArray# src# dst# dstOff# count# s1# of + s2# -> (# s2#, () #) +{-# INLINE copyFromPointer #-} + +-- | Copy to pointer. 
+copyToPointer + :: Array -- ^ Source + -> Int -- ^ Source offset + -> Ptr Word8 -- ^ Destination + -> Int -- ^ Count + -> ST s () +copyToPointer (ByteArray src#) srcOff@(I# srcOff#) (Ptr dst#) count@(I# count#) +#if defined(ASSERTS) + | count < 0 = error $ + "copyToPointer: count must be >= 0, but got " ++ show count +#endif + | otherwise = ST $ \s1# -> + case copyByteArrayToAddr# src# srcOff# dst# count# s1# of + s2# -> (# s2#, () #) +{-# INLINE copyToPointer #-} + -- | Compare portions of two arrays for equality. No bounds checking -- is performed. equal :: Array -> Int -> Array -> Int -> Int -> Bool diff --git a/src/Data/Text/Foreign.hs b/src/Data/Text/Foreign.hs index 87742011..afe3234e 100644 --- a/src/Data/Text/Foreign.hs +++ b/src/Data/Text/Foreign.hs @@ -34,7 +34,7 @@ module Data.Text.Foreign #if defined(ASSERTS) import Control.Exception (assert) #endif -import Control.Monad.ST.Unsafe (unsafeIOToST) +import Control.Monad.ST.Unsafe (unsafeSTToIO) import Data.ByteString.Unsafe (unsafePackCStringLen, unsafeUseAsCStringLen) import Data.Text.Encoding (decodeUtf8, encodeUtf8) import Data.Text.Internal (Text(..), empty) @@ -44,8 +44,7 @@ import Data.Word (Word8) import Foreign.C.String (CStringLen) import Foreign.ForeignPtr (ForeignPtr, mallocForeignPtrArray) import Foreign.Marshal.Alloc (allocaBytes) -import Foreign.Ptr (Ptr, castPtr, plusPtr) -import Foreign.Storable (peek, poke) +import Foreign.Ptr (Ptr, castPtr) import qualified Data.Text.Array as A -- $interop @@ -68,20 +67,11 @@ newtype I8 = I8 Int fromPtr :: Ptr Word8 -- ^ source array -> I8 -- ^ length of source array (in 'Word8' units) -> IO Text -fromPtr _ (I8 0) = return empty -fromPtr ptr (I8 len) = -#if defined(ASSERTS) - assert (len > 0) $ -#endif - return $! Text arr 0 len - where - arr = A.run (A.new len >>= copy) - copy marr = loop ptr 0 - where - loop !p !i | i == len = return marr - | otherwise = do - A.unsafeWrite marr i =<< unsafeIOToST (peek p) - loop (p `plusPtr` 1) (i + 1) +fromPtr ptr (I8 len) = unsafeSTToIO $ do + dst <- A.new len + A.copyFromPointer dst 0 ptr len + arr <- A.unsafeFreeze dst + return $! Text arr 0 len -- $lowlevel -- @@ -130,13 +120,7 @@ splitAtWord8 (I8 n) t@(Text arr off len) -- | /O(n)/ Copy a 'Text' to an array. The array is assumed to be big -- enough to hold the contents of the entire 'Text'. unsafeCopyToPtr :: Text -> Ptr Word8 -> IO () -unsafeCopyToPtr (Text arr off len) ptr = loop ptr off - where - end = off + len - loop !p !i | i == end = return () - | otherwise = do - poke p (A.unsafeIndex arr i) - loop (p `plusPtr` 1) (i + 1) +unsafeCopyToPtr (Text arr off len) ptr = unsafeSTToIO $ A.copyToPointer arr off ptr len -- | /O(n)/ Perform an action on a temporary, mutable copy of a -- 'Text'. The copy is freed as soon as the action returns. 
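A small usage sketch of the new primop-based copies (written as if inside
Data.Text.Foreign, so its imports and the I8 constructor are in scope; the
helper name is illustrative and not part of the patch): copy a Text into a
temporary buffer with unsafeCopyToPtr and decode it back with fromPtr.

copyViaPtr :: Text -> IO Text
copyViaPtr t@(Text _ _ len) =
  -- len is the byte length of the UTF-8 payload
  allocaBytes len $ \ptr -> do
    unsafeCopyToPtr t ptr
    fromPtr ptr (I8 len)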
From fc95ea4833e71cd35c2132496e1e0acc5687db36 Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Mon, 6 Sep 2021 20:35:17 +0100 Subject: [PATCH 31/38] Reimplement decodeASCII and decodeLatin1 to share C code --- cbits/cbits.c | 20 ------------- cbits/is_ascii.c | 47 ++++++++++++++++++++++++++++++ src/Data/Text/Encoding.hs | 52 ++++++++++++++++++++++++---------- src/Data/Text/Lazy/Encoding.hs | 5 +++- text.cabal | 1 + 5 files changed, 89 insertions(+), 36 deletions(-) create mode 100644 cbits/is_ascii.c diff --git a/cbits/cbits.c b/cbits/cbits.c index 33bab908..5fb77941 100644 --- a/cbits/cbits.c +++ b/cbits/cbits.c @@ -55,26 +55,6 @@ decode(uint32_t *state, uint32_t* codep, uint32_t byte) { return *state = utf8d[256 + *state + type]; } -size_t -_hs_text_decode_latin1(uint8_t *dest, const uint8_t *src, - const uint8_t *srcend) -{ - const uint8_t *dest0 = dest; - const uint8_t *p = src; - - while (p != srcend){ - uint8_t codepoint = *p++; - if(codepoint < 0x80){ - *dest++ = (uint8_t)codepoint; - } else { - *dest++ = (uint8_t) (0xC0 + (codepoint >> 6)); - *dest++ = (uint8_t) (0x80 + (codepoint & 0x3F)); - } - } - - return (dest - dest0); -} - /* * A best-effort decoder. Runs until it hits either end of input or * the start of an invalid byte sequence. diff --git a/cbits/is_ascii.c b/cbits/is_ascii.c new file mode 100644 index 00000000..dd785441 --- /dev/null +++ b/cbits/is_ascii.c @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2021 Andrew Lelechenko + */ + +#include +#include +#include +#ifdef __x86_64__ +#include +#include +#endif +#include + +/* + _hs_text_is_ascii takes a UTF-8 encoded buffer, + and returns the length of the ASCII-compatible prefix. +*/ +const size_t _hs_text_is_ascii(const uint8_t *src0, const uint8_t *srcend){ + const uint8_t *src = src0; + +#ifdef __x86_64__ + // I experimented with larger vector registers, + // but did not notice any measurable speed up, so let's keep it simple. + while (src < srcend - 15){ + __m128i w128 = _mm_loadu_si128((__m128i *)src); + // Which bytes are < 128? 
+ uint16_t mask = _mm_movemask_epi8(w128); + if (mask) break; + src+= 16; + } +#endif + + while (src < srcend - 7){ + uint64_t w64; + memcpy(&w64, src, sizeof(uint64_t)); + if (w64 & 0x8080808080808080ULL) break; + src+= 8; + } + + while (src < srcend){ + uint8_t leadByte = *src; + if(leadByte >= 0x80) break; + src++; + } + + return src - src0; +} diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index 0312cd73..b5bc13f8 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -64,21 +64,24 @@ import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) import Control.Exception (evaluate, try, throwIO, ErrorCall(ErrorCall)) import Control.Monad.ST (runST) -import Data.ByteString as B +import Data.Bits (shiftR, (.&.)) +import Data.ByteString (ByteString) +import qualified Data.ByteString as B import qualified Data.ByteString.Internal as B +import qualified Data.ByteString.Short.Internal as SBS import Data.Foldable (traverse_) import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode) -import Data.Text.Internal (Text(..), safe, text) +import Data.Text.Internal (Text(..), safe, empty, text) import Data.Text.Internal.Private (runText) import Data.Text.Internal.Unsafe (unsafeWithForeignPtr) import Data.Text.Internal.Unsafe.Char (unsafeWrite) import Data.Text.Show () import Data.Text.Unsafe (unsafeDupablePerformIO) import Data.Word (Word8, Word32) -import Foreign.C.Types (CSize) +import Foreign.C.Types (CSize(..)) import Foreign.Marshal.Utils (with) import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr) -import Foreign.Storable (Storable, peek, poke) +import Foreign.Storable (Storable, peek, poke, peekByteOff) import GHC.Exts (MutableByteArray#, byteArrayContents#, unsafeCoerce#) import GHC.ForeignPtr (ForeignPtr(..), ForeignPtrContents(PlainPtr)) import qualified Data.ByteString.Builder as B @@ -112,7 +115,13 @@ import GHC.Stack (HasCallStack) -- | /Deprecated/. Decode a 'ByteString' containing 7-bit ASCII -- encoded text. decodeASCII :: ByteString -> Text -decodeASCII = decodeUtf8 +decodeASCII bs = withBS bs $ \fp len -> if len == 0 then empty else runST $ do + asciiPrefixLen <- fmap cSizeToInt $ unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> + c_is_ascii src (src `plusPtr` len) + if asciiPrefixLen == len + then let !(SBS.SBS arr) = SBS.toShort bs in + return (Text (A.ByteArray arr) 0 len) + else error $ "decodeASCII: detected non-ASCII codepoint at " ++ show asciiPrefixLen {-# DEPRECATED decodeASCII "Use decodeUtf8 instead" #-} -- | Decode a 'ByteString' containing Latin-1 (aka ISO-8859-1) encoded text. @@ -124,13 +133,29 @@ decodeLatin1 :: HasCallStack => #endif ByteString -> Text -decodeLatin1 bs = withBS bs aux where - aux fp len = text a 0 actualLen - where - (a, actualLen) = A.run2 (A.new (2 * len) >>= unsafeIOToST . go) - go (A.MutableByteArray dest) = unsafeWithForeignPtr fp $ \src -> do - destLen <- c_decode_latin1 dest src (src `plusPtr` len) - return (A.MutableByteArray dest, destLen) +decodeLatin1 bs = withBS bs $ \fp len -> runST $ do + dst <- A.new (2 * len) + let inner srcOff dstOff = if srcOff >= len then return dstOff else do + asciiPrefixLen <- fmap cSizeToInt $ unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> + c_is_ascii (src `plusPtr` srcOff) (src `plusPtr` len) + if asciiPrefixLen == 0 + then do + byte <- unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> peekByteOff src srcOff + A.unsafeWrite dst dstOff (0xC0 + (byte `shiftR` 6)) + A.unsafeWrite dst (dstOff + 1) (0x80 + (byte .&. 
0x3F)) + inner (srcOff + 1) (dstOff + 2) + else do + unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> + unsafeSTToIO $ A.copyFromPointer dst dstOff (src `plusPtr` srcOff) asciiPrefixLen + inner (srcOff + asciiPrefixLen) (dstOff + asciiPrefixLen) + + actualLen <- inner 0 0 + dst' <- A.resizeM dst actualLen + arr <- A.unsafeFreeze dst' + return $ Text arr 0 actualLen + +foreign import ccall unsafe "_hs_text_is_ascii" c_is_ascii + :: Ptr Word8 -> Ptr Word8 -> IO CSize -- | Decode a 'ByteString' containing UTF-8 encoded text. -- @@ -538,6 +563,3 @@ foreign import ccall unsafe "_hs_text_decode_utf8_state" c_decode_utf8_with_stat :: MutableByteArray# s -> Ptr CSize -> Ptr (Ptr Word8) -> Ptr Word8 -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8) - -foreign import ccall unsafe "_hs_text_decode_latin1" c_decode_latin1 - :: MutableByteArray# s -> Ptr Word8 -> Ptr Word8 -> IO Int diff --git a/src/Data/Text/Lazy/Encoding.hs b/src/Data/Text/Lazy/Encoding.hs index a82ba8c2..e32cfd51 100644 --- a/src/Data/Text/Lazy/Encoding.hs +++ b/src/Data/Text/Lazy/Encoding.hs @@ -1,5 +1,8 @@ {-# LANGUAGE BangPatterns,CPP #-} {-# LANGUAGE Trustworthy #-} + +{-# OPTIONS_GHC -fno-warn-deprecations #-} + -- | -- Module : Data.Text.Lazy.Encoding -- Copyright : (c) 2009, 2010 Bryan O'Sullivan @@ -80,7 +83,7 @@ import Data.Text.Unsafe (unsafeDupablePerformIO) -- | /Deprecated/. Decode a 'ByteString' containing 7-bit ASCII -- encoded text. decodeASCII :: B.ByteString -> Text -decodeASCII = decodeUtf8 +decodeASCII = foldr (chunk . TE.decodeASCII) empty . B.toChunks {-# DEPRECATED decodeASCII "Use decodeUtf8 instead" #-} -- | Decode a 'ByteString' containing Latin-1 (aka ISO-8859-1) encoded text. diff --git a/text.cabal b/text.cabal index 4e634af6..e376c16d 100644 --- a/text.cabal +++ b/text.cabal @@ -65,6 +65,7 @@ flag developer library c-sources: cbits/cbits.c + cbits/is_ascii.c cbits/measure_off.c cbits/reverse.c cbits/utils.c From c0fd44309056e46b315bce6ec5da2f74f1c04394 Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Sat, 21 Aug 2021 20:57:04 +0100 Subject: [PATCH 32/38] Use native (and naive) UTF8 decoder --- cbits/cbits.c | 160 ----------------- include/text_cbits.h | 11 -- src/Data/Text.hs | 20 +-- src/Data/Text/Encoding.hs | 219 +++++++++--------------- src/Data/Text/Internal.hs | 20 +++ src/Data/Text/Internal/Encoding/Utf8.hs | 69 +++++++- tests/Tests/Properties/Transcoding.hs | 15 +- tests/Tests/QuickCheckUtils.hs | 2 +- text.cabal | 5 +- 9 files changed, 175 insertions(+), 346 deletions(-) delete mode 100644 cbits/cbits.c delete mode 100644 include/text_cbits.h diff --git a/cbits/cbits.c b/cbits/cbits.c deleted file mode 100644 index 5fb77941..00000000 --- a/cbits/cbits.c +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Copyright (c) 2011 Bryan O'Sullivan . - * - * Portions copyright (c) 2008-2010 Björn Höhrmann . - * - * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. - */ - -#include -#include -#include -#if defined(__x86_64__) -#include -#include -#endif - -#include "text_cbits.h" - -#define UTF8_ACCEPT 0 -#define UTF8_REJECT 12 - -static const uint8_t utf8d[] = { - /* - * The first part of the table maps bytes to character classes that - * to reduce the size of the transition table and create bitmasks. 
- */ - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, - 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, - 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, - - /* - * The second part is a transition table that maps a combination of - * a state of the automaton and a character class to a state. - */ - 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, - 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, - 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, - 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, - 12,36,12,12,12,12,12,12,12,12,12,12, -}; - -static inline uint32_t -decode(uint32_t *state, uint32_t* codep, uint32_t byte) { - uint32_t type = utf8d[byte]; - - *codep = (*state != UTF8_ACCEPT) ? - (byte & 0x3fu) | (*codep << 6) : - (0xff >> type) & (byte); - - return *state = utf8d[256 + *state + type]; -} - -/* - * A best-effort decoder. Runs until it hits either end of input or - * the start of an invalid byte sequence. - * - * At exit, we update *destoff with the next offset to write to, *src - * with the next source location past the last one successfully - * decoded, and return the next source location to read from. - * - * Moreover, we expose the internal decoder state (state0 and - * codepoint0), allowing one to restart the decoder after it - * terminates (say, due to a partial codepoint). 
- * - * In particular, there are a few possible outcomes, - * - * 1) We decoded the buffer entirely: - * In this case we return srcend - * state0 == UTF8_ACCEPT - * - * 2) We met an invalid encoding - * In this case we return the address of the first invalid byte - * state0 == UTF8_REJECT - * - * 3) We reached the end of the buffer while decoding a codepoint - * In this case we return a pointer to the first byte of the partial codepoint - * state0 != UTF8_ACCEPT, UTF8_REJECT - * - */ -#if defined(__GNUC__) || defined(__clang__) -static inline uint8_t const * -_hs_text_decode_utf8_int(uint8_t *const dest, size_t *destoff, - const uint8_t **src, const uint8_t *srcend, - uint32_t *codepoint0, uint32_t *state0) - __attribute((always_inline)); -#endif - -static inline uint8_t const * -_hs_text_decode_utf8_int(uint8_t *const dest, size_t *destoff, - const uint8_t **src, const uint8_t *srcend, - uint32_t *codepoint0, uint32_t *state0) -{ - uint8_t *d = dest + *destoff; - const uint8_t *s = *src, *last = *src; - uint32_t state = *state0; - uint32_t codepoint = *codepoint0; - - while (s < srcend) { - if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) { - if (state != UTF8_REJECT) - continue; - break; - } - - if(codepoint < 0x80){ - *d++ = (uint8_t) codepoint; - } else if(codepoint < 0x800){ - *d++ = (uint8_t) (0xC0 + (codepoint >> 6)); - *d++ = (uint8_t) (0x80 + (codepoint & 0x3F)); - } else if(codepoint < 0x10000){ - *d++ = (uint8_t) (0xE0 + (codepoint >> 12)); - *d++ = (uint8_t) (0x80 + ((codepoint >> 6) & 0x3F)); - *d++ = (uint8_t) (0x80 + (codepoint & 0x3F)); - } else { - *d++ = (uint8_t) (0xF0 + (codepoint >> 18)); - *d++ = (uint8_t) (0x80 + ((codepoint >> 12) & 0x3F)); - *d++ = (uint8_t) (0x80 + ((codepoint >> 6) & 0x3F)); - *d++ = (uint8_t) (0x80 + (codepoint & 0x3F)); - } - - last = s; - } - - *destoff = d - dest; - *codepoint0 = codepoint; - *state0 = state; - *src = last; - - return s; -} - -uint8_t const * -_hs_text_decode_utf8_state(uint8_t *const dest, size_t *destoff, - const uint8_t **src, - const uint8_t *srcend, - uint32_t *codepoint0, uint32_t *state0) -{ - _hs_text_decode_utf8_int(dest, destoff, src, srcend, codepoint0, state0); - - return *src; -} - -/* - * Helper to decode buffer and discard final decoder state - */ -const uint8_t * -_hs_text_decode_utf8(uint8_t *const dest, size_t *destoff, - const uint8_t *src, const uint8_t *const srcend) -{ - uint32_t codepoint; - uint32_t state = UTF8_ACCEPT; - _hs_text_decode_utf8_int(dest, destoff, &src, srcend, - &codepoint, &state); - return src; -} diff --git a/include/text_cbits.h b/include/text_cbits.h deleted file mode 100644 index 3523efea..00000000 --- a/include/text_cbits.h +++ /dev/null @@ -1,11 +0,0 @@ -/* - * Copyright (c) 2013 Bryan O'Sullivan . 
- */ - -#ifndef _text_cbits_h -#define _text_cbits_h - -#define UTF8_ACCEPT 0 -#define UTF8_REJECT 12 - -#endif diff --git a/src/Data/Text.hs b/src/Data/Text.hs index b7d44466..7114c853 100644 --- a/src/Data/Text.hs +++ b/src/Data/Text.hs @@ -231,7 +231,7 @@ import qualified Data.Text.Internal.Fusion.Common as S import Data.Text.Encoding (decodeUtf8', encodeUtf8) import Data.Text.Internal.Fusion (stream, reverseStream, unstream) import Data.Text.Internal.Private (span_) -import Data.Text.Internal (Text(..), empty, firstf, mul, safe, text) +import Data.Text.Internal (Text(..), empty, firstf, mul, safe, text, append) import Data.Text.Internal.Unsafe.Char (unsafeWrite, unsafeChr8) import Data.Text.Show (singleton, unpack, unpackCString#) import qualified Prelude as P @@ -446,24 +446,6 @@ snoc :: Text -> Char -> Text snoc t c = unstream (S.snoc (stream t) (safe c)) {-# INLINE snoc #-} --- | /O(n)/ Appends one 'Text' to the other by copying both of them --- into a new 'Text'. -append :: Text -> Text -> Text -append a@(Text arr1 off1 len1) b@(Text arr2 off2 len2) - | len1 == 0 = b - | len2 == 0 = a - | len > 0 = Text (A.run x) 0 len - | otherwise = overflowError "append" - where - len = len1+len2 - x :: ST s (A.MArray s) - x = do - arr <- A.new len - A.copyI len1 arr 0 arr1 off1 - A.copyI len2 arr len1 arr2 off2 - return arr -{-# NOINLINE append #-} - -- | /O(1)/ Returns the first character of a 'Text', which must be -- non-empty. head :: Text -> Char diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index b5bc13f8..45520878 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -2,6 +2,8 @@ UnliftedFFITypes #-} {-# LANGUAGE Trustworthy #-} {-# LANGUAGE TypeApplications #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE ViewPatterns #-} -- | -- Module : Data.Text.Encoding -- Copyright : (c) 2009, 2010, 2011 Bryan O'Sullivan, @@ -62,33 +64,30 @@ module Data.Text.Encoding import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) -import Control.Exception (evaluate, try, throwIO, ErrorCall(ErrorCall)) -import Control.Monad.ST (runST) +import Control.Exception (evaluate, try) +import Control.Monad.ST (runST, ST) import Data.Bits (shiftR, (.&.)) import Data.ByteString (ByteString) import qualified Data.ByteString as B import qualified Data.ByteString.Internal as B import qualified Data.ByteString.Short.Internal as SBS -import Data.Foldable (traverse_) import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode) -import Data.Text.Internal (Text(..), safe, empty, text) -import Data.Text.Internal.Private (runText) +import Data.Text.Internal (Text(..), safe, empty, append) import Data.Text.Internal.Unsafe (unsafeWithForeignPtr) import Data.Text.Internal.Unsafe.Char (unsafeWrite) -import Data.Text.Show () +import Data.Text.Show as T (singleton) import Data.Text.Unsafe (unsafeDupablePerformIO) -import Data.Word (Word8, Word32) +import Data.Word (Word8) import Foreign.C.Types (CSize(..)) -import Foreign.Marshal.Utils (with) -import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr) -import Foreign.Storable (Storable, peek, poke, peekByteOff) -import GHC.Exts (MutableByteArray#, byteArrayContents#, unsafeCoerce#) +import Foreign.Ptr (Ptr, minusPtr, plusPtr) +import Foreign.Storable (poke, peekByteOff) +import GHC.Exts (byteArrayContents#, unsafeCoerce#) import GHC.ForeignPtr (ForeignPtr(..), ForeignPtrContents(PlainPtr)) import qualified Data.ByteString.Builder as B import qualified Data.ByteString.Builder.Internal as B hiding (empty, 
append) import qualified Data.ByteString.Builder.Prim as BP import qualified Data.ByteString.Builder.Prim.Internal as BP -import Data.Text.Internal.Encoding.Utf8 (utf8LengthByLeader) +import Data.Text.Internal.Encoding.Utf8 (utf8LengthByLeader, utf8DecodeStart, utf8DecodeContinue, DecoderResult(..)) import qualified Data.Text.Array as A import qualified Data.Text.Internal.Encoding.Fusion as E import qualified Data.Text.Internal.Fusion as F @@ -97,8 +96,6 @@ import Data.Text.Internal.ByteStringCompat import GHC.Stack (HasCallStack) #endif -#include "text_cbits.h" - -- $strict -- -- All of the single-parameter functions for decoding bytestrings @@ -159,53 +156,77 @@ foreign import ccall unsafe "_hs_text_is_ascii" c_is_ascii -- | Decode a 'ByteString' containing UTF-8 encoded text. -- --- __NOTE__: The replacement character returned by 'OnDecodeError' --- MUST be within the BMP plane; surrogate code points will --- automatically be remapped to the replacement char @U+FFFD@ --- (/since 0.11.3.0/), whereas code points beyond the BMP will throw an --- 'error' (/since 1.2.3.1/); For earlier versions of @text@ using --- those unsupported code points would result in undefined behavior. +-- Surrogate code points in replacement character returned by 'OnDecodeError' +-- will be automatically remapped to the replacement char @U+FFFD@. decodeUtf8With :: #if defined(ASSERTS) HasCallStack => #endif OnDecodeError -> ByteString -> Text -decodeUtf8With onErr bs = withBS bs aux - where - aux fp len = runText $ \done -> do - let go (A.MutableByteArray dest) = unsafeWithForeignPtr fp $ \ptr -> - with (0::CSize) $ \destOffPtr -> do - let end = ptr `plusPtr` len - loop curPtr = do - curPtr' <- c_decode_utf8 dest destOffPtr curPtr end - if curPtr' == end - then do - n <- peek destOffPtr - unsafeSTToIO (done (A.MutableByteArray dest) (cSizeToInt n)) - else do - x <- peek curPtr' - case onErr desc (Just x) of - Nothing -> loop $ curPtr' `plusPtr` 1 - Just c - -- TODO This is problematic, because even BMP replacement characters - -- can take longer than one UTF8 code unit (which is byte). - | c > '\xFFFF' -> throwUnsupportedReplChar - | otherwise -> do - destOff <- peek destOffPtr - w <- unsafeSTToIO $ - unsafeWrite (A.MutableByteArray dest) (cSizeToInt destOff) - (safe c) - poke destOffPtr (destOff + intToCSize w) - loop $ curPtr' `plusPtr` 1 - loop ptr - -- TODO (len * 2 + 100) assumes that invalid input is asymptotically rare. - -- This is incorrect in general, but for now we just want to pass tests. - (unsafeIOToST . go) =<< A.new (len * 2 + 100) - where - desc = "Data.Text.Internal.Encoding.decodeUtf8: Invalid UTF-8 stream" - - throwUnsupportedReplChar = throwIO $ - ErrorCall "decodeUtf8With: non-BMP replacement characters not supported" +decodeUtf8With onErr bs + | B.null undecoded = txt + | otherwise = txt `append` (case onErr desc (Just (B.head undecoded)) of + Nothing -> txt' + Just c -> T.singleton c `append` txt') + where + (txt, undecoded) = decodeUtf8With2 onErr mempty bs + txt' = decodeUtf8With onErr (B.tail undecoded) + desc = "Data.Text.Internal.Encoding: Invalid UTF-8 stream" + +-- | Decode two consecutive bytestrings, returning Text and undecoded remainder. 
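-- Informal examples of the intended behaviour (a sketch, not part of the patch):
--
--   decodeUtf8With lenientDecode (B.pack [0x61, 0x80, 0x62])
--     == "a\xFFFDb"            -- the stray continuation byte is replaced
--
--   decodeUtf8With2 lenientDecode B.empty (B.pack [0x61, 0xC3])
--     == ("a", B.pack [0xC3])  -- a trailing partial code point is returned
--                              -- undecoded, to be retried with more input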
+decodeUtf8With2 :: +#if defined(ASSERTS) + HasCallStack => +#endif + OnDecodeError -> ByteString -> ByteString -> (Text, ByteString) +decodeUtf8With2 onErr bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do + marr <- A.new len' + outer marr len' 0 0 + where + len = len1 + len2 + len' = len + 4 + + index i + | i < len1 = B.index bs1 i + | otherwise = B.index bs2 (i - len1) + + decodeFrom :: Int -> DecoderResult + decodeFrom off = step (off + 1) (utf8DecodeStart (index off)) + where + step i (Incomplete a b) + | i < len = step (i + 1) (utf8DecodeContinue (index i) a b) + step _ st = st + + outer :: forall s. A.MArray s -> Int -> Int -> Int -> ST s (Text, ByteString) + outer dst dstLen = inner + where + inner srcOff dstOff + | srcOff >= len = do + A.shrinkM dst dstOff + arr <- A.unsafeFreeze dst + return (Text arr 0 dstOff, mempty) + | dstOff + 4 > dstLen = do + let dstLen' = dstLen + 4 + dst' <- A.resizeM dst dstLen' + outer dst' dstLen' srcOff dstOff + | otherwise = case decodeFrom srcOff of + Accept c -> do + d <- unsafeWrite dst dstOff c + inner (srcOff + d) (dstOff + d) + Reject -> case onErr desc (Just (index srcOff)) of + Nothing -> inner (srcOff + 1) dstOff + Just c -> do + d <- unsafeWrite dst dstOff (safe c) + inner (srcOff + 1) (dstOff + d) + Incomplete{} -> do + A.shrinkM dst dstOff + arr <- A.unsafeFreeze dst + let bs = if srcOff >= len1 + then B.drop (srcOff - len1) bs2 + else B.drop srcOff (bs1 `B.append` bs2) + return (Text arr 0 dstOff, bs) + + desc = "Data.Text.Internal.Encoding: Invalid UTF-8 stream" -- $stream -- @@ -272,9 +293,6 @@ instance Show Decoding where showString " _" where prec = 10; prec' = prec + 1 -newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable) -newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable) - -- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8 -- encoded text that is known to be valid. -- @@ -300,72 +318,11 @@ streamDecodeUtf8With :: HasCallStack => #endif OnDecodeError -> ByteString -> Decoding -streamDecodeUtf8With onErr = decodeChunk B.empty 0 0 - where - -- We create a slightly larger than necessary buffer to accommodate a - -- potential code point started in the last buffer (@undecoded0@), or - -- replacement characters for each byte in @undecoded0@ if the - -- sequence turns out to be invalid. There can be up to three bytes there, - -- hence we allocate @len+3@ bytes. - decodeChunk :: ByteString -> CodePoint -> DecoderState -> ByteString - -> Decoding - decodeChunk undecoded0 codepoint0 state0 bs = withBS bs aux where - -- TODO Replace (+100) with something sensible. - aux fp len = runST $ (unsafeIOToST . 
decodeChunkToBuffer) =<< A.new (len+100) - where - decodeChunkToBuffer :: A.MArray s -> IO Decoding - decodeChunkToBuffer (A.MutableByteArray dest) = unsafeWithForeignPtr fp $ \ptr -> - with (0::CSize) $ \destOffPtr -> - with codepoint0 $ \codepointPtr -> - with state0 $ \statePtr -> - with nullPtr $ \curPtrPtr -> - let end = ptr `plusPtr` len - loop curPtr = do - prevState <- peek statePtr - poke curPtrPtr curPtr - lastPtr <- c_decode_utf8_with_state dest destOffPtr - curPtrPtr end codepointPtr statePtr - state <- peek statePtr - case state of - UTF8_REJECT -> do - -- We encountered an encoding error - poke statePtr 0 - let skipByte x = case onErr desc (Just x) of - Nothing -> return () - Just c -> do - destOff <- peek destOffPtr - w <- unsafeSTToIO $ - unsafeWrite (A.MutableByteArray dest) (cSizeToInt destOff) (safe c) - poke destOffPtr (destOff + intToCSize w) - if ptr == lastPtr && prevState /= UTF8_ACCEPT then do - -- If we can't complete the sequence @undecoded0@ from - -- the previous chunk, we invalidate the bytes from - -- @undecoded0@ and retry decoding the current chunk from - -- the initial state. - traverse_ skipByte (B.unpack undecoded0) - loop lastPtr - else do - peek lastPtr >>= skipByte - loop (lastPtr `plusPtr` 1) - - _ -> do - -- We encountered the end of the buffer while decoding - n <- peek destOffPtr - codepoint <- peek codepointPtr - chunkText <- unsafeSTToIO $ do - let l = cSizeToInt n - A.shrinkM (A.MutableByteArray dest) l - arr <- A.unsafeFreeze (A.MutableByteArray dest) - return $! text arr 0 l - let left = lastPtr `minusPtr` ptr - !undecoded = case state of - UTF8_ACCEPT -> B.empty - _ | left == 0 && prevState /= UTF8_ACCEPT -> B.append undecoded0 bs - | otherwise -> B.drop left bs - return $ Some chunkText undecoded - (decodeChunk undecoded codepoint state) - in loop ptr - desc = "Data.Text.Internal.Encoding.streamDecodeUtf8With: Invalid UTF-8 stream" +streamDecodeUtf8With onErr = go mempty + where + go bs1 bs2 = Some txt undecoded (go undecoded) + where + (txt, undecoded) = decodeUtf8With2 onErr bs1 bs2 -- | Decode a 'ByteString' containing UTF-8 encoded text that is known -- to be valid. @@ -551,15 +508,3 @@ encodeUtf32BE txt = E.unstream (E.restreamUtf32BE (F.stream txt)) cSizeToInt :: CSize -> Int cSizeToInt = fromIntegral - -intToCSize :: Int -> CSize -intToCSize = fromIntegral - -foreign import ccall unsafe "_hs_text_decode_utf8" c_decode_utf8 - :: MutableByteArray# s -> Ptr CSize - -> Ptr Word8 -> Ptr Word8 -> IO (Ptr Word8) - -foreign import ccall unsafe "_hs_text_decode_utf8_state" c_decode_utf8_with_state - :: MutableByteArray# s -> Ptr CSize - -> Ptr (Ptr Word8) -> Ptr Word8 - -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8) diff --git a/src/Data/Text/Internal.hs b/src/Data/Text/Internal.hs index b5a1d443..bc327c02 100644 --- a/src/Data/Text/Internal.hs +++ b/src/Data/Text/Internal.hs @@ -33,6 +33,7 @@ module Data.Text.Internal -- * Code that must be here for accessibility , empty , empty_ + , append -- * Utilities , firstf -- * Checked multiplication @@ -47,6 +48,7 @@ module Data.Text.Internal import Control.Exception (assert) import GHC.Stack (HasCallStack) #endif +import Control.Monad.ST (ST) import Data.Bits import Data.Int (Int32, Int64) import Data.Text.Internal.Unsafe.Char (ord) @@ -89,6 +91,24 @@ empty_ :: Text empty_ = Text A.empty 0 0 {-# NOINLINE empty_ #-} +-- | /O(n)/ Appends one 'Text' to the other by copying both of them +-- into a new 'Text'. 
+append :: Text -> Text -> Text +append a@(Text arr1 off1 len1) b@(Text arr2 off2 len2) + | len1 == 0 = b + | len2 == 0 = a + | len > 0 = Text (A.run x) 0 len + | otherwise = error $ "Data.Text.append: size overflow" + where + len = len1+len2 + x :: ST s (A.MArray s) + x = do + arr <- A.new len + A.copyI len1 arr 0 arr1 off1 + A.copyI len2 arr len1 arr2 off2 + return arr +{-# NOINLINE append #-} + -- | Construct a 'Text' without invisibly pinning its byte array in -- memory if its length has dwindled to zero. text :: diff --git a/src/Data/Text/Internal/Encoding/Utf8.hs b/src/Data/Text/Internal/Encoding/Utf8.hs index 9ee0c1c9..0c1a42aa 100644 --- a/src/Data/Text/Internal/Encoding/Utf8.hs +++ b/src/Data/Text/Internal/Encoding/Utf8.hs @@ -33,6 +33,10 @@ module Data.Text.Internal.Encoding.Utf8 , validate2 , validate3 , validate4 + -- * Naive decoding + , DecoderResult(..) + , utf8DecodeStart + , utf8DecodeContinue ) where #if defined(ASSERTS) @@ -40,7 +44,7 @@ import Control.Exception (assert) import GHC.Stack (HasCallStack) #endif import Data.Bits (Bits(..), FiniteBits(..)) -import Data.Char (ord) +import Data.Char (ord, chr) import GHC.Exts import GHC.Word (Word8(..)) @@ -213,3 +217,66 @@ validate4 x1 x2 x3 x4 = validate4_1 || validate4_2 || validate4_3 intToWord8 :: Int -> Word8 intToWord8 = fromIntegral + +word8ToInt :: Word8 -> Int +word8ToInt = fromIntegral + +------------------------------------------------------------------------------- +-- Naive UTF8 decoder. +-- See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for the explanation of the state machine. + +newtype ByteClass = ByteClass Word8 + +byteToClass :: Word8 -> ByteClass +byteToClass n = ByteClass (W8# el#) + where + !(I# n#) = word8ToInt n + el# = indexWord8OffAddr# table# n# + + table# :: Addr# + table# = "\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\b\b\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\n\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\EOT\ETX\ETX\v\ACK\ACK\ACK\ENQ\b\b\b\b\b\b\b\b\b\b\b"# + +newtype DecoderState = DecoderState Word8 + deriving (Eq) + +utf8AcceptState :: DecoderState +utf8AcceptState = DecoderState 0 + +utf8RejectState :: DecoderState +utf8RejectState = DecoderState 12 + +updateState :: ByteClass -> DecoderState -> DecoderState +updateState (ByteClass c) (DecoderState s) = DecoderState (W8# el#) + where + !(I# n#) = word8ToInt (c + s) + el# = indexWord8OffAddr# table# n# + + table# :: Addr# + table# = "\NUL\f\CAN$<`T\f\f\f0H\f\f\f\f\f\f\f\f\f\f\f\f\f\NUL\f\f\f\f\f\NUL\f\NUL\f\f\f\CAN\f\f\f\f\f\CAN\f\CAN\f\f\f\f\f\f\f\f\f\CAN\f\f\f\f\f\CAN\f\f\f\f\f\f\f\CAN\f\f\f\f\f\f\f\f\f$\f$\f\f\f$\f\f\f\f\f$\f$\f\f\f$\f\f\f\f\f\f\f\f\f\f"# + +newtype CodePoint = CodePoint Int + +data DecoderResult + = Accept !Char + | Incomplete !DecoderState !CodePoint + | Reject + +utf8DecodeStart :: Word8 -> DecoderResult 
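-- Worked example (informal): decoding U+00E9 from the bytes 0xC3 0xA9.
--
--   utf8DecodeStart 0xC3
--     = Incomplete st (CodePoint 0x03)   -- 0xC3 .&. (0xff `shiftR` 2)
--   utf8DecodeContinue 0xA9 st (CodePoint 0x03)
--     = Accept '\xE9'                    -- (0x03 `shiftL` 6) .|. (0xA9 .&. 0x3f)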
+utf8DecodeStart w + | st == utf8AcceptState = Accept (chr (word8ToInt w)) + | st == utf8RejectState = Reject + | otherwise = Incomplete st (CodePoint cp) + where + cl@(ByteClass cl') = byteToClass w + st = updateState cl utf8AcceptState + cp = word8ToInt $ (0xff `shiftR` word8ToInt cl') .&. w + +utf8DecodeContinue :: Word8 -> DecoderState -> CodePoint -> DecoderResult +utf8DecodeContinue w st (CodePoint cp) + | st' == utf8AcceptState = Accept (chr cp') + | st' == utf8RejectState = Reject + | otherwise = Incomplete st' (CodePoint cp') + where + cl = byteToClass w + st' = updateState cl st + cp' = (cp `shiftL` 6) .|. word8ToInt (w .&. 0x3f) diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs index 8d4607f0..dcebd94a 100644 --- a/tests/Tests/Properties/Transcoding.hs +++ b/tests/Tests/Properties/Transcoding.hs @@ -95,19 +95,8 @@ instance Arbitrary InvalidUtf8 where = map (\c' -> InvalidUtf8 a b c') (shrink c) ++ map (\a' -> InvalidUtf8 a' b c) (shrink a) -t_utf8_err :: InvalidUtf8 -> Maybe DecodeErr -> Property --- generate an invalid character -t_utf8_err bad Nothing = forAll (choose ('\x10000', maxBound)) $ \c -> ioProperty $ do - let onErr _ _ = Just c - decoded = E.decodeUtf8With onErr (toByteString bad) - len = T.length decoded - l <- Exception.try (Exception.evaluate len) - pure $ case l of - Left (err :: Exception.SomeException) -> counterexample (show err) $ - "non-BMP replacement characters not supported" `T.isInfixOf` T.pack (show err) - Right _ -> counterexample (show (decoded, l)) False --- generate a valid onErr -t_utf8_err bad (Just de) = forAll (Blind <$> genDecodeErr de) $ \(Blind onErr) -> ioProperty $ do +t_utf8_err :: InvalidUtf8 -> DecodeErr -> Property +t_utf8_err bad de = forAll (Blind <$> genDecodeErr de) $ \(Blind onErr) -> ioProperty $ do let decoded = E.decodeUtf8With onErr (toByteString bad) len = T.length (E.decodeUtf8With onErr (toByteString bad)) l <- Exception.try (Exception.evaluate len) diff --git a/tests/Tests/QuickCheckUtils.hs b/tests/Tests/QuickCheckUtils.hs index 94790fa0..833be1ad 100644 --- a/tests/Tests/QuickCheckUtils.hs +++ b/tests/Tests/QuickCheckUtils.hs @@ -122,7 +122,7 @@ genDecodeErr Ignore = return T.ignore genDecodeErr Strict = return T.strictDecode genDecodeErr Replace = (\c _ _ -> c) <$> frequency [ (1, return Nothing) - , (50, Just <$> choose ('\x1', '\xffff')) + , (50, Just <$> arbitraryUnicodeChar) ] instance Arbitrary DecodeErr where diff --git a/text.cabal b/text.cabal index e376c16d..e6992850 100644 --- a/text.cabal +++ b/text.cabal @@ -53,7 +53,6 @@ extra-source-files: -- scripts/SpecialCasing.txt README.markdown changelog.md - include/*.h scripts/*.hs tests/literal-rule-test.sh tests/LiteralRuleTest.hs @@ -64,12 +63,10 @@ flag developer manual: True library - c-sources: cbits/cbits.c - cbits/is_ascii.c + c-sources: cbits/is_ascii.c cbits/measure_off.c cbits/reverse.c cbits/utils.c - include-dirs: include hs-source-dirs: src exposed-modules: From 32c76d1015d9c9dcb8a6dc9acdc8160f4127b76e Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Thu, 26 Aug 2021 23:38:34 +0100 Subject: [PATCH 33/38] Redesign concat --- src/Data/Text/Lazy.hs | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/Data/Text/Lazy.hs b/src/Data/Text/Lazy.hs index ad607361..a7b0e325 100644 --- a/src/Data/Text/Lazy.hs +++ b/src/Data/Text/Lazy.hs @@ -794,12 +794,10 @@ foldr1 f t = S.foldr1 f (stream t) -- | /O(n)/ Concatenate a list of 'Text's. 
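-- The rewrite below flattens the chunk list directly: empty texts vanish and
-- existing chunks are reused unchanged, e.g. (informal, not part of the patch):
--
--   concat [Chunk "a" Empty, Empty, Chunk "b" (Chunk "c" Empty)]
--     == Chunk "a" (Chunk "b" (Chunk "c" Empty))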
concat :: [Text] -> Text -concat = to - where - go Empty css = to css - go (Chunk c cs) css = Chunk c (go cs css) - to [] = Empty - to (cs:css) = go cs css +concat [] = Empty +concat (Empty : css) = concat css +concat (Chunk c Empty : css) = Chunk c (concat css) +concat (Chunk c cs : css) = Chunk c (concat (cs : css)) {-# INLINE concat #-} -- | /O(n)/ Map a function over a 'Text' that results in a 'Text', and From 012612af7853c46cea69e48122acbe5b60ae56a1 Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Sun, 1 Aug 2021 15:50:43 +0100 Subject: [PATCH 34/38] Use simdutf for UTF8 validation --- cbits/simdutf.cpp | 14204 ++++++++++++++++++++++++++++++++++++ cbits/validate_utf8.cpp | 6 + include/simdutf.h | 1084 +++ src/Data/Text/Encoding.hs | 38 +- text.cabal | 20 +- 5 files changed, 15349 insertions(+), 3 deletions(-) create mode 100644 cbits/simdutf.cpp create mode 100644 cbits/validate_utf8.cpp create mode 100644 include/simdutf.h diff --git a/cbits/simdutf.cpp b/cbits/simdutf.cpp new file mode 100644 index 00000000..cf7d32ff --- /dev/null +++ b/cbits/simdutf.cpp @@ -0,0 +1,14204 @@ +/* auto-generated on 2021-07-29 10:43:28 -0400. Do not edit! */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf.cpp +/* begin file src/simdutf.cpp */ +#include "simdutf.h" +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=implementation.cpp +/* begin file src/implementation.cpp */ +#include +#include +#include + +// Useful for debugging purposes +namespace simdutf { +namespace { + +template +std::string toBinaryString(T b) { + std::string binary = ""; + T mask = T(1) << (sizeof(T) * CHAR_BIT - 1); + while (mask > 0) { + binary += ((b & mask) == 0) ? '0' : '1'; + mask >>= 1; + } + return binary; +} +} +} + +// Implementations +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/arm64.h +/* begin file src/simdutf/arm64.h */ +#ifndef SIMDUTF_ARM64_H +#define SIMDUTF_ARM64_H + +#ifdef SIMDUTF_FALLBACK_H +#error "arm64.h must be included before fallback.h" +#endif + + +#ifndef SIMDUTF_IMPLEMENTATION_ARM64 +#define SIMDUTF_IMPLEMENTATION_ARM64 (SIMDUTF_IS_ARM64) +#endif +#define SIMDUTF_CAN_ALWAYS_RUN_ARM64 SIMDUTF_IMPLEMENTATION_ARM64 && SIMDUTF_IS_ARM64 + + + +#if SIMDUTF_IMPLEMENTATION_ARM64 + +namespace simdutf { +/** + * Implementation for NEON (ARMv8). 
+ */ +namespace arm64 { +} // namespace arm64 +} // namespace simdutf + +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/arm64/implementation.h +/* begin file src/simdutf/arm64/implementation.h */ +#ifndef SIMDUTF_ARM64_IMPLEMENTATION_H +#define SIMDUTF_ARM64_IMPLEMENTATION_H + + +namespace simdutf { +namespace arm64 { + +namespace { +using namespace simdutf; +} + +class implementation final : public simdutf::implementation { +public: + simdutf_really_inline implementation() : simdutf::implementation("arm64", "ARM NEON", internal::instruction_set::NEON) {} + simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final; + simdutf_warn_unused bool validate_utf16(const char16_t *buf, size_t len) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_utf16(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf16(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t count_utf16(const char16_t * buf, size_t length) const noexcept; + simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept; + simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept; +}; + +} // namespace arm64 +} // namespace simdutf + +#endif // SIMDUTF_ARM64_IMPLEMENTATION_H +/* end file src/simdutf/arm64/implementation.h */ + +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/arm64/begin.h +/* begin file src/simdutf/arm64/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "arm64" +// #define SIMDUTF_IMPLEMENTATION arm64 +/* end file src/simdutf/arm64/begin.h */ + +// Declarations +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/arm64/intrinsics.h +/* begin file src/simdutf/arm64/intrinsics.h */ +#ifndef SIMDUTF_ARM64_INTRINSICS_H +#define SIMDUTF_ARM64_INTRINSICS_H + + +// This should be the correct header whether +// you use visual studio or other compilers. +#include + +#endif // SIMDUTF_ARM64_INTRINSICS_H +/* end file src/simdutf/arm64/intrinsics.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/arm64/bitmanipulation.h +/* begin file src/simdutf/arm64/bitmanipulation.h */ +#ifndef SIMDUTF_ARM64_BITMANIPULATION_H +#define SIMDUTF_ARM64_BITMANIPULATION_H + +namespace simdutf { +namespace arm64 { +namespace { + +// We sometimes call trailing_zero on inputs that are zero, +// but the algorithms do not end up using the returned value. +// Sadly, sanitizers are not smart enough to figure it out. +NO_SANITIZE_UNDEFINED +simdutf_really_inline int trailing_zeroes(uint64_t input_num) { +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + unsigned long ret; + // Search the mask data from least significant bit (LSB) + // to the most significant bit (MSB) for a set bit (1). 
+ _BitScanForward64(&ret, input_num); + return (int)ret; +#else // SIMDUTF_REGULAR_VISUAL_STUDIO + return __builtin_ctzll(input_num); +#endif // SIMDUTF_REGULAR_VISUAL_STUDIO +} + +/* result might be undefined when input_num is zero */ +simdutf_really_inline uint64_t clear_lowest_bit(uint64_t input_num) { + return input_num & (input_num-1); +} + +/* result might be undefined when input_num is zero */ +simdutf_really_inline int leading_zeroes(uint64_t input_num) { +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + unsigned long leading_zero = 0; + // Search the mask data from most significant bit (MSB) + // to least significant bit (LSB) for a set bit (1). + if (_BitScanReverse64(&leading_zero, input_num)) + return (int)(63 - leading_zero); + else + return 64; +#else + return __builtin_clzll(input_num); +#endif// SIMDUTF_REGULAR_VISUAL_STUDIO +} + +/* result might be undefined when input_num is zero */ +simdutf_really_inline int count_ones(uint64_t input_num) { + return vaddv_u8(vcnt_u8(vcreate_u8(input_num))); +} + +simdutf_really_inline bool add_overflow(uint64_t value1, uint64_t value2, uint64_t *result) { +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + *result = value1 + value2; + return *result < value1; +#else + return __builtin_uaddll_overflow(value1, value2, + reinterpret_cast(result)); +#endif +} + +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf + +#endif // SIMDUTF_ARM64_BITMANIPULATION_H +/* end file src/simdutf/arm64/bitmanipulation.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/arm64/bitmask.h +/* begin file src/simdutf/arm64/bitmask.h */ +#ifndef SIMDUTF_ARM64_BITMASK_H +#define SIMDUTF_ARM64_BITMASK_H + +namespace simdutf { +namespace arm64 { +namespace { + +// +// Perform a "cumulative bitwise xor," flipping bits each time a 1 is encountered. +// +// For example, prefix_xor(00100100) == 00011100 +// +simdutf_really_inline uint64_t prefix_xor(uint64_t bitmask) { + bitmask ^= bitmask << 1; + bitmask ^= bitmask << 2; + bitmask ^= bitmask << 4; + bitmask ^= bitmask << 8; + bitmask ^= bitmask << 16; + bitmask ^= bitmask << 32; + return bitmask; +} + +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf + +#endif +/* end file src/simdutf/arm64/bitmask.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/arm64/simd.h +/* begin file src/simdutf/arm64/simd.h */ +#ifndef SIMDUTF_ARM64_SIMD_H +#define SIMDUTF_ARM64_SIMD_H + +#include + + +namespace simdutf { +namespace arm64 { +namespace { +namespace simd { + +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO +namespace { +// Start of private section with Visual Studio workaround + + +/** + * make_uint8x16_t initializes a SIMD register (uint8x16_t). + * This is needed because, incredibly, the syntax uint8x16_t x = {1,2,3...} + * is not recognized under Visual Studio! This is a workaround. + * Using a std::initializer_list as a parameter resulted in + * inefficient code. With the current approach, if the parameters are + * compile-time constants, + * GNU GCC compiles it to ldr, the same as uint8x16_t x = {1,2,3...}. + * You should not use this function except for compile-time constants: + * it is not efficient. + */ +simdutf_really_inline uint8x16_t make_uint8x16_t(uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4, + uint8_t x5, uint8_t x6, uint8_t x7, uint8_t x8, + uint8_t x9, uint8_t x10, uint8_t x11, uint8_t x12, + uint8_t x13, uint8_t x14, uint8_t x15, uint8_t x16) { + // Doing a load like so end ups generating worse code. 
+ // uint8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8, + // x9, x10,x11,x12,x13,x14,x15,x16}; + // return vld1q_u8(array); + uint8x16_t x{}; + // incredibly, Visual Studio does not allow x[0] = x1 + x = vsetq_lane_u8(x1, x, 0); + x = vsetq_lane_u8(x2, x, 1); + x = vsetq_lane_u8(x3, x, 2); + x = vsetq_lane_u8(x4, x, 3); + x = vsetq_lane_u8(x5, x, 4); + x = vsetq_lane_u8(x6, x, 5); + x = vsetq_lane_u8(x7, x, 6); + x = vsetq_lane_u8(x8, x, 7); + x = vsetq_lane_u8(x9, x, 8); + x = vsetq_lane_u8(x10, x, 9); + x = vsetq_lane_u8(x11, x, 10); + x = vsetq_lane_u8(x12, x, 11); + x = vsetq_lane_u8(x13, x, 12); + x = vsetq_lane_u8(x14, x, 13); + x = vsetq_lane_u8(x15, x, 14); + x = vsetq_lane_u8(x16, x, 15); + return x; +} + +// We have to do the same work for make_int8x16_t +simdutf_really_inline int8x16_t make_int8x16_t(int8_t x1, int8_t x2, int8_t x3, int8_t x4, + int8_t x5, int8_t x6, int8_t x7, int8_t x8, + int8_t x9, int8_t x10, int8_t x11, int8_t x12, + int8_t x13, int8_t x14, int8_t x15, int8_t x16) { + // Doing a load like so end ups generating worse code. + // int8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8, + // x9, x10,x11,x12,x13,x14,x15,x16}; + // return vld1q_s8(array); + int8x16_t x{}; + // incredibly, Visual Studio does not allow x[0] = x1 + x = vsetq_lane_s8(x1, x, 0); + x = vsetq_lane_s8(x2, x, 1); + x = vsetq_lane_s8(x3, x, 2); + x = vsetq_lane_s8(x4, x, 3); + x = vsetq_lane_s8(x5, x, 4); + x = vsetq_lane_s8(x6, x, 5); + x = vsetq_lane_s8(x7, x, 6); + x = vsetq_lane_s8(x8, x, 7); + x = vsetq_lane_s8(x9, x, 8); + x = vsetq_lane_s8(x10, x, 9); + x = vsetq_lane_s8(x11, x, 10); + x = vsetq_lane_s8(x12, x, 11); + x = vsetq_lane_s8(x13, x, 12); + x = vsetq_lane_s8(x14, x, 13); + x = vsetq_lane_s8(x15, x, 14); + x = vsetq_lane_s8(x16, x, 15); + return x; +} + +simdutf_really_inline uint16x8_t make_uint16x8_t(uint16_t x1, uint16_t x2, uint16_t x3, uint16_t x4, + uint16_t x5, uint16_t x6, uint16_t x7, uint16_t x8) { + uint16x8_t x{}; + x = vsetq_lane_u16(x1, x, 0); + x = vsetq_lane_u16(x2, x, 1); + x = vsetq_lane_u16(x3, x, 2); + x = vsetq_lane_u16(x4, x, 3); + x = vsetq_lane_u16(x5, x, 4); + x = vsetq_lane_u16(x6, x, 5); + x = vsetq_lane_u16(x7, x, 6); + x = vsetq_lane_u16(x8, x, 7);; + return x; +} + +simdutf_really_inline int16x8_t make_int16x8_t(int16_t x1, int16_t x2, int16_t x3, int16_t x4, + int16_t x5, int16_t x6, int16_t x7, int16_t x8) { + uint16x8_t x{}; + x = vsetq_lane_s16(x1, x, 0); + x = vsetq_lane_s16(x2, x, 1); + x = vsetq_lane_s16(x3, x, 2); + x = vsetq_lane_s16(x4, x, 3); + x = vsetq_lane_s16(x5, x, 4); + x = vsetq_lane_s16(x6, x, 5); + x = vsetq_lane_s16(x7, x, 6); + x = vsetq_lane_s16(x8, x, 7);; + return x; +} + + +// End of private section with Visual Studio workaround +} // namespace +#endif // SIMDUTF_REGULAR_VISUAL_STUDIO + + + template + struct simd8; + + // + // Base class of simd8 and simd8, both of which use uint8x16_t internally. 
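+  // Editor's sketch, not part of the upstream sources: a typical use of the
+  // wrappers defined below is load, compare, then pull out a per-byte bitmask:
+  //
+  //   simd8<uint8_t> in(ptr);                      // load 16 bytes
+  //   simd8<bool> non_ascii = in >= uint8_t(0x80); // flag bytes >= 0x80
+  //   uint32_t mask = non_ascii.to_bitmask();      // one bit per input byte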
+ // + template> + struct base_u8 { + uint8x16_t value; + static const int SIZE = sizeof(value); + + // Conversion from/to SIMD register + simdutf_really_inline base_u8(const uint8x16_t _value) : value(_value) {} + simdutf_really_inline operator const uint8x16_t&() const { return this->value; } + simdutf_really_inline operator uint8x16_t&() { return this->value; } + simdutf_really_inline T first() const { return vgetq_lane_u8(*this,0); } + simdutf_really_inline T last() const { return vgetq_lane_u8(*this,15); } + + // Bit operations + simdutf_really_inline simd8 operator|(const simd8 other) const { return vorrq_u8(*this, other); } + simdutf_really_inline simd8 operator&(const simd8 other) const { return vandq_u8(*this, other); } + simdutf_really_inline simd8 operator^(const simd8 other) const { return veorq_u8(*this, other); } + simdutf_really_inline simd8 bit_andnot(const simd8 other) const { return vbicq_u8(*this, other); } + simdutf_really_inline simd8 operator~() const { return *this ^ 0xFFu; } + simdutf_really_inline simd8& operator|=(const simd8 other) { auto this_cast = static_cast*>(this); *this_cast = *this_cast | other; return *this_cast; } + simdutf_really_inline simd8& operator&=(const simd8 other) { auto this_cast = static_cast*>(this); *this_cast = *this_cast & other; return *this_cast; } + simdutf_really_inline simd8& operator^=(const simd8 other) { auto this_cast = static_cast*>(this); *this_cast = *this_cast ^ other; return *this_cast; } + + simdutf_really_inline Mask operator==(const simd8 other) const { return vceqq_u8(*this, other); } + + template + simdutf_really_inline simd8 prev(const simd8 prev_chunk) const { + return vextq_u8(prev_chunk, *this, 16 - N); + } + }; + + // SIMD byte mask type (returned by things like eq and gt) + template<> + struct simd8: base_u8 { + typedef uint16_t bitmask_t; + typedef uint32_t bitmask2_t; + + static simdutf_really_inline simd8 splat(bool _value) { return vmovq_n_u8(uint8_t(-(!!_value))); } + + simdutf_really_inline simd8(const uint8x16_t _value) : base_u8(_value) {} + // False constructor + simdutf_really_inline simd8() : simd8(vdupq_n_u8(0)) {} + // Splat constructor + simdutf_really_inline simd8(bool _value) : simd8(splat(_value)) {} + simdutf_really_inline void store(uint8_t dst[16]) const { return vst1q_u8(dst, *this); } + + // We return uint32_t instead of uint16_t because that seems to be more efficient for most + // purposes (cutting it down to uint16_t costs performance in some compilers). 
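+    // Editor's sketch, not upstream text: this is a NEON emulation of x86's
+    // movemask. Each true lane (0xFF) is ANDed with a per-lane weight
+    // 0x01,0x02,...,0x80 (the weights repeat for the two 8-byte halves), and
+    // three pairwise vpaddq_u8 passes fold the weighted bytes so lanes 0-7
+    // land in the low byte and lanes 8-15 in the next byte. For example,
+    // lanes {T,F,T,F,F,F,F,F, all false} give weights {1,0,4,0,...} and a
+    // final 16-bit mask of 0x0005.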
+ simdutf_really_inline uint32_t to_bitmask() const { +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint8x16_t bit_mask = make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80); +#else + const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; +#endif + auto minput = *this & bit_mask; + uint8x16_t tmp = vpaddq_u8(minput, minput); + tmp = vpaddq_u8(tmp, tmp); + tmp = vpaddq_u8(tmp, tmp); + return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0); + } + simdutf_really_inline bool any() const { return vmaxvq_u8(*this) != 0; } + simdutf_really_inline bool none() const { return vmaxvq_u8(*this) == 0; } + simdutf_really_inline bool all() const { return vminvq_u8(*this) == 0xFF; } + + + }; + + // Unsigned bytes + template<> + struct simd8: base_u8 { + static simdutf_really_inline simd8 splat(uint8_t _value) { return vmovq_n_u8(_value); } + static simdutf_really_inline simd8 zero() { return vdupq_n_u8(0); } + static simdutf_really_inline simd8 load(const uint8_t* values) { return vld1q_u8(values); } + simdutf_really_inline simd8(const simd8& value) = default; + simdutf_really_inline simd8(const uint8x16_t _value) : base_u8(_value) {} + // Zero constructor + simdutf_really_inline simd8() : simd8(zero()) {} + // Array constructor + simdutf_really_inline simd8(const uint8_t values[16]) : simd8(load(values)) {} + // Splat constructor + simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {} + // Member-by-member initialization +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + simdutf_really_inline simd8( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 + ) : simd8(make_uint8x16_t( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + )) {} +#else + simdutf_really_inline simd8( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 + ) : simd8(uint8x16_t{ + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + }) {} +#endif + + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdutf_really_inline static simd8 repeat_16( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + + // Store to array + simdutf_really_inline void store(uint8_t dst[16]) const { return vst1q_u8(dst, *this); } + + // Saturated math + simdutf_really_inline simd8 saturating_add(const simd8 other) const { return vqaddq_u8(*this, other); } + simdutf_really_inline simd8 saturating_sub(const simd8 other) const { return vqsubq_u8(*this, other); } + + // Addition/subtraction are the same for signed and unsigned + simdutf_really_inline simd8 operator+(const simd8 other) const { return vaddq_u8(*this, other); } + simdutf_really_inline simd8 operator-(const simd8 other) const { return vsubq_u8(*this, other); } + simdutf_really_inline simd8& operator+=(const simd8 other) { *this = *this + other; return *this; } + simdutf_really_inline simd8& operator-=(const simd8 other) { *this = *this - other; return *this; } + + // 
Order-specific operations + simdutf_really_inline uint8_t max_val() const { return vmaxvq_u8(*this); } + simdutf_really_inline uint8_t min_val() const { return vminvq_u8(*this); } + simdutf_really_inline simd8 max_val(const simd8 other) const { return vmaxq_u8(*this, other); } + simdutf_really_inline simd8 min_val(const simd8 other) const { return vminq_u8(*this, other); } + simdutf_really_inline simd8 operator<=(const simd8 other) const { return vcleq_u8(*this, other); } + simdutf_really_inline simd8 operator>=(const simd8 other) const { return vcgeq_u8(*this, other); } + simdutf_really_inline simd8 operator<(const simd8 other) const { return vcltq_u8(*this, other); } + simdutf_really_inline simd8 operator>(const simd8 other) const { return vcgtq_u8(*this, other); } + // Same as >, but instead of guaranteeing all 1's == true, false = 0 and true = nonzero. For ARM, returns all 1's. + simdutf_really_inline simd8 gt_bits(const simd8 other) const { return simd8(*this > other); } + // Same as <, but instead of guaranteeing all 1's == true, false = 0 and true = nonzero. For ARM, returns all 1's. + simdutf_really_inline simd8 lt_bits(const simd8 other) const { return simd8(*this < other); } + + // Bit-specific operations + simdutf_really_inline simd8 any_bits_set(simd8 bits) const { return vtstq_u8(*this, bits); } + simdutf_really_inline bool is_ascii() const { return this->max_val() < 0b10000000u; } + + simdutf_really_inline bool any_bits_set_anywhere() const { return this->max_val() != 0; } + simdutf_really_inline bool any_bits_set_anywhere(simd8 bits) const { return (*this & bits).any_bits_set_anywhere(); } + template + simdutf_really_inline simd8 shr() const { return vshrq_n_u8(*this, N); } + template + simdutf_really_inline simd8 shl() const { return vshlq_n_u8(*this, N); } + + // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values) + template + simdutf_really_inline simd8 lookup_16(simd8 lookup_table) const { + return lookup_table.apply_lookup_16_to(*this); + } + + + template + simdutf_really_inline simd8 lookup_16( + L replace0, L replace1, L replace2, L replace3, + L replace4, L replace5, L replace6, L replace7, + L replace8, L replace9, L replace10, L replace11, + L replace12, L replace13, L replace14, L replace15) const { + return lookup_16(simd8::repeat_16( + replace0, replace1, replace2, replace3, + replace4, replace5, replace6, replace7, + replace8, replace9, replace10, replace11, + replace12, replace13, replace14, replace15 + )); + } + + template + simdutf_really_inline simd8 apply_lookup_16_to(const simd8 original) const { + return vqtbl1q_u8(*this, simd8(original)); + } + }; + + // Signed bytes + template<> + struct simd8 { + int8x16_t value; + + static simdutf_really_inline simd8 splat(int8_t _value) { return vmovq_n_s8(_value); } + static simdutf_really_inline simd8 zero() { return vdupq_n_s8(0); } + static simdutf_really_inline simd8 load(const int8_t values[16]) { return vld1q_s8(values); } + simdutf_really_inline void store_ascii_as_utf16(char16_t * p) const { + vst1q_u16(reinterpret_cast(p), vmovl_u8(vget_low_u8 (vreinterpretq_u8_s8(this->value)))); + vst1q_u16(reinterpret_cast(p + 8), vmovl_high_u8(vreinterpretq_u8_s8(this->value))); + } + // Conversion from/to SIMD register + simdutf_really_inline simd8(const int8x16_t _value) : value{_value} {} + simdutf_really_inline operator const int8x16_t&() const { return this->value; } + simdutf_really_inline operator const uint8x16_t() const { return vreinterpretq_u8_s8(this->value); 
} + simdutf_really_inline operator int8x16_t&() { return this->value; } + + // Zero constructor + simdutf_really_inline simd8() : simd8(zero()) {} + // Splat constructor + simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdutf_really_inline simd8(const int8_t* values) : simd8(load(values)) {} + // Member-by-member initialization +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + simdutf_really_inline simd8( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 + ) : simd8(make_int8x16_t( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + )) {} +#else + simdutf_really_inline simd8( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 + ) : simd8(int8x16_t{ + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + }) {} +#endif + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdutf_really_inline static simd8 repeat_16( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + + // Store to array + simdutf_really_inline void store(int8_t dst[16]) const { return vst1q_s8(dst, value); } + // Explicit conversion to/from unsigned + // + // Under Visual Studio/ARM64 uint8x16_t and int8x16_t are apparently the same type. + // In theory, we could check this occurence with std::same_as and std::enabled_if but it is C++14 + // and relatively ugly and hard to read. 
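+  // Editor's note, not upstream text: because the two NEON types alias on
+  // that toolchain, the explicit uint8x16_t constructor below would collide
+  // with the int8x16_t one, which is why it is compiled out there.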
+#ifndef SIMDUTF_REGULAR_VISUAL_STUDIO + simdutf_really_inline explicit simd8(const uint8x16_t other): simd8(vreinterpretq_s8_u8(other)) {} +#endif + simdutf_really_inline operator simd8() const { return vreinterpretq_u8_s8(this->value); } + + simdutf_really_inline simd8 operator|(const simd8 other) const { return vorrq_s8(value, other.value); } + simdutf_really_inline simd8 operator&(const simd8 other) const { return vandq_s8(value, other.value); } + simdutf_really_inline simd8 operator^(const simd8 other) const { return veorq_s8(value, other.value); } + simdutf_really_inline simd8 bit_andnot(const simd8 other) const { return vbicq_s8(value, other.value); } + + // Math + simdutf_really_inline simd8 operator+(const simd8 other) const { return vaddq_s8(value, other.value); } + simdutf_really_inline simd8 operator-(const simd8 other) const { return vsubq_s8(value, other.value); } + simdutf_really_inline simd8& operator+=(const simd8 other) { *this = *this + other; return *this; } + simdutf_really_inline simd8& operator-=(const simd8 other) { *this = *this - other; return *this; } + + simdutf_really_inline int8_t max_val() const { return vmaxvq_s8(value); } + simdutf_really_inline int8_t min_val() const { return vminvq_s8(value); } + simdutf_really_inline bool is_ascii() const { return this->min_val() >= 0; } + + // Order-sensitive comparisons + simdutf_really_inline simd8 max_val(const simd8 other) const { return vmaxq_s8(value, other.value); } + simdutf_really_inline simd8 min_val(const simd8 other) const { return vminq_s8(value, other.value); } + simdutf_really_inline simd8 operator>(const simd8 other) const { return vcgtq_s8(value, other.value); } + simdutf_really_inline simd8 operator<(const simd8 other) const { return vcltq_s8(value, other.value); } + simdutf_really_inline simd8 operator==(const simd8 other) const { return vceqq_s8(value, other.value); } + + template + simdutf_really_inline simd8 prev(const simd8 prev_chunk) const { + return vextq_s8(prev_chunk, *this, 16 - N); + } + + // Perform a lookup assuming no value is larger than 16 + template + simdutf_really_inline simd8 lookup_16(simd8 lookup_table) const { + return lookup_table.apply_lookup_16_to(*this); + } + template + simdutf_really_inline simd8 lookup_16( + L replace0, L replace1, L replace2, L replace3, + L replace4, L replace5, L replace6, L replace7, + L replace8, L replace9, L replace10, L replace11, + L replace12, L replace13, L replace14, L replace15) const { + return lookup_16(simd8::repeat_16( + replace0, replace1, replace2, replace3, + replace4, replace5, replace6, replace7, + replace8, replace9, replace10, replace11, + replace12, replace13, replace14, replace15 + )); + } + + template + simdutf_really_inline simd8 apply_lookup_16_to(const simd8 original) { + return vqtbl1q_s8(*this, simd8(original)); + } + }; + + template + struct simd8x64 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd8); + static_assert(NUM_CHUNKS == 4, "ARM kernel should use four registers per 64-byte block."); + const simd8 chunks[NUM_CHUNKS]; + + simd8x64(const simd8x64& o) = delete; // no copy allowed + simd8x64& operator=(const simd8 other) = delete; // no assignment allowed + simd8x64() = delete; // no default constructor allowed + + simdutf_really_inline simd8x64(const simd8 chunk0, const simd8 chunk1, const simd8 chunk2, const simd8 chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {} + simdutf_really_inline simd8x64(const T* ptr) : chunks{simd8::load(ptr), simd8::load(ptr+sizeof(simd8)/sizeof(T)), 
simd8::load(ptr+2*sizeof(simd8)/sizeof(T)), simd8::load(ptr+3*sizeof(simd8)/sizeof(T))} {} + + simdutf_really_inline void store(T* ptr) const { + this->chunks[0].store(ptr+sizeof(simd8)*0/sizeof(T)); + this->chunks[1].store(ptr+sizeof(simd8)*1/sizeof(T)); + this->chunks[2].store(ptr+sizeof(simd8)*2/sizeof(T)); + this->chunks[3].store(ptr+sizeof(simd8)*3/sizeof(T)); + } + + simdutf_really_inline simd8 reduce_or() const { + return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]); + } + + simdutf_really_inline bool is_ascii() const { + return reduce_or().is_ascii(); + } + + simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const { + this->chunks[0].store_ascii_as_utf16(ptr+sizeof(simd8)*0); + this->chunks[1].store_ascii_as_utf16(ptr+sizeof(simd8)*1); + this->chunks[2].store_ascii_as_utf16(ptr+sizeof(simd8)*2); + this->chunks[3].store_ascii_as_utf16(ptr+sizeof(simd8)*3); + } + + simdutf_really_inline uint64_t to_bitmask() const { +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint8x16_t bit_mask = make_uint8x16_t( + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 + ); +#else + const uint8x16_t bit_mask = { + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 + }; +#endif + // Add each of the elements next to each other, successively, to stuff each 8 byte mask into one. + uint8x16_t sum0 = vpaddq_u8(vandq_u8(uint8x16_t(this->chunks[0]), bit_mask), vandq_u8(uint8x16_t(this->chunks[1]), bit_mask)); + uint8x16_t sum1 = vpaddq_u8(vandq_u8(uint8x16_t(this->chunks[2]), bit_mask), vandq_u8(uint8x16_t(this->chunks[3]), bit_mask)); + sum0 = vpaddq_u8(sum0, sum1); + sum0 = vpaddq_u8(sum0, sum0); + return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0); + } + + simdutf_really_inline uint64_t eq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] == mask, + this->chunks[1] == mask, + this->chunks[2] == mask, + this->chunks[3] == mask + ).to_bitmask(); + } + + simdutf_really_inline uint64_t lteq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] <= mask, + this->chunks[1] <= mask, + this->chunks[2] <= mask, + this->chunks[3] <= mask + ).to_bitmask(); + } + + simdutf_really_inline uint64_t in_range(const T low, const T high) const { + const simd8 mask_low = simd8::splat(low); + const simd8 mask_high = simd8::splat(high); + + return simd8x64( + (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low), + (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low), + (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low), + (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low) + ).to_bitmask(); + } + simdutf_really_inline uint64_t not_in_range(const T low, const T high) const { + const simd8 mask_low = simd8::splat(low); + const simd8 mask_high = simd8::splat(high); + return simd8x64( + (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low), + (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low), + (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low), + (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low) + ).to_bitmask(); + } + simdutf_really_inline uint64_t lt(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] < mask, + this->chunks[1] < mask, + this->chunks[2] < mask, + this->chunks[3] < mask + ).to_bitmask(); + } + simdutf_really_inline uint64_t gt(const T m) const { + const simd8 mask = simd8::splat(m); + 
return simd8x64( + this->chunks[0] > mask, + this->chunks[1] > mask, + this->chunks[2] > mask, + this->chunks[3] > mask + ).to_bitmask(); + } + simdutf_really_inline uint64_t gteq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] >= mask, + this->chunks[1] >= mask, + this->chunks[2] >= mask, + this->chunks[3] >= mask + ).to_bitmask(); + } + simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + simd8(uint8x16_t(this->chunks[0])) >= mask, + simd8(uint8x16_t(this->chunks[1])) >= mask, + simd8(uint8x16_t(this->chunks[2])) >= mask, + simd8(uint8x16_t(this->chunks[3])) >= mask + ).to_bitmask(); + } + }; // struct simd8x64 +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/arm64/simd16-inl.h +/* begin file src/simdutf/arm64/simd16-inl.h */ +template +struct simd16; + + template> + struct base_u16 { + uint16x8_t value; + static const int SIZE = sizeof(value); + + // Conversion from/to SIMD register + simdutf_really_inline base_u16() = default; + simdutf_really_inline base_u16(const uint16x8_t _value) : value(_value) {} + simdutf_really_inline operator const uint16x8_t&() const { return this->value; } + simdutf_really_inline operator uint16x8_t&() { return this->value; } + // Bit operations + simdutf_really_inline simd16 operator|(const simd16 other) const { return vorrq_u16(*this, other); } + simdutf_really_inline simd16 operator&(const simd16 other) const { return vandq_u16(*this, other); } + simdutf_really_inline simd16 operator^(const simd16 other) const { return veorq_u16(*this, other); } + simdutf_really_inline simd16 bit_andnot(const simd16 other) const { return vbicq_u16(*this, other); } + simdutf_really_inline simd16 operator~() const { return *this ^ 0xFFu; } + simdutf_really_inline simd16& operator|=(const simd16 other) { auto this_cast = static_cast*>(this); *this_cast = *this_cast | other; return *this_cast; } + simdutf_really_inline simd16& operator&=(const simd16 other) { auto this_cast = static_cast*>(this); *this_cast = *this_cast & other; return *this_cast; } + simdutf_really_inline simd16& operator^=(const simd16 other) { auto this_cast = static_cast*>(this); *this_cast = *this_cast ^ other; return *this_cast; } + + simdutf_really_inline Mask operator==(const simd16 other) const { return vceqq_u16(*this, other); } + + template + simdutf_really_inline simd16 prev(const simd16 prev_chunk) const { + return vextq_u18(prev_chunk, *this, 8 - N); + } + }; + +template> +struct base16: base_u16 { + typedef uint16_t bitmask_t; + typedef uint32_t bitmask2_t; + + simdutf_really_inline base16() : base_u16() {} + simdutf_really_inline base16(const uint16x8_t _value) : base_u16(_value) {} + template + simdutf_really_inline base16(const Pointer* ptr) : base16(vld1q_u16(ptr)) {} + + simdutf_really_inline Mask operator==(const simd16 other) const { return vceqq_u16(*this, other); } + + static const int SIZE = sizeof(base_u16::value); + + template + simdutf_really_inline simd16 prev(const simd16 prev_chunk) const { + return vextq_u18(prev_chunk, *this, 8 - N); + } +}; + +// SIMD byte mask type (returned by things like eq and gt) +template<> +struct simd16: base16 { + static simdutf_really_inline simd16 splat(bool _value) { return vmovq_n_u16(uint16_t(-(!!_value))); } + + simdutf_really_inline simd16() : base16() {} + simdutf_really_inline simd16(const uint16x8_t _value) : base16(_value) {} + // Splat constructor + simdutf_really_inline 
simd16(bool _value) : base16(splat(_value)) {} + +}; + +template +struct base16_numeric: base16 { + static simdutf_really_inline simd16 splat(T _value) { return vmovq_n_u16(_value); } + static simdutf_really_inline simd16 zero() { return vdupq_n_u16(0); } + static simdutf_really_inline simd16 load(const T values[8]) { + return vld1q_u16(reinterpret_cast(values)); + } + + simdutf_really_inline base16_numeric() : base16() {} + simdutf_really_inline base16_numeric(const uint16x8_t _value) : base16(_value) {} + + // Store to array + simdutf_really_inline void store(T dst[8]) const { return vst1q_u16(dst, *this); } + + // Override to distinguish from bool version + simdutf_really_inline simd16 operator~() const { return *this ^ 0xFFu; } + + // Addition/subtraction are the same for signed and unsigned + simdutf_really_inline simd16 operator+(const simd16 other) const { return vaddq_u8(*this, other); } + simdutf_really_inline simd16 operator-(const simd16 other) const { return vsubq_u8(*this, other); } + simdutf_really_inline simd16& operator+=(const simd16 other) { *this = *this + other; return *static_cast*>(this); } + simdutf_really_inline simd16& operator-=(const simd16 other) { *this = *this - other; return *static_cast*>(this); } +}; + +// Signed words +template<> +struct simd16 : base16_numeric { + simdutf_really_inline simd16() : base16_numeric() {} +#ifndef SIMDUTF_REGULAR_VISUAL_STUDIO + simdutf_really_inline simd16(const uint16x8_t _value) : base16_numeric(_value) {} +#endif + simdutf_really_inline simd16(const int16x8_t _value) : base16_numeric(vreinterpretq_u16_s16(_value)) {} + + // Splat constructor + simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {} + // Array constructor + simdutf_really_inline simd16(const int16_t* values) : simd16(load(values)) {} + simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast(values))) {} + simdutf_really_inline operator simd16() const; + simdutf_really_inline operator const uint16x8_t&() const { return this->value; } + simdutf_really_inline operator const int16x8_t() const { return vreinterpretq_s16_u16(this->value); } + + simdutf_really_inline int16_t max_val() const { return vmaxvq_s16(vreinterpretq_s16_u16(this->value)); } + simdutf_really_inline int16_t min_val() const { return vminvq_s16(vreinterpretq_s16_u16(this->value)); } + // Order-sensitive comparisons + simdutf_really_inline simd16 max_val(const simd16 other) const { return vmaxq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); } + simdutf_really_inline simd16 min_val(const simd16 other) const { return vmaxq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); } + simdutf_really_inline simd16 operator>(const simd16 other) const { return vcgtq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); } + simdutf_really_inline simd16 operator<(const simd16 other) const { return vcltq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); } +}; + + + + +// Unsigned words +template<> +struct simd16: base16_numeric { + simdutf_really_inline simd16() : base16_numeric() {} + simdutf_really_inline simd16(const uint16x8_t _value) : base16_numeric(_value) {} + + // Splat constructor + simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {} + // Array constructor + simdutf_really_inline simd16(const uint16_t* values) : simd16(load(values)) {} + simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast(values))) {} 
+ + + simdutf_really_inline int16_t max_val() const { return vmaxvq_u16(*this); } + simdutf_really_inline int16_t min_val() const { return vminvq_u16(*this); } + // Saturated math + simdutf_really_inline simd16 saturating_add(const simd16 other) const { return vqaddq_u16(*this, other); } + simdutf_really_inline simd16 saturating_sub(const simd16 other) const { return vqsubq_u16(*this, other); } + + // Order-specific operations + simdutf_really_inline simd16 max_val(const simd16 other) const { return vmaxq_u16(*this, other); } + simdutf_really_inline simd16 min_val(const simd16 other) const { return vminq_u16(*this, other); } + // Same as >, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd16 gt_bits(const simd16 other) const { return this->saturating_sub(other); } + // Same as <, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd16 lt_bits(const simd16 other) const { return other.saturating_sub(*this); } + simdutf_really_inline simd16 operator<=(const simd16 other) const { return vcleq_u16(*this, other); } + simdutf_really_inline simd16 operator>=(const simd16 other) const { return vcgeq_u16(*this, other); } + simdutf_really_inline simd16 operator>(const simd16 other) const { return vcgtq_u16(*this, other); } + simdutf_really_inline simd16 operator<(const simd16 other) const { return vcltq_u16(*this, other); } + + // Bit-specific operations + simdutf_really_inline simd16 bits_not_set() const { return *this == uint16_t(0); } + template + simdutf_really_inline simd16 shr() const { return simd16(vshrq_n_u16(*this, N)); } + template + simdutf_really_inline simd16 shl() const { return simd16(vshlq_n_u16(*this, N)); } + + // logical operations + simdutf_really_inline simd16 operator|(const simd16 other) const { return vorrq_u16(*this, other); } + simdutf_really_inline simd16 operator&(const simd16 other) const { return vandq_u16(*this, other); } + simdutf_really_inline simd16 operator^(const simd16 other) const { return veorq_u16(*this, other); } + + // Pack with the unsigned saturation two uint16_t words into single uint8_t vector + static simdutf_really_inline simd8 pack(const simd16& v0, const simd16& v1) { + return vqmovn_high_u16(vqmovn_u16(v0), v1); + } +}; +simdutf_really_inline simd16::operator simd16() const { return this->value; } + + + template + struct simd16x32 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd16); + static_assert(NUM_CHUNKS == 4, "ARM kernel should use four registers per 64-byte block."); + const simd16 chunks[NUM_CHUNKS]; + + simd16x32(const simd16x32& o) = delete; // no copy allowed + simd16x32& operator=(const simd16 other) = delete; // no assignment allowed + simd16x32() = delete; // no default constructor allowed + + simdutf_really_inline simd16x32(const simd16 chunk0, const simd16 chunk1, const simd16 chunk2, const simd16 chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {} + simdutf_really_inline simd16x32(const T* ptr) : chunks{simd16::load(ptr), simd16::load(ptr+sizeof(simd16)/sizeof(T)), simd16::load(ptr+2*sizeof(simd16)/sizeof(T)), simd16::load(ptr+3*sizeof(simd16)/sizeof(T))} {} + + simdutf_really_inline void store(T* ptr) const { + this->chunks[0].store(ptr+sizeof(simd16)*0/sizeof(T)); + this->chunks[1].store(ptr+sizeof(simd16)*1/sizeof(T)); + this->chunks[2].store(ptr+sizeof(simd16)*2/sizeof(T)); + this->chunks[3].store(ptr+sizeof(simd16)*3/sizeof(T)); + } + + simdutf_really_inline simd16 reduce_or() const { + return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] 
| this->chunks[3]); + } + + simdutf_really_inline bool is_ascii() const { + return reduce_or().is_ascii(); + } + + simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const { + this->chunks[0].store_ascii_as_utf16(ptr+sizeof(simd16)*0); + this->chunks[1].store_ascii_as_utf16(ptr+sizeof(simd16)*1); + this->chunks[2].store_ascii_as_utf16(ptr+sizeof(simd16)*2); + this->chunks[3].store_ascii_as_utf16(ptr+sizeof(simd16)*3); + } + + simdutf_really_inline uint64_t to_bitmask() const { +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint8x16_t bit_mask = make_uint8x16_t( + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 + ); +#else + const uint8x16_t bit_mask = { + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 + }; +#endif + // Add each of the elements next to each other, successively, to stuff each 8 byte mask into one. + uint8x16_t sum0 = vpaddq_u8(vreinterpretq_u8_u16(this->chunks[0] & vreinterpretq_u16_u8(bit_mask)), vreinterpretq_u8_u16(this->chunks[1] & vreinterpretq_u16_u8(bit_mask))); + uint8x16_t sum1 = vpaddq_u8(vreinterpretq_u8_u16(this->chunks[2] & vreinterpretq_u16_u8(bit_mask)), vreinterpretq_u8_u16(this->chunks[3] & vreinterpretq_u16_u8(bit_mask))); + sum0 = vpaddq_u8(sum0, sum1); + sum0 = vpaddq_u8(sum0, sum0); + return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0); + } + + simdutf_really_inline uint64_t eq(const T m) const { + const simd16 mask = simd16::splat(m); + return simd16x32( + this->chunks[0] == mask, + this->chunks[1] == mask, + this->chunks[2] == mask, + this->chunks[3] == mask + ).to_bitmask(); + } + + simdutf_really_inline uint64_t lteq(const T m) const { + const simd16 mask = simd16::splat(m); + return simd16x32( + this->chunks[0] <= mask, + this->chunks[1] <= mask, + this->chunks[2] <= mask, + this->chunks[3] <= mask + ).to_bitmask(); + } + + simdutf_really_inline uint64_t in_range(const T low, const T high) const { + const simd16 mask_low = simd16::splat(low); + const simd16 mask_high = simd16::splat(high); + + return simd16x32( + (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low), + (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low), + (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low), + (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low) + ).to_bitmask(); + } + simdutf_really_inline uint64_t not_in_range(const T low, const T high) const { + const simd16 mask_low = simd16::splat(low); + const simd16 mask_high = simd16::splat(high); + return simd16x32( + (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low), + (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low), + (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low), + (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low) + ).to_bitmask(); + } + simdutf_really_inline uint64_t lt(const T m) const { + const simd16 mask = simd16::splat(m); + return simd16x32( + this->chunks[0] < mask, + this->chunks[1] < mask, + this->chunks[2] < mask, + this->chunks[3] < mask + ).to_bitmask(); + } + + }; // struct simd16x32 + template<> + simdutf_really_inline uint64_t simd16x32::not_in_range(const uint16_t low, const uint16_t high) const { + const simd16 mask_low = simd16::splat(low); + const simd16 mask_high = simd16::splat(high); + simd16x32 x( + simd16((this->chunks[0] > mask_high) | (this->chunks[0] < mask_low)), + simd16((this->chunks[1] > mask_high) | (this->chunks[1] < mask_low)), + simd16((this->chunks[2] > mask_high) | (this->chunks[2] < 
mask_low)), + simd16((this->chunks[3] > mask_high) | (this->chunks[3] < mask_low)) + ); + return x.to_bitmask(); + } +/* end file src/simdutf/arm64/simd16-inl.h */ +} // namespace simd +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf + +#endif // SIMDUTF_ARM64_SIMD_H +/* end file src/simdutf/arm64/simd.h */ + +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/arm64/end.h +/* begin file src/simdutf/arm64/end.h */ +/* end file src/simdutf/arm64/end.h */ + +#endif // SIMDUTF_IMPLEMENTATION_ARM64 + +#endif // SIMDUTF_ARM64_H +/* end file src/simdutf/arm64.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/haswell.h +/* begin file src/simdutf/haswell.h */ +#ifndef SIMDUTF_HASWELL_H +#define SIMDUTF_HASWELL_H + +#ifdef SIMDUTF_WESTMERE_H +#error "haswell.h must be included before westmere.h" +#endif +#ifdef SIMDUTF_FALLBACK_H +#error "haswell.h must be included before fallback.h" +#endif + + +// Default Haswell to on if this is x86-64. Even if we're not compiled for it, it could be selected +// at runtime. +#ifndef SIMDUTF_IMPLEMENTATION_HASWELL +// +// You do not want to restrict it like so: SIMDUTF_IS_X86_64 && __AVX2__ +// because we want to rely on *runtime dispatch*. +// +#define SIMDUTF_IMPLEMENTATION_HASWELL (SIMDUTF_IS_X86_64) +#endif +// To see why (__BMI__) && (__PCLMUL__) && (__LZCNT__) are not part of this next line, see +// https://github.com/simdutf/simdutf/issues/1247 +#define SIMDUTF_CAN_ALWAYS_RUN_HASWELL ((SIMDUTF_IMPLEMENTATION_HASWELL) && (SIMDUTF_IS_X86_64) && (__AVX2__)) + +#if SIMDUTF_IMPLEMENTATION_HASWELL + +#define SIMDUTF_TARGET_HASWELL SIMDUTF_TARGET_REGION("avx2,bmi,pclmul,lzcnt") + +namespace simdutf { +/** + * Implementation for Haswell (Intel AVX2). 
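+ *
+ * Editor's note, not upstream text: SIMDUTF_IMPLEMENTATION_HASWELL above is
+ * deliberately left on for any x86-64 build; whether this kernel actually
+ * runs is decided at runtime, roughly (hypothetical names):
+ *
+ *   if (cpuid_reports_avx2_bmi_pclmul_lzcnt())
+ *       pick the haswell kernel;   // compiled via SIMDUTF_TARGET_HASWELL
+ *   else
+ *       pick westmere or the portable fallback;
+ *
+ * which is why the guard is not tied to __AVX2__ for the whole build.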
+ */ +namespace haswell { +} // namespace haswell +} // namespace simdutf + +// +// These two need to be included outside SIMDUTF_TARGET_REGION +// +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/haswell/implementation.h +/* begin file src/simdutf/haswell/implementation.h */ +#ifndef SIMDUTF_HASWELL_IMPLEMENTATION_H +#define SIMDUTF_HASWELL_IMPLEMENTATION_H + + +// The constructor may be executed on any host, so we take care not to use SIMDUTF_TARGET_REGION +namespace simdutf { +namespace haswell { + +using namespace simdutf; + +class implementation final : public simdutf::implementation { +public: + simdutf_really_inline implementation() : simdutf::implementation( + "haswell", + "Intel/AMD AVX2", + internal::instruction_set::AVX2 | internal::instruction_set::PCLMULQDQ | internal::instruction_set::BMI1 | internal::instruction_set::BMI2 + ) {} + simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final; + simdutf_warn_unused bool validate_utf16(const char16_t *buf, size_t len) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_utf16(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf16(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t count_utf16(const char16_t * buf, size_t length) const noexcept; + simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept; + simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept; +}; + +} // namespace haswell +} // namespace simdutf + +#endif // SIMDUTF_HASWELL_IMPLEMENTATION_H +/* end file src/simdutf/haswell/implementation.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/haswell/intrinsics.h +/* begin file src/simdutf/haswell/intrinsics.h */ +#ifndef SIMDUTF_HASWELL_INTRINSICS_H +#define SIMDUTF_HASWELL_INTRINSICS_H + + +#ifdef SIMDUTF_VISUAL_STUDIO +// under clang within visual studio, this will include +#include // visual studio or clang +#else +#include // elsewhere +#endif // SIMDUTF_VISUAL_STUDIO + +#ifdef SIMDUTF_CLANG_VISUAL_STUDIO +/** + * You are not supposed, normally, to include these + * headers directly. Instead you should either include intrin.h + * or x86intrin.h. However, when compiling with clang + * under Windows (i.e., when _MSC_VER is set), these headers + * only get included *if* the corresponding features are detected + * from macros: + * e.g., if __AVX2__ is set... in turn, we normally set these + * macros by compiling against the corresponding architecture + * (e.g., arch:AVX2, -mavx2, etc.) which compiles the whole + * software with these advanced instructions. In simdutf, we + * want to compile the whole program for a generic target, + * and only target our specific kernels. As a workaround, + * we directly include the needed headers. These headers would + * normally guard against such usage, but we carefully included + * (or ) before, so the headers + * are fooled. 
+ */ +#include // for _blsr_u64 +#include // for __lzcnt64 +#include // for most things (AVX2, AVX512, _popcnt64) +#include +#include +#include +#include +#include // for _mm_clmulepi64_si128 +// unfortunately, we may not get _blsr_u64, but, thankfully, clang +// has it as a macro. +#ifndef _blsr_u64 +// we roll our own +SIMDUTF_TARGET_HASWELL +static simdutf_really_inline uint64_t _blsr_u64(uint64_t n) { + return (n - 1) & n; +} +SIMDUTF_UNTARGET_REGION +#endif // _blsr_u64 +#endif // SIMDUTF_CLANG_VISUAL_STUDIO + +#endif // SIMDUTF_HASWELL_INTRINSICS_H +/* end file src/simdutf/haswell/intrinsics.h */ + +// +// The rest need to be inside the region +// +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/haswell/begin.h +/* begin file src/simdutf/haswell/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "haswell" +// #define SIMDUTF_IMPLEMENTATION haswell +SIMDUTF_TARGET_HASWELL +/* end file src/simdutf/haswell/begin.h */ + +// Declarations +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/haswell/bitmanipulation.h +/* begin file src/simdutf/haswell/bitmanipulation.h */ +#ifndef SIMDUTF_HASWELL_BITMANIPULATION_H +#define SIMDUTF_HASWELL_BITMANIPULATION_H + +namespace simdutf { +namespace haswell { +namespace { + +// We sometimes call trailing_zero on inputs that are zero, +// but the algorithms do not end up using the returned value. +// Sadly, sanitizers are not smart enough to figure it out. +NO_SANITIZE_UNDEFINED +simdutf_really_inline int trailing_zeroes(uint64_t input_num) { +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + return (int)_tzcnt_u64(input_num); +#else // SIMDUTF_REGULAR_VISUAL_STUDIO + //////// + // You might expect the next line to be equivalent to + // return (int)_tzcnt_u64(input_num); + // but the generated code differs and might be less efficient? + //////// + return __builtin_ctzll(input_num); +#endif // SIMDUTF_REGULAR_VISUAL_STUDIO +} + +/* result might be undefined when input_num is zero */ +simdutf_really_inline uint64_t clear_lowest_bit(uint64_t input_num) { + return _blsr_u64(input_num); +} + +/* result might be undefined when input_num is zero */ +simdutf_really_inline int leading_zeroes(uint64_t input_num) { + return int(_lzcnt_u64(input_num)); +} + +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO +simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num) { + // note: we do not support legacy 32-bit Windows + return __popcnt64(input_num);// Visual Studio wants two underscores +} +#else +simdutf_really_inline long long int count_ones(uint64_t input_num) { + return _popcnt64(input_num); +} +#endif + +simdutf_really_inline bool add_overflow(uint64_t value1, uint64_t value2, + uint64_t *result) { +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + return _addcarry_u64(0, value1, value2, + reinterpret_cast(result)); +#else + return __builtin_uaddll_overflow(value1, value2, + reinterpret_cast(result)); +#endif +} + +} // unnamed namespace +} // namespace haswell +} // namespace simdutf + +#endif // SIMDUTF_HASWELL_BITMANIPULATION_H +/* end file src/simdutf/haswell/bitmanipulation.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/haswell/bitmask.h +/* begin file src/simdutf/haswell/bitmask.h */ +#ifndef SIMDUTF_HASWELL_BITMASK_H +#define SIMDUTF_HASWELL_BITMASK_H + +namespace simdutf { +namespace haswell { +namespace { + +// +// Perform a "cumulative bitwise xor," flipping bits each time a 1 is encountered. 
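+// (Editor's note, not upstream text: the AVX2 path below gets this with a
+// single carry-less multiply; multiplying the 64-bit mask by an all-ones
+// operand XORs together every left-shifted copy of the input, which is
+// exactly this running parity.)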
+// +// For example, prefix_xor(00100100) == 00011100 +// +simdutf_really_inline uint64_t prefix_xor(const uint64_t bitmask) { + // There should be no such thing with a processor supporting avx2 + // but not clmul. + __m128i all_ones = _mm_set1_epi8('\xFF'); + __m128i result = _mm_clmulepi64_si128(_mm_set_epi64x(0ULL, bitmask), all_ones, 0); + return _mm_cvtsi128_si64(result); +} + +} // unnamed namespace +} // namespace haswell +} // namespace simdutf + +#endif // SIMDUTF_HASWELL_BITMASK_H +/* end file src/simdutf/haswell/bitmask.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/haswell/simd.h +/* begin file src/simdutf/haswell/simd.h */ +#ifndef SIMDUTF_HASWELL_SIMD_H +#define SIMDUTF_HASWELL_SIMD_H + + +namespace simdutf { +namespace haswell { +namespace { +namespace simd { + + // Forward-declared so they can be used by splat and friends. + template + struct base { + __m256i value; + + // Zero constructor + simdutf_really_inline base() : value{__m256i()} {} + + // Conversion from SIMD register + simdutf_really_inline base(const __m256i _value) : value(_value) {} + // Conversion to SIMD register + simdutf_really_inline operator const __m256i&() const { return this->value; } + simdutf_really_inline operator __m256i&() { return this->value; } + simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const { + _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), _mm256_cvtepu8_epi16(_mm256_castsi256_si128(*this))); + _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr + 16), _mm256_cvtepu8_epi16(_mm256_extractf128_si256(*this,1))); + } + // Bit operations + simdutf_really_inline Child operator|(const Child other) const { return _mm256_or_si256(*this, other); } + simdutf_really_inline Child operator&(const Child other) const { return _mm256_and_si256(*this, other); } + simdutf_really_inline Child operator^(const Child other) const { return _mm256_xor_si256(*this, other); } + simdutf_really_inline Child bit_andnot(const Child other) const { return _mm256_andnot_si256(other, *this); } + simdutf_really_inline Child& operator|=(const Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast | other; return *this_cast; } + simdutf_really_inline Child& operator&=(const Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast & other; return *this_cast; } + simdutf_really_inline Child& operator^=(const Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast ^ other; return *this_cast; } + }; + + // Forward-declared so they can be used by splat and friends. 
+ template + struct simd8; + + template> + struct base8: base> { + typedef uint32_t bitmask_t; + typedef uint64_t bitmask2_t; + + simdutf_really_inline base8() : base>() {} + simdutf_really_inline base8(const __m256i _value) : base>(_value) {} + simdutf_really_inline T first() const { return _mm256_extract_epi8(*this,0); } + simdutf_really_inline T last() const { return _mm256_extract_epi8(*this,31); } + simdutf_really_inline Mask operator==(const simd8 other) const { return _mm256_cmpeq_epi8(*this, other); } + + static const int SIZE = sizeof(base::value); + + template + simdutf_really_inline simd8 prev(const simd8 prev_chunk) const { + return _mm256_alignr_epi8(*this, _mm256_permute2x128_si256(prev_chunk, *this, 0x21), 16 - N); + } + }; + + // SIMD byte mask type (returned by things like eq and gt) + template<> + struct simd8: base8 { + static simdutf_really_inline simd8 splat(bool _value) { return _mm256_set1_epi8(uint8_t(-(!!_value))); } + + simdutf_really_inline simd8() : base8() {} + simdutf_really_inline simd8(const __m256i _value) : base8(_value) {} + // Splat constructor + simdutf_really_inline simd8(bool _value) : base8(splat(_value)) {} + + simdutf_really_inline uint32_t to_bitmask() const { return uint32_t(_mm256_movemask_epi8(*this)); } + simdutf_really_inline bool any() const { return !_mm256_testz_si256(*this, *this); } + simdutf_really_inline bool none() const { return _mm256_testz_si256(*this, *this); } + simdutf_really_inline bool all() const { return static_cast(_mm256_movemask_epi8(*this)) == 0xFFFFFFFF; } + simdutf_really_inline simd8 operator~() const { return *this ^ true; } + }; + + template + struct base8_numeric: base8 { + static simdutf_really_inline simd8 splat(T _value) { return _mm256_set1_epi8(_value); } + static simdutf_really_inline simd8 zero() { return _mm256_setzero_si256(); } + static simdutf_really_inline simd8 load(const T values[32]) { + return _mm256_loadu_si256(reinterpret_cast(values)); + } + // Repeat 16 values as many times as necessary (usually for lookup tables) + static simdutf_really_inline simd8 repeat_16( + T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + + simdutf_really_inline base8_numeric() : base8() {} + simdutf_really_inline base8_numeric(const __m256i _value) : base8(_value) {} + + // Store to array + simdutf_really_inline void store(T dst[32]) const { return _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), *this); } + + // Addition/subtraction are the same for signed and unsigned + simdutf_really_inline simd8 operator+(const simd8 other) const { return _mm256_add_epi8(*this, other); } + simdutf_really_inline simd8 operator-(const simd8 other) const { return _mm256_sub_epi8(*this, other); } + simdutf_really_inline simd8& operator+=(const simd8 other) { *this = *this + other; return *static_cast*>(this); } + simdutf_really_inline simd8& operator-=(const simd8 other) { *this = *this - other; return *static_cast*>(this); } + + // Override to distinguish from bool version + simdutf_really_inline simd8 operator~() const { return *this ^ 0xFFu; } + + // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values) + template + simdutf_really_inline simd8 lookup_16(simd8 lookup_table) const { + return _mm256_shuffle_epi8(lookup_table, *this); + } + + template + 
simdutf_really_inline simd8 lookup_16( + L replace0, L replace1, L replace2, L replace3, + L replace4, L replace5, L replace6, L replace7, + L replace8, L replace9, L replace10, L replace11, + L replace12, L replace13, L replace14, L replace15) const { + return lookup_16(simd8::repeat_16( + replace0, replace1, replace2, replace3, + replace4, replace5, replace6, replace7, + replace8, replace9, replace10, replace11, + replace12, replace13, replace14, replace15 + )); + } + }; + + + // Signed bytes + template<> + struct simd8 : base8_numeric { + simdutf_really_inline simd8() : base8_numeric() {} + simdutf_really_inline simd8(const __m256i _value) : base8_numeric(_value) {} + + // Splat constructor + simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdutf_really_inline simd8(const int8_t values[32]) : simd8(load(values)) {} + simdutf_really_inline operator simd8() const; + // Member-by-member initialization + simdutf_really_inline simd8( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15, + int8_t v16, int8_t v17, int8_t v18, int8_t v19, int8_t v20, int8_t v21, int8_t v22, int8_t v23, + int8_t v24, int8_t v25, int8_t v26, int8_t v27, int8_t v28, int8_t v29, int8_t v30, int8_t v31 + ) : simd8(_mm256_setr_epi8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + v16,v17,v18,v19,v20,v21,v22,v23, + v24,v25,v26,v27,v28,v29,v30,v31 + )) {} + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdutf_really_inline static simd8 repeat_16( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + simdutf_really_inline bool is_ascii() const { return _mm256_movemask_epi8(*this) == 0; } + // Order-sensitive comparisons + simdutf_really_inline simd8 max_val(const simd8 other) const { return _mm256_max_epi8(*this, other); } + simdutf_really_inline simd8 min_val(const simd8 other) const { return _mm256_min_epi8(*this, other); } + simdutf_really_inline simd8 operator>(const simd8 other) const { return _mm256_cmpgt_epi8(*this, other); } + simdutf_really_inline simd8 operator<(const simd8 other) const { return _mm256_cmpgt_epi8(other, *this); } + }; + + // Unsigned bytes + template<> + struct simd8: base8_numeric { + simdutf_really_inline simd8() : base8_numeric() {} + simdutf_really_inline simd8(const __m256i _value) : base8_numeric(_value) {} + // Splat constructor + simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdutf_really_inline simd8(const uint8_t values[32]) : simd8(load(values)) {} + // Member-by-member initialization + simdutf_really_inline simd8( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15, + uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20, uint8_t v21, uint8_t v22, uint8_t v23, + uint8_t v24, uint8_t v25, uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30, uint8_t v31 + ) : simd8(_mm256_setr_epi8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, 
v10,v11,v12,v13,v14,v15, + v16,v17,v18,v19,v20,v21,v22,v23, + v24,v25,v26,v27,v28,v29,v30,v31 + )) {} + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdutf_really_inline static simd8 repeat_16( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + + + // Saturated math + simdutf_really_inline simd8 saturating_add(const simd8 other) const { return _mm256_adds_epu8(*this, other); } + simdutf_really_inline simd8 saturating_sub(const simd8 other) const { return _mm256_subs_epu8(*this, other); } + + // Order-specific operations + simdutf_really_inline simd8 max_val(const simd8 other) const { return _mm256_max_epu8(*this, other); } + simdutf_really_inline simd8 min_val(const simd8 other) const { return _mm256_min_epu8(other, *this); } + // Same as >, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd8 gt_bits(const simd8 other) const { return this->saturating_sub(other); } + // Same as <, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd8 lt_bits(const simd8 other) const { return other.saturating_sub(*this); } + simdutf_really_inline simd8 operator<=(const simd8 other) const { return other.max_val(*this) == other; } + simdutf_really_inline simd8 operator>=(const simd8 other) const { return other.min_val(*this) == other; } + simdutf_really_inline simd8 operator>(const simd8 other) const { return this->gt_bits(other).any_bits_set(); } + simdutf_really_inline simd8 operator<(const simd8 other) const { return this->lt_bits(other).any_bits_set(); } + + // Bit-specific operations + simdutf_really_inline simd8 bits_not_set() const { return *this == uint8_t(0); } + simdutf_really_inline simd8 bits_not_set(simd8 bits) const { return (*this & bits).bits_not_set(); } + simdutf_really_inline simd8 any_bits_set() const { return ~this->bits_not_set(); } + simdutf_really_inline simd8 any_bits_set(simd8 bits) const { return ~this->bits_not_set(bits); } + simdutf_really_inline bool is_ascii() const { return _mm256_movemask_epi8(*this) == 0; } + simdutf_really_inline bool bits_not_set_anywhere() const { return _mm256_testz_si256(*this, *this); } + simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); } + simdutf_really_inline bool bits_not_set_anywhere(simd8 bits) const { return _mm256_testz_si256(*this, bits); } + simdutf_really_inline bool any_bits_set_anywhere(simd8 bits) const { return !bits_not_set_anywhere(bits); } + template + simdutf_really_inline simd8 shr() const { return simd8(_mm256_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); } + template + simdutf_really_inline simd8 shl() const { return simd8(_mm256_slli_epi16(*this, N)) & uint8_t(0xFFu << N); } + // Get one of the bits and make a bitmask out of it. + // e.g. 
value.get_bit<7>() gets the high bit + template + simdutf_really_inline int get_bit() const { return _mm256_movemask_epi8(_mm256_slli_epi16(*this, 7-N)); } + }; + simdutf_really_inline simd8::operator simd8() const { return this->value; } + + + template + struct simd8x64 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd8); + static_assert(NUM_CHUNKS == 2, "Haswell kernel should use two registers per 64-byte block."); + const simd8 chunks[NUM_CHUNKS]; + + simd8x64(const simd8x64& o) = delete; // no copy allowed + simd8x64& operator=(const simd8 other) = delete; // no assignment allowed + simd8x64() = delete; // no default constructor allowed + + simdutf_really_inline simd8x64(const simd8 chunk0, const simd8 chunk1) : chunks{chunk0, chunk1} {} + simdutf_really_inline simd8x64(const T* ptr) : chunks{simd8::load(ptr), simd8::load(ptr+sizeof(simd8)/sizeof(T))} {} + + simdutf_really_inline void store(T* ptr) const { + this->chunks[0].store(ptr+sizeof(simd8)*0/sizeof(T)); + this->chunks[1].store(ptr+sizeof(simd8)*1/sizeof(T)); + } + + simdutf_really_inline uint64_t to_bitmask() const { + uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask()); + uint64_t r_hi = this->chunks[1].to_bitmask(); + return r_lo | (r_hi << 32); + } + + simdutf_really_inline simd8 reduce_or() const { + return this->chunks[0] | this->chunks[1]; + } + + simdutf_really_inline bool is_ascii() const { + return this->reduce_or().is_ascii(); + } + + simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const { + this->chunks[0].store_ascii_as_utf16(ptr+sizeof(simd8)*0); + this->chunks[1].store_ascii_as_utf16(ptr+sizeof(simd8)); + } + + simdutf_really_inline simd8x64 bit_or(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] | mask, + this->chunks[1] | mask + ); + } + + simdutf_really_inline uint64_t eq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] == mask, + this->chunks[1] == mask + ).to_bitmask(); + } + + simdutf_really_inline uint64_t eq(const simd8x64 &other) const { + return simd8x64( + this->chunks[0] == other.chunks[0], + this->chunks[1] == other.chunks[1] + ).to_bitmask(); + } + + simdutf_really_inline uint64_t lteq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] <= mask, + this->chunks[1] <= mask + ).to_bitmask(); + } + + simdutf_really_inline uint64_t in_range(const T low, const T high) const { + const simd8 mask_low = simd8::splat(low); + const simd8 mask_high = simd8::splat(high); + + return simd8x64( + (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low), + (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low), + (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low), + (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low) + ).to_bitmask(); + } + simdutf_really_inline uint64_t not_in_range(const T low, const T high) const { + const simd8 mask_low = simd8::splat(low); + const simd8 mask_high = simd8::splat(high); + return simd8x64( + (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low), + (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low) + ).to_bitmask(); + } + simdutf_really_inline uint64_t lt(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] < mask, + this->chunks[1] < mask + ).to_bitmask(); + } + + simdutf_really_inline uint64_t gt(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] > mask, + this->chunks[1] > mask + 
).to_bitmask(); + } + simdutf_really_inline uint64_t gteq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] >= mask, + this->chunks[1] >= mask + ).to_bitmask(); + } + simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + (simd8(__m256i(this->chunks[0])) >= mask), + (simd8(__m256i(this->chunks[1])) >= mask) + ).to_bitmask(); + } + }; // struct simd8x64 + +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/haswell/simd16-inl.h +/* begin file src/simdutf/haswell/simd16-inl.h */ +#ifdef __GNUC__ +#if __GNUC__ < 8 +#define _mm256_set_m128i(xmm1, xmm2) _mm256_permute2f128_si256(_mm256_castsi128_si256(xmm1), _mm256_castsi128_si256(xmm2), 2) +#define _mm256_setr_m128i(xmm2, xmm1) _mm256_permute2f128_si256(_mm256_castsi128_si256(xmm1), _mm256_castsi128_si256(xmm2), 2) +#endif +#endif + +template +struct simd16; + +template> +struct base16: base> { + using bitmask_type = uint32_t; + + simdutf_really_inline base16() : base>() {} + simdutf_really_inline base16(const __m256i _value) : base>(_value) {} + template + simdutf_really_inline base16(const Pointer* ptr) : base16(_mm256_loadu_si256(reinterpret_cast(ptr))) {} + + simdutf_really_inline Mask operator==(const simd16 other) const { return _mm256_cmpeq_epi16(*this, other); } + + /// the size of vector in bytes + static const int SIZE = sizeof(base>::value); + + /// the number of elements of type T a vector can hold + static const int ELEMENTS = SIZE / sizeof(T); + + template + simdutf_really_inline simd16 prev(const simd16 prev_chunk) const { + return _mm256_alignr_epi8(*this, prev_chunk, 16 - N); + } +}; + +// SIMD byte mask type (returned by things like eq and gt) +template<> +struct simd16: base16 { + static simdutf_really_inline simd16 splat(bool _value) { return _mm256_set1_epi16(uint16_t(-(!!_value))); } + + simdutf_really_inline simd16() : base16() {} + simdutf_really_inline simd16(const __m256i _value) : base16(_value) {} + // Splat constructor + simdutf_really_inline simd16(bool _value) : base16(splat(_value)) {} + + simdutf_really_inline bitmask_type to_bitmask() const { return _mm256_movemask_epi8(*this); } + simdutf_really_inline bool any() const { return !_mm256_testz_si256(*this, *this); } + simdutf_really_inline simd16 operator~() const { return *this ^ true; } +}; + +template +struct base16_numeric: base16 { + static simdutf_really_inline simd16 splat(T _value) { return _mm256_set1_epi16(_value); } + static simdutf_really_inline simd16 zero() { return _mm256_setzero_si256(); } + static simdutf_really_inline simd16 load(const T values[8]) { + return _mm256_loadu_si256(reinterpret_cast(values)); + } + + simdutf_really_inline base16_numeric() : base16() {} + simdutf_really_inline base16_numeric(const __m256i _value) : base16(_value) {} + + // Store to array + simdutf_really_inline void store(T dst[8]) const { return _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), *this); } + + // Override to distinguish from bool version + simdutf_really_inline simd16 operator~() const { return *this ^ 0xFFFFu; } + + // Addition/subtraction are the same for signed and unsigned + simdutf_really_inline simd16 operator+(const simd16 other) const { return _mm256_add_epi16(*this, other); } + simdutf_really_inline simd16 operator-(const simd16 other) const { return _mm256_sub_epi16(*this, other); } + simdutf_really_inline simd16& operator+=(const simd16 other) { *this = *this + other; return 
*static_cast*>(this); } + simdutf_really_inline simd16& operator-=(const simd16 other) { *this = *this - other; return *static_cast*>(this); } +}; + +// Signed words +template<> +struct simd16 : base16_numeric { + simdutf_really_inline simd16() : base16_numeric() {} + simdutf_really_inline simd16(const __m256i _value) : base16_numeric(_value) {} + // Splat constructor + simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {} + // Array constructor + simdutf_really_inline simd16(const int16_t* values) : simd16(load(values)) {} + simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast(values))) {} + // Order-sensitive comparisons + simdutf_really_inline simd16 max_val(const simd16 other) const { return _mm256_max_epi16(*this, other); } + simdutf_really_inline simd16 min_val(const simd16 other) const { return _mm256_min_epi16(*this, other); } + simdutf_really_inline simd16 operator>(const simd16 other) const { return _mm256_cmpgt_epi16(*this, other); } + simdutf_really_inline simd16 operator<(const simd16 other) const { return _mm256_cmpgt_epi16(other, *this); } +}; + +// Unsigned words +template<> +struct simd16: base16_numeric { + simdutf_really_inline simd16() : base16_numeric() {} + simdutf_really_inline simd16(const __m256i _value) : base16_numeric(_value) {} + + // Splat constructor + simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {} + // Array constructor + simdutf_really_inline simd16(const uint16_t* values) : simd16(load(values)) {} + simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast(values))) {} + + // Saturated math + simdutf_really_inline simd16 saturating_add(const simd16 other) const { return _mm256_adds_epu16(*this, other); } + simdutf_really_inline simd16 saturating_sub(const simd16 other) const { return _mm256_subs_epu16(*this, other); } + + // Order-specific operations + simdutf_really_inline simd16 max_val(const simd16 other) const { return _mm256_max_epu16(*this, other); } + simdutf_really_inline simd16 min_val(const simd16 other) const { return _mm256_min_epu16(*this, other); } + // Same as >, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd16 gt_bits(const simd16 other) const { return this->saturating_sub(other); } + // Same as <, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd16 lt_bits(const simd16 other) const { return other.saturating_sub(*this); } + simdutf_really_inline simd16 operator<=(const simd16 other) const { return other.max_val(*this) == other; } + simdutf_really_inline simd16 operator>=(const simd16 other) const { return other.min_val(*this) == other; } + simdutf_really_inline simd16 operator>(const simd16 other) const { return this->gt_bits(other).any_bits_set(); } + simdutf_really_inline simd16 operator<(const simd16 other) const { return this->gt_bits(other).any_bits_set(); } + + // Bit-specific operations + simdutf_really_inline simd16 bits_not_set() const { return *this == uint16_t(0); } + simdutf_really_inline simd16 bits_not_set(simd16 bits) const { return (*this & bits).bits_not_set(); } + simdutf_really_inline simd16 any_bits_set() const { return ~this->bits_not_set(); } + simdutf_really_inline simd16 any_bits_set(simd16 bits) const { return ~this->bits_not_set(bits); } + + simdutf_really_inline bool bits_not_set_anywhere() const { return _mm256_testz_si256(*this, *this); } + simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); 
} + simdutf_really_inline bool bits_not_set_anywhere(simd16 bits) const { return _mm256_testz_si256(*this, bits); } + simdutf_really_inline bool any_bits_set_anywhere(simd16 bits) const { return !bits_not_set_anywhere(bits); } + template + simdutf_really_inline simd16 shr() const { return simd16(_mm256_srli_epi16(*this, N)); } + template + simdutf_really_inline simd16 shl() const { return simd16(_mm256_slli_epi16(*this, N)); } + // Get one of the bits and make a bitmask out of it. + // e.g. value.get_bit<7>() gets the high bit + template + simdutf_really_inline int get_bit() const { return _mm256_movemask_epi8(_mm256_slli_epi16(*this, 15-N)); } + + // Pack with the unsigned saturation two uint16_t words into single uint8_t vector + static simdutf_really_inline simd8 pack(const simd16& v0, const simd16& v1) { + // Note: the AVX2 variant of pack operates on 128-bit lanes, thus + // we have to shuffle lanes in order to produce bytes in the + // correct order. + + // get the 0th lanes + const __m128i lo_0 = _mm256_extracti128_si256(v0, 0); + const __m128i lo_1 = _mm256_extracti128_si256(v1, 0); + + // get the 1st lanes + const __m128i hi_0 = _mm256_extracti128_si256(v0, 1); + const __m128i hi_1 = _mm256_extracti128_si256(v1, 1); + + // build new vectors (shuffle lanes) + const __m256i t0 = _mm256_set_m128i(lo_1, lo_0); + const __m256i t1 = _mm256_set_m128i(hi_1, hi_0); + + // pack words in linear order from v0 and v1 + return _mm256_packus_epi16(t0, t1); + } +}; + + + template + struct simd16x32 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd16); + static_assert(NUM_CHUNKS == 2, "Haswell kernel should use two registers per 64-byte block."); + const simd16 chunks[NUM_CHUNKS]; + + simd16x32(const simd16x32& o) = delete; // no copy allowed + simd16x32& operator=(const simd16 other) = delete; // no assignment allowed + simd16x32() = delete; // no default constructor allowed + + simdutf_really_inline simd16x32(const simd16 chunk0, const simd16 chunk1) : chunks{chunk0, chunk1} {} + simdutf_really_inline simd16x32(const T* ptr) : chunks{simd16::load(ptr), simd16::load(ptr+sizeof(simd16)/sizeof(T))} {} + + simdutf_really_inline void store(T* ptr) const { + this->chunks[0].store(ptr+sizeof(simd16)*0/sizeof(T)); + this->chunks[1].store(ptr+sizeof(simd16)*1/sizeof(T)); + } + + simdutf_really_inline uint64_t to_bitmask() const { + uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask()); + uint64_t r_hi = this->chunks[1].to_bitmask(); + return r_lo | (r_hi << 32); + } + + simdutf_really_inline simd16 reduce_or() const { + return this->chunks[0] | this->chunks[1]; + } + + simdutf_really_inline bool is_ascii() const { + return this->reduce_or().is_ascii(); + } + + simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const { + this->chunks[0].store_ascii_as_utf16(ptr+sizeof(simd16)*0); + this->chunks[1].store_ascii_as_utf16(ptr+sizeof(simd16)); + } + + simdutf_really_inline simd16x32 bit_or(const T m) const { + const simd16 mask = simd16::splat(m); + return simd16x32( + this->chunks[0] | mask, + this->chunks[1] | mask + ); + } + + simdutf_really_inline uint64_t eq(const T m) const { + const simd16 mask = simd16::splat(m); + return simd16x32( + this->chunks[0] == mask, + this->chunks[1] == mask + ).to_bitmask(); + } + + simdutf_really_inline uint64_t eq(const simd16x32 &other) const { + return simd16x32( + this->chunks[0] == other.chunks[0], + this->chunks[1] == other.chunks[1] + ).to_bitmask(); + } + + simdutf_really_inline uint64_t lteq(const T m) const { + const simd16 mask = 
simd16::splat(m); + return simd16x32( + this->chunks[0] <= mask, + this->chunks[1] <= mask + ).to_bitmask(); + } + + simdutf_really_inline uint64_t in_range(const T low, const T high) const { + const simd16 mask_low = simd16::splat(low); + const simd16 mask_high = simd16::splat(high); + + return simd16x32( + (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low), + (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low), + (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low), + (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low) + ).to_bitmask(); + } + simdutf_really_inline uint64_t not_in_range(const T low, const T high) const { + const simd16 mask_low = simd16::splat(static_cast(low-1)); + const simd16 mask_high = simd16::splat(static_cast(high+1)); + return simd16x32( + (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low), + (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low) + ).to_bitmask(); + } + simdutf_really_inline uint64_t lt(const T m) const { + const simd16 mask = simd16::splat(m); + return simd16x32( + this->chunks[0] < mask, + this->chunks[1] < mask + ).to_bitmask(); + } + }; // struct simd16x32 +/* end file src/simdutf/haswell/simd16-inl.h */ + +} // namespace simd + +} // unnamed namespace +} // namespace haswell +} // namespace simdutf + +#endif // SIMDUTF_HASWELL_SIMD_H +/* end file src/simdutf/haswell/simd.h */ + +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/haswell/end.h +/* begin file src/simdutf/haswell/end.h */ +SIMDUTF_UNTARGET_REGION +/* end file src/simdutf/haswell/end.h */ + +#endif // SIMDUTF_IMPLEMENTATION_HASWELL +#endif // SIMDUTF_HASWELL_COMMON_H +/* end file src/simdutf/haswell.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/westmere.h +/* begin file src/simdutf/westmere.h */ +#ifndef SIMDUTF_WESTMERE_H +#define SIMDUTF_WESTMERE_H + +#ifdef SIMDUTF_FALLBACK_H +#error "westmere.h must be included before fallback.h" +#endif + + +// Default Westmere to on if this is x86-64, unless we'll always select Haswell. +#ifndef SIMDUTF_IMPLEMENTATION_WESTMERE +// +// You do not want to set it to (SIMDUTF_IS_X86_64 && !SIMDUTF_REQUIRES_HASWELL) +// because you want to rely on runtime dispatch! +// +#define SIMDUTF_IMPLEMENTATION_WESTMERE (SIMDUTF_IS_X86_64) +#endif +#define SIMDUTF_CAN_ALWAYS_RUN_WESTMERE (SIMDUTF_IMPLEMENTATION_WESTMERE && SIMDUTF_IS_X86_64 && __SSE4_2__ && __PCLMUL__) + +#if SIMDUTF_IMPLEMENTATION_WESTMERE + +#define SIMDUTF_TARGET_WESTMERE SIMDUTF_TARGET_REGION("sse4.2,pclmul") + +namespace simdutf { +/** + * Implementation for Westmere (Intel SSE4.2). 
+ */ +namespace westmere { +} // namespace westmere +} // namespace simdutf + +// +// These two need to be included outside SIMDUTF_TARGET_REGION +// +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/westmere/implementation.h +/* begin file src/simdutf/westmere/implementation.h */ +#ifndef SIMDUTF_WESTMERE_IMPLEMENTATION_H +#define SIMDUTF_WESTMERE_IMPLEMENTATION_H + + +// The constructor may be executed on any host, so we take care not to use SIMDUTF_TARGET_REGION +namespace simdutf { +namespace westmere { + +namespace { +using namespace simdutf; +} + +class implementation final : public simdutf::implementation { +public: + simdutf_really_inline implementation() : simdutf::implementation("westmere", "Intel/AMD SSE4.2", internal::instruction_set::SSE42 | internal::instruction_set::PCLMULQDQ) {} + simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final; + simdutf_warn_unused bool validate_utf16(const char16_t *buf, size_t len) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_utf16(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf16(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t count_utf16(const char16_t * buf, size_t length) const noexcept; + simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept; + simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept; +}; + +} // namespace westmere +} // namespace simdutf + +#endif // SIMDUTF_WESTMERE_IMPLEMENTATION_H +/* end file src/simdutf/westmere/implementation.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/westmere/intrinsics.h +/* begin file src/simdutf/westmere/intrinsics.h */ +#ifndef SIMDUTF_WESTMERE_INTRINSICS_H +#define SIMDUTF_WESTMERE_INTRINSICS_H + +#ifdef SIMDUTF_VISUAL_STUDIO +// under clang within visual studio, this will include +#include // visual studio or clang +#else +#include // elsewhere +#endif // SIMDUTF_VISUAL_STUDIO + + +#ifdef SIMDUTF_CLANG_VISUAL_STUDIO +/** + * You are not supposed, normally, to include these + * headers directly. Instead you should either include intrin.h + * or x86intrin.h. 
However, when compiling with clang + * under Windows (i.e., when _MSC_VER is set), these headers + * only get included *if* the corresponding features are detected + * from macros: + */ +#include // for _mm_alignr_epi8 +#include // for _mm_clmulepi64_si128 +#endif + + + +#endif // SIMDUTF_WESTMERE_INTRINSICS_H +/* end file src/simdutf/westmere/intrinsics.h */ + +// +// The rest need to be inside the region +// +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/westmere/begin.h +/* begin file src/simdutf/westmere/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "westmere" +// #define SIMDUTF_IMPLEMENTATION westmere +SIMDUTF_TARGET_WESTMERE +/* end file src/simdutf/westmere/begin.h */ + +// Declarations +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/westmere/bitmanipulation.h +/* begin file src/simdutf/westmere/bitmanipulation.h */ +#ifndef SIMDUTF_WESTMERE_BITMANIPULATION_H +#define SIMDUTF_WESTMERE_BITMANIPULATION_H + +namespace simdutf { +namespace westmere { +namespace { + +// We sometimes call trailing_zero on inputs that are zero, +// but the algorithms do not end up using the returned value. +// Sadly, sanitizers are not smart enough to figure it out. +NO_SANITIZE_UNDEFINED +simdutf_really_inline int trailing_zeroes(uint64_t input_num) { +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + unsigned long ret; + // Search the mask data from least significant bit (LSB) + // to the most significant bit (MSB) for a set bit (1). + _BitScanForward64(&ret, input_num); + return (int)ret; +#else // SIMDUTF_REGULAR_VISUAL_STUDIO + return __builtin_ctzll(input_num); +#endif // SIMDUTF_REGULAR_VISUAL_STUDIO +} + +/* result might be undefined when input_num is zero */ +simdutf_really_inline uint64_t clear_lowest_bit(uint64_t input_num) { + return input_num & (input_num-1); +} + +/* result might be undefined when input_num is zero */ +simdutf_really_inline int leading_zeroes(uint64_t input_num) { +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + unsigned long leading_zero = 0; + // Search the mask data from most significant bit (MSB) + // to least significant bit (LSB) for a set bit (1). 
+ if (_BitScanReverse64(&leading_zero, input_num)) + return (int)(63 - leading_zero); + else + return 64; +#else + return __builtin_clzll(input_num); +#endif// SIMDUTF_REGULAR_VISUAL_STUDIO +} + +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO +simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num) { + // note: we do not support legacy 32-bit Windows + return __popcnt64(input_num);// Visual Studio wants two underscores +} +#else +simdutf_really_inline long long int count_ones(uint64_t input_num) { + return _popcnt64(input_num); +} +#endif + +simdutf_really_inline bool add_overflow(uint64_t value1, uint64_t value2, + uint64_t *result) { +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + return _addcarry_u64(0, value1, value2, + reinterpret_cast(result)); +#else + return __builtin_uaddll_overflow(value1, value2, + reinterpret_cast(result)); +#endif +} + +} // unnamed namespace +} // namespace westmere +} // namespace simdutf + +#endif // SIMDUTF_WESTMERE_BITMANIPULATION_H +/* end file src/simdutf/westmere/bitmanipulation.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/westmere/bitmask.h +/* begin file src/simdutf/westmere/bitmask.h */ +#ifndef SIMDUTF_WESTMERE_BITMASK_H +#define SIMDUTF_WESTMERE_BITMASK_H + +namespace simdutf { +namespace westmere { +namespace { + +// +// Perform a "cumulative bitwise xor," flipping bits each time a 1 is encountered. +// +// For example, prefix_xor(00100100) == 00011100 +// +simdutf_really_inline uint64_t prefix_xor(const uint64_t bitmask) { + // There should be no such thing with a processing supporting avx2 + // but not clmul. + __m128i all_ones = _mm_set1_epi8('\xFF'); + __m128i result = _mm_clmulepi64_si128(_mm_set_epi64x(0ULL, bitmask), all_ones, 0); + return _mm_cvtsi128_si64(result); +} + +} // unnamed namespace +} // namespace westmere +} // namespace simdutf + +#endif // SIMDUTF_WESTMERE_BITMASK_H +/* end file src/simdutf/westmere/bitmask.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/westmere/simd.h +/* begin file src/simdutf/westmere/simd.h */ +#ifndef SIMDUTF_WESTMERE_SIMD_H +#define SIMDUTF_WESTMERE_SIMD_H + +namespace simdutf { +namespace westmere { +namespace { +namespace simd { + + template + struct base { + __m128i value; + + // Zero constructor + simdutf_really_inline base() : value{__m128i()} {} + + // Conversion from SIMD register + simdutf_really_inline base(const __m128i _value) : value(_value) {} + // Conversion to SIMD register + simdutf_really_inline operator const __m128i&() const { return this->value; } + simdutf_really_inline operator __m128i&() { return this->value; } + simdutf_really_inline void store_ascii_as_utf16(char16_t * p) const { + _mm_storeu_si128(reinterpret_cast<__m128i *>(p), _mm_cvtepu8_epi16(*this)); + _mm_storeu_si128(reinterpret_cast<__m128i *>(p+8), _mm_cvtepu8_epi16(_mm_srli_si128(*this,8))); + } + // Bit operations + simdutf_really_inline Child operator|(const Child other) const { return _mm_or_si128(*this, other); } + simdutf_really_inline Child operator&(const Child other) const { return _mm_and_si128(*this, other); } + simdutf_really_inline Child operator^(const Child other) const { return _mm_xor_si128(*this, other); } + simdutf_really_inline Child bit_andnot(const Child other) const { return _mm_andnot_si128(other, *this); } + simdutf_really_inline Child& operator|=(const Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast | other; return *this_cast; } + simdutf_really_inline Child& operator&=(const 
Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast & other; return *this_cast; } + simdutf_really_inline Child& operator^=(const Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast ^ other; return *this_cast; } + }; + + // Forward-declared so they can be used by splat and friends. + template + struct simd8; + + template> + struct base8: base> { + typedef uint16_t bitmask_t; + typedef uint32_t bitmask2_t; + + simdutf_really_inline T first() const { return _mm_extract_epi8(*this,0); } + simdutf_really_inline T last() const { return _mm_extract_epi8(*this,15); } + simdutf_really_inline base8() : base>() {} + simdutf_really_inline base8(const __m128i _value) : base>(_value) {} + + simdutf_really_inline Mask operator==(const simd8 other) const { return _mm_cmpeq_epi8(*this, other); } + + static const int SIZE = sizeof(base>::value); + + template + simdutf_really_inline simd8 prev(const simd8 prev_chunk) const { + return _mm_alignr_epi8(*this, prev_chunk, 16 - N); + } + }; + + // SIMD byte mask type (returned by things like eq and gt) + template<> + struct simd8: base8 { + static simdutf_really_inline simd8 splat(bool _value) { return _mm_set1_epi8(uint8_t(-(!!_value))); } + + simdutf_really_inline simd8() : base8() {} + simdutf_really_inline simd8(const __m128i _value) : base8(_value) {} + // Splat constructor + simdutf_really_inline simd8(bool _value) : base8(splat(_value)) {} + + simdutf_really_inline int to_bitmask() const { return _mm_movemask_epi8(*this); } + simdutf_really_inline bool any() const { return !_mm_testz_si128(*this, *this); } + simdutf_really_inline bool none() const { return _mm_testz_si128(*this, *this); } + simdutf_really_inline bool all() const { return _mm_movemask_epi8(*this) == 0xFFFF; } + simdutf_really_inline simd8 operator~() const { return *this ^ true; } + }; + + template + struct base8_numeric: base8 { + static simdutf_really_inline simd8 splat(T _value) { return _mm_set1_epi8(_value); } + static simdutf_really_inline simd8 zero() { return _mm_setzero_si128(); } + static simdutf_really_inline simd8 load(const T values[16]) { + return _mm_loadu_si128(reinterpret_cast(values)); + } + // Repeat 16 values as many times as necessary (usually for lookup tables) + static simdutf_really_inline simd8 repeat_16( + T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + + simdutf_really_inline base8_numeric() : base8() {} + simdutf_really_inline base8_numeric(const __m128i _value) : base8(_value) {} + + // Store to array + simdutf_really_inline void store(T dst[16]) const { return _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this); } + + // Override to distinguish from bool version + simdutf_really_inline simd8 operator~() const { return *this ^ 0xFFu; } + + // Addition/subtraction are the same for signed and unsigned + simdutf_really_inline simd8 operator+(const simd8 other) const { return _mm_add_epi8(*this, other); } + simdutf_really_inline simd8 operator-(const simd8 other) const { return _mm_sub_epi8(*this, other); } + simdutf_really_inline simd8& operator+=(const simd8 other) { *this = *this + other; return *static_cast*>(this); } + simdutf_really_inline simd8& operator-=(const simd8 other) { *this = *this - other; return *static_cast*>(this); } + + // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values) + template + 
simdutf_really_inline simd8 lookup_16(simd8 lookup_table) const { + return _mm_shuffle_epi8(lookup_table, *this); + } + + template + simdutf_really_inline simd8 lookup_16( + L replace0, L replace1, L replace2, L replace3, + L replace4, L replace5, L replace6, L replace7, + L replace8, L replace9, L replace10, L replace11, + L replace12, L replace13, L replace14, L replace15) const { + return lookup_16(simd8::repeat_16( + replace0, replace1, replace2, replace3, + replace4, replace5, replace6, replace7, + replace8, replace9, replace10, replace11, + replace12, replace13, replace14, replace15 + )); + } + }; + + // Signed bytes + template<> + struct simd8 : base8_numeric { + simdutf_really_inline simd8() : base8_numeric() {} + simdutf_really_inline simd8(const __m128i _value) : base8_numeric(_value) {} + // Splat constructor + simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdutf_really_inline simd8(const int8_t* values) : simd8(load(values)) {} + // Member-by-member initialization + simdutf_really_inline simd8( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 + ) : simd8(_mm_setr_epi8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + )) {} + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdutf_really_inline static simd8 repeat_16( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + simdutf_really_inline operator simd8() const; + simdutf_really_inline bool is_ascii() const { return _mm_movemask_epi8(*this) == 0; } + + // Order-sensitive comparisons + simdutf_really_inline simd8 max_val(const simd8 other) const { return _mm_max_epi8(*this, other); } + simdutf_really_inline simd8 min_val(const simd8 other) const { return _mm_min_epi8(*this, other); } + simdutf_really_inline simd8 operator>(const simd8 other) const { return _mm_cmpgt_epi8(*this, other); } + simdutf_really_inline simd8 operator<(const simd8 other) const { return _mm_cmpgt_epi8(other, *this); } + }; + + // Unsigned bytes + template<> + struct simd8: base8_numeric { + simdutf_really_inline simd8() : base8_numeric() {} + simdutf_really_inline simd8(const __m128i _value) : base8_numeric(_value) {} + + // Splat constructor + simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdutf_really_inline simd8(const uint8_t* values) : simd8(load(values)) {} + // Member-by-member initialization + simdutf_really_inline simd8( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 + ) : simd8(_mm_setr_epi8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + )) {} + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdutf_really_inline static simd8 repeat_16( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } 
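The lookup_16 / repeat_16 pair defined just above is the classic pshufb nibble-table trick: _mm_shuffle_epi8 uses the low four bits of each byte of the index vector to pick one of sixteen table entries, per byte, in a single instruction, which is the kind of per-byte classification the UTF-8 routines build on. Below is a minimal standalone sketch of the pattern (editorial illustration, not part of the vendored simdutf sources; classify_high_nibbles and the toy table are hypothetical names). It needs SSSE3 (e.g. -mssse3) and prints 1 for the ASCII sample bytes and 0 for the non-ASCII ones.

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical helper: run each byte's *high* nibble through a 16-entry
 * table, the same shape as lookup_16(repeat_16(...)) above. The shift works
 * on 16-bit lanes plus a byte mask, the same trick as shr<4>() above. */
static inline __m128i classify_high_nibbles(__m128i input, __m128i table16) {
  const __m128i hi = _mm_and_si128(_mm_srli_epi16(input, 4), _mm_set1_epi8(0x0F));
  return _mm_shuffle_epi8(table16, hi); /* per-byte 16-entry lookup */
}

int main(void) {
  /* Toy table: 1 for high nibbles 0..7 (ASCII bytes), 0 otherwise. */
  const __m128i table = _mm_setr_epi8(1, 1, 1, 1, 1, 1, 1, 1,
                                      0, 0, 0, 0, 0, 0, 0, 0);
  const uint8_t bytes[16] = { 'a', 'b', 0xC3, 0xA9, '!', 0xE2, 0x82, 0xAC,
                              '0', '1', '2', '3', '4', '5', '6', '7' };
  uint8_t out[16];
  const __m128i v = _mm_loadu_si128((const __m128i *)bytes);
  _mm_storeu_si128((__m128i *)out, classify_high_nibbles(v, table));
  for (int i = 0; i < 16; i++)
    printf("0x%02x -> %d\n", bytes[i], out[i]);
  return 0;
}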
+ + // Saturated math + simdutf_really_inline simd8 saturating_add(const simd8 other) const { return _mm_adds_epu8(*this, other); } + simdutf_really_inline simd8 saturating_sub(const simd8 other) const { return _mm_subs_epu8(*this, other); } + + // Order-specific operations + simdutf_really_inline simd8 max_val(const simd8 other) const { return _mm_max_epu8(*this, other); } + simdutf_really_inline simd8 min_val(const simd8 other) const { return _mm_min_epu8(*this, other); } + // Same as >, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd8 gt_bits(const simd8 other) const { return this->saturating_sub(other); } + // Same as <, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd8 lt_bits(const simd8 other) const { return other.saturating_sub(*this); } + simdutf_really_inline simd8 operator<=(const simd8 other) const { return other.max_val(*this) == other; } + simdutf_really_inline simd8 operator>=(const simd8 other) const { return other.min_val(*this) == other; } + simdutf_really_inline simd8 operator>(const simd8 other) const { return this->gt_bits(other).any_bits_set(); } + simdutf_really_inline simd8 operator<(const simd8 other) const { return this->gt_bits(other).any_bits_set(); } + + // Bit-specific operations + simdutf_really_inline simd8 bits_not_set() const { return *this == uint8_t(0); } + simdutf_really_inline simd8 bits_not_set(simd8 bits) const { return (*this & bits).bits_not_set(); } + simdutf_really_inline simd8 any_bits_set() const { return ~this->bits_not_set(); } + simdutf_really_inline simd8 any_bits_set(simd8 bits) const { return ~this->bits_not_set(bits); } + simdutf_really_inline bool is_ascii() const { return _mm_movemask_epi8(*this) == 0; } + + simdutf_really_inline bool bits_not_set_anywhere() const { return _mm_testz_si128(*this, *this); } + simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); } + simdutf_really_inline bool bits_not_set_anywhere(simd8 bits) const { return _mm_testz_si128(*this, bits); } + simdutf_really_inline bool any_bits_set_anywhere(simd8 bits) const { return !bits_not_set_anywhere(bits); } + template + simdutf_really_inline simd8 shr() const { return simd8(_mm_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); } + template + simdutf_really_inline simd8 shl() const { return simd8(_mm_slli_epi16(*this, N)) & uint8_t(0xFFu << N); } + // Get one of the bits and make a bitmask out of it. + // e.g. 
value.get_bit<7>() gets the high bit + template + simdutf_really_inline int get_bit() const { return _mm_movemask_epi8(_mm_slli_epi16(*this, 7-N)); } + }; + simdutf_really_inline simd8::operator simd8() const { return this->value; } + + // Unsigned bytes + template<> + struct simd8: base { + static simdutf_really_inline simd8 splat(uint16_t _value) { return _mm_set1_epi16(_value); } + static simdutf_really_inline simd8 load(const uint16_t values[8]) { + return _mm_loadu_si128(reinterpret_cast(values)); + } + + simdutf_really_inline simd8() : base() {} + simdutf_really_inline simd8(const __m128i _value) : base(_value) {} + // Splat constructor + simdutf_really_inline simd8(uint16_t _value) : simd8(splat(_value)) {} + // Array constructor + simdutf_really_inline simd8(const uint16_t* values) : simd8(load(values)) {} + // Member-by-member initialization + simdutf_really_inline simd8( + uint16_t v0, uint16_t v1, uint16_t v2, uint16_t v3, uint16_t v4, uint16_t v5, uint16_t v6, uint16_t v7 + ) : simd8(_mm_setr_epi16( + v0, v1, v2, v3, v4, v5, v6, v7 + )) {} + + // Saturated math + simdutf_really_inline simd8 saturating_add(const simd8 other) const { return _mm_adds_epu16(*this, other); } + simdutf_really_inline simd8 saturating_sub(const simd8 other) const { return _mm_subs_epu16(*this, other); } + + // Order-specific operations + simdutf_really_inline simd8 max_val(const simd8 other) const { return _mm_max_epu16(*this, other); } + simdutf_really_inline simd8 min_val(const simd8 other) const { return _mm_min_epu16(*this, other); } + // Same as >, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd8 gt_bits(const simd8 other) const { return this->saturating_sub(other); } + // Same as <, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd8 lt_bits(const simd8 other) const { return other.saturating_sub(*this); } + simdutf_really_inline simd8 operator<=(const simd8 other) const { return other.max_val(*this) == other; } + simdutf_really_inline simd8 operator>=(const simd8 other) const { return other.min_val(*this) == other; } + simdutf_really_inline simd8 operator==(const simd8 other) const { return _mm_cmpeq_epi16(*this, other); } + simdutf_really_inline simd8 operator&(const simd8 other) const { return _mm_and_si128(*this, other); } + simdutf_really_inline simd8 operator|(const simd8 other) const { return _mm_or_si128(*this, other); } + + // Bit-specific operations + simdutf_really_inline simd8 bits_not_set() const { return *this == uint16_t(0); } + simdutf_really_inline simd8 any_bits_set() const { return ~this->bits_not_set(); } + + simdutf_really_inline bool bits_not_set_anywhere() const { return _mm_testz_si128(*this, *this); } + simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); } + simdutf_really_inline bool bits_not_set_anywhere(simd8 bits) const { return _mm_testz_si128(*this, bits); } + simdutf_really_inline bool any_bits_set_anywhere(simd8 bits) const { return !bits_not_set_anywhere(bits); } + }; + template + struct simd8x64 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd8); + static_assert(NUM_CHUNKS == 4, "Westmere kernel should use four registers per 64-byte block."); + const simd8 chunks[NUM_CHUNKS]; + + simd8x64(const simd8x64& o) = delete; // no copy allowed + simd8x64& operator=(const simd8 other) = delete; // no assignment allowed + simd8x64() = delete; // no default constructor allowed + + simdutf_really_inline simd8x64(const simd8 chunk0, const simd8 
chunk1, const simd8 chunk2, const simd8 chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {} + simdutf_really_inline simd8x64(const T* ptr) : chunks{simd8::load(ptr), simd8::load(ptr+sizeof(simd8)/sizeof(T)), simd8::load(ptr+2*sizeof(simd8)/sizeof(T)), simd8::load(ptr+3*sizeof(simd8)/sizeof(T))} {} + + simdutf_really_inline void store(T* ptr) const { + this->chunks[0].store(ptr+sizeof(simd8)*0/sizeof(T)); + this->chunks[1].store(ptr+sizeof(simd8)*1/sizeof(T)); + this->chunks[2].store(ptr+sizeof(simd8)*2/sizeof(T)); + this->chunks[3].store(ptr+sizeof(simd8)*3/sizeof(T)); + } + + simdutf_really_inline simd8 reduce_or() const { + return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]); + } + + simdutf_really_inline bool is_ascii() const { + return this->reduce_or().is_ascii(); + } + + simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const { + this->chunks[0].store_ascii_as_utf16(ptr+sizeof(simd8)*0); + this->chunks[1].store_ascii_as_utf16(ptr+sizeof(simd8)*1); + this->chunks[2].store_ascii_as_utf16(ptr+sizeof(simd8)*2); + this->chunks[3].store_ascii_as_utf16(ptr+sizeof(simd8)*3); + } + + simdutf_really_inline uint64_t to_bitmask() const { + uint64_t r0 = uint32_t(this->chunks[0].to_bitmask() ); + uint64_t r1 = this->chunks[1].to_bitmask() ; + uint64_t r2 = this->chunks[2].to_bitmask() ; + uint64_t r3 = this->chunks[3].to_bitmask() ; + return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48); + } + + simdutf_really_inline uint64_t eq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] == mask, + this->chunks[1] == mask, + this->chunks[2] == mask, + this->chunks[3] == mask + ).to_bitmask(); + } + + simdutf_really_inline uint64_t eq(const simd8x64 &other) const { + return simd8x64( + this->chunks[0] == other.chunks[0], + this->chunks[1] == other.chunks[1], + this->chunks[2] == other.chunks[2], + this->chunks[3] == other.chunks[3] + ).to_bitmask(); + } + + simdutf_really_inline uint64_t lteq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] <= mask, + this->chunks[1] <= mask, + this->chunks[2] <= mask, + this->chunks[3] <= mask + ).to_bitmask(); + } + + simdutf_really_inline uint64_t in_range(const T low, const T high) const { + const simd8 mask_low = simd8::splat(low); + const simd8 mask_high = simd8::splat(high); + + return simd8x64( + (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low), + (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low), + (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low), + (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low) + ).to_bitmask(); + } + simdutf_really_inline uint64_t not_in_range(const T low, const T high) const { + const simd8 mask_low = simd8::splat(low-1); + const simd8 mask_high = simd8::splat(high+1); + return simd8x64( + (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low), + (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low), + (this->chunks[2] >= mask_high) | (this->chunks[2] <= mask_low), + (this->chunks[3] >= mask_high) | (this->chunks[3] <= mask_low) + ).to_bitmask(); + } + simdutf_really_inline uint64_t lt(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] < mask, + this->chunks[1] < mask, + this->chunks[2] < mask, + this->chunks[3] < mask + ).to_bitmask(); + } + + simdutf_really_inline uint64_t gt(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] > mask, + this->chunks[1] > mask, + 
this->chunks[2] > mask, + this->chunks[3] > mask + ).to_bitmask(); + } + simdutf_really_inline uint64_t gteq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] >= mask, + this->chunks[1] >= mask, + this->chunks[2] >= mask, + this->chunks[3] >= mask + ).to_bitmask(); + } + simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + simd8(__m128i(this->chunks[0])) >= mask, + simd8(__m128i(this->chunks[1])) >= mask, + simd8(__m128i(this->chunks[2])) >= mask, + simd8(__m128i(this->chunks[3])) >= mask + ).to_bitmask(); + } + }; // struct simd8x64 + +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/westmere/simd16-inl.h +/* begin file src/simdutf/westmere/simd16-inl.h */ +template +struct simd16; + +template> +struct base16: base> { + typedef uint16_t bitmask_t; + typedef uint32_t bitmask2_t; + + simdutf_really_inline base16() : base>() {} + simdutf_really_inline base16(const __m128i _value) : base>(_value) {} + template + simdutf_really_inline base16(const Pointer* ptr) : base16(_mm_loadu_si128(reinterpret_cast(ptr))) {} + + simdutf_really_inline Mask operator==(const simd16 other) const { return _mm_cmpeq_epi16(*this, other); } + + static const int SIZE = sizeof(base>::value); + + template + simdutf_really_inline simd16 prev(const simd16 prev_chunk) const { + return _mm_alignr_epi8(*this, prev_chunk, 16 - N); + } +}; + +// SIMD byte mask type (returned by things like eq and gt) +template<> +struct simd16: base16 { + static simdutf_really_inline simd16 splat(bool _value) { return _mm_set1_epi16(uint16_t(-(!!_value))); } + + simdutf_really_inline simd16() : base16() {} + simdutf_really_inline simd16(const __m128i _value) : base16(_value) {} + // Splat constructor + simdutf_really_inline simd16(bool _value) : base16(splat(_value)) {} + + simdutf_really_inline int to_bitmask() const { return _mm_movemask_epi8(*this); } + simdutf_really_inline bool any() const { return !_mm_testz_si128(*this, *this); } + simdutf_really_inline simd16 operator~() const { return *this ^ true; } +}; + +template +struct base16_numeric: base16 { + static simdutf_really_inline simd16 splat(T _value) { return _mm_set1_epi16(_value); } + static simdutf_really_inline simd16 zero() { return _mm_setzero_si128(); } + static simdutf_really_inline simd16 load(const T values[8]) { + return _mm_loadu_si128(reinterpret_cast(values)); + } + + simdutf_really_inline base16_numeric() : base16() {} + simdutf_really_inline base16_numeric(const __m128i _value) : base16(_value) {} + + // Store to array + simdutf_really_inline void store(T dst[8]) const { return _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this); } + + // Override to distinguish from bool version + simdutf_really_inline simd16 operator~() const { return *this ^ 0xFFu; } + + // Addition/subtraction are the same for signed and unsigned + simdutf_really_inline simd16 operator+(const simd16 other) const { return _mm_add_epi16(*this, other); } + simdutf_really_inline simd16 operator-(const simd16 other) const { return _mm_sub_epi16(*this, other); } + simdutf_really_inline simd16& operator+=(const simd16 other) { *this = *this + other; return *static_cast*>(this); } + simdutf_really_inline simd16& operator-=(const simd16 other) { *this = *this - other; return *static_cast*>(this); } +}; + +// Signed words +template<> +struct simd16 : base16_numeric { + simdutf_really_inline simd16() : base16_numeric() {} + 
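The gt_bits / lt_bits helpers that recur above ("same as >, but only guarantees true is nonzero") lean on a standard SSE idiom: there is no unsigned compare-greater for bytes or words, but for unsigned x and y the saturating difference subs(x, y) is zero exactly when x <= y and nonzero exactly when x > y, which is all the callers need when they only test any_bits_set afterwards. A scalar model of the identity follows (editorial sketch; subs_epu16 is a hypothetical name mimicking one lane of _mm_subs_epu16).

#include <stdint.h>
#include <stdio.h>

/* Scalar model of one lane of _mm_subs_epu16: unsigned subtract, clamped at 0. */
static inline uint16_t subs_epu16(uint16_t x, uint16_t y) {
  return (uint16_t)(x > y ? x - y : 0);
}

int main(void) {
  const uint16_t pairs[3][2] = { { 0xD800, 0xDBFF },   /* x <  y -> 0       */
                                 { 0xDC00, 0xD7FF },   /* x >  y -> nonzero */
                                 { 0x0041, 0x0041 } }; /* x == y -> 0       */
  for (int i = 0; i < 3; i++) {
    const uint16_t x = pairs[i][0], y = pairs[i][1];
    printf("x=0x%04X y=0x%04X subs(x,y)=0x%04X x>y? %d\n",
           x, y, subs_epu16(x, y), subs_epu16(x, y) != 0);
  }
  return 0;
}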
simdutf_really_inline simd16(const __m128i _value) : base16_numeric(_value) {} + // Splat constructor + simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {} + // Array constructor + simdutf_really_inline simd16(const int16_t* values) : simd16(load(values)) {} + simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast(values))) {} + // Member-by-member initialization + simdutf_really_inline simd16( + int16_t v0, int16_t v1, int16_t v2, int16_t v3, int16_t v4, int16_t v5, int16_t v6, int16_t v7) + : simd16(_mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7)) {} + simdutf_really_inline operator simd16() const; + + // Order-sensitive comparisons + simdutf_really_inline simd16 max_val(const simd16 other) const { return _mm_max_epi16(*this, other); } + simdutf_really_inline simd16 min_val(const simd16 other) const { return _mm_min_epi16(*this, other); } + simdutf_really_inline simd16 operator>(const simd16 other) const { return _mm_cmpgt_epi16(*this, other); } + simdutf_really_inline simd16 operator<(const simd16 other) const { return _mm_cmpgt_epi16(other, *this); } +}; + +// Unsigned words +template<> +struct simd16: base16_numeric { + simdutf_really_inline simd16() : base16_numeric() {} + simdutf_really_inline simd16(const __m128i _value) : base16_numeric(_value) {} + + // Splat constructor + simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {} + // Array constructor + simdutf_really_inline simd16(const uint16_t* values) : simd16(load(values)) {} + simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast(values))) {} + // Member-by-member initialization + simdutf_really_inline simd16( + uint16_t v0, uint16_t v1, uint16_t v2, uint16_t v3, uint16_t v4, uint16_t v5, uint16_t v6, uint16_t v7) + : simd16(_mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7)) {} + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdutf_really_inline static simd16 repeat_16( + uint16_t v0, uint16_t v1, uint16_t v2, uint16_t v3, uint16_t v4, uint16_t v5, uint16_t v6, uint16_t v7 + ) { + return simd16(v0, v1, v2, v3, v4, v5, v6, v7); + } + + // Saturated math + simdutf_really_inline simd16 saturating_add(const simd16 other) const { return _mm_adds_epu16(*this, other); } + simdutf_really_inline simd16 saturating_sub(const simd16 other) const { return _mm_subs_epu16(*this, other); } + + // Order-specific operations + simdutf_really_inline simd16 max_val(const simd16 other) const { return _mm_max_epu16(*this, other); } + simdutf_really_inline simd16 min_val(const simd16 other) const { return _mm_min_epu16(*this, other); } + // Same as >, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd16 gt_bits(const simd16 other) const { return this->saturating_sub(other); } + // Same as <, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd16 lt_bits(const simd16 other) const { return other.saturating_sub(*this); } + simdutf_really_inline simd16 operator<=(const simd16 other) const { return other.max_val(*this) == other; } + simdutf_really_inline simd16 operator>=(const simd16 other) const { return other.min_val(*this) == other; } + simdutf_really_inline simd16 operator>(const simd16 other) const { return this->gt_bits(other).any_bits_set(); } + simdutf_really_inline simd16 operator<(const simd16 other) const { return this->gt_bits(other).any_bits_set(); } + + // Bit-specific operations + simdutf_really_inline simd16 bits_not_set() const { return 
*this == uint16_t(0); } + simdutf_really_inline simd16 bits_not_set(simd16 bits) const { return (*this & bits).bits_not_set(); } + simdutf_really_inline simd16 any_bits_set() const { return ~this->bits_not_set(); } + simdutf_really_inline simd16 any_bits_set(simd16 bits) const { return ~this->bits_not_set(bits); } + + simdutf_really_inline bool bits_not_set_anywhere() const { return _mm_testz_si128(*this, *this); } + simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); } + simdutf_really_inline bool bits_not_set_anywhere(simd16 bits) const { return _mm_testz_si128(*this, bits); } + simdutf_really_inline bool any_bits_set_anywhere(simd16 bits) const { return !bits_not_set_anywhere(bits); } + template + simdutf_really_inline simd16 shr() const { return simd16(_mm_srli_epi16(*this, N)); } + template + simdutf_really_inline simd16 shl() const { return simd16(_mm_slli_epi16(*this, N)); } + // Get one of the bits and make a bitmask out of it. + // e.g. value.get_bit<7>() gets the high bit + template + simdutf_really_inline int get_bit() const { return _mm_movemask_epi8(_mm_slli_epi16(*this, 7-N)); } + + // Pack with the unsigned saturation two uint16_t words into single uint8_t vector + static simdutf_really_inline simd8 pack(const simd16& v0, const simd16& v1) { + return _mm_packus_epi16(v0, v1); + } +}; +simdutf_really_inline simd16::operator simd16() const { return this->value; } + +template + struct simd16x32 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd16); + static_assert(NUM_CHUNKS == 4, "Westmere kernel should use four registers per 64-byte block."); + const simd16 chunks[NUM_CHUNKS]; + + simd16x32(const simd16x32& o) = delete; // no copy allowed + simd16x32& operator=(const simd16 other) = delete; // no assignment allowed + simd16x32() = delete; // no default constructor allowed + + simdutf_really_inline simd16x32(const simd16 chunk0, const simd16 chunk1, const simd16 chunk2, const simd16 chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {} + simdutf_really_inline simd16x32(const T* ptr) : chunks{simd16::load(ptr), simd16::load(ptr+sizeof(simd16)/sizeof(T)), simd16::load(ptr+2*sizeof(simd16)/sizeof(T)), simd16::load(ptr+3*sizeof(simd16)/sizeof(T))} {} + + simdutf_really_inline void store(T* ptr) const { + this->chunks[0].store(ptr+sizeof(simd16)*0/sizeof(T)); + this->chunks[1].store(ptr+sizeof(simd16)*1/sizeof(T)); + this->chunks[2].store(ptr+sizeof(simd16)*2/sizeof(T)); + this->chunks[3].store(ptr+sizeof(simd16)*3/sizeof(T)); + } + + simdutf_really_inline simd16 reduce_or() const { + return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]); + } + + simdutf_really_inline bool is_ascii() const { + return this->reduce_or().is_ascii(); + } + + simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const { + this->chunks[0].store_ascii_as_utf16(ptr+sizeof(simd16)*0); + this->chunks[1].store_ascii_as_utf16(ptr+sizeof(simd16)*1); + this->chunks[2].store_ascii_as_utf16(ptr+sizeof(simd16)*2); + this->chunks[3].store_ascii_as_utf16(ptr+sizeof(simd16)*3); + } + + simdutf_really_inline uint64_t to_bitmask() const { + uint64_t r0 = uint32_t(this->chunks[0].to_bitmask() ); + uint64_t r1 = this->chunks[1].to_bitmask() ; + uint64_t r2 = this->chunks[2].to_bitmask() ; + uint64_t r3 = this->chunks[3].to_bitmask() ; + return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48); + } + + simdutf_really_inline uint64_t eq(const T m) const { + const simd16 mask = simd16::splat(m); + return simd16x32( + this->chunks[0] == mask, + 
this->chunks[1] == mask, + this->chunks[2] == mask, + this->chunks[3] == mask + ).to_bitmask(); + } + + simdutf_really_inline uint64_t eq(const simd16x32 &other) const { + return simd16x32( + this->chunks[0] == other.chunks[0], + this->chunks[1] == other.chunks[1], + this->chunks[2] == other.chunks[2], + this->chunks[3] == other.chunks[3] + ).to_bitmask(); + } + + simdutf_really_inline uint64_t lteq(const T m) const { + const simd16 mask = simd16::splat(m); + return simd16x32( + this->chunks[0] <= mask, + this->chunks[1] <= mask, + this->chunks[2] <= mask, + this->chunks[3] <= mask + ).to_bitmask(); + } + + simdutf_really_inline uint64_t in_range(const T low, const T high) const { + const simd16 mask_low = simd16::splat(low); + const simd16 mask_high = simd16::splat(high); + + return simd16x32( + (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low), + (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low), + (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low), + (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low) + ).to_bitmask(); + } + simdutf_really_inline uint64_t not_in_range(const T low, const T high) const { + const simd16 mask_low = simd16::splat(static_cast(low-1)); + const simd16 mask_high = simd16::splat(static_cast(high+1)); + return simd16x32( + (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low), + (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low), + (this->chunks[2] >= mask_high) | (this->chunks[2] <= mask_low), + (this->chunks[3] >= mask_high) | (this->chunks[3] <= mask_low) + ).to_bitmask(); + } + simdutf_really_inline uint64_t lt(const T m) const { + const simd16 mask = simd16::splat(m); + return simd16x32( + this->chunks[0] < mask, + this->chunks[1] < mask, + this->chunks[2] < mask, + this->chunks[3] < mask + ).to_bitmask(); + } + }; // struct simd16x32 +/* end file src/simdutf/westmere/simd16-inl.h */ + +} // namespace simd +} // unnamed namespace +} // namespace westmere +} // namespace simdutf + +#endif // SIMDUTF_WESTMERE_SIMD_INPUT_H +/* end file src/simdutf/westmere/simd.h */ + +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/westmere/end.h +/* begin file src/simdutf/westmere/end.h */ +SIMDUTF_UNTARGET_REGION +/* end file src/simdutf/westmere/end.h */ + +#endif // SIMDUTF_IMPLEMENTATION_WESTMERE +#endif // SIMDUTF_WESTMERE_COMMON_H +/* end file src/simdutf/westmere.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/ppc64.h +/* begin file src/simdutf/ppc64.h */ +#ifndef SIMDUTF_PPC64_H +#define SIMDUTF_PPC64_H + +#ifdef SIMDUTF_FALLBACK_H +#error "ppc64.h must be included before fallback.h" +#endif + + +#ifndef SIMDUTF_IMPLEMENTATION_PPC64 +#define SIMDUTF_IMPLEMENTATION_PPC64 (SIMDUTF_IS_PPC64) +#endif +#define SIMDUTF_CAN_ALWAYS_RUN_PPC64 SIMDUTF_IMPLEMENTATION_PPC64 && SIMDUTF_IS_PPC64 + + + +#if SIMDUTF_IMPLEMENTATION_PPC64 + +namespace simdutf { +/** + * Implementation for ALTIVEC (PPC64). 
+ */ +namespace ppc64 { +} // namespace ppc64 +} // namespace simdutf + +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/ppc64/implementation.h +/* begin file src/simdutf/ppc64/implementation.h */ +#ifndef SIMDUTF_PPC64_IMPLEMENTATION_H +#define SIMDUTF_PPC64_IMPLEMENTATION_H + + +namespace simdutf { +namespace ppc64 { + +namespace { +using namespace simdutf; +} // namespace + +class implementation final : public simdutf::implementation { +public: + simdutf_really_inline implementation() + : simdutf::implementation("ppc64", "PPC64 ALTIVEC", + internal::instruction_set::ALTIVEC) {} + simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final; + simdutf_warn_unused bool validate_utf16(const char16_t *buf, size_t len) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_utf16(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf16(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t count_utf16(const char16_t * buf, size_t length) const noexcept; + simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept; + simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept; +}; + +} // namespace ppc64 +} // namespace simdutf + +#endif // SIMDUTF_PPC64_IMPLEMENTATION_H +/* end file src/simdutf/ppc64/implementation.h */ + +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/ppc64/begin.h +/* begin file src/simdutf/ppc64/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "ppc64" +// #define SIMDUTF_IMPLEMENTATION ppc64 +/* end file src/simdutf/ppc64/begin.h */ + +// Declarations +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/ppc64/intrinsics.h +/* begin file src/simdutf/ppc64/intrinsics.h */ +#ifndef SIMDUTF_PPC64_INTRINSICS_H +#define SIMDUTF_PPC64_INTRINSICS_H + + +// This should be the correct header whether +// you use visual studio or other compilers. +#include + +// These are defined by altivec.h in GCC toolchain, it is safe to undef them. +#ifdef bool +#undef bool +#endif + +#ifdef vector +#undef vector +#endif + +#endif // SIMDUTF_PPC64_INTRINSICS_H +/* end file src/simdutf/ppc64/intrinsics.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/ppc64/bitmanipulation.h +/* begin file src/simdutf/ppc64/bitmanipulation.h */ +#ifndef SIMDUTF_PPC64_BITMANIPULATION_H +#define SIMDUTF_PPC64_BITMANIPULATION_H + +namespace simdutf { +namespace ppc64 { +namespace { + +// We sometimes call trailing_zero on inputs that are zero, +// but the algorithms do not end up using the returned value. +// Sadly, sanitizers are not smart enough to figure it out. +NO_SANITIZE_UNDEFINED +simdutf_really_inline int trailing_zeroes(uint64_t input_num) { +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + unsigned long ret; + // Search the mask data from least significant bit (LSB) + // to the most significant bit (MSB) for a set bit (1). 
+ _BitScanForward64(&ret, input_num); + return (int)ret; +#else // SIMDUTF_REGULAR_VISUAL_STUDIO + return __builtin_ctzll(input_num); +#endif // SIMDUTF_REGULAR_VISUAL_STUDIO +} + +/* result might be undefined when input_num is zero */ +simdutf_really_inline uint64_t clear_lowest_bit(uint64_t input_num) { + return input_num & (input_num - 1); +} + +/* result might be undefined when input_num is zero */ +simdutf_really_inline int leading_zeroes(uint64_t input_num) { +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + unsigned long leading_zero = 0; + // Search the mask data from most significant bit (MSB) + // to least significant bit (LSB) for a set bit (1). + if (_BitScanReverse64(&leading_zero, input_num)) + return (int)(63 - leading_zero); + else + return 64; +#else + return __builtin_clzll(input_num); +#endif // SIMDUTF_REGULAR_VISUAL_STUDIO +} + +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO +simdutf_really_inline int count_ones(uint64_t input_num) { + // note: we do not support legacy 32-bit Windows + return __popcnt64(input_num); // Visual Studio wants two underscores +} +#else +simdutf_really_inline int count_ones(uint64_t input_num) { + return __builtin_popcountll(input_num); +} +#endif + +simdutf_really_inline bool add_overflow(uint64_t value1, uint64_t value2, + uint64_t *result) { +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + *result = value1 + value2; + return *result < value1; +#else + return __builtin_uaddll_overflow(value1, value2, + reinterpret_cast(result)); +#endif +} + +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf + +#endif // SIMDUTF_PPC64_BITMANIPULATION_H +/* end file src/simdutf/ppc64/bitmanipulation.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/ppc64/bitmask.h +/* begin file src/simdutf/ppc64/bitmask.h */ +#ifndef SIMDUTF_PPC64_BITMASK_H +#define SIMDUTF_PPC64_BITMASK_H + +namespace simdutf { +namespace ppc64 { +namespace { + +// +// Perform a "cumulative bitwise xor," flipping bits each time a 1 is +// encountered. +// +// For example, prefix_xor(00100100) == 00011100 +// +simdutf_really_inline uint64_t prefix_xor(uint64_t bitmask) { + // You can use the version below, however gcc sometimes miscompiles + // vec_pmsum_be, it happens somewhere around between 8 and 9th version. + // The performance boost was not noticeable, falling back to a usual + // implementation. + // __vector unsigned long long all_ones = {~0ull, ~0ull}; + // __vector unsigned long long mask = {bitmask, 0}; + // // Clang and GCC return different values for pmsum for ull so cast it to one. + // // Generally it is not specified by ALTIVEC ISA what is returned by + // // vec_pmsum_be. 
+ // #if defined(__LITTLE_ENDIAN__) + // return (uint64_t)(((__vector unsigned long long)vec_pmsum_be(all_ones, mask))[0]); + // #else + // return (uint64_t)(((__vector unsigned long long)vec_pmsum_be(all_ones, mask))[1]); + // #endif + bitmask ^= bitmask << 1; + bitmask ^= bitmask << 2; + bitmask ^= bitmask << 4; + bitmask ^= bitmask << 8; + bitmask ^= bitmask << 16; + bitmask ^= bitmask << 32; + return bitmask; +} + +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf + +#endif +/* end file src/simdutf/ppc64/bitmask.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/ppc64/simd.h +/* begin file src/simdutf/ppc64/simd.h */ +#ifndef SIMDUTF_PPC64_SIMD_H +#define SIMDUTF_PPC64_SIMD_H + +#include + +namespace simdutf { +namespace ppc64 { +namespace { +namespace simd { + +using __m128i = __vector unsigned char; + +template struct base { + __m128i value; + + // Zero constructor + simdutf_really_inline base() : value{__m128i()} {} + + // Conversion from SIMD register + simdutf_really_inline base(const __m128i _value) : value(_value) {} + + // Conversion to SIMD register + simdutf_really_inline operator const __m128i &() const { + return this->value; + } + simdutf_really_inline operator __m128i &() { return this->value; } + + // Bit operations + simdutf_really_inline Child operator|(const Child other) const { + return vec_or(this->value, (__m128i)other); + } + simdutf_really_inline Child operator&(const Child other) const { + return vec_and(this->value, (__m128i)other); + } + simdutf_really_inline Child operator^(const Child other) const { + return vec_xor(this->value, (__m128i)other); + } + simdutf_really_inline Child bit_andnot(const Child other) const { + return vec_andc(this->value, (__m128i)other); + } + simdutf_really_inline Child &operator|=(const Child other) { + auto this_cast = static_cast(this); + *this_cast = *this_cast | other; + return *this_cast; + } + simdutf_really_inline Child &operator&=(const Child other) { + auto this_cast = static_cast(this); + *this_cast = *this_cast & other; + return *this_cast; + } + simdutf_really_inline Child &operator^=(const Child other) { + auto this_cast = static_cast(this); + *this_cast = *this_cast ^ other; + return *this_cast; + } +}; + +// Forward-declared so they can be used by splat and friends. 
+template struct simd8; + +template > +struct base8 : base> { + typedef uint16_t bitmask_t; + typedef uint32_t bitmask2_t; + + simdutf_really_inline base8() : base>() {} + simdutf_really_inline base8(const __m128i _value) : base>(_value) {} + + simdutf_really_inline Mask operator==(const simd8 other) const { + return (__m128i)vec_cmpeq(this->value, (__m128i)other); + } + + static const int SIZE = sizeof(base>::value); + + template + simdutf_really_inline simd8 prev(simd8 prev_chunk) const { + __m128i chunk = this->value; +#ifdef __LITTLE_ENDIAN__ + chunk = (__m128i)vec_reve(this->value); + prev_chunk = (__m128i)vec_reve((__m128i)prev_chunk); +#endif + chunk = (__m128i)vec_sld((__m128i)prev_chunk, (__m128i)chunk, 16 - N); +#ifdef __LITTLE_ENDIAN__ + chunk = (__m128i)vec_reve((__m128i)chunk); +#endif + return chunk; + } +}; + +// SIMD byte mask type (returned by things like eq and gt) +template <> struct simd8 : base8 { + static simdutf_really_inline simd8 splat(bool _value) { + return (__m128i)vec_splats((unsigned char)(-(!!_value))); + } + + simdutf_really_inline simd8() : base8() {} + simdutf_really_inline simd8(const __m128i _value) + : base8(_value) {} + // Splat constructor + simdutf_really_inline simd8(bool _value) + : base8(splat(_value)) {} + + simdutf_really_inline int to_bitmask() const { + __vector unsigned long long result; + const __m128i perm_mask = {0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40, + 0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00}; + + result = ((__vector unsigned long long)vec_vbpermq((__m128i)this->value, + (__m128i)perm_mask)); +#ifdef __LITTLE_ENDIAN__ + return static_cast(result[1]); +#else + return static_cast(result[0]); +#endif + } + simdutf_really_inline bool any() const { + return !vec_all_eq(this->value, (__m128i)vec_splats(0)); + } + simdutf_really_inline simd8 operator~() const { + return this->value ^ (__m128i)splat(true); + } +}; + +template struct base8_numeric : base8 { + static simdutf_really_inline simd8 splat(T value) { + (void)value; + return (__m128i)vec_splats(value); + } + static simdutf_really_inline simd8 zero() { return splat(0); } + static simdutf_really_inline simd8 load(const T values[16]) { + return (__m128i)(vec_vsx_ld(0, reinterpret_cast(values))); + } + // Repeat 16 values as many times as necessary (usually for lookup tables) + static simdutf_really_inline simd8 repeat_16(T v0, T v1, T v2, T v3, T v4, + T v5, T v6, T v7, T v8, T v9, + T v10, T v11, T v12, T v13, + T v14, T v15) { + return simd8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, + v14, v15); + } + + simdutf_really_inline base8_numeric() : base8() {} + simdutf_really_inline base8_numeric(const __m128i _value) + : base8(_value) {} + + // Store to array + simdutf_really_inline void store(T dst[16]) const { + vec_vsx_st(this->value, 0, reinterpret_cast<__m128i *>(dst)); + } + + // Override to distinguish from bool version + simdutf_really_inline simd8 operator~() const { return *this ^ 0xFFu; } + + // Addition/subtraction are the same for signed and unsigned + simdutf_really_inline simd8 operator+(const simd8 other) const { + return (__m128i)((__m128i)this->value + (__m128i)other); + } + simdutf_really_inline simd8 operator-(const simd8 other) const { + return (__m128i)((__m128i)this->value - (__m128i)other); + } + simdutf_really_inline simd8 &operator+=(const simd8 other) { + *this = *this + other; + return *static_cast *>(this); + } + simdutf_really_inline simd8 &operator-=(const simd8 other) { + *this = *this - other; + return *static_cast *>(this); + } 
+ + // Perform a lookup assuming the value is between 0 and 16 (undefined behavior + // for out of range values) + template + simdutf_really_inline simd8 lookup_16(simd8 lookup_table) const { + return (__m128i)vec_perm((__m128i)lookup_table, (__m128i)lookup_table, this->value); + } + + template + simdutf_really_inline simd8 + lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4, + L replace5, L replace6, L replace7, L replace8, L replace9, + L replace10, L replace11, L replace12, L replace13, L replace14, + L replace15) const { + return lookup_16(simd8::repeat_16( + replace0, replace1, replace2, replace3, replace4, replace5, replace6, + replace7, replace8, replace9, replace10, replace11, replace12, + replace13, replace14, replace15)); + } +}; + +// Signed bytes +template <> struct simd8 : base8_numeric { + simdutf_really_inline simd8() : base8_numeric() {} + simdutf_really_inline simd8(const __m128i _value) + : base8_numeric(_value) {} + + // Splat constructor + simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdutf_really_inline simd8(const int8_t *values) : simd8(load(values)) {} + // Member-by-member initialization + simdutf_really_inline simd8(int8_t v0, int8_t v1, int8_t v2, int8_t v3, + int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, + int8_t v12, int8_t v13, int8_t v14, int8_t v15) + : simd8((__m128i)(__vector signed char){v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, + v15}) {} + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdutf_really_inline static simd8 + repeat_16(int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, + int8_t v6, int8_t v7, int8_t v8, int8_t v9, int8_t v10, int8_t v11, + int8_t v12, int8_t v13, int8_t v14, int8_t v15) { + return simd8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15); + } + + // Order-sensitive comparisons + simdutf_really_inline simd8 + max_val(const simd8 other) const { + return (__m128i)vec_max((__vector signed char)this->value, + (__vector signed char)(__m128i)other); + } + simdutf_really_inline simd8 + min_val(const simd8 other) const { + return (__m128i)vec_min((__vector signed char)this->value, + (__vector signed char)(__m128i)other); + } + simdutf_really_inline simd8 + operator>(const simd8 other) const { + return (__m128i)vec_cmpgt((__vector signed char)this->value, + (__vector signed char)(__m128i)other); + } + simdutf_really_inline simd8 + operator<(const simd8 other) const { + return (__m128i)vec_cmplt((__vector signed char)this->value, + (__vector signed char)(__m128i)other); + } +}; + +// Unsigned bytes +template <> struct simd8 : base8_numeric { + simdutf_really_inline simd8() : base8_numeric() {} + simdutf_really_inline simd8(const __m128i _value) + : base8_numeric(_value) {} + // Splat constructor + simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdutf_really_inline simd8(const uint8_t *values) : simd8(load(values)) {} + // Member-by-member initialization + simdutf_really_inline + simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, + uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10, + uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15) + : simd8((__m128i){v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15}) {} + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdutf_really_inline static 
simd8 + repeat_16(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, + uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, + uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, + uint8_t v15) { + return simd8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15); + } + + // Saturated math + simdutf_really_inline simd8 + saturating_add(const simd8 other) const { + return (__m128i)vec_adds(this->value, (__m128i)other); + } + simdutf_really_inline simd8 + saturating_sub(const simd8 other) const { + return (__m128i)vec_subs(this->value, (__m128i)other); + } + + // Order-specific operations + simdutf_really_inline simd8 + max_val(const simd8 other) const { + return (__m128i)vec_max(this->value, (__m128i)other); + } + simdutf_really_inline simd8 + min_val(const simd8 other) const { + return (__m128i)vec_min(this->value, (__m128i)other); + } + // Same as >, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd8 + gt_bits(const simd8 other) const { + return this->saturating_sub(other); + } + // Same as <, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd8 + lt_bits(const simd8 other) const { + return other.saturating_sub(*this); + } + simdutf_really_inline simd8 + operator<=(const simd8 other) const { + return other.max_val(*this) == other; + } + simdutf_really_inline simd8 + operator>=(const simd8 other) const { + return other.min_val(*this) == other; + } + simdutf_really_inline simd8 + operator>(const simd8 other) const { + return this->gt_bits(other).any_bits_set(); + } + simdutf_really_inline simd8 + operator<(const simd8 other) const { + return this->gt_bits(other).any_bits_set(); + } + + // Bit-specific operations + simdutf_really_inline simd8 bits_not_set() const { + return (__m128i)vec_cmpeq(this->value, (__m128i)vec_splats(uint8_t(0))); + } + simdutf_really_inline simd8 bits_not_set(simd8 bits) const { + return (*this & bits).bits_not_set(); + } + simdutf_really_inline simd8 any_bits_set() const { + return ~this->bits_not_set(); + } + simdutf_really_inline simd8 any_bits_set(simd8 bits) const { + return ~this->bits_not_set(bits); + } + + simdutf_really_inline bool is_ascii() const { + return this->saturating_sub(0b01111111u).bits_not_set_anywhere(); + } + + simdutf_really_inline bool bits_not_set_anywhere() const { + return vec_all_eq(this->value, (__m128i)vec_splats(0)); + } + simdutf_really_inline bool any_bits_set_anywhere() const { + return !bits_not_set_anywhere(); + } + simdutf_really_inline bool bits_not_set_anywhere(simd8 bits) const { + return vec_all_eq(vec_and(this->value, (__m128i)bits), + (__m128i)vec_splats(0)); + } + simdutf_really_inline bool any_bits_set_anywhere(simd8 bits) const { + return !bits_not_set_anywhere(bits); + } + template simdutf_really_inline simd8 shr() const { + return simd8( + (__m128i)vec_sr(this->value, (__m128i)vec_splat_u8(N))); + } + template simdutf_really_inline simd8 shl() const { + return simd8( + (__m128i)vec_sl(this->value, (__m128i)vec_splat_u8(N))); + } +}; + +template struct simd8x64 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd8); + static_assert(NUM_CHUNKS == 4, + "PPC64 kernel should use four registers per 64-byte block."); + const simd8 chunks[NUM_CHUNKS]; + + simd8x64(const simd8x64 &o) = delete; // no copy allowed + simd8x64 & + operator=(const simd8 other) = delete; // no assignment allowed + simd8x64() = delete; // no default constructor allowed + + simdutf_really_inline simd8x64(const simd8 chunk0, 
const simd8<T> chunk1,
+                                 const simd8<T> chunk2, const simd8<T> chunk3)
+      : chunks{chunk0, chunk1, chunk2, chunk3} {}
+
+  simdutf_really_inline simd8x64(const T* ptr) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+sizeof(simd8<T>)/sizeof(T)), simd8<T>::load(ptr+2*sizeof(simd8<T>)/sizeof(T)), simd8<T>::load(ptr+3*sizeof(simd8<T>)/sizeof(T))} {}
+
+  simdutf_really_inline void store(T* ptr) const {
+    this->chunks[0].store(ptr + sizeof(simd8<T>) * 0/sizeof(T));
+    this->chunks[1].store(ptr + sizeof(simd8<T>) * 1/sizeof(T));
+    this->chunks[2].store(ptr + sizeof(simd8<T>) * 2/sizeof(T));
+    this->chunks[3].store(ptr + sizeof(simd8<T>) * 3/sizeof(T));
+  }
+
+  simdutf_really_inline simd8<T> reduce_or() const {
+    return (this->chunks[0] | this->chunks[1]) |
+           (this->chunks[2] | this->chunks[3]);
+  }
+
+
+  simdutf_really_inline bool is_ascii() const {
+    return this->reduce_or().is_ascii();
+  }
+
+  simdutf_really_inline uint64_t to_bitmask() const {
+    uint64_t r0 = uint32_t(this->chunks[0].to_bitmask());
+    uint64_t r1 = this->chunks[1].to_bitmask();
+    uint64_t r2 = this->chunks[2].to_bitmask();
+    uint64_t r3 = this->chunks[3].to_bitmask();
+    return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
+  }
+
+  simdutf_really_inline uint64_t eq(const T m) const {
+    const simd8<T> mask = simd8<T>::splat(m);
+    return simd8x64<bool>(this->chunks[0] == mask, this->chunks[1] == mask,
+                          this->chunks[2] == mask, this->chunks[3] == mask)
+        .to_bitmask();
+  }
+
+  simdutf_really_inline uint64_t eq(const simd8x64<T> &other) const {
+    return simd8x64<bool>(this->chunks[0] == other.chunks[0],
+                          this->chunks[1] == other.chunks[1],
+                          this->chunks[2] == other.chunks[2],
+                          this->chunks[3] == other.chunks[3])
+        .to_bitmask();
+  }
+
+  simdutf_really_inline uint64_t lteq(const T m) const {
+    const simd8<T> mask = simd8<T>::splat(m);
+    return simd8x64<bool>(this->chunks[0] <= mask, this->chunks[1] <= mask,
+                          this->chunks[2] <= mask, this->chunks[3] <= mask)
+        .to_bitmask();
+  }
+
+  simdutf_really_inline uint64_t in_range(const T low, const T high) const {
+    const simd8<T> mask_low = simd8<T>::splat(low);
+    const simd8<T> mask_high = simd8<T>::splat(high);
+
+    return simd8x64<bool>(
+      (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
+      (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
+      (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
+      (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)
+    ).to_bitmask();
+  }
+  simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
+    const simd8<T> mask_low = simd8<T>::splat(low);
+    const simd8<T> mask_high = simd8<T>::splat(high);
+    return simd8x64<bool>(
+      (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
+      (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low),
+      (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low),
+      (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low)
+    ).to_bitmask();
+  }
+  simdutf_really_inline uint64_t lt(const T m) const {
+    const simd8<T> mask = simd8<T>::splat(m);
+    return simd8x64<bool>(this->chunks[0] < mask, this->chunks[1] < mask,
+                          this->chunks[2] < mask, this->chunks[3] < mask)
+        .to_bitmask();
+  }
+
+  simdutf_really_inline uint64_t gt(const T m) const {
+    const simd8<T> mask = simd8<T>::splat(m);
+    return simd8x64<bool>(
+      this->chunks[0] > mask,
+      this->chunks[1] > mask,
+      this->chunks[2] > mask,
+      this->chunks[3] > mask
+    ).to_bitmask();
+  }
+  simdutf_really_inline uint64_t gteq(const T m) const {
+    const simd8<T> mask = simd8<T>::splat(m);
+    return simd8x64<bool>(
+      this->chunks[0] >= mask,
+      this->chunks[1] >= mask,
+      this->chunks[2] >= mask,
+      this->chunks[3] >= mask
+    ).to_bitmask();
+  }
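+  // Usage sketch (illustrative only; `buf`, `non_ascii` and `first` are
+  // hypothetical names, everything else is defined in this file): each bit of
+  // the 64-bit masks returned by eq/lt/gt/gteq above maps to one input byte,
+  // so a caller can locate the first matching byte with the scalar helpers
+  // from bitmanipulation.h, for example:
+  //
+  //   simd8x64<uint8_t> block(reinterpret_cast<const uint8_t *>(buf));
+  //   uint64_t non_ascii = block.gteq(0x80);
+  //   if (non_ascii != 0) {
+  //     int first = trailing_zeroes(non_ascii); // offset of first non-ASCII byte
+  //   }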
+  simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const {
+    const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
+    return simd8x64<bool>(
+      simd8<uint8_t>(this->chunks[0]) >= mask,
+      simd8<uint8_t>(this->chunks[1]) >= mask,
+      simd8<uint8_t>(this->chunks[2]) >= mask,
+      simd8<uint8_t>(this->chunks[3]) >= mask
+    ).to_bitmask();
+  }
+}; // struct simd8x64
+
+} // namespace simd
+} // unnamed namespace
+} // namespace ppc64
+} // namespace simdutf
+
+#endif // SIMDUTF_PPC64_SIMD_H
+/* end file src/simdutf/ppc64/simd.h */
+
+// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/ppc64/end.h
+/* begin file src/simdutf/ppc64/end.h */
+/* end file src/simdutf/ppc64/end.h */
+
+#endif // SIMDUTF_IMPLEMENTATION_PPC64
+
+#endif // SIMDUTF_PPC64_H
+/* end file src/simdutf/ppc64.h */
+// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/fallback.h
+/* begin file src/simdutf/fallback.h */
+#ifndef SIMDUTF_FALLBACK_H
+#define SIMDUTF_FALLBACK_H
+
+
+// Default the fallback to on unless a builtin implementation has already been selected.
+#ifndef SIMDUTF_IMPLEMENTATION_FALLBACK
+#define SIMDUTF_IMPLEMENTATION_FALLBACK 1 // (!SIMDUTF_CAN_ALWAYS_RUN_ARM64 && !SIMDUTF_CAN_ALWAYS_RUN_HASWELL && !SIMDUTF_CAN_ALWAYS_RUN_WESTMERE && !SIMDUTF_CAN_ALWAYS_RUN_PPC64)
+#endif
+#define SIMDUTF_CAN_ALWAYS_RUN_FALLBACK SIMDUTF_IMPLEMENTATION_FALLBACK
+
+#if SIMDUTF_IMPLEMENTATION_FALLBACK
+
+namespace simdutf {
+/**
+ * Fallback implementation (runs on any machine).
+ */
+namespace fallback {
+} // namespace fallback
+} // namespace simdutf
+
+// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/fallback/implementation.h
+/* begin file src/simdutf/fallback/implementation.h */
+#ifndef SIMDUTF_FALLBACK_IMPLEMENTATION_H
+#define SIMDUTF_FALLBACK_IMPLEMENTATION_H
+
+
+namespace simdutf {
+namespace fallback {
+
+namespace {
+using namespace simdutf;
+}
+
+class implementation final : public simdutf::implementation {
+public:
+  simdutf_really_inline implementation() : simdutf::implementation(
+      "fallback",
+      "Generic fallback implementation",
+      0
+  ) {}
+  simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
+  simdutf_warn_unused bool validate_utf16(const char16_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused size_t convert_utf8_to_utf16(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf8_to_utf16(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t count_utf16(const char16_t * buf, size_t length) const noexcept;
+  simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
+};
+
+} // namespace fallback
+} // namespace simdutf
+
+#endif // SIMDUTF_FALLBACK_IMPLEMENTATION_H
+/* end file src/simdutf/fallback/implementation.h */
+
+// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/fallback/begin.h
+/* begin file src/simdutf/fallback/begin.h */
+// redefining SIMDUTF_IMPLEMENTATION to "fallback"
+// #define SIMDUTF_IMPLEMENTATION fallback
+/* end file src/simdutf/fallback/begin.h */
+
+// Declarations
+// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/fallback/bitmanipulation.h
+/* begin file src/simdutf/fallback/bitmanipulation.h */
+#ifndef SIMDUTF_FALLBACK_BITMANIPULATION_H
+#define SIMDUTF_FALLBACK_BITMANIPULATION_H
+
+#include <limits>
+
+namespace simdutf {
+namespace fallback {
+namespace {
+
+#if defined(_MSC_VER) && !defined(_M_ARM64) && !defined(_M_X64)
+static inline unsigned char _BitScanForward64(unsigned long* ret, uint64_t x) {
+  unsigned long x0 = (unsigned long)x, top, bottom;
+  _BitScanForward(&top, (unsigned long)(x >> 32));
+  _BitScanForward(&bottom, x0);
+  *ret = x0 ? bottom : 32 + top;
+  return x != 0;
+}
+static unsigned char _BitScanReverse64(unsigned long* ret, uint64_t x) {
+  unsigned long x1 = (unsigned long)(x >> 32), top, bottom;
+  _BitScanReverse(&top, x1);
+  _BitScanReverse(&bottom, (unsigned long)x);
+  *ret = x1 ? top + 32 : bottom;
+  return x != 0;
+}
+#endif
+
+/* result might be undefined when input_num is zero */
+simdutf_really_inline int leading_zeroes(uint64_t input_num) {
+#ifdef _MSC_VER
+  unsigned long leading_zero = 0;
+  // Search the mask data from most significant bit (MSB)
+  // to least significant bit (LSB) for a set bit (1).
+  if (_BitScanReverse64(&leading_zero, input_num))
+    return (int)(63 - leading_zero);
+  else
+    return 64;
+#else
+  return __builtin_clzll(input_num);
+#endif// _MSC_VER
+}
+
+} // unnamed namespace
+} // namespace fallback
+} // namespace simdutf
+
+#endif // SIMDUTF_FALLBACK_BITMANIPULATION_H
+/* end file src/simdutf/fallback/bitmanipulation.h */
+
+// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/fallback/end.h
+/* begin file src/simdutf/fallback/end.h */
+/* end file src/simdutf/fallback/end.h */
+
+#endif // SIMDUTF_IMPLEMENTATION_FALLBACK
+#endif // SIMDUTF_FALLBACK_H
+/* end file src/simdutf/fallback.h */
+
+
+namespace simdutf {
+bool implementation::supported_by_runtime_system() const {
+  uint32_t required_instruction_sets = this->required_instruction_sets();
+  uint32_t supported_instruction_sets = internal::detect_supported_architectures();
+  return ((supported_instruction_sets & required_instruction_sets) == required_instruction_sets);
+}
+
+simdutf_warn_unused encoding_type implementation::autodetect_encoding(const char * input, size_t length) const noexcept {
+  // If there is a BOM, then we trust it.
+  auto bom_encoding = simdutf::BOM::check_bom(input, length);
+  if(bom_encoding != encoding_type::unspecified) { return bom_encoding; }
+  // UTF8 is common, it includes ASCII, and is commonly represented
+  // without a BOM, so if it fits, go with that. Note that it is still
+  // possible to get it wrong, we are only 'guessing'. If someone has UTF-16
+  // data without a BOM, it could pass as UTF-8.
+  //
+  // An interesting twist might be to check for UTF-16 ASCII first (every
+  // other byte is zero).
+  if(validate_utf8(input, length)) { return encoding_type::UTF8; }
+  // The next most common encoding that might appear without BOM is probably
+  // UTF-16LE, so try that next.
+  if((length % 2) == 0) {
+    if(validate_utf16(reinterpret_cast<const char16_t*>(input), length)) { return encoding_type::UTF16_LE; }
+  }
+  return encoding_type::unspecified;
+}
+
+namespace internal {
+
+// Static array of known implementations. We're hoping these get baked into the executable
+// without requiring a static initializer.
+ +#if SIMDUTF_IMPLEMENTATION_HASWELL +const haswell::implementation haswell_singleton{}; +#endif +#if SIMDUTF_IMPLEMENTATION_WESTMERE +const westmere::implementation westmere_singleton{}; +#endif // SIMDUTF_IMPLEMENTATION_WESTMERE +#if SIMDUTF_IMPLEMENTATION_ARM64 +const arm64::implementation arm64_singleton{}; +#endif // SIMDUTF_IMPLEMENTATION_ARM64 +#if SIMDUTF_IMPLEMENTATION_PPC64 +const ppc64::implementation ppc64_singleton{}; +#endif // SIMDUTF_IMPLEMENTATION_PPC64 +#if SIMDUTF_IMPLEMENTATION_FALLBACK +const fallback::implementation fallback_singleton{}; +#endif // SIMDUTF_IMPLEMENTATION_FALLBACK + +/** + * @private Detects best supported implementation on first use, and sets it + */ +class detect_best_supported_implementation_on_first_use final : public implementation { +public: + const std::string &name() const noexcept final { return set_best()->name(); } + const std::string &description() const noexcept final { return set_best()->description(); } + uint32_t required_instruction_sets() const noexcept final { return set_best()->required_instruction_sets(); } + + simdutf_warn_unused bool validate_utf8(const char * buf, size_t len) const noexcept final override { + return set_best()->validate_utf8(buf, len); + } + + simdutf_warn_unused bool validate_utf16(const char16_t * buf, size_t len) const noexcept final override { + return set_best()->validate_utf16(buf, len); + } + + simdutf_warn_unused size_t convert_utf8_to_utf16(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override { + return set_best()->convert_utf8_to_utf16(buf, len, utf16_output); + } + + simdutf_warn_unused size_t convert_valid_utf8_to_utf16(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override { + return set_best()->convert_valid_utf8_to_utf16(buf, len, utf16_output); + } + + simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override { + return set_best()->convert_utf16_to_utf8(buf, len, utf8_output); + } + + simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override { + return set_best()->convert_valid_utf16_to_utf8(buf, len, utf8_output); + } + + simdutf_warn_unused size_t count_utf16(const char16_t * buf, size_t len) const noexcept final override { + return set_best()->count_utf16(buf, len); + } + + simdutf_warn_unused size_t count_utf8(const char * buf, size_t len) const noexcept final override { + return set_best()->count_utf8(buf, len); + } + + simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t * buf, size_t len) const noexcept override { + return set_best()->utf8_length_from_utf16(buf, len); + } + + simdutf_warn_unused size_t utf16_length_from_utf8(const char * buf, size_t len) const noexcept override { + return set_best()->utf16_length_from_utf8(buf, len); + } + + simdutf_really_inline detect_best_supported_implementation_on_first_use() noexcept : implementation("best_supported_detector", "Detects the best supported implementation and sets it", 0) {} + +private: + const implementation *set_best() const noexcept; +}; + +const detect_best_supported_implementation_on_first_use detect_best_supported_implementation_on_first_use_singleton; + +const std::initializer_list available_implementation_pointers { +#if SIMDUTF_IMPLEMENTATION_HASWELL + &haswell_singleton, +#endif +#if SIMDUTF_IMPLEMENTATION_WESTMERE + &westmere_singleton, +#endif +#if SIMDUTF_IMPLEMENTATION_ARM64 + &arm64_singleton, 
+#endif +#if SIMDUTF_IMPLEMENTATION_PPC64 + &ppc64_singleton, +#endif +#if SIMDUTF_IMPLEMENTATION_FALLBACK + &fallback_singleton, +#endif +}; // available_implementation_pointers + +// So we can return UNSUPPORTED_ARCHITECTURE from the parser when there is no support +class unsupported_implementation final : public implementation { +public: + simdutf_warn_unused bool validate_utf8(const char *, size_t) const noexcept final override { + return false; // Just refuse to validate. Given that we have a fallback implementation + // it seems unlikely that unsupported_implementation will ever be used. If it is used, + // then it will flag all strings as invalid. The alternative is to return an error_code + // from which the user has to figure out whether the string is valid UTF-8... which seems + // like a lot of work just to handle the very unlikely case that we have an unsupported + // implementation. And, when it does happen (that we have an unsupported implementation), + // what are the chances that the programmer has a fallback? Given that *we* provide the + // fallback, it implies that the programmer would need a fallback for our fallback. + } + + simdutf_warn_unused bool validate_utf16(const char16_t*, size_t) const noexcept final override { + return false; + } + + simdutf_warn_unused size_t convert_utf8_to_utf16(const char*, size_t, char16_t*) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t convert_valid_utf8_to_utf16(const char*, size_t, char16_t*) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t*, size_t, char*) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t*, size_t, char*) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t count_utf16(const char16_t *, size_t) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t count_utf8(const char *, size_t) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t *, size_t) const noexcept override { + return 0; + } + + simdutf_warn_unused size_t utf16_length_from_utf8(const char *, size_t) const noexcept override { + return 0; + } + + unsupported_implementation() : implementation("unsupported", "Unsupported CPU (no detected SIMD instructions)", 0) {} +}; + +const unsupported_implementation unsupported_singleton{}; + +size_t available_implementation_list::size() const noexcept { + return internal::available_implementation_pointers.size(); +} +const implementation * const *available_implementation_list::begin() const noexcept { + return internal::available_implementation_pointers.begin(); +} +const implementation * const *available_implementation_list::end() const noexcept { + return internal::available_implementation_pointers.end(); +} +const implementation *available_implementation_list::detect_best_supported() const noexcept { + // They are prelisted in priority order, so we just go down the list + uint32_t supported_instruction_sets = internal::detect_supported_architectures(); + for (const implementation *impl : internal::available_implementation_pointers) { + uint32_t required_instruction_sets = impl->required_instruction_sets(); + if ((supported_instruction_sets & required_instruction_sets) == required_instruction_sets) { return impl; } + } + return &unsupported_singleton; // this should never happen? 
+} + +const implementation *detect_best_supported_implementation_on_first_use::set_best() const noexcept { + SIMDUTF_PUSH_DISABLE_WARNINGS + SIMDUTF_DISABLE_DEPRECATED_WARNING // Disable CRT_SECURE warning on MSVC: manually verified this is safe + char *force_implementation_name = getenv("SIMDUTF_FORCE_IMPLEMENTATION"); + SIMDUTF_POP_DISABLE_WARNINGS + + if (force_implementation_name) { + auto force_implementation = available_implementations[force_implementation_name]; + if (force_implementation) { + return active_implementation = force_implementation; + } else { + // Note: abort() and stderr usage within the library is forbidden. + return active_implementation = &unsupported_singleton; + } + } + return active_implementation = available_implementations.detect_best_supported(); +} + +} // namespace internal + +SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list available_implementations{}; +SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr active_implementation{&internal::detect_best_supported_implementation_on_first_use_singleton}; + +simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept { + return active_implementation->validate_utf8(buf, len); +} +simdutf_warn_unused size_t convert_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_output) noexcept { + return active_implementation->convert_utf8_to_utf16(input, length, utf16_output); +} +simdutf_warn_unused bool validate_utf16(const char16_t * buf, size_t len) noexcept { + return active_implementation->validate_utf16(buf, len); +} +simdutf_warn_unused size_t convert_valid_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_buffer) noexcept { + return active_implementation->convert_valid_utf8_to_utf16(input, length, utf16_buffer); +} +simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept { + return active_implementation->convert_utf16_to_utf8(buf, len, utf8_buffer); +} +simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept { + return active_implementation->convert_valid_utf16_to_utf8(buf, len, utf8_buffer); +} +simdutf_warn_unused size_t count_utf16(const char16_t * input, size_t length) noexcept { + return active_implementation->count_utf16(input, length); +} +simdutf_warn_unused size_t count_utf8(const char * input, size_t length) noexcept { + return active_implementation->count_utf8(input, length); +} +simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t * input, size_t length) noexcept { + return active_implementation->utf8_length_from_utf16(input, length); +} +simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) noexcept { + return active_implementation->utf16_length_from_utf8(input, length); +} +simdutf_warn_unused simdutf::encoding_type autodetect_encoding(const char * buf, size_t length) noexcept { + return active_implementation->autodetect_encoding(buf, length); +} + +const implementation * builtin_implementation() { + static const implementation * builtin_impl = available_implementations[STRINGIFY(SIMDUTF_BUILTIN_IMPLEMENTATION)]; + return builtin_impl; +} + + +} // namespace simdutf + +/* end file src/implementation.cpp */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=encoding_types.cpp +/* begin file src/encoding_types.cpp */ + +namespace simdutf { +std::string to_string(encoding_type bom) { + switch (bom) { + case UTF16_LE: return "UTF16 little-endian"; + case UTF16_BE: 
return "UTF16 big-endian"; + case UTF32_LE: return "UTF32 little-endian"; + case UTF32_BE: return "UTF32 big-endian"; + case UTF8: return "UTF8"; + case unspecified: return "unknown"; + default: return "error"; + } +} + +namespace BOM { +// Note that BOM for UTF8 is discouraged. +encoding_type check_bom(const uint8_t* byte, size_t length) { + if (length >= 2 && byte[0] == 0xff and byte[1] == 0xfe) { + if (length >= 4 && byte[2] == 0x00 and byte[3] == 0x0) { + return encoding_type::UTF32_LE; + } else { + return encoding_type::UTF16_LE; + } + } else if (length >= 2 && byte[0] == 0xfe and byte[1] == 0xff) { + return encoding_type::UTF16_BE; + } else if (length >= 4 && byte[0] == 0x00 and byte[1] == 0x00 and byte[2] == 0xfe and byte[3] == 0xff) { + return encoding_type::UTF32_BE; + } else if (length >= 4 && byte[0] == 0xef and byte[1] == 0xbb and byte[3] == 0xbf) { + return encoding_type::UTF8; + } + return encoding_type::unspecified; + } + +encoding_type check_bom(const char* byte, size_t length) { + return check_bom(reinterpret_cast(byte), length); + } + + size_t bom_byte_size(encoding_type bom) { + switch (bom) { + case UTF16_LE: return 2; + case UTF16_BE: return 2; + case UTF32_LE: return 4; + case UTF32_BE: return 4; + case UTF8: return 3; + case unspecified: return 0; + default: return 0; + } +} + +} +} +/* end file src/encoding_types.cpp */ +// The large tables should be included once and they +// should not depend on a kernel. +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=tables/utf8_to_utf16_tables.h +/* begin file src/tables/utf8_to_utf16_tables.h */ +#ifndef SIMDUTF_UTF8_TO_UTF16_TABLES_H +#define SIMDUTF_UTF8_TO_UTF16_TABLES_H +#include + +namespace simdutf { +namespace { +namespace tables { +namespace utf8_to_utf16 { +/** + * utf8bigindex uses about 8 kB + * shufutf8 uses about 3344 B + * + * So we use a bit over 11 kB. It would be + * easy to save about 4 kB by only + * storing the index in utf8bigindex, and + * deriving the consumed bytes otherwise. + * However, this may come at a significant (10% to 20%) + * performance penalty. 
+ */ + +const uint8_t shufutf8[209][16] = +{ {0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 5, 255, 0, 0, 0, 0}, + {0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 6, 5, 0, 0, 0, 0}, + {0, 255, 1, 255, 2, 255, 3, 255, 5, 4, 6, 255, 0, 0, 0, 0}, + {0, 255, 1, 255, 2, 255, 3, 255, 5, 4, 7, 6, 0, 0, 0, 0}, + {0, 255, 1, 255, 2, 255, 4, 3, 5, 255, 6, 255, 0, 0, 0, 0}, + {0, 255, 1, 255, 2, 255, 4, 3, 5, 255, 7, 6, 0, 0, 0, 0}, + {0, 255, 1, 255, 2, 255, 4, 3, 6, 5, 7, 255, 0, 0, 0, 0}, + {0, 255, 1, 255, 2, 255, 4, 3, 6, 5, 8, 7, 0, 0, 0, 0}, + {0, 255, 1, 255, 3, 2, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0}, + {0, 255, 1, 255, 3, 2, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0}, + {0, 255, 1, 255, 3, 2, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0}, + {0, 255, 1, 255, 3, 2, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0}, + {0, 255, 1, 255, 3, 2, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0}, + {0, 255, 1, 255, 3, 2, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0}, + {0, 255, 1, 255, 3, 2, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0}, + {0, 255, 1, 255, 3, 2, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0}, + {0, 255, 2, 1, 3, 255, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0}, + {0, 255, 2, 1, 3, 255, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0}, + {0, 255, 2, 1, 3, 255, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0}, + {0, 255, 2, 1, 3, 255, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0}, + {0, 255, 2, 1, 3, 255, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0}, + {0, 255, 2, 1, 3, 255, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0}, + {0, 255, 2, 1, 3, 255, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0}, + {0, 255, 2, 1, 3, 255, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0}, + {0, 255, 2, 1, 4, 3, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0}, + {0, 255, 2, 1, 4, 3, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0}, + {0, 255, 2, 1, 4, 3, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0}, + {0, 255, 2, 1, 4, 3, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0}, + {0, 255, 2, 1, 4, 3, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0}, + {0, 255, 2, 1, 4, 3, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0}, + {0, 255, 2, 1, 4, 3, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0}, + {0, 255, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0}, + {1, 0, 2, 255, 3, 255, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0}, + {1, 0, 2, 255, 3, 255, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0}, + {1, 0, 2, 255, 3, 255, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0}, + {1, 0, 2, 255, 3, 255, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0}, + {1, 0, 2, 255, 3, 255, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0}, + {1, 0, 2, 255, 3, 255, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0}, + {1, 0, 2, 255, 3, 255, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0}, + {1, 0, 2, 255, 3, 255, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0}, + {1, 0, 2, 255, 4, 3, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0}, + {1, 0, 2, 255, 4, 3, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0}, + {1, 0, 2, 255, 4, 3, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0}, + {1, 0, 2, 255, 4, 3, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0}, + {1, 0, 2, 255, 4, 3, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0}, + {1, 0, 2, 255, 4, 3, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0}, + {1, 0, 2, 255, 4, 3, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0}, + {1, 0, 2, 255, 4, 3, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0}, + {1, 0, 3, 2, 4, 255, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0}, + {1, 0, 3, 2, 4, 255, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0}, + {1, 0, 3, 2, 4, 255, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0}, + {1, 0, 3, 2, 4, 255, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0}, + {1, 0, 3, 2, 4, 255, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0}, + {1, 0, 3, 2, 4, 255, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0}, + {1, 0, 3, 2, 4, 255, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0}, + {1, 0, 3, 2, 4, 255, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0}, + {1, 0, 3, 2, 5, 4, 6, 255, 7, 255, 8, 255, 0, 0, 0, 0}, + {1, 0, 3, 2, 5, 4, 6, 255, 7, 255, 9, 8, 0, 0, 0, 0}, + {1, 0, 3, 2, 5, 4, 6, 255, 8, 7, 9, 255, 0, 0, 0, 0}, + {1, 0, 3, 2, 5, 4, 6, 255, 8, 7, 10, 9, 0, 0, 0, 0}, + {1, 0, 3, 
2, 5, 4, 7, 6, 8, 255, 9, 255, 0, 0, 0, 0}, + {1, 0, 3, 2, 5, 4, 7, 6, 8, 255, 10, 9, 0, 0, 0, 0}, + {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 255, 0, 0, 0, 0}, + {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 0, 0, 0, 0}, + {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255}, + {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255}, + {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255}, + {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255}, + {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255}, + {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255}, + {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255}, + {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255}, + {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255}, + {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 4, 255, 255, 255}, + {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 5, 4, 255, 255}, + {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 6, 5, 4, 255}, + {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 5, 255, 255, 255}, + {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 6, 5, 255, 255}, + {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 7, 6, 5, 255}, + {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 6, 255, 255, 255}, + {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 7, 6, 255, 255}, + {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 8, 7, 6, 255}, + {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 5, 255, 255, 255}, + {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 6, 5, 255, 255}, + {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 7, 6, 5, 255}, + {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 6, 255, 255, 255}, + {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 7, 6, 255, 255}, + {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 8, 7, 6, 255}, + {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 7, 255, 255, 255}, + {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 8, 7, 255, 255}, + {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 9, 8, 7, 255}, + {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 4, 255, 255, 255}, + {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 5, 4, 255, 255}, + {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 6, 5, 4, 255}, + {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 5, 255, 255, 255}, + {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 6, 5, 255, 255}, + {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 7, 6, 5, 255}, + {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 6, 255, 255, 255}, + {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 7, 6, 255, 255}, + {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 8, 7, 6, 255}, + {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 5, 255, 255, 255}, + {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 6, 5, 255, 255}, + {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 7, 6, 5, 255}, + {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 6, 255, 255, 255}, + {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 7, 6, 255, 255}, + {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 8, 7, 6, 255}, + {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 7, 255, 255, 255}, + {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 8, 7, 255, 255}, + {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 9, 8, 7, 255}, + {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 6, 255, 255, 255}, + {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 7, 6, 255, 255}, + {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 8, 7, 6, 255}, + {1, 0, 
255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 7, 255, 255, 255}, + {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 8, 7, 255, 255}, + {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 9, 8, 7, 255}, + {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 8, 255, 255, 255}, + {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 9, 8, 255, 255}, + {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 10, 9, 8, 255}, + {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 5, 255, 255, 255}, + {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 6, 5, 255, 255}, + {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 7, 6, 5, 255}, + {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 6, 255, 255, 255}, + {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 7, 6, 255, 255}, + {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 8, 7, 6, 255}, + {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 7, 255, 255, 255}, + {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 8, 7, 255, 255}, + {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 9, 8, 7, 255}, + {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 6, 255, 255, 255}, + {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 7, 6, 255, 255}, + {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 8, 7, 6, 255}, + {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 7, 255, 255, 255}, + {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 8, 7, 255, 255}, + {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 9, 8, 7, 255}, + {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 8, 255, 255, 255}, + {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 9, 8, 255, 255}, + {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 10, 9, 8, 255}, + {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 7, 255, 255, 255}, + {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 8, 7, 255, 255}, + {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 9, 8, 7, 255}, + {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 8, 255, 255, 255}, + {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 9, 8, 255, 255}, + {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 10, 9, 8, 255}, + {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 9, 255, 255, 255}, + {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 10, 9, 255, 255}, + {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255}, + {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 1, 255, 255, 255, 5, 4, 3, 2, 0, 0, 0, 0}, + {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 2, 1, 255, 255, 6, 5, 4, 3, 0, 0, 0, 0}, + {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 3, 2, 1, 255, 7, 6, 5, 4, 0, 0, 0, 0}, + {0, 255, 255, 255, 4, 3, 2, 1, 5, 255, 255, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 4, 3, 2, 1, 6, 5, 255, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 4, 3, 2, 1, 7, 6, 5, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 4, 3, 2, 1, 8, 7, 6, 5, 0, 0, 0, 0}, + {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 2, 255, 255, 255, 6, 5, 4, 3, 0, 0, 0, 0}, + {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 3, 2, 255, 255, 
5, 4, 255, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 3, 2, 255, 255, 7, 6, 5, 4, 0, 0, 0, 0}, + {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 4, 3, 2, 255, 8, 7, 6, 5, 0, 0, 0, 0}, + {1, 0, 255, 255, 5, 4, 3, 2, 6, 255, 255, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 5, 4, 3, 2, 7, 6, 255, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 5, 4, 3, 2, 8, 7, 6, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 5, 4, 3, 2, 9, 8, 7, 6, 0, 0, 0, 0}, + {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 3, 255, 255, 255, 7, 6, 5, 4, 0, 0, 0, 0}, + {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 4, 3, 255, 255, 8, 7, 6, 5, 0, 0, 0, 0}, + {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 5, 4, 3, 255, 9, 8, 7, 6, 0, 0, 0, 0}, + {2, 1, 0, 255, 6, 5, 4, 3, 7, 255, 255, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 6, 5, 4, 3, 8, 7, 255, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 6, 5, 4, 3, 9, 8, 7, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 6, 5, 4, 3, 10, 9, 8, 7, 0, 0, 0, 0}, + {3, 2, 1, 0, 4, 255, 255, 255, 5, 255, 255, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 4, 255, 255, 255, 6, 5, 255, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 4, 255, 255, 255, 7, 6, 5, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 4, 255, 255, 255, 8, 7, 6, 5, 0, 0, 0, 0}, + {3, 2, 1, 0, 5, 4, 255, 255, 6, 255, 255, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 5, 4, 255, 255, 7, 6, 255, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 5, 4, 255, 255, 8, 7, 6, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 5, 4, 255, 255, 9, 8, 7, 6, 0, 0, 0, 0}, + {3, 2, 1, 0, 6, 5, 4, 255, 7, 255, 255, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 6, 5, 4, 255, 8, 7, 255, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 6, 5, 4, 255, 9, 8, 7, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 6, 5, 4, 255, 10, 9, 8, 7, 0, 0, 0, 0}, + {3, 2, 1, 0, 7, 6, 5, 4, 8, 255, 255, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 7, 6, 5, 4, 9, 8, 255, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 7, 6, 5, 4, 10, 9, 8, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 0, 0, 0, 0}}; +/* number of two bytes : 64 */ +/* number of two + three bytes : 145 */ +/* number of two + three + four bytes : 209 */ +const uint8_t utf8bigindex[4096][2] = +{ {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {0, 12}, + {0, 12}, + {147, 5}, + {0, 12}, + {150, 5}, + {162, 5}, + {65, 5}, + {0, 12}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {0, 12}, + {148, 6}, + {0, 12}, + {151, 6}, + {163, 6}, + {66, 6}, + {0, 12}, + {154, 6}, + {166, 6}, + {68, 6}, + {178, 6}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + {157, 6}, + {169, 6}, + {70, 6}, + {181, 6}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {152, 7}, + {164, 7}, + {145, 3}, + {0, 12}, + {155, 7}, + {167, 7}, + {69, 7}, + 
{179, 7}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {170, 7}, + {71, 7}, + {182, 7}, + {77, 7}, + {95, 7}, + {65, 5}, + {194, 7}, + {83, 7}, + {101, 7}, + {67, 5}, + {119, 7}, + {73, 5}, + {91, 5}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {185, 7}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {103, 7}, + {68, 6}, + {121, 7}, + {74, 6}, + {92, 6}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {76, 6}, + {94, 6}, + {4, 7}, + {193, 6}, + {82, 6}, + {100, 6}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {171, 8}, + {72, 8}, + {183, 8}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 8}, + {148, 6}, + {186, 8}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {104, 8}, + {68, 6}, + {122, 8}, + {74, 6}, + {92, 6}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {76, 6}, + {94, 6}, + {5, 8}, + {193, 6}, + {82, 6}, + {100, 6}, + {9, 8}, + {118, 6}, + {17, 8}, + {33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {77, 7}, + {95, 7}, + {6, 8}, + {194, 7}, + {83, 7}, + {101, 7}, + {10, 8}, + {119, 7}, + {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {103, 7}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, + {193, 6}, + {82, 6}, + {48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {160, 9}, + {172, 9}, + {147, 5}, + {184, 9}, + {150, 5}, + {162, 5}, + {65, 5}, + {196, 9}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {175, 9}, + {148, 6}, + {187, 9}, + {81, 9}, + {99, 9}, + {66, 6}, + {199, 9}, + {87, 9}, + {105, 9}, + {68, 6}, + {123, 9}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + {157, 6}, + {111, 9}, + {70, 6}, + {129, 9}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {190, 9}, + {152, 7}, + {164, 7}, + {145, 3}, + {202, 9}, + {89, 9}, + {107, 9}, + {69, 7}, + {125, 9}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {113, 9}, + {71, 7}, + {131, 9}, + {77, 7}, + {95, 7}, + {7, 9}, + {194, 7}, + {83, 7}, + {101, 7}, + {11, 9}, + {119, 7}, + {19, 9}, + {35, 9}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {137, 9}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {103, 7}, + {13, 9}, + {121, 7}, + {21, 9}, + {37, 9}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {25, 9}, + {41, 9}, + {4, 7}, + {193, 6}, + {82, 6}, + {49, 9}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 
12}, + {145, 3}, + {205, 9}, + {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {115, 9}, + {72, 8}, + {133, 9}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 8}, + {148, 6}, + {139, 9}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {104, 8}, + {14, 9}, + {122, 8}, + {22, 9}, + {38, 9}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {26, 9}, + {42, 9}, + {5, 8}, + {193, 6}, + {82, 6}, + {50, 9}, + {9, 8}, + {118, 6}, + {17, 8}, + {33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {28, 9}, + {44, 9}, + {6, 8}, + {194, 7}, + {83, 7}, + {52, 9}, + {10, 8}, + {119, 7}, + {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {56, 9}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, + {193, 6}, + {82, 6}, + {48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {0, 12}, + {0, 12}, + {147, 5}, + {0, 12}, + {150, 5}, + {162, 5}, + {65, 5}, + {0, 12}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {176, 10}, + {148, 6}, + {188, 10}, + {151, 6}, + {163, 6}, + {66, 6}, + {200, 10}, + {154, 6}, + {166, 6}, + {68, 6}, + {178, 6}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + {157, 6}, + {169, 6}, + {70, 6}, + {181, 6}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {191, 10}, + {152, 7}, + {164, 7}, + {145, 3}, + {203, 10}, + {90, 10}, + {108, 10}, + {69, 7}, + {126, 10}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {114, 10}, + {71, 7}, + {132, 10}, + {77, 7}, + {95, 7}, + {65, 5}, + {194, 7}, + {83, 7}, + {101, 7}, + {67, 5}, + {119, 7}, + {73, 5}, + {91, 5}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {138, 10}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {103, 7}, + {68, 6}, + {121, 7}, + {74, 6}, + {92, 6}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {76, 6}, + {94, 6}, + {4, 7}, + {193, 6}, + {82, 6}, + {100, 6}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {206, 10}, + {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {116, 10}, + {72, 8}, + {134, 10}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 8}, + {148, 6}, + {140, 10}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {104, 8}, + {15, 10}, + {122, 8}, + {23, 10}, + {39, 10}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {27, 10}, + {43, 10}, + {5, 8}, + {193, 6}, + {82, 6}, + {51, 10}, + {9, 8}, + {118, 6}, + {17, 8}, + 
{33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {29, 10}, + {45, 10}, + {6, 8}, + {194, 7}, + {83, 7}, + {53, 10}, + {10, 8}, + {119, 7}, + {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {57, 10}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, + {193, 6}, + {82, 6}, + {48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {160, 9}, + {172, 9}, + {147, 5}, + {184, 9}, + {150, 5}, + {162, 5}, + {65, 5}, + {196, 9}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {175, 9}, + {148, 6}, + {142, 10}, + {81, 9}, + {99, 9}, + {66, 6}, + {199, 9}, + {87, 9}, + {105, 9}, + {68, 6}, + {123, 9}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + {157, 6}, + {111, 9}, + {70, 6}, + {129, 9}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {190, 9}, + {152, 7}, + {164, 7}, + {145, 3}, + {202, 9}, + {89, 9}, + {107, 9}, + {69, 7}, + {125, 9}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {113, 9}, + {71, 7}, + {131, 9}, + {30, 10}, + {46, 10}, + {7, 9}, + {194, 7}, + {83, 7}, + {54, 10}, + {11, 9}, + {119, 7}, + {19, 9}, + {35, 9}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {137, 9}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {58, 10}, + {13, 9}, + {121, 7}, + {21, 9}, + {37, 9}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {25, 9}, + {41, 9}, + {4, 7}, + {193, 6}, + {82, 6}, + {49, 9}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {205, 9}, + {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {115, 9}, + {72, 8}, + {133, 9}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 8}, + {148, 6}, + {139, 9}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {60, 10}, + {14, 9}, + {122, 8}, + {22, 9}, + {38, 9}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {26, 9}, + {42, 9}, + {5, 8}, + {193, 6}, + {82, 6}, + {50, 9}, + {9, 8}, + {118, 6}, + {17, 8}, + {33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {28, 9}, + {44, 9}, + {6, 8}, + {194, 7}, + {83, 7}, + {52, 9}, + {10, 8}, + {119, 7}, + {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {56, 9}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, 
+ {193, 6}, + {82, 6}, + {48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {0, 12}, + {0, 12}, + {147, 5}, + {0, 12}, + {150, 5}, + {162, 5}, + {65, 5}, + {0, 12}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {0, 12}, + {148, 6}, + {0, 12}, + {151, 6}, + {163, 6}, + {66, 6}, + {0, 12}, + {154, 6}, + {166, 6}, + {68, 6}, + {178, 6}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + {157, 6}, + {169, 6}, + {70, 6}, + {181, 6}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {192, 11}, + {152, 7}, + {164, 7}, + {145, 3}, + {204, 11}, + {155, 7}, + {167, 7}, + {69, 7}, + {179, 7}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {170, 7}, + {71, 7}, + {182, 7}, + {77, 7}, + {95, 7}, + {65, 5}, + {194, 7}, + {83, 7}, + {101, 7}, + {67, 5}, + {119, 7}, + {73, 5}, + {91, 5}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {185, 7}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {103, 7}, + {68, 6}, + {121, 7}, + {74, 6}, + {92, 6}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {76, 6}, + {94, 6}, + {4, 7}, + {193, 6}, + {82, 6}, + {100, 6}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {207, 11}, + {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {117, 11}, + {72, 8}, + {135, 11}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 8}, + {148, 6}, + {141, 11}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {104, 8}, + {68, 6}, + {122, 8}, + {74, 6}, + {92, 6}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {76, 6}, + {94, 6}, + {5, 8}, + {193, 6}, + {82, 6}, + {100, 6}, + {9, 8}, + {118, 6}, + {17, 8}, + {33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {77, 7}, + {95, 7}, + {6, 8}, + {194, 7}, + {83, 7}, + {101, 7}, + {10, 8}, + {119, 7}, + {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {103, 7}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, + {193, 6}, + {82, 6}, + {48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {160, 9}, + {172, 9}, + {147, 5}, + {184, 9}, + {150, 5}, + {162, 5}, + {65, 5}, + {196, 9}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {175, 9}, + {148, 6}, + {143, 11}, + {81, 9}, + {99, 9}, + {66, 6}, + {199, 9}, + {87, 9}, + {105, 9}, + {68, 6}, + {123, 9}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + 
{157, 6}, + {111, 9}, + {70, 6}, + {129, 9}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {190, 9}, + {152, 7}, + {164, 7}, + {145, 3}, + {202, 9}, + {89, 9}, + {107, 9}, + {69, 7}, + {125, 9}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {113, 9}, + {71, 7}, + {131, 9}, + {31, 11}, + {47, 11}, + {7, 9}, + {194, 7}, + {83, 7}, + {55, 11}, + {11, 9}, + {119, 7}, + {19, 9}, + {35, 9}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {137, 9}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {59, 11}, + {13, 9}, + {121, 7}, + {21, 9}, + {37, 9}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {25, 9}, + {41, 9}, + {4, 7}, + {193, 6}, + {82, 6}, + {49, 9}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {205, 9}, + {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {115, 9}, + {72, 8}, + {133, 9}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 8}, + {148, 6}, + {139, 9}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {61, 11}, + {14, 9}, + {122, 8}, + {22, 9}, + {38, 9}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {26, 9}, + {42, 9}, + {5, 8}, + {193, 6}, + {82, 6}, + {50, 9}, + {9, 8}, + {118, 6}, + {17, 8}, + {33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {28, 9}, + {44, 9}, + {6, 8}, + {194, 7}, + {83, 7}, + {52, 9}, + {10, 8}, + {119, 7}, + {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {56, 9}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, + {193, 6}, + {82, 6}, + {48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {0, 12}, + {0, 12}, + {147, 5}, + {0, 12}, + {150, 5}, + {162, 5}, + {65, 5}, + {0, 12}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {176, 10}, + {148, 6}, + {188, 10}, + {151, 6}, + {163, 6}, + {66, 6}, + {200, 10}, + {154, 6}, + {166, 6}, + {68, 6}, + {178, 6}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + {157, 6}, + {169, 6}, + {70, 6}, + {181, 6}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {191, 10}, + {152, 7}, + {164, 7}, + {145, 3}, + {203, 10}, + {90, 10}, + {108, 10}, + {69, 7}, + {126, 10}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {114, 10}, + {71, 7}, + {132, 10}, + {77, 7}, + {95, 7}, + {65, 5}, + {194, 7}, + {83, 7}, + {101, 7}, + {67, 5}, + {119, 7}, + {73, 5}, + {91, 5}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {138, 10}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, 
+ {103, 7}, + {68, 6}, + {121, 7}, + {74, 6}, + {92, 6}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {76, 6}, + {94, 6}, + {4, 7}, + {193, 6}, + {82, 6}, + {100, 6}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {206, 10}, + {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {116, 10}, + {72, 8}, + {134, 10}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 8}, + {148, 6}, + {140, 10}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {62, 11}, + {15, 10}, + {122, 8}, + {23, 10}, + {39, 10}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {27, 10}, + {43, 10}, + {5, 8}, + {193, 6}, + {82, 6}, + {51, 10}, + {9, 8}, + {118, 6}, + {17, 8}, + {33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {29, 10}, + {45, 10}, + {6, 8}, + {194, 7}, + {83, 7}, + {53, 10}, + {10, 8}, + {119, 7}, + {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {57, 10}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, + {193, 6}, + {82, 6}, + {48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {160, 9}, + {172, 9}, + {147, 5}, + {184, 9}, + {150, 5}, + {162, 5}, + {65, 5}, + {196, 9}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {175, 9}, + {148, 6}, + {142, 10}, + {81, 9}, + {99, 9}, + {66, 6}, + {199, 9}, + {87, 9}, + {105, 9}, + {68, 6}, + {123, 9}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + {157, 6}, + {111, 9}, + {70, 6}, + {129, 9}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {190, 9}, + {152, 7}, + {164, 7}, + {145, 3}, + {202, 9}, + {89, 9}, + {107, 9}, + {69, 7}, + {125, 9}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {113, 9}, + {71, 7}, + {131, 9}, + {30, 10}, + {46, 10}, + {7, 9}, + {194, 7}, + {83, 7}, + {54, 10}, + {11, 9}, + {119, 7}, + {19, 9}, + {35, 9}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {137, 9}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {58, 10}, + {13, 9}, + {121, 7}, + {21, 9}, + {37, 9}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {25, 9}, + {41, 9}, + {4, 7}, + {193, 6}, + {82, 6}, + {49, 9}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {205, 9}, + {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {115, 9}, + {72, 8}, + {133, 9}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 
8}, + {148, 6}, + {139, 9}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {60, 10}, + {14, 9}, + {122, 8}, + {22, 9}, + {38, 9}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {26, 9}, + {42, 9}, + {5, 8}, + {193, 6}, + {82, 6}, + {50, 9}, + {9, 8}, + {118, 6}, + {17, 8}, + {33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {28, 9}, + {44, 9}, + {6, 8}, + {194, 7}, + {83, 7}, + {52, 9}, + {10, 8}, + {119, 7}, + {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {56, 9}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, + {193, 6}, + {82, 6}, + {48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {0, 12}, + {0, 12}, + {147, 5}, + {0, 12}, + {150, 5}, + {162, 5}, + {65, 5}, + {0, 12}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {0, 12}, + {148, 6}, + {0, 12}, + {151, 6}, + {163, 6}, + {66, 6}, + {0, 12}, + {154, 6}, + {166, 6}, + {68, 6}, + {178, 6}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + {157, 6}, + {169, 6}, + {70, 6}, + {181, 6}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {152, 7}, + {164, 7}, + {145, 3}, + {0, 12}, + {155, 7}, + {167, 7}, + {69, 7}, + {179, 7}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {170, 7}, + {71, 7}, + {182, 7}, + {77, 7}, + {95, 7}, + {65, 5}, + {194, 7}, + {83, 7}, + {101, 7}, + {67, 5}, + {119, 7}, + {73, 5}, + {91, 5}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {185, 7}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {103, 7}, + {68, 6}, + {121, 7}, + {74, 6}, + {92, 6}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {76, 6}, + {94, 6}, + {4, 7}, + {193, 6}, + {82, 6}, + {100, 6}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {208, 12}, + {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {171, 8}, + {72, 8}, + {183, 8}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 8}, + {148, 6}, + {186, 8}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {104, 8}, + {68, 6}, + {122, 8}, + {74, 6}, + {92, 6}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {76, 6}, + {94, 6}, + {5, 8}, + {193, 6}, + {82, 6}, + {100, 6}, + {9, 8}, + {118, 6}, + {17, 8}, + {33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {77, 7}, + {95, 7}, + {6, 8}, + {194, 7}, + {83, 7}, + {101, 7}, + {10, 8}, + {119, 7}, 
+ {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {103, 7}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, + {193, 6}, + {82, 6}, + {48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {160, 9}, + {172, 9}, + {147, 5}, + {184, 9}, + {150, 5}, + {162, 5}, + {65, 5}, + {196, 9}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {175, 9}, + {148, 6}, + {144, 12}, + {81, 9}, + {99, 9}, + {66, 6}, + {199, 9}, + {87, 9}, + {105, 9}, + {68, 6}, + {123, 9}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + {157, 6}, + {111, 9}, + {70, 6}, + {129, 9}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {190, 9}, + {152, 7}, + {164, 7}, + {145, 3}, + {202, 9}, + {89, 9}, + {107, 9}, + {69, 7}, + {125, 9}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {113, 9}, + {71, 7}, + {131, 9}, + {77, 7}, + {95, 7}, + {7, 9}, + {194, 7}, + {83, 7}, + {101, 7}, + {11, 9}, + {119, 7}, + {19, 9}, + {35, 9}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {137, 9}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {103, 7}, + {13, 9}, + {121, 7}, + {21, 9}, + {37, 9}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {25, 9}, + {41, 9}, + {4, 7}, + {193, 6}, + {82, 6}, + {49, 9}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {205, 9}, + {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {115, 9}, + {72, 8}, + {133, 9}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 8}, + {148, 6}, + {139, 9}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {104, 8}, + {14, 9}, + {122, 8}, + {22, 9}, + {38, 9}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {26, 9}, + {42, 9}, + {5, 8}, + {193, 6}, + {82, 6}, + {50, 9}, + {9, 8}, + {118, 6}, + {17, 8}, + {33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {28, 9}, + {44, 9}, + {6, 8}, + {194, 7}, + {83, 7}, + {52, 9}, + {10, 8}, + {119, 7}, + {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {56, 9}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, + {193, 6}, + {82, 6}, + {48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {0, 12}, + {0, 12}, + {147, 5}, + {0, 12}, + {150, 5}, + {162, 5}, + {65, 
5}, + {0, 12}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {176, 10}, + {148, 6}, + {188, 10}, + {151, 6}, + {163, 6}, + {66, 6}, + {200, 10}, + {154, 6}, + {166, 6}, + {68, 6}, + {178, 6}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + {157, 6}, + {169, 6}, + {70, 6}, + {181, 6}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {191, 10}, + {152, 7}, + {164, 7}, + {145, 3}, + {203, 10}, + {90, 10}, + {108, 10}, + {69, 7}, + {126, 10}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {114, 10}, + {71, 7}, + {132, 10}, + {77, 7}, + {95, 7}, + {65, 5}, + {194, 7}, + {83, 7}, + {101, 7}, + {67, 5}, + {119, 7}, + {73, 5}, + {91, 5}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {138, 10}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {103, 7}, + {68, 6}, + {121, 7}, + {74, 6}, + {92, 6}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {76, 6}, + {94, 6}, + {4, 7}, + {193, 6}, + {82, 6}, + {100, 6}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {206, 10}, + {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {116, 10}, + {72, 8}, + {134, 10}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 8}, + {148, 6}, + {140, 10}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {63, 12}, + {15, 10}, + {122, 8}, + {23, 10}, + {39, 10}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {27, 10}, + {43, 10}, + {5, 8}, + {193, 6}, + {82, 6}, + {51, 10}, + {9, 8}, + {118, 6}, + {17, 8}, + {33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {29, 10}, + {45, 10}, + {6, 8}, + {194, 7}, + {83, 7}, + {53, 10}, + {10, 8}, + {119, 7}, + {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {57, 10}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, + {193, 6}, + {82, 6}, + {48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {160, 9}, + {172, 9}, + {147, 5}, + {184, 9}, + {150, 5}, + {162, 5}, + {65, 5}, + {196, 9}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {175, 9}, + {148, 6}, + {142, 10}, + {81, 9}, + {99, 9}, + {66, 6}, + {199, 9}, + {87, 9}, + {105, 9}, + {68, 6}, + {123, 9}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + {157, 6}, + {111, 9}, + {70, 6}, + {129, 9}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {190, 9}, + {152, 7}, + {164, 7}, + {145, 3}, + {202, 9}, + {89, 9}, + {107, 9}, + {69, 7}, + {125, 9}, + {75, 7}, + {93, 
7}, + {64, 4}, + {0, 12}, + {158, 7}, + {113, 9}, + {71, 7}, + {131, 9}, + {30, 10}, + {46, 10}, + {7, 9}, + {194, 7}, + {83, 7}, + {54, 10}, + {11, 9}, + {119, 7}, + {19, 9}, + {35, 9}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {137, 9}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {58, 10}, + {13, 9}, + {121, 7}, + {21, 9}, + {37, 9}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {25, 9}, + {41, 9}, + {4, 7}, + {193, 6}, + {82, 6}, + {49, 9}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {205, 9}, + {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {115, 9}, + {72, 8}, + {133, 9}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 8}, + {148, 6}, + {139, 9}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {60, 10}, + {14, 9}, + {122, 8}, + {22, 9}, + {38, 9}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {26, 9}, + {42, 9}, + {5, 8}, + {193, 6}, + {82, 6}, + {50, 9}, + {9, 8}, + {118, 6}, + {17, 8}, + {33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {28, 9}, + {44, 9}, + {6, 8}, + {194, 7}, + {83, 7}, + {52, 9}, + {10, 8}, + {119, 7}, + {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {56, 9}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, + {193, 6}, + {82, 6}, + {48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {0, 12}, + {0, 12}, + {147, 5}, + {0, 12}, + {150, 5}, + {162, 5}, + {65, 5}, + {0, 12}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {0, 12}, + {148, 6}, + {0, 12}, + {151, 6}, + {163, 6}, + {66, 6}, + {0, 12}, + {154, 6}, + {166, 6}, + {68, 6}, + {178, 6}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + {157, 6}, + {169, 6}, + {70, 6}, + {181, 6}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {192, 11}, + {152, 7}, + {164, 7}, + {145, 3}, + {204, 11}, + {155, 7}, + {167, 7}, + {69, 7}, + {179, 7}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {170, 7}, + {71, 7}, + {182, 7}, + {77, 7}, + {95, 7}, + {65, 5}, + {194, 7}, + {83, 7}, + {101, 7}, + {67, 5}, + {119, 7}, + {73, 5}, + {91, 5}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {185, 7}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {103, 7}, + {68, 6}, + {121, 7}, + {74, 6}, + {92, 6}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {76, 6}, + {94, 6}, + {4, 7}, + {193, 6}, + {82, 6}, + {100, 6}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {207, 11}, 
+ {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {117, 11}, + {72, 8}, + {135, 11}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 8}, + {148, 6}, + {141, 11}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {104, 8}, + {68, 6}, + {122, 8}, + {74, 6}, + {92, 6}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {76, 6}, + {94, 6}, + {5, 8}, + {193, 6}, + {82, 6}, + {100, 6}, + {9, 8}, + {118, 6}, + {17, 8}, + {33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {77, 7}, + {95, 7}, + {6, 8}, + {194, 7}, + {83, 7}, + {101, 7}, + {10, 8}, + {119, 7}, + {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {103, 7}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, + {193, 6}, + {82, 6}, + {48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {160, 9}, + {172, 9}, + {147, 5}, + {184, 9}, + {150, 5}, + {162, 5}, + {65, 5}, + {196, 9}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {175, 9}, + {148, 6}, + {143, 11}, + {81, 9}, + {99, 9}, + {66, 6}, + {199, 9}, + {87, 9}, + {105, 9}, + {68, 6}, + {123, 9}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + {157, 6}, + {111, 9}, + {70, 6}, + {129, 9}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {190, 9}, + {152, 7}, + {164, 7}, + {145, 3}, + {202, 9}, + {89, 9}, + {107, 9}, + {69, 7}, + {125, 9}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {113, 9}, + {71, 7}, + {131, 9}, + {31, 11}, + {47, 11}, + {7, 9}, + {194, 7}, + {83, 7}, + {55, 11}, + {11, 9}, + {119, 7}, + {19, 9}, + {35, 9}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {137, 9}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {59, 11}, + {13, 9}, + {121, 7}, + {21, 9}, + {37, 9}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {25, 9}, + {41, 9}, + {4, 7}, + {193, 6}, + {82, 6}, + {49, 9}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {205, 9}, + {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {115, 9}, + {72, 8}, + {133, 9}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 8}, + {148, 6}, + {139, 9}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {61, 11}, + {14, 9}, + {122, 8}, + {22, 9}, + {38, 9}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {26, 9}, + {42, 9}, + {5, 8}, + {193, 6}, + {82, 6}, + {50, 9}, + {9, 8}, + {118, 6}, + {17, 8}, + {33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + 
{0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {28, 9}, + {44, 9}, + {6, 8}, + {194, 7}, + {83, 7}, + {52, 9}, + {10, 8}, + {119, 7}, + {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {56, 9}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, + {193, 6}, + {82, 6}, + {48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {0, 12}, + {0, 12}, + {147, 5}, + {0, 12}, + {150, 5}, + {162, 5}, + {65, 5}, + {0, 12}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {176, 10}, + {148, 6}, + {188, 10}, + {151, 6}, + {163, 6}, + {66, 6}, + {200, 10}, + {154, 6}, + {166, 6}, + {68, 6}, + {178, 6}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + {157, 6}, + {169, 6}, + {70, 6}, + {181, 6}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {191, 10}, + {152, 7}, + {164, 7}, + {145, 3}, + {203, 10}, + {90, 10}, + {108, 10}, + {69, 7}, + {126, 10}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {114, 10}, + {71, 7}, + {132, 10}, + {77, 7}, + {95, 7}, + {65, 5}, + {194, 7}, + {83, 7}, + {101, 7}, + {67, 5}, + {119, 7}, + {73, 5}, + {91, 5}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {138, 10}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {103, 7}, + {68, 6}, + {121, 7}, + {74, 6}, + {92, 6}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {76, 6}, + {94, 6}, + {4, 7}, + {193, 6}, + {82, 6}, + {100, 6}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {206, 10}, + {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {116, 10}, + {72, 8}, + {134, 10}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 8}, + {148, 6}, + {140, 10}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {62, 11}, + {15, 10}, + {122, 8}, + {23, 10}, + {39, 10}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {27, 10}, + {43, 10}, + {5, 8}, + {193, 6}, + {82, 6}, + {51, 10}, + {9, 8}, + {118, 6}, + {17, 8}, + {33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {29, 10}, + {45, 10}, + {6, 8}, + {194, 7}, + {83, 7}, + {53, 10}, + {10, 8}, + {119, 7}, + {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {57, 10}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, + {193, 6}, + {82, 6}, + 
{48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {160, 9}, + {172, 9}, + {147, 5}, + {184, 9}, + {150, 5}, + {162, 5}, + {65, 5}, + {196, 9}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {175, 9}, + {148, 6}, + {142, 10}, + {81, 9}, + {99, 9}, + {66, 6}, + {199, 9}, + {87, 9}, + {105, 9}, + {68, 6}, + {123, 9}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + {157, 6}, + {111, 9}, + {70, 6}, + {129, 9}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {190, 9}, + {152, 7}, + {164, 7}, + {145, 3}, + {202, 9}, + {89, 9}, + {107, 9}, + {69, 7}, + {125, 9}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {113, 9}, + {71, 7}, + {131, 9}, + {30, 10}, + {46, 10}, + {7, 9}, + {194, 7}, + {83, 7}, + {54, 10}, + {11, 9}, + {119, 7}, + {19, 9}, + {35, 9}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {137, 9}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {58, 10}, + {13, 9}, + {121, 7}, + {21, 9}, + {37, 9}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {25, 9}, + {41, 9}, + {4, 7}, + {193, 6}, + {82, 6}, + {49, 9}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {205, 9}, + {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {115, 9}, + {72, 8}, + {133, 9}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 8}, + {148, 6}, + {139, 9}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {60, 10}, + {14, 9}, + {122, 8}, + {22, 9}, + {38, 9}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {26, 9}, + {42, 9}, + {5, 8}, + {193, 6}, + {82, 6}, + {50, 9}, + {9, 8}, + {118, 6}, + {17, 8}, + {33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {28, 9}, + {44, 9}, + {6, 8}, + {194, 7}, + {83, 7}, + {52, 9}, + {10, 8}, + {119, 7}, + {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {56, 9}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, + {193, 6}, + {82, 6}, + {48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}}; +} // utf8_to_utf16 namespace +} // tables namespace +} // unnamed namespace +} // namespace simdutf + +#endif // SIMDUTF_UTF8_TO_UTF16_TABLES_H +/* end file src/tables/utf8_to_utf16_tables.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=tables/utf16_to_utf8_tables.h +/* begin file src/tables/utf16_to_utf8_tables.h */ +// file generated by scripts/sse_convert_utf16_to_utf8.py +#ifndef SIMDUTF_UTF16_TO_UTF8_TABLES_H +#define SIMDUTF_UTF16_TO_UTF8_TABLES_H + +namespace simdutf { +namespace { +namespace tables { +namespace utf16_to_utf8 { + + // 1 
byte for length, 16 bytes for mask + const uint8_t pack_1_2_utf8_bytes[256][17] = { + {16,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14}, + {15,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,0x80}, + {15,1,0,3,2,5,4,7,6,8,11,10,13,12,15,14,0x80}, + {14,0,3,2,5,4,7,6,8,11,10,13,12,15,14,0x80,0x80}, + {15,1,0,2,5,4,7,6,9,8,11,10,13,12,15,14,0x80}, + {14,0,2,5,4,7,6,9,8,11,10,13,12,15,14,0x80,0x80}, + {14,1,0,2,5,4,7,6,8,11,10,13,12,15,14,0x80,0x80}, + {13,0,2,5,4,7,6,8,11,10,13,12,15,14,0x80,0x80,0x80}, + {15,1,0,3,2,5,4,7,6,9,8,10,13,12,15,14,0x80}, + {14,0,3,2,5,4,7,6,9,8,10,13,12,15,14,0x80,0x80}, + {14,1,0,3,2,5,4,7,6,8,10,13,12,15,14,0x80,0x80}, + {13,0,3,2,5,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80}, + {14,1,0,2,5,4,7,6,9,8,10,13,12,15,14,0x80,0x80}, + {13,0,2,5,4,7,6,9,8,10,13,12,15,14,0x80,0x80,0x80}, + {13,1,0,2,5,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80}, + {12,0,2,5,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80}, + {15,1,0,3,2,4,7,6,9,8,11,10,13,12,15,14,0x80}, + {14,0,3,2,4,7,6,9,8,11,10,13,12,15,14,0x80,0x80}, + {14,1,0,3,2,4,7,6,8,11,10,13,12,15,14,0x80,0x80}, + {13,0,3,2,4,7,6,8,11,10,13,12,15,14,0x80,0x80,0x80}, + {14,1,0,2,4,7,6,9,8,11,10,13,12,15,14,0x80,0x80}, + {13,0,2,4,7,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80}, + {13,1,0,2,4,7,6,8,11,10,13,12,15,14,0x80,0x80,0x80}, + {12,0,2,4,7,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80}, + {14,1,0,3,2,4,7,6,9,8,10,13,12,15,14,0x80,0x80}, + {13,0,3,2,4,7,6,9,8,10,13,12,15,14,0x80,0x80,0x80}, + {13,1,0,3,2,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80}, + {12,0,3,2,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80}, + {13,1,0,2,4,7,6,9,8,10,13,12,15,14,0x80,0x80,0x80}, + {12,0,2,4,7,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80}, + {12,1,0,2,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80}, + {11,0,2,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {15,1,0,3,2,5,4,7,6,9,8,11,10,12,15,14,0x80}, + {14,0,3,2,5,4,7,6,9,8,11,10,12,15,14,0x80,0x80}, + {14,1,0,3,2,5,4,7,6,8,11,10,12,15,14,0x80,0x80}, + {13,0,3,2,5,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80}, + {14,1,0,2,5,4,7,6,9,8,11,10,12,15,14,0x80,0x80}, + {13,0,2,5,4,7,6,9,8,11,10,12,15,14,0x80,0x80,0x80}, + {13,1,0,2,5,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80}, + {12,0,2,5,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80}, + {14,1,0,3,2,5,4,7,6,9,8,10,12,15,14,0x80,0x80}, + {13,0,3,2,5,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80}, + {13,1,0,3,2,5,4,7,6,8,10,12,15,14,0x80,0x80,0x80}, + {12,0,3,2,5,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80}, + {13,1,0,2,5,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80}, + {12,0,2,5,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80}, + {12,1,0,2,5,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80}, + {11,0,2,5,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {14,1,0,3,2,4,7,6,9,8,11,10,12,15,14,0x80,0x80}, + {13,0,3,2,4,7,6,9,8,11,10,12,15,14,0x80,0x80,0x80}, + {13,1,0,3,2,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80}, + {12,0,3,2,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80}, + {13,1,0,2,4,7,6,9,8,11,10,12,15,14,0x80,0x80,0x80}, + {12,0,2,4,7,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80}, + {12,1,0,2,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80}, + {11,0,2,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {13,1,0,3,2,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80}, + {12,0,3,2,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80}, + {12,1,0,3,2,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80}, + {11,0,3,2,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {12,1,0,2,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80}, + {11,0,2,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,2,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,2,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80}, + 
{15,1,0,3,2,5,4,6,9,8,11,10,13,12,15,14,0x80}, + {14,0,3,2,5,4,6,9,8,11,10,13,12,15,14,0x80,0x80}, + {14,1,0,3,2,5,4,6,8,11,10,13,12,15,14,0x80,0x80}, + {13,0,3,2,5,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80}, + {14,1,0,2,5,4,6,9,8,11,10,13,12,15,14,0x80,0x80}, + {13,0,2,5,4,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80}, + {13,1,0,2,5,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80}, + {12,0,2,5,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80}, + {14,1,0,3,2,5,4,6,9,8,10,13,12,15,14,0x80,0x80}, + {13,0,3,2,5,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80}, + {13,1,0,3,2,5,4,6,8,10,13,12,15,14,0x80,0x80,0x80}, + {12,0,3,2,5,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80}, + {13,1,0,2,5,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80}, + {12,0,2,5,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80}, + {12,1,0,2,5,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80}, + {11,0,2,5,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {14,1,0,3,2,4,6,9,8,11,10,13,12,15,14,0x80,0x80}, + {13,0,3,2,4,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80}, + {13,1,0,3,2,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80}, + {12,0,3,2,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80}, + {13,1,0,2,4,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80}, + {12,0,2,4,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80}, + {12,1,0,2,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80}, + {11,0,2,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {13,1,0,3,2,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80}, + {12,0,3,2,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80}, + {12,1,0,3,2,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80}, + {11,0,3,2,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {12,1,0,2,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80}, + {11,0,2,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,2,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,2,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {14,1,0,3,2,5,4,6,9,8,11,10,12,15,14,0x80,0x80}, + {13,0,3,2,5,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80}, + {13,1,0,3,2,5,4,6,8,11,10,12,15,14,0x80,0x80,0x80}, + {12,0,3,2,5,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80}, + {13,1,0,2,5,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80}, + {12,0,2,5,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80}, + {12,1,0,2,5,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80}, + {11,0,2,5,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {13,1,0,3,2,5,4,6,9,8,10,12,15,14,0x80,0x80,0x80}, + {12,0,3,2,5,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80}, + {12,1,0,3,2,5,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80}, + {11,0,3,2,5,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {12,1,0,2,5,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80}, + {11,0,2,5,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,2,5,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,2,5,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {13,1,0,3,2,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80}, + {12,0,3,2,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80}, + {12,1,0,3,2,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80}, + {11,0,3,2,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {12,1,0,2,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80}, + {11,0,2,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,2,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,2,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {12,1,0,3,2,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80}, + {11,0,3,2,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,3,2,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,3,2,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,2,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,2,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80}, + 
{10,1,0,2,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,0,2,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {15,1,0,3,2,5,4,7,6,9,8,11,10,13,12,14,0x80}, + {14,0,3,2,5,4,7,6,9,8,11,10,13,12,14,0x80,0x80}, + {14,1,0,3,2,5,4,7,6,8,11,10,13,12,14,0x80,0x80}, + {13,0,3,2,5,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80}, + {14,1,0,2,5,4,7,6,9,8,11,10,13,12,14,0x80,0x80}, + {13,0,2,5,4,7,6,9,8,11,10,13,12,14,0x80,0x80,0x80}, + {13,1,0,2,5,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80}, + {12,0,2,5,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80}, + {14,1,0,3,2,5,4,7,6,9,8,10,13,12,14,0x80,0x80}, + {13,0,3,2,5,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80}, + {13,1,0,3,2,5,4,7,6,8,10,13,12,14,0x80,0x80,0x80}, + {12,0,3,2,5,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80}, + {13,1,0,2,5,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80}, + {12,0,2,5,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80}, + {12,1,0,2,5,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80}, + {11,0,2,5,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, + {14,1,0,3,2,4,7,6,9,8,11,10,13,12,14,0x80,0x80}, + {13,0,3,2,4,7,6,9,8,11,10,13,12,14,0x80,0x80,0x80}, + {13,1,0,3,2,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80}, + {12,0,3,2,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80}, + {13,1,0,2,4,7,6,9,8,11,10,13,12,14,0x80,0x80,0x80}, + {12,0,2,4,7,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80}, + {12,1,0,2,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80}, + {11,0,2,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, + {13,1,0,3,2,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80}, + {12,0,3,2,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80}, + {12,1,0,3,2,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80}, + {11,0,3,2,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, + {12,1,0,2,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80}, + {11,0,2,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,2,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,2,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {14,1,0,3,2,5,4,7,6,9,8,11,10,12,14,0x80,0x80}, + {13,0,3,2,5,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80}, + {13,1,0,3,2,5,4,7,6,8,11,10,12,14,0x80,0x80,0x80}, + {12,0,3,2,5,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80}, + {13,1,0,2,5,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80}, + {12,0,2,5,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80}, + {12,1,0,2,5,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80}, + {11,0,2,5,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {13,1,0,3,2,5,4,7,6,9,8,10,12,14,0x80,0x80,0x80}, + {12,0,3,2,5,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80}, + {12,1,0,3,2,5,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80}, + {11,0,3,2,5,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {12,1,0,2,5,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80}, + {11,0,2,5,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,2,5,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,2,5,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {13,1,0,3,2,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80}, + {12,0,3,2,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80}, + {12,1,0,3,2,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80}, + {11,0,3,2,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {12,1,0,2,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80}, + {11,0,2,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,2,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,2,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {12,1,0,3,2,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80}, + {11,0,3,2,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,3,2,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,3,2,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,2,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80}, + 
{10,0,2,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {10,1,0,2,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,0,2,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {14,1,0,3,2,5,4,6,9,8,11,10,13,12,14,0x80,0x80}, + {13,0,3,2,5,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80}, + {13,1,0,3,2,5,4,6,8,11,10,13,12,14,0x80,0x80,0x80}, + {12,0,3,2,5,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80}, + {13,1,0,2,5,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80}, + {12,0,2,5,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80}, + {12,1,0,2,5,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80}, + {11,0,2,5,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, + {13,1,0,3,2,5,4,6,9,8,10,13,12,14,0x80,0x80,0x80}, + {12,0,3,2,5,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80}, + {12,1,0,3,2,5,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80}, + {11,0,3,2,5,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, + {12,1,0,2,5,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80}, + {11,0,2,5,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,2,5,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,2,5,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {13,1,0,3,2,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80}, + {12,0,3,2,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80}, + {12,1,0,3,2,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80}, + {11,0,3,2,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, + {12,1,0,2,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80}, + {11,0,2,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,2,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,2,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {12,1,0,3,2,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80}, + {11,0,3,2,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,3,2,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,3,2,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,2,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,2,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {10,1,0,2,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,0,2,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {13,1,0,3,2,5,4,6,9,8,11,10,12,14,0x80,0x80,0x80}, + {12,0,3,2,5,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80}, + {12,1,0,3,2,5,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80}, + {11,0,3,2,5,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {12,1,0,2,5,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80}, + {11,0,2,5,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,2,5,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,2,5,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {12,1,0,3,2,5,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80}, + {11,0,3,2,5,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,3,2,5,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,3,2,5,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,2,5,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,2,5,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {10,1,0,2,5,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,0,2,5,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {12,1,0,3,2,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80}, + {11,0,3,2,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,3,2,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,3,2,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,2,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,2,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {10,1,0,2,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,0,2,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,3,2,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80}, + 
{10,0,3,2,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {10,1,0,3,2,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,0,3,2,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {10,1,0,2,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,0,2,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,1,0,2,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,0,2,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80} + }; + + // 1 byte for length, 16 bytes for mask + const uint8_t pack_1_2_3_utf8_bytes[256][17] = { + {12,2,3,1,6,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80}, + {9,6,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {11,3,1,6,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80}, + {10,0,6,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,2,3,1,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,3,1,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,0,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {11,2,3,1,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80}, + {8,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {10,3,1,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,0,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {10,2,3,1,4,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,4,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,3,1,4,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,0,4,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,2,3,1,6,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,6,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,3,1,6,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,0,6,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,2,3,1,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,3,1,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,0,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,2,3,1,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,3,1,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,0,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,2,3,1,4,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,4,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,3,1,4,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,0,4,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {11,2,3,1,6,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80}, + {8,6,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {10,3,1,6,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,0,6,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,2,3,1,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,3,1,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,0,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {10,2,3,1,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,3,1,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,0,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + 
{9,2,3,1,4,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,4,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,3,1,4,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,0,4,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {10,2,3,1,6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,3,1,6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,0,6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,2,3,1,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,3,1,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,0,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,2,3,1,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,3,1,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,0,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,2,3,1,4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,3,1,4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,0,4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,2,3,1,6,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,6,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,3,1,6,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,0,6,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,2,3,1,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,3,1,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,0,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,2,3,1,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,3,1,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,0,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,2,3,1,4,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,4,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,3,1,4,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,0,4,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,2,3,1,6,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,6,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,3,1,6,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,0,6,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,2,3,1,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {0,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {2,3,1,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {1,0,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,2,3,1,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {2,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,3,1,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,0,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,2,3,1,4,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + 
{1,4,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,3,1,4,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {2,0,4,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,2,3,1,6,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,6,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,3,1,6,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,0,6,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,2,3,1,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {2,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,3,1,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,0,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,2,3,1,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,3,1,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,0,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,2,3,1,4,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,4,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,3,1,4,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,0,4,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,2,3,1,6,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,6,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,3,1,6,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,0,6,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,2,3,1,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {1,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,3,1,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {2,0,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,2,3,1,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,3,1,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,0,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,2,3,1,4,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {2,4,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,3,1,4,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,0,4,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {11,2,3,1,6,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80}, + {8,6,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {10,3,1,6,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,0,6,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,2,3,1,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,3,1,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,0,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {10,2,3,1,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,3,1,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,0,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,2,3,1,4,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,4,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + 
{8,3,1,4,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,0,4,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,2,3,1,6,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,6,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,3,1,6,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,0,6,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,2,3,1,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {2,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,3,1,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,0,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,2,3,1,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,3,1,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,0,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,2,3,1,4,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,4,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,3,1,4,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,0,4,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {10,2,3,1,6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,3,1,6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,0,6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,2,3,1,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,3,1,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,0,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,2,3,1,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,3,1,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,0,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,2,3,1,4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,3,1,4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,0,4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,2,3,1,6,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,6,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,3,1,6,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,0,6,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,2,3,1,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,3,1,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,0,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,2,3,1,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,3,1,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,0,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,2,3,1,4,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,4,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,3,1,4,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + 
{5,0,4,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {10,2,3,1,6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,3,1,6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,0,6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,2,3,1,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,3,1,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,0,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,2,3,1,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,3,1,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,0,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,2,3,1,4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,3,1,4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,0,4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,2,3,1,6,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,6,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,3,1,6,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,0,6,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,2,3,1,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {1,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,3,1,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {2,0,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,2,3,1,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,3,1,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,0,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,2,3,1,4,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {2,4,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,3,1,4,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,0,4,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,2,3,1,6,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,6,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,3,1,6,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,0,6,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,2,3,1,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,3,1,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,0,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,2,3,1,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,3,1,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,0,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,2,3,1,4,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,4,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,3,1,4,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,0,4,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + 
{8,2,3,1,6,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,6,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,3,1,6,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,0,6,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,2,3,1,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {2,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,3,1,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,0,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,2,3,1,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,3,1,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,0,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,2,3,1,4,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,4,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,3,1,4,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,0,4,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80} + }; + +} // utf16_to_utf8 namespace +} // tables namespace +} // unnamed namespace +} // namespace simdutf + +#endif // SIMDUTF_UTF16_TO_UTF8_TABLES_H +/* end file src/tables/utf16_to_utf8_tables.h */ +// End of tables. + +// The scalar routines should be included once. +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=scalar/utf16_to_utf8/valid_utf16_to_utf8.h +/* begin file src/scalar/utf16_to_utf8/valid_utf16_to_utf8.h */ +#ifndef SIMDUTF_VALID_UTF16_TO_UTF8_H +#define SIMDUTF_VALID_UTF16_TO_UTF8_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf16_to_utf8 { + +inline size_t convert_valid(const char16_t* buf, size_t len, char* utf8_output) { + const uint16_t *data = reinterpret_cast(buf); + size_t pos = 0; + char* start{utf8_output}; + while (pos < len) { + // try to convert the next block of 4 ASCII characters + if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that they are ascii + uint64_t v; + ::memcpy(&v, data + pos, sizeof(uint64_t)); + if ((v & 0xFF80FF80FF80FF80) == 0) { + size_t final_pos = pos + 4; + while(pos < final_pos) { + *utf8_output++ = char(buf[pos]); + pos++; + } + continue; + } + } + uint16_t word = data[pos]; + if((word & 0xFF80)==0) { + // will generate one UTF-8 bytes + *utf8_output++ = char(word); + pos++; + } else if((word & 0xF800)==0) { + // will generate two UTF-8 bytes + // we have 0b110XXXXX 0b10XXXXXX + *utf8_output++ = char((word>>6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos++; + } else if((word &0xF800 ) != 0xD800) { + // will generate three UTF-8 bytes + // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX + *utf8_output++ = char((word>>12) | 0b11100000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos++; + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + if(pos + 1 >= len) { return 0; } // minimal bound checking + uint16_t next_word = data[pos + 1]; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + uint32_t value = (diff << 10) + diff2 + 0x10000; + // will generate four UTF-8 bytes + // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX + *utf8_output++ = char((value>>18) | 0b11110000); + *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((value>>6) & 0b111111) | 
0b10000000); + *utf8_output++ = char((value & 0b111111) | 0b10000000); + pos += 2; + } + } + return utf8_output - start; +} + +} // utf8_to_utf16 namespace +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file src/scalar/utf16_to_utf8/valid_utf16_to_utf8.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=scalar/utf16_to_utf8/utf16_to_utf8.h +/* begin file src/scalar/utf16_to_utf8/utf16_to_utf8.h */ +#ifndef SIMDUTF_UTF16_TO_UTF8_H +#define SIMDUTF_UTF16_TO_UTF8_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf16_to_utf8 { + +inline size_t convert(const char16_t* buf, size_t len, char* utf8_output) { + const uint16_t *data = reinterpret_cast(buf); + size_t pos = 0; + char* start{utf8_output}; + while (pos < len) { + // try to convert the next block of 8 ASCII characters + if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that they are ascii + uint64_t v; + ::memcpy(&v, data + pos, sizeof(uint64_t)); + if ((v & 0xFF80FF80FF80FF80) == 0) { + size_t final_pos = pos + 4; + while(pos < final_pos) { + *utf8_output++ = char(buf[pos]); + pos++; + } + continue; + } + } + uint16_t word = data[pos]; + if((word & 0xFF80)==0) { + // will generate one UTF-8 bytes + *utf8_output++ = char(word); + pos++; + } else if((word & 0xF800)==0) { + // will generate two UTF-8 bytes + // we have 0b110XXXXX 0b10XXXXXX + *utf8_output++ = char((word>>6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos++; + } else if((word &0xF800 ) != 0xD800) { + // will generate three UTF-8 bytes + // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX + *utf8_output++ = char((word>>12) | 0b11100000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos++; + } else { + // must be a surrogate pair + if(pos + 1 >= len) { return 0; } + uint16_t diff = uint16_t(word - 0xD800); + if(diff > 0x3FF) { return 0; } + uint16_t next_word = data[pos + 1]; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if(diff2 > 0x3FF) { return 0; } + uint32_t value = (diff << 10) + diff2 + 0x10000; + // will generate four UTF-8 bytes + // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX + *utf8_output++ = char((value>>18) | 0b11110000); + *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((value & 0b111111) | 0b10000000); + pos += 2; + } + } + return utf8_output - start; +} + +} // utf8_to_utf16 namespace +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file src/scalar/utf16_to_utf8/utf16_to_utf8.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=scalar/utf8_to_utf16/valid_utf8_to_utf16.h +/* begin file src/scalar/utf8_to_utf16/valid_utf8_to_utf16.h */ +#ifndef SIMDUTF_VALID_UTF8_TO_UTF16_H +#define SIMDUTF_VALID_UTF8_TO_UTF16_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf8_to_utf16 { + +inline size_t convert_valid(const char* buf, size_t len, char16_t* utf16_output) { + const uint8_t *data = reinterpret_cast(buf); + size_t pos = 0; + char16_t* start{utf16_output}; + while (pos < len) { + // try to convert the next block of 8 ASCII bytes + if (pos + 8 <= len) { // if it is safe to read 8 more bytes, check that they are ascii + uint64_t v; + ::memcpy(&v, data + pos, sizeof(uint64_t)); + if ((v & 0x8080808080808080) == 0) { + size_t final_pos = pos + 8; + while(pos 
< final_pos) { + *utf16_output++ = char16_t(buf[pos]); + pos++; + } + continue; + } + } + uint8_t leading_byte = data[pos]; // leading byte + if (leading_byte < 0b10000000) { + // converting one ASCII byte !!! + *utf16_output++ = char16_t(leading_byte); + pos++; + } else if ((leading_byte & 0b11100000) == 0b11000000) { + // We have a two-byte UTF-8, it should become + // a single UTF-16 word. + if(pos + 1 >= len) { break; } // minimal bound checking + *utf16_output++ = char16_t(((leading_byte &0b00011111) << 6) | (data[pos + 1] &0b00111111)); + pos += 2; + } else if ((leading_byte & 0b11110000) == 0b11100000) { + // We have a three-byte UTF-8, it should become + // a single UTF-16 word. + if(pos + 2 >= len) { break; } // minimal bound checking + *utf16_output++ = char16_t(((leading_byte &0b00001111) << 12) | ((data[pos + 1] &0b00111111) << 6) | (data[pos + 2] &0b00111111)); + pos += 3; + } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000 + // we have a 4-byte UTF-8 word. + if(pos + 3 >= len) { break; } // minimal bound checking + uint32_t code_word = ((leading_byte & 0b00000111) << 18 )| ((data[pos + 1] &0b00111111) << 12) + | ((data[pos + 2] &0b00111111) << 6) | (data[pos + 3] &0b00111111); + code_word -= 0x10000; + *utf16_output++ = char16_t(0xD800 + (code_word >> 10)); + *utf16_output++ = char16_t(0xDC00 + (code_word & 0x3FF)); + pos += 4; + } else { + // we may have a continuation but we do not do error checking + return 0; + } + } + return utf16_output - start; +} + + +} // namespace utf8_to_utf16 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file src/scalar/utf8_to_utf16/valid_utf8_to_utf16.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=scalar/utf8_to_utf16/utf8_to_utf16.h +/* begin file src/scalar/utf8_to_utf16/utf8_to_utf16.h */ +#ifndef SIMDUTF_UTF8_TO_UTF16_H +#define SIMDUTF_UTF8_TO_UTF16_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf8_to_utf16 { + +inline size_t convert(const char* buf, size_t len, char16_t* utf16_output) { + const uint8_t *data = reinterpret_cast(buf); + size_t pos = 0; + char16_t* start{utf16_output}; + while (pos < len) { + // try to convert the next block of 16 ASCII bytes + if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii + uint64_t v1; + ::memcpy(&v1, data + pos, sizeof(uint64_t)); + uint64_t v2; + ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); + uint64_t v{v1 | v2}; + if ((v & 0x8080808080808080) == 0) { + size_t final_pos = pos + 16; + while(pos < final_pos) { + *utf16_output++ = char16_t(buf[pos]); + pos++; + } + continue; + } + } + uint8_t leading_byte = data[pos]; // leading byte + if (leading_byte < 0b10000000) { + // converting one ASCII byte !!! + *utf16_output++ = char16_t(leading_byte); + pos++; + } else if ((leading_byte & 0b11100000) == 0b11000000) { + // We have a two-byte UTF-8, it should become + // a single UTF-16 word. + if(pos + 1 >= len) { return 0; } // minimal bound checking + if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; } + // range check + uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); + if (code_point < 0x80 || 0x7ff < code_point) { return 0; } + *utf16_output++ = char16_t(code_point); + pos += 2; + } else if ((leading_byte & 0b11110000) == 0b11100000) { + // We have a three-byte UTF-8, it should become + // a single UTF-16 word. 
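For reference, the two- and three-byte branches of this scalar converter implement the standard UTF-8 bit arithmetic. A minimal standalone sketch of that arithmetic (hypothetical helper, illustrative only, not part of the vendored simdutf sources or of this patch):

#include <cstdint>
#include <cstddef>

// Decode one 2- or 3-byte UTF-8 sequence into a single UTF-16 code unit,
// using the same masks and shifts as the branches in the function above.
static char16_t decode_2_or_3(const uint8_t *p, size_t n) {
  if (n == 2)  // 110xxxxx 10xxxxxx
    return char16_t(((p[0] & 0x1F) << 6) | (p[1] & 0x3F));
  // 1110xxxx 10xxxxxx 10xxxxxx
  return char16_t(((p[0] & 0x0F) << 12) | ((p[1] & 0x3F) << 6) | (p[2] & 0x3F));
}

// Worked values: C3 A9 ("e with acute", U+00E9) -> 0x00E9,
//                E2 82 AC (euro sign, U+20AC)   -> 0x20AC.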
+ if(pos + 2 >= len) { return 0; } // minimal bound checking + + if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; } + if ((data[pos + 2] & 0b11000000) != 0b10000000) { return 0; } + // range check + uint32_t code_point = (leading_byte & 0b00001111) << 12 | + (data[pos + 1] & 0b00111111) << 6 | + (data[pos + 2] & 0b00111111); + if (code_point < 0x800 || 0xffff < code_point || + (0xd7ff < code_point && code_point < 0xe000)) { + return 0; + } + *utf16_output++ = char16_t(code_point); + pos += 3; + } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000 + // we have a 4-byte UTF-8 word. + if(pos + 3 >= len) { return 0; } // minimal bound checking + if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; } + if ((data[pos + 2] & 0b11000000) != 0b10000000) { return 0; } + if ((data[pos + 3] & 0b11000000) != 0b10000000) { return 0; } + + // range check + uint32_t code_point = + (leading_byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | + (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111); + if (code_point <= 0xffff || 0x10ffff < code_point) { return 0; } + code_point -= 0x10000; + *utf16_output++ = char16_t(0xD800 + (code_point >> 10)); + *utf16_output++ = char16_t(0xDC00 + (code_point & 0x3FF)); + pos += 4; + } else { + return 0; + } + } + return utf16_output - start; +} + +} // utf8_to_utf16 namespace +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file src/scalar/utf8_to_utf16/utf8_to_utf16.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=scalar/utf8.h +/* begin file src/scalar/utf8.h */ +#ifndef SIMDUTF_UTF8_H +#define SIMDUTF_UTF8_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf8 { +// credit: based on code from Google Fuchsia (Apache Licensed) +inline simdutf_warn_unused bool validate(const char *buf, size_t len) noexcept { + const uint8_t *data = reinterpret_cast(buf); + uint64_t pos = 0; + uint32_t code_point = 0; + while (pos < len) { + // check of the next 8 bytes are ascii. 
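The `(v & 0x8080808080808080) == 0` tests used throughout these scalar routines rely on every ASCII byte having bit 7 clear, so the masked value is zero exactly when all eight loaded bytes are ASCII; the validator that follows ORs two such 8-byte loads to cover 16 bytes per check. A minimal sketch (hypothetical helper, not part of simdutf):

#include <cstdint>
#include <cstring>

// True iff the next 8 bytes are all ASCII: each byte contributes only its own
// bit 7 to the masked value, and ASCII bytes (0x00..0x7F) have bit 7 clear.
static bool next_8_are_ascii(const char *p) {
  uint64_t v;
  std::memcpy(&v, p, sizeof v);
  return (v & UINT64_C(0x8080808080808080)) == 0;
}
// e.g. next_8_are_ascii("simdutf!") is true; a chunk containing 0xC3 is not.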
+ uint64_t next_pos = pos + 16; + if (next_pos <= len) { // if it is safe to read 8 more bytes, check that they are ascii + uint64_t v1; + std::memcpy(&v1, data + pos, sizeof(uint64_t)); + uint64_t v2; + std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); + uint64_t v{v1 | v2}; + if ((v & 0x8080808080808080) == 0) { + pos = next_pos; + continue; + } + } + unsigned char byte = data[pos]; + if (byte < 0b10000000) { + pos++; + continue; + } else if ((byte & 0b11100000) == 0b11000000) { + next_pos = pos + 2; + if (next_pos > len) { return false; } + if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; } + // range check + code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); + if ((code_point < 0x80) || (0x7ff < code_point)) { return false; } + } else if ((byte & 0b11110000) == 0b11100000) { + next_pos = pos + 3; + if (next_pos > len) { return false; } + if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; } + if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; } + // range check + code_point = (byte & 0b00001111) << 12 | + (data[pos + 1] & 0b00111111) << 6 | + (data[pos + 2] & 0b00111111); + if ((code_point < 0x800) || (0xffff < code_point) || + (0xd7ff < code_point && code_point < 0xe000)) { + return false; + } + } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000 + next_pos = pos + 4; + if (next_pos > len) { return false; } + if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; } + if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; } + if ((data[pos + 3] & 0b11000000) != 0b10000000) { return false; } + // range check + code_point = + (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | + (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111); + if (code_point <= 0xffff || 0x10ffff < code_point) { return false; } + } else { + // we may have a continuation + return false; + } + pos = next_pos; + } + return true; +} + +inline size_t count_code_points(const char* buf, size_t len) { + const int8_t * p = reinterpret_cast(buf); + size_t counter{0}; + for(size_t i = 0; i < len; i++) { + // -65 is 0b10111111, anything larger in two-complement's should start a new code point. 
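// Worked example of this signed-byte trick (illustrative only): continuation
// bytes 0x80..0xBF read as int8_t are -128..-65, so p[i] > -65 holds exactly
// once per code point, for its lead (or ASCII) byte. For instance
//   count_code_points("\xC3\xA9\xE2\x82\xAC", 5)      // "é€", 5 bytes
// sees the signed bytes -61 -87 -30 -126 -84; only -61 and -30 pass the test,
// so the result is 2, the number of code points.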
+ if(p[i] > -65) { counter++; } + } + return counter; +} + +inline size_t utf16_length_from_utf8(const char* buf, size_t len) { + const int8_t * p = reinterpret_cast(buf); + size_t counter{0}; + for(size_t i = 0; i < len; i++) { + if(p[i] > -65) { counter++; } + if(uint8_t(p[i]) >= 240) { counter++; } + } + return counter; +} + +} // utf8 namespace +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file src/scalar/utf8.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=scalar/utf16.h +/* begin file src/scalar/utf16.h */ +#ifndef SIMDUTF_UTF16_H +#define SIMDUTF_UTF16_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf16 { + +inline simdutf_warn_unused bool validate(const char16_t *buf, size_t len) noexcept { + const uint16_t *data = reinterpret_cast(buf); + uint64_t pos = 0; + while (pos < len) { + uint16_t word = data[pos]; + if((word &0xF800) == 0xD800) { + if(pos + 1 >= len) { return false; } + uint16_t diff = uint16_t(word - 0xD800); + if(diff > 0x3FF) { return false; } + uint16_t next_word = data[pos + 1]; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if(diff2 > 0x3FF) { return false; } + pos += 2; + } else { + pos++; + } + } + return true; +} + + +inline size_t count_code_points(const char16_t* buf, size_t len) { + // We are not BOM aware. + const uint16_t * p = reinterpret_cast(buf); + size_t counter{0}; + for(size_t i = 0; i < len; i++) { + counter += ((p[i] & 0xFC00) != 0xDC00); + } + return counter; +} + +inline size_t utf8_length_from_utf16(const char16_t* buf, size_t len) { + // We are not BOM aware. + const uint16_t * p = reinterpret_cast(buf); + size_t counter{0}; + for(size_t i = 0; i < len; i++) { + /** ASCII **/ + if(p[i] <= 0x7F) { counter++; } + /** two-byte **/ + else if(p[i] <= 0x7FF) { counter += 2; } + /** three-byte **/ + else if((p[i] <= 0xD7FF) || (p[i] >= 0xE000)) { counter += 3; } + /** surrogates -- 4 bytes **/ + else { counter += 2; } + } + return counter; +} + +} // utf16 namespace +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file src/scalar/utf16.h */ +// + + +SIMDUTF_PUSH_DISABLE_WARNINGS +SIMDUTF_DISABLE_UNDESIRED_WARNINGS + + +#if SIMDUTF_IMPLEMENTATION_ARM64 +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=arm64/implementation.cpp +/* begin file src/arm64/implementation.cpp */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/arm64/begin.h +/* begin file src/simdutf/arm64/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "arm64" +// #define SIMDUTF_IMPLEMENTATION arm64 +/* end file src/simdutf/arm64/begin.h */ +namespace simdutf { +namespace arm64 { +namespace { +#ifndef SIMDUTF_ARM64_H +#error "arm64.h must be included" +#endif +using namespace simd; + +simdutf_really_inline bool is_ascii(const simd8x64& input) { + simd8 bits = input.reduce_or(); + return bits.max_val() < 0b10000000u; +} + +simdutf_unused simdutf_really_inline simd8 must_be_continuation(const simd8 prev1, const simd8 prev2, const simd8 prev3) { + simd8 is_second_byte = prev1 >= uint8_t(0b11000000u); + simd8 is_third_byte = prev2 >= uint8_t(0b11100000u); + simd8 is_fourth_byte = prev3 >= uint8_t(0b11110000u); + // Use ^ instead of | for is_*_byte, because ^ is commutative, and the caller is using ^ as well. + // This will work fine because we only have to report errors for cases with 0-1 lead bytes. 
+ // Multiple lead bytes implies 2 overlapping multibyte characters, and if that happens, there is + // guaranteed to be at least *one* lead byte that is part of only 1 other multibyte character. + // The error will be detected there. + return is_second_byte ^ is_third_byte ^ is_fourth_byte; +} + +simdutf_really_inline simd8 must_be_2_3_continuation(const simd8 prev2, const simd8 prev3) { + simd8 is_third_byte = prev2 >= uint8_t(0b11100000u); + simd8 is_fourth_byte = prev3 >= uint8_t(0b11110000u); + return is_third_byte ^ is_fourth_byte; +} +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=arm64/arm_convert_utf16_to_utf8.cpp +/* begin file src/arm64/arm_convert_utf16_to_utf8.cpp */ +/* + The vectorized algorithm works on single SSE register i.e., it + loads eight 16-bit words. + + We consider three cases: + 1. an input register contains no surrogates and each value + is in range 0x0000 .. 0x07ff. + 2. an input register contains no surrogates and values are + is in range 0x0000 .. 0xffff. + 3. an input register contains surrogates --- i.e. codepoints + can have 16 or 32 bits. + + Ad 1. + + When values are less than 0x0800, it means that a 16-bit words + can be converted into: 1) single UTF8 byte (when it's an ASCII + char) or 2) two UTF8 bytes. + + For this case we do only some shuffle to obtain these 2-byte + codes and finally compress the whole SSE register with a single + shuffle. + + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. + + Ad 2. + + When values fit in 16-bit words, but are above 0x07ff, then + a single word may produce one, two or three UTF8 bytes. + + We prepare data for all these three cases in two registers. + The first register contains lower two UTF8 bytes (used in all + cases), while the second one contains just the third byte for + the three-UTF8-bytes case. + + Finally these two registers are interleaved forming eight-element + array of 32-bit values. The array spans two SSE registers. + The bytes from the registers are compressed using two shuffles. + + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. + + + To summarize: + - We need two 256-entry tables that have 8704 bytes in total. +*/ +/* + Returns a pair: the first unprocessed byte from buf and utf8_output + A scalar routing should carry on the conversion of the tail. +*/ +std::pair arm_convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_out) { + uint8_t * utf8_output = reinterpret_cast(utf8_out); + const char16_t* end = buf + len; + + const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800); + const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); + const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080); + + while (buf + 16 <= end) { + uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); + if(vmaxvq_u16(in) <= 0x7F) { // ASCII fast path!!!! + // It is common enough that we have sequences of 16 consecutive ASCII characters. + uint16x8_t nextin = vld1q_u16(reinterpret_cast(buf) + 8); + if(vmaxvq_u16(nextin) > 0x7F) { + // 1. pack the bytes + // obviously suboptimal. + uint8x8_t utf8_packed = vmovn_u16(in); + // 2. store (8 bytes) + vst1_u8(utf8_output, utf8_packed); + // 3. adjust pointers + buf += 8; + utf8_output += 8; + in = nextin; + } else { + // 1. pack the bytes + // obviously suboptimal. 
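How the 256-entry, 17-bytes-per-row tables described above (pack_1_2_utf8_bytes and pack_1_2_3_utf8_bytes) are consumed further down: row[0] is the number of UTF-8 bytes produced for the 8-word block, and row[1..16] is a vqtbl1q_u8 shuffle over the uncompressed register, with 0x80 marking unused slots (an out-of-range index, which the NEON table lookup maps to zero and which lies past row[0] in any case). A scalar rendition of that compression step (hypothetical helper, illustrative only, not part of the sources):

#include <cstdint>

// dst receives row[0] bytes picked out of the 16-byte uncompressed register.
static int scalar_pack(const uint8_t row[17], const uint8_t src[16], uint8_t *dst) {
  for (int i = 0; i < row[0]; i++)
    dst[i] = src[row[1 + i]];   // indices are < 16 for the first row[0] slots
  return row[0];                // caller advances the output pointer by this
}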
+ uint8x16_t utf8_packed = vmovn_high_u16(vmovn_u16(in), nextin); + // 2. store (16 bytes) + vst1q_u8(utf8_output, utf8_packed); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! + } + } + + if (vmaxvq_u16(in) <= 0x7FF) { + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00); + const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f); + + // t0 = [000a|aaaa|bbbb|bb00] + const uint16x8_t t0 = vshlq_n_u16(in, 2); + // t1 = [000a|aaaa|0000|0000] + const uint16x8_t t1 = vandq_u16(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const uint16x8_t t2 = vandq_u16(in, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const uint16x8_t t3 = vorrq_u16(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const uint16x8_t t4 = vorrq_u16(t3, v_c080); + // 2. merge ASCII and 2-byte codewords + const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); + const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f); + const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in, t4)); + // 3. prepare bitmask for 8-bit lookup +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004, + 0x0010, 0x0040, + 0x0002, 0x0008, + 0x0020, 0x0080); +#else + const uint16x8_t mask = { 0x0001, 0x0004, + 0x0010, 0x0040, + 0x0002, 0x0008, + 0x0020, 0x0080 }; +#endif + uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask)); + // 4. pack the bytes + const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; + const uint8x16_t shuffle = vld1q_u8(row + 1); + const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle); + + // 5. store bytes + vst1q_u8(utf8_output, utf8_packed); + + // 6. adjust pointers + buf += 8; + utf8_output += row[0]; + continue; + + } + const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. However, + // it is likely an uncommon occurrence. + if (vmaxvq_u16(surrogates_bytemask) == 0) { + // case: words from register produce either 1, 2 or 3 UTF-8 bytes +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t dup_even = make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); +#else + const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e}; +#endif + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes + + We expand the input word (16-bit) into two words (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. + + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. + + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. + + Finally from these two words we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). 
+ */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ +#define vec(x) vmovq_n_u16(static_cast(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(dup_even))); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const uint16x8_t t1 = vandq_u16(t0, vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const uint16x8_t t2 = vorrq_u16 (t1, vec(0b1000000000000000)); + + // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] + const uint16x8_t s0 = vshrq_n_u16(in, 12); + // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] + const uint16x8_t s1 = vandq_u16(in, vec(0b0000111111000000)); + // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000] + const uint16x8_t s1s = vshlq_n_u16(s1, 2); + // [00bb|bbbb|0000|aaaa] + const uint16x8_t s2 = vorrq_u16(s0, s1s); + // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const uint16x8_t s3 = vorrq_u16(s2, vec(0b1100000011100000)); + const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF); + const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(in, v_07ff); + const uint16x8_t m0 = vbicq_u16(vec(0b0100000000000000), one_or_two_bytes_bytemask); + const uint16x8_t s4 = veorq_u16(s3, m0); +#undef vec + + // 4. expand words 16-bit => 32-bit + const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4)); + const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4)); + + // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle + const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); + const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f); +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t onemask = make_uint16x8_t(0x0001, 0x0004, + 0x0010, 0x0040, + 0x0100, 0x0400, + 0x1000, 0x4000 ); + const uint16x8_t twomask = make_uint16x8_t(0x0002, 0x0008, + 0x0020, 0x0080, + 0x0200, 0x0800, + 0x2000, 0x8000 ); +#else + const uint16x8_t onemask = { 0x0001, 0x0004, + 0x0010, 0x0040, + 0x0100, 0x0400, + 0x1000, 0x4000 }; + const uint16x8_t twomask = { 0x0002, 0x0008, + 0x0020, 0x0080, + 0x0200, 0x0800, + 0x2000, 0x8000 }; +#endif + const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask)); + const uint16_t mask = vaddvq_u16(combined); + // The following fast path may or may not be beneficial. + /*if(mask == 0) { + // We only have three-byte words. Use fast path. 
+ const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0}; + const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle); + const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle); + vst1q_u8(utf8_output, utf8_0); + utf8_output += 12; + vst1q_u8(utf8_output, utf8_1); + utf8_output += 12; + buf += 8; + continue; + }*/ + const uint8_t mask0 = uint8_t(mask); + + const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const uint8x16_t shuffle0 = vld1q_u8(row0 + 1); + const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0); + + const uint8_t mask1 = static_cast(mask >> 8); + const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const uint8x16_t shuffle1 = vld1q_u8(row1 + 1); + const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1); + + vst1q_u8(utf8_output, utf8_0); + utf8_output += row0[0]; + vst1q_u8(utf8_output, utf8_1); + utf8_output += row1[0]; + + buf += 8; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} + for(; k < forward; k++) { + uint16_t word = buf[k]; + if((word & 0xFF80)==0) { + *utf8_output++ = char(word); + } else if((word & 0xF800)==0) { + *utf8_output++ = char((word>>6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if((word &0xF800 ) != 0xD800) { + *utf8_output++ = char((word>>12) | 0b11100000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = buf[k+1]; + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if((diff | diff2) > 0x3FF) { return std::make_pair(nullptr, reinterpret_cast(utf8_output)); } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf8_output++ = char((value>>18) | 0b11110000); + *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((value & 0b111111) | 0b10000000); + } + } + buf += k; + } + } // while + + return std::make_pair(buf, reinterpret_cast(utf8_output)); +} +/* end file src/arm64/arm_convert_utf16_to_utf8.cpp */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=arm64/arm_convert_utf8_to_utf16.cpp +/* begin file src/arm64/arm_convert_utf8_to_utf16.cpp */ +// Convert up to 12 bytes from utf8 to utf16 using a mask indicating the +// end of the code points. Only the least significant 12 bits of the mask +// are accessed. +// It returns how many bytes were consumed (up to 12). +size_t convert_masked_utf8_to_utf16(const char *input, + uint64_t utf8_end_of_code_point_mask, + char16_t *&utf16_output) { + // we use an approach where we try to process up to 12 input bytes. + // Why 12 input bytes and not 16? Because we are concerned with the size of + // the lookup tables. Also 12 is nicely divisible by two and three. + // + uint8x16_t in = vld1q_u8(reinterpret_cast(input)); + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & 0xFFF; + // + // Optimization note: our main path below is load-latency dependent. Thus it is maybe + // beneficial to have fast paths that depend on branch prediction but have less latency. 
+ // This results in more instructions but, potentially, also higher speeds. + // + // We first try a few fast paths. + if((utf8_end_of_code_point_mask & 0xFFFF) == 0xFFFF) { + // We process in chunks of 16 bytes + vst1q_u16(reinterpret_cast(utf16_output), vmovl_u8(vget_low_u8 (in))); + vst1q_u16(reinterpret_cast(utf16_output) + 8, vmovl_high_u8(in)); + utf16_output += 16; // We wrote 16 16-bit characters. + return 16; // We consumed 16 bytes. + } + if((utf8_end_of_code_point_mask & 0xFFFF) == 0xaaaa) { + // We want to take 8 2-byte UTF-8 words and turn them into 8 2-byte UTF-16 words. + // There is probably a more efficient sequence, but the following might do. +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint8x16_t sh = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); +#else + const uint8x16_t sh = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; +#endif + uint8x16_t perm = vqtbl1q_u8(in, sh); + uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x7f))); + uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x1f00))); + uint8x16_t composed = vorrq_u8(ascii, vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(highbyte), 2))); + vst1q_u8(reinterpret_cast(utf16_output), composed); + utf16_output += 8; // We wrote 16 bytes, 8 code points. + return 16; + } + if(input_utf8_end_of_code_point_mask == 0x924) { + // We want to take 4 3-byte UTF-8 words and turn them into 4 2-byte UTF-16 words. + // There is probably a more efficient sequence, but the following might do. +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint8x16_t sh = make_uint8x16_t(2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255); +#else + const uint8x16_t sh = {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255}; +#endif + uint8x16_t perm = vqtbl1q_u8(in, sh); + uint8x16_t ascii = + vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f))); // 7 or 6 bits + uint8x16_t middlebyte = + vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00))); // 5 or 6 bits + uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2)); + uint32x4_t highbyte = + vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x0f0000)))); // 4 bits + uint32x4_t highbyte_shifted = vshrq_n_u32(highbyte, 4); + uint32x4_t composed = + vorrq_u32(vorrq_u32(vreinterpretq_u32_u8(ascii), vreinterpretq_u32_u8(middlebyte_shifted)), highbyte_shifted); + uint16x8_t composed_repacked = vmovn_high_u32(vmovn_u32(composed), composed); + vst1q_u16(reinterpret_cast(utf16_output), composed_repacked); + utf16_output += 4; + return 12; + } + /// We do not have a fast path available, so we fallback. + + const uint8_t idx = + simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0]; + const uint8_t consumed = + simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; + + + if (idx < 64) { + // SIX (6) input code-words + // this is a relatively easy scenario + // we process SIX (6) input code-words. The max length in bytes of six code + // words spanning between 1 and 2 bytes each is 12 bytes. 
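Spelled out, the special mask values tested in the fast paths above (bit i set means an encoded code point ends at input byte i); illustrative note only:

//   0xFFFF = 1111111111111111 -> sixteen 1-byte (ASCII) code points
//   0xaaaa = 1010101010101010 -> eight 2-byte code points, ending at bytes 1,3,...,15
//   0x924  =     100100100100 -> four 3-byte code points, ending at bytes 2,5,8,11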
+ uint8x16_t sh = vld1q_u8(reinterpret_cast(simdutf::tables::utf8_to_utf16::shufutf8[idx])); + uint8x16_t perm = vqtbl1q_u8(in, sh); + uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x7f))); + uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x1f00))); + uint8x16_t composed = vorrq_u8(ascii, vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(highbyte), 2))); + vst1q_u8(reinterpret_cast(utf16_output), composed); + utf16_output += 6; // We wrote 12 bytes, 6 code points. + } else if (idx < 145) { + // FOUR (4) input code-words + uint8x16_t sh = vld1q_u8(reinterpret_cast(simdutf::tables::utf8_to_utf16::shufutf8[idx])); + uint8x16_t perm = vqtbl1q_u8(in, sh); + uint8x16_t ascii = + vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f))); // 7 or 6 bits + uint8x16_t middlebyte = + vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00))); // 5 or 6 bits + uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2)); + uint32x4_t highbyte = + vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x0f0000)))); // 4 bits + uint32x4_t highbyte_shifted = vshrq_n_u32(highbyte, 4); + uint32x4_t composed = + vorrq_u32(vorrq_u32(vreinterpretq_u32_u8(ascii), vreinterpretq_u32_u8(middlebyte_shifted)), highbyte_shifted); + uint16x8_t composed_repacked = vmovn_high_u32(vmovn_u32(composed), composed); + vst1q_u16(reinterpret_cast(utf16_output), composed_repacked); + utf16_output += 4; + } else if (idx < 209) { + // TWO (2) input code-words + uint8x16_t sh = vld1q_u8(reinterpret_cast(simdutf::tables::utf8_to_utf16::shufutf8[idx])); + uint8x16_t perm = vqtbl1q_u8(in, sh); + uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f))); + uint8x16_t middlebyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00))); + uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2)); + uint8x16_t middlehighbyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f0000))); + // correct for spurious high bit + uint8x16_t correct = + vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x400000)))), 1)); + middlehighbyte = veorq_u8(correct, middlehighbyte); + uint8x16_t middlehighbyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlehighbyte), 4)); + uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x07000000))); + uint8x16_t highbyte_shifted =vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(highbyte), 6)); + uint8x16_t composed = + vorrq_u8(vorrq_u8(ascii, middlebyte_shifted), + vorrq_u8(highbyte_shifted, middlehighbyte_shifted)); + uint32x4_t composedminus = + vsubq_u32(vreinterpretq_u32_u8(composed), vmovq_n_u32(0x10000)); + uint32x4_t lowtenbits = + vandq_u32(composedminus, vmovq_n_u32(0x3ff)); + uint32x4_t hightenbits = vshrq_n_u32(composedminus, 10); + uint32x4_t lowtenbitsadd = + vaddq_u32(lowtenbits, vmovq_n_u32(0xDC00)); + uint32x4_t hightenbitsadd = + vaddq_u32(hightenbits, vmovq_n_u32(0xD800)); + uint32x4_t lowtenbitsaddshifted = vshlq_n_u32(lowtenbitsadd, 16); + uint32x4_t surrogates = + vorrq_u32(hightenbitsadd, lowtenbitsaddshifted); + uint32_t basic_buffer[4]; + vst1q_u32(basic_buffer, vreinterpretq_u32_u8(composed)); + uint32_t surrogate_buffer[4]; + vst1q_u32(surrogate_buffer, surrogates); + for (size_t i = 0; i < 3; i++) { + if (basic_buffer[i] < 65536) { + utf16_output[0] = uint16_t(basic_buffer[i]); + utf16_output++; + } else { + utf16_output[0] = 
uint16_t(surrogate_buffer[i] & 0xFFFF); + utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16); + utf16_output += 2; + } + } + } else { + // here we know that there is an error but we do not handle errors + } + return consumed; +} +/* end file src/arm64/arm_convert_utf8_to_utf16.cpp */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=arm64/arm_validate_utf16le.cpp +/* begin file src/arm64/arm_validate_utf16le.cpp */ + +const char16_t* arm_validate_utf16le(const char16_t* input, size_t size) { + const char16_t* end = input + size; + const auto v_d8 = simd8::splat(0xd8); + const auto v_f8 = simd8::splat(0xf8); + const auto v_fc = simd8::splat(0xfc); + const auto v_dc = simd8::splat(0xdc); + while (input + 16 < end) { + // 0. Load data: since the validation takes into account only higher + // byte of each word, we compress the two vectors into one which + // consists only the higher bytes. + const auto in0 = simd16(input); + const auto in1 = simd16(input + simd16::SIZE / sizeof(char16_t)); + const auto t0 = in0.shr<8>(); + const auto t1 = in1.shr<8>(); + const simd8 in = simd16::pack(t0, t1); + // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy). + const auto surrogates_wordmask = ((in & v_f8) == v_d8); + if(surrogates_wordmask.none()) { + input += 16; + } else { + const auto vH = simd8((in & v_fc) == v_dc); + const auto vL = simd8(surrogates_wordmask).bit_andnot(vH); + // We are going to need these later: + const uint8_t low_vh = vH.first(); + const uint8_t high_vl = vL.last(); + // We shift vH down, possibly killing low_vh + const auto sh = simd8({1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0xFF}); + const auto vHshifteddown = vH.apply_lookup_16_to(sh); + const auto match = vHshifteddown == vL; + // We need to handle the fact that high_vl is unmatched. + // We could use this... + // const uint8x16_t allbutlast = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xFF}; + // match = vorrq_u8(match, allbutlast); + // but sh will do: + const auto fmatch = simd8(simd8(match) | sh); + // We deliberately take these two lines out of the following branchy code + // so that they are always s + if (fmatch.all() && low_vh == 0) { + input += (high_vl == 0) ? 16 : 15; + } else { + return nullptr; + } + } + } + return input; +} +/* end file src/arm64/arm_validate_utf16le.cpp */ + +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/buf_block_reader.h +/* begin file src/generic/buf_block_reader.h */ +namespace simdutf { +namespace arm64 { +namespace { + +// Walks through a buffer in block-sized increments, loading the last part with spaces +template +struct buf_block_reader { +public: + simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len); + simdutf_really_inline size_t block_index(); + simdutf_really_inline bool has_full_block() const; + simdutf_really_inline const uint8_t *full_block() const; + /** + * Get the last block, padded with spaces. + * + * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this + * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there + * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding. + * + * @return the number of effective characters in the last block. 
+ */ + simdutf_really_inline size_t get_remainder(uint8_t *dst) const; + simdutf_really_inline void advance(); +private: + const uint8_t *buf; + const size_t len; + const size_t lenminusstep; + size_t idx; +}; + +// Routines to print masks and text for debugging bitmask operations +simdutf_unused static char * format_input_text_64(const uint8_t *text) { + static char *buf = reinterpret_cast(malloc(sizeof(simd8x64) + 1)); + for (size_t i=0; i); i++) { + buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + +// Routines to print masks and text for debugging bitmask operations +simdutf_unused static char * format_input_text(const simd8x64& in) { + static char *buf = reinterpret_cast(malloc(sizeof(simd8x64) + 1)); + in.store(reinterpret_cast(buf)); + for (size_t i=0; i); i++) { + if (buf[i] < ' ') { buf[i] = '_'; } + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + +simdutf_unused static char * format_mask(uint64_t mask) { + static char *buf = reinterpret_cast(malloc(64 + 1)); + for (size_t i=0; i<64; i++) { + buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' '; + } + buf[64] = '\0'; + return buf; +} + +template +simdutf_really_inline buf_block_reader::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {} + +template +simdutf_really_inline size_t buf_block_reader::block_index() { return idx; } + +template +simdutf_really_inline bool buf_block_reader::has_full_block() const { + return idx < lenminusstep; +} + +template +simdutf_really_inline const uint8_t *buf_block_reader::full_block() const { + return &buf[idx]; +} + +template +simdutf_really_inline size_t buf_block_reader::get_remainder(uint8_t *dst) const { + if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers + std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once. 
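// Illustrative note (assumed values, not from the sources): with STEP_SIZE = 64
// and len = 70 there is one full block (lenminusstep = 6, so has_full_block()
// holds only at idx = 0); get_remainder() is then called at idx = 64, the
// memcpy below copies the 6 trailing bytes over the 0x20 padding written by the
// memset above, and the function returns 6.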
+ std::memcpy(dst, buf + idx, len - idx); + return len - idx; +} + +template +simdutf_really_inline void buf_block_reader::advance() { + idx += STEP_SIZE; +} + +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/buf_block_reader.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_lookup4_algorithm.h +/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +namespace simdutf { +namespace arm64 { +namespace { +namespace utf8_validation { + +using namespace simd; + + simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { +// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) +// Bit 1 = Too Long (ASCII followed by continuation) +// Bit 2 = Overlong 3-byte +// Bit 4 = Surrogate +// Bit 5 = Overlong 2-byte +// Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1<<6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 + ); + constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
+ const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, + CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000 + ); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT + ); + return (byte_1_high & byte_1_low & byte_2_high); + } + simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, + const simd8 prev_input, const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; + } + + // + // Return nonzero if there are incomplete multibyte characters at the end of the block: + // e.g. if there is a 4-byte character, but it's 3 bytes from the end. + // + simdutf_really_inline simd8 is_incomplete(const simd8 input) { + // If the previous input's last 3 bytes match this, they're too short (they ended at EOF): + // ... 1111____ 111_____ 11______ + static const uint8_t max_array[32] = { + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1 + }; + const simd8 max_value(&max_array[sizeof(max_array)-sizeof(simd8)]); + return input.gt_bits(max_value); + } + + struct utf8_checker { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + // The last input we received + simd8 prev_input_block; + // Whether the last input we received was incomplete (used for ASCII fast path) + simd8 prev_incomplete; + + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes + // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + // The only problem that can happen at EOF is that a multibyte character is too short + // or a byte value too large in the last bytes: check_special_cases only checks for bytes + // too large in the first of two bytes. 
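// Illustrative example (not from upstream): a block ending in ... 0xE2 0x82,
// i.e. a truncated 3-byte sequence, trips is_incomplete() above, because the
// second-to-last lane's limit is 0b11100000 - 1 = 0xDF and 0xE2 exceeds it;
// check_eof() below (and the ASCII fast path in check_next_input) then
// surfaces the pending error via prev_incomplete.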
+ simdutf_really_inline void check_eof() { + // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't + // possibly finish them. + this->error |= this->prev_incomplete; + } + + simdutf_really_inline void check_next_input(const simd8x64& input) { + if(simdutf_likely(is_ascii(input))) { + this->error |= this->prev_incomplete; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + this->prev_incomplete = is_incomplete(input.chunks[simd8x64::NUM_CHUNKS-1]); + this->prev_input_block = input.chunks[simd8x64::NUM_CHUNKS-1]; + + } + } + // do not forget to call check_eof! + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } + + }; // struct utf8_checker +} // namespace utf8_validation + +using utf8_validation::utf8_checker; + +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_validator.h +/* begin file src/generic/utf8_validation/utf8_validator.h */ +namespace simdutf { +namespace arm64 { +namespace { +namespace utf8_validation { + +/** + * Validates that the string is actual UTF-8. + */ +template +bool generic_validate_utf8(const uint8_t * input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + return !c.errors(); +} + +bool generic_validate_utf8(const char * input, size_t length) { + return generic_validate_utf8(reinterpret_cast(input),length); +} + +} // namespace utf8_validation +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/utf8_validation/utf8_validator.h */ +// transcoding from UTF-8 to UTF-16 +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/valid_utf8_to_utf16.h +/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ + + +namespace simdutf { +namespace arm64 { +namespace { +namespace utf8_to_utf16 { + +using namespace simd; + + +simdutf_warn_unused size_t convert_valid(const char* input, size_t size, + char16_t* utf16_output) noexcept { + // The implementation is not specific to haswell and should be moved to the generic directory. + size_t pos = 0; + char16_t* start{utf16_output}; + const size_t safety_margin = 16; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + // this loop could be unrolled further. For example, we could process the mask + // far more than 64 bytes. 
+ //
+ // For pure ASCII inputs, this function is not optimally fast because there are
+ // faster ways to just check for ASCII than to compute the continuation mask.
+ // However, the continuation mask is more informative. There might be a trade-off
+ // involved.
+ //
+ simd8x64 in(reinterpret_cast(input + pos));
+ uint64_t utf8_continuation_mask = in.lt(-65 + 1);
+ // -65 is 0b10111111 in two's complement, so it is the largest possible continuation byte
+ if(utf8_continuation_mask != 0) {
+ // Slow path. We hope that the compiler will recognize that this is a slow path.
+ // Anything that is not a continuation byte is a 'leading byte', that is, the
+ // start of a new code point.
+ uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+ // The *start* of code points is not so useful, rather, we want the *end* of code points.
+ uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
+ // We process in blocks of up to 12 bytes except possibly
+ // for fast paths which may process up to 16 bytes. For the
+ // slow path to work, we should have at least 12 input bytes left.
+ size_t max_starting_point = (pos + 64) - 12;
+ // The next loop is going to run at least five times when using solely
+ // the slow/regular path, and at least four times if there are fast paths.
+ while(pos < max_starting_point) {
+ // Performance note: our ability to compute 'consumed' and
+ // then shift and recompute is critical. If there is a
+ // latency of, say, 4 cycles on getting 'consumed', then
+ // the inner loop might have a total latency of about 6 cycles.
+ // Yet we process between 6 and 12 input bytes, thus we get
+ // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+ // for this section of the code. Hence, there is a limit
+ // to how much we can further increase this latency before
+ // it seriously harms performance.
+ //
+ // Thus we may allow convert_masked_utf8_to_utf16 to process
+ // more bytes at a time under a fast-path mode where 16 bytes
+ // are consumed at once (e.g., when encountering ASCII).
+ size_t consumed = convert_masked_utf8_to_utf16(input + pos,
+ utf8_end_of_code_point_mask, utf16_output);
+ pos += consumed;
+ utf8_end_of_code_point_mask >>= consumed;
+ }
+ // At this point there may remain between 0 and 12 bytes in the
+ // 64-byte block. These bytes will be processed again. So we have an
+ // 80% efficiency (in the worst case). In practice we expect an
+ // 85% to 90% efficiency.
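+ // For illustration: in the byte sequence 0x61 0xC3 0xA9 0xE2 0x82 0xAC
+ // ("a", U+00E9, U+20AC) the continuation bytes sit at positions 2, 4 and 5,
+ // so shifting the complementary leading-byte mask right by one sets a bit at
+ // the final byte of each code point, exactly the mask that
+ // convert_masked_utf8_to_utf16 consumes above.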
+ } else { + in.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } + } + utf16_output += scalar::utf8_to_utf16::convert_valid(input + pos, size - pos, utf16_output); + return utf16_output - start; +} + + +} // namespace utf8_to_utf16 +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/utf8_to_utf16.h +/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */ + + +namespace simdutf { +namespace arm64 { +namespace { +namespace utf8_to_utf16 { +using namespace simd; + + + simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { +// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) +// Bit 1 = Too Long (ASCII followed by continuation) +// Bit 2 = Overlong 3-byte +// Bit 4 = Surrogate +// Bit 5 = Overlong 2-byte +// Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1<<6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 + ); + constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
+ const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, + CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000 + ); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT + ); + return (byte_1_high & byte_1_low & byte_2_high); + } + simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, + const simd8 prev_input, const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; + } + + + struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes + // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + + + simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output) { + size_t pos = 0; + char16_t* start{utf16_output}; + const size_t safety_margin = 16; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if(input.is_ascii()) { + input.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. 
+ static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while(pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf16(in + pos, + utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block.These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if(errors()) { return 0; } + if(pos < size) { + size_t howmany = scalar::utf8_to_utf16::convert(in + pos, size - pos, utf16_output); + if(howmany == 0) { return 0; } + utf16_output += howmany; + } + return utf16_output - start; + } + + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } + + }; // struct utf8_checker +} // utf8_to_utf16 namespace +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */ +// other functions +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8.h +/* begin file src/generic/utf8.h */ + +namespace simdutf { +namespace arm64 { +namespace { +namespace utf8 { + +using namespace simd; + +simdutf_really_inline size_t count_code_points(const char* in, size_t size) { + size_t pos = 0; + size_t count = 0; + for(;pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + count += 64 - count_ones(utf8_continuation_mask); + } + return count + scalar::utf8::count_code_points(in + pos, size - pos); +} + + +simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! 
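+ // Each leading (non-continuation) byte yields at least one UTF-16 code unit,
+ // and each 4-byte lead (byte value >= 240, i.e. 0xF0..0xF4 on valid input)
+ // yields a surrogate pair, hence one extra unit; the loop below counts exactly that.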
+ for(;pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + // We count one word for anything that is not a continuation (so + // leading bytes). + count += 64 - count_ones(utf8_continuation_mask); + int64_t utf8_4byte = input.gteq_unsigned(240); + count += count_ones(utf8_4byte); + } + return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos); +} +} // utf8 namespace +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/utf8.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf16.h +/* begin file src/generic/utf16.h */ +#include +namespace simdutf { +namespace arm64 { +namespace { +namespace utf16 { + +simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) { + size_t pos = 0; + size_t count = 0; + for(;pos + 32 <= size; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF); + count += count_ones(not_pair) / 2; + } + return count + scalar::utf16::count_code_points(in + pos, size - pos); +} +simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! + for(;pos + 32 <= size; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + uint64_t ascii_mask = input.lteq(0x7F); + uint64_t twobyte_mask = input.lteq(0x7FF); + uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF); + + size_t ascii_count = count_ones(ascii_mask) / 2; + size_t twobyte_count = count_ones(twobyte_mask & ~ ascii_mask) / 2; + size_t threebyte_count = count_ones(not_pair_mask & ~ twobyte_mask) / 2; + size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2; + count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count; + } + return count + scalar::utf16::utf8_length_from_utf16(in + pos, size - pos); +} +} // utf16 +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/utf16.h */ +// +// Implementation-specific overrides +// +namespace simdutf { +namespace arm64 { + +simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { + return arm64::utf8_validation::generic_validate_utf8(buf,len); +} + +simdutf_warn_unused bool implementation::validate_utf16(const char16_t *buf, size_t len) const noexcept { + const char16_t* tail = arm_validate_utf16le(buf, len); + if (tail) { + return scalar::utf16::validate(tail, len - (tail - buf)); + } else { + return false; + } +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16(const char* input, size_t size, + char16_t* utf16_output) const noexcept { + return utf8_to_utf16::convert_valid(input, size, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + std::pair ret = arm_convert_utf16_to_utf8(buf, len, utf8_output); + if (ret.first == nullptr) { return 0; } + size_t saved_bytes = ret.second - utf8_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert( + ret.first, len - (ret.first - 
buf), ret.second); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + return convert_utf16_to_utf8(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::count_utf16(const char16_t * input, size_t length) const noexcept { + return utf16::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept { + return utf8::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf16(const char16_t * input, size_t length) const noexcept { + return utf16::utf8_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept { + return utf8::utf16_length_from_utf8(input, length); +} + +} // namespace arm64 +} // namespace simdutf + +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/arm64/end.h +/* begin file src/simdutf/arm64/end.h */ +/* end file src/simdutf/arm64/end.h */ +/* end file src/arm64/implementation.cpp */ +#endif +#if SIMDUTF_IMPLEMENTATION_FALLBACK +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=fallback/implementation.cpp +/* begin file src/fallback/implementation.cpp */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/fallback/begin.h +/* begin file src/simdutf/fallback/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "fallback" +// #define SIMDUTF_IMPLEMENTATION fallback +/* end file src/simdutf/fallback/begin.h */ + + +namespace simdutf { +namespace fallback { + +simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { + return scalar::utf8::validate(buf, len); +} + +simdutf_warn_unused bool implementation::validate_utf16(const char16_t *buf, size_t len) const noexcept { + return scalar::utf16::validate(buf, len); +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + return scalar::utf8_to_utf16::convert(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + return scalar::utf8_to_utf16::convert_valid(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + return scalar::utf16_to_utf8::convert(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + return scalar::utf16_to_utf8::convert_valid(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::count_utf16(const char16_t * input, size_t length) const noexcept { + return scalar::utf16::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept { + return scalar::utf8::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf16(const char16_t * input, size_t length) const noexcept { + return scalar::utf16::utf8_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t 
implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept { + return scalar::utf8::utf16_length_from_utf8(input, length); +} + +} // namespace fallback +} // namespace simdutf + +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/fallback/end.h +/* begin file src/simdutf/fallback/end.h */ +/* end file src/simdutf/fallback/end.h */ +/* end file src/fallback/implementation.cpp */ +#endif +#if SIMDUTF_IMPLEMENTATION_HASWELL +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=haswell/implementation.cpp +/* begin file src/haswell/implementation.cpp */ + +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/haswell/begin.h +/* begin file src/simdutf/haswell/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "haswell" +// #define SIMDUTF_IMPLEMENTATION haswell +SIMDUTF_TARGET_HASWELL +/* end file src/simdutf/haswell/begin.h */ +namespace simdutf { +namespace haswell { +namespace { +#ifndef SIMDUTF_HASWELL_H +#error "haswell.h must be included" +#endif +using namespace simd; + + +simdutf_really_inline bool is_ascii(const simd8x64& input) { + return input.reduce_or().is_ascii(); +} + +simdutf_unused simdutf_really_inline simd8 must_be_continuation(const simd8 prev1, const simd8 prev2, const simd8 prev3) { + simd8 is_second_byte = prev1.saturating_sub(0b11000000u-1); // Only 11______ will be > 0 + simd8 is_third_byte = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0 + simd8 is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0 + // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine. + return simd8(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0); +} + +simdutf_really_inline simd8 must_be_2_3_continuation(const simd8 prev2, const simd8 prev3) { + simd8 is_third_byte = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0 + simd8 is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0 + // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine. + return simd8(is_third_byte | is_fourth_byte) > int8_t(0); +} + +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=haswell/avx2_convert_utf8_to_utf16.cpp +/* begin file src/haswell/avx2_convert_utf8_to_utf16.cpp */ +// depends on "tables/utf8_to_utf16_tables.h" + + +// Convert up to 12 bytes from utf8 to utf16 using a mask indicating the +// end of the code points. Only the least significant 12 bits of the mask +// are accessed. +// It returns how many bytes were consumed (up to 12). +size_t convert_masked_utf8_to_utf16(const char *input, + uint64_t utf8_end_of_code_point_mask, + char16_t *&utf16_output) { + // we use an approach where we try to process up to 12 input bytes. + // Why 12 input bytes and not 16? Because we are concerned with the size of + // the lookup tables. Also 12 is nicely divisible by two and three. + // + // + // Optimization note: our main path below is load-latency dependent. Thus it is maybe + // beneficial to have fast paths that depend on branch prediction but have less latency. + // This results in more instructions but, potentially, also higher speeds. + // + // We first try a few fast paths. 
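+ // For illustration, the fast paths below key off special mask values: 16 ASCII
+ // bytes give 0xFFFF (every byte ends a code point), eight 2-byte code points
+ // give 0xAAAA, and four 3-byte code points give 0x924 in the low 12 bits.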
+ const __m128i in = _mm_loadu_si128((__m128i *)input); + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & 0xFFF; + if(((utf8_end_of_code_point_mask & 0xFFFF) == 0xFFFF)) { + // We process the data in chunks of 16 bytes. + _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf16_output), _mm256_cvtepu8_epi16(in)); + utf16_output += 16; // We wrote 16 16-bit characters. + return 16; // We consumed 16 bytes. + } + if(((utf8_end_of_code_point_mask & 0xFFFF) == 0xaaaa)) { + // We want to take 8 2-byte UTF-8 words and turn them into 8 2-byte UTF-16 words. + // There is probably a more efficient sequence, but the following might do. + const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); + const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); + _mm_storeu_si128((__m128i *)utf16_output, composed); + utf16_output += 8; // We wrote 16 bytes, 8 code points. + return 16; + } + if(input_utf8_end_of_code_point_mask == 0x924) { + // We want to take 4 3-byte UTF-8 words and turn them into 4 2-byte UTF-16 words. + // There is probably a more efficient sequence, but the following might do. + const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = + _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits + const __m128i middlebyte = + _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits + const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); + const __m128i highbyte = + _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits + const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4); + const __m128i composed = + _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted); + const __m128i composed_repacked = _mm_packus_epi32(composed, composed); + _mm_storeu_si128((__m128i *)utf16_output, composed_repacked); + utf16_output += 4; + return 12; + } + + const uint8_t idx = + simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0]; + const uint8_t consumed = + simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; + if (idx < 64) { + // SIX (6) input code-words + // this is a relatively easy scenario + // we process SIX (6) input code-words. The max length in bytes of six code + // words spanning between 1 and 2 bytes each is 12 bytes. On processors + // where pdep/pext is fast, we might be able to use a small lookup table. + const __m128i sh = + _mm_loadu_si128((const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); + const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); + _mm_storeu_si128((__m128i *)utf16_output, composed); + utf16_output += 6; // We wrote 12 bytes, 6 code points. 
+ } else if (idx < 145) { + // FOUR (4) input code-words + const __m128i sh = + _mm_loadu_si128((const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = + _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits + const __m128i middlebyte = + _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits + const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); + const __m128i highbyte = + _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits + const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4); + const __m128i composed = + _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted); + const __m128i composed_repacked = _mm_packus_epi32(composed, composed); + _mm_storeu_si128((__m128i *)utf16_output, composed_repacked); + utf16_output += 4; + } else if (idx < 209) { + // TWO (2) input code-words + const __m128i sh = + _mm_loadu_si128((const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f)); + const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); + const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); + __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000)); + // correct for spurious high bit + const __m128i correct = + _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1); + middlehighbyte = _mm_xor_si128(correct, middlehighbyte); + const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000)); + const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6); + const __m128i composed = + _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), + _mm_or_si128(highbyte_shifted, middlehighbyte_shifted)); + const __m128i composedminus = + _mm_sub_epi32(composed, _mm_set1_epi32(0x10000)); + const __m128i lowtenbits = + _mm_and_si128(composedminus, _mm_set1_epi32(0x3ff)); + const __m128i hightenbits = _mm_srli_epi32(composedminus, 10); + const __m128i lowtenbitsadd = + _mm_add_epi32(lowtenbits, _mm_set1_epi32(0xDC00)); + const __m128i hightenbitsadd = + _mm_add_epi32(hightenbits, _mm_set1_epi32(0xD800)); + const __m128i lowtenbitsaddshifted = _mm_slli_epi32(lowtenbitsadd, 16); + const __m128i surrogates = + _mm_or_si128(hightenbitsadd, lowtenbitsaddshifted); + uint32_t basic_buffer[4]; + _mm_storeu_si128((__m128i *)basic_buffer, composed); + uint32_t surrogate_buffer[4]; + _mm_storeu_si128((__m128i *)surrogate_buffer, surrogates); + for (size_t i = 0; i < 3; i++) { + if (basic_buffer[i] < 65536) { + utf16_output[0] = uint16_t(basic_buffer[i]); + utf16_output++; + } else { + utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xFFFF); + utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16); + utf16_output += 2; + } + } + } else { + // here we know that there is an error but we do not handle errors + } + return consumed; +} +/* end file src/haswell/avx2_convert_utf8_to_utf16.cpp */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=haswell/avx2_validate_utf16le.cpp +/* begin file src/haswell/avx2_validate_utf16le.cpp */ +/* + In UTF-16 words in range 0xD800 to 0xDFFF have special meaning. + + In a vectorized algorithm we want to examine the most significant + nibble in order to select a fast path. 
If none of highest nibbles + are 0xD (13), than we are sure that UTF-16 chunk in a vector + register is valid. + + Let us analyze what we need to check if the nibble is 0xD. The + value of the preceding nibble determines what we have: + + 0xd000 .. 0xd7ff - a valid word + 0xd800 .. 0xdbff - low surrogate + 0xdc00 .. 0xdfff - high surrogate + + Other constraints we have to consider: + - there must not be two consecutive low surrogates (0xd800 .. 0xdbff) + - there must not be two consecutive high surrogates (0xdc00 .. 0xdfff) + - there must not be sole low surrogate nor high surrogate + + We're going to build three bitmasks based on the 3rd nibble: + - V = valid word, + - L = low surrogate (0xd800 .. 0xdbff) + - H = high surrogate (0xdc00 .. 0xdfff) + + 0 1 2 3 4 5 6 7 <--- word index + [ V | L | H | L | H | V | V | L ] + 1 0 0 0 0 1 1 0 - V = valid masks + 0 1 0 1 0 0 0 1 - L = low surrogate + 0 0 1 0 1 0 0 0 - H high surrogate + + + 1 0 0 0 0 1 1 0 V = valid masks + 0 1 0 1 0 0 0 0 a = L & (H >> 1) + 0 0 1 0 1 0 0 0 b = a << 1 + 1 1 1 1 1 1 1 0 c = V | a | b + ^ + the last bit can be zero, we just consume 7 words + and recheck this word in the next iteration +*/ + +/* Returns: + - pointer to the last unprocessed character (a scalar fallback should check the rest); + - nullptr if an error was detected. +*/ +const char16_t* avx2_validate_utf16le(const char16_t* input, size_t size) { + const char16_t* end = input + size; + + const auto v_d8 = simd8::splat(0xd8); + const auto v_f8 = simd8::splat(0xf8); + const auto v_fc = simd8::splat(0xfc); + const auto v_dc = simd8::splat(0xdc); + + while (input + simd16::ELEMENTS * 2 < end) { + // 0. Load data: since the validation takes into account only higher + // byte of each word, we compress the two vectors into one which + // consists only the higher bytes. + const auto in0 = simd16(input); + const auto in1 = simd16(input + simd16::ELEMENTS); + + const auto t0 = in0.shr<8>(); + const auto t1 = in1.shr<8>(); + + const auto in = simd16::pack(t0, t1); + + // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy). + const auto surrogates_wordmask = (in & v_f8) == v_d8; + const uint32_t surrogates_bitmask = surrogates_wordmask.to_bitmask(); + if (surrogates_bitmask == 0x0) { + input += simd16::ELEMENTS * 2; + } else { + // 2. We have some surrogates that have to be distinguished: + // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF) + // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF) + // + // Fact: high surrogate has 11th bit set (3rd bit in the higher word) + + // V - non-surrogate words + // V = not surrogates_wordmask + const uint32_t V = ~surrogates_bitmask; + + // H - word-mask for high surrogates: the six highest bits are 0b1101'11 + const auto vH = (in & v_fc) == v_dc; + const uint32_t H = vH.to_bitmask(); + + // L - word mask for low surrogates + // L = not H and surrogates_wordmask + const uint32_t L = ~H & surrogates_bitmask; + + const uint32_t a = L & (H >> 1); // A low surrogate must be followed by high one. + // (A low surrogate placed in the 7th register's word + // is an exception we handle.) + const uint32_t b = a << 1; // Just mark that the opposite fact is hold, + // thanks to that we have only two masks for valid case. + const uint32_t c = V | a | b; // Combine all the masks into the final one. + + if (c == 0xffffffff) { + // The whole input register contains valid UTF-16, i.e., + // either single words or proper surrogate pairs. 
+ input += simd16::ELEMENTS * 2;
+ } else if (c == 0x7fffffff) {
+ // The 31 lower words of the input register contain valid UTF-16.
+ // The last word may be either a low or a high surrogate. In the next
+ // iteration we 1) check whether the low surrogate is followed by a high
+ // one, and 2) reject a sole high surrogate.
+ input += simd16::ELEMENTS * 2 - 1;
+ } else {
+ return nullptr;
+ }
+ }
+ }
+
+ return input;
+}
+/* end file src/haswell/avx2_validate_utf16le.cpp */
+// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=haswell/avx2_convert_utf16_to_utf8.cpp
+/* begin file src/haswell/avx2_convert_utf16_to_utf8.cpp */
+/*
+ The vectorized algorithm works on a single SSE register, i.e., it
+ loads eight 16-bit words.
+
+ We consider three cases:
+ 1. an input register contains no surrogates and each value
+ is in range 0x0000 .. 0x07ff.
+ 2. an input register contains no surrogates and each value
+ is in range 0x0000 .. 0xffff.
+ 3. an input register contains surrogates --- i.e. codepoints
+ can have 16 or 32 bits.
+
+ Ad 1.
+
+ When values are less than 0x0800, it means that a 16-bit word
+ can be converted into: 1) a single UTF-8 byte (when it is an ASCII
+ char) or 2) two UTF-8 bytes.
+
+ For this case we only do some shuffling to obtain these 2-byte
+ codes and finally compress the whole SSE register with a single
+ shuffle.
+
+ We need a 256-entry lookup table to get a compression pattern
+ and the number of output bytes in the compressed vector register.
+ Each entry occupies 17 bytes.
+
+ Ad 2.
+
+ When values fit in 16-bit words, but are above 0x07ff, then
+ a single word may produce one, two or three UTF-8 bytes.
+
+ We prepare data for all these three cases in two registers.
+ The first register contains the lower two UTF-8 bytes (used in all
+ cases), while the second one contains just the third byte for
+ the three-UTF-8-bytes case.
+
+ Finally these two registers are interleaved, forming an eight-element
+ array of 32-bit values. The array spans two SSE registers.
+ The bytes from the registers are compressed using two shuffles.
+
+ We need a 256-entry lookup table to get a compression pattern
+ and the number of output bytes in the compressed vector register.
+ Each entry occupies 17 bytes.
+
+
+ To summarize:
+ - We need two 256-entry tables that have 8704 bytes in total.
+*/
+
+
+/*
+ Returns a pair: the first unprocessed word from buf and the current position in utf8_output.
+ A scalar routine should carry on the conversion of the tail.
+*/
+std::pair sse_convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output) {
+ const char16_t* end = buf + len;
+ const __m256i v_0000 = _mm256_setzero_si256();
+ const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
+ const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
+ const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080);
+ const size_t safety_margin = 8; // to avoid overruns
+
+ while (buf + 16 + safety_margin <= end) {
+ __m256i in = _mm256_loadu_si256((__m256i*)buf);
+ // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
+ const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80);
+ if(_mm256_testz_si256(in, v_ff80)) { // ASCII fast path!!!!
+ // 1. pack the bytes
+ const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1));
+ // 2. store (16 bytes)
+ _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+ // 3. adjust pointers
+ buf += 16;
+ utf8_output += 16;
+ continue; // we are done for this round!
+ }
+ // no bits set above 7th bit
+ const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000);
+ const uint32_t one_byte_bitmask = static_cast(_mm256_movemask_epi8(one_byte_bytemask));
+
+ // no bits set above 11th bit
+ const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_0000);
+ const uint32_t one_or_two_bytes_bitmask = static_cast(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
+ if (one_or_two_bytes_bitmask == 0xffffffff) {
+
+ // 1. prepare 2-byte values
+ // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+ // expected output : [110a|aaaa|10bb|bbbb] x 8
+ const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
+ const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
+
+ // t0 = [000a|aaaa|bbbb|bb00]
+ const __m256i t0 = _mm256_slli_epi16(in, 2);
+ // t1 = [000a|aaaa|0000|0000]
+ const __m256i t1 = _mm256_and_si256(t0, v_1f00);
+ // t2 = [0000|0000|00bb|bbbb]
+ const __m256i t2 = _mm256_and_si256(in, v_003f);
+ // t3 = [000a|aaaa|00bb|bbbb]
+ const __m256i t3 = _mm256_or_si256(t1, t2);
+ // t4 = [110a|aaaa|10bb|bbbb]
+ const __m256i t4 = _mm256_or_si256(t3, v_c080);
+
+ // 2. merge ASCII and 2-byte codewords
+ const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in, one_byte_bytemask);
+
+ // 3. prepare bitmask for 8-bit lookup
+ const uint32_t M0 = one_byte_bitmask & 0x55555555;
+ const uint32_t M1 = M0 >> 7;
+ const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
+ // 4. pack the bytes
+
+ const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
+ const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0];
+
+ const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
+ const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
+
+ const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2));
+ // 5. store bytes
+ _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
+ utf8_output += row[0];
+ _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1));
+ utf8_output += row_2[0];
+
+ // 6. adjust pointers
+ buf += 16;
+ continue;
+ }
+ // 1. Check if there are any surrogate words in the input chunk.
+ // We also have to deal with the situation when there is a surrogate word
+ // at the end of a chunk.
+ const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
+
+ // bitmask = 0x00000000 if there are no surrogates
+ // = 0xc0000000 if the last word is a surrogate
+ const uint32_t surrogates_bitmask = static_cast(_mm256_movemask_epi8(surrogates_bytemask));
+ // It might seem like checking for surrogates_bitmask == 0xc0000000 could help. However,
+ // it is likely an uncommon occurrence.
+ if (surrogates_bitmask == 0x00000000) {
+ // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+ const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
+ 0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
+ 0x0000, 0x0202, 0x0404, 0x0606,
+ 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+ /* In this branch we handle three cases:
+ 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UTF-8 byte
+ 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes
+ 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
+
+ We expand the input word (16-bit) into two words (32-bit), thus
+ we have room for four bytes. However, we need five distinct bit
+ layouts. Note that the last byte in cases #2 and #3 is the same.
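+
+ For example (illustration): U+20AC (the euro sign, 0x20AC = [0010|0000|1010|1100])
+ has aaaa = 0010, bbbbbb = 000010, cccccc = 101100 and must become the three
+ bytes [1110|0010] [10|000010] [10|101100], i.e. 0xE2 0x82 0xAC.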
+ + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. + + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. + + Finally from these two words we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). + */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ +#define vec(x) _mm256_set1_epi16(static_cast(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const __m256i t0 = _mm256_shuffle_epi8(in, dup_even); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const __m256i t1 = _mm256_and_si256(t0, vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const __m256i t2 = _mm256_or_si256 (t1, vec(0b1000000000000000)); + + // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] + const __m256i s0 = _mm256_srli_epi16(in, 4); + // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] + const __m256i s1 = _mm256_and_si256(s0, vec(0b0000111111111100)); + // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] + const __m256i s2 = _mm256_maddubs_epi16(s1, vec(0x0140)); + // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const __m256i s3 = _mm256_or_si256(s2, vec(0b1100000011100000)); + const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, vec(0b0100000000000000)); + const __m256i s4 = _mm256_xor_si256(s3, m0); +#undef vec + + // 4. expand words 16-bit => 32-bit + const __m256i out0 = _mm256_unpacklo_epi16(t2, s4); + const __m256i out1 = _mm256_unpackhi_epi16(t2, s4); + + // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle + const uint32_t mask = (one_byte_bitmask & 0x55555555) | + (one_or_two_bytes_bitmask & 0xaaaaaaaa); + // Due to the wider registers, the following path is less likely to be useful. + /*if(mask == 0) { + // We only have three-byte words. Use fast path. 
+ const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); + const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle); + const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle); + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1)); + utf8_output += 12; + buf += 16; + continue; + }*/ + const uint8_t mask0 = uint8_t(mask); + const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1)); + const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0); + + const uint8_t mask1 = static_cast(mask >> 8); + const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1)); + const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1); + + const uint8_t mask2 = static_cast(mask >> 16); + const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0]; + const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1)); + const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2); + + + const uint8_t mask3 = static_cast(mask >> 24); + const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0]; + const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1)); + const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3); + + _mm_storeu_si128((__m128i*)utf8_output, utf8_0); + utf8_output += row0[0]; + _mm_storeu_si128((__m128i*)utf8_output, utf8_1); + utf8_output += row1[0]; + _mm_storeu_si128((__m128i*)utf8_output, utf8_2); + utf8_output += row2[0]; + _mm_storeu_si128((__m128i*)utf8_output, utf8_3); + utf8_output += row3[0]; + buf += 16; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. 
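+ // Reminder (for illustration): a surrogate pair (w1, w2) with w1 in
+ // 0xD800..0xDBFF and w2 in 0xDC00..0xDFFF encodes code point
+ // 0x10000 + ((w1 - 0xD800) << 10) + (w2 - 0xDC00), which always needs four
+ // UTF-8 bytes; the loop below computes exactly this via 'diff' and 'diff2'.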
+ size_t forward = 15; + size_t k = 0; + if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} + for(; k < forward; k++) { + uint16_t word = buf[k]; + if((word & 0xFF80)==0) { + *utf8_output++ = char(word); + } else if((word & 0xF800)==0) { + *utf8_output++ = char((word>>6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if((word &0xF800 ) != 0xD800) { + *utf8_output++ = char((word>>12) | 0b11100000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = buf[k+1]; + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if((diff | diff2) > 0x3FF) { return std::make_pair(nullptr, utf8_output); } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf8_output++ = char((value>>18) | 0b11110000); + *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((value & 0b111111) | 0b10000000); + } + } + buf += k; + } + } // while + return std::make_pair(buf, utf8_output); +} +/* end file src/haswell/avx2_convert_utf16_to_utf8.cpp */ + +} // unnamed namespace +} // namespace haswell +} // namespace simdutf + +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/buf_block_reader.h +/* begin file src/generic/buf_block_reader.h */ +namespace simdutf { +namespace haswell { +namespace { + +// Walks through a buffer in block-sized increments, loading the last part with spaces +template +struct buf_block_reader { +public: + simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len); + simdutf_really_inline size_t block_index(); + simdutf_really_inline bool has_full_block() const; + simdutf_really_inline const uint8_t *full_block() const; + /** + * Get the last block, padded with spaces. + * + * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this + * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there + * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding. + * + * @return the number of effective characters in the last block. + */ + simdutf_really_inline size_t get_remainder(uint8_t *dst) const; + simdutf_really_inline void advance(); +private: + const uint8_t *buf; + const size_t len; + const size_t lenminusstep; + size_t idx; +}; + +// Routines to print masks and text for debugging bitmask operations +simdutf_unused static char * format_input_text_64(const uint8_t *text) { + static char *buf = reinterpret_cast(malloc(sizeof(simd8x64) + 1)); + for (size_t i=0; i); i++) { + buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + +// Routines to print masks and text for debugging bitmask operations +simdutf_unused static char * format_input_text(const simd8x64& in) { + static char *buf = reinterpret_cast(malloc(sizeof(simd8x64) + 1)); + in.store(reinterpret_cast(buf)); + for (size_t i=0; i); i++) { + if (buf[i] < ' ') { buf[i] = '_'; } + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + +simdutf_unused static char * format_mask(uint64_t mask) { + static char *buf = reinterpret_cast(malloc(64 + 1)); + for (size_t i=0; i<64; i++) { + buf[i] = (mask & (size_t(1) << i)) ? 
'X' : ' '; + } + buf[64] = '\0'; + return buf; +} + +template +simdutf_really_inline buf_block_reader::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {} + +template +simdutf_really_inline size_t buf_block_reader::block_index() { return idx; } + +template +simdutf_really_inline bool buf_block_reader::has_full_block() const { + return idx < lenminusstep; +} + +template +simdutf_really_inline const uint8_t *buf_block_reader::full_block() const { + return &buf[idx]; +} + +template +simdutf_really_inline size_t buf_block_reader::get_remainder(uint8_t *dst) const { + if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers + std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once. + std::memcpy(dst, buf + idx, len - idx); + return len - idx; +} + +template +simdutf_really_inline void buf_block_reader::advance() { + idx += STEP_SIZE; +} + +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/buf_block_reader.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_lookup4_algorithm.h +/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +namespace simdutf { +namespace haswell { +namespace { +namespace utf8_validation { + +using namespace simd; + + simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { +// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) +// Bit 1 = Too Long (ASCII followed by continuation) +// Bit 2 = Overlong 3-byte +// Bit 4 = Surrogate +// Bit 5 = Overlong 2-byte +// Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1<<6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 + ); + constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
+ const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, + CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000 + ); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT + ); + return (byte_1_high & byte_1_low & byte_2_high); + } + simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, + const simd8 prev_input, const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; + } + + // + // Return nonzero if there are incomplete multibyte characters at the end of the block: + // e.g. if there is a 4-byte character, but it's 3 bytes from the end. + // + simdutf_really_inline simd8 is_incomplete(const simd8 input) { + // If the previous input's last 3 bytes match this, they're too short (they ended at EOF): + // ... 1111____ 111_____ 11______ + static const uint8_t max_array[32] = { + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1 + }; + const simd8 max_value(&max_array[sizeof(max_array)-sizeof(simd8)]); + return input.gt_bits(max_value); + } + + struct utf8_checker { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + // The last input we received + simd8 prev_input_block; + // Whether the last input we received was incomplete (used for ASCII fast path) + simd8 prev_incomplete; + + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes + // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + // The only problem that can happen at EOF is that a multibyte character is too short + // or a byte value too large in the last bytes: check_special_cases only checks for bytes + // too large in the first of two bytes. 
+ simdutf_really_inline void check_eof() { + // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't + // possibly finish them. + this->error |= this->prev_incomplete; + } + + simdutf_really_inline void check_next_input(const simd8x64& input) { + if(simdutf_likely(is_ascii(input))) { + this->error |= this->prev_incomplete; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + this->prev_incomplete = is_incomplete(input.chunks[simd8x64::NUM_CHUNKS-1]); + this->prev_input_block = input.chunks[simd8x64::NUM_CHUNKS-1]; + + } + } + // do not forget to call check_eof! + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } + + }; // struct utf8_checker +} // namespace utf8_validation + +using utf8_validation::utf8_checker; + +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_validator.h +/* begin file src/generic/utf8_validation/utf8_validator.h */ +namespace simdutf { +namespace haswell { +namespace { +namespace utf8_validation { + +/** + * Validates that the string is actual UTF-8. + */ +template +bool generic_validate_utf8(const uint8_t * input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + return !c.errors(); +} + +bool generic_validate_utf8(const char * input, size_t length) { + return generic_validate_utf8(reinterpret_cast(input),length); +} + +} // namespace utf8_validation +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/utf8_validation/utf8_validator.h */ +// transcoding from UTF-8 to UTF-16 +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/valid_utf8_to_utf16.h +/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ + + +namespace simdutf { +namespace haswell { +namespace { +namespace utf8_to_utf16 { + +using namespace simd; + + +simdutf_warn_unused size_t convert_valid(const char* input, size_t size, + char16_t* utf16_output) noexcept { + // The implementation is not specific to haswell and should be moved to the generic directory. + size_t pos = 0; + char16_t* start{utf16_output}; + const size_t safety_margin = 16; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + // this loop could be unrolled further. For example, we could process the mask + // far more than 64 bytes. 
+ //
+ // For pure ASCII inputs, this function is not optimally fast because there are
+ // faster ways to just check for ASCII than to compute the continuation mask.
+ // However, the continuation mask is more informative. There might be a trade-off
+ // involved.
+ //
+ simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
+ uint64_t utf8_continuation_mask = in.lt(-65 + 1);
+ // -65 is 0b10111111 in two's complement, so largest possible continuation byte
+ if(utf8_continuation_mask != 0) {
+ // Slow path. We hope that the compiler will recognize that this is a slow path.
+ // Anything that is not a continuation byte is a 'leading byte', that is, the
+ // start of a new code point.
+ uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+ // The *start* of code points is not so useful, rather, we want the *end* of code points.
+ uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
+ // We process in blocks of up to 12 bytes except possibly
+ // for fast paths which may process up to 16 bytes. For the
+ // slow path to work, we should have at least 12 input bytes left.
+ size_t max_starting_point = (pos + 64) - 12;
+ // Next loop is going to run at least five times when using solely
+ // the slow/regular path, and at least four times if there are fast paths.
+ while(pos < max_starting_point) {
+ // Performance note: our ability to compute 'consumed' and
+ // then shift and recompute is critical. If there is a
+ // latency of, say, 4 cycles on getting 'consumed', then
+ // the inner loop might have a total latency of about 6 cycles.
+ // Yet we process between 6 and 12 input bytes, thus we get
+ // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+ // for this section of the code. Hence, there is a limit
+ // to how much we can further increase this latency before
+ // it seriously harms performance.
+ //
+ // Thus we may allow convert_masked_utf8_to_utf16 to process
+ // more bytes at a time under a fast-path mode where 16 bytes
+ // are consumed at once (e.g., when encountering ASCII).
+ size_t consumed = convert_masked_utf8_to_utf16(input + pos,
+ utf8_end_of_code_point_mask, utf16_output);
+ pos += consumed;
+ utf8_end_of_code_point_mask >>= consumed;
+ }
+ // At this point there may remain between 0 and 12 bytes in the
+ // 64-byte block. These bytes will be processed again. So we have an
+ // 80% efficiency (in the worst case). In practice we expect an
+ // 85% to 90% efficiency.
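/* begin editorial sketch (not part of simdutf or this patch): the continuation
   mask. The code above relies on two facts: continuation bytes 0x80..0xBF are
   exactly the signed byte values -128..-65, which is what in.lt(-65 + 1) selects,
   and shifting the complemented mask right by one marks the last byte of every
   code point. A scalar restatement with hypothetical names: */
#include <cassert>
#include <cstddef>
#include <cstdint>

int main() {
  // "a" + "é" + "€" = 61 | C3 A9 | E2 82 AC : three code points, six bytes.
  const uint8_t utf8[] = {0x61, 0xC3, 0xA9, 0xE2, 0x82, 0xAC};
  uint64_t continuation_mask = 0;
  for (size_t i = 0; i < sizeof(utf8); i++) {
    if ((int8_t)utf8[i] < -64) {               // same test as in.lt(-65 + 1)
      continuation_mask |= UINT64_C(1) << i;
    }
  }
  // Anything that is not a continuation byte leads a code point, and the byte
  // just before a leading byte ends a code point.
  uint64_t leading_mask = ~continuation_mask;
  uint64_t end_of_code_point_mask = leading_mask >> 1;
  assert((continuation_mask & 0x3F) == 0x34);      // continuations at offsets 2, 4, 5
  assert((end_of_code_point_mask & 0x3F) == 0x25); // code points end at offsets 0, 2, 5
  return 0;
}
/* end editorial sketch */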
+ } else { + in.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } + } + utf16_output += scalar::utf8_to_utf16::convert_valid(input + pos, size - pos, utf16_output); + return utf16_output - start; +} + + +} // namespace utf8_to_utf16 +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/utf8_to_utf16.h +/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */ + + +namespace simdutf { +namespace haswell { +namespace { +namespace utf8_to_utf16 { +using namespace simd; + + + simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { +// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) +// Bit 1 = Too Long (ASCII followed by continuation) +// Bit 2 = Overlong 3-byte +// Bit 4 = Surrogate +// Bit 5 = Overlong 2-byte +// Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1<<6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 + ); + constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
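/* begin editorial sketch (not part of simdutf or this patch): how the three
   lookups combine. byte_1_high above and byte_1_low/byte_2_high just below
   classify every adjacent byte pair (prev1, current) by nibble; AND-ing the three
   results keeps only the error classes that all three nibbles can support. The
   sketch mirrors those tables with plain arrays under the same bit assignments.
   It covers only this 2-byte window; the TWO_CONTS bit raised for the legitimate
   second and third continuation bytes of 3- and 4-byte sequences is cancelled
   separately by check_multibyte_lengths (the must23_80 ^ sc step). */
#include <cassert>
#include <cstdint>

enum : uint8_t {
  TOO_SHORT = 1, TOO_LONG = 2, OVERLONG_3 = 4, TOO_LARGE = 8, SURROGATE = 16,
  OVERLONG_2 = 32, TOO_LARGE_1000 = 64, OVERLONG_4 = 64, TWO_CONTS = 128,
  CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS
};

static const uint8_t byte_1_high[16] = {
  TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
  TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
  TOO_SHORT | OVERLONG_2, TOO_SHORT,
  TOO_SHORT | OVERLONG_3 | SURROGATE,
  TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
};
static const uint8_t byte_1_low[16] = {
  CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, CARRY | OVERLONG_2, CARRY, CARRY,
  CARRY | TOO_LARGE,
  CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
  CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
  CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
  CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
  CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
  CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000
};
static const uint8_t byte_2_high[16] = {
  TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
  TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
  TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
  TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
  TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
  TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
};

// Error bits that all three nibble lookups agree on for the pair (prev1, cur).
static uint8_t special_cases(uint8_t prev1, uint8_t cur) {
  return byte_1_high[prev1 >> 4] & byte_1_low[prev1 & 0x0F] & byte_2_high[cur >> 4];
}

int main() {
  assert(special_cases(0xC3, 0xA9) == 0);         // a well-formed 2-byte sequence
  assert(special_cases(0xED, 0xA0) == SURROGATE); // ED A0 .. encodes U+D800, rejected
  assert(special_cases(0xC0, 0x80) & OVERLONG_2); // overlong encoding of U+0000
  return 0;
}
/* end editorial sketch */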
+ const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, + CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000 + ); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT + ); + return (byte_1_high & byte_1_low & byte_2_high); + } + simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, + const simd8 prev_input, const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; + } + + + struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes + // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + + + simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output) { + size_t pos = 0; + char16_t* start{utf16_output}; + const size_t safety_margin = 16; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if(input.is_ascii()) { + input.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. 
+ static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while(pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf16(in + pos, + utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block.These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if(errors()) { return 0; } + if(pos < size) { + size_t howmany = scalar::utf8_to_utf16::convert(in + pos, size - pos, utf16_output); + if(howmany == 0) { return 0; } + utf16_output += howmany; + } + return utf16_output - start; + } + + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } + + }; // struct utf8_checker +} // utf8_to_utf16 namespace +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */ +// other functions +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8.h +/* begin file src/generic/utf8.h */ + +namespace simdutf { +namespace haswell { +namespace { +namespace utf8 { + +using namespace simd; + +simdutf_really_inline size_t count_code_points(const char* in, size_t size) { + size_t pos = 0; + size_t count = 0; + for(;pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + count += 64 - count_ones(utf8_continuation_mask); + } + return count + scalar::utf8::count_code_points(in + pos, size - pos); +} + + +simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! 
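/* begin editorial sketch (not part of simdutf or this patch): what is being
   counted. count_code_points() above and the utf16_length_from_utf8 loop that
   follows use the same observation: in valid UTF-8 every code point contains
   exactly one non-continuation byte, and only code points above U+FFFF (lead
   byte >= 0xF0) need a second UTF-16 code unit. A scalar restatement with
   hypothetical names: */
#include <cassert>
#include <cstdint>
#include <cstring>

static size_t scalar_code_points(const char *s, size_t n) {
  size_t count = 0;
  for (size_t i = 0; i < n; i++) {
    count += ((int8_t)s[i] > -65);        // not a continuation byte (0x80..0xBF)
  }
  return count;
}

static size_t scalar_utf16_length(const char *s, size_t n) {
  size_t count = 0;
  for (size_t i = 0; i < n; i++) {
    count += ((int8_t)s[i] > -65);        // one UTF-16 code unit per code point...
    count += ((uint8_t)s[i] >= 0xF0);     // ...plus one more for 4-byte sequences
  }
  return count;
}

int main() {
  const char *s = "a\xC3\xA9\xE2\x82\xAC\xF0\x9F\x98\x80"; // "a", "é", "€", U+1F600
  size_t n = strlen(s);
  assert(scalar_code_points(s, n) == 4);
  assert(scalar_utf16_length(s, n) == 5); // the last code point needs a surrogate pair
  return 0;
}
/* end editorial sketch */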
+ for(;pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + // We count one word for anything that is not a continuation (so + // leading bytes). + count += 64 - count_ones(utf8_continuation_mask); + int64_t utf8_4byte = input.gteq_unsigned(240); + count += count_ones(utf8_4byte); + } + return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos); +} +} // utf8 namespace +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/utf8.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf16.h +/* begin file src/generic/utf16.h */ +#include +namespace simdutf { +namespace haswell { +namespace { +namespace utf16 { + +simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) { + size_t pos = 0; + size_t count = 0; + for(;pos + 32 <= size; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF); + count += count_ones(not_pair) / 2; + } + return count + scalar::utf16::count_code_points(in + pos, size - pos); +} +simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! + for(;pos + 32 <= size; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + uint64_t ascii_mask = input.lteq(0x7F); + uint64_t twobyte_mask = input.lteq(0x7FF); + uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF); + + size_t ascii_count = count_ones(ascii_mask) / 2; + size_t twobyte_count = count_ones(twobyte_mask & ~ ascii_mask) / 2; + size_t threebyte_count = count_ones(not_pair_mask & ~ twobyte_mask) / 2; + size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2; + count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count; + } + return count + scalar::utf16::utf8_length_from_utf16(in + pos, size - pos); +} +} // utf16 +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/utf16.h */ + +namespace simdutf { +namespace haswell { + + +simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { + return haswell::utf8_validation::generic_validate_utf8(buf,len); +} + +simdutf_warn_unused bool implementation::validate_utf16(const char16_t *buf, size_t len) const noexcept { + const char16_t* tail = avx2_validate_utf16le(buf, len); + if (tail) { + return scalar::utf16::validate(tail, len - (tail - buf)); + } else { + return false; + } +} + + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16(const char* input, size_t size, + char16_t* utf16_output) const noexcept { + return utf8_to_utf16::convert_valid(input, size, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + std::pair ret = haswell::sse_convert_utf16_to_utf8(buf, len, utf8_output); + if (ret.first == nullptr) { return 0; } + size_t saved_bytes = ret.second - utf8_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + 
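/* begin editorial sketch (not part of simdutf or this patch): UTF-8 length from
   UTF-16. The utf16::utf8_length_from_utf16 helper defined a few lines above
   charges 1 byte for an ASCII word, 2 for a word up to U+07FF, 3 for any other
   non-surrogate word and 2 per surrogate word, so a surrogate pair counts as 4.
   A scalar restatement with hypothetical names: */
#include <cassert>
#include <cstddef>
#include <cstdint>

static size_t scalar_utf8_length_from_utf16(const char16_t *in, size_t n) {
  size_t count = 0;
  for (size_t i = 0; i < n; i++) {
    uint16_t w = (uint16_t)in[i];
    if (w <= 0x7F)                       count += 1;
    else if (w <= 0x7FF)                 count += 2;
    else if (w >= 0xD800 && w <= 0xDFFF) count += 2; // each half of a surrogate pair
    else                                 count += 3;
  }
  return count;
}

int main() {
  // "a", "é", "€", U+1F600: expected UTF-8 sizes 1 + 2 + 3 + 4 = 10.
  const char16_t s[] = {0x0061, 0x00E9, 0x20AC, 0xD83D, 0xDE00};
  assert(scalar_utf8_length_from_utf16(s, 5) == 10);
  return 0;
}
/* end editorial sketch */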
if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + return convert_utf16_to_utf8(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::count_utf16(const char16_t * input, size_t length) const noexcept { + return utf16::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept { + return utf8::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf16(const char16_t * input, size_t length) const noexcept { + return utf16::utf8_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept { + return utf8::utf16_length_from_utf8(input, length); +} + +} // namespace haswell +} // namespace simdutf + +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/haswell/end.h +/* begin file src/simdutf/haswell/end.h */ +SIMDUTF_UNTARGET_REGION +/* end file src/simdutf/haswell/end.h */ +/* end file src/haswell/implementation.cpp */ +#endif +#if SIMDUTF_IMPLEMENTATION_PPC64 +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=ppc64/implementation.cpp +/* begin file src/ppc64/implementation.cpp */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/ppc64/begin.h +/* begin file src/simdutf/ppc64/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "ppc64" +// #define SIMDUTF_IMPLEMENTATION ppc64 +/* end file src/simdutf/ppc64/begin.h */ +namespace simdutf { +namespace ppc64 { +namespace { +#ifndef SIMDUTF_PPC64_H +#error "ppc64.h must be included" +#endif +using namespace simd; + + +simdutf_really_inline bool is_ascii(const simd8x64& input) { + // careful: 0x80 is not ascii. + return input.reduce_or().saturating_sub(0b01111111u).bits_not_set_anywhere(); +} + +simdutf_unused simdutf_really_inline simd8 must_be_continuation(const simd8 prev1, const simd8 prev2, const simd8 prev3) { + simd8 is_second_byte = prev1.saturating_sub(0b11000000u-1); // Only 11______ will be > 0 + simd8 is_third_byte = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0 + simd8 is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0 + // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine. + return simd8(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0); +} + +simdutf_really_inline simd8 must_be_2_3_continuation(const simd8 prev2, const simd8 prev3) { + simd8 is_third_byte = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0 + simd8 is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0 + // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine. 
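/* begin editorial sketch (not part of simdutf or this patch): why the saturating
   subtraction works. saturating_sub(b, 0xDF) is nonzero only for b >= 0xE0 (a 3-
   or 4-byte lead) and saturating_sub(b, 0xEF) only for b >= 0xF0 (a 4-byte lead);
   every nonzero result is at most 0x40, so OR-ing them never reaches the sign bit
   and the signed "> 0" comparison in the return just below is safe. A scalar
   check with hypothetical names: */
#include <cassert>
#include <cstdint>

static uint8_t sat_sub_u8(uint8_t a, uint8_t b) {
  return (uint8_t)(a > b ? a - b : 0);  // unsigned saturating subtraction
}

int main() {
  for (unsigned b = 0; b < 256; b++) {
    uint8_t third  = sat_sub_u8((uint8_t)b, 0xE0 - 1); // nonzero iff b is a 3+ byte lead
    uint8_t fourth = sat_sub_u8((uint8_t)b, 0xF0 - 1); // nonzero iff b is a 4-byte lead
    bool flagged = (int8_t)(third | fourth) > 0;       // the same signed test as above
    assert(flagged == (b >= 0xE0));
    assert((third | fourth) <= 0x40);                  // never wraps into the sign bit
  }
  return 0;
}
/* end editorial sketch */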
+ return simd8(is_third_byte | is_fourth_byte) > int8_t(0); +} + +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf + +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/buf_block_reader.h +/* begin file src/generic/buf_block_reader.h */ +namespace simdutf { +namespace ppc64 { +namespace { + +// Walks through a buffer in block-sized increments, loading the last part with spaces +template +struct buf_block_reader { +public: + simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len); + simdutf_really_inline size_t block_index(); + simdutf_really_inline bool has_full_block() const; + simdutf_really_inline const uint8_t *full_block() const; + /** + * Get the last block, padded with spaces. + * + * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this + * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there + * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding. + * + * @return the number of effective characters in the last block. + */ + simdutf_really_inline size_t get_remainder(uint8_t *dst) const; + simdutf_really_inline void advance(); +private: + const uint8_t *buf; + const size_t len; + const size_t lenminusstep; + size_t idx; +}; + +// Routines to print masks and text for debugging bitmask operations +simdutf_unused static char * format_input_text_64(const uint8_t *text) { + static char *buf = reinterpret_cast(malloc(sizeof(simd8x64) + 1)); + for (size_t i=0; i); i++) { + buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + +// Routines to print masks and text for debugging bitmask operations +simdutf_unused static char * format_input_text(const simd8x64& in) { + static char *buf = reinterpret_cast(malloc(sizeof(simd8x64) + 1)); + in.store(reinterpret_cast(buf)); + for (size_t i=0; i); i++) { + if (buf[i] < ' ') { buf[i] = '_'; } + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + +simdutf_unused static char * format_mask(uint64_t mask) { + static char *buf = reinterpret_cast(malloc(64 + 1)); + for (size_t i=0; i<64; i++) { + buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' '; + } + buf[64] = '\0'; + return buf; +} + +template +simdutf_really_inline buf_block_reader::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {} + +template +simdutf_really_inline size_t buf_block_reader::block_index() { return idx; } + +template +simdutf_really_inline bool buf_block_reader::has_full_block() const { + return idx < lenminusstep; +} + +template +simdutf_really_inline const uint8_t *buf_block_reader::full_block() const { + return &buf[idx]; +} + +template +simdutf_really_inline size_t buf_block_reader::get_remainder(uint8_t *dst) const { + if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers + std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once. 
+ std::memcpy(dst, buf + idx, len - idx); + return len - idx; +} + +template +simdutf_really_inline void buf_block_reader::advance() { + idx += STEP_SIZE; +} + +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf +/* end file src/generic/buf_block_reader.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_lookup4_algorithm.h +/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +namespace simdutf { +namespace ppc64 { +namespace { +namespace utf8_validation { + +using namespace simd; + + simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { +// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) +// Bit 1 = Too Long (ASCII followed by continuation) +// Bit 2 = Overlong 3-byte +// Bit 4 = Surrogate +// Bit 5 = Overlong 2-byte +// Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1<<6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 + ); + constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
+ const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, + CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000 + ); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT + ); + return (byte_1_high & byte_1_low & byte_2_high); + } + simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, + const simd8 prev_input, const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; + } + + // + // Return nonzero if there are incomplete multibyte characters at the end of the block: + // e.g. if there is a 4-byte character, but it's 3 bytes from the end. + // + simdutf_really_inline simd8 is_incomplete(const simd8 input) { + // If the previous input's last 3 bytes match this, they're too short (they ended at EOF): + // ... 1111____ 111_____ 11______ + static const uint8_t max_array[32] = { + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1 + }; + const simd8 max_value(&max_array[sizeof(max_array)-sizeof(simd8)]); + return input.gt_bits(max_value); + } + + struct utf8_checker { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + // The last input we received + simd8 prev_input_block; + // Whether the last input we received was incomplete (used for ASCII fast path) + simd8 prev_incomplete; + + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes + // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + // The only problem that can happen at EOF is that a multibyte character is too short + // or a byte value too large in the last bytes: check_special_cases only checks for bytes + // too large in the first of two bytes. 
+ simdutf_really_inline void check_eof() { + // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't + // possibly finish them. + this->error |= this->prev_incomplete; + } + + simdutf_really_inline void check_next_input(const simd8x64& input) { + if(simdutf_likely(is_ascii(input))) { + this->error |= this->prev_incomplete; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + this->prev_incomplete = is_incomplete(input.chunks[simd8x64::NUM_CHUNKS-1]); + this->prev_input_block = input.chunks[simd8x64::NUM_CHUNKS-1]; + + } + } + // do not forget to call check_eof! + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } + + }; // struct utf8_checker +} // namespace utf8_validation + +using utf8_validation::utf8_checker; + +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf +/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_validator.h +/* begin file src/generic/utf8_validation/utf8_validator.h */ +namespace simdutf { +namespace ppc64 { +namespace { +namespace utf8_validation { + +/** + * Validates that the string is actual UTF-8. + */ +template +bool generic_validate_utf8(const uint8_t * input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + return !c.errors(); +} + +bool generic_validate_utf8(const char * input, size_t length) { + return generic_validate_utf8(reinterpret_cast(input),length); +} + +} // namespace utf8_validation +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf +/* end file src/generic/utf8_validation/utf8_validator.h */ +// transcoding from UTF-8 to UTF-16 +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/valid_utf8_to_utf16.h +/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ + + +namespace simdutf { +namespace ppc64 { +namespace { +namespace utf8_to_utf16 { + +using namespace simd; + + +simdutf_warn_unused size_t convert_valid(const char* input, size_t size, + char16_t* utf16_output) noexcept { + // The implementation is not specific to haswell and should be moved to the generic directory. + size_t pos = 0; + char16_t* start{utf16_output}; + const size_t safety_margin = 16; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + // this loop could be unrolled further. For example, we could process the mask + // far more than 64 bytes. 
+ //
+ // For pure ASCII inputs, this function is not optimally fast because there are
+ // faster ways to just check for ASCII than to compute the continuation mask.
+ // However, the continuation mask is more informative. There might be a trade-off
+ // involved.
+ //
+ simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
+ uint64_t utf8_continuation_mask = in.lt(-65 + 1);
+ // -65 is 0b10111111 in two's complement, so largest possible continuation byte
+ if(utf8_continuation_mask != 0) {
+ // Slow path. We hope that the compiler will recognize that this is a slow path.
+ // Anything that is not a continuation byte is a 'leading byte', that is, the
+ // start of a new code point.
+ uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+ // The *start* of code points is not so useful, rather, we want the *end* of code points.
+ uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
+ // We process in blocks of up to 12 bytes except possibly
+ // for fast paths which may process up to 16 bytes. For the
+ // slow path to work, we should have at least 12 input bytes left.
+ size_t max_starting_point = (pos + 64) - 12;
+ // Next loop is going to run at least five times when using solely
+ // the slow/regular path, and at least four times if there are fast paths.
+ while(pos < max_starting_point) {
+ // Performance note: our ability to compute 'consumed' and
+ // then shift and recompute is critical. If there is a
+ // latency of, say, 4 cycles on getting 'consumed', then
+ // the inner loop might have a total latency of about 6 cycles.
+ // Yet we process between 6 and 12 input bytes, thus we get
+ // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+ // for this section of the code. Hence, there is a limit
+ // to how much we can further increase this latency before
+ // it seriously harms performance.
+ //
+ // Thus we may allow convert_masked_utf8_to_utf16 to process
+ // more bytes at a time under a fast-path mode where 16 bytes
+ // are consumed at once (e.g., when encountering ASCII).
+ size_t consumed = convert_masked_utf8_to_utf16(input + pos,
+ utf8_end_of_code_point_mask, utf16_output);
+ pos += consumed;
+ utf8_end_of_code_point_mask >>= consumed;
+ }
+ // At this point there may remain between 0 and 12 bytes in the
+ // 64-byte block. These bytes will be processed again. So we have an
+ // 80% efficiency (in the worst case). In practice we expect an
+ // 85% to 90% efficiency.
+ } else { + in.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } + } + utf16_output += scalar::utf8_to_utf16::convert_valid(input + pos, size - pos, utf16_output); + return utf16_output - start; +} + + +} // namespace utf8_to_utf16 +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf +/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/utf8_to_utf16.h +/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */ + + +namespace simdutf { +namespace ppc64 { +namespace { +namespace utf8_to_utf16 { +using namespace simd; + + + simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { +// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) +// Bit 1 = Too Long (ASCII followed by continuation) +// Bit 2 = Overlong 3-byte +// Bit 4 = Surrogate +// Bit 5 = Overlong 2-byte +// Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1<<6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 + ); + constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
+ const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, + CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000 + ); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT + ); + return (byte_1_high & byte_1_low & byte_2_high); + } + simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, + const simd8 prev_input, const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; + } + + + struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes + // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + + + simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output) { + size_t pos = 0; + char16_t* start{utf16_output}; + const size_t safety_margin = 16; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if(input.is_ascii()) { + input.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. 
+ static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while(pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf16(in + pos, + utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block.These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if(errors()) { return 0; } + if(pos < size) { + size_t howmany = scalar::utf8_to_utf16::convert(in + pos, size - pos, utf16_output); + if(howmany == 0) { return 0; } + utf16_output += howmany; + } + return utf16_output - start; + } + + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } + + }; // struct utf8_checker +} // utf8_to_utf16 namespace +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf +/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */ +// other functions +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8.h +/* begin file src/generic/utf8.h */ + +namespace simdutf { +namespace ppc64 { +namespace { +namespace utf8 { + +using namespace simd; + +simdutf_really_inline size_t count_code_points(const char* in, size_t size) { + size_t pos = 0; + size_t count = 0; + for(;pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + count += 64 - count_ones(utf8_continuation_mask); + } + return count + scalar::utf8::count_code_points(in + pos, size - pos); +} + + +simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! 
+ for(;pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + // We count one word for anything that is not a continuation (so + // leading bytes). + count += 64 - count_ones(utf8_continuation_mask); + int64_t utf8_4byte = input.gteq_unsigned(240); + count += count_ones(utf8_4byte); + } + return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos); +} +} // utf8 namespace +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf +/* end file src/generic/utf8.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf16.h +/* begin file src/generic/utf16.h */ +#include +namespace simdutf { +namespace ppc64 { +namespace { +namespace utf16 { + +simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) { + size_t pos = 0; + size_t count = 0; + for(;pos + 32 <= size; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF); + count += count_ones(not_pair) / 2; + } + return count + scalar::utf16::count_code_points(in + pos, size - pos); +} +simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! + for(;pos + 32 <= size; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + uint64_t ascii_mask = input.lteq(0x7F); + uint64_t twobyte_mask = input.lteq(0x7FF); + uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF); + + size_t ascii_count = count_ones(ascii_mask) / 2; + size_t twobyte_count = count_ones(twobyte_mask & ~ ascii_mask) / 2; + size_t threebyte_count = count_ones(not_pair_mask & ~ twobyte_mask) / 2; + size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2; + count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count; + } + return count + scalar::utf16::utf8_length_from_utf16(in + pos, size - pos); +} +} // utf16 +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf +/* end file src/generic/utf16.h */ + +// +// Implementation-specific overrides +// +namespace simdutf { +namespace ppc64 { + +simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { + return ppc64::utf8_validation::generic_validate_utf8(buf,len); +} + +simdutf_warn_unused bool implementation::validate_utf16(const char16_t *buf, size_t len) const noexcept { + return scalar::utf16::validate(buf, len); +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept { + return 0; // stub +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept { + return 0; // stub +} + +simdutf_warn_unused size_t implementation::convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + return scalar::utf16_to_utf8::convert(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + return scalar::utf16_to_utf8::convert_valid(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::count_utf16(const char16_t * input, size_t length) const noexcept { + return scalar::utf16::count_code_points(input, length); +} + +simdutf_warn_unused size_t 
implementation::count_utf8(const char * input, size_t length) const noexcept { + return utf8::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf16(const char16_t * input, size_t length) const noexcept { + return scalar::utf16::utf8_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept { + return scalar::utf8::utf16_length_from_utf8(input, length); +} + +} // namespace ppc64 +} // namespace simdutf + +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/ppc64/end.h +/* begin file src/simdutf/ppc64/end.h */ +/* end file src/simdutf/ppc64/end.h */ +/* end file src/ppc64/implementation.cpp */ +#endif +#if SIMDUTF_IMPLEMENTATION_WESTMERE +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=westmere/implementation.cpp +/* begin file src/westmere/implementation.cpp */ +#include + +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/westmere/begin.h +/* begin file src/simdutf/westmere/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "westmere" +// #define SIMDUTF_IMPLEMENTATION westmere +SIMDUTF_TARGET_WESTMERE +/* end file src/simdutf/westmere/begin.h */ +namespace simdutf { +namespace westmere { +namespace { +#ifndef SIMDUTF_WESTMERE_H +#error "westmere.h must be included" +#endif +using namespace simd; + +simdutf_really_inline bool is_ascii(const simd8x64& input) { + return input.reduce_or().is_ascii(); +} + +simdutf_unused simdutf_really_inline simd8 must_be_continuation(const simd8 prev1, const simd8 prev2, const simd8 prev3) { + simd8 is_second_byte = prev1.saturating_sub(0b11000000u-1); // Only 11______ will be > 0 + simd8 is_third_byte = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0 + simd8 is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0 + // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine. + return simd8(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0); +} + +simdutf_really_inline simd8 must_be_2_3_continuation(const simd8 prev2, const simd8 prev3) { + simd8 is_third_byte = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0 + simd8 is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0 + // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine. + return simd8(is_third_byte | is_fourth_byte) > int8_t(0); +} + +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=westmere/sse_convert_utf8_to_utf16.cpp +/* begin file src/westmere/sse_convert_utf8_to_utf16.cpp */ +// depends on "tables/utf8_to_utf16_tables.h" + + +// Convert up to 12 bytes from utf8 to utf16 using a mask indicating the +// end of the code points. Only the least significant 12 bits of the mask +// are accessed. +// It returns how many bytes were consumed (up to 12). +size_t convert_masked_utf8_to_utf16(const char *input, + uint64_t utf8_end_of_code_point_mask, + char16_t *&utf16_output) { + // we use an approach where we try to process up to 12 input bytes. + // Why 12 input bytes and not 16? Because we are concerned with the size of + // the lookup tables. Also 12 is nicely divisible by two and three. + // + // + // Optimization note: our main path below is load-latency dependent. 
Thus it is maybe + // beneficial to have fast paths that depend on branch prediction but have less latency. + // This results in more instructions but, potentially, also higher speeds. + // + // We first try a few fast paths. + const __m128i in = _mm_loadu_si128((__m128i *)input); + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & 0xFFF; + if(((utf8_end_of_code_point_mask & 0xFFFF) == 0xFFFF)) { + // We process the data in chunks of 16 bytes. + _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output), _mm_cvtepu8_epi16(in)); + _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output + 8), _mm_cvtepu8_epi16(_mm_srli_si128(in,8))); + utf16_output += 16; // We wrote 16 16-bit characters. + return 16; // We consumed 16 bytes. + } + if(((utf8_end_of_code_point_mask & 0xFFFF) == 0xaaaa)) { + // We want to take 8 2-byte UTF-8 words and turn them into 8 2-byte UTF-16 words. + // There is probably a more efficient sequence, but the following might do. + const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); + const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); + _mm_storeu_si128((__m128i *)utf16_output, composed); + utf16_output += 8; // We wrote 16 bytes, 8 code points. + return 16; + } + if(input_utf8_end_of_code_point_mask == 0x924) { + // We want to take 4 3-byte UTF-8 words and turn them into 4 2-byte UTF-16 words. + // There is probably a more efficient sequence, but the following might do. + const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = + _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits + const __m128i middlebyte = + _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits + const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); + const __m128i highbyte = + _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits + const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4); + const __m128i composed = + _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted); + const __m128i composed_repacked = _mm_packus_epi32(composed, composed); + _mm_storeu_si128((__m128i *)utf16_output, composed_repacked); + utf16_output += 4; + return 12; + } + /// We do not have a fast path available, so we fallback. + + const uint8_t idx = + tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0]; + const uint8_t consumed = + tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; + if (idx < 64) { + // SIX (6) input code-words + // this is a relatively easy scenario + // we process SIX (6) input code-words. The max length in bytes of six code + // words spanning between 1 and 2 bytes each is 12 bytes. On processors + // where pdep/pext is fast, we might be able to use a small lookup table. 
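/* begin editorial sketch (not part of simdutf or this patch): the 2-byte fast
   path, one lane at a time. Earlier in this function, an end-of-code-point mask
   of 0xaaaa means every odd byte ends a code point, i.e. the 16 loaded bytes are
   eight 2-byte sequences; the shuffle/AND/shift sequence used there computes, per
   16-bit lane, the usual 2-byte decoding. A scalar restatement with hypothetical
   names: */
#include <cassert>
#include <cstdint>

static uint16_t decode_two_byte(uint8_t lead, uint8_t cont) {
  // Same arithmetic as the SSE path: keep 5 payload bits of the lead byte and
  // 6 payload bits of the continuation byte.
  return (uint16_t)(((lead & 0x1F) << 6) | (cont & 0x3F));
}

int main() {
  assert(decode_two_byte(0xC3, 0xA9) == 0x00E9); // U+00E9
  assert(decode_two_byte(0xD0, 0x96) == 0x0416); // U+0416
  return 0;
}
/* end editorial sketch */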
+ const __m128i sh = + _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); + const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); + _mm_storeu_si128((__m128i *)utf16_output, composed); + utf16_output += 6; // We wrote 12 bytes, 6 code points. + } else if (idx < 145) { + // FOUR (4) input code-words + const __m128i sh = + _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = + _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits + const __m128i middlebyte = + _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits + const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); + const __m128i highbyte = + _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits + const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4); + const __m128i composed = + _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted); + const __m128i composed_repacked = _mm_packus_epi32(composed, composed); + _mm_storeu_si128((__m128i *)utf16_output, composed_repacked); + utf16_output += 4; + } else if (idx < 209) { + // TWO (2) input code-words + const __m128i sh = + _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f)); + const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); + const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); + __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000)); + // correct for spurious high bit + const __m128i correct = + _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1); + middlehighbyte = _mm_xor_si128(correct, middlehighbyte); + const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000)); + const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6); + const __m128i composed = + _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), + _mm_or_si128(highbyte_shifted, middlehighbyte_shifted)); + const __m128i composedminus = + _mm_sub_epi32(composed, _mm_set1_epi32(0x10000)); + const __m128i lowtenbits = + _mm_and_si128(composedminus, _mm_set1_epi32(0x3ff)); + const __m128i hightenbits = _mm_srli_epi32(composedminus, 10); + const __m128i lowtenbitsadd = + _mm_add_epi32(lowtenbits, _mm_set1_epi32(0xDC00)); + const __m128i hightenbitsadd = + _mm_add_epi32(hightenbits, _mm_set1_epi32(0xD800)); + const __m128i lowtenbitsaddshifted = _mm_slli_epi32(lowtenbitsadd, 16); + const __m128i surrogates = + _mm_or_si128(hightenbitsadd, lowtenbitsaddshifted); + uint32_t basic_buffer[4]; + _mm_storeu_si128((__m128i *)basic_buffer, composed); + uint32_t surrogate_buffer[4]; + _mm_storeu_si128((__m128i *)surrogate_buffer, surrogates); + for (size_t i = 0; i < 3; i++) { + if (basic_buffer[i] < 65536) { + utf16_output[0] = uint16_t(basic_buffer[i]); + utf16_output++; + } else { + utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xFFFF); + utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16); + utf16_output += 2; + } + } + } else { + // here we know that there is an error but we do not handle errors + } + return consumed; +} +/* end file src/westmere/sse_convert_utf8_to_utf16.cpp */ +// 
dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=westmere/sse_validate_utf16le.cpp +/* begin file src/westmere/sse_validate_utf16le.cpp */ +/* + In UTF-16 words in range 0xD800 to 0xDFFF have special meaning. + + In a vectorized algorithm we want to examine the most significant + nibble in order to select a fast path. If none of highest nibbles + are 0xD (13), than we are sure that UTF-16 chunk in a vector + register is valid. + + Let us analyze what we need to check if the nibble is 0xD. The + value of the preceding nibble determines what we have: + + 0xd000 .. 0xd7ff - a valid word + 0xd800 .. 0xdbff - low surrogate + 0xdc00 .. 0xdfff - high surrogate + + Other constraints we have to consider: + - there must not be two consecutive low surrogates (0xd800 .. 0xdbff) + - there must not be two consecutive high surrogates (0xdc00 .. 0xdfff) + - there must not be sole low surrogate nor high surrogate + + We're going to build three bitmasks based on the 3rd nibble: + - V = valid word, + - L = low surrogate (0xd800 .. 0xdbff) + - H = high surrogate (0xdc00 .. 0xdfff) + + 0 1 2 3 4 5 6 7 <--- word index + [ V | L | H | L | H | V | V | L ] + 1 0 0 0 0 1 1 0 - V = valid masks + 0 1 0 1 0 0 0 1 - L = low surrogate + 0 0 1 0 1 0 0 0 - H high surrogate + + + 1 0 0 0 0 1 1 0 V = valid masks + 0 1 0 1 0 0 0 0 a = L & (H >> 1) + 0 0 1 0 1 0 0 0 b = a << 1 + 1 1 1 1 1 1 1 0 c = V | a | b + ^ + the last bit can be zero, we just consume 7 words + and recheck this word in the next iteration +*/ + +/* Returns: + - pointer to the last unprocessed character (a scalar fallback should check the rest); + - nullptr if an error was detected. +*/ +const char16_t* sse_validate_utf16le(const char16_t* input, size_t size) { + const char16_t* end = input + size; + + const auto v_d8 = simd8::splat(0xd8); + const auto v_f8 = simd8::splat(0xf8); + const auto v_fc = simd8::splat(0xfc); + const auto v_dc = simd8::splat(0xdc); + + while (input + simd16::SIZE * 2 < end) { + // 0. Load data: since the validation takes into account only higher + // byte of each word, we compress the two vectors into one which + // consists only the higher bytes. + const auto in0 = simd16(input); + const auto in1 = simd16(input + simd16::SIZE / sizeof(char16_t)); + + const auto t0 = in0.shr<8>(); + const auto t1 = in1.shr<8>(); + + const auto in = simd16::pack(t0, t1); + + // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy). + const auto surrogates_wordmask = (in & v_f8) == v_d8; + const uint16_t surrogates_bitmask = static_cast(surrogates_wordmask.to_bitmask()); + if (surrogates_bitmask == 0x0000) { + input += 16; + } else { + // 2. We have some surrogates that have to be distinguished: + // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF) + // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF) + // + // Fact: high surrogate has 11th bit set (3rd bit in the higher word) + + // V - non-surrogate words + // V = not surrogates_wordmask + const uint16_t V = static_cast(~surrogates_bitmask); + + // H - word-mask for high surrogates: the six highest bits are 0b1101'11 + const auto vH = (in & v_fc) == v_dc; + const uint16_t H = static_cast(vH.to_bitmask()); + + // L - word mask for low surrogates + // L = not H and surrogates_wordmask + const uint16_t L = static_cast(~H & surrogates_bitmask); + + const uint16_t a = static_cast(L & (H >> 1)); // A low surrogate must be followed by high one. + // (A low surrogate placed in the 7th register's word + // is an exception we handle.) 
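/* begin editorial sketch (not part of simdutf or this patch): the V/L/H mask
   algebra. A scalar cross-check of the combination described in the block comment
   above, using the same naming (L = 0xD800..0xDBFF, H = 0xDC00..0xDFFF, bit i of
   a mask refers to word i); names in the sketch are hypothetical. */
#include <cassert>
#include <cstdint>

// Returns 0xFFFF if all 16 words are valid, 0x7FFF if only the last word needs to
// be re-examined in the next iteration, and anything else on a surrogate error.
static uint16_t combine_masks(const char16_t *w) {
  uint16_t V = 0, L = 0, H = 0;
  for (int i = 0; i < 16; i++) {
    uint16_t u = (uint16_t)w[i];
    if (u < 0xD800 || u > 0xDFFF) V |= (uint16_t)(1u << i);
    else if (u <= 0xDBFF)         L |= (uint16_t)(1u << i);
    else                          H |= (uint16_t)(1u << i);
  }
  uint16_t a = (uint16_t)(L & (H >> 1)); // a low surrogate followed by a high one
  uint16_t b = (uint16_t)(a << 1);       // mark the matching high surrogate too
  return (uint16_t)(V | a | b);
}

int main() {
  char16_t good[16] = {u'a', u'b', 0xD83D, 0xDE00}; // a proper surrogate pair
  for (int i = 4; i < 16; i++) good[i] = u'x';
  assert(combine_masks(good) == 0xFFFF);

  char16_t bad[16];
  for (int i = 0; i < 16; i++) bad[i] = u'x';
  bad[5] = 0xDC00;                                  // a lone high surrogate (in this naming)
  assert(combine_masks(bad) != 0xFFFF && combine_masks(bad) != 0x7FFF);
  return 0;
}
/* end editorial sketch */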
+ const uint16_t b = static_cast(a << 1); // Just mark that the opposite fact is hold, + // thanks to that we have only two masks for valid case. + const uint16_t c = static_cast(V | a | b); // Combine all the masks into the final one. + + if (c == 0xffff) { + // The whole input register contains valid UTF-16, i.e., + // either single words or proper surrogate pairs. + input += 16; + } else if (c == 0x7fff) { + // The 15 lower words of the input register contains valid UTF-16. + // The 15th word may be either a low or high surrogate. It the next + // iteration we 1) check if the low surrogate is followed by a high + // one, 2) reject sole high surrogate. + input += 15; + } else { + return nullptr; + } + } + } + + return input; +} +/* end file src/westmere/sse_validate_utf16le.cpp */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=westmere/sse_convert_utf16_to_utf8.cpp +/* begin file src/westmere/sse_convert_utf16_to_utf8.cpp */ +/* + The vectorized algorithm works on single SSE register i.e., it + loads eight 16-bit words. + + We consider three cases: + 1. an input register contains no surrogates and each value + is in range 0x0000 .. 0x07ff. + 2. an input register contains no surrogates and values are + is in range 0x0000 .. 0xffff. + 3. an input register contains surrogates --- i.e. codepoints + can have 16 or 32 bits. + + Ad 1. + + When values are less than 0x0800, it means that a 16-bit words + can be converted into: 1) single UTF8 byte (when it's an ASCII + char) or 2) two UTF8 bytes. + + For this case we do only some shuffle to obtain these 2-byte + codes and finally compress the whole SSE register with a single + shuffle. + + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. + + Ad 2. + + When values fit in 16-bit words, but are above 0x07ff, then + a single word may produce one, two or three UTF8 bytes. + + We prepare data for all these three cases in two registers. + The first register contains lower two UTF8 bytes (used in all + cases), while the second one contains just the third byte for + the three-UTF8-bytes case. + + Finally these two registers are interleaved forming eight-element + array of 32-bit values. The array spans two SSE registers. + The bytes from the registers are compressed using two shuffles. + + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. + + + To summarize: + - We need two 256-entry tables that have 8704 bytes in total. +*/ + +/* + Returns a pair: the first unprocessed byte from buf and utf8_output + A scalar routing should carry on the conversion of the tail. +*/ +std::pair sse_convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output) { + + const char16_t* end = buf + len; + + const __m128i v_0000 = _mm_setzero_si128(); + const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800); + const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800); + const __m128i v_c080 = _mm_set1_epi16((int16_t)0xc080); + while (buf + 16 <= end) { + __m128i in = _mm_loadu_si128((__m128i*)buf); + // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes + const __m128i v_ff80 = _mm_set1_epi16((int16_t)0xff80); + if(_mm_testz_si128(in, v_ff80)) { // ASCII fast path!!!! + __m128i nextin = _mm_loadu_si128((__m128i*)buf+1); + if(!_mm_testz_si128(nextin, v_ff80)) { + // 1. pack the bytes + // obviously suboptimal. 
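+ // Note: _mm_packus_epi16(in, in) narrows the eight 16-bit words of `in`
+ // to bytes (unsigned saturation, a no-op here because every word is
+ // <= 0x7F) and duplicates them into both halves of the register, so only
+ // the low 8 bytes matter. The 16-byte store below writes 8 useful bytes
+ // plus 8 duplicates that later output overwrites; the pointers advance
+ // by 8 words / 8 bytes only.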
+ const __m128i utf8_packed = _mm_packus_epi16(in,in); + // 2. store (16 bytes) + _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); + // 3. adjust pointers + buf += 8; + utf8_output += 8; + in = nextin; + } else { + // 1. pack the bytes + // obviously suboptimal. + const __m128i utf8_packed = _mm_packus_epi16(in,nextin); + // 2. store (16 bytes) + _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! + } + } + + // no bits set above 7th bit + const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_ff80), v_0000); + const uint16_t one_byte_bitmask = static_cast(_mm_movemask_epi8(one_byte_bytemask)); + + // no bits set above 11th bit + const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_0000); + const uint16_t one_or_two_bytes_bitmask = static_cast(_mm_movemask_epi8(one_or_two_bytes_bytemask)); + + if (one_or_two_bytes_bitmask == 0xffff) { + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00); + const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f); + + // t0 = [000a|aaaa|bbbb|bb00] + const __m128i t0 = _mm_slli_epi16(in, 2); + // t1 = [000a|aaaa|0000|0000] + const __m128i t1 = _mm_and_si128(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const __m128i t2 = _mm_and_si128(in, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const __m128i t3 = _mm_or_si128(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const __m128i t4 = _mm_or_si128(t3, v_c080); + + // 2. merge ASCII and 2-byte codewords + const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in, one_byte_bytemask); + + // 3. prepare bitmask for 8-bit lookup + // one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB) + const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a + const uint16_t m1 = static_cast(m0 >> 7); // m1 = 00000000h0g0f0e0 + const uint8_t m2 = static_cast((m0 | m1) & 0xff); // m2 = hdgcfbea + // 4. pack the bytes + const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; + const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1)); + const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle); + + // 5. store bytes + _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); + + // 6. adjust pointers + buf += 8; + utf8_output += row[0]; + continue; + + } + + // 1. Check if there are any surrogate word in the input chunk. + // We have also deal with situation when there is a suggogate word + // at the end of a chunk. + const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800); + + // bitmask = 0x0000 if there are no surrogates + // = 0xc000 if the last word is a surrogate + const uint16_t surrogates_bitmask = static_cast(_mm_movemask_epi8(surrogates_bytemask)); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. However, + // it is likely an uncommon occurrence. + if (surrogates_bitmask == 0x0000) { + // case: words from register produce either 1, 2 or 3 UTF-8 bytes + const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); + + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes + 3. 
[aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes + + We expand the input word (16-bit) into two words (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. + + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. + + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. + + Finally from these two words we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). + */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ +#define vec(x) _mm_set1_epi16(static_cast(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const __m128i t0 = _mm_shuffle_epi8(in, dup_even); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const __m128i t1 = _mm_and_si128(t0, vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const __m128i t2 = _mm_or_si128 (t1, vec(0b1000000000000000)); + + // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] + const __m128i s0 = _mm_srli_epi16(in, 4); + // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] + const __m128i s1 = _mm_and_si128(s0, vec(0b0000111111111100)); + // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] + const __m128i s2 = _mm_maddubs_epi16(s1, vec(0x0140)); + // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const __m128i s3 = _mm_or_si128(s2, vec(0b1100000011100000)); + const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, vec(0b0100000000000000)); + const __m128i s4 = _mm_xor_si128(s3, m0); +#undef vec + + // 4. expand words 16-bit => 32-bit + const __m128i out0 = _mm_unpacklo_epi16(t2, s4); + const __m128i out1 = _mm_unpackhi_epi16(t2, s4); + + // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle + const uint16_t mask = (one_byte_bitmask & 0x5555) | + (one_or_two_bytes_bitmask & 0xaaaa); + if(mask == 0) { + // We only have three-byte words. Use fast path. + const __m128i shuffle = _mm_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); + const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle); + const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle); + _mm_storeu_si128((__m128i*)utf8_output, utf8_0); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, utf8_1); + utf8_output += 12; + buf += 8; + continue; + } + const uint8_t mask0 = uint8_t(mask); + + const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1)); + const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0); + + const uint8_t mask1 = static_cast(mask >> 8); + + const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1)); + const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1); + + _mm_storeu_si128((__m128i*)utf8_output, utf8_0); + utf8_output += row0[0]; + _mm_storeu_si128((__m128i*)utf8_output, utf8_1); + utf8_output += row1[0]; + + buf += 8; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. 
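+ // The scalar loop below re-encodes the next few words by hand: 1, 2 or 3
+ // UTF-8 bytes for a non-surrogate word, 4 bytes for a surrogate pair.
+ // Worked example: U+1F600 is stored as 0xD83D 0xDE00, and
+ // ((0xD83D - 0xD800) << 10) + (0xDE00 - 0xDC00) + 0x10000 = 0x1F600,
+ // which the final branch emits as the byte sequence F0 9F 98 80.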
+ size_t forward = 15; + size_t k = 0; + if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} + for(; k < forward; k++) { + uint16_t word = buf[k]; + if((word & 0xFF80)==0) { + *utf8_output++ = char(word); + } else if((word & 0xF800)==0) { + *utf8_output++ = char((word>>6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if((word &0xF800 ) != 0xD800) { + *utf8_output++ = char((word>>12) | 0b11100000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = buf[k+1]; + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if((diff | diff2) > 0x3FF) { return std::make_pair(nullptr, utf8_output); } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf8_output++ = char((value>>18) | 0b11110000); + *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((value & 0b111111) | 0b10000000); + } + } + buf += k; + } + } // while + + return std::make_pair(buf, utf8_output); +} +/* end file src/westmere/sse_convert_utf16_to_utf8.cpp */ + +// UTF-16 => UTF-8 conversion + +} // unnamed namespace +} // namespace westmere +} // namespace simdutf + +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/buf_block_reader.h +/* begin file src/generic/buf_block_reader.h */ +namespace simdutf { +namespace westmere { +namespace { + +// Walks through a buffer in block-sized increments, loading the last part with spaces +template +struct buf_block_reader { +public: + simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len); + simdutf_really_inline size_t block_index(); + simdutf_really_inline bool has_full_block() const; + simdutf_really_inline const uint8_t *full_block() const; + /** + * Get the last block, padded with spaces. + * + * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this + * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there + * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding. + * + * @return the number of effective characters in the last block. + */ + simdutf_really_inline size_t get_remainder(uint8_t *dst) const; + simdutf_really_inline void advance(); +private: + const uint8_t *buf; + const size_t len; + const size_t lenminusstep; + size_t idx; +}; + +// Routines to print masks and text for debugging bitmask operations +simdutf_unused static char * format_input_text_64(const uint8_t *text) { + static char *buf = reinterpret_cast(malloc(sizeof(simd8x64) + 1)); + for (size_t i=0; i); i++) { + buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + +// Routines to print masks and text for debugging bitmask operations +simdutf_unused static char * format_input_text(const simd8x64& in) { + static char *buf = reinterpret_cast(malloc(sizeof(simd8x64) + 1)); + in.store(reinterpret_cast(buf)); + for (size_t i=0; i); i++) { + if (buf[i] < ' ') { buf[i] = '_'; } + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + +simdutf_unused static char * format_mask(uint64_t mask) { + static char *buf = reinterpret_cast(malloc(64 + 1)); + for (size_t i=0; i<64; i++) { + buf[i] = (mask & (size_t(1) << i)) ? 
'X' : ' '; + } + buf[64] = '\0'; + return buf; +} + +template +simdutf_really_inline buf_block_reader::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {} + +template +simdutf_really_inline size_t buf_block_reader::block_index() { return idx; } + +template +simdutf_really_inline bool buf_block_reader::has_full_block() const { + return idx < lenminusstep; +} + +template +simdutf_really_inline const uint8_t *buf_block_reader::full_block() const { + return &buf[idx]; +} + +template +simdutf_really_inline size_t buf_block_reader::get_remainder(uint8_t *dst) const { + if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers + std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once. + std::memcpy(dst, buf + idx, len - idx); + return len - idx; +} + +template +simdutf_really_inline void buf_block_reader::advance() { + idx += STEP_SIZE; +} + +} // unnamed namespace +} // namespace westmere +} // namespace simdutf +/* end file src/generic/buf_block_reader.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_lookup4_algorithm.h +/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +namespace simdutf { +namespace westmere { +namespace { +namespace utf8_validation { + +using namespace simd; + + simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { +// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) +// Bit 1 = Too Long (ASCII followed by continuation) +// Bit 2 = Overlong 3-byte +// Bit 4 = Surrogate +// Bit 5 = Overlong 2-byte +// Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1<<6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 + ); + constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
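+ // Worked example of how the three lookups combine: for the byte pair
+ // 0xED 0xA0 (the start of a UTF-8-encoded surrogate, which is invalid),
+ // prev1 = 0xED selects SURROGATE both in byte_1_high (high nibble 0xE) and
+ // in byte_1_low (low nibble 0xD), and the continuation byte 0xA0 (high
+ // nibble 0xA) selects SURROGATE in byte_2_high, so the SURROGATE bit
+ // survives the AND of the three tables and is reported as an error.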
+ const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, + CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000 + ); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT + ); + return (byte_1_high & byte_1_low & byte_2_high); + } + simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, + const simd8 prev_input, const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; + } + + // + // Return nonzero if there are incomplete multibyte characters at the end of the block: + // e.g. if there is a 4-byte character, but it's 3 bytes from the end. + // + simdutf_really_inline simd8 is_incomplete(const simd8 input) { + // If the previous input's last 3 bytes match this, they're too short (they ended at EOF): + // ... 1111____ 111_____ 11______ + static const uint8_t max_array[32] = { + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1 + }; + const simd8 max_value(&max_array[sizeof(max_array)-sizeof(simd8)]); + return input.gt_bits(max_value); + } + + struct utf8_checker { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + // The last input we received + simd8 prev_input_block; + // Whether the last input we received was incomplete (used for ASCII fast path) + simd8 prev_incomplete; + + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes + // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + // The only problem that can happen at EOF is that a multibyte character is too short + // or a byte value too large in the last bytes: check_special_cases only checks for bytes + // too large in the first of two bytes. 
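+ // Example: if a block ends with the dangling lead bytes 0xF0 or 0xF0 0x9F,
+ // everything seen so far is still consistent with valid UTF-8, so
+ // check_utf8_bytes reports nothing and is_incomplete() records the dangling
+ // lead in prev_incomplete instead. That flag becomes an error at end of
+ // input (below) or in the ASCII fast path of check_next_input; if the next
+ // block is non-ASCII, check_utf8_bytes sees the lead via prev_input_block
+ // and validates the continuation bytes normally.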
+ simdutf_really_inline void check_eof() { + // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't + // possibly finish them. + this->error |= this->prev_incomplete; + } + + simdutf_really_inline void check_next_input(const simd8x64& input) { + if(simdutf_likely(is_ascii(input))) { + this->error |= this->prev_incomplete; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + this->prev_incomplete = is_incomplete(input.chunks[simd8x64::NUM_CHUNKS-1]); + this->prev_input_block = input.chunks[simd8x64::NUM_CHUNKS-1]; + + } + } + // do not forget to call check_eof! + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } + + }; // struct utf8_checker +} // namespace utf8_validation + +using utf8_validation::utf8_checker; + +} // unnamed namespace +} // namespace westmere +} // namespace simdutf +/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_validator.h +/* begin file src/generic/utf8_validation/utf8_validator.h */ +namespace simdutf { +namespace westmere { +namespace { +namespace utf8_validation { + +/** + * Validates that the string is actual UTF-8. + */ +template +bool generic_validate_utf8(const uint8_t * input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + return !c.errors(); +} + +bool generic_validate_utf8(const char * input, size_t length) { + return generic_validate_utf8(reinterpret_cast(input),length); +} + +} // namespace utf8_validation +} // unnamed namespace +} // namespace westmere +} // namespace simdutf +/* end file src/generic/utf8_validation/utf8_validator.h */ +// transcoding from UTF-8 to UTF-16 +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/valid_utf8_to_utf16.h +/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ + + +namespace simdutf { +namespace westmere { +namespace { +namespace utf8_to_utf16 { + +using namespace simd; + + +simdutf_warn_unused size_t convert_valid(const char* input, size_t size, + char16_t* utf16_output) noexcept { + // The implementation is not specific to haswell and should be moved to the generic directory. + size_t pos = 0; + char16_t* start{utf16_output}; + const size_t safety_margin = 16; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + // this loop could be unrolled further. For example, we could process the mask + // far more than 64 bytes. 
+ // + // For pure ASCII inputs, this function is not optimally fast because they are + // faster ways to just check for ASCII than to compute the continuation mask. + // However, the continuation mask is more informative. There might be a trade-off + // involved. + // + simd8x64 in(reinterpret_cast(input + pos)); + uint64_t utf8_continuation_mask = in.lt(-65 + 1); + // -65 is 0b10111111 in two-complement's, so largest possible continuation byte + if(utf8_continuation_mask != 0) { + // Slow path. We hope that the compiler will recognize that this is a slow path. + // Anything that is not a continuation mask is a 'leading byte', that is, the + // start of a new code point. + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + // The *start* of code points is not so useful, rather, we want the *end* of code points. + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times when using solely + // the slow/regular path, and at least four times if there are fast paths. + while(pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + // + // Thus we may allow convert_masked_utf8_to_utf16 to process + // more bytes at a time under a fast-path mode where 16 bytes + // are consumed at once (e.g., when encountering ASCII). + size_t consumed = convert_masked_utf8_to_utf16(input + pos, + utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block.These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. 
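+ // (Where the 80% worst case comes from: the inner loop stops at
+ // (pos + 64) - 12, so at most 12 of this block's 64 bytes are left to be
+ // re-read by the next outer iteration, i.e. at least 52/64, about 81%,
+ // of every block is consumed exactly once.)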
+ } else { + in.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } + } + utf16_output += scalar::utf8_to_utf16::convert_valid(input + pos, size - pos, utf16_output); + return utf16_output - start; +} + + +} // namespace utf8_to_utf16 +} // unnamed namespace +} // namespace westmere +} // namespace simdutf +/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/utf8_to_utf16.h +/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */ + + +namespace simdutf { +namespace westmere { +namespace { +namespace utf8_to_utf16 { +using namespace simd; + + + simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { +// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) +// Bit 1 = Too Long (ASCII followed by continuation) +// Bit 2 = Overlong 3-byte +// Bit 4 = Surrogate +// Bit 5 = Overlong 2-byte +// Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1<<6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 + ); + constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
+ const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, + CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000 + ); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT + ); + return (byte_1_high & byte_1_low & byte_2_high); + } + simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, + const simd8 prev_input, const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; + } + + + struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes + // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + + + simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output) { + size_t pos = 0; + char16_t* start{utf16_output}; + const size_t safety_margin = 16; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if(input.is_ascii()) { + input.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. 
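+ // Note: NUM_CHUNKS is a compile-time constant (hence the static_assert
+ // below), so only one of the manually unrolled branches survives
+ // optimization; a plain loop over the chunks reportedly optimizes poorly
+ // under MSVC.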
+ static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while(pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf16(in + pos, + utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block.These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if(errors()) { return 0; } + if(pos < size) { + size_t howmany = scalar::utf8_to_utf16::convert(in + pos, size - pos, utf16_output); + if(howmany == 0) { return 0; } + utf16_output += howmany; + } + return utf16_output - start; + } + + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } + + }; // struct utf8_checker +} // utf8_to_utf16 namespace +} // unnamed namespace +} // namespace westmere +} // namespace simdutf +/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */ +// other functions +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8.h +/* begin file src/generic/utf8.h */ + +namespace simdutf { +namespace westmere { +namespace { +namespace utf8 { + +using namespace simd; + +simdutf_really_inline size_t count_code_points(const char* in, size_t size) { + size_t pos = 0; + size_t count = 0; + for(;pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + count += 64 - count_ones(utf8_continuation_mask); + } + return count + scalar::utf8::count_code_points(in + pos, size - pos); +} + + +simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! 
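+ // The count below relies on two facts: every code point contributes
+ // exactly one non-continuation (leading) byte, so counting leading bytes
+ // counts code points; and only code points above U+FFFF, whose UTF-8 lead
+ // byte is 0xF0 (240) or larger, need a second UTF-16 word (a surrogate
+ // pair).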
+ for(;pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + // We count one word for anything that is not a continuation (so + // leading bytes). + count += 64 - count_ones(utf8_continuation_mask); + int64_t utf8_4byte = input.gteq_unsigned(240); + count += count_ones(utf8_4byte); + } + return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos); +} +} // utf8 namespace +} // unnamed namespace +} // namespace westmere +} // namespace simdutf +/* end file src/generic/utf8.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf16.h +/* begin file src/generic/utf16.h */ +#include +namespace simdutf { +namespace westmere { +namespace { +namespace utf16 { + +simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) { + size_t pos = 0; + size_t count = 0; + for(;pos + 32 <= size; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF); + count += count_ones(not_pair) / 2; + } + return count + scalar::utf16::count_code_points(in + pos, size - pos); +} +simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! + for(;pos + 32 <= size; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + uint64_t ascii_mask = input.lteq(0x7F); + uint64_t twobyte_mask = input.lteq(0x7FF); + uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF); + + size_t ascii_count = count_ones(ascii_mask) / 2; + size_t twobyte_count = count_ones(twobyte_mask & ~ ascii_mask) / 2; + size_t threebyte_count = count_ones(not_pair_mask & ~ twobyte_mask) / 2; + size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2; + count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count; + } + return count + scalar::utf16::utf8_length_from_utf16(in + pos, size - pos); +} +} // utf16 +} // unnamed namespace +} // namespace westmere +} // namespace simdutf +/* end file src/generic/utf16.h */ +// +// Implementation-specific overrides +// + +namespace simdutf { +namespace westmere { + +simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { + return westmere::utf8_validation::generic_validate_utf8(buf, len); +} + +simdutf_warn_unused bool implementation::validate_utf16(const char16_t *buf, size_t len) const noexcept { + const char16_t* tail = sse_validate_utf16le(buf, len); + if (tail) { + return scalar::utf16::validate(tail, len - (tail - buf)); + } else { + return false; + } +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16(const char* input, size_t size, + char16_t* utf16_output) const noexcept { + return utf8_to_utf16::convert_valid(input, size, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + std::pair ret = sse_convert_utf16_to_utf8(buf, len, utf8_output); + if (ret.first == nullptr) { return 0; } + size_t saved_bytes = ret.second - utf8_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert( + ret.first, len 
- (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + return convert_utf16_to_utf8(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::count_utf16(const char16_t * input, size_t length) const noexcept { + return utf16::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept { + return utf8::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf16(const char16_t * input, size_t length) const noexcept { + return utf16::utf8_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept { + return utf8::utf16_length_from_utf8(input, length); +} + +} // namespace westmere +} // namespace simdutf + +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/westmere/end.h +/* begin file src/simdutf/westmere/end.h */ +SIMDUTF_UNTARGET_REGION +/* end file src/simdutf/westmere/end.h */ +/* end file src/westmere/implementation.cpp */ +#endif + +SIMDUTF_POP_DISABLE_WARNINGS +/* end file src/simdutf.cpp */ diff --git a/cbits/validate_utf8.cpp b/cbits/validate_utf8.cpp new file mode 100644 index 00000000..73ab5f25 --- /dev/null +++ b/cbits/validate_utf8.cpp @@ -0,0 +1,6 @@ +#include "simdutf.h" + +extern "C" +int _hs_text_is_valid_utf8(const char* str, size_t len){ + return simdutf::validate_utf8(str, len); +} diff --git a/include/simdutf.h b/include/simdutf.h new file mode 100644 index 00000000..82e999fb --- /dev/null +++ b/include/simdutf.h @@ -0,0 +1,1084 @@ +/* auto-generated on 2021-07-29 10:43:28 -0400. Do not edit! */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/include, filename=simdutf.h +/* begin file include/simdutf.h */ +#ifndef SIMDUTF_H +#define SIMDUTF_H +#include +#include +#include +#include + +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/include, filename=simdutf/compiler_check.h +/* begin file include/simdutf/compiler_check.h */ +#ifndef SIMDUTF_COMPILER_CHECK_H +#define SIMDUTF_COMPILER_CHECK_H + +#ifndef __cplusplus +#error simdutf requires a C++ compiler +#endif + +#ifndef SIMDUTF_CPLUSPLUS +#if defined(_MSVC_LANG) && !defined(__clang__) +#define SIMDUTF_CPLUSPLUS (_MSC_VER == 1900 ? 
201103L : _MSVC_LANG) +#else +#define SIMDUTF_CPLUSPLUS __cplusplus +#endif +#endif + +// C++ 17 +#if !defined(SIMDUTF_CPLUSPLUS17) && (SIMDUTF_CPLUSPLUS >= 201703L) +#define SIMDUTF_CPLUSPLUS17 1 +#endif + +// C++ 14 +#if !defined(SIMDUTF_CPLUSPLUS14) && (SIMDUTF_CPLUSPLUS >= 201402L) +#define SIMDUTF_CPLUSPLUS14 1 +#endif + +// C++ 11 +#if !defined(SIMDUTF_CPLUSPLUS11) && (SIMDUTF_CPLUSPLUS >= 201103L) +#define SIMDUTF_CPLUSPLUS11 1 +#endif + +#ifndef SIMDUTF_CPLUSPLUS11 +#error simdutf requires a compiler compliant with the C++11 standard +#endif + +#endif // SIMDUTF_COMPILER_CHECK_H +/* end file include/simdutf/compiler_check.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/include, filename=simdutf/common_defs.h +/* begin file include/simdutf/common_defs.h */ +#ifndef SIMDUTF_COMMON_DEFS_H +#define SIMDUTF_COMMON_DEFS_H + +#include +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/include, filename=simdutf/portability.h +/* begin file include/simdutf/portability.h */ +#ifndef SIMDUTF_PORTABILITY_H +#define SIMDUTF_PORTABILITY_H + +#include +#include +#include +#include +#include +#ifndef _WIN32 +// strcasecmp, strncasecmp +#include +#endif + +#ifdef _MSC_VER +#define SIMDUTF_VISUAL_STUDIO 1 +/** + * We want to differentiate carefully between + * clang under visual studio and regular visual + * studio. + * + * Under clang for Windows, we enable: + * * target pragmas so that part and only part of the + * code gets compiled for advanced instructions. + * + */ +#ifdef __clang__ +// clang under visual studio +#define SIMDUTF_CLANG_VISUAL_STUDIO 1 +#else +// just regular visual studio (best guess) +#define SIMDUTF_REGULAR_VISUAL_STUDIO 1 +#endif // __clang__ +#endif // _MSC_VER + +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO +// https://en.wikipedia.org/wiki/C_alternative_tokens +// This header should have no effect, except maybe +// under Visual Studio. +#include +#endif + +#if defined(__x86_64__) || defined(_M_AMD64) +#define SIMDUTF_IS_X86_64 1 +#elif defined(__aarch64__) || defined(_M_ARM64) +#define SIMDUTF_IS_ARM64 1 +#elif defined(__PPC64__) || defined(_M_PPC64) +//#define SIMDUTF_IS_PPC64 1 +#pragma message("The simdutf library does yet support SIMD acceleration under\ +POWER processors. Please see https://github.com/lemire/simdutf/issues/51") +#else +// The simdutf library is designed +// for 64-bit processors and it seems that you are not +// compiling for a known 64-bit platform. Please +// use a 64-bit target such as x64 or 64-bit ARM for best performance. +#define SIMDUTF_IS_32BITS 1 + +// We do not support 32-bit platforms, but it can be +// handy to identify them. +#if defined(_M_IX86) || defined(__i386__) +#define SIMDUTF_IS_X86_32BITS 1 +#elif defined(__arm__) || defined(_M_ARM) +#define SIMDUTF_IS_ARM_32BITS 1 +#elif defined(__PPC__) || defined(_M_PPC) +#define SIMDUTF_IS_PPC_32BITS 1 +#endif + +#endif // defined(__x86_64__) || defined(_M_AMD64) + +#ifdef SIMDUTF_IS_32BITS +#ifndef SIMDUTF_NO_PORTABILITY_WARNING +#pragma message("The simdutf library is designed \ +for 64-bit processors and it seems that you are not \ +compiling for a known 64-bit platform. All fast kernels \ +will be disabled and performance may be poor. Please \ +use a 64-bit target such as x64, 64-bit ARM or 64-bit PPC.") +#endif // SIMDUTF_NO_PORTABILITY_WARNING +#endif // SIMDUTF_IS_32BITS + +// this is almost standard? 
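+// Note: the two-level STRINGIFY idiom below ensures macro arguments are
+// expanded before '#' turns them into a string literal; the
+// SIMDUTF_TARGET_REGION macros further down use it to build the string
+// that _Pragma(...) requires.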
+#undef STRINGIFY_IMPLEMENTATION_ +#undef STRINGIFY +#define STRINGIFY_IMPLEMENTATION_(a) #a +#define STRINGIFY(a) STRINGIFY_IMPLEMENTATION_(a) + +// Our fast kernels require 64-bit systems. +// +// On 32-bit x86, we lack 64-bit popcnt, lzcnt, blsr instructions. +// Furthermore, the number of SIMD registers is reduced. +// +// On 32-bit ARM, we would have smaller registers. +// +// The simdutf users should still have the fallback kernel. It is +// slower, but it should run everywhere. + +// +// Enable valid runtime implementations, and select SIMDUTF_BUILTIN_IMPLEMENTATION +// + +// We are going to use runtime dispatch. +#ifdef SIMDUTF_IS_X86_64 +#ifdef __clang__ +// clang does not have GCC push pop +// warning: clang attribute push can't be used within a namespace in clang up +// til 8.0 so SIMDUTF_TARGET_REGION and SIMDUTF_UNTARGET_REGION must be *outside* of a +// namespace. +#define SIMDUTF_TARGET_REGION(T) \ + _Pragma(STRINGIFY( \ + clang attribute push(__attribute__((target(T))), apply_to = function))) +#define SIMDUTF_UNTARGET_REGION _Pragma("clang attribute pop") +#elif defined(__GNUC__) +// GCC is easier +#define SIMDUTF_TARGET_REGION(T) \ + _Pragma("GCC push_options") _Pragma(STRINGIFY(GCC target(T))) +#define SIMDUTF_UNTARGET_REGION _Pragma("GCC pop_options") +#endif // clang then gcc + +#endif // x86 + +// Default target region macros don't do anything. +#ifndef SIMDUTF_TARGET_REGION +#define SIMDUTF_TARGET_REGION(T) +#define SIMDUTF_UNTARGET_REGION +#endif + +// Is threading enabled? +#if defined(_REENTRANT) || defined(_MT) +#ifndef SIMDUTF_THREADS_ENABLED +#define SIMDUTF_THREADS_ENABLED +#endif +#endif + +// workaround for large stack sizes under -O0. +// https://github.com/simdutf/simdutf/issues/691 +#ifdef __APPLE__ +#ifndef __OPTIMIZE__ +// Apple systems have small stack sizes in secondary threads. +// Lack of compiler optimization may generate high stack usage. +// Users may want to disable threads for safety, but only when +// in debug mode which we detect by the fact that the __OPTIMIZE__ +// macro is not defined. +#undef SIMDUTF_THREADS_ENABLED +#endif +#endif + + +#if defined(__clang__) +#define NO_SANITIZE_UNDEFINED __attribute__((no_sanitize("undefined"))) +#elif defined(__GNUC__) +#define NO_SANITIZE_UNDEFINED __attribute__((no_sanitize_undefined)) +#else +#define NO_SANITIZE_UNDEFINED +#endif + +#ifdef SIMDUTF_VISUAL_STUDIO +// This is one case where we do not distinguish between +// regular visual studio and clang under visual studio. +// clang under Windows has _stricmp (like visual studio) but not strcasecmp (as clang normally has) +#define simdutf_strcasecmp _stricmp +#define simdutf_strncasecmp _strnicmp +#else +// The strcasecmp, strncasecmp, and strcasestr functions do not work with multibyte strings (e.g. UTF-8). +// So they are only useful for ASCII in our context. 
+// https://www.gnu.org/software/libunistring/manual/libunistring.html#char-_002a-strings +#define simdutf_strcasecmp strcasecmp +#define simdutf_strncasecmp strncasecmp +#endif + +#ifdef NDEBUG + +#ifdef SIMDUTF_VISUAL_STUDIO +#define SIMDUTF_UNREACHABLE() __assume(0) +#define SIMDUTF_ASSUME(COND) __assume(COND) +#else +#define SIMDUTF_UNREACHABLE() __builtin_unreachable(); +#define SIMDUTF_ASSUME(COND) do { if (!(COND)) __builtin_unreachable(); } while (0) +#endif + +#else // NDEBUG + +#define SIMDUTF_UNREACHABLE() assert(0); +#define SIMDUTF_ASSUME(COND) assert(COND) + +#endif + +#endif // SIMDUTF_PORTABILITY_H +/* end file include/simdutf/portability.h */ + + +#if defined(__GNUC__) + // Marks a block with a name so that MCA analysis can see it. + #define SIMDUTF_BEGIN_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-BEGIN " #name); + #define SIMDUTF_END_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-END " #name); + #define SIMDUTF_DEBUG_BLOCK(name, block) BEGIN_DEBUG_BLOCK(name); block; END_DEBUG_BLOCK(name); +#else + #define SIMDUTF_BEGIN_DEBUG_BLOCK(name) + #define SIMDUTF_END_DEBUG_BLOCK(name) + #define SIMDUTF_DEBUG_BLOCK(name, block) +#endif + +// Align to N-byte boundary +#define SIMDUTF_ROUNDUP_N(a, n) (((a) + ((n)-1)) & ~((n)-1)) +#define SIMDUTF_ROUNDDOWN_N(a, n) ((a) & ~((n)-1)) + +#define SIMDUTF_ISALIGNED_N(ptr, n) (((uintptr_t)(ptr) & ((n)-1)) == 0) + +#if defined(SIMDUTF_REGULAR_VISUAL_STUDIO) + + #define simdutf_really_inline __forceinline + #define simdutf_never_inline __declspec(noinline) + + #define simdutf_unused + #define simdutf_warn_unused + + #ifndef simdutf_likely + #define simdutf_likely(x) x + #endif + #ifndef simdutf_unlikely + #define simdutf_unlikely(x) x + #endif + + #define SIMDUTF_PUSH_DISABLE_WARNINGS __pragma(warning( push )) + #define SIMDUTF_PUSH_DISABLE_ALL_WARNINGS __pragma(warning( push, 0 )) + #define SIMDUTF_DISABLE_VS_WARNING(WARNING_NUMBER) __pragma(warning( disable : WARNING_NUMBER )) + // Get rid of Intellisense-only warnings (Code Analysis) + // Though __has_include is C++17, it is supported in Visual Studio 2017 or better (_MSC_VER>=1910). 
+ #ifdef __has_include + #if __has_include() + #include + #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS SIMDUTF_DISABLE_VS_WARNING(ALL_CPPCORECHECK_WARNINGS) + #endif + #endif + + #ifndef SIMDUTF_DISABLE_UNDESIRED_WARNINGS + #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS + #endif + + #define SIMDUTF_DISABLE_DEPRECATED_WARNING SIMDUTF_DISABLE_VS_WARNING(4996) + #define SIMDUTF_DISABLE_STRICT_OVERFLOW_WARNING + #define SIMDUTF_POP_DISABLE_WARNINGS __pragma(warning( pop )) + +#else // SIMDUTF_REGULAR_VISUAL_STUDIO + + #define simdutf_really_inline inline __attribute__((always_inline)) + #define simdutf_never_inline inline __attribute__((noinline)) + + #define simdutf_unused __attribute__((unused)) + #define simdutf_warn_unused __attribute__((warn_unused_result)) + + #ifndef simdutf_likely + #define simdutf_likely(x) __builtin_expect(!!(x), 1) + #endif + #ifndef simdutf_unlikely + #define simdutf_unlikely(x) __builtin_expect(!!(x), 0) + #endif + + #define SIMDUTF_PUSH_DISABLE_WARNINGS _Pragma("GCC diagnostic push") + // gcc doesn't seem to disable all warnings with all and extra, add warnings here as necessary + #define SIMDUTF_PUSH_DISABLE_ALL_WARNINGS SIMDUTF_PUSH_DISABLE_WARNINGS \ + SIMDUTF_DISABLE_GCC_WARNING(-Weffc++) \ + SIMDUTF_DISABLE_GCC_WARNING(-Wall) \ + SIMDUTF_DISABLE_GCC_WARNING(-Wconversion) \ + SIMDUTF_DISABLE_GCC_WARNING(-Wextra) \ + SIMDUTF_DISABLE_GCC_WARNING(-Wattributes) \ + SIMDUTF_DISABLE_GCC_WARNING(-Wimplicit-fallthrough) \ + SIMDUTF_DISABLE_GCC_WARNING(-Wnon-virtual-dtor) \ + SIMDUTF_DISABLE_GCC_WARNING(-Wreturn-type) \ + SIMDUTF_DISABLE_GCC_WARNING(-Wshadow) \ + SIMDUTF_DISABLE_GCC_WARNING(-Wunused-parameter) \ + SIMDUTF_DISABLE_GCC_WARNING(-Wunused-variable) + #define SIMDUTF_PRAGMA(P) _Pragma(#P) + #define SIMDUTF_DISABLE_GCC_WARNING(WARNING) SIMDUTF_PRAGMA(GCC diagnostic ignored #WARNING) + #if defined(SIMDUTF_CLANG_VISUAL_STUDIO) + #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS SIMDUTF_DISABLE_GCC_WARNING(-Wmicrosoft-include) + #else + #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS + #endif + #define SIMDUTF_DISABLE_DEPRECATED_WARNING SIMDUTF_DISABLE_GCC_WARNING(-Wdeprecated-declarations) + #define SIMDUTF_DISABLE_STRICT_OVERFLOW_WARNING SIMDUTF_DISABLE_GCC_WARNING(-Wstrict-overflow) + #define SIMDUTF_POP_DISABLE_WARNINGS _Pragma("GCC diagnostic pop") + + + +#endif // MSC_VER + +#if defined(SIMDUTF_VISUAL_STUDIO) + /** + * It does not matter here whether you are using + * the regular visual studio or clang under visual + * studio. + */ + #if SIMDUTF_USING_LIBRARY + #define SIMDUTF_DLLIMPORTEXPORT __declspec(dllimport) + #else + #define SIMDUTF_DLLIMPORTEXPORT __declspec(dllexport) + #endif +#else + #define SIMDUTF_DLLIMPORTEXPORT +#endif + +/// If EXPR is an error, returns it. +#define SIMDUTF_TRY(EXPR) { auto _err = (EXPR); if (_err) { return _err; } } + + +#endif // SIMDUTF_COMMON_DEFS_H +/* end file include/simdutf/common_defs.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/include, filename=simdutf/encoding_types.h +/* begin file include/simdutf/encoding_types.h */ +#include + +namespace simdutf { + +enum encoding_type { + UTF16_LE, // BOM 0xff 0xfe + UTF16_BE, // BOM 0xfe 0xff + UTF32_LE, // BOM 0xff 0xfe 0x00 0x00 + UTF32_BE, // BOM 0x00 0x00 0xfe 0xff + UTF8, // BOM 0xef 0xbb 0xbf + unspecified +}; + +std::string to_string(encoding_type bom); + +// Note that BOM for UTF8 is discouraged. +namespace BOM { + +/** + * Checks for a BOM. 
If not, returns unspecified + * @param input the string to process + * @param length the length of the string in words + * @return the corresponding encoding + */ + +encoding_type check_bom(const uint8_t* byte, size_t length); +encoding_type check_bom(const char* byte, size_t length); +/** + * Returns the size, in bytes, of the BOM for a given encoding type. + * Note that UTF8 BOM are discouraged. + * @param bom the encoding type + * @return the size in bytes of the corresponding BOM + */ +size_t bom_byte_size(encoding_type bom); + +} // BOM namespace +} // simdutf namespace +/* end file include/simdutf/encoding_types.h */ + +SIMDUTF_PUSH_DISABLE_WARNINGS +SIMDUTF_DISABLE_UNDESIRED_WARNINGS + +// Public API +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/include, filename=simdutf/simdutf_version.h +/* begin file include/simdutf/simdutf_version.h */ +// /include/simdutf/simdutf_version.h automatically generated by release.py, +// do not change by hand +#ifndef SIMDUTF_SIMDUTF_VERSION_H +#define SIMDUTF_SIMDUTF_VERSION_H + +/** The version of simdutf being used (major.minor.revision) */ +#define SIMDUTF_VERSION 0.1.0 + +namespace simdutf { +enum { + /** + * The major version (MAJOR.minor.revision) of simdutf being used. + */ + SIMDUTF_VERSION_MAJOR = 0, + /** + * The minor version (major.MINOR.revision) of simdutf being used. + */ + SIMDUTF_VERSION_MINOR = 1, + /** + * The revision (major.minor.REVISION) of simdutf being used. + */ + SIMDUTF_VERSION_REVISION = 0 +}; +} // namespace simdutf + +#endif // SIMDUTF_SIMDUTF_VERSION_H +/* end file include/simdutf/simdutf_version.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/include, filename=simdutf/implementation.h +/* begin file include/simdutf/implementation.h */ +#ifndef SIMDUTF_IMPLEMENTATION_H +#define SIMDUTF_IMPLEMENTATION_H +#include +#include +#include +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/include, filename=simdutf/internal/isadetection.h +/* begin file include/simdutf/internal/isadetection.h */ +/* From +https://github.com/endorno/pytorch/blob/master/torch/lib/TH/generic/simd/simd.h +Highly modified. + +Copyright (c) 2016- Facebook, Inc (Adam Paszke) +Copyright (c) 2014- Facebook, Inc (Soumith Chintala) +Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) +Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) +Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) +Copyright (c) 2011-2013 NYU (Clement Farabet) +Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, +Iain Melvin, Jason Weston) Copyright (c) 2006 Idiap Research Institute +(Samy Bengio) Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, +Samy Bengio, Johnny Mariethoz) + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +3. 
Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories +America and IDIAP Research Institute nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef SIMDutf_INTERNAL_ISADETECTION_H +#define SIMDutf_INTERNAL_ISADETECTION_H + +#include +#include +#if defined(_MSC_VER) +#include +#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID) +#include +#endif + +namespace simdutf { +namespace internal { + + +enum instruction_set { + DEFAULT = 0x0, + NEON = 0x1, + AVX2 = 0x4, + SSE42 = 0x8, + PCLMULQDQ = 0x10, + BMI1 = 0x20, + BMI2 = 0x40, + ALTIVEC = 0x80 +}; + +#if defined(__PPC64__) + +static inline uint32_t detect_supported_architectures() { + return instruction_set::ALTIVEC; +} + +#elif defined(__arm__) || defined(__aarch64__) // incl. armel, armhf, arm64 + +#if defined(__ARM_NEON) + +static inline uint32_t detect_supported_architectures() { + return instruction_set::NEON; +} + +#else // ARM without NEON + +static inline uint32_t detect_supported_architectures() { + return instruction_set::DEFAULT; +} + +#endif + +#elif defined(__x86_64__) || defined(_M_AMD64) // x64 + + +namespace { +// Can be found on Intel ISA Reference for CPUID +constexpr uint32_t cpuid_avx2_bit = 1 << 5; ///< @private Bit 5 of EBX for EAX=0x7 +constexpr uint32_t cpuid_bmi1_bit = 1 << 3; ///< @private bit 3 of EBX for EAX=0x7 +constexpr uint32_t cpuid_bmi2_bit = 1 << 8; ///< @private bit 8 of EBX for EAX=0x7 +constexpr uint32_t cpuid_sse42_bit = 1 << 20; ///< @private bit 20 of ECX for EAX=0x1 +constexpr uint32_t cpuid_pclmulqdq_bit = 1 << 1; ///< @private bit 1 of ECX for EAX=0x1 +} + + + +static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, + uint32_t *edx) { +#if defined(_MSC_VER) + int cpu_info[4]; + __cpuid(cpu_info, *eax); + *eax = cpu_info[0]; + *ebx = cpu_info[1]; + *ecx = cpu_info[2]; + *edx = cpu_info[3]; +#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID) + uint32_t level = *eax; + __get_cpuid(level, eax, ebx, ecx, edx); +#else + uint32_t a = *eax, b, c = *ecx, d; + asm volatile("cpuid\n\t" : "+a"(a), "=b"(b), "+c"(c), "=d"(d)); + *eax = a; + *ebx = b; + *ecx = c; + *edx = d; +#endif +} + +static inline uint32_t detect_supported_architectures() { + uint32_t eax, ebx, ecx, edx; + uint32_t host_isa = 0x0; + + // ECX for EAX=0x7 + eax = 0x7; + ecx = 0x0; + cpuid(&eax, &ebx, &ecx, &edx); + if (ebx & cpuid_avx2_bit) { + host_isa |= instruction_set::AVX2; + } + if (ebx & cpuid_bmi1_bit) { + host_isa |= instruction_set::BMI1; + } + + if (ebx & cpuid_bmi2_bit) { + host_isa |= instruction_set::BMI2; + } + + // EBX for EAX=0x1 + eax = 0x1; + cpuid(&eax, &ebx, &ecx, &edx); + + if (ecx & cpuid_sse42_bit) { 
+ host_isa |= instruction_set::SSE42; + } + + if (ecx & cpuid_pclmulqdq_bit) { + host_isa |= instruction_set::PCLMULQDQ; + } + + return host_isa; +} +#else // fallback + + +static inline uint32_t detect_supported_architectures() { + return instruction_set::DEFAULT; +} + + +#endif // end SIMD extension detection code + +} // namespace internal +} // namespace simdutf + +#endif // SIMDutf_INTERNAL_ISADETECTION_H +/* end file include/simdutf/internal/isadetection.h */ + + +namespace simdutf { + +/** + * Autodetect the encoding of the input. + * + * @param input the string to analyze. + * @param length the length of the string in bytes. + * @return the detected encoding type + */ +simdutf_warn_unused simdutf::encoding_type autodetect_encoding(const char * input, size_t length) noexcept; +simdutf_really_inline simdutf_warn_unused simdutf::encoding_type autodetect_encoding(const uint8_t * input, size_t length) noexcept { + return autodetect_encoding(reinterpret_cast(input), length); +} + + +/** + * Validate the UTF-8 string. + * + * Overridden by each implementation. + * + * @param buf the UTF-8 string to validate. + * @param len the length of the string in bytes. + * @return true if and only if the string is valid UTF-8. + */ +simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept; + +/** + * Validate the UTF-16LE string. + * + * Overridden by each implementation. + * + * This function is not BOM-aware. + * + * @param buf the UTF-16LE string to validate. + * @param len the length of the string in number of 2-byte words (char16_t). + * @return true if and only if the string is valid UTF-16LE. + */ +simdutf_warn_unused bool validate_utf16(const char16_t *buf, size_t len) noexcept; + +/** + * Convert possibly broken UTF-8 string into UTF-16LE string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * @param input the UTF-8 string to convert + * @param length the length of the string in bytes + * @param utf16_buffer the pointer to buffer that can hold conversion result + * @return the number of written char16_t; 0 if the input was not valid UTF-8 string + */ +simdutf_warn_unused size_t convert_utf8_to_utf16(const char * input, size_t length, char16_t* utf8_output) noexcept; + +/** + * Convert valid UTF-8 string into UTF-16LE string. + * + * This function assumes that the input string is valid UTF-8. + * + * @param input the UTF-8 string to convert + * @param length the length of the string in bytes + * @param utf16_buffer the pointer to buffer that can hold conversion result + * @return the number of written char16_t + */ +simdutf_warn_unused size_t convert_valid_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_buffer) noexcept; + +/** + * Compute the number of 2-byte words that this UTF-8 string would require in UTF-16LE format. + * + * This function does not validate the input. + * + * This function is not BOM-aware. + * + * @param input the UTF-8 string to process + * @param length the length of the string in bytes + * @return the number of char16_t words required to encode the UTF-8 string as UTF-16LE + */ +simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) noexcept; + +/** + * Convert possibly broken UTF-16LE string into UTF-8 string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. 
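+ * To size the destination buffer up front, utf8_length_from_utf16 (declared
+ * below) returns the exact number of bytes the UTF-8 output requires.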
+ * + * This function is not BOM-aware. + * + * @param input the UTF-16LE string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @param utf8_buffer the pointer to buffer that can hold conversion result + * @return number of written words; 0 if input is not a valid UTF-16LE string + */ +simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept; + +/** + * Convert valid UTF-16LE string into UTF-8 string. + * + * This function assumes that the input string is valid UTF-16LE. + * + * This function is not BOM-aware. + * + * @param input the UTF-16LE string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @param utf8_buffer the pointer to buffer that can hold the conversion result + * @return number of written words; 0 if conversion is not possible + */ +simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept; + +/** + * Compute the number of bytes that this UTF-16LE string would require in UTF-8 format. + * + * This function does not validate the input. + * + * @param input the UTF-16LE string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @return the number of bytes required to encode the UTF-16LE string as UTF-8 + */ +simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t * input, size_t length) noexcept; + +/** + * Count the number of code points (characters) in the string assuming that + * it is valid. + * + * This function assumes that the input string is valid UTF-16LE. + * + * This function is not BOM-aware. + * + * @param input the UTF-16LE string to process + * @param length the length of the string in 2-byte words (char16_t) + * @return number of code points + */ +simdutf_warn_unused size_t count_utf16(const char16_t * input, size_t length) noexcept; + +/** + * Count the number of code points (characters) in the string assuming that + * it is valid. + * + * This function assumes that the input string is valid UTF-8. + * + * @param input the UTF-8 string to process + * @param length the length of the string in bytes + * @return number of code points + */ +simdutf_warn_unused size_t count_utf8(const char * input, size_t length) noexcept; + +/** + * An implementation of simdutf for a particular CPU architecture. + * + * Also used to maintain the currently active implementation. The active implementation is + * automatically initialized on first use to the most advanced implementation supported by the host. + */ +class implementation { +public: + + /** + * The name of this implementation. + * + * const implementation *impl = simdutf::active_implementation; + * cout << "simdutf is optimized for " << impl->name() << "(" << impl->description() << ")" << endl; + * + * @return the name of the implementation, e.g. "haswell", "westmere", "arm64" + */ + virtual const std::string &name() const { return _name; } + + /** + * The description of this implementation. + * + * const implementation *impl = simdutf::active_implementation; + * cout << "simdutf is optimized for " << impl->name() << "(" << impl->description() << ")" << endl; + * + * @return the name of the implementation, e.g. "haswell", "westmere", "arm64" + */ + virtual const std::string &description() const { return _description; } + + /** + * The instruction sets this implementation is compiled against + * and the current CPU match. 
This function may poll the current CPU/system + * and should therefore not be called too often if performance is a concern. + * + * + * @return true if the implementation can be safely used on the current system (determined at runtime) + */ + bool supported_by_runtime_system() const; + + /** + * This function will try to detect the encoding + * @param input the string to identify + * @param length the length of the string in bytes. + * @return the encoding type detected + */ + virtual encoding_type autodetect_encoding(const char * input, size_t length) const noexcept; + + /** + * @private For internal implementation use + * + * The instruction sets this implementation is compiled against. + * + * @return a mask of all required `internal::instruction_set::` values + */ + virtual uint32_t required_instruction_sets() const { return _required_instruction_sets; }; + + + /** + * Validate the UTF-8 string. + * + * Overridden by each implementation. + * + * @param buf the UTF-8 string to validate. + * @param len the length of the string in bytes. + * @return true if and only if the string is valid UTF-8. + */ + simdutf_warn_unused virtual bool validate_utf8(const char *buf, size_t len) const noexcept = 0; + + /** + * Validate the UTF-16LE string. + * + * Overridden by each implementation. + * + * This function is not BOM-aware. + * + * @param buf the UTF-16LE string to validate. + * @param len the length of the string in number of 2-byte words (char16_t). + * @return true if and only if the string is valid UTF-16LE. + */ + simdutf_warn_unused virtual bool validate_utf16(const char16_t *buf, size_t len) const noexcept = 0; + + /** + * Convert possibly broken UTF-8 string into UTF-16LE string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * @param input the UTF-8 string to convert + * @param length the length of the string in bytes + * @param utf16_buffer the pointer to buffer that can hold conversion result + * @return the number of written char16_t; 0 if the input was not valid UTF-8 string + */ + simdutf_warn_unused virtual size_t convert_utf8_to_utf16(const char * input, size_t length, char16_t* utf8_output) const noexcept = 0; + + /** + * Convert valid UTF-8 string into UTF-16LE string. + * + * This function assumes that the input string is valid UTF-8. + * + * @param input the UTF-8 string to convert + * @param length the length of the string in bytes + * @param utf16_buffer the pointer to buffer that can hold conversion result + * @return the number of written char16_t + */ + simdutf_warn_unused virtual size_t convert_valid_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_buffer) const noexcept = 0; + + /** + * Compute the number of 2-byte words that this UTF-8 string would require in UTF-16LE format. + * + * This function does not validate the input. + * + * @param input the UTF-8 string to process + * @param length the length of the string in bytes + * @return the number of char16_t words required to encode the UTF-8 string as UTF-16LE + */ + simdutf_warn_unused virtual size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept = 0; + + /** + * Convert possibly broken UTF-16LE string into UTF-8 string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. 
+ * + * @param input the UTF-16LE string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @param utf8_buffer the pointer to buffer that can hold conversion result + * @return number of written words; 0 if input is not a valid UTF-16LE string + */ + simdutf_warn_unused virtual size_t convert_utf16_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0; + + /** + * Convert valid UTF-16LE string into UTF-8 string. + * + * This function assumes that the input string is valid UTF-16LE. + * + * This function is not BOM-aware. + * + * @param input the UTF-16LE string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @param utf8_buffer the pointer to buffer that can hold the conversion result + * @return number of written words; 0 if conversion is not possible + */ + simdutf_warn_unused virtual size_t convert_valid_utf16_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0; + + /** + * Compute the number of bytes that this UTF-16LE string would require in UTF-8 format. + * + * This function does not validate the input. + * + * This function is not BOM-aware. + * + * @param input the UTF-16LE string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @return the number of bytes required to encode the UTF-16LE string as UTF-8 + */ + simdutf_warn_unused virtual size_t utf8_length_from_utf16(const char16_t * input, size_t length) const noexcept = 0; + + /** + * Count the number of code points (characters) in the string assuming that + * it is valid. + * + * This function assumes that the input string is valid UTF-16LE. + * + * This function is not BOM-aware. + * + * @param input the UTF-16LE string to process + * @param length the length of the string in 2-byte words (char16_t) + * @return number of code points + */ + simdutf_warn_unused virtual size_t count_utf16(const char16_t * input, size_t length) const noexcept = 0; + + /** + * Count the number of code points (characters) in the string assuming that + * it is valid. + * + * This function assumes that the input string is valid UTF-8. + * + * @param input the UTF-8 string to process + * @param length the length of the string in bytes + * @return number of code points + */ + simdutf_warn_unused virtual size_t count_utf8(const char * input, size_t length) const noexcept = 0; + + + +protected: + /** @private Construct an implementation with the given name and description. For subclasses. */ + simdutf_really_inline implementation( + std::string name, + std::string description, + uint32_t required_instruction_sets + ) : + _name(name), + _description(description), + _required_instruction_sets(required_instruction_sets) + { + } + virtual ~implementation()=default; + +private: + /** + * The name of this implementation. + */ + const std::string _name; + + /** + * The description of this implementation. + */ + const std::string _description; + + /** + * Instruction sets required for this implementation. + */ + const uint32_t _required_instruction_sets; +}; + +/** @private */ +namespace internal { + +/** + * The list of available implementations compiled into simdutf. 
+ */
+class available_implementation_list {
+public:
+  /** Get the list of available implementations compiled into simdutf */
+  simdutf_really_inline available_implementation_list() {}
+  /** Number of implementations */
+  size_t size() const noexcept;
+  /** STL const begin() iterator */
+  const implementation * const *begin() const noexcept;
+  /** STL const end() iterator */
+  const implementation * const *end() const noexcept;
+
+  /**
+   * Get the implementation with the given name.
+   *
+   * Case sensitive.
+   *
+   *     const implementation *impl = simdutf::available_implementations["westmere"];
+   *     if (!impl) { exit(1); }
+   *     if (!impl->supported_by_runtime_system()) { exit(1); }
+   *     simdutf::active_implementation = impl;
+   *
+   * @param name the implementation to find, e.g. "westmere", "haswell", "arm64"
+   * @return the implementation, or nullptr if the parse failed.
+   */
+  const implementation * operator[](const std::string &name) const noexcept {
+    for (const implementation * impl : *this) {
+      if (impl->name() == name) { return impl; }
+    }
+    return nullptr;
+  }
+
+  /**
+   * Detect the most advanced implementation supported by the current host.
+   *
+   * This is used to initialize the implementation on startup.
+   *
+   *     const implementation *impl = simdutf::available_implementation::detect_best_supported();
+   *     simdutf::active_implementation = impl;
+   *
+   * @return the most advanced supported implementation for the current host, or an
+   *         implementation that returns UNSUPPORTED_ARCHITECTURE if there is no supported
+   *         implementation. Will never return nullptr.
+   */
+  const implementation *detect_best_supported() const noexcept;
+};
+
+template <typename T>
+class atomic_ptr {
+public:
+  atomic_ptr(T *_ptr) : ptr{_ptr} {}
+
+  operator const T*() const { return ptr.load(); }
+  const T& operator*() const { return *ptr; }
+  const T* operator->() const { return ptr.load(); }
+
+  operator T*() { return ptr.load(); }
+  T& operator*() { return *ptr; }
+  T* operator->() { return ptr.load(); }
+  atomic_ptr& operator=(T *_ptr) { ptr = _ptr; return *this; }
+
+private:
+  std::atomic<T*> ptr;
+};
+
+} // namespace internal
+
+/**
+ * The list of available implementations compiled into simdutf.
+ */
+extern SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list available_implementations;
+
+/**
+ * The active implementation.
+ *
+ * Automatically initialized on first use to the most advanced implementation supported by this hardware.
+ */
+extern SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr<const implementation> active_implementation;
+
+} // namespace simdutf
+
+#endif // SIMDUTF_IMPLEMENTATION_H
+/* end file include/simdutf/implementation.h */
+
+
+// Implementation-internal files (must be included before the implementations themselves, to keep
+// amalgamation working--otherwise, the first time a file is included, it might be put inside the
+// #ifdef SIMDUTF_IMPLEMENTATION_ARM64/FALLBACK/etc., which means the other implementations can't
+// compile unless that implementation is turned on).
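+// A minimal usage sketch of the API declared above (illustrative only: the
+// small wrapper below is hypothetical and is not part of simdutf or of the
+// text package's cbits):
+//
+//   bool valid_utf8(const char *bytes, size_t len) {
+//     // The free function dispatches through active_implementation, which is
+//     // initialized on first use via detect_supported_architectures().
+//     return simdutf::validate_utf8(bytes, len);
+//   }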
+ + +SIMDUTF_POP_DISABLE_WARNINGS + +#endif // SIMDUTF_H +/* end file include/simdutf.h */ diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index 45520878..7cf38883 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -78,7 +78,7 @@ import Data.Text.Internal.Unsafe.Char (unsafeWrite) import Data.Text.Show as T (singleton) import Data.Text.Unsafe (unsafeDupablePerformIO) import Data.Word (Word8) -import Foreign.C.Types (CSize(..)) +import Foreign.C.Types (CSize(..), CInt(..)) import Foreign.Ptr (Ptr, minusPtr, plusPtr) import Foreign.Storable (poke, peekByteOff) import GHC.Exts (byteArrayContents#, unsafeCoerce#) @@ -154,6 +154,10 @@ decodeLatin1 bs = withBS bs $ \fp len -> runST $ do foreign import ccall unsafe "_hs_text_is_ascii" c_is_ascii :: Ptr Word8 -> Ptr Word8 -> IO CSize +isValidBS :: ByteString -> Bool +isValidBS bs = withBS bs $ \fp len -> unsafeDupablePerformIO $ + unsafeWithForeignPtr fp $ \ptr -> (/= 0) <$> c_is_valid_utf8 ptr (fromIntegral len) + -- | Decode a 'ByteString' containing UTF-8 encoded text. -- -- Surrogate code points in replacement character returned by 'OnDecodeError' @@ -164,6 +168,9 @@ decodeUtf8With :: #endif OnDecodeError -> ByteString -> Text decodeUtf8With onErr bs + | isValidBS bs = + let !(SBS.SBS arr) = SBS.toShort bs in + (Text (A.ByteArray arr) 0 (B.length bs)) | B.null undecoded = txt | otherwise = txt `append` (case onErr desc (Just (B.head undecoded)) of Nothing -> txt' @@ -190,6 +197,21 @@ decodeUtf8With2 onErr bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do | i < len1 = B.index bs1 i | otherwise = B.index bs2 (i - len1) + -- We need Data.ByteString.findIndexEnd, but it is unavailable before bytestring-0.10.12.0 + guessUtf8Boundary :: Int + guessUtf8Boundary + | len2 >= 1 && w0 < 0x80 = len2 -- last char is ASCII + | len2 >= 1 && w0 >= 0xC0 = len2 - 1 -- last char starts a code point + | len2 >= 2 && w1 >= 0xC0 = len2 - 2 -- pre-last char starts a code point + | len2 >= 3 && w2 >= 0xC0 = len2 - 3 + | len2 >= 4 && w3 >= 0xC0 = len2 - 4 + | otherwise = 0 + where + w0 = B.index bs2 (len2 - 1) + w1 = B.index bs2 (len2 - 2) + w2 = B.index bs2 (len2 - 3) + w3 = B.index bs2 (len2 - 4) + decodeFrom :: Int -> DecoderResult decodeFrom off = step (off + 1) (utf8DecodeStart (index off)) where @@ -205,10 +227,21 @@ decodeUtf8With2 onErr bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do A.shrinkM dst dstOff arr <- A.unsafeFreeze dst return (Text arr 0 dstOff, mempty) + + | srcOff >= len1 + , srcOff < len1 + guessUtf8Boundary + , dstOff + (len1 + guessUtf8Boundary - srcOff) <= dstLen + , bs <- B.drop (srcOff - len1) (B.take guessUtf8Boundary bs2) + , isValidBS bs = do + withBS bs $ \fp _ -> unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> + unsafeSTToIO $ A.copyP dst dstOff src (len1 + guessUtf8Boundary - srcOff) + inner (len1 + guessUtf8Boundary) (dstOff + (len1 + guessUtf8Boundary - srcOff)) + | dstOff + 4 > dstLen = do let dstLen' = dstLen + 4 dst' <- A.resizeM dst dstLen' outer dst' dstLen' srcOff dstOff + | otherwise = case decodeFrom srcOff of Accept c -> do d <- unsafeWrite dst dstOff c @@ -508,3 +541,6 @@ encodeUtf32BE txt = E.unstream (E.restreamUtf32BE (F.stream txt)) cSizeToInt :: CSize -> Int cSizeToInt = fromIntegral + +foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8 + :: Ptr Word8 -> CSize -> IO CInt diff --git a/text.cabal b/text.cabal index e6992850..ceb67819 100644 --- a/text.cabal +++ b/text.cabal @@ -1,4 +1,4 @@ -cabal-version: >= 1.10 +cabal-version: 2.2 name: 
text version: 1.2.5.0 @@ -38,7 +38,7 @@ description: based on the well-respected and liberally licensed [ICU library](http://site.icu-project.org/). -license: BSD2 +license: BSD-2-Clause license-file: LICENSE author: Bryan O'Sullivan maintainer: Haskell Text Team , Core Libraries Committee @@ -54,6 +54,7 @@ extra-source-files: README.markdown changelog.md scripts/*.hs + include/*.h tests/literal-rule-test.sh tests/LiteralRuleTest.hs @@ -67,7 +68,22 @@ library cbits/measure_off.c cbits/reverse.c cbits/utils.c + cxx-sources: cbits/simdutf.cpp + cbits/validate_utf8.cpp + include-dirs: include hs-source-dirs: src + cxx-options: -std=c++17 + + if os(windows) + if arch(x86_64) + extra-libraries: stdc++-6 gcc_s_seh-1 + else + extra-libraries: stdc++-6 gcc_s_dw2-1 + else + if os(darwin) + extra-libraries: c++ + else + extra-libraries: stdc++ exposed-modules: Data.Text From bcc4dc6b5a663cda46753e866efbf5e5ab40afe7 Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Fri, 18 Jun 2021 02:28:49 +0100 Subject: [PATCH 35/38] Use GHC 8.10.5 for Windows build, because of issues with TH and simdutf --- .github/workflows/windows_and_macOS.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/windows_and_macOS.yml b/.github/workflows/windows_and_macOS.yml index aa041081..d2cc79b6 100644 --- a/.github/workflows/windows_and_macOS.yml +++ b/.github/workflows/windows_and_macOS.yml @@ -13,7 +13,7 @@ jobs: strategy: matrix: os: ['windows-latest', 'macOS-latest'] - ghc: ['9.0'] + ghc: ['8.10.5'] fail-fast: false steps: - uses: actions/checkout@v2 From 20b901d2d87c78d384efabf194653cec3fdf9c93 Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Sun, 5 Sep 2021 00:09:05 +0100 Subject: [PATCH 36/38] Avoid reconstructing chars in commonPrefixes --- src/Data/Text.hs | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/src/Data/Text.hs b/src/Data/Text.hs index 7114c853..b44a07a5 100644 --- a/src/Data/Text.hs +++ b/src/Data/Text.hs @@ -222,6 +222,7 @@ import Control.Monad.ST.Unsafe (unsafeIOToST) import qualified Data.Text.Array as A import qualified Data.List as L import Data.Binary (Binary(get, put)) +import Data.Int (Int8) import Data.Monoid (Monoid(..)) import Data.Semigroup (Semigroup(..)) import Data.String (IsString(..)) @@ -1922,16 +1923,26 @@ stripPrefix p@(Text _arr _off plen) t@(Text arr off len) -- -- >>> commonPrefixes "" "baz" -- Nothing -commonPrefixes :: Text -> Text -> Maybe (Text,Text,Text) -commonPrefixes t0@(Text arr0 off0 len0) t1@(Text arr1 off1 len1) = go 0 0 +commonPrefixes :: Text -> Text -> Maybe (Text, Text, Text) +commonPrefixes !t0@(Text arr0 off0 len0) !t1@(Text arr1 off1 len1) + | len0 == 0 = Nothing + | len1 == 0 = Nothing + | otherwise = go 0 0 where - go !i !j | i < len0 && j < len1 && a == b = go (i+d0) (j+d1) - | i > 0 = Just (Text arr0 off0 i, - text arr0 (off0+i) (len0-i), - text arr1 (off1+j) (len1-j)) - | otherwise = Nothing - where Iter a d0 = iter t0 i - Iter b d1 = iter t1 j + go !i !j + | i == len0 = Just (t0, empty, text arr1 (off1 + i) (len1 - i)) + | i == len1 = Just (t1, text arr0 (off0 + i) (len0 - i), empty) + | a == b = go (i + 1) k + | k > 0 = Just (Text arr0 off0 k, + Text arr0 (off0 + k) (len0 - k), + Text arr1 (off1 + k) (len1 - k)) + | otherwise = Nothing + where + a = A.unsafeIndex arr0 (off0 + i) + b = A.unsafeIndex arr1 (off1 + i) + isLeader = word8ToInt8 a >= -64 + k = if isLeader then i else j +{-# INLINE commonPrefixes #-} -- | /O(n)/ Return the prefix of the second string if its suffix -- 
matches the entire first string. @@ -2002,6 +2013,9 @@ intToCSize = P.fromIntegral cSsizeToInt :: CSsize -> Int cSsizeToInt = P.fromIntegral +word8ToInt8 :: Word8 -> Int8 +word8ToInt8 = P.fromIntegral + ------------------------------------------------- -- NOTE: the named chunk below used by doctest; -- verify the doctests via `doctest -fobject-code Data/Text.hs` From fd497074efbe862c89ea2285cfaf3d414bce2919 Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Mon, 6 Sep 2021 20:37:12 +0100 Subject: [PATCH 37/38] Bump version and update changelog --- changelog.md | 23 +++++++++++++++++++++-- src/Data/Text.hs | 2 ++ src/Data/Text/Array.hs | 18 ++++++++++++++++-- src/Data/Text/Encoding.hs | 2 +- src/Data/Text/Foreign.hs | 6 ++++++ src/Data/Text/Internal/Encoding/Utf8.hs | 7 +++++++ src/Data/Text/Internal/Private.hs | 2 ++ src/Data/Text/Internal/Unsafe/Char.hs | 1 + src/Data/Text/Unsafe.hs | 8 ++++++++ text.cabal | 2 +- 10 files changed, 65 insertions(+), 6 deletions(-) diff --git a/changelog.md b/changelog.md index 4792622c..563f3c4a 100644 --- a/changelog.md +++ b/changelog.md @@ -1,5 +1,24 @@ -### 1.3 - +### 2.0 + +* [Switch internal representation of text from UTF-16 to UTF-8](https://github.com/haskell/text/pull/365): + * Functions in `Data.Text.Array` now operate over arrays of `Word8` instead of `Word16`. + * Rename constructors of `Array` and `MArray` to `ByteArray` and `MutableByteArray`. + * Rename functions and types in `Data.Text.Foreign` to reflect switch + from `Word16` to `Word8`. + * Rename slicing functions in `Data.Text.Unsafe` to reflect switch + from `Word16` to `Word8`. + * Rename `Data.Text.Internal.Unsafe.Char.unsafeChr` to `unsafeChr16`. + * Change semantics and order of arguments of `Data.Text.Array.copyI`: + pass length, not end offset. + * Extend `Data.Text.Internal.Encoding.Utf8` to provide more UTF-8 related routines. + * Extend interface of `Data.Text.Array` with more utility functions. + * Add `instance Show Data.Text.Unsafe.Iter`. + * Add `Data.Text.measureOff`. + * Extend `Data.Text.Unsafe` with `iterArray` and `reverseIterArray`. + * Export `Data.Text.Internal.Lazy.equal`. + * Export `Data.Text.Internal.append`. + * Add `Data.Text.Internal.Private.spanAscii_`. + * Replacement characters in `decodeUtf8With` are no longer limited to Basic Multilingual Plane. * [Disable implicit fusion rules](https://github.com/haskell/text/pull/348) * [Add `Data.Text.Encoding.decodeUtf8Lenient`](https://github.com/haskell/text/pull/342) * [Remove `Data.Text.Internal.Unsafe.Shift`](https://github.com/haskell/text/pull/343) diff --git a/src/Data/Text.hs b/src/Data/Text.hs index b44a07a5..4d85e3e5 100644 --- a/src/Data/Text.hs +++ b/src/Data/Text.hs @@ -1225,6 +1225,8 @@ take n t@(Text arr off len) -- -- This function is used to implement 'take', 'drop', 'splitAt' and 'length' -- and is useful on its own in streaming and parsing libraries. +-- +-- @since 2.0 measureOff :: Int -> Text -> Int measureOff !n (Text (A.ByteArray arr) off len) = if len == 0 then 0 else cSsizeToInt $ unsafeDupablePerformIO $ diff --git a/src/Data/Text/Array.hs b/src/Data/Text/Array.hs index 93ad29f8..8c304320 100644 --- a/src/Data/Text/Array.hs +++ b/src/Data/Text/Array.hs @@ -80,6 +80,8 @@ new (I# len#) {-# INLINE new #-} -- | Create an uninitialized mutable pinned array. +-- +-- @since 2.0 newPinned :: forall s. 
Int -> ST s (MArray s) newPinned (I# len#) #if defined(ASSERTS) @@ -90,6 +92,7 @@ newPinned (I# len#) (# s2#, marr# #) -> (# s2#, MutableByteArray marr# #) {-# INLINE newPinned #-} +-- | @since 2.0 newFilled :: Int -> Int -> ST s (MArray s) newFilled (I# len#) (I# c#) = ST $ \s1# -> case newByteArray# len# s1# of @@ -97,6 +100,7 @@ newFilled (I# len#) (I# c#) = ST $ \s1# -> s3# -> (# s3#, MutableByteArray marr# #) {-# INLINE newFilled #-} +-- | @since 2.0 tile :: MArray s -> Int -> ST s () tile marr tileLen = do totalLen <- getSizeofMArray marr @@ -130,8 +134,10 @@ unsafeIndex (ByteArray arr) i@(I# i#) = case indexWord8Array# arr i# of r# -> (W8# r#) {-# INLINE unsafeIndex #-} --- sizeofMutableByteArray# is deprecated, because it is unsafe in the presence of --- shrinkMutableByteArray# and resizeMutableByteArray#. +-- | 'sizeofMutableByteArray#' is deprecated, because it is unsafe in the presence of +-- 'shrinkMutableByteArray#' and 'resizeMutableByteArray#'. +-- +-- @since 2.0 getSizeofMArray :: MArray s -> ST s Int getSizeofMArray (MutableByteArray marr) = ST $ \s0# -> case getSizeofMutableByteArray# marr s0# of @@ -185,12 +191,14 @@ run2 k = runST (do return (arr,b)) {-# INLINE run2 #-} +-- | @since 2.0 resizeM :: MArray s -> Int -> ST s (MArray s) resizeM (MutableByteArray ma) i@(I# i#) = ST $ \s1# -> case resizeMutableByteArray# ma i# s1# of (# s2#, newArr #) -> (# s2#, MutableByteArray newArr #) {-# INLINE resizeM #-} +-- | @since 2.0 shrinkM :: #if defined(ASSERTS) HasCallStack => @@ -253,6 +261,8 @@ copyI count@(I# count#) (MutableByteArray dst#) dstOff@(I# dstOff#) (ByteArray s {-# INLINE copyI #-} -- | Copy from pointer. +-- +-- @since 2.0 copyFromPointer :: MArray s -- ^ Destination -> Int -- ^ Destination offset @@ -270,6 +280,8 @@ copyFromPointer (MutableByteArray dst#) dstOff@(I# dstOff#) (Ptr src#) count@(I# {-# INLINE copyFromPointer #-} -- | Copy to pointer. +-- +-- @since 2.0 copyToPointer :: Array -- ^ Source -> Int -- ^ Source offset @@ -293,6 +305,8 @@ equal src1 off1 src2 off2 count = compareInternal src1 off1 src2 off2 count == 0 {-# INLINE equal #-} -- | Compare portions of two arrays. No bounds checking is performed. +-- +-- @since 2.0 compare :: Array -> Int -> Array -> Int -> Int -> Ordering compare src1 off1 src2 off2 count = compareInternal src1 off1 src2 off2 count `Prelude.compare` 0 {-# INLINE compare #-} diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index 7cf38883..7fecd1f9 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -234,7 +234,7 @@ decodeUtf8With2 onErr bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do , bs <- B.drop (srcOff - len1) (B.take guessUtf8Boundary bs2) , isValidBS bs = do withBS bs $ \fp _ -> unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> - unsafeSTToIO $ A.copyP dst dstOff src (len1 + guessUtf8Boundary - srcOff) + unsafeSTToIO $ A.copyFromPointer dst dstOff src (len1 + guessUtf8Boundary - srcOff) inner (len1 + guessUtf8Boundary) (dstOff + (len1 + guessUtf8Boundary - srcOff)) | dstOff + 4 > dstLen = do diff --git a/src/Data/Text/Foreign.hs b/src/Data/Text/Foreign.hs index afe3234e..831277fe 100644 --- a/src/Data/Text/Foreign.hs +++ b/src/Data/Text/Foreign.hs @@ -59,6 +59,8 @@ import qualified Data.Text.Array as A -- the functions in the 'Data.Text.Encoding' module. -- | A type representing a number of UTF-8 code units. 
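+-- That is, an offset or length counted in 'Word8' units of the underlying
+-- UTF-8 buffer (the analogue of @I16@ from the UTF-16 representation).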
+-- +-- @since 2.0 newtype I8 = I8 Int deriving (Bounded, Enum, Eq, Integral, Num, Ord, Read, Real, Show) @@ -86,6 +88,8 @@ fromPtr ptr (I8 len) = unsafeSTToIO $ do -- If @n@ would cause the 'Text' to end inside a code point, the -- end of the prefix will be advanced by several additional 'Word8' units -- to maintain its validity. +-- +-- @since 2.0 takeWord8 :: I8 -> Text -> Text takeWord8 = (fst .) . splitAtWord8 @@ -95,6 +99,8 @@ takeWord8 = (fst .) . splitAtWord8 -- If @n@ would cause the 'Text' to begin inside a code point, the -- beginning of the suffix will be advanced by several additional 'Word8' -- unit to maintain its validity. +-- +-- @since 2.0 dropWord8 :: I8 -> Text -> Text dropWord8 = (snd .) . splitAtWord8 diff --git a/src/Data/Text/Internal/Encoding/Utf8.hs b/src/Data/Text/Internal/Encoding/Utf8.hs index 0c1a42aa..4dfeb9b4 100644 --- a/src/Data/Text/Internal/Encoding/Utf8.hs +++ b/src/Data/Text/Internal/Encoding/Utf8.hs @@ -68,6 +68,8 @@ between x y z = x >= y && x <= z -- | ord c < 0x800 = 2 -- | ord c < 0x10000 = 3 -- | otherwise = 4 + +-- | @since 2.0 utf8Length :: Char -> Int utf8Length (C# c) = I# ((1# +# geChar# c (chr# 0x80#)) +# (geChar# c (chr# 0x800#) +# geChar# c (chr# 0x10000#))) {-# INLINE utf8Length #-} @@ -82,6 +84,8 @@ utf8Length (C# c) = I# ((1# +# geChar# c (chr# 0x80#)) +# (geChar# c (chr# 0x800 -- c `xor` I# (c# <=# 0#) is a branchless equivalent of c `max` 1. -- It is crucial to write c# <=# 0# and not c# ==# 0#, otherwise -- GHC is tempted to "optimize" by introduction of branches. + +-- | @since 2.0 utf8LengthByLeader :: Word8 -> Int utf8LengthByLeader w = c `xor` I# (c# <=# 0#) where @@ -256,11 +260,13 @@ updateState (ByteClass c) (DecoderState s) = DecoderState (W8# el#) newtype CodePoint = CodePoint Int +-- | @since 2.0 data DecoderResult = Accept !Char | Incomplete !DecoderState !CodePoint | Reject +-- | @since 2.0 utf8DecodeStart :: Word8 -> DecoderResult utf8DecodeStart w | st == utf8AcceptState = Accept (chr (word8ToInt w)) @@ -271,6 +277,7 @@ utf8DecodeStart w st = updateState cl utf8AcceptState cp = word8ToInt $ (0xff `shiftR` word8ToInt cl') .&. w +-- | @since 2.0 utf8DecodeContinue :: Word8 -> DecoderState -> CodePoint -> DecoderResult utf8DecodeContinue w st (CodePoint cp) | st' == utf8AcceptState = Accept (chr cp') diff --git a/src/Data/Text/Internal/Private.hs b/src/Data/Text/Internal/Private.hs index 4de74b66..02e791ef 100644 --- a/src/Data/Text/Internal/Private.hs +++ b/src/Data/Text/Internal/Private.hs @@ -38,6 +38,8 @@ span_ p t@(Text arr off len) = (# hd,tl #) -- | For the sake of performance this function does not check -- that a char is in ASCII range; it is a responsibility of @p@. 
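+--
+-- A possible use (the function and names here are only illustrative) is
+-- dropping a run of leading ASCII spaces:
+--
+-- > dropSpaces t = case spanAscii_ (== 0x20) t of (# _, rest #) -> rest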
+-- +-- @since 2.0 spanAscii_ :: (Word8 -> Bool) -> Text -> (# Text, Text #) spanAscii_ p (Text arr off len) = (# hd, tl #) where hd = text arr off k diff --git a/src/Data/Text/Internal/Unsafe/Char.hs b/src/Data/Text/Internal/Unsafe/Char.hs index 3f3372c8..47acc62e 100644 --- a/src/Data/Text/Internal/Unsafe/Char.hs +++ b/src/Data/Text/Internal/Unsafe/Char.hs @@ -39,6 +39,7 @@ ord :: Char -> Int ord (C# c#) = I# (ord# c#) {-# INLINE ord #-} +-- | @since 2.0 unsafeChr16 :: Word16 -> Char unsafeChr16 (W16# w#) = C# (chr# (word2Int# (word16ToWord# w#))) {-# INLINE unsafeChr16 #-} diff --git a/src/Data/Text/Unsafe.hs b/src/Data/Text/Unsafe.hs index 4832c5d1..8f6cff86 100644 --- a/src/Data/Text/Unsafe.hs +++ b/src/Data/Text/Unsafe.hs @@ -79,6 +79,7 @@ iter :: iter (Text arr off _len) i = iterArray arr (off + i) {-# INLINE iter #-} +-- | @since 2.0 iterArray :: A.Array -> Int -> Iter iterArray arr j = Iter chr l where m0 = A.unsafeIndex arr j @@ -107,6 +108,7 @@ reverseIter :: Text -> Int -> Iter reverseIter (Text arr off _len) i = reverseIterArray arr (off + i) {-# INLINE reverseIter #-} +-- | @since 2.0 reverseIterArray :: A.Array -> Int -> Iter reverseIterArray arr j | m0 < 0x80 = Iter (unsafeChr8 m0) (-1) @@ -139,16 +141,22 @@ reverseIter_ (Text arr off _len) i -- | /O(1)/ Return the length of a 'Text' in units of 'Word8'. This -- is useful for sizing a target array appropriately before using -- 'unsafeCopyToPtr'. +-- +-- @since 2.0 lengthWord8 :: Text -> Int lengthWord8 (Text _arr _off len) = len {-# INLINE lengthWord8 #-} -- | /O(1)/ Unchecked take of 'k' 'Word8's from the front of a 'Text'. +-- +-- @since 2.0 takeWord8 :: Int -> Text -> Text takeWord8 k (Text arr off _len) = Text arr off k {-# INLINE takeWord8 #-} -- | /O(1)/ Unchecked drop of 'k' 'Word8's from the front of a 'Text'. +-- +-- @since 2.0 dropWord8 :: Int -> Text -> Text dropWord8 k (Text arr off len) = Text arr (off+k) (len-k) {-# INLINE dropWord8 #-} diff --git a/text.cabal b/text.cabal index ceb67819..95b3694d 100644 --- a/text.cabal +++ b/text.cabal @@ -1,6 +1,6 @@ cabal-version: 2.2 name: text -version: 1.2.5.0 +version: 2.0 homepage: https://github.com/haskell/text bug-reports: https://github.com/haskell/text/issues From 4e066acad7be6b1f220501ef533be9a010dbfa34 Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Tue, 7 Sep 2021 21:38:26 +0100 Subject: [PATCH 38/38] Tweak documentation --- src/Data/Text/Array.hs | 7 +++---- src/Data/Text/Internal/Encoding/Utf8.hs | 1 + 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Data/Text/Array.hs b/src/Data/Text/Array.hs index 8c304320..e80153f8 100644 --- a/src/Data/Text/Array.hs +++ b/src/Data/Text/Array.hs @@ -134,12 +134,11 @@ unsafeIndex (ByteArray arr) i@(I# i#) = case indexWord8Array# arr i# of r# -> (W8# r#) {-# INLINE unsafeIndex #-} --- | 'sizeofMutableByteArray#' is deprecated, because it is unsafe in the presence of --- 'shrinkMutableByteArray#' and 'resizeMutableByteArray#'. --- --- @since 2.0 +-- | @since 2.0 getSizeofMArray :: MArray s -> ST s Int getSizeofMArray (MutableByteArray marr) = ST $ \s0# -> + -- Cannot simply use (deprecated) 'sizeofMutableByteArray#', because it is + -- unsafe in the presence of 'shrinkMutableByteArray#' and 'resizeMutableByteArray#'. 
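+  -- By contrast, 'getSizeofMutableByteArray#' queries the current size inside 'ST',
+  -- so the result stays correct after 'shrinkM' or 'resizeM'.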
case getSizeofMutableByteArray# marr s0# of (# s1#, word8len# #) -> (# s1#, I# word8len# #) diff --git a/src/Data/Text/Internal/Encoding/Utf8.hs b/src/Data/Text/Internal/Encoding/Utf8.hs index 4dfeb9b4..1645086e 100644 --- a/src/Data/Text/Internal/Encoding/Utf8.hs +++ b/src/Data/Text/Internal/Encoding/Utf8.hs @@ -68,6 +68,7 @@ between x y z = x >= y && x <= z -- | ord c < 0x800 = 2 -- | ord c < 0x10000 = 3 -- | otherwise = 4 +-- Implementation suggested by Alex Mason. -- | @since 2.0 utf8Length :: Char -> Int
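+-- For instance, @map utf8Length ['a', '\xE4', '\x20AC', '\x1D11E']@ evaluates
+-- to @[1, 2, 3, 4]@.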