From b509efd122e094afc6d05532d12c97e7c1110bc8 Mon Sep 17 00:00:00 2001
From: "David M. Sledge" <sledged@gmail.com>
Date: Sun, 26 Jun 2022 23:40:29 -0600
Subject: [PATCH 01/87] Monad support for stream decoding

---
 src/Data/Text/Encoding.hs             | 87 ++++++++++++++++++---------
 src/Data/Text/Encoding/Error.hs       | 11 ++++
 tests/Tests/Properties/Transcoding.hs | 38 ++++++++++++
 text.cabal                            |  5 +-
 4 files changed, 111 insertions(+), 30 deletions(-)

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index 54f16722..78d4769a 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -44,6 +44,7 @@ module Data.Text.Encoding
     -- *** Stream oriented decoding
     -- $stream
     , streamDecodeUtf8With
+    , streamDecodeUtf8WithM
     , Decoding(..)
 
     -- ** Partial Functions
@@ -73,13 +74,19 @@ module Data.Text.Encoding
 import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
 
 import Control.Exception (evaluate, try)
-import Control.Monad.ST (runST, ST)
+import Control.Monad.Fix (fix)
+import Control.Monad.ST (runST)
+import Control.Monad.ST.Trans (runSTT)
+import Control.Monad.ST.Trans.Internal (liftST)
+import Control.Monad.Trans.Class (lift)
 import Data.Bits (shiftR, (.&.))
 import Data.ByteString (ByteString)
 import qualified Data.ByteString as B
 import qualified Data.ByteString.Internal as B
 import qualified Data.ByteString.Short.Internal as SBS
-import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode)
+import Data.Function ((&))
+import Data.Functor.Identity (Identity(..), runIdentity)
+import Data.Text.Encoding.Error (OnDecodeError, OnDecodeErrorM, UnicodeException, strictDecode, lenientDecode)
 import Data.Text.Internal (Text(..), safe, empty, append)
 import Data.Text.Internal.Unsafe (unsafeWithForeignPtr)
 import Data.Text.Internal.Unsafe.Char (unsafeWrite)
@@ -142,13 +149,16 @@ import Data.Text.Internal.Encoding.Utf8 (CodePoint(..))
 -- anything except ASCII and copies buffer or throws an error otherwise.
 --
 decodeASCII :: ByteString -> Text
-decodeASCII bs = withBS bs $ \fp len -> if len == 0 then empty else runST $ do
-  asciiPrefixLen <- fmap cSizeToInt $ unsafeIOToST $ unsafeWithForeignPtr fp $ \src ->
-    c_is_ascii src (src `plusPtr` len)
-  if asciiPrefixLen == len
-  then let !(SBS.SBS arr) = SBS.toShort bs in
-        return (Text (A.ByteArray arr) 0 len)
-  else error $ "decodeASCII: detected non-ASCII codepoint at " ++ show asciiPrefixLen
+decodeASCII bs = withBS bs $ \fp len ->
+  if len == 0
+    then empty
+    else runST $ do
+      asciiPrefixLen <- fmap cSizeToInt . unsafeIOToST . unsafeWithForeignPtr fp $ \src ->
+        c_is_ascii src (src `plusPtr` len)
+      if asciiPrefixLen == len
+      then let !(SBS.SBS arr) = SBS.toShort bs in
+            return (Text (A.ByteArray arr) 0 len)
+      else error $ "decodeASCII: detected non-ASCII codepoint at " ++ show asciiPrefixLen
 
 -- | Decode a 'ByteString' containing Latin-1 (aka ISO-8859-1) encoded text.
 --
@@ -231,7 +241,8 @@ decodeUtf8With onErr bs
     Nothing -> txt'
     Just c  -> T.singleton c `append` txt')
   where
-    (txt, undecoded) = decodeUtf8With2 onErr mempty bs
+    (txt, undecoded) = runIdentity . decodeUtf8With2 mempty bs $ \ desc' mWord8 ->
+      Identity $ onErr desc' mWord8
     txt' = decodeUtf8With onErr (B.tail undecoded)
     desc = "Data.Text.Internal.Encoding: Invalid UTF-8 stream"
 
@@ -240,9 +251,10 @@ decodeUtf8With2 ::
 #if defined(ASSERTS)
   HasCallStack =>
 #endif
-  OnDecodeError -> ByteString -> ByteString -> (Text, ByteString)
-decodeUtf8With2 onErr bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do
-  marr <- A.new len'
+  Monad m =>
+  ByteString -> ByteString -> OnDecodeErrorM m -> m (Text, ByteString)
+decodeUtf8With2 bs1@(B.length -> len1) bs2@(B.length -> len2) onErrM = runSTT $ do
+  marr <- liftST $ A.new len'
   outer marr len' 0 0
   where
     len = len1 + len2
@@ -274,11 +286,10 @@ decodeUtf8With2 onErr bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do
           | i < len = step (i + 1) (utf8DecodeContinue (index i) a b)
         step _ st = st
 
-    outer :: forall s. A.MArray s -> Int -> Int -> Int -> ST s (Text, ByteString)
     outer dst dstLen = inner
         where
           inner srcOff dstOff
-            | srcOff >= len = do
+            | srcOff >= len = liftST $ do
               A.shrinkM dst dstOff
               arr <- A.unsafeFreeze dst
               return (Text arr 0 dstOff, mempty)
@@ -288,25 +299,27 @@ decodeUtf8With2 onErr bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do
             , dstOff + (len1 + guessUtf8Boundary - srcOff) <= dstLen
             , bs <- B.drop (srcOff - len1) (B.take guessUtf8Boundary bs2)
             , isValidBS bs = do
-              withBS bs $ \fp _ -> unsafeIOToST $ unsafeWithForeignPtr fp $ \src ->
+              liftST . withBS bs $ \fp _ -> unsafeIOToST $ unsafeWithForeignPtr fp $ \src ->
                 unsafeSTToIO $ A.copyFromPointer dst dstOff src (len1 + guessUtf8Boundary - srcOff)
               inner (len1 + guessUtf8Boundary) (dstOff + (len1 + guessUtf8Boundary - srcOff))
 
             | dstOff + 4 > dstLen = do
               let dstLen' = dstLen + 4
-              dst' <- A.resizeM dst dstLen'
+              dst' <- liftST $ A.resizeM dst dstLen'
               outer dst' dstLen' srcOff dstOff
 
             | otherwise = case decodeFrom srcOff of
               Accept c -> do
-                d <- unsafeWrite dst dstOff c
+                d <- liftST $ unsafeWrite dst dstOff c
                 inner (srcOff + d) (dstOff + d)
-              Reject -> case onErr desc (Just (index srcOff)) of
-                Nothing -> inner (srcOff + 1) dstOff
-                Just c -> do
-                  d <- unsafeWrite dst dstOff (safe c)
-                  inner (srcOff + 1) (dstOff + d)
-              Incomplete{} -> do
+              Reject -> do
+                res <- lift $ onErrM desc (Just (index srcOff))
+                case res of
+                  Nothing -> inner (srcOff + 1) dstOff
+                  Just c -> do
+                    d <- liftST $ unsafeWrite dst dstOff (safe c)
+                    inner (srcOff + 1) (dstOff + d)
+              Incomplete{} -> liftST $ do
                 A.shrinkM dst dstOff
                 arr <- A.unsafeFreeze dst
                 let bs = if srcOff >= len1
@@ -369,6 +382,22 @@ decodeUtf8With2 onErr bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do
 -- If given invalid input, an exception will be thrown by the function
 -- or continuation where it is encountered.
 
+-- | Decode, in a monadic- and stream-oriented way using CPS, a lazy
+-- 'ByteString' containing UTF-8 encoded text.
+streamDecodeUtf8WithM ::
+#if defined(ASSERTS)
+  HasCallStack =>
+#endif
+  Monad m =>
+  ByteString
+  -> OnDecodeErrorM m
+  -> (Text -> ByteString -> (ByteString -> m b) -> m b)
+  -> m b
+streamDecodeUtf8WithM bstr onErrM f = fix (\ go bs1 bs2 -> do
+    (txt, undecoded) <- decodeUtf8With2 bs1 bs2 onErrM
+    f txt undecoded (go undecoded)
+  ) mempty bstr
+
 -- | A stream oriented decoding result.
 --
 -- @since 1.0.0.0
@@ -406,11 +435,11 @@ streamDecodeUtf8With ::
   HasCallStack =>
 #endif
   OnDecodeError -> ByteString -> Decoding
-streamDecodeUtf8With onErr = go mempty
-  where
-    go bs1 bs2 = Some txt undecoded (go undecoded)
-      where
-        (txt, undecoded) = decodeUtf8With2 onErr bs1 bs2
+streamDecodeUtf8With onErr = fix (\ go bs1 bs2 ->
+    runIdentity (decodeUtf8With2 bs1 bs2 (\ desc' mWord8 ->
+      Identity $ onErr desc' mWord8)) & (\ (txt, undecoded) ->
+        Some txt undecoded (go undecoded))
+  ) mempty
 
 -- | Decode a 'ByteString' containing UTF-8 encoded text that is known
 -- to be valid.
diff --git a/src/Data/Text/Encoding/Error.hs b/src/Data/Text/Encoding/Error.hs
index ea9e0997..99226cb1 100644
--- a/src/Data/Text/Encoding/Error.hs
+++ b/src/Data/Text/Encoding/Error.hs
@@ -25,6 +25,7 @@ module Data.Text.Encoding.Error
       UnicodeException(..)
     , OnError
     , OnDecodeError
+    , OnDecodeErrorM
     , OnEncodeError
     -- * Useful error handling functions
     , lenientDecode
@@ -59,9 +60,19 @@ import Numeric (showHex)
 -- report.
 type OnError a b = String -> Maybe a -> Maybe b
 
+-- | Similar to 'OnError' but in a monadic context which allows
+-- additional capabilities. Depending on the monad(s) used, these
+-- capabilities include logging decode errors (IO and/or logger monads)
+-- and abort processessing without the need to use 'error' or 'throw'
+-- (continuation monad).
+type OnErrorM a m b = String -> Maybe a -> m (Maybe b)
+
 -- | A handler for a decoding error.
 type OnDecodeError = OnError Word8 Char
 
+-- | A monadic handler for a decoding error.
+type OnDecodeErrorM m = OnErrorM Word8 m Char
+
 -- | A handler for an encoding error.
 {-# DEPRECATED OnEncodeError "This exception is never used in practice, and will be removed." #-}
 type OnEncodeError = OnError Char Word8
diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs
index 0fabf62e..2d98af16 100644
--- a/tests/Tests/Properties/Transcoding.hs
+++ b/tests/Tests/Properties/Transcoding.hs
@@ -6,6 +6,8 @@ module Tests.Properties.Transcoding
     ( testTranscoding
     ) where
 
+import Control.Concurrent.MVar (newEmptyMVar, tryPutMVar, tryTakeMVar)
+import Control.Monad.Cont (cont, runCont)
 import Data.Bits ((.&.), shiftR)
 import Data.Char (chr, ord)
 import Test.QuickCheck hiding ((.&.))
@@ -21,6 +23,8 @@ import qualified Data.ByteString.Builder.Prim as BP
 import qualified Data.ByteString.Char8 as BC
 import qualified Data.ByteString.Lazy as BL
 import qualified Data.ByteString.Lazy.Char8 as BLC
+import Data.Functor.Identity (Identity(..))
+import Data.Maybe (isJust)
 import qualified Data.Text as T
 import qualified Data.Text.Encoding as E
 import qualified Data.Text.Encoding.Error as E
@@ -199,11 +203,40 @@ t_decode_with_error3' =
 t_decode_with_error4' =
   case E.streamDecodeUtf8With (\_ _ -> Just 'x') (B.pack [0xC2, 97, 97, 97]) of
     E.Some x _ _ -> x === "xaaa"
+t_decode_with_error2'' =
+  runIdentity $ E.streamDecodeUtf8WithM (B.pack [0xC2, 97]) (\ _ _ -> pure $ Just 'x')
+    (\ x _ _ -> pure $ x === "xa")
+t_decode_with_error3'' =
+  runIdentity $ E.streamDecodeUtf8WithM (B.pack [0xC2, 97, 97]) (\ _ _ -> pure $ Just 'x')
+    (\ x _ _ -> pure $ x === "xaa")
+t_decode_with_error4'' =
+  runIdentity $ E.streamDecodeUtf8WithM (B.pack [0xC2, 97, 97, 97]) (\ _ _ -> pure $ Just 'x')
+    (\ x _ _ -> pure $ x === "xaaa")
 t_decode_with_error5' = ioProperty $ do
   ret <- Exception.try $ Exception.evaluate $ E.streamDecodeUtf8 (B.pack [0x81])
   pure $ case ret of
     Left (_ :: E.UnicodeException) -> True
     Right{} -> False
+-- log error test
+t_decode_with_error5'' =
+  ioProperty $ do
+    mVar <- newEmptyMVar
+    E.streamDecodeUtf8WithM (B.pack [0xC2, 97, 97, 97])
+      (\desc _ -> do
+        res <- tryPutMVar mVar desc
+        pure . Just $ if res then 'x' else 'y')
+      (\ x _ _ -> do
+        mDesc <- tryTakeMVar mVar
+        pure $ x == "xaaa" && isJust mDesc)
+-- test case demonstrating how to stop processing without 'throw' or 'error'
+t_decode_with_error6'' =
+  runCont (do
+    E.streamDecodeUtf8WithM (B.pack [0xC2, 97, 97, 97])
+      -- early exit
+      (\ _ _ -> cont $ \ _ -> True)
+      -- this should not be executed
+      (\ _ _ _ -> pure False)
+    ) id
 
 t_infix_concat bs1 text bs2 =
   forAll (Blind <$> genDecodeErr Replace) $ \(Blind onErr) ->
@@ -250,7 +283,12 @@ testTranscoding =
       testProperty "t_decode_with_error2'" t_decode_with_error2',
       testProperty "t_decode_with_error3'" t_decode_with_error3',
       testProperty "t_decode_with_error4'" t_decode_with_error4',
+      testProperty "t_decode_with_error2''" t_decode_with_error2'',
+      testProperty "t_decode_with_error3''" t_decode_with_error3'',
+      testProperty "t_decode_with_error4''" t_decode_with_error4'',
       testProperty "t_decode_with_error5'" t_decode_with_error5',
+      testProperty "t_decode_with_error5''" t_decode_with_error5'',
+      testProperty "t_decode_with_error6''" t_decode_with_error6'',
       testProperty "t_infix_concat" t_infix_concat
     ]
   ]
diff --git a/text.cabal b/text.cabal
index 488f1451..388512d7 100644
--- a/text.cabal
+++ b/text.cabal
@@ -181,13 +181,15 @@ library
     Data.Text.Show
 
   build-depends:
+    STMonadTrans     >= 0.4.6 && < 0.4.7,
     array            >= 0.3 && < 0.6,
     base             >= 4.9 && < 5,
     binary           >= 0.5 && < 0.9,
     bytestring       >= 0.10.4 && < 0.12,
     deepseq          >= 1.1 && < 1.5,
     ghc-prim         >= 0.2 && < 0.10,
-    template-haskell >= 2.5 && < 2.20
+    template-haskell >= 2.5 && < 2.20,
+    transformers
 
   ghc-options: -Wall -fwarn-tabs -funbox-strict-fields -O2
   if flag(developer)
@@ -254,6 +256,7 @@ test-suite tests
     deepseq,
     directory,
     ghc-prim,
+    mtl,
     tasty,
     tasty-hunit,
     tasty-quickcheck,

From 54a1ad906885d35f76e8afcfc733c12e4005649d Mon Sep 17 00:00:00 2001
From: "David M. Sledge" <sledged@gmail.com>
Date: Mon, 27 Jun 2022 19:31:00 -0600
Subject: [PATCH 02/87] Refactored remove STMonadTrans dependency

---
 src/Data/Text/Encoding.hs | 156 +++++++++++++++++++++++++++++---------
 text.cabal                |   4 +-
 2 files changed, 122 insertions(+), 38 deletions(-)

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index 78d4769a..be2fc3af 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -1,5 +1,5 @@
 {-# LANGUAGE BangPatterns, CPP, GeneralizedNewtypeDeriving, MagicHash,
-    UnliftedFFITypes #-}
+    UnliftedFFITypes, UnboxedTuples #-}
 {-# LANGUAGE Trustworthy #-}
 {-# LANGUAGE TypeApplications #-}
 {-# LANGUAGE ScopedTypeVariables #-}
@@ -76,9 +76,6 @@ import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
 import Control.Exception (evaluate, try)
 import Control.Monad.Fix (fix)
 import Control.Monad.ST (runST)
-import Control.Monad.ST.Trans (runSTT)
-import Control.Monad.ST.Trans.Internal (liftST)
-import Control.Monad.Trans.Class (lift)
 import Data.Bits (shiftR, (.&.))
 import Data.ByteString (ByteString)
 import qualified Data.ByteString as B
@@ -96,7 +93,7 @@ import Data.Word (Word8)
 import Foreign.C.Types (CSize(..))
 import Foreign.Ptr (Ptr, minusPtr, plusPtr)
 import Foreign.Storable (poke, peekByteOff)
-import GHC.Exts (byteArrayContents#, unsafeCoerce#)
+import GHC.Exts (byteArrayContents#, runRW#, unsafeCoerce#)
 import GHC.ForeignPtr (ForeignPtr(..), ForeignPtrContents(PlainPtr))
 import qualified Data.ByteString.Builder as B
 import qualified Data.ByteString.Builder.Internal as B hiding (empty, append)
@@ -117,6 +114,7 @@ import Foreign.C.Types (CInt(..))
 import qualified Data.ByteString.Unsafe as B
 import Data.Text.Internal.Encoding.Utf8 (CodePoint(..))
 #endif
+import GHC.ST (ST(..), STRep)
 
 -- $strict
 --
@@ -246,17 +244,105 @@ decodeUtf8With onErr bs
     txt' = decodeUtf8With onErr (B.tail undecoded)
     desc = "Data.Text.Internal.Encoding: Invalid UTF-8 stream"
 
--- | Decode two consecutive bytestrings, returning Text and undecoded remainder.
+-- -- | Decode two consecutive bytestrings, returning Text and undecoded remainder.
+-- decodeUtf8With2 ::
+-- #if defined(ASSERTS)
+--   HasCallStack =>
+-- #endif
+--   Monad m =>
+--   ByteString -> ByteString -> OnDecodeErrorM m -> m (Text, ByteString)
+-- decodeUtf8With2 bs1@(B.length -> len1) bs2@(B.length -> len2) onErrM = runSTT $ do
+--   marr <- liftST $ A.new len'
+--   outer marr len' 0 0
+--   where
+--     len = len1 + len2
+--     len' = len + 4
+
+--     index i
+--       | i < len1  = B.index bs1 i
+--       | otherwise = B.index bs2 (i - len1)
+
+--     -- We need Data.ByteString.findIndexEnd, but it is unavailable before bytestring-0.10.12.0
+--     guessUtf8Boundary :: Int
+--     guessUtf8Boundary
+--       | len2 >= 1 && w0 <  0x80 = len2     -- last char is ASCII
+--       | len2 >= 1 && w0 >= 0xC0 = len2 - 1 -- last char starts a code point
+--       | len2 >= 2 && w1 >= 0xC0 = len2 - 2 -- pre-last char starts a code point
+--       | len2 >= 3 && w2 >= 0xC0 = len2 - 3
+--       | len2 >= 4 && w3 >= 0xC0 = len2 - 4
+--       | otherwise = 0
+--       where
+--         w0 = B.index bs2 (len2 - 1)
+--         w1 = B.index bs2 (len2 - 2)
+--         w2 = B.index bs2 (len2 - 3)
+--         w3 = B.index bs2 (len2 - 4)
+
+--     decodeFrom :: Int -> DecoderResult
+--     decodeFrom off = step (off + 1) (utf8DecodeStart (index off))
+--       where
+--         step i (Incomplete a b)
+--           | i < len = step (i + 1) (utf8DecodeContinue (index i) a b)
+--         step _ st = st
+
+--     outer dst dstLen = inner
+--         where
+--           inner srcOff dstOff
+--             | srcOff >= len = liftST $ do
+--               A.shrinkM dst dstOff
+--               arr <- A.unsafeFreeze dst
+--               return (Text arr 0 dstOff, mempty)
+
+--             | srcOff >= len1
+--             , srcOff < len1 + guessUtf8Boundary
+--             , dstOff + (len1 + guessUtf8Boundary - srcOff) <= dstLen
+--             , bs <- B.drop (srcOff - len1) (B.take guessUtf8Boundary bs2)
+--             , isValidBS bs = do
+--               liftST . withBS bs $ \fp _ -> unsafeIOToST $ unsafeWithForeignPtr fp $ \src ->
+--                 unsafeSTToIO $ A.copyFromPointer dst dstOff src (len1 + guessUtf8Boundary - srcOff)
+--               inner (len1 + guessUtf8Boundary) (dstOff + (len1 + guessUtf8Boundary - srcOff))
+
+--             | dstOff + 4 > dstLen = do
+--               let dstLen' = dstLen + 4
+--               dst' <- liftST $ A.resizeM dst dstLen'
+--               outer dst' dstLen' srcOff dstOff
+
+--             | otherwise = case decodeFrom srcOff of
+--               Accept c -> do
+--                 d <- liftST $ unsafeWrite dst dstOff c
+--                 inner (srcOff + d) (dstOff + d)
+--               Reject -> do
+--                 res <- lift $ onErrM desc (Just (index srcOff))
+--                 case res of
+--                   Nothing -> inner (srcOff + 1) dstOff
+--                   Just c -> do
+--                     d <- liftST $ unsafeWrite dst dstOff (safe c)
+--                     inner (srcOff + 1) (dstOff + d)
+--               Incomplete{} -> liftST $ do
+--                 A.shrinkM dst dstOff
+--                 arr <- A.unsafeFreeze dst
+--                 let bs = if srcOff >= len1
+--                       then B.drop (srcOff - len1) bs2
+--                       else B.drop srcOff (bs1 `B.append` bs2)
+--                 return (Text arr 0 dstOff, bs)
+
+--     desc = "Data.Text.Internal.Encoding: Invalid UTF-8 stream"
+
+unST :: ST s a -> STRep s a
+unST (ST st) = st
+
 decodeUtf8With2 ::
 #if defined(ASSERTS)
   HasCallStack =>
 #endif
   Monad m =>
-  ByteString -> ByteString -> OnDecodeErrorM m -> m (Text, ByteString)
-decodeUtf8With2 bs1@(B.length -> len1) bs2@(B.length -> len2) onErrM = runSTT $ do
-  marr <- liftST $ A.new len'
-  outer marr len' 0 0
+  ByteString
+  -> ByteString
+  -> OnDecodeErrorM m
+  -> m (Text, ByteString)
+decodeUtf8With2 bs1@(B.length -> len1) bs2@(B.length -> len2) onErrM = do
+  outer marr len' 0 0 s#
   where
+    !(# s#, marr #) = runRW# (unST $ A.new len')
     len = len1 + len2
     len' = len + 4
 
@@ -288,44 +374,44 @@ decodeUtf8With2 bs1@(B.length -> len1) bs2@(B.length -> len2) onErrM = runSTT $
 
     outer dst dstLen = inner
         where
-          inner srcOff dstOff
-            | srcOff >= len = liftST $ do
-              A.shrinkM dst dstOff
-              arr <- A.unsafeFreeze dst
-              return (Text arr 0 dstOff, mempty)
+          inner srcOff dstOff s'#
+            | srcOff >= len =
+              let !(# s''#, _ #) = (unST $ A.shrinkM dst dstOff) s'#
+                  !(# _, arr #) = (unST $ A.unsafeFreeze dst) s''#
+              in pure (Text arr 0 dstOff, mempty)
 
             | srcOff >= len1
             , srcOff < len1 + guessUtf8Boundary
             , dstOff + (len1 + guessUtf8Boundary - srcOff) <= dstLen
             , bs <- B.drop (srcOff - len1) (B.take guessUtf8Boundary bs2)
-            , isValidBS bs = do
-              liftST . withBS bs $ \fp _ -> unsafeIOToST $ unsafeWithForeignPtr fp $ \src ->
-                unsafeSTToIO $ A.copyFromPointer dst dstOff src (len1 + guessUtf8Boundary - srcOff)
-              inner (len1 + guessUtf8Boundary) (dstOff + (len1 + guessUtf8Boundary - srcOff))
+            , isValidBS bs =
+              let !(# s''#, _ #) = unST (withBS bs $ \fp _ -> unsafeIOToST $ unsafeWithForeignPtr fp $ \src ->
+                    unsafeSTToIO $ A.copyFromPointer dst dstOff src (len1 + guessUtf8Boundary - srcOff)) s'#
+              in inner (len1 + guessUtf8Boundary) (dstOff + (len1 + guessUtf8Boundary - srcOff)) s''#
 
-            | dstOff + 4 > dstLen = do
+            | dstOff + 4 > dstLen =
               let dstLen' = dstLen + 4
-              dst' <- liftST $ A.resizeM dst dstLen'
-              outer dst' dstLen' srcOff dstOff
+                  !(# s''#, dst' #) = unST (A.resizeM dst dstLen') s'#
+              in outer dst' dstLen' srcOff dstOff s''#
 
             | otherwise = case decodeFrom srcOff of
-              Accept c -> do
-                d <- liftST $ unsafeWrite dst dstOff c
-                inner (srcOff + d) (dstOff + d)
+              Accept c ->
+                let !(# s''#, d #) = unST (unsafeWrite dst dstOff c) s'#
+                in inner (srcOff + d) (dstOff + d) s''#
               Reject -> do
-                res <- lift $ onErrM desc (Just (index srcOff))
+                res <- onErrM desc (Just (index srcOff))
                 case res of
-                  Nothing -> inner (srcOff + 1) dstOff
-                  Just c -> do
-                    d <- liftST $ unsafeWrite dst dstOff (safe c)
-                    inner (srcOff + 1) (dstOff + d)
-              Incomplete{} -> liftST $ do
-                A.shrinkM dst dstOff
-                arr <- A.unsafeFreeze dst
-                let bs = if srcOff >= len1
+                  Nothing -> inner (srcOff + 1) dstOff s'#
+                  Just c ->
+                    let !(# s''#, d #) = unST (unsafeWrite dst dstOff (safe c)) s'# in
+                    inner (srcOff + 1) (dstOff + d) s''#
+              Incomplete{} ->
+                let !(# s''#, _ #) = unST (A.shrinkM dst dstOff) s'#
+                    !(# _, arr #) = unST (A.unsafeFreeze dst) s''#
+                    bs = if srcOff >= len1
                       then B.drop (srcOff - len1) bs2
                       else B.drop srcOff (bs1 `B.append` bs2)
-                return (Text arr 0 dstOff, bs)
+                in pure (Text arr 0 dstOff, bs)
 
     desc = "Data.Text.Internal.Encoding: Invalid UTF-8 stream"
 
diff --git a/text.cabal b/text.cabal
index 388512d7..5231dbac 100644
--- a/text.cabal
+++ b/text.cabal
@@ -181,15 +181,13 @@ library
     Data.Text.Show
 
   build-depends:
-    STMonadTrans     >= 0.4.6 && < 0.4.7,
     array            >= 0.3 && < 0.6,
     base             >= 4.9 && < 5,
     binary           >= 0.5 && < 0.9,
     bytestring       >= 0.10.4 && < 0.12,
     deepseq          >= 1.1 && < 1.5,
     ghc-prim         >= 0.2 && < 0.10,
-    template-haskell >= 2.5 && < 2.20,
-    transformers
+    template-haskell >= 2.5 && < 2.20
 
   ghc-options: -Wall -fwarn-tabs -funbox-strict-fields -O2
   if flag(developer)

From fc7b6651001d3ba8a4f064a90ccb01fbc277a6e7 Mon Sep 17 00:00:00 2001
From: "David M. Sledge" <sledged@gmail.com>
Date: Mon, 27 Jun 2022 19:33:44 -0600
Subject: [PATCH 03/87] Remove unused commented code

---
 src/Data/Text/Encoding.hs | 83 ---------------------------------------
 1 file changed, 83 deletions(-)

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index be2fc3af..16f50d0b 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -244,89 +244,6 @@ decodeUtf8With onErr bs
     txt' = decodeUtf8With onErr (B.tail undecoded)
     desc = "Data.Text.Internal.Encoding: Invalid UTF-8 stream"
 
--- -- | Decode two consecutive bytestrings, returning Text and undecoded remainder.
--- decodeUtf8With2 ::
--- #if defined(ASSERTS)
---   HasCallStack =>
--- #endif
---   Monad m =>
---   ByteString -> ByteString -> OnDecodeErrorM m -> m (Text, ByteString)
--- decodeUtf8With2 bs1@(B.length -> len1) bs2@(B.length -> len2) onErrM = runSTT $ do
---   marr <- liftST $ A.new len'
---   outer marr len' 0 0
---   where
---     len = len1 + len2
---     len' = len + 4
-
---     index i
---       | i < len1  = B.index bs1 i
---       | otherwise = B.index bs2 (i - len1)
-
---     -- We need Data.ByteString.findIndexEnd, but it is unavailable before bytestring-0.10.12.0
---     guessUtf8Boundary :: Int
---     guessUtf8Boundary
---       | len2 >= 1 && w0 <  0x80 = len2     -- last char is ASCII
---       | len2 >= 1 && w0 >= 0xC0 = len2 - 1 -- last char starts a code point
---       | len2 >= 2 && w1 >= 0xC0 = len2 - 2 -- pre-last char starts a code point
---       | len2 >= 3 && w2 >= 0xC0 = len2 - 3
---       | len2 >= 4 && w3 >= 0xC0 = len2 - 4
---       | otherwise = 0
---       where
---         w0 = B.index bs2 (len2 - 1)
---         w1 = B.index bs2 (len2 - 2)
---         w2 = B.index bs2 (len2 - 3)
---         w3 = B.index bs2 (len2 - 4)
-
---     decodeFrom :: Int -> DecoderResult
---     decodeFrom off = step (off + 1) (utf8DecodeStart (index off))
---       where
---         step i (Incomplete a b)
---           | i < len = step (i + 1) (utf8DecodeContinue (index i) a b)
---         step _ st = st
-
---     outer dst dstLen = inner
---         where
---           inner srcOff dstOff
---             | srcOff >= len = liftST $ do
---               A.shrinkM dst dstOff
---               arr <- A.unsafeFreeze dst
---               return (Text arr 0 dstOff, mempty)
-
---             | srcOff >= len1
---             , srcOff < len1 + guessUtf8Boundary
---             , dstOff + (len1 + guessUtf8Boundary - srcOff) <= dstLen
---             , bs <- B.drop (srcOff - len1) (B.take guessUtf8Boundary bs2)
---             , isValidBS bs = do
---               liftST . withBS bs $ \fp _ -> unsafeIOToST $ unsafeWithForeignPtr fp $ \src ->
---                 unsafeSTToIO $ A.copyFromPointer dst dstOff src (len1 + guessUtf8Boundary - srcOff)
---               inner (len1 + guessUtf8Boundary) (dstOff + (len1 + guessUtf8Boundary - srcOff))
-
---             | dstOff + 4 > dstLen = do
---               let dstLen' = dstLen + 4
---               dst' <- liftST $ A.resizeM dst dstLen'
---               outer dst' dstLen' srcOff dstOff
-
---             | otherwise = case decodeFrom srcOff of
---               Accept c -> do
---                 d <- liftST $ unsafeWrite dst dstOff c
---                 inner (srcOff + d) (dstOff + d)
---               Reject -> do
---                 res <- lift $ onErrM desc (Just (index srcOff))
---                 case res of
---                   Nothing -> inner (srcOff + 1) dstOff
---                   Just c -> do
---                     d <- liftST $ unsafeWrite dst dstOff (safe c)
---                     inner (srcOff + 1) (dstOff + d)
---               Incomplete{} -> liftST $ do
---                 A.shrinkM dst dstOff
---                 arr <- A.unsafeFreeze dst
---                 let bs = if srcOff >= len1
---                       then B.drop (srcOff - len1) bs2
---                       else B.drop srcOff (bs1 `B.append` bs2)
---                 return (Text arr 0 dstOff, bs)
-
---     desc = "Data.Text.Internal.Encoding: Invalid UTF-8 stream"
-
 unST :: ST s a -> STRep s a
 unST (ST st) = st
 

From 4dbf4933880c8a084244cfa808f73862eb8c19e5 Mon Sep 17 00:00:00 2001
From: "David M. Sledge" <sledged@gmail.com>
Date: Mon, 27 Jun 2022 20:59:04 -0600
Subject: [PATCH 04/87] change source module for runRW#

---
 src/Data/Text/Encoding.hs | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index 16f50d0b..1faaaf61 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -93,7 +93,8 @@ import Data.Word (Word8)
 import Foreign.C.Types (CSize(..))
 import Foreign.Ptr (Ptr, minusPtr, plusPtr)
 import Foreign.Storable (poke, peekByteOff)
-import GHC.Exts (byteArrayContents#, runRW#, unsafeCoerce#)
+import GHC.Exts (byteArrayContents#, unsafeCoerce#)
+import GHC.Magic (runRW#)
 import GHC.ForeignPtr (ForeignPtr(..), ForeignPtrContents(PlainPtr))
 import qualified Data.ByteString.Builder as B
 import qualified Data.ByteString.Builder.Internal as B hiding (empty, append)
@@ -331,6 +332,7 @@ decodeUtf8With2 bs1@(B.length -> len1) bs2@(B.length -> len2) onErrM = do
                 in pure (Text arr 0 dstOff, bs)
 
     desc = "Data.Text.Internal.Encoding: Invalid UTF-8 stream"
+{-# NOINLINE decodeUtf8With2 #-}
 
 -- $stream
 --

From c052ad64d936161116288a993f1b97c7a0c2c129 Mon Sep 17 00:00:00 2001
From: "David M. Sledge" <sledged@gmail.com>
Date: Thu, 30 Jun 2022 23:11:47 -0600
Subject: [PATCH 05/87] Added byte position (Int) argument to DecodeErrorM
 type.

---
 src/Data/Text/Encoding.hs             | 215 +++++++++++++-------------
 src/Data/Text/Encoding/Error.hs       |  13 +-
 src/Data/Text/Lazy/Encoding.hs        |  46 ++++--
 tests/Tests/Properties/Transcoding.hs |  85 +++++++---
 text.cabal                            |   4 +-
 5 files changed, 205 insertions(+), 158 deletions(-)

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index 1faaaf61..1daec591 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -36,6 +36,7 @@ module Data.Text.Encoding
 
     -- *** Controllable error handling
     , decodeUtf8With
+    , decodeUtf8WithM
     , decodeUtf16LEWith
     , decodeUtf16BEWith
     , decodeUtf32LEWith
@@ -43,6 +44,7 @@ module Data.Text.Encoding
 
     -- *** Stream oriented decoding
     -- $stream
+    , streamDecodeUtf8
     , streamDecodeUtf8With
     , streamDecodeUtf8WithM
     , Decoding(..)
@@ -56,9 +58,6 @@ module Data.Text.Encoding
     , decodeUtf32LE
     , decodeUtf32BE
 
-    -- *** Stream oriented decoding
-    , streamDecodeUtf8
-
     -- * Encoding Text to ByteStrings
     , encodeUtf8
     , encodeUtf16LE
@@ -81,7 +80,6 @@ import Data.ByteString (ByteString)
 import qualified Data.ByteString as B
 import qualified Data.ByteString.Internal as B
 import qualified Data.ByteString.Short.Internal as SBS
-import Data.Function ((&))
 import Data.Functor.Identity (Identity(..), runIdentity)
 import Data.Text.Encoding.Error (OnDecodeError, OnDecodeErrorM, UnicodeException, strictDecode, lenientDecode)
 import Data.Text.Internal (Text(..), safe, empty, append)
@@ -231,108 +229,31 @@ decodeUtf8With ::
   HasCallStack =>
 #endif
   OnDecodeError -> ByteString -> Text
-decodeUtf8With onErr bs
-  | isValidBS bs =
-    let !(SBS.SBS arr) = SBS.toShort bs in
-      (Text (A.ByteArray arr) 0 (B.length bs))
-  | B.null undecoded = txt
-  | otherwise = txt `append` (case onErr desc (Just (B.head undecoded)) of
-    Nothing -> txt'
-    Just c  -> T.singleton c `append` txt')
-  where
-    (txt, undecoded) = runIdentity . decodeUtf8With2 mempty bs $ \ desc' mWord8 ->
-      Identity $ onErr desc' mWord8
-    txt' = decodeUtf8With onErr (B.tail undecoded)
-    desc = "Data.Text.Internal.Encoding: Invalid UTF-8 stream"
+decodeUtf8With onErr bs =
+  runIdentity $ decodeUtf8WithM bs (\ desc' _ mWord8 -> Identity $ onErr desc' mWord8)
 
-unST :: ST s a -> STRep s a
-unST (ST st) = st
-
-decodeUtf8With2 ::
+decodeUtf8WithM ::
 #if defined(ASSERTS)
   HasCallStack =>
 #endif
   Monad m =>
-  ByteString
-  -> ByteString
-  -> OnDecodeErrorM m
-  -> m (Text, ByteString)
-decodeUtf8With2 bs1@(B.length -> len1) bs2@(B.length -> len2) onErrM = do
-  outer marr len' 0 0 s#
+  ByteString -> OnDecodeErrorM m -> m Text
+decodeUtf8WithM bs onErrM =
+  streamDecodeUtf8WithM bs onErrM (\ t bs' bytePos _ ->
+    fix (\ f bp bs'' t' ->
+      case B.uncons bs'' of
+        Just (word8, bs''') -> do
+          mC <- onErrM desc bp $ Just word8
+          f (bp + 1) bs''' $ case mC of
+            Just c -> t' `append` T.singleton c
+            _ -> t'
+        _ -> pure t') bytePos bs' t
+  )
   where
-    !(# s#, marr #) = runRW# (unST $ A.new len')
-    len = len1 + len2
-    len' = len + 4
-
-    index i
-      | i < len1  = B.index bs1 i
-      | otherwise = B.index bs2 (i - len1)
-
-    -- We need Data.ByteString.findIndexEnd, but it is unavailable before bytestring-0.10.12.0
-    guessUtf8Boundary :: Int
-    guessUtf8Boundary
-      | len2 >= 1 && w0 <  0x80 = len2     -- last char is ASCII
-      | len2 >= 1 && w0 >= 0xC0 = len2 - 1 -- last char starts a code point
-      | len2 >= 2 && w1 >= 0xC0 = len2 - 2 -- pre-last char starts a code point
-      | len2 >= 3 && w2 >= 0xC0 = len2 - 3
-      | len2 >= 4 && w3 >= 0xC0 = len2 - 4
-      | otherwise = 0
-      where
-        w0 = B.index bs2 (len2 - 1)
-        w1 = B.index bs2 (len2 - 2)
-        w2 = B.index bs2 (len2 - 3)
-        w3 = B.index bs2 (len2 - 4)
-
-    decodeFrom :: Int -> DecoderResult
-    decodeFrom off = step (off + 1) (utf8DecodeStart (index off))
-      where
-        step i (Incomplete a b)
-          | i < len = step (i + 1) (utf8DecodeContinue (index i) a b)
-        step _ st = st
-
-    outer dst dstLen = inner
-        where
-          inner srcOff dstOff s'#
-            | srcOff >= len =
-              let !(# s''#, _ #) = (unST $ A.shrinkM dst dstOff) s'#
-                  !(# _, arr #) = (unST $ A.unsafeFreeze dst) s''#
-              in pure (Text arr 0 dstOff, mempty)
-
-            | srcOff >= len1
-            , srcOff < len1 + guessUtf8Boundary
-            , dstOff + (len1 + guessUtf8Boundary - srcOff) <= dstLen
-            , bs <- B.drop (srcOff - len1) (B.take guessUtf8Boundary bs2)
-            , isValidBS bs =
-              let !(# s''#, _ #) = unST (withBS bs $ \fp _ -> unsafeIOToST $ unsafeWithForeignPtr fp $ \src ->
-                    unsafeSTToIO $ A.copyFromPointer dst dstOff src (len1 + guessUtf8Boundary - srcOff)) s'#
-              in inner (len1 + guessUtf8Boundary) (dstOff + (len1 + guessUtf8Boundary - srcOff)) s''#
-
-            | dstOff + 4 > dstLen =
-              let dstLen' = dstLen + 4
-                  !(# s''#, dst' #) = unST (A.resizeM dst dstLen') s'#
-              in outer dst' dstLen' srcOff dstOff s''#
-
-            | otherwise = case decodeFrom srcOff of
-              Accept c ->
-                let !(# s''#, d #) = unST (unsafeWrite dst dstOff c) s'#
-                in inner (srcOff + d) (dstOff + d) s''#
-              Reject -> do
-                res <- onErrM desc (Just (index srcOff))
-                case res of
-                  Nothing -> inner (srcOff + 1) dstOff s'#
-                  Just c ->
-                    let !(# s''#, d #) = unST (unsafeWrite dst dstOff (safe c)) s'# in
-                    inner (srcOff + 1) (dstOff + d) s''#
-              Incomplete{} ->
-                let !(# s''#, _ #) = unST (A.shrinkM dst dstOff) s'#
-                    !(# _, arr #) = unST (A.unsafeFreeze dst) s''#
-                    bs = if srcOff >= len1
-                      then B.drop (srcOff - len1) bs2
-                      else B.drop srcOff (bs1 `B.append` bs2)
-                in pure (Text arr 0 dstOff, bs)
-
     desc = "Data.Text.Internal.Encoding: Invalid UTF-8 stream"
-{-# NOINLINE decodeUtf8With2 #-}
+
+unST :: ST s a -> STRep s a
+unST (ST st) = st
 
 -- $stream
 --
@@ -396,12 +317,90 @@ streamDecodeUtf8WithM ::
   Monad m =>
   ByteString
   -> OnDecodeErrorM m
-  -> (Text -> ByteString -> (ByteString -> m b) -> m b)
+  -> (Text -> ByteString -> Int -> (ByteString -> m b) -> m b)
   -> m b
-streamDecodeUtf8WithM bstr onErrM f = fix (\ go bs1 bs2 -> do
-    (txt, undecoded) <- decodeUtf8With2 bs1 bs2 onErrM
-    f txt undecoded (go undecoded)
-  ) mempty bstr
+streamDecodeUtf8WithM bstr onErrM f = fix (\ go bp bs1 bs2 -> do
+    (txt, (undecoded, bytePos)) <- decodeUtf8With2 bs1 bs2 (\ desc bp'' mWord8 -> onErrM desc (bp'' + bp) mWord8)
+    let bp' = bytePos + bp
+    f txt undecoded bp' $ go bp' undecoded
+  ) 0 mempty bstr
+  where
+    decodeUtf8With2 bs1@(B.length -> len1) bs2@(B.length -> len2) onErrM' = do
+      case runRW# $ unST (A.new len') of
+        (# s#, marr #) -> outer marr len' 0 0 s#
+      where
+        len = len1 + len2
+        len' = len + 4
+
+        index i
+          | i < len1  = B.index bs1 i
+          | otherwise = B.index bs2 $ i - len1
+
+        -- We need Data.ByteString.findIndexEnd, but it is unavailable before bytestring-0.10.12.0
+        guessUtf8Boundary :: Int
+        guessUtf8Boundary
+          | len2 >= 1 && w0 <  0x80 = len2     -- last char is ASCII
+          | len2 >= 1 && w0 >= 0xC0 = len2 - 1 -- last char starts a code point
+          | len2 >= 2 && w1 >= 0xC0 = len2 - 2 -- pre-last char starts a code point
+          | len2 >= 3 && w2 >= 0xC0 = len2 - 3
+          | len2 >= 4 && w3 >= 0xC0 = len2 - 4
+          | otherwise = 0
+          where
+            w0 = B.index bs2 $ len2 - 1
+            w1 = B.index bs2 $ len2 - 2
+            w2 = B.index bs2 $ len2 - 3
+            w3 = B.index bs2 $ len2 - 4
+
+        decodeFrom :: Int -> DecoderResult
+        decodeFrom off = step (off + 1) . utf8DecodeStart $ index off
+          where
+            step i (Incomplete a b)
+              | i < len = step (i + 1) $ utf8DecodeContinue (index i) a b
+            step _ st = st
+
+        outer dst dstLen = inner
+            where
+              inner srcOff dstOff s'#
+                | srcOff >= len =
+                  case unST (A.shrinkM dst dstOff) s'# of
+                    (# s''#, _ #) -> case unST (A.unsafeFreeze dst) s''# of
+                      (# _, arr #) -> pure (Text arr 0 dstOff, (mempty, srcOff))
+
+                | srcOff >= len1
+                , srcOff < len1 + guessUtf8Boundary
+                , dstOff + (len1 + guessUtf8Boundary - srcOff) <= dstLen
+                , bs <- B.drop (srcOff - len1) (B.take guessUtf8Boundary bs2)
+                , isValidBS bs =
+                  case unST (withBS bs $ \fp _ -> unsafeIOToST . unsafeWithForeignPtr fp $ \src ->
+                        unsafeSTToIO . A.copyFromPointer dst dstOff src $ len1 + guessUtf8Boundary - srcOff) s'# of
+                    (# s''#, _ #) -> inner (len1 + guessUtf8Boundary) (dstOff + (len1 + guessUtf8Boundary - srcOff)) s''#
+
+                | dstOff + 4 > dstLen =
+                  let dstLen' = dstLen + 4 in
+                  case unST (A.resizeM dst dstLen') s'# of
+                    !(# s''#, dst' #) -> outer dst' dstLen' srcOff dstOff s''#
+
+                | otherwise = case decodeFrom srcOff of
+                  Accept c ->
+                    case unST (unsafeWrite dst dstOff c) s'# of
+                      (# s''#, d #) -> inner (srcOff + d) (dstOff + d) s''#
+                  Reject -> do
+                    res <- onErrM' desc srcOff . Just $ index srcOff
+                    case res of
+                      Nothing -> inner (srcOff + 1) dstOff s'#
+                      Just c ->
+                        case unST (unsafeWrite dst dstOff $ safe c) s'# of
+                          (# s''#, d #) -> inner (srcOff + 1) (dstOff + d) s''#
+                  Incomplete{} ->
+                    case unST (A.shrinkM dst dstOff) s'# of
+                      (# s''#, _ #) -> case unST (A.unsafeFreeze dst) s''# of
+                        (# _, arr #) ->
+                          let bs = if srcOff >= len1
+                                then B.drop (srcOff - len1) bs2
+                                else B.drop srcOff $ bs1 `B.append` bs2 in
+                          pure (Text arr 0 dstOff, (bs, srcOff))
+
+        desc = "Data.Text.Internal.Encoding: Invalid UTF-8 stream"
 
 -- | A stream oriented decoding result.
 --
@@ -440,11 +439,9 @@ streamDecodeUtf8With ::
   HasCallStack =>
 #endif
   OnDecodeError -> ByteString -> Decoding
-streamDecodeUtf8With onErr = fix (\ go bs1 bs2 ->
-    runIdentity (decodeUtf8With2 bs1 bs2 (\ desc' mWord8 ->
-      Identity $ onErr desc' mWord8)) & (\ (txt, undecoded) ->
-        Some txt undecoded (go undecoded))
-  ) mempty
+streamDecodeUtf8With onErr bs = runIdentity $ streamDecodeUtf8WithM bs
+  (\ desc _ mWord8 -> Identity $ onErr desc mWord8)
+  (\ txt undecoded _ f -> Identity . Some txt undecoded $ runIdentity . f)
 
 -- | Decode a 'ByteString' containing UTF-8 encoded text that is known
 -- to be valid.
diff --git a/src/Data/Text/Encoding/Error.hs b/src/Data/Text/Encoding/Error.hs
index 99226cb1..bec2fd55 100644
--- a/src/Data/Text/Encoding/Error.hs
+++ b/src/Data/Text/Encoding/Error.hs
@@ -60,18 +60,13 @@ import Numeric (showHex)
 -- report.
 type OnError a b = String -> Maybe a -> Maybe b
 
--- | Similar to 'OnError' but in a monadic context which allows
--- additional capabilities. Depending on the monad(s) used, these
--- capabilities include logging decode errors (IO and/or logger monads)
--- and abort processessing without the need to use 'error' or 'throw'
--- (continuation monad).
-type OnErrorM a m b = String -> Maybe a -> m (Maybe b)
-
 -- | A handler for a decoding error.
 type OnDecodeError = OnError Word8 Char
 
--- | A monadic handler for a decoding error.
-type OnDecodeErrorM m = OnErrorM Word8 m Char
+-- | A monadic handler for a decoding error. With certain monads such as
+-- 'Maybe', 'Either', and 'Cont', processessing can be abandoned without
+-- the need to use 'error' or 'throw'.
+type OnDecodeErrorM m = String -> Int -> Maybe Word8 -> m (Maybe Char)
 
 -- | A handler for an encoding error.
 {-# DEPRECATED OnEncodeError "This exception is never used in practice, and will be removed." #-}
diff --git a/src/Data/Text/Lazy/Encoding.hs b/src/Data/Text/Lazy/Encoding.hs
index ad361af5..7f6baa3b 100644
--- a/src/Data/Text/Lazy/Encoding.hs
+++ b/src/Data/Text/Lazy/Encoding.hs
@@ -29,6 +29,7 @@ module Data.Text.Lazy.Encoding
 
     -- *** Controllable error handling
     , decodeUtf8With
+    , decodeUtf8WithM
     , decodeUtf16LEWith
     , decodeUtf16BEWith
     , decodeUtf32LEWith
@@ -56,8 +57,12 @@ module Data.Text.Lazy.Encoding
     ) where
 
 import Control.Exception (evaluate, try)
+import Control.Monad.Fix (fix)
+import Control.Monad.State (evalStateT, get, state)
+import Control.Monad.Trans.Class (lift)
+import Data.Functor.Identity (Identity(..))
 import Data.Monoid (Monoid(..))
-import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
+import Data.Text.Encoding.Error (OnDecodeError, OnDecodeErrorM, UnicodeException, strictDecode)
 import Data.Text.Internal.Lazy (Text(..), chunk, empty, foldrChunks)
 import Data.Word (Word8)
 import qualified Data.ByteString as S
@@ -106,24 +111,31 @@ decodeLatin1 = foldr (chunk . TE.decodeLatin1) empty . B.toChunks
 
 -- | Decode a 'ByteString' containing UTF-8 encoded text.
 decodeUtf8With :: OnDecodeError -> B.ByteString -> Text
-decodeUtf8With onErr (B.Chunk b0 bs0) =
-    case TE.streamDecodeUtf8With onErr b0 of
-      TE.Some t l f -> chunk t (go f l bs0)
+decodeUtf8With onErr = runIdentity . flip decodeUtf8WithM (const . (Identity .) . onErr)
+
+-- | Decode a 'ByteString' containing UTF-8 encoded text in a monad context.
+decodeUtf8WithM :: Monad m => B.ByteString -> OnDecodeErrorM m -> m Text
+decodeUtf8WithM (B.Chunk sb lb) onErrM =
+  evalStateT (TE.streamDecodeUtf8WithM sb (((lift .) .) . onErrM) (\ t cp bp f -> do
+    (lb', diff) <- get
+    case lb' of
+      B.Chunk sb' lb'' -> do
+        state $ const ((), (lb'', diff . chunk t))
+        f sb'
+      _ -> do
+        t'' <- lift $ fix (\ f' bp' cp' t' ->
+          case S.uncons cp' of
+            Just (word8, cp'') -> do
+              mC <- onErrM desc bp' $ Just word8
+              f' (bp' + 1) cp'' $ case mC of
+                Just c -> t' `T.snoc` c
+                _ -> t'
+            _ -> pure t') bp cp t
+        pure . diff $ Chunk t'' Empty
+    )) (lb, id)
   where
-    go f0 _ (B.Chunk b bs) =
-      case f0 b of
-        TE.Some t l f -> chunk t (go f l bs)
-    go _ l _
-      | S.null l  = empty
-      | otherwise =
-        let !t = T.pack (skipBytes l)
-            skipBytes = S.foldr (\x s' ->
-                  case onErr desc (Just x) of
-                    Just c -> c : s'
-                    Nothing -> s') [] in
-        Chunk t Empty
     desc = "Data.Text.Lazy.Encoding.decodeUtf8With: Invalid UTF-8 stream"
-decodeUtf8With _ _ = empty
+decodeUtf8WithM _ _ = pure empty
 
 -- | Decode a 'ByteString' containing UTF-8 encoded text that is known
 -- to be valid.
diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs
index 2d98af16..5e67f52a 100644
--- a/tests/Tests/Properties/Transcoding.hs
+++ b/tests/Tests/Properties/Transcoding.hs
@@ -6,7 +6,7 @@ module Tests.Properties.Transcoding
     ( testTranscoding
     ) where
 
-import Control.Concurrent.MVar (newEmptyMVar, tryPutMVar, tryTakeMVar)
+import Control.Concurrent.MVar (newEmptyMVar, newMVar, tryPutMVar, tryTakeMVar)
 import Control.Monad.Cont (cont, runCont)
 import Data.Bits ((.&.), shiftR)
 import Data.Char (chr, ord)
@@ -203,40 +203,75 @@ t_decode_with_error3' =
 t_decode_with_error4' =
   case E.streamDecodeUtf8With (\_ _ -> Just 'x') (B.pack [0xC2, 97, 97, 97]) of
     E.Some x _ _ -> x === "xaaa"
-t_decode_with_error2'' =
-  runIdentity $ E.streamDecodeUtf8WithM (B.pack [0xC2, 97]) (\ _ _ -> pure $ Just 'x')
-    (\ x _ _ -> pure $ x === "xa")
-t_decode_with_error3'' =
-  runIdentity $ E.streamDecodeUtf8WithM (B.pack [0xC2, 97, 97]) (\ _ _ -> pure $ Just 'x')
-    (\ x _ _ -> pure $ x === "xaa")
-t_decode_with_error4'' =
-  runIdentity $ E.streamDecodeUtf8WithM (B.pack [0xC2, 97, 97, 97]) (\ _ _ -> pure $ Just 'x')
-    (\ x _ _ -> pure $ x === "xaaa")
 t_decode_with_error5' = ioProperty $ do
   ret <- Exception.try $ Exception.evaluate $ E.streamDecodeUtf8 (B.pack [0x81])
   pure $ case ret of
     Left (_ :: E.UnicodeException) -> True
     Right{} -> False
+t_decode_withM_error1 =
+  runIdentity $ E.streamDecodeUtf8WithM (B.pack [0xC2, 97]) (\ _ _ _ -> pure $ Just 'x')
+    (\ x _ _ _ -> pure $ x === "xa")
+t_decode_withM_error2 =
+  runIdentity $ E.streamDecodeUtf8WithM (B.pack [0xC2, 97, 97]) (\ _ _ _ -> pure $ Just 'x')
+    (\ x _ _ _ -> pure $ x === "xaa")
+t_decode_withM_error3 =
+  runIdentity $ E.streamDecodeUtf8WithM (B.pack [0xC2, 97, 97, 97]) (\ _ _ _ -> pure $ Just 'x')
+    (\ x _ _ _ -> pure $ x === "xaaa")
 -- log error test
-t_decode_with_error5'' =
+t_decode_withM_error4 =
   ioProperty $ do
     mVar <- newEmptyMVar
     E.streamDecodeUtf8WithM (B.pack [0xC2, 97, 97, 97])
-      (\desc _ -> do
+      (\ desc _ _ -> do
         res <- tryPutMVar mVar desc
         pure . Just $ if res then 'x' else 'y')
-      (\ x _ _ -> do
+      (\ x _ _ _ -> do
         mDesc <- tryTakeMVar mVar
         pure $ x == "xaaa" && isJust mDesc)
--- test case demonstrating how to stop processing without 'throw' or 'error'
-t_decode_with_error6'' =
+-- test case demonstrating how to stop processing without 'throw' or 'error' using Either
+t_decode_withM_error5 =
+  case do
+    E.streamDecodeUtf8WithM (B.pack [0xC2, 97, 97, 97])
+      (\ _ errPos _ -> Left errPos)
+      (\ x _ _ _ -> pure x) of
+    Left pos -> pos == 0
+    Right _ -> False
+-- test case demonstrating how to stop processing without 'throw' or 'error' using Maybe
+t_decode_withM_error6 =
+  case do
+    E.streamDecodeUtf8WithM (B.pack [0xC2, 97, 97, 97])
+      (\ _ _ _ -> Nothing)
+      (\ x _ _ _ -> pure x) of
+    Just _ -> False
+    _ -> True
+-- test case demonstrating how to stop processing without 'throw' or 'error' using the continuation monad
+t_decode_withM_error7 =
   runCont (do
     E.streamDecodeUtf8WithM (B.pack [0xC2, 97, 97, 97])
       -- early exit
-      (\ _ _ -> cont $ \ _ -> True)
+      (\ _ _ _ -> cont $ \ _ -> True)
       -- this should not be executed
-      (\ _ _ _ -> pure False)
+      (\ _ _ _ _ -> pure False)
     ) id
+-- feed more data into the stream using the continuation
+t_decode_withM_error8 =
+  ioProperty $ do
+    mVar <- newMVar $ B.pack [84, 101, 115, 116] -- 'T' 'e' 's' 't'
+    E.streamDecodeUtf8WithM (B.pack [0xC2, 97, 97, 97])
+      (\ _ _ _ -> pure $ Just 'x')
+      (\ x _ _ f -> do
+        mBs <- tryTakeMVar mVar
+        case mBs of
+          Just bs -> f bs
+          _ -> pure $ x == "Test")
+tl_decode_withM_error1 =
+  ioProperty (do
+    x <- EL.decodeUtf8WithM (BL.pack [0xC2, 97]) (\ _ _ _ -> pure $ Just 'x')
+    pure $ x == "xa")
+tl_decode_withM_error2 =
+  runIdentity (EL.decodeUtf8WithM (BL.pack [0xE0, 97, 97]) (\ _ _ _ -> pure $ Just 'x')) === "xaa"
+tl_decode_withM_error3 =
+  runIdentity (EL.decodeUtf8WithM (BL.pack [0xF0, 97, 97, 97]) (\ _ _ _ -> pure $ Just 'x')) === "xaaa"
 
 t_infix_concat bs1 text bs2 =
   forAll (Blind <$> genDecodeErr Replace) $ \(Blind onErr) ->
@@ -283,12 +318,18 @@ testTranscoding =
       testProperty "t_decode_with_error2'" t_decode_with_error2',
       testProperty "t_decode_with_error3'" t_decode_with_error3',
       testProperty "t_decode_with_error4'" t_decode_with_error4',
-      testProperty "t_decode_with_error2''" t_decode_with_error2'',
-      testProperty "t_decode_with_error3''" t_decode_with_error3'',
-      testProperty "t_decode_with_error4''" t_decode_with_error4'',
+      testProperty "t_decode_withM_error1" t_decode_withM_error1,
+      testProperty "t_decode_withM_error2" t_decode_withM_error2,
+      testProperty "t_decode_withM_error3" t_decode_withM_error3,
       testProperty "t_decode_with_error5'" t_decode_with_error5',
-      testProperty "t_decode_with_error5''" t_decode_with_error5'',
-      testProperty "t_decode_with_error6''" t_decode_with_error6'',
+      testProperty "t_decode_withM_error4" t_decode_withM_error4,
+      testProperty "t_decode_withM_error5" t_decode_withM_error5,
+      testProperty "t_decode_withM_error6" t_decode_withM_error6,
+      testProperty "t_decode_withM_error7" t_decode_withM_error7,
+      testProperty "t_decode_withM_error8" t_decode_withM_error8,
+      testProperty "tl_decode_withM_error1" tl_decode_withM_error1,
+      testProperty "tl_decode_withM_error2" tl_decode_withM_error2,
+      testProperty "tl_decode_withM_error3" tl_decode_withM_error3,
       testProperty "t_infix_concat" t_infix_concat
     ]
   ]
diff --git a/text.cabal b/text.cabal
index 5231dbac..9ff632d1 100644
--- a/text.cabal
+++ b/text.cabal
@@ -187,7 +187,9 @@ library
     bytestring       >= 0.10.4 && < 0.12,
     deepseq          >= 1.1 && < 1.5,
     ghc-prim         >= 0.2 && < 0.10,
-    template-haskell >= 2.5 && < 2.20
+    mtl,
+    template-haskell >= 2.5 && < 2.20,
+    transformers
 
   ghc-options: -Wall -fwarn-tabs -funbox-strict-fields -O2
   if flag(developer)

From 2e2c5419e0ea250809b71870dc3caa894bc79605 Mon Sep 17 00:00:00 2001
From: "David M. Sledge" <sledged@gmail.com>
Date: Fri, 1 Jul 2022 19:04:49 -0600
Subject: [PATCH 06/87] make streamDecodeUtf8WithM safe with Cont and List
 monads

---
 src/Data/Text/Encoding.hs | 59 +++++++++++++++++++++------------------
 1 file changed, 32 insertions(+), 27 deletions(-)

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index 1daec591..3596bbb4 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -146,16 +146,13 @@ import GHC.ST (ST(..), STRep)
 -- anything except ASCII and copies buffer or throws an error otherwise.
 --
 decodeASCII :: ByteString -> Text
-decodeASCII bs = withBS bs $ \fp len ->
-  if len == 0
-    then empty
-    else runST $ do
-      asciiPrefixLen <- fmap cSizeToInt . unsafeIOToST . unsafeWithForeignPtr fp $ \src ->
-        c_is_ascii src (src `plusPtr` len)
-      if asciiPrefixLen == len
-      then let !(SBS.SBS arr) = SBS.toShort bs in
-            return (Text (A.ByteArray arr) 0 len)
-      else error $ "decodeASCII: detected non-ASCII codepoint at " ++ show asciiPrefixLen
+decodeASCII bs = withBS bs $ \fp len -> if len == 0 then empty else runST $ do
+  asciiPrefixLen <- fmap cSizeToInt $ unsafeIOToST $ unsafeWithForeignPtr fp $ \src ->
+    c_is_ascii src (src `plusPtr` len)
+  if asciiPrefixLen == len
+  then let !(SBS.SBS arr) = SBS.toShort bs in
+        return (Text (A.ByteArray arr) 0 len)
+  else error $ "decodeASCII: detected non-ASCII codepoint at " ++ show asciiPrefixLen
 
 -- | Decode a 'ByteString' containing Latin-1 (aka ISO-8859-1) encoded text.
 --
@@ -360,10 +357,10 @@ streamDecodeUtf8WithM bstr onErrM f = fix (\ go bp bs1 bs2 -> do
 
         outer dst dstLen = inner
             where
-              inner srcOff dstOff s'#
+              inner srcOff dstOff s0#
                 | srcOff >= len =
-                  case unST (A.shrinkM dst dstOff) s'# of
-                    (# s''#, _ #) -> case unST (A.unsafeFreeze dst) s''# of
+                  case unST (A.shrinkM dst dstOff) s0# of
+                    (# s1#, _ #) -> case unST (A.unsafeFreeze dst) s1# of
                       (# _, arr #) -> pure (Text arr 0 dstOff, (mempty, srcOff))
 
                 | srcOff >= len1
@@ -372,28 +369,36 @@ streamDecodeUtf8WithM bstr onErrM f = fix (\ go bp bs1 bs2 -> do
                 , bs <- B.drop (srcOff - len1) (B.take guessUtf8Boundary bs2)
                 , isValidBS bs =
                   case unST (withBS bs $ \fp _ -> unsafeIOToST . unsafeWithForeignPtr fp $ \src ->
-                        unsafeSTToIO . A.copyFromPointer dst dstOff src $ len1 + guessUtf8Boundary - srcOff) s'# of
-                    (# s''#, _ #) -> inner (len1 + guessUtf8Boundary) (dstOff + (len1 + guessUtf8Boundary - srcOff)) s''#
+                        unsafeSTToIO . A.copyFromPointer dst dstOff src $ len1 + guessUtf8Boundary - srcOff) s0# of
+                    (# s1#, _ #) -> inner (len1 + guessUtf8Boundary) (dstOff + (len1 + guessUtf8Boundary - srcOff)) s1#
 
                 | dstOff + 4 > dstLen =
                   let dstLen' = dstLen + 4 in
-                  case unST (A.resizeM dst dstLen') s'# of
-                    !(# s''#, dst' #) -> outer dst' dstLen' srcOff dstOff s''#
+                  case unST (A.resizeM dst dstLen') s0# of
+                    !(# s1#, dst' #) -> outer dst' dstLen' srcOff dstOff s1#
 
                 | otherwise = case decodeFrom srcOff of
                   Accept c ->
-                    case unST (unsafeWrite dst dstOff c) s'# of
-                      (# s''#, d #) -> inner (srcOff + d) (dstOff + d) s''#
+                    case unST (unsafeWrite dst dstOff c) s0# of
+                      (# s1#, d #) -> inner (srcOff + d) (dstOff + d) s1#
                   Reject -> do
-                    res <- onErrM' desc srcOff . Just $ index srcOff
-                    case res of
-                      Nothing -> inner (srcOff + 1) dstOff s'#
-                      Just c ->
-                        case unST (unsafeWrite dst dstOff $ safe c) s'# of
-                          (# s''#, d #) -> inner (srcOff + 1) (dstOff + d) s''#
+                    -- gonna call this text array done
+                    case unST (A.shrinkM dst dstOff) s0# of
+                      -- might not be necessary
+                      (# s1#, _ #) -> case unST (A.unsafeFreeze dst) s1# of
+                        (# _, arr #) -> do
+                          res <- onErrM' desc srcOff . Just $ index srcOff
+                          -- continue on with a copy of the text array
+                          case runRW# (unST $ A.new dstLen) of
+                            (# s2#, dst' #) -> case unST (A.copyI dstOff dst' 0 arr 0) s2# of
+                              (# s3#, _ #) -> case res of
+                                Just c ->
+                                  case unST (unsafeWrite dst' dstOff $ safe c) s3# of
+                                    (# s4#, d #) -> outer dst' dstLen (srcOff + 1) (dstOff + d) s4#
+                                _ -> outer dst' dstLen (srcOff + 1) dstOff s3#
                   Incomplete{} ->
-                    case unST (A.shrinkM dst dstOff) s'# of
-                      (# s''#, _ #) -> case unST (A.unsafeFreeze dst) s''# of
+                    case unST (A.shrinkM dst dstOff) s0# of
+                      (# s1#, _ #) -> case unST (A.unsafeFreeze dst) s1# of
                         (# _, arr #) ->
                           let bs = if srcOff >= len1
                                 then B.drop (srcOff - len1) bs2

From 41492dc573e2ac5460748adef1fd0ba52e1b7e81 Mon Sep 17 00:00:00 2001
From: "David M. Sledge" <sledged@gmail.com>
Date: Sat, 2 Jul 2022 23:07:32 -0600
Subject: [PATCH 07/87] Added StreamDecode which is like Decoding with more
 options. Added streamDecodeUtf8With' to take advantage of StremDecode.

---
 src/Data/Text/Encoding.hs             | 276 +++++++++++++-------------
 src/Data/Text/Encoding/Error.hs       |   8 +-
 src/Data/Text/Lazy/Encoding.hs        |  48 ++---
 tests/Tests/Properties/Transcoding.hs | 123 ++++--------
 text.cabal                            |   4 +-
 5 files changed, 198 insertions(+), 261 deletions(-)

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index 3596bbb4..a4e01247 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -1,5 +1,5 @@
 {-# LANGUAGE BangPatterns, CPP, GeneralizedNewtypeDeriving, MagicHash,
-    UnliftedFFITypes, UnboxedTuples #-}
+    UnliftedFFITypes #-}
 {-# LANGUAGE Trustworthy #-}
 {-# LANGUAGE TypeApplications #-}
 {-# LANGUAGE ScopedTypeVariables #-}
@@ -36,7 +36,6 @@ module Data.Text.Encoding
 
     -- *** Controllable error handling
     , decodeUtf8With
-    , decodeUtf8WithM
     , decodeUtf16LEWith
     , decodeUtf16BEWith
     , decodeUtf32LEWith
@@ -46,8 +45,9 @@ module Data.Text.Encoding
     -- $stream
     , streamDecodeUtf8
     , streamDecodeUtf8With
-    , streamDecodeUtf8WithM
+    , streamDecodeUtf8With'
     , Decoding(..)
+    , StreamDecode(..)
 
     -- ** Partial Functions
     -- $partial
@@ -73,15 +73,13 @@ module Data.Text.Encoding
 import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
 
 import Control.Exception (evaluate, try)
-import Control.Monad.Fix (fix)
-import Control.Monad.ST (runST)
+import Control.Monad.ST (runST, ST)
 import Data.Bits (shiftR, (.&.))
 import Data.ByteString (ByteString)
 import qualified Data.ByteString as B
 import qualified Data.ByteString.Internal as B
 import qualified Data.ByteString.Short.Internal as SBS
-import Data.Functor.Identity (Identity(..), runIdentity)
-import Data.Text.Encoding.Error (OnDecodeError, OnDecodeErrorM, UnicodeException, strictDecode, lenientDecode)
+import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode)
 import Data.Text.Internal (Text(..), safe, empty, append)
 import Data.Text.Internal.Unsafe (unsafeWithForeignPtr)
 import Data.Text.Internal.Unsafe.Char (unsafeWrite)
@@ -92,7 +90,6 @@ import Foreign.C.Types (CSize(..))
 import Foreign.Ptr (Ptr, minusPtr, plusPtr)
 import Foreign.Storable (poke, peekByteOff)
 import GHC.Exts (byteArrayContents#, unsafeCoerce#)
-import GHC.Magic (runRW#)
 import GHC.ForeignPtr (ForeignPtr(..), ForeignPtrContents(PlainPtr))
 import qualified Data.ByteString.Builder as B
 import qualified Data.ByteString.Builder.Internal as B hiding (empty, append)
@@ -113,7 +110,6 @@ import Foreign.C.Types (CInt(..))
 import qualified Data.ByteString.Unsafe as B
 import Data.Text.Internal.Encoding.Utf8 (CodePoint(..))
 #endif
-import GHC.ST (ST(..), STRep)
 
 -- $strict
 --
@@ -226,37 +222,24 @@ decodeUtf8With ::
   HasCallStack =>
 #endif
   OnDecodeError -> ByteString -> Text
-decodeUtf8With onErr bs =
-  runIdentity $ decodeUtf8WithM bs (\ desc' _ mWord8 -> Identity $ onErr desc' mWord8)
-
-decodeUtf8WithM ::
-#if defined(ASSERTS)
-  HasCallStack =>
-#endif
-  Monad m =>
-  ByteString -> OnDecodeErrorM m -> m Text
-decodeUtf8WithM bs onErrM =
-  streamDecodeUtf8WithM bs onErrM (\ t bs' bytePos _ ->
-    fix (\ f bp bs'' t' ->
-      case B.uncons bs'' of
-        Just (word8, bs''') -> do
-          mC <- onErrM desc bp $ Just word8
-          f (bp + 1) bs''' $ case mC of
-            Just c -> t' `append` T.singleton c
-            _ -> t'
-        _ -> pure t') bytePos bs' t
-  )
+decodeUtf8With onErr bs = case streamDecodeUtf8With onErr bs of
+  Some t unencoded _ -> codePointToInvalid unencoded t
   where
-    desc = "Data.Text.Internal.Encoding: Invalid UTF-8 stream"
+    codePointToInvalid bs' txt =
+      case B.uncons bs' of
+        Just (x, bs'') -> codePointToInvalid bs'' $ case onErr desc $ Just x of
+          Just c -> append txt $ T.singleton c
+          _ -> txt
+        _ -> txt
 
-unST :: ST s a -> STRep s a
-unST (ST st) = st
+    desc = "Data.Text.Internal.Encoding: Incomplete UTF-8 code point"
 
 -- $stream
 --
--- The 'streamDecodeUtf8' and 'streamDecodeUtf8With' functions accept
--- a 'ByteString' that represents a possibly incomplete input (e.g. a
--- packet from a network stream) that may not end on a UTF-8 boundary.
+-- The 'streamDecodeUtf8',  'streamDecodeUtf8With', and
+-- 'streamDecodeUtf8With'' functions accept a 'ByteString' that
+-- represents a possibly incomplete input (e.g. a packet from a network
+-- stream) that may not end on a UTF-8 boundary.
 --
 -- 1. The maximal prefix of 'Text' that could be decoded from the
 --    given input.
@@ -305,108 +288,6 @@ unST (ST st) = st
 -- If given invalid input, an exception will be thrown by the function
 -- or continuation where it is encountered.
 
--- | Decode, in a monadic- and stream-oriented way using CPS, a lazy
--- 'ByteString' containing UTF-8 encoded text.
-streamDecodeUtf8WithM ::
-#if defined(ASSERTS)
-  HasCallStack =>
-#endif
-  Monad m =>
-  ByteString
-  -> OnDecodeErrorM m
-  -> (Text -> ByteString -> Int -> (ByteString -> m b) -> m b)
-  -> m b
-streamDecodeUtf8WithM bstr onErrM f = fix (\ go bp bs1 bs2 -> do
-    (txt, (undecoded, bytePos)) <- decodeUtf8With2 bs1 bs2 (\ desc bp'' mWord8 -> onErrM desc (bp'' + bp) mWord8)
-    let bp' = bytePos + bp
-    f txt undecoded bp' $ go bp' undecoded
-  ) 0 mempty bstr
-  where
-    decodeUtf8With2 bs1@(B.length -> len1) bs2@(B.length -> len2) onErrM' = do
-      case runRW# $ unST (A.new len') of
-        (# s#, marr #) -> outer marr len' 0 0 s#
-      where
-        len = len1 + len2
-        len' = len + 4
-
-        index i
-          | i < len1  = B.index bs1 i
-          | otherwise = B.index bs2 $ i - len1
-
-        -- We need Data.ByteString.findIndexEnd, but it is unavailable before bytestring-0.10.12.0
-        guessUtf8Boundary :: Int
-        guessUtf8Boundary
-          | len2 >= 1 && w0 <  0x80 = len2     -- last char is ASCII
-          | len2 >= 1 && w0 >= 0xC0 = len2 - 1 -- last char starts a code point
-          | len2 >= 2 && w1 >= 0xC0 = len2 - 2 -- pre-last char starts a code point
-          | len2 >= 3 && w2 >= 0xC0 = len2 - 3
-          | len2 >= 4 && w3 >= 0xC0 = len2 - 4
-          | otherwise = 0
-          where
-            w0 = B.index bs2 $ len2 - 1
-            w1 = B.index bs2 $ len2 - 2
-            w2 = B.index bs2 $ len2 - 3
-            w3 = B.index bs2 $ len2 - 4
-
-        decodeFrom :: Int -> DecoderResult
-        decodeFrom off = step (off + 1) . utf8DecodeStart $ index off
-          where
-            step i (Incomplete a b)
-              | i < len = step (i + 1) $ utf8DecodeContinue (index i) a b
-            step _ st = st
-
-        outer dst dstLen = inner
-            where
-              inner srcOff dstOff s0#
-                | srcOff >= len =
-                  case unST (A.shrinkM dst dstOff) s0# of
-                    (# s1#, _ #) -> case unST (A.unsafeFreeze dst) s1# of
-                      (# _, arr #) -> pure (Text arr 0 dstOff, (mempty, srcOff))
-
-                | srcOff >= len1
-                , srcOff < len1 + guessUtf8Boundary
-                , dstOff + (len1 + guessUtf8Boundary - srcOff) <= dstLen
-                , bs <- B.drop (srcOff - len1) (B.take guessUtf8Boundary bs2)
-                , isValidBS bs =
-                  case unST (withBS bs $ \fp _ -> unsafeIOToST . unsafeWithForeignPtr fp $ \src ->
-                        unsafeSTToIO . A.copyFromPointer dst dstOff src $ len1 + guessUtf8Boundary - srcOff) s0# of
-                    (# s1#, _ #) -> inner (len1 + guessUtf8Boundary) (dstOff + (len1 + guessUtf8Boundary - srcOff)) s1#
-
-                | dstOff + 4 > dstLen =
-                  let dstLen' = dstLen + 4 in
-                  case unST (A.resizeM dst dstLen') s0# of
-                    !(# s1#, dst' #) -> outer dst' dstLen' srcOff dstOff s1#
-
-                | otherwise = case decodeFrom srcOff of
-                  Accept c ->
-                    case unST (unsafeWrite dst dstOff c) s0# of
-                      (# s1#, d #) -> inner (srcOff + d) (dstOff + d) s1#
-                  Reject -> do
-                    -- gonna call this text array done
-                    case unST (A.shrinkM dst dstOff) s0# of
-                      -- might not be necessary
-                      (# s1#, _ #) -> case unST (A.unsafeFreeze dst) s1# of
-                        (# _, arr #) -> do
-                          res <- onErrM' desc srcOff . Just $ index srcOff
-                          -- continue on with a copy of the text array
-                          case runRW# (unST $ A.new dstLen) of
-                            (# s2#, dst' #) -> case unST (A.copyI dstOff dst' 0 arr 0) s2# of
-                              (# s3#, _ #) -> case res of
-                                Just c ->
-                                  case unST (unsafeWrite dst' dstOff $ safe c) s3# of
-                                    (# s4#, d #) -> outer dst' dstLen (srcOff + 1) (dstOff + d) s4#
-                                _ -> outer dst' dstLen (srcOff + 1) dstOff s3#
-                  Incomplete{} ->
-                    case unST (A.shrinkM dst dstOff) s0# of
-                      (# s1#, _ #) -> case unST (A.unsafeFreeze dst) s1# of
-                        (# _, arr #) ->
-                          let bs = if srcOff >= len1
-                                then B.drop (srcOff - len1) bs2
-                                else B.drop srcOff $ bs1 `B.append` bs2 in
-                          pure (Text arr 0 dstOff, (bs, srcOff))
-
-        desc = "Data.Text.Internal.Encoding: Invalid UTF-8 stream"
-
 -- | A stream oriented decoding result.
 --
 -- @since 1.0.0.0
@@ -444,9 +325,122 @@ streamDecodeUtf8With ::
   HasCallStack =>
 #endif
   OnDecodeError -> ByteString -> Decoding
-streamDecodeUtf8With onErr bs = runIdentity $ streamDecodeUtf8WithM bs
-  (\ desc _ mWord8 -> Identity $ onErr desc mWord8)
-  (\ txt undecoded _ f -> Identity . Some txt undecoded $ runIdentity . f)
+streamDecodeUtf8With onErr = go empty . streamDecodeUtf8With'
+  where
+    go t res = case res of
+      Ok txt -> Some (t `append` txt) mempty $ go empty . streamDecodeUtf8With'
+      IncompleteCodePoint txt _ bs f -> Some (t `append` txt) bs $ go empty . f
+      InvalidWord txt _ mWord8 f -> go (t `append` txt) . f $ onErr desc mWord8
+
+    desc = "Data.Text.Internal.Encoding: Invalid UTF-8 stream"
+
+-- | A stream-oriented decoding result of one of three possibilities:
+--
+--     * 'Ok' - Decoded text without issue.
+--     * 'IncompleteCodePoint' - An incomplete code point at the end of
+-- the 'ByteString' which includes the position in the 'ByteString'
+-- where the code point start, the incomplete code point, and a
+-- continuation that accepts another 'ByteString' as a continuation of
+-- the initial input.
+--     * 'InvalidWord' - An invalid utf-8 'Word8' which includes the
+-- position of in the 'ByteString' where the invalid data occurs, the
+-- offending 'Word8', and a continuation that accepts as what 'Char' it
+-- should be interpreted if any.
+data StreamDecode =
+    Ok !Text
+  | IncompleteCodePoint !Text !Int !ByteString (ByteString -> StreamDecode)
+  | InvalidWord !Text !Int !(Maybe Word8) (Maybe Char -> StreamDecode)
+
+-- | Like 'streamDecodeUtf8With', but instead of accepting an
+-- 'OnDecodeError' callback, it returns a 'StreamDecode' identifying
+-- whether the 'ByteString' decoded without issue, it encountered an
+-- incomplete code point at the end of the 'ByteString', or an invalid
+-- 'Word8'.
+streamDecodeUtf8With' :: ByteString -> StreamDecode
+streamDecodeUtf8With' bs = decodeWithPossibleStartChar Nothing bs mempty
+  where
+    decodeWithPossibleStartChar :: Maybe Char -> ByteString -> ByteString -> StreamDecode
+    decodeWithPossibleStartChar mStartChar bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do
+      marr <- A.new len'
+      case mStartChar of
+        Just c -> do
+          d <- unsafeWrite marr 0 (safe c)
+          outer marr len' 0 d
+        _ -> outer marr len' 0 0
+      where
+        len = len1 + len2
+        len' = len + 4
+
+        index i
+          | i < len1  = B.index bs1 i
+          | otherwise = B.index bs2 (i - len1)
+
+        -- We need Data.ByteString.findIndexEnd, but it is unavailable before bytestring-0.10.12.0
+        guessUtf8Boundary :: Int
+        guessUtf8Boundary
+          | len2 >= 1 && w0 <  0x80 = len2     -- last char is ASCII
+          | len2 >= 1 && w0 >= 0xC0 = len2 - 1 -- last char starts a code point
+          | len2 >= 2 && w1 >= 0xC0 = len2 - 2 -- pre-last char starts a code point
+          | len2 >= 3 && w2 >= 0xC0 = len2 - 3
+          | len2 >= 4 && w3 >= 0xC0 = len2 - 4
+          | otherwise = 0
+          where
+            w0 = B.index bs2 (len2 - 1)
+            w1 = B.index bs2 (len2 - 2)
+            w2 = B.index bs2 (len2 - 3)
+            w3 = B.index bs2 (len2 - 4)
+
+        decodeFrom :: Int -> DecoderResult
+        decodeFrom off = step (off + 1) (utf8DecodeStart (index off))
+          where
+            step i (Incomplete a b)
+              | i < len = step (i + 1) (utf8DecodeContinue (index i) a b)
+            step _ st = st
+
+        outer :: forall s. A.MArray s -> Int -> Int -> Int -> ST s StreamDecode
+        outer dst dstLen = inner
+            where
+              inner srcOff dstOff
+                | srcOff >= len = do
+                  A.shrinkM dst dstOff
+                  arr <- A.unsafeFreeze dst
+                  pure . Ok $ Text arr 0 dstOff
+
+                | srcOff >= len1
+                , srcOff < len1 + guessUtf8Boundary
+                , dstOff + (len1 + guessUtf8Boundary - srcOff) <= dstLen
+                , bs' <- B.drop (srcOff - len1) (B.take guessUtf8Boundary bs2)
+                , isValidBS bs' = do
+                  withBS bs' $ \fp _ -> unsafeIOToST $ unsafeWithForeignPtr fp $ \src ->
+                    unsafeSTToIO $ A.copyFromPointer dst dstOff src (len1 + guessUtf8Boundary - srcOff)
+                  inner (len1 + guessUtf8Boundary) (dstOff + (len1 + guessUtf8Boundary - srcOff))
+
+                | dstOff + 4 > dstLen = do
+                  let dstLen' = dstLen + 4
+                  dst' <- A.resizeM dst dstLen'
+                  outer dst' dstLen' srcOff dstOff
+
+                | otherwise = case decodeFrom srcOff of
+                  Accept c -> do
+                    d <- unsafeWrite dst dstOff c
+                    inner (srcOff + d) (dstOff + d)
+                  Reject -> do
+                    A.shrinkM dst dstOff
+                    arr <- A.unsafeFreeze dst
+                    let srcOff' = srcOff + 1
+                        bs' = if srcOff' >= len1
+                          then B.drop (srcOff' - len1) bs2
+                          else B.drop srcOff' (bs1 `B.append` bs2)
+                    pure . InvalidWord (Text arr 0 dstOff) srcOff (Just $ index srcOff) $ \ mChar ->
+                      decodeWithPossibleStartChar mChar bs' mempty
+                  Incomplete{} -> do
+                    A.shrinkM dst dstOff
+                    arr <- A.unsafeFreeze dst
+                    let bs' = if srcOff >= len1
+                          then B.drop (srcOff - len1) bs2
+                          else B.drop srcOff (bs1 `B.append` bs2)
+                    pure . IncompleteCodePoint (Text arr 0 dstOff) srcOff bs' $
+                      decodeWithPossibleStartChar Nothing bs'
 
 -- | Decode a 'ByteString' containing UTF-8 encoded text that is known
 -- to be valid.
@@ -655,4 +649,4 @@ cSizeToInt = fromIntegral
 #ifdef SIMDUTF
 foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8
     :: Ptr Word8 -> CSize -> IO CInt
-#endif
+#endif
\ No newline at end of file
diff --git a/src/Data/Text/Encoding/Error.hs b/src/Data/Text/Encoding/Error.hs
index bec2fd55..1dfbf03a 100644
--- a/src/Data/Text/Encoding/Error.hs
+++ b/src/Data/Text/Encoding/Error.hs
@@ -25,7 +25,6 @@ module Data.Text.Encoding.Error
       UnicodeException(..)
     , OnError
     , OnDecodeError
-    , OnDecodeErrorM
     , OnEncodeError
     -- * Useful error handling functions
     , lenientDecode
@@ -63,11 +62,6 @@ type OnError a b = String -> Maybe a -> Maybe b
 -- | A handler for a decoding error.
 type OnDecodeError = OnError Word8 Char
 
--- | A monadic handler for a decoding error. With certain monads such as
--- 'Maybe', 'Either', and 'Cont', processessing can be abandoned without
--- the need to use 'error' or 'throw'.
-type OnDecodeErrorM m = String -> Int -> Maybe Word8 -> m (Maybe Char)
-
 -- | A handler for an encoding error.
 {-# DEPRECATED OnEncodeError "This exception is never used in practice, and will be removed." #-}
 type OnEncodeError = OnError Char Word8
@@ -123,4 +117,4 @@ ignore _ _ = Nothing
 
 -- | Replace an invalid input with a valid output.
 replace :: b -> OnError a b
-replace c _ _ = Just c
+replace c _ _ = Just c
\ No newline at end of file
diff --git a/src/Data/Text/Lazy/Encoding.hs b/src/Data/Text/Lazy/Encoding.hs
index 7f6baa3b..dc098daa 100644
--- a/src/Data/Text/Lazy/Encoding.hs
+++ b/src/Data/Text/Lazy/Encoding.hs
@@ -29,7 +29,6 @@ module Data.Text.Lazy.Encoding
 
     -- *** Controllable error handling
     , decodeUtf8With
-    , decodeUtf8WithM
     , decodeUtf16LEWith
     , decodeUtf16BEWith
     , decodeUtf32LEWith
@@ -57,12 +56,8 @@ module Data.Text.Lazy.Encoding
     ) where
 
 import Control.Exception (evaluate, try)
-import Control.Monad.Fix (fix)
-import Control.Monad.State (evalStateT, get, state)
-import Control.Monad.Trans.Class (lift)
-import Data.Functor.Identity (Identity(..))
 import Data.Monoid (Monoid(..))
-import Data.Text.Encoding.Error (OnDecodeError, OnDecodeErrorM, UnicodeException, strictDecode)
+import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
 import Data.Text.Internal.Lazy (Text(..), chunk, empty, foldrChunks)
 import Data.Word (Word8)
 import qualified Data.ByteString as S
@@ -111,31 +106,24 @@ decodeLatin1 = foldr (chunk . TE.decodeLatin1) empty . B.toChunks
 
 -- | Decode a 'ByteString' containing UTF-8 encoded text.
 decodeUtf8With :: OnDecodeError -> B.ByteString -> Text
-decodeUtf8With onErr = runIdentity . flip decodeUtf8WithM (const . (Identity .) . onErr)
-
--- | Decode a 'ByteString' containing UTF-8 encoded text in a monad context.
-decodeUtf8WithM :: Monad m => B.ByteString -> OnDecodeErrorM m -> m Text
-decodeUtf8WithM (B.Chunk sb lb) onErrM =
-  evalStateT (TE.streamDecodeUtf8WithM sb (((lift .) .) . onErrM) (\ t cp bp f -> do
-    (lb', diff) <- get
-    case lb' of
-      B.Chunk sb' lb'' -> do
-        state $ const ((), (lb'', diff . chunk t))
-        f sb'
-      _ -> do
-        t'' <- lift $ fix (\ f' bp' cp' t' ->
-          case S.uncons cp' of
-            Just (word8, cp'') -> do
-              mC <- onErrM desc bp' $ Just word8
-              f' (bp' + 1) cp'' $ case mC of
-                Just c -> t' `T.snoc` c
-                _ -> t'
-            _ -> pure t') bp cp t
-        pure . diff $ Chunk t'' Empty
-    )) (lb, id)
+decodeUtf8With onErr (B.Chunk b0 bs0) =
+    case TE.streamDecodeUtf8With onErr b0 of
+      TE.Some t l f -> chunk t (go f l bs0)
   where
+    go f0 _ (B.Chunk b bs) =
+      case f0 b of
+        TE.Some t l f -> chunk t (go f l bs)
+    go _ l _
+      | S.null l  = empty
+      | otherwise =
+        let !t = T.pack (skipBytes l)
+            skipBytes = S.foldr (\x s' ->
+                  case onErr desc (Just x) of
+                    Just c -> c : s'
+                    Nothing -> s') [] in
+        Chunk t Empty
     desc = "Data.Text.Lazy.Encoding.decodeUtf8With: Invalid UTF-8 stream"
-decodeUtf8WithM _ _ = pure empty
+decodeUtf8With _ _ = empty
 
 -- | Decode a 'ByteString' containing UTF-8 encoded text that is known
 -- to be valid.
@@ -262,4 +250,4 @@ encodeUtf32LE txt = B.fromChunks (foldrChunks ((:) . TE.encodeUtf32LE) [] txt)
 -- | Encode text using big endian UTF-32 encoding.
 encodeUtf32BE :: Text -> B.ByteString
 encodeUtf32BE txt = B.fromChunks (foldrChunks ((:) . TE.encodeUtf32BE) [] txt)
-{-# INLINE encodeUtf32BE #-}
+{-# INLINE encodeUtf32BE #-}
\ No newline at end of file
diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs
index 5e67f52a..fe3d1def 100644
--- a/tests/Tests/Properties/Transcoding.hs
+++ b/tests/Tests/Properties/Transcoding.hs
@@ -6,8 +6,6 @@ module Tests.Properties.Transcoding
     ( testTranscoding
     ) where
 
-import Control.Concurrent.MVar (newEmptyMVar, newMVar, tryPutMVar, tryTakeMVar)
-import Control.Monad.Cont (cont, runCont)
 import Data.Bits ((.&.), shiftR)
 import Data.Char (chr, ord)
 import Test.QuickCheck hiding ((.&.))
@@ -23,8 +21,6 @@ import qualified Data.ByteString.Builder.Prim as BP
 import qualified Data.ByteString.Char8 as BC
 import qualified Data.ByteString.Lazy as BL
 import qualified Data.ByteString.Lazy.Char8 as BLC
-import Data.Functor.Identity (Identity(..))
-import Data.Maybe (isJust)
 import qualified Data.Text as T
 import qualified Data.Text.Encoding as E
 import qualified Data.Text.Encoding.Error as E
@@ -208,70 +204,45 @@ t_decode_with_error5' = ioProperty $ do
   pure $ case ret of
     Left (_ :: E.UnicodeException) -> True
     Right{} -> False
-t_decode_withM_error1 =
-  runIdentity $ E.streamDecodeUtf8WithM (B.pack [0xC2, 97]) (\ _ _ _ -> pure $ Just 'x')
-    (\ x _ _ _ -> pure $ x === "xa")
-t_decode_withM_error2 =
-  runIdentity $ E.streamDecodeUtf8WithM (B.pack [0xC2, 97, 97]) (\ _ _ _ -> pure $ Just 'x')
-    (\ x _ _ _ -> pure $ x === "xaa")
-t_decode_withM_error3 =
-  runIdentity $ E.streamDecodeUtf8WithM (B.pack [0xC2, 97, 97, 97]) (\ _ _ _ -> pure $ Just 'x')
-    (\ x _ _ _ -> pure $ x === "xaaa")
--- log error test
-t_decode_withM_error4 =
-  ioProperty $ do
-    mVar <- newEmptyMVar
-    E.streamDecodeUtf8WithM (B.pack [0xC2, 97, 97, 97])
-      (\ desc _ _ -> do
-        res <- tryPutMVar mVar desc
-        pure . Just $ if res then 'x' else 'y')
-      (\ x _ _ _ -> do
-        mDesc <- tryTakeMVar mVar
-        pure $ x == "xaaa" && isJust mDesc)
--- test case demonstrating how to stop processing without 'throw' or 'error' using Either
-t_decode_withM_error5 =
-  case do
-    E.streamDecodeUtf8WithM (B.pack [0xC2, 97, 97, 97])
-      (\ _ errPos _ -> Left errPos)
-      (\ x _ _ _ -> pure x) of
-    Left pos -> pos == 0
-    Right _ -> False
--- test case demonstrating how to stop processing without 'throw' or 'error' using Maybe
-t_decode_withM_error6 =
-  case do
-    E.streamDecodeUtf8WithM (B.pack [0xC2, 97, 97, 97])
-      (\ _ _ _ -> Nothing)
-      (\ x _ _ _ -> pure x) of
-    Just _ -> False
-    _ -> True
--- test case demonstrating how to stop processing without 'throw' or 'error' using the continuation monad
-t_decode_withM_error7 =
-  runCont (do
-    E.streamDecodeUtf8WithM (B.pack [0xC2, 97, 97, 97])
-      -- early exit
-      (\ _ _ _ -> cont $ \ _ -> True)
-      -- this should not be executed
-      (\ _ _ _ _ -> pure False)
-    ) id
--- feed more data into the stream using the continuation
-t_decode_withM_error8 =
-  ioProperty $ do
-    mVar <- newMVar $ B.pack [84, 101, 115, 116] -- 'T' 'e' 's' 't'
-    E.streamDecodeUtf8WithM (B.pack [0xC2, 97, 97, 97])
-      (\ _ _ _ -> pure $ Just 'x')
-      (\ x _ _ f -> do
-        mBs <- tryTakeMVar mVar
-        case mBs of
-          Just bs -> f bs
-          _ -> pure $ x == "Test")
-tl_decode_withM_error1 =
-  ioProperty (do
-    x <- EL.decodeUtf8WithM (BL.pack [0xC2, 97]) (\ _ _ _ -> pure $ Just 'x')
-    pure $ x == "xa")
-tl_decode_withM_error2 =
-  runIdentity (EL.decodeUtf8WithM (BL.pack [0xE0, 97, 97]) (\ _ _ _ -> pure $ Just 'x')) === "xaa"
-tl_decode_withM_error3 =
-  runIdentity (EL.decodeUtf8WithM (BL.pack [0xF0, 97, 97, 97]) (\ _ _ _ -> pure $ Just 'x')) === "xaaa"
+
+t_decode_with_error2'' =
+  case E.streamDecodeUtf8With' (B.pack [97, 0xC2, 97]) of
+    E.InvalidWord t pos mWord8 f -> if pos /= 1
+      then pos =/= 1
+      else case mWord8 of
+        Just 0xC2 -> case f $ Just 'x' of
+          E.Ok x -> t `T.append` x === "axa"
+          _ -> counterexample "Should have recovered from the invalid word (\\xC2)" False
+        Just y -> y =/= 0xC2
+        _ -> counterexample "Should have been something instead of Nothing" False
+    _ -> counterexample "The second word (\\xC2) should have been invalid" False
+t_decode_with_error3'' =
+  case E.streamDecodeUtf8With' (B.pack [97, 0xC2, 97]) of
+    E.InvalidWord t pos mWord8 f -> if pos /= 1
+      then pos =/= 1
+      else case mWord8 of
+        Just 0xC2 -> case f Nothing of
+          E.Ok x -> t `T.append` x === "aa"
+          _ -> counterexample "Should have recovered from the invalid word (\\xC2)" False
+        Just y -> y =/= 0xC2
+        _ -> counterexample "Should have been something instead of Nothing" False
+    _ -> counterexample "The second word (\\xC2) should have been invalid" False
+t_decode_with_error4'' =
+  case E.streamDecodeUtf8With' (B.pack [104, 105, 32, 0xe2]) of -- hi \xe2
+    E.IncompleteCodePoint t pos bs f -> if pos /= 3
+      then pos =/= 3
+      else if bs /= B.pack [0xe2]
+        then bs =/= B.pack [0xe2]
+        else case f (B.pack [0x98]) of
+          E.IncompleteCodePoint t' pos' bs' f' -> if pos' /= 0
+            then pos' =/= 0
+            else if bs' /= B.pack [0xe2, 0x98]
+              then bs' =/= B.pack [0xe2, 0x98]
+              else case f' (B.pack [0x83]) of
+                E.Ok x -> "hi ☃" === t `T.append` t' `T.append` x
+                _ -> counterexample "Should have been a completely decoded text." False
+          _ -> counterexample "Should have encountered an incomplete code point." False
+    _ -> counterexample "Should have encountered an incomplete code point." False
 
 t_infix_concat bs1 text bs2 =
   forAll (Blind <$> genDecodeErr Replace) $ \(Blind onErr) ->
@@ -318,18 +289,10 @@ testTranscoding =
       testProperty "t_decode_with_error2'" t_decode_with_error2',
       testProperty "t_decode_with_error3'" t_decode_with_error3',
       testProperty "t_decode_with_error4'" t_decode_with_error4',
-      testProperty "t_decode_withM_error1" t_decode_withM_error1,
-      testProperty "t_decode_withM_error2" t_decode_withM_error2,
-      testProperty "t_decode_withM_error3" t_decode_withM_error3,
       testProperty "t_decode_with_error5'" t_decode_with_error5',
-      testProperty "t_decode_withM_error4" t_decode_withM_error4,
-      testProperty "t_decode_withM_error5" t_decode_withM_error5,
-      testProperty "t_decode_withM_error6" t_decode_withM_error6,
-      testProperty "t_decode_withM_error7" t_decode_withM_error7,
-      testProperty "t_decode_withM_error8" t_decode_withM_error8,
-      testProperty "tl_decode_withM_error1" tl_decode_withM_error1,
-      testProperty "tl_decode_withM_error2" tl_decode_withM_error2,
-      testProperty "tl_decode_withM_error3" tl_decode_withM_error3,
+      testProperty "t_decode_with_error2''" t_decode_with_error2'',
+      testProperty "t_decode_with_error3''" t_decode_with_error3'',
+      testProperty "t_decode_with_error4''" t_decode_with_error4'',
       testProperty "t_infix_concat" t_infix_concat
     ]
-  ]
+  ]
\ No newline at end of file
diff --git a/text.cabal b/text.cabal
index 9ff632d1..5231dbac 100644
--- a/text.cabal
+++ b/text.cabal
@@ -187,9 +187,7 @@ library
     bytestring       >= 0.10.4 && < 0.12,
     deepseq          >= 1.1 && < 1.5,
     ghc-prim         >= 0.2 && < 0.10,
-    mtl,
-    template-haskell >= 2.5 && < 2.20,
-    transformers
+    template-haskell >= 2.5 && < 2.20
 
   ghc-options: -Wall -fwarn-tabs -funbox-strict-fields -O2
   if flag(developer)

From 1a74c5a45351d83a31397c10a57a3fbee82c288c Mon Sep 17 00:00:00 2001
From: "David M. Sledge" <sledged@gmail.com>
Date: Sun, 3 Jul 2022 13:11:38 -0600
Subject: [PATCH 08/87] Extracted Word8 from Maybe in StreamDecode: Offending
 Word8 is never Nothing. Cleaned up documentation for StreamDecode and
 streamDecodeUtf8With'. Track byte position across continuations in
 streamDecodeUtf8With. Made streamDecodeUtf8With' test case a little more
 comprehensive.

---
 src/Data/Text/Encoding.hs             | 117 ++++++++++++++------------
 tests/Tests/Properties/Transcoding.hs |  52 ++++++------
 2 files changed, 93 insertions(+), 76 deletions(-)

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index a4e01247..830952ad 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -330,41 +330,54 @@ streamDecodeUtf8With onErr = go empty . streamDecodeUtf8With'
     go t res = case res of
       Ok txt -> Some (t `append` txt) mempty $ go empty . streamDecodeUtf8With'
       IncompleteCodePoint txt _ bs f -> Some (t `append` txt) bs $ go empty . f
-      InvalidWord txt _ mWord8 f -> go (t `append` txt) . f $ onErr desc mWord8
+      InvalidWord txt _ word8 f -> go (t `append` txt) . f . onErr desc $ Just word8
 
     desc = "Data.Text.Internal.Encoding: Invalid UTF-8 stream"
 
--- | A stream-oriented decoding result of one of three possibilities:
---
---     * 'Ok' - Decoded text without issue.
---     * 'IncompleteCodePoint' - An incomplete code point at the end of
--- the 'ByteString' which includes the position in the 'ByteString'
--- where the code point start, the incomplete code point, and a
--- continuation that accepts another 'ByteString' as a continuation of
--- the initial input.
---     * 'InvalidWord' - An invalid utf-8 'Word8' which includes the
--- position of in the 'ByteString' where the invalid data occurs, the
--- offending 'Word8', and a continuation that accepts as what 'Char' it
--- should be interpreted if any.
-data StreamDecode =
-    Ok !Text
-  | IncompleteCodePoint !Text !Int !ByteString (ByteString -> StreamDecode)
-  | InvalidWord !Text !Int !(Maybe Word8) (Maybe Char -> StreamDecode)
+-- | A stream-oriented decoding result of one of three possibilities.
+data StreamDecode
+  -- | The 'ByteString' was decoded without issue.
+  = Ok
+    -- | The decoded text.
+    !Text
+  -- | An incomplete code point at the end of the 'ByteString'.
+  | IncompleteCodePoint
+      -- | The decoded text up to but not including the incomplete code
+      -- point.
+      !Text
+      -- | The position in the 'ByteString' where the code point
+      -- starts.
+      !Int
+      -- | The incomplete code point.
+      !ByteString
+      -- | A function that accepts another 'ByteString' as a
+      -- continuation of the previous input.
+      (ByteString -> StreamDecode)
+  -- | An invalid utf-8 'Word8'.
+  | InvalidWord
+      -- | The decoded text up to but not including the invalid
+      -- 'Word8'.
+      !Text
+      -- | The position in the 'ByteString' of the offending 'Word8'.
+      !Int
+      -- | The offender.
+      !Word8
+      -- | A function that accepts a possible 'Char' as to interpret
+      -- the 'Word8' in this specific occurrence of it.
+      (Maybe Char -> StreamDecode)
 
 -- | Like 'streamDecodeUtf8With', but instead of accepting an
--- 'OnDecodeError' callback, it returns a 'StreamDecode' identifying
--- whether the 'ByteString' decoded without issue, it encountered an
--- incomplete code point at the end of the 'ByteString', or an invalid
--- 'Word8'.
+-- 'OnDecodeError' callback and returning a 'Decoding', it returns a
+-- 'StreamDecode'.
 streamDecodeUtf8With' :: ByteString -> StreamDecode
-streamDecodeUtf8With' bs = decodeWithPossibleStartChar Nothing bs mempty
+streamDecodeUtf8With' bs = decodeWithPossibleStartChar Nothing 0 bs mempty
   where
-    decodeWithPossibleStartChar :: Maybe Char -> ByteString -> ByteString -> StreamDecode
-    decodeWithPossibleStartChar mStartChar bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do
+    decodeWithPossibleStartChar :: Maybe Char -> Int -> ByteString -> ByteString -> StreamDecode
+    decodeWithPossibleStartChar mStartChar bytePos bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do
       marr <- A.new len'
       case mStartChar of
         Just c -> do
-          d <- unsafeWrite marr 0 (safe c)
+          d <- unsafeWrite marr 0 $ safe c
           outer marr len' 0 d
         _ -> outer marr len' 0 0
       where
@@ -373,7 +386,7 @@ streamDecodeUtf8With' bs = decodeWithPossibleStartChar Nothing bs mempty
 
         index i
           | i < len1  = B.index bs1 i
-          | otherwise = B.index bs2 (i - len1)
+          | otherwise = B.index bs2 $ i - len1
 
         -- We need Data.ByteString.findIndexEnd, but it is unavailable before bytestring-0.10.12.0
         guessUtf8Boundary :: Int
@@ -391,10 +404,10 @@ streamDecodeUtf8With' bs = decodeWithPossibleStartChar Nothing bs mempty
             w3 = B.index bs2 (len2 - 4)
 
         decodeFrom :: Int -> DecoderResult
-        decodeFrom off = step (off + 1) (utf8DecodeStart (index off))
+        decodeFrom off = step (off + 1) . utf8DecodeStart $ index off
           where
             step i (Incomplete a b)
-              | i < len = step (i + 1) (utf8DecodeContinue (index i) a b)
+              | i < len = step (i + 1) $ utf8DecodeContinue (index i) a b
             step _ st = st
 
         outer :: forall s. A.MArray s -> Int -> Int -> Int -> ST s StreamDecode
@@ -409,38 +422,38 @@ streamDecodeUtf8With' bs = decodeWithPossibleStartChar Nothing bs mempty
                 | srcOff >= len1
                 , srcOff < len1 + guessUtf8Boundary
                 , dstOff + (len1 + guessUtf8Boundary - srcOff) <= dstLen
-                , bs' <- B.drop (srcOff - len1) (B.take guessUtf8Boundary bs2)
+                , bs' <- B.drop (srcOff - len1) $ B.take guessUtf8Boundary bs2
                 , isValidBS bs' = do
                   withBS bs' $ \fp _ -> unsafeIOToST $ unsafeWithForeignPtr fp $ \src ->
-                    unsafeSTToIO $ A.copyFromPointer dst dstOff src (len1 + guessUtf8Boundary - srcOff)
-                  inner (len1 + guessUtf8Boundary) (dstOff + (len1 + guessUtf8Boundary - srcOff))
+                    unsafeSTToIO . A.copyFromPointer dst dstOff src $ len1 + guessUtf8Boundary - srcOff
+                  inner (len1 + guessUtf8Boundary) $ dstOff + (len1 + guessUtf8Boundary - srcOff)
 
                 | dstOff + 4 > dstLen = do
                   let dstLen' = dstLen + 4
                   dst' <- A.resizeM dst dstLen'
                   outer dst' dstLen' srcOff dstOff
 
-                | otherwise = case decodeFrom srcOff of
-                  Accept c -> do
-                    d <- unsafeWrite dst dstOff c
-                    inner (srcOff + d) (dstOff + d)
-                  Reject -> do
-                    A.shrinkM dst dstOff
-                    arr <- A.unsafeFreeze dst
-                    let srcOff' = srcOff + 1
-                        bs' = if srcOff' >= len1
-                          then B.drop (srcOff' - len1) bs2
-                          else B.drop srcOff' (bs1 `B.append` bs2)
-                    pure . InvalidWord (Text arr 0 dstOff) srcOff (Just $ index srcOff) $ \ mChar ->
-                      decodeWithPossibleStartChar mChar bs' mempty
-                  Incomplete{} -> do
-                    A.shrinkM dst dstOff
-                    arr <- A.unsafeFreeze dst
-                    let bs' = if srcOff >= len1
-                          then B.drop (srcOff - len1) bs2
-                          else B.drop srcOff (bs1 `B.append` bs2)
-                    pure . IncompleteCodePoint (Text arr 0 dstOff) srcOff bs' $
-                      decodeWithPossibleStartChar Nothing bs'
+                | otherwise =
+                  let contin off res = do
+                        A.shrinkM dst dstOff
+                        arr <- A.unsafeFreeze dst
+                        let bs' = if off >= len1
+                              then B.drop (off - len1) bs2
+                              else B.drop off $ bs1 `B.append` bs2
+                        pure $ res arr bs'
+                  in
+                  case decodeFrom srcOff of
+                    Accept c -> do
+                      d <- unsafeWrite dst dstOff c
+                      inner (srcOff + d) $ dstOff + d
+                    Reject -> let srcOff' = srcOff + 1 in
+                      contin srcOff' $ \ arr bs' ->
+                        InvalidWord (Text arr 0 dstOff) (bytePos + srcOff) (index srcOff) $ \ mChar ->
+                          decodeWithPossibleStartChar mChar (bytePos + srcOff') bs' mempty
+                    Incomplete{} -> let bytePos' = bytePos + srcOff in
+                      contin srcOff $ \ arr bs' ->
+                        IncompleteCodePoint (Text arr 0 dstOff) bytePos' bs' $
+                          decodeWithPossibleStartChar Nothing bytePos' bs'
 
 -- | Decode a 'ByteString' containing UTF-8 encoded text that is known
 -- to be valid.
diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs
index fe3d1def..e6ce765a 100644
--- a/tests/Tests/Properties/Transcoding.hs
+++ b/tests/Tests/Properties/Transcoding.hs
@@ -205,44 +205,47 @@ t_decode_with_error5' = ioProperty $ do
     Left (_ :: E.UnicodeException) -> True
     Right{} -> False
 
+whenEqProp a b next = if a == b
+  then next
+  else a === b
+
 t_decode_with_error2'' =
   case E.streamDecodeUtf8With' (B.pack [97, 0xC2, 97]) of
-    E.InvalidWord t pos mWord8 f -> if pos /= 1
-      then pos =/= 1
-      else case mWord8 of
-        Just 0xC2 -> case f $ Just 'x' of
+    E.InvalidWord t pos word8 f -> whenEqProp pos 1
+      . whenEqProp word8 0xC2
+        $ case f $ Just 'x' of
           E.Ok x -> t `T.append` x === "axa"
           _ -> counterexample "Should have recovered from the invalid word (\\xC2)" False
-        Just y -> y =/= 0xC2
-        _ -> counterexample "Should have been something instead of Nothing" False
     _ -> counterexample "The second word (\\xC2) should have been invalid" False
 t_decode_with_error3'' =
   case E.streamDecodeUtf8With' (B.pack [97, 0xC2, 97]) of
-    E.InvalidWord t pos mWord8 f -> if pos /= 1
-      then pos =/= 1
-      else case mWord8 of
-        Just 0xC2 -> case f Nothing of
+    E.InvalidWord t pos word8 f -> whenEqProp pos 1
+      . whenEqProp word8 0xC2
+        $ case f Nothing of
           E.Ok x -> t `T.append` x === "aa"
           _ -> counterexample "Should have recovered from the invalid word (\\xC2)" False
-        Just y -> y =/= 0xC2
-        _ -> counterexample "Should have been something instead of Nothing" False
     _ -> counterexample "The second word (\\xC2) should have been invalid" False
 t_decode_with_error4'' =
   case E.streamDecodeUtf8With' (B.pack [104, 105, 32, 0xe2]) of -- hi \xe2
-    E.IncompleteCodePoint t pos bs f -> if pos /= 3
-      then pos =/= 3
-      else if bs /= B.pack [0xe2]
-        then bs =/= B.pack [0xe2]
-        else case f (B.pack [0x98]) of
-          E.IncompleteCodePoint t' pos' bs' f' -> if pos' /= 0
-            then pos' =/= 0
-            else if bs' /= B.pack [0xe2, 0x98]
-              then bs' =/= B.pack [0xe2, 0x98]
-              else case f' (B.pack [0x83]) of
-                E.Ok x -> "hi ☃" === t `T.append` t' `T.append` x
-                _ -> counterexample "Should have been a completely decoded text." False
+    E.IncompleteCodePoint t pos bs f -> whenEqProp pos 3
+      . whenEqProp bs (B.pack [0xe2])
+        $ case f (B.pack [0x98]) of
+          E.IncompleteCodePoint t' pos' bs' f' -> whenEqProp pos' 3
+            . whenEqProp bs' (B.pack [0xe2, 0x98])
+              $ case f' (B.pack [0x83, 32, 0xFF]) of
+                E.InvalidWord t'' pos'' word8 f'' -> whenEqProp pos'' 7
+                  . whenEqProp word8 0xFF
+                    $ case f'' $ Just 'x' of
+                      E.Ok x -> "hi ☃ x" === t `T.append` t' `T.append` t'' `T.append` x
+                      _ -> counterexample "Should have been decoded text." False
+                _ -> counterexample "Should have been an invalid word." False
           _ -> counterexample "Should have encountered an incomplete code point." False
     _ -> counterexample "Should have encountered an incomplete code point." False
+t_decode_with_error5'' =
+  case E.streamDecodeUtf8With' (B.pack [104, 105, 32, 0xe2, 0x98, 104]) of -- not quite "hi ☃", the last byte is wrong
+    E.IncompleteCodePoint _ _ _ _ -> counterexample "Not incomplete, but an invalid word." False
+    E.Ok x -> counterexample ("What??? " ++ show x) False
+    E.InvalidWord _ pos mWord8 _ -> counterexample ("Invalid word " ++ show mWord8 ++ " at byte position " ++ show pos ++ ".") True
 
 t_infix_concat bs1 text bs2 =
   forAll (Blind <$> genDecodeErr Replace) $ \(Blind onErr) ->
@@ -293,6 +296,7 @@ testTranscoding =
       testProperty "t_decode_with_error2''" t_decode_with_error2'',
       testProperty "t_decode_with_error3''" t_decode_with_error3'',
       testProperty "t_decode_with_error4''" t_decode_with_error4'',
+      testProperty "t_decode_with_error5''" t_decode_with_error5'',
       testProperty "t_infix_concat" t_infix_concat
     ]
   ]
\ No newline at end of file

From 6b648eef0be84699add2561be08918c83992f94d Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Sun, 3 Jul 2022 14:07:43 -0600
Subject: [PATCH 09/87] Indentation error in StreamDecode data type.

---
 src/Data/Text/Encoding.hs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index 830952ad..e5b6d42e 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -338,8 +338,8 @@ streamDecodeUtf8With onErr = go empty . streamDecodeUtf8With'
 data StreamDecode
   -- | The 'ByteString' was decoded without issue.
   = Ok
-    -- | The decoded text.
-    !Text
+      -- | The decoded text.
+      !Text
   -- | An incomplete code point at the end of the 'ByteString'.
   | IncompleteCodePoint
       -- | The decoded text up to but not including the incomplete code

From bc72e4eda0af3ac6840920200b776313645c087d Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Sun, 3 Jul 2022 14:35:22 -0600
Subject: [PATCH 10/87] I see how it is, Haddock 8.x...

---
 src/Data/Text/Encoding.hs             | 44 +++++++++------------------
 src/Data/Text/Encoding/Error.hs       |  2 +-
 src/Data/Text/Lazy/Encoding.hs        |  2 +-
 tests/Tests/Properties/Transcoding.hs |  3 +-
 4 files changed, 19 insertions(+), 32 deletions(-)

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index e5b6d42e..ba16959d 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -336,35 +336,21 @@ streamDecodeUtf8With onErr = go empty . streamDecodeUtf8With'
 
 -- | A stream-oriented decoding result of one of three possibilities.
 data StreamDecode
-  -- | The 'ByteString' was decoded without issue.
-  = Ok
-      -- | The decoded text.
-      !Text
-  -- | An incomplete code point at the end of the 'ByteString'.
-  | IncompleteCodePoint
-      -- | The decoded text up to but not including the incomplete code
-      -- point.
-      !Text
-      -- | The position in the 'ByteString' where the code point
-      -- starts.
-      !Int
-      -- | The incomplete code point.
-      !ByteString
-      -- | A function that accepts another 'ByteString' as a
-      -- continuation of the previous input.
-      (ByteString -> StreamDecode)
-  -- | An invalid utf-8 'Word8'.
-  | InvalidWord
-      -- | The decoded text up to but not including the invalid
-      -- 'Word8'.
-      !Text
-      -- | The position in the 'ByteString' of the offending 'Word8'.
-      !Int
-      -- | The offender.
-      !Word8
-      -- | A function that accepts a possible 'Char' as to interpret
-      -- the 'Word8' in this specific occurrence of it.
-      (Maybe Char -> StreamDecode)
+  -- | The 'ByteString' was decoded without issue. The value contains the
+  -- resulting 'Text'.
+  = Ok !Text
+  -- | An incomplete code point at the end of the 'ByteString'. The value
+  -- contains the decoded text up to but not including the incomplete
+  -- code point, the position in the 'ByteString' where the code point
+  -- starts, the incomplete code point, and a function that accepts
+  -- another 'ByteString' as a continuation of the previous input.
+  | IncompleteCodePoint !Text !Int !ByteString (ByteString -> StreamDecode)
+  -- | An invalid utf-8 'Word8'. The value contains the decoded text up
+  -- to but not including the invalid 'Word8', the position in the
+  -- 'ByteString' of the offending 'Word8', the offender, and a
+  -- function that accepts a possible 'Char' as to interpret the
+  -- 'Word8' in this specific occurrence of it.
+  | InvalidWord !Text !Int !Word8 (Maybe Char -> StreamDecode)
 
 -- | Like 'streamDecodeUtf8With', but instead of accepting an
 -- 'OnDecodeError' callback and returning a 'Decoding', it returns a
diff --git a/src/Data/Text/Encoding/Error.hs b/src/Data/Text/Encoding/Error.hs
index 1dfbf03a..ea9e0997 100644
--- a/src/Data/Text/Encoding/Error.hs
+++ b/src/Data/Text/Encoding/Error.hs
@@ -117,4 +117,4 @@ ignore _ _ = Nothing
 
 -- | Replace an invalid input with a valid output.
 replace :: b -> OnError a b
-replace c _ _ = Just c
\ No newline at end of file
+replace c _ _ = Just c
diff --git a/src/Data/Text/Lazy/Encoding.hs b/src/Data/Text/Lazy/Encoding.hs
index dc098daa..ad361af5 100644
--- a/src/Data/Text/Lazy/Encoding.hs
+++ b/src/Data/Text/Lazy/Encoding.hs
@@ -250,4 +250,4 @@ encodeUtf32LE txt = B.fromChunks (foldrChunks ((:) . TE.encodeUtf32LE) [] txt)
 -- | Encode text using big endian UTF-32 encoding.
 encodeUtf32BE :: Text -> B.ByteString
 encodeUtf32BE txt = B.fromChunks (foldrChunks ((:) . TE.encodeUtf32BE) [] txt)
-{-# INLINE encodeUtf32BE #-}
\ No newline at end of file
+{-# INLINE encodeUtf32BE #-}
diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs
index e6ce765a..86d9cf02 100644
--- a/tests/Tests/Properties/Transcoding.hs
+++ b/tests/Tests/Properties/Transcoding.hs
@@ -299,4 +299,5 @@ testTranscoding =
       testProperty "t_decode_with_error5''" t_decode_with_error5'',
       testProperty "t_infix_concat" t_infix_concat
     ]
-  ]
\ No newline at end of file
+  ]
+  
\ No newline at end of file

From 8554b98fe456e235b97d373e0849e61497127fd4 Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Tue, 5 Jul 2022 15:28:04 -0600
Subject: [PATCH 11/87] Design change to StreamDecode

---
 src/Data/Text/Encoding.hs             | 97 +++++++++++++++------------
 tests/Tests/Properties/Transcoding.hs | 46 +++++++------
 2 files changed, 77 insertions(+), 66 deletions(-)

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index ba16959d..f9fc2a36 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -74,7 +74,7 @@ import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
 
 import Control.Exception (evaluate, try)
 import Control.Monad.ST (runST, ST)
-import Data.Bits (shiftR, (.&.))
+import Data.Bits (shiftL, shiftR, (.&.), (.|.))
 import Data.ByteString (ByteString)
 import qualified Data.ByteString as B
 import qualified Data.ByteString.Internal as B
@@ -328,38 +328,35 @@ streamDecodeUtf8With ::
 streamDecodeUtf8With onErr = go empty . streamDecodeUtf8With'
   where
     go t res = case res of
-      Ok txt -> Some (t `append` txt) mempty $ go empty . streamDecodeUtf8With'
-      IncompleteCodePoint txt _ bs f -> Some (t `append` txt) bs $ go empty . f
+      ThusFar txt _ bs f -> Some (t `append` txt) bs $ go empty . f
       InvalidWord txt _ word8 f -> go (t `append` txt) . f . onErr desc $ Just word8
 
     desc = "Data.Text.Internal.Encoding: Invalid UTF-8 stream"
 
 -- | A stream-oriented decoding result of one of three possibilities.
-data StreamDecode
-  -- | The 'ByteString' was decoded without issue. The value contains the
-  -- resulting 'Text'.
-  = Ok !Text
-  -- | An incomplete code point at the end of the 'ByteString'. The value
-  -- contains the decoded text up to but not including the incomplete
-  -- code point, the position in the 'ByteString' where the code point
-  -- starts, the incomplete code point, and a function that accepts
-  -- another 'ByteString' as a continuation of the previous input.
-  | IncompleteCodePoint !Text !Int !ByteString (ByteString -> StreamDecode)
-  -- | An invalid utf-8 'Word8'. The value contains the decoded text up
-  -- to but not including the invalid 'Word8', the position in the
-  -- 'ByteString' of the offending 'Word8', the offender, and a
-  -- function that accepts a possible 'Char' as to interpret the
-  -- 'Word8' in this specific occurrence of it.
-  | InvalidWord !Text !Int !Word8 (Maybe Char -> StreamDecode)
+data StreamDecode w
+  -- | The decoded 'Text' thus far with a possibly incomplete code
+  -- point at the end of the 'ByteString'. The value contains the
+  -- decoded text up to but not including the incomplete code point,
+  -- the position in the 'ByteString' where the code point starts,
+  -- the incomplete code point, and a function that accepts another
+  -- 'ByteString' as a continuation of the previous input.
+  = ThusFar !Text !Int !ByteString (ByteString -> StreamDecode w)
+  -- | An encounter witt an invalid utf-8 'Word8'. The value contains
+  -- the decoded text up to but not including the invalid 'Word8',
+  -- the position in the 'ByteString' of the offending 'Word8', the
+  -- offender, and a function that accepts a possible 'Char' as to
+  -- interpret the 'Word8' in this specific occurrence of it.
+  | InvalidWord !Text !Int !w (Maybe Char -> StreamDecode w)
 
 -- | Like 'streamDecodeUtf8With', but instead of accepting an
 -- 'OnDecodeError' callback and returning a 'Decoding', it returns a
 -- 'StreamDecode'.
-streamDecodeUtf8With' :: ByteString -> StreamDecode
-streamDecodeUtf8With' bs = decodeWithPossibleStartChar Nothing 0 bs mempty
+streamDecodeUtf8With' :: ByteString -> StreamDecode Word8
+streamDecodeUtf8With' bs = decodeAtOffset Nothing 0 bs mempty
   where
-    decodeWithPossibleStartChar :: Maybe Char -> Int -> ByteString -> ByteString -> StreamDecode
-    decodeWithPossibleStartChar mStartChar bytePos bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do
+    decodeAtOffset :: Maybe Char -> Int -> ByteString -> ByteString -> StreamDecode Word8
+    decodeAtOffset mStartChar bytePos bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do
       marr <- A.new len'
       case mStartChar of
         Just c -> do
@@ -367,6 +364,7 @@ streamDecodeUtf8With' bs = decodeWithPossibleStartChar Nothing 0 bs mempty
           outer marr len' 0 d
         _ -> outer marr len' 0 0
       where
+        wordByteSize = 1
         len = len1 + len2
         len' = len + 4
 
@@ -396,14 +394,11 @@ streamDecodeUtf8With' bs = decodeWithPossibleStartChar Nothing 0 bs mempty
               | i < len = step (i + 1) $ utf8DecodeContinue (index i) a b
             step _ st = st
 
-        outer :: forall s. A.MArray s -> Int -> Int -> Int -> ST s StreamDecode
+        outer :: forall s. A.MArray s -> Int -> Int -> Int -> ST s (StreamDecode Word8)
         outer dst dstLen = inner
             where
               inner srcOff dstOff
-                | srcOff >= len = do
-                  A.shrinkM dst dstOff
-                  arr <- A.unsafeFreeze dst
-                  pure . Ok $ Text arr 0 dstOff
+                | srcOff >= len = thusFar
 
                 | srcOff >= len1
                 , srcOff < len1 + guessUtf8Boundary
@@ -420,26 +415,40 @@ streamDecodeUtf8With' bs = decodeWithPossibleStartChar Nothing 0 bs mempty
                   outer dst' dstLen' srcOff dstOff
 
                 | otherwise =
-                  let contin off res = do
-                        A.shrinkM dst dstOff
-                        arr <- A.unsafeFreeze dst
-                        let bs' = if off >= len1
-                              then B.drop (off - len1) bs2
-                              else B.drop off $ bs1 `B.append` bs2
-                        pure $ res arr bs'
-                  in
                   case decodeFrom srcOff of
                     Accept c -> do
                       d <- unsafeWrite dst dstOff c
                       inner (srcOff + d) $ dstOff + d
-                    Reject -> let srcOff' = srcOff + 1 in
-                      contin srcOff' $ \ arr bs' ->
-                        InvalidWord (Text arr 0 dstOff) (bytePos + srcOff) (index srcOff) $ \ mChar ->
-                          decodeWithPossibleStartChar mChar (bytePos + srcOff') bs' mempty
-                    Incomplete{} -> let bytePos' = bytePos + srcOff in
-                      contin srcOff $ \ arr bs' ->
-                        IncompleteCodePoint (Text arr 0 dstOff) bytePos' bs' $
-                          decodeWithPossibleStartChar Nothing bytePos' bs'
+                    Reject -> invalid
+                    Incomplete{} -> thusFar
+                where
+                  wrapUp f = do
+                    A.shrinkM dst dstOff
+                    arr <- A.unsafeFreeze dst
+                    f $ Text arr 0 dstOff
+                  continue off res = wrapUp $ \ t ->
+                    pure . res t $ if off >= len1
+                      then
+                        if off >= len
+                          then B.empty
+                          else B.drop (off - len1) bs2
+                      else B.drop off $ bs1 `B.append` bs2
+                  thusFar =
+                    let bytePos' = bytePos + srcOff in
+                    continue srcOff $ \ t bs' ->
+                      ThusFar t bytePos' bs' $
+                        decodeAtOffset Nothing bytePos' bs'
+                  invalid =
+                    let srcOff' = srcOff + wordByteSize
+                        bytesToWord n word =
+                          if n > 0
+                            then bytesToWord (n - 1) $
+                              (fromIntegral . index $ srcOff + wordByteSize - n) .|. (word `shiftL` 8)
+                            else word
+                    in
+                    continue srcOff' $ \ t bs' ->
+                      InvalidWord t (bytePos + srcOff) (bytesToWord wordByteSize 0) $ \ mChar' ->
+                        decodeAtOffset mChar' (bytePos + srcOff') bs' mempty
 
 -- | Decode a 'ByteString' containing UTF-8 encoded text that is known
 -- to be valid.
diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs
index 86d9cf02..f18c0694 100644
--- a/tests/Tests/Properties/Transcoding.hs
+++ b/tests/Tests/Properties/Transcoding.hs
@@ -209,43 +209,45 @@ whenEqProp a b next = if a == b
   then next
   else a === b
 
-t_decode_with_error2'' =
-  case E.streamDecodeUtf8With' (B.pack [97, 0xC2, 97]) of
+t_decode_utf8_with_error1 =
+  case E.streamDecodeUtf8With' $ B.pack [97, 0xC2, 97] of
     E.InvalidWord t pos word8 f -> whenEqProp pos 1
       . whenEqProp word8 0xC2
         $ case f $ Just 'x' of
-          E.Ok x -> t `T.append` x === "axa"
+          E.ThusFar x pos' bs _ -> whenEqProp (t `T.append` x) "axa"
+            . whenEqProp bs B.empty $ pos' === 3
           _ -> counterexample "Should have recovered from the invalid word (\\xC2)" False
     _ -> counterexample "The second word (\\xC2) should have been invalid" False
-t_decode_with_error3'' =
-  case E.streamDecodeUtf8With' (B.pack [97, 0xC2, 97]) of
+t_decode_utf8_with_error2 =
+  case E.streamDecodeUtf8With' $ B.pack [97, 0xC2, 97] of
     E.InvalidWord t pos word8 f -> whenEqProp pos 1
       . whenEqProp word8 0xC2
         $ case f Nothing of
-          E.Ok x -> t `T.append` x === "aa"
+          E.ThusFar x pos' bs _ -> whenEqProp (t `T.append` x) "aa"
+            . whenEqProp bs B.empty $ pos' === 3
           _ -> counterexample "Should have recovered from the invalid word (\\xC2)" False
     _ -> counterexample "The second word (\\xC2) should have been invalid" False
-t_decode_with_error4'' =
-  case E.streamDecodeUtf8With' (B.pack [104, 105, 32, 0xe2]) of -- hi \xe2
-    E.IncompleteCodePoint t pos bs f -> whenEqProp pos 3
+t_decode_utf8_with_error3 =
+  case E.streamDecodeUtf8With' $ B.pack [104, 105, 32, 0xe2] of -- hi \xe2
+    E.ThusFar t pos bs f -> whenEqProp pos 3
       . whenEqProp bs (B.pack [0xe2])
-        $ case f (B.pack [0x98]) of
-          E.IncompleteCodePoint t' pos' bs' f' -> whenEqProp pos' 3
+        $ case f $ B.pack [0x98] of
+          E.ThusFar t' pos' bs' f' -> whenEqProp pos' 3
             . whenEqProp bs' (B.pack [0xe2, 0x98])
-              $ case f' (B.pack [0x83, 32, 0xFF]) of
+              $ case f' $ B.pack [0x83, 32, 0xFF] of
                 E.InvalidWord t'' pos'' word8 f'' -> whenEqProp pos'' 7
                   . whenEqProp word8 0xFF
                     $ case f'' $ Just 'x' of
-                      E.Ok x -> "hi ☃ x" === t `T.append` t' `T.append` t'' `T.append` x
+                      E.ThusFar x pos''' bs'' _ -> whenEqProp (t `T.append` t' `T.append` t'' `T.append` x) "hi ☃ x"
+                        . whenEqProp pos''' 8 $ bs'' === B.empty
                       _ -> counterexample "Should have been decoded text." False
                 _ -> counterexample "Should have been an invalid word." False
           _ -> counterexample "Should have encountered an incomplete code point." False
     _ -> counterexample "Should have encountered an incomplete code point." False
-t_decode_with_error5'' =
-  case E.streamDecodeUtf8With' (B.pack [104, 105, 32, 0xe2, 0x98, 104]) of -- not quite "hi ☃", the last byte is wrong
-    E.IncompleteCodePoint _ _ _ _ -> counterexample "Not incomplete, but an invalid word." False
-    E.Ok x -> counterexample ("What??? " ++ show x) False
-    E.InvalidWord _ pos mWord8 _ -> counterexample ("Invalid word " ++ show mWord8 ++ " at byte position " ++ show pos ++ ".") True
+t_decode_utf8_with_error4 =
+  case E.streamDecodeUtf8With' $ B.pack [104, 105, 32, 0xe2, 0x98, 104] of -- not quite "hi ☃", the last byte is wrong
+    E.ThusFar _ _ _ _ -> counterexample "Not incomplete, but an invalid word." False
+    E.InvalidWord t pos word8 _ -> whenEqProp t "hi " . whenEqProp pos 3 $ word8 === 0xe2
 
 t_infix_concat bs1 text bs2 =
   forAll (Blind <$> genDecodeErr Replace) $ \(Blind onErr) ->
@@ -293,10 +295,10 @@ testTranscoding =
       testProperty "t_decode_with_error3'" t_decode_with_error3',
       testProperty "t_decode_with_error4'" t_decode_with_error4',
       testProperty "t_decode_with_error5'" t_decode_with_error5',
-      testProperty "t_decode_with_error2''" t_decode_with_error2'',
-      testProperty "t_decode_with_error3''" t_decode_with_error3'',
-      testProperty "t_decode_with_error4''" t_decode_with_error4'',
-      testProperty "t_decode_with_error5''" t_decode_with_error5'',
+      testProperty "t_decode_utf8_with_error1" t_decode_utf8_with_error1,
+      testProperty "t_decode_utf8_with_error2" t_decode_utf8_with_error2,
+      testProperty "t_decode_utf8_with_error3" t_decode_utf8_with_error3,
+      testProperty "t_decode_utf8_with_error4" t_decode_utf8_with_error4,
       testProperty "t_infix_concat" t_infix_concat
     ]
   ]

From 1bd9ef016459e8fed3ed9d820d66d7e6dc450e85 Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Wed, 6 Jul 2022 14:19:20 -0600
Subject: [PATCH 12/87] Stream decoding added for utf-16 and utf-32. Either
 function added for decoding ASCII.

---
 src/Data/Text/Encoding.hs                | 260 +++++++++++++++--------
 src/Data/Text/Internal/Encoding/Utf16.hs |  38 +++-
 src/Data/Text/Internal/Encoding/Utf32.hs |  32 +++
 tests/Tests/Properties/Transcoding.hs    |  80 +++++++
 text.cabal                               |   1 -
 5 files changed, 322 insertions(+), 89 deletions(-)

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index f9fc2a36..fecf8adb 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -35,6 +35,7 @@ module Data.Text.Encoding
     , decodeUtf8'
 
     -- *** Controllable error handling
+    , decodeAsciiE
     , decodeUtf8With
     , decodeUtf16LEWith
     , decodeUtf16BEWith
@@ -46,6 +47,8 @@ module Data.Text.Encoding
     , streamDecodeUtf8
     , streamDecodeUtf8With
     , streamDecodeUtf8With'
+    , streamDecodeUtf16With
+    , streamDecodeUtf32With
     , Decoding(..)
     , StreamDecode(..)
 
@@ -74,7 +77,7 @@ import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
 
 import Control.Exception (evaluate, try)
 import Control.Monad.ST (runST, ST)
-import Data.Bits (shiftL, shiftR, (.&.), (.|.))
+import Data.Bits (Bits, shiftL, shiftR, (.&.), (.|.))
 import Data.ByteString (ByteString)
 import qualified Data.ByteString as B
 import qualified Data.ByteString.Internal as B
@@ -85,7 +88,7 @@ import Data.Text.Internal.Unsafe (unsafeWithForeignPtr)
 import Data.Text.Internal.Unsafe.Char (unsafeWrite)
 import Data.Text.Show as T (singleton)
 import Data.Text.Unsafe (unsafeDupablePerformIO)
-import Data.Word (Word8)
+import Data.Word (Word8, Word16, Word32)
 import Foreign.C.Types (CSize(..))
 import Foreign.Ptr (Ptr, minusPtr, plusPtr)
 import Foreign.Storable (poke, peekByteOff)
@@ -96,6 +99,8 @@ import qualified Data.ByteString.Builder.Internal as B hiding (empty, append)
 import qualified Data.ByteString.Builder.Prim as BP
 import qualified Data.ByteString.Builder.Prim.Internal as BP
 import Data.Text.Internal.Encoding.Utf8 (utf8DecodeStart, utf8DecodeContinue, DecoderResult(..))
+import Data.Text.Internal.Encoding.Utf16 (Utf16Result(..), queryUtf16Bytes)
+import Data.Text.Internal.Encoding.Utf32 (queryUtf32Bytes)
 import qualified Data.Text.Array as A
 import qualified Data.Text.Internal.Encoding.Fusion as E
 import qualified Data.Text.Internal.Fusion as F
@@ -213,6 +218,20 @@ isValidBS bs = start 0
 #endif
 #endif
 
+-- | Decode a 'ByteString' containing 7-bit ASCII encoded text.
+--
+-- This is a total function: On success the decoded 'Text' is within a
+-- 'Right' value, and an error ('Left' 'Int') indicates the postion of
+-- the offending 'Word8'.
+decodeAsciiE :: ByteString -> Either Int Text
+decodeAsciiE bs = withBS bs $ \fp len -> if len == 0 then Right empty else runST $ do
+  asciiPrefixLen <- fmap cSizeToInt $ unsafeIOToST $ unsafeWithForeignPtr fp $ \src ->
+    c_is_ascii src (src `plusPtr` len)
+  pure $ if asciiPrefixLen == len
+  then let !(SBS.SBS arr) = SBS.toShort bs in
+        Right (Text (A.ByteArray arr) 0 len)
+  else Left asciiPrefixLen
+
 -- | Decode a 'ByteString' containing UTF-8 encoded text.
 --
 -- Surrogate code points in replacement character returned by 'OnDecodeError'
@@ -349,13 +368,20 @@ data StreamDecode w
   -- interpret the 'Word8' in this specific occurrence of it.
   | InvalidWord !Text !Int !w (Maybe Char -> StreamDecode w)
 
--- | Like 'streamDecodeUtf8With', but instead of accepting an
--- 'OnDecodeError' callback and returning a 'Decoding', it returns a
--- 'StreamDecode'.
-streamDecodeUtf8With' :: ByteString -> StreamDecode Word8
-streamDecodeUtf8With' bs = decodeAtOffset Nothing 0 bs mempty
+data Progression
+  = WriteAndAdvance Char (Int -> Int)
+  | NeedMore
+  | Invalid
+
+streamDecodeWith :: (Bits w, Num w)
+  => Int
+  -> Bool
+  -> ByteString
+  -> ((Int -> Word8) -> Int -> Int -> Progression)
+  -> StreamDecode w
+streamDecodeWith wordByteSize isUtf bs decodeF = decodeAtOffset Nothing 0 bs mempty
   where
-    decodeAtOffset :: Maybe Char -> Int -> ByteString -> ByteString -> StreamDecode Word8
+    decodeAtOffset :: (Bits w, Num w) => Maybe Char -> Int -> ByteString -> ByteString -> StreamDecode w
     decodeAtOffset mStartChar bytePos bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do
       marr <- A.new len'
       case mStartChar of
@@ -364,91 +390,153 @@ streamDecodeUtf8With' bs = decodeAtOffset Nothing 0 bs mempty
           outer marr len' 0 d
         _ -> outer marr len' 0 0
       where
-        wordByteSize = 1
-        len = len1 + len2
-        len' = len + 4
-
+        index :: Int -> Word8
         index i
           | i < len1  = B.index bs1 i
           | otherwise = B.index bs2 $ i - len1
 
-        -- We need Data.ByteString.findIndexEnd, but it is unavailable before bytestring-0.10.12.0
-        guessUtf8Boundary :: Int
-        guessUtf8Boundary
-          | len2 >= 1 && w0 <  0x80 = len2     -- last char is ASCII
-          | len2 >= 1 && w0 >= 0xC0 = len2 - 1 -- last char starts a code point
-          | len2 >= 2 && w1 >= 0xC0 = len2 - 2 -- pre-last char starts a code point
-          | len2 >= 3 && w2 >= 0xC0 = len2 - 3
-          | len2 >= 4 && w3 >= 0xC0 = len2 - 4
-          | otherwise = 0
+        len :: Int
+        len = len1 + len2
+        len' :: Int
+        len' = (len `div` wordByteSize) + 4
+
+        queryutf8Optim srcOff dstOff dstLen dst f g
+          -- shortcut for utf-8
+          | wordByteSize == 1
+          , isUtf
+          , srcOff >= len1
+          -- potential valid utf8 content endpoint
+          , utf8End <- len1 + guessUtf8Boundary
+          , srcOff < utf8End
+          -- potential valid utf8 content length
+          , utf8Len <- utf8End - srcOff
+          -- potential endpoint in destination array if copied
+          , dstOff' <- dstOff + utf8Len
+          , dstOff' <= dstLen
+          , bs' <- B.drop (srcOff - len1) $ B.take guessUtf8Boundary bs2
+          , isValidBS bs' = do
+            withBS bs' $ \fp _ -> unsafeIOToST $ unsafeWithForeignPtr fp $ \src ->
+              unsafeSTToIO $ A.copyFromPointer dst dstOff src utf8Len
+            f utf8End dstOff'
+          | otherwise = g
           where
-            w0 = B.index bs2 (len2 - 1)
-            w1 = B.index bs2 (len2 - 2)
-            w2 = B.index bs2 (len2 - 3)
-            w3 = B.index bs2 (len2 - 4)
+            -- We need Data.ByteString.findIndexEnd, but it is unavailable before bytestring-0.10.12.0
+            guessUtf8Boundary :: Int
+            guessUtf8Boundary
+              | len2 >= 1 && w0 <  0x80 = len2     -- last char is ASCII
+              | len2 >= 1 && w0 >= 0xC0 = len2 - 1 -- last char starts a code point
+              | len2 >= 2 && w1 >= 0xC0 = len2 - 2 -- pre-last char starts a code point
+              | len2 >= 3 && w2 >= 0xC0 = len2 - 3
+              | len2 >= 4 && w3 >= 0xC0 = len2 - 4
+              | otherwise = 0
+              where
+                w0 = B.index bs2 (len2 - 1)
+                w1 = B.index bs2 (len2 - 2)
+                w2 = B.index bs2 (len2 - 3)
+                w3 = B.index bs2 (len2 - 4)
 
-        decodeFrom :: Int -> DecoderResult
-        decodeFrom off = step (off + 1) . utf8DecodeStart $ index off
+        outer :: (Bits w, Num w) => A.MArray s -> Int -> Int -> Int -> ST s (StreamDecode w)
+        outer dst dstLen = inner
           where
-            step i (Incomplete a b)
-              | i < len = step (i + 1) $ utf8DecodeContinue (index i) a b
-            step _ st = st
+            inner srcOff dstOff
+              -- finished
+              | len - srcOff < 1 = incomplete
+              -- shortcut for utf-8
+              | otherwise = queryutf8Optim srcOff dstOff dstLen dst inner $
+                if len - srcOff < wordByteSize
+                  -- incomplete code point
+                  then incomplete
+                  else
+                    if dstOff + 4 > dstLen
+                      -- need more space in destination
+                      then do
+                        let dstLen' = dstLen + 4
+                        dst' <- A.resizeM dst dstLen'
+                        outer dst' dstLen' srcOff dstOff
+                      else
+                        case decodeF index len srcOff of
+                          WriteAndAdvance c advance -> do
+                            d <- unsafeWrite dst dstOff c
+                            inner (advance d) $ dstOff + d
+                          NeedMore -> incomplete
+                          Invalid -> invalid
+              where
+                wrapUp f = do
+                  A.shrinkM dst dstOff
+                  arr <- A.unsafeFreeze dst
+                  f $ Text arr 0 dstOff
+                contin off res = wrapUp $ \ t ->
+                  pure . res t $ if off >= len1
+                    then B.drop (off - len1) bs2
+                    else B.drop off $ bs1 `B.append` bs2
+                incomplete =
+                  let bytePos' = bytePos + srcOff in
+                  contin srcOff $ \ t bs' ->
+                    ThusFar t bytePos' bs' $
+                      decodeAtOffset Nothing bytePos' bs'
+                invalid =
+                  let srcOff' = srcOff + wordByteSize
+                      bytesToWord n word =
+                        if n > 0
+                          then bytesToWord (n - 1) $ (fromIntegral . index $ srcOff + wordByteSize - n) .|. (word `shiftL` 8)
+                          else word
+                  in
+                  contin srcOff' $ \ t bs' ->
+                    InvalidWord t (bytePos + srcOff) (bytesToWord wordByteSize 0) $ \ mChar' ->
+                      decodeAtOffset mChar' (bytePos + srcOff') bs' mempty
 
-        outer :: forall s. A.MArray s -> Int -> Int -> Int -> ST s (StreamDecode Word8)
-        outer dst dstLen = inner
-            where
-              inner srcOff dstOff
-                | srcOff >= len = thusFar
-
-                | srcOff >= len1
-                , srcOff < len1 + guessUtf8Boundary
-                , dstOff + (len1 + guessUtf8Boundary - srcOff) <= dstLen
-                , bs' <- B.drop (srcOff - len1) $ B.take guessUtf8Boundary bs2
-                , isValidBS bs' = do
-                  withBS bs' $ \fp _ -> unsafeIOToST $ unsafeWithForeignPtr fp $ \src ->
-                    unsafeSTToIO . A.copyFromPointer dst dstOff src $ len1 + guessUtf8Boundary - srcOff
-                  inner (len1 + guessUtf8Boundary) $ dstOff + (len1 + guessUtf8Boundary - srcOff)
-
-                | dstOff + 4 > dstLen = do
-                  let dstLen' = dstLen + 4
-                  dst' <- A.resizeM dst dstLen'
-                  outer dst' dstLen' srcOff dstOff
-
-                | otherwise =
-                  case decodeFrom srcOff of
-                    Accept c -> do
-                      d <- unsafeWrite dst dstOff c
-                      inner (srcOff + d) $ dstOff + d
-                    Reject -> invalid
-                    Incomplete{} -> thusFar
-                where
-                  wrapUp f = do
-                    A.shrinkM dst dstOff
-                    arr <- A.unsafeFreeze dst
-                    f $ Text arr 0 dstOff
-                  continue off res = wrapUp $ \ t ->
-                    pure . res t $ if off >= len1
-                      then
-                        if off >= len
-                          then B.empty
-                          else B.drop (off - len1) bs2
-                      else B.drop off $ bs1 `B.append` bs2
-                  thusFar =
-                    let bytePos' = bytePos + srcOff in
-                    continue srcOff $ \ t bs' ->
-                      ThusFar t bytePos' bs' $
-                        decodeAtOffset Nothing bytePos' bs'
-                  invalid =
-                    let srcOff' = srcOff + wordByteSize
-                        bytesToWord n word =
-                          if n > 0
-                            then bytesToWord (n - 1) $
-                              (fromIntegral . index $ srcOff + wordByteSize - n) .|. (word `shiftL` 8)
-                            else word
-                    in
-                    continue srcOff' $ \ t bs' ->
-                      InvalidWord t (bytePos + srcOff) (bytesToWord wordByteSize 0) $ \ mChar' ->
-                        decodeAtOffset mChar' (bytePos + srcOff') bs' mempty
+-- | Like 'streamDecodeUtf8With', but instead of accepting an
+-- 'OnDecodeError' callback and returning a 'Decoding', it returns a
+-- 'StreamDecode'.
+streamDecodeUtf8With' :: ByteString -> StreamDecode Word8
+streamDecodeUtf8With' bs = streamDecodeWith 1 True bs $ \ index len srcOff ->
+  let decodeFrom off = step (off + 1) . utf8DecodeStart $ index off
+        where
+          step i (Incomplete a b)
+            | i < len = step (i + 1) $ utf8DecodeContinue (index i) a b
+          step _ st = st
+  in
+  case decodeFrom srcOff of
+    Accept c -> WriteAndAdvance c (srcOff +)
+    Reject -> Invalid
+    Incomplete{} -> NeedMore
+
+-- | Like 'streamDecodeUtf8With'', but for UTF-16 encoding and a
+-- 'Bool' argument for whether the encoding is big- or little-endian.
+streamDecodeUtf16With :: Bool -> ByteString -> StreamDecode Word16
+streamDecodeUtf16With isBE bs = streamDecodeWith 2 True bs $ \ index len srcOff ->
+  -- get next Word8 pair
+  let writeAndAdvance c n = WriteAndAdvance c $ const n
+      b0 = index $ if isBE then srcOff else srcOff + 1
+      b1 = index $ if isBE then srcOff + 1 else srcOff
+  in
+  case queryUtf16Bytes b0 of
+    OneWord16 f -> writeAndAdvance (f b1) $ srcOff + 2
+    TwoWord16 g ->
+      if len - srcOff < 4
+        -- not enough Word8s to finish the code point
+        then NeedMore
+        else
+          let b2 = index $ srcOff + (if isBE then 2 else 3)
+              b3 = index $ srcOff + (if isBE then 3 else 2)
+          in
+          case g b2 of
+            Just f' -> writeAndAdvance (f' b1 b3) $ srcOff + 4
+            _ -> Invalid
+    _ -> Invalid
+
+-- | Like 'streamDecodeUtf16With', but for UTF-32.
+streamDecodeUtf32With :: Bool -> ByteString -> StreamDecode Word32
+streamDecodeUtf32With isBE bs = streamDecodeWith 4 True bs $ \ index _ srcOff ->
+  -- get next Word8 quartet
+  let writeAndAdvance c n = WriteAndAdvance c $ const n in
+  case queryUtf32Bytes . index $ if isBE then srcOff else srcOff + 3 of
+    Just f -> case f . index $ srcOff + (if isBE then 1 else 2) of
+      Just f' -> case f' . index $ srcOff + (if isBE then 2 else 1) of
+        Just f'' -> writeAndAdvance (f'' . index $ if isBE then srcOff + 3 else srcOff) $ srcOff + 4
+        _ -> Invalid
+      _ -> Invalid
+    _ -> Invalid
 
 -- | Decode a 'ByteString' containing UTF-8 encoded text that is known
 -- to be valid.
@@ -657,4 +745,4 @@ cSizeToInt = fromIntegral
 #ifdef SIMDUTF
 foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8
     :: Ptr Word8 -> CSize -> IO CInt
-#endif
\ No newline at end of file
+#endif
diff --git a/src/Data/Text/Internal/Encoding/Utf16.hs b/src/Data/Text/Internal/Encoding/Utf16.hs
index 4fe11a62..34bc78dd 100644
--- a/src/Data/Text/Internal/Encoding/Utf16.hs
+++ b/src/Data/Text/Internal/Encoding/Utf16.hs
@@ -23,14 +23,17 @@ module Data.Text.Internal.Encoding.Utf16
       chr2
     , validate1
     , validate2
+    , Utf16Result(..)
+    , queryUtf16Bytes
     ) where
 
+import Data.Bits ((.&.))
 import GHC.Exts
-import GHC.Word (Word16(..))
+import GHC.Word (Word16(..), Word8(..))
 
 #if !MIN_VERSION_base(4,16,0)
 -- harmless to import, except for warnings that it is unused.
-import Data.Text.Internal.PrimCompat ( word16ToWord# )
+import Data.Text.Internal.PrimCompat ( word16ToWord#, word8ToWord# )
 #endif
 
 chr2 :: Word16 -> Word16 -> Char
@@ -50,3 +53,34 @@ validate2       ::  Word16 -> Word16 -> Bool
 validate2 x1 x2 = x1 >= 0xD800 && x1 <= 0xDBFF &&
                   x2 >= 0xDC00 && x2 <= 0xDFFF
 {-# INLINE validate2 #-}
+
+data Utf16Result
+  = OneWord16 (Word8 -> Char)
+  | TwoWord16 (Word8 -> Maybe (Word8 -> Word8 -> Char))
+  | Invalid16
+
+-- queryUtf16Bytes :: Word8 -> (Word8 -> Word8 -> a)
+--   -> Utf16Result a
+queryUtf16Bytes b0@(W8# w0#) =
+  if b0 < 0xD8 || b0 >= 0xE0
+    then OneWord16 $ \ (W8# w1#) -> C# (chr# (orI# (word2Int# (shiftL# (word8ToWord# w0#) 8#)) (word2Int# (word8ToWord# w1#))))
+    else
+      -- 110110xx: start of surrogate pair
+      if b0 .&. 0xFC == 0xD8
+        then TwoWord16 $ \ b2@(W8# w2#) ->
+          if b2 .&. 0xFC == 0xDC
+            -- valid surrogate
+            then Just $ \ (W8# w1#) (W8# w3#) ->
+              C# (chr# (
+                (orI#
+                  (orI#
+                    (orI#
+                      (word2Int# (shiftL# (int2Word# (andI# 0x3# (word2Int# (word8ToWord# w0#)))) 18#))
+                      (word2Int# (shiftL# (word8ToWord# w1#) 10#))
+                    )
+                    (word2Int# (shiftL# (int2Word# (andI# 0x3# (word2Int# (word8ToWord# w2#)))) 8#)))
+                  (word2Int# (word8ToWord# w3#))) +# 0x10000#
+              ))
+            else Nothing
+        else Invalid16
+{-# INLINE queryUtf16Bytes #-}
diff --git a/src/Data/Text/Internal/Encoding/Utf32.hs b/src/Data/Text/Internal/Encoding/Utf32.hs
index 4e8e9b46..517ff05a 100644
--- a/src/Data/Text/Internal/Encoding/Utf32.hs
+++ b/src/Data/Text/Internal/Encoding/Utf32.hs
@@ -1,3 +1,5 @@
+{-# LANGUAGE CPP, MagicHash #-}
+
 -- |
 -- Module      : Data.Text.Internal.Encoding.Utf32
 -- Copyright   : (c) 2008, 2009 Tom Harper,
@@ -17,10 +19,40 @@
 module Data.Text.Internal.Encoding.Utf32
     (
       validate
+    , queryUtf32Bytes
     ) where
 
 import Data.Word (Word32)
+import GHC.Exts
+import GHC.Word (Word8(..))
+
+#if !MIN_VERSION_base(4,16,0)
+-- harmless to import, except for warnings that it is unused.
+import Data.Text.Internal.PrimCompat (word8ToWord#)
+#endif
 
 validate    :: Word32 -> Bool
 validate x1 = x1 < 0xD800 || (x1 > 0xDFFF && x1 <= 0x10FFFF)
 {-# INLINE validate #-}
+
+queryUtf32Bytes :: (Eq a, Num a) => a -> Maybe (Word8 -> Maybe (Word8 -> Maybe (Word8 -> Char)))
+queryUtf32Bytes b0 =
+  if b0 == 0
+    then Just $ \ b1@(W8# w1#) ->
+      if b1 < 0x11
+        then Just $ \ b2@(W8# w2#) ->
+          if b1 > 0 || b2 < 0xD8 || b2 >= 0xE0
+            then Just $ \ (W8# w3#) ->
+              C# (chr#
+                  (orI#
+                    (orI#
+                      (word2Int# (shiftL# (word8ToWord# w1#) 16#))
+                      (word2Int# (shiftL# (word8ToWord# w2#) 8#))
+                    )
+                    (word2Int# (word8ToWord# w3#))
+                  )
+                )
+            else Nothing
+        else Nothing
+    else Nothing
+{-# INLINE queryUtf32Bytes #-}
diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs
index f18c0694..e6172b47 100644
--- a/tests/Tests/Properties/Transcoding.hs
+++ b/tests/Tests/Properties/Transcoding.hs
@@ -27,6 +27,9 @@ import qualified Data.Text.Encoding.Error as E
 import qualified Data.Text.Lazy as TL
 import qualified Data.Text.Lazy.Encoding as EL
 
+t_asciiE t    = E.decodeAsciiE (E.encodeUtf8 a) === Right a
+    where a  = T.map (\c -> chr (ord c `mod` 128)) t
+
 t_ascii t    = E.decodeASCII (E.encodeUtf8 a) === a
     where a  = T.map (\c -> chr (ord c `mod` 128)) t
 tl_ascii t   = EL.decodeASCII (EL.encodeUtf8 a) === a
@@ -249,6 +252,78 @@ t_decode_utf8_with_error4 =
     E.ThusFar _ _ _ _ -> counterexample "Not incomplete, but an invalid word." False
     E.InvalidWord t pos word8 _ -> whenEqProp t "hi " . whenEqProp pos 3 $ word8 === 0xe2
 
+t_decode_utf16BE_with_error =
+  case E.streamDecodeUtf16With True $ B.pack [0] of
+    E.ThusFar t pos bs f -> whenEqProp pos 0
+      . whenEqProp bs (B.pack [0])
+        $ case f $ B.pack [104, 0, 105, 0, 32, 0xD8, 0x01] of
+          E.ThusFar t' pos' bs' f' -> whenEqProp pos' 6
+            . whenEqProp bs' (B.pack [0xD8, 0x01])
+              $ case f' $ B.pack [0xDC, 0x37, 0, 32, 0xDC, 0] of
+                E.InvalidWord t'' pos'' word16 f'' -> whenEqProp pos'' 12
+                  . whenEqProp word16 (0xDC `Bits.shiftL` 8)
+                    $ case f'' $ Just 'x' of
+                      E.ThusFar x pos''' bs'' _ -> whenEqProp (t `T.append` t' `T.append` t'' `T.append` x) "hi \x10437 x"
+                        . whenEqProp pos''' 14 $ bs'' === B.empty
+                      _ -> counterexample "Should have been decoded text." False
+                _ -> counterexample "Should have been an invalid word." False
+          _ -> counterexample "Should have encountered an incomplete code point." False
+    _ -> counterexample "Should have encountered an incomplete code point." False
+
+t_decode_utf16LE_with_error =
+  case E.streamDecodeUtf16With False $ B.pack [104] of
+    E.ThusFar t pos bs f -> whenEqProp pos 0
+      . whenEqProp bs (B.pack [104])
+        $ case f $ B.pack [0, 105, 0, 32, 0, 0x01, 0xD8] of
+          E.ThusFar t' pos' bs' f' -> whenEqProp pos' 6
+            . whenEqProp bs' (B.pack [0x01, 0xD8])
+              $ case f' $ B.pack [0x37, 0xDC, 32, 0, 0, 0xDC] of
+                E.InvalidWord t'' pos'' word16 f'' -> whenEqProp pos'' 12
+                  . whenEqProp word16 0xDC
+                    $ case f'' $ Just 'x' of
+                      E.ThusFar x pos''' bs'' _ -> whenEqProp (t `T.append` t' `T.append` t'' `T.append` x) "hi \x10437 x"
+                        . whenEqProp pos''' 14 $ bs'' === B.empty
+                      _ -> counterexample "Should have been decoded text." False
+                _ -> counterexample "Should have been an invalid word." False
+          _ -> counterexample "Should have encountered an incomplete code point." False
+    _ -> counterexample "Should have encountered an incomplete code point." False
+
+t_decode_utf32BE_with_error =
+  case E.streamDecodeUtf32With True $ B.pack [0, 0, 0, 104, 0, 0, 0, 105, 0, 0] of -- hi \xe2
+    E.ThusFar t pos bs f -> whenEqProp pos 8
+      . whenEqProp bs (B.pack [0, 0])
+        $ case f $ B.pack [0, 32, 0, 0, 0x26] of
+          E.ThusFar t' pos' bs' f' -> whenEqProp pos' 12
+            . whenEqProp bs' (B.pack [0, 0, 0x26])
+              $ case f' $ B.pack [0x03, 0, 0, 0, 32, 0, 0, 0xD8, 0] of
+                E.InvalidWord t'' pos'' word32 f'' -> whenEqProp pos'' 20
+                  . whenEqProp word32 0x0000D800
+                    $ case f'' $ Just 'x' of
+                      E.ThusFar x pos''' bs'' _ -> whenEqProp (t `T.append` t' `T.append` t'' `T.append` x) "hi ☃ x"
+                        . whenEqProp pos''' 24 $ bs'' === B.empty
+                      _ -> counterexample "Should have been decoded text." False
+                _ -> counterexample "Should have been an invalid word." False
+          _ -> counterexample "Should have encountered an incomplete code point." False
+    _ -> counterexample "Should have encountered an incomplete code point." False
+
+t_decode_utf32LE_with_error =
+  case E.streamDecodeUtf32With False $ B.pack [104, 0, 0, 0, 105, 0, 0, 0, 32, 0] of -- hi \xe2
+    E.ThusFar t pos bs f -> whenEqProp pos 8
+      . whenEqProp bs (B.pack [32, 0])
+        $ case f $ B.pack [0, 0, 0x03, 0x26, 0] of
+          E.ThusFar t' pos' bs' f' -> whenEqProp pos' 12
+            . whenEqProp bs' (B.pack [0x03, 0x26, 0])
+              $ case f' $ B.pack [0, 32, 0, 0, 0, 0, 0xD8, 0, 0] of
+                E.InvalidWord t'' pos'' word32 f'' -> whenEqProp pos'' 20
+                  . whenEqProp word32 0x00D80000
+                    $ case f'' $ Just 'x' of
+                      E.ThusFar x pos''' bs'' _ -> whenEqProp (t `T.append` t' `T.append` t'' `T.append` x) "hi ☃ x"
+                        . whenEqProp pos''' 24 $ bs'' === B.empty
+                      _ -> counterexample "Should have been decoded text." False
+                _ -> counterexample "Should have been an invalid word." False
+          _ -> counterexample "Should have encountered an incomplete code point." False
+    _ -> counterexample "Should have encountered an incomplete code point." False
+
 t_infix_concat bs1 text bs2 =
   forAll (Blind <$> genDecodeErr Replace) $ \(Blind onErr) ->
   text `T.isInfixOf`
@@ -257,6 +332,7 @@ t_infix_concat bs1 text bs2 =
 testTranscoding :: TestTree
 testTranscoding =
   testGroup "transcoding" [
+    testProperty "t_asciiE" t_asciiE,
     testProperty "t_ascii" t_ascii,
     testProperty "tl_ascii" tl_ascii,
     testProperty "t_latin1" t_latin1,
@@ -299,6 +375,10 @@ testTranscoding =
       testProperty "t_decode_utf8_with_error2" t_decode_utf8_with_error2,
       testProperty "t_decode_utf8_with_error3" t_decode_utf8_with_error3,
       testProperty "t_decode_utf8_with_error4" t_decode_utf8_with_error4,
+      testProperty "t_decode_utf16BE_with_error" t_decode_utf16BE_with_error,
+      testProperty "t_decode_utf16LE_with_error" t_decode_utf16LE_with_error,
+      testProperty "t_decode_utf32BE_with_error" t_decode_utf32BE_with_error,
+      testProperty "t_decode_utf32LE_with_error" t_decode_utf32LE_with_error,
       testProperty "t_infix_concat" t_infix_concat
     ]
   ]
diff --git a/text.cabal b/text.cabal
index 5231dbac..488f1451 100644
--- a/text.cabal
+++ b/text.cabal
@@ -254,7 +254,6 @@ test-suite tests
     deepseq,
     directory,
     ghc-prim,
-    mtl,
     tasty,
     tasty-hunit,
     tasty-quickcheck,

From 22b866656dcc7167a96b12cdf5bbd90a0bbf7398 Mon Sep 17 00:00:00 2001
From: Sledge <sledged@gmail.com>
Date: Thu, 7 Jul 2022 11:17:38 -0600
Subject: [PATCH 13/87] Typo in documentation

Co-authored-by: Xia Li-yao <Lysxia@users.noreply.github.com>
---
 src/Data/Text/Encoding.hs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index fecf8adb..f44a9999 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -361,7 +361,7 @@ data StreamDecode w
   -- the incomplete code point, and a function that accepts another
   -- 'ByteString' as a continuation of the previous input.
   = ThusFar !Text !Int !ByteString (ByteString -> StreamDecode w)
-  -- | An encounter witt an invalid utf-8 'Word8'. The value contains
+  -- | An encounter with an invalid UTF-8 'Word8'. The value contains
   -- the decoded text up to but not including the invalid 'Word8',
   -- the position in the 'ByteString' of the offending 'Word8', the
   -- offender, and a function that accepts a possible 'Char' as to

From 1dd6f34d9688e45b45e2d703c087136c4e62c602 Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Sun, 10 Jul 2022 20:16:38 -0600
Subject: [PATCH 14/87] Redesigned with different approach

---
 src/Data/Text/Encoding.hs                | 429 ++++++++++++-----------
 src/Data/Text/Encoding/Types.hs          |  35 ++
 src/Data/Text/Internal/Encoding/Utf16.hs |   3 +-
 src/Data/Text/Lazy/Encoding.hs           |  35 +-
 tests/Tests/Properties/Transcoding.hs    | 270 +++++++-------
 text.cabal                               |   1 +
 6 files changed, 441 insertions(+), 332 deletions(-)
 create mode 100644 src/Data/Text/Encoding/Types.hs

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index f44a9999..4ed1edc8 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -30,6 +30,13 @@ module Data.Text.Encoding
     -- $total
       decodeLatin1
     , decodeUtf8Lenient
+    , decodeUtf8Chunk
+    , decodeUtf8Chunks
+    , decodeUtf16Chunk
+    , decodeUtf16Chunks
+    , decodeUtf32Chunk
+    , decodeUtf32Chunks
+    , DecodeResult(..)
 
     -- *** Catchable failure
     , decodeUtf8'
@@ -46,11 +53,7 @@ module Data.Text.Encoding
     -- $stream
     , streamDecodeUtf8
     , streamDecodeUtf8With
-    , streamDecodeUtf8With'
-    , streamDecodeUtf16With
-    , streamDecodeUtf32With
     , Decoding(..)
-    , StreamDecode(..)
 
     -- ** Partial Functions
     -- $partial
@@ -76,14 +79,15 @@ module Data.Text.Encoding
 import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
 
 import Control.Exception (evaluate, try)
-import Control.Monad.ST (runST, ST)
+import Control.Monad.ST (runST)
 import Data.Bits (Bits, shiftL, shiftR, (.&.), (.|.))
 import Data.ByteString (ByteString)
 import qualified Data.ByteString as B
 import qualified Data.ByteString.Internal as B
 import qualified Data.ByteString.Short.Internal as SBS
 import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode)
-import Data.Text.Internal (Text(..), safe, empty, append)
+import Data.Text.Encoding.Types (DecodeResult(..))
+import Data.Text.Internal (Text(..), empty, append)
 import Data.Text.Internal.Unsafe (unsafeWithForeignPtr)
 import Data.Text.Internal.Unsafe.Char (unsafeWrite)
 import Data.Text.Show as T (singleton)
@@ -91,7 +95,7 @@ import Data.Text.Unsafe (unsafeDupablePerformIO)
 import Data.Word (Word8, Word16, Word32)
 import Foreign.C.Types (CSize(..))
 import Foreign.Ptr (Ptr, minusPtr, plusPtr)
-import Foreign.Storable (poke, peekByteOff)
+import Foreign.Storable (Storable(..), poke, peekByteOff)
 import GHC.Exts (byteArrayContents#, unsafeCoerce#)
 import GHC.ForeignPtr (ForeignPtr(..), ForeignPtrContents(PlainPtr))
 import qualified Data.ByteString.Builder as B
@@ -218,6 +222,205 @@ isValidBS bs = start 0
 #endif
 #endif
 
+data Progression
+  = WriteAndAdvance Char (Int -> Int)
+  | NeedMore
+  | Invalid
+
+decodeChunksProxy :: (Bits w, Num w, Storable w) =>
+  w -- only used for Storable.sizeOf argument which the function discards
+  -> (ByteString -> ByteString -> Int -> Maybe (A.Array, Int))
+  -> ((Int -> Word8) -> Int -> Int -> Progression)
+  -> ByteString
+  -> ByteString
+  -> DecodeResult Text ByteString w
+decodeChunksProxy w queryOptimization decodeF bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do
+  marr <- A.new len'
+  outer marr len' 0 0
+  where
+    wordByteSize = sizeOf w
+
+    index :: Int -> Word8
+    index i
+      | i < len1  = B.index bs1 i
+      | otherwise = B.index bs2 $ i - len1
+
+    len :: Int
+    len = len1 + len2
+    len' :: Int
+    len' = (len `div` wordByteSize) + 4
+
+    -- outer :: (Bits w, Num w) => A.MArray s -> Int -> Int -> Int -> ST s (DecodeResult Text ByteString w)
+    outer dst dstLen = inner
+      where
+        inner srcOff dstOff
+          -- finished
+          | len - srcOff < 1 = goodSoFar
+          -- shortcut for utf-8
+          | otherwise =
+            case queryOptimization bs1 bs2 srcOff of
+              Just (arr, tLen) ->
+                let minLen = tLen + dstOff in
+                if minLen > dstLen
+                  then do
+                    let dstLen' = minLen + 4
+                    dst' <- A.resizeM dst dstLen'
+                    A.copyI tLen dst' dstOff arr 0
+                    outer dst' dstLen' (srcOff + tLen) minLen
+                  else do
+                    A.copyI tLen dst dstOff arr 0
+                    inner (srcOff + tLen) minLen
+              _ -> if len - srcOff < wordByteSize
+                  -- incomplete code point
+                  then goodSoFar
+                  else
+                    if dstOff + 4 > dstLen
+                      -- need more space in destination
+                      then do
+                        let dstLen' = dstLen + 4
+                        dst' <- A.resizeM dst dstLen'
+                        outer dst' dstLen' srcOff dstOff
+                      else
+                        case decodeF index len srcOff of
+                          WriteAndAdvance c advance -> do
+                            d <- unsafeWrite dst dstOff c
+                            inner (advance d) $ dstOff + d
+                          NeedMore -> goodSoFar
+                          Invalid -> invalid
+          where
+            contin off res = do
+              A.shrinkM dst dstOff
+              arr <- A.unsafeFreeze dst
+              pure . res (Text arr 0 dstOff) $ if off >= len1
+                then B.drop (off - len1) bs2
+                else B.drop off $ bs1 `B.append` bs2
+            goodSoFar =
+              contin srcOff $ \ t bs' ->
+                DecodeResult t Nothing bs' srcOff
+            invalid =
+              let srcOff' = srcOff + wordByteSize
+                  bytesToWord n word =
+                    if n > 0
+                      then bytesToWord (n - 1) $ (fromIntegral . index $ srcOff + wordByteSize - n) .|. (word `shiftL` 8)
+                      else word
+              in
+              contin srcOff' $ \ t bs' ->
+                DecodeResult t (Just $ bytesToWord wordByteSize 0) bs' srcOff'
+
+decodeChunks :: (Bits w, Num w, Storable w) =>
+  (ByteString -> ByteString -> Int -> Maybe (A.Array, Int))
+  -> ((Int -> Word8) -> Int -> Int -> Progression)
+  -> ByteString
+  -> ByteString
+  -> DecodeResult Text ByteString w
+decodeChunks = decodeChunksProxy undefined -- This allows Haskell can
+-- determine the size in bytes of a data type using Storable.sizeOf
+-- so that it doesn't have to be passed as an arugment. Storable.sizeOf
+-- discards the actual value without resolving it.
+
+queryUtf8DecodeOptimization :: ByteString -> ByteString -> Int -> Maybe (A.Array, Int)
+queryUtf8DecodeOptimization (B.length -> len1) bs2@(B.length -> len2) srcOff
+  | srcOff >= len1
+  -- potential valid utf8 content endpoint
+  , utf8End <- len1 + guessUtf8Boundary
+  , srcOff < utf8End
+  -- potential valid utf8 content length
+  , utf8Len <- utf8End - srcOff
+  , bs' <- B.drop (srcOff - len1) $ B.take guessUtf8Boundary bs2
+  , isValidBS bs' = Just (runST $ do
+    marr <- A.new utf8Len
+    withBS bs' $ \fp _ -> unsafeIOToST $ unsafeWithForeignPtr fp $ \src ->
+      unsafeSTToIO $ A.copyFromPointer marr 0 src utf8Len
+    arr <- A.unsafeFreeze marr
+    pure $ (arr, utf8Len))
+  | otherwise = Nothing
+  where
+    -- We need Data.ByteString.findIndexEnd, but it is unavailable before bytestring-0.10.12.0
+    guessUtf8Boundary :: Int
+    guessUtf8Boundary
+      | len2 >= 1 && w0 <  0x80 = len2     -- last char is ASCII
+      | len2 >= 1 && w0 >= 0xC0 = len2 - 1 -- last char starts a code point
+      | len2 >= 2 && w1 >= 0xC0 = len2 - 2 -- pre-last char starts a code point
+      | len2 >= 3 && w2 >= 0xC0 = len2 - 3
+      | len2 >= 4 && w3 >= 0xC0 = len2 - 4
+      | otherwise = 0
+      where
+        w0 = B.index bs2 (len2 - 1)
+        w1 = B.index bs2 (len2 - 2)
+        w2 = B.index bs2 (len2 - 3)
+        w3 = B.index bs2 (len2 - 4)
+
+decodeUtf8Base :: (Int -> Word8) -> Int -> Int -> Progression
+decodeUtf8Base index len srcOff =
+  let decodeFrom off = step (off + 1) . utf8DecodeStart $ index off
+        where
+          step i (Incomplete a b)
+            | i < len = step (i + 1) $ utf8DecodeContinue (index i) a b
+          step _ st = st
+  in
+  case decodeFrom srcOff of
+    Accept c -> WriteAndAdvance c (srcOff +)
+    Reject -> Invalid
+    Incomplete{} -> NeedMore
+
+-- | Decode two 'ByteString's containing UTF-8-encoded text as though
+-- they were one continuous 'ByteString' returning a 'DecodeResult'.
+decodeUtf8Chunks :: ByteString -> ByteString -> DecodeResult Text ByteString Word8
+decodeUtf8Chunks = decodeChunks queryUtf8DecodeOptimization decodeUtf8Base
+
+-- | Decode a 'ByteString' containing UTF-8-encoded text returning a
+-- 'DecodeResult'.
+decodeUtf8Chunk :: ByteString -> DecodeResult Text ByteString Word8
+decodeUtf8Chunk = decodeUtf8Chunks mempty
+
+noOptimization :: p1 -> p2 -> p3 -> Maybe a
+noOptimization _ _ _ = Nothing
+
+-- | Decode two 'ByteString's containing UTF-16-encoded text as though
+-- they were one continuous 'ByteString' returning a 'DecodeResult'.
+decodeUtf16Chunks :: Bool -> ByteString -> ByteString -> DecodeResult Text ByteString Word16
+decodeUtf16Chunks isBE = decodeChunks noOptimization $ \ index len srcOff ->
+  -- get next Word8 pair
+  let writeAndAdvance c n = WriteAndAdvance c $ const n
+      b0 = index $ if isBE then srcOff else srcOff + 1
+      b1 = index $ if isBE then srcOff + 1 else srcOff
+  in
+  case queryUtf16Bytes b0 of
+    OneWord16 f -> writeAndAdvance (f b1) $ srcOff + 2
+    TwoWord16 g ->
+      if len - srcOff < 4
+        -- not enough Word8s to finish the code point
+        then NeedMore
+        else
+          let b2 = index $ srcOff + (if isBE then 2 else 3)
+              b3 = index $ srcOff + (if isBE then 3 else 2)
+          in
+          case g b2 of
+            Just f' -> writeAndAdvance (f' b1 b3) $ srcOff + 4
+            _ -> Invalid
+    _ -> Invalid
+
+-- | Decode a 'ByteString' containing UTF-16-encoded text returning a
+-- 'DecodeResult'.
+decodeUtf16Chunk :: Bool -> ByteString -> DecodeResult Text ByteString Word16
+decodeUtf16Chunk isBE = decodeUtf16Chunks isBE mempty
+
+-- | Decode two 'ByteString's containing UTF-16-encoded text as though
+-- they were one continuous 'ByteString' returning a 'DecodeResult'.
+decodeUtf32Chunks :: Bool -> ByteString -> ByteString -> DecodeResult Text ByteString Word32
+decodeUtf32Chunks isBE = decodeChunks noOptimization $ \ index _ srcOff ->
+  -- get next Word8 quartet
+  case (queryUtf32Bytes . index $ if isBE then srcOff else srcOff + 3)
+      >>= ($ (index $ srcOff + (if isBE then 1 else 2)))
+      >>= ($ (index $ srcOff + (if isBE then 2 else 1))) of
+    Just f -> WriteAndAdvance (f . index $ if isBE then srcOff + 3 else srcOff) . const $ srcOff + 4
+    _ -> Invalid
+
+-- | Decode a 'ByteString' containing UTF-32-encoded text returning a
+-- 'DecodeResult'.
+decodeUtf32Chunk :: Bool -> ByteString -> DecodeResult Text ByteString Word32
+decodeUtf32Chunk isBE = decodeUtf32Chunks isBE mempty
+
 -- | Decode a 'ByteString' containing 7-bit ASCII encoded text.
 --
 -- This is a total function: On success the decoded 'Text' is within a
@@ -255,10 +458,9 @@ decodeUtf8With onErr bs = case streamDecodeUtf8With onErr bs of
 
 -- $stream
 --
--- The 'streamDecodeUtf8',  'streamDecodeUtf8With', and
--- 'streamDecodeUtf8With'' functions accept a 'ByteString' that
--- represents a possibly incomplete input (e.g. a packet from a network
--- stream) that may not end on a UTF-8 boundary.
+-- The 'streamDecodeUtf8' and 'streamDecodeUtf8With' functions accept
+-- a 'ByteString' that represents a possibly incomplete input (e.g. a
+-- packet from a network stream) that may not end on a UTF-8 boundary.
 --
 -- 1. The maximal prefix of 'Text' that could be decoded from the
 --    given input.
@@ -335,7 +537,7 @@ streamDecodeUtf8 ::
   ByteString -> Decoding
 streamDecodeUtf8 = streamDecodeUtf8With strictDecode
 
--- | Decode, in a stream oriented way, a lazy 'ByteString' containing UTF-8
+-- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8
 -- encoded text.
 --
 -- @since 1.0.0.0
@@ -344,199 +546,18 @@ streamDecodeUtf8With ::
   HasCallStack =>
 #endif
   OnDecodeError -> ByteString -> Decoding
-streamDecodeUtf8With onErr = go empty . streamDecodeUtf8With'
-  where
-    go t res = case res of
-      ThusFar txt _ bs f -> Some (t `append` txt) bs $ go empty . f
-      InvalidWord txt _ word8 f -> go (t `append` txt) . f . onErr desc $ Just word8
-
-    desc = "Data.Text.Internal.Encoding: Invalid UTF-8 stream"
-
--- | A stream-oriented decoding result of one of three possibilities.
-data StreamDecode w
-  -- | The decoded 'Text' thus far with a possibly incomplete code
-  -- point at the end of the 'ByteString'. The value contains the
-  -- decoded text up to but not including the incomplete code point,
-  -- the position in the 'ByteString' where the code point starts,
-  -- the incomplete code point, and a function that accepts another
-  -- 'ByteString' as a continuation of the previous input.
-  = ThusFar !Text !Int !ByteString (ByteString -> StreamDecode w)
-  -- | An encounter with an invalid UTF-8 'Word8'. The value contains
-  -- the decoded text up to but not including the invalid 'Word8',
-  -- the position in the 'ByteString' of the offending 'Word8', the
-  -- offender, and a function that accepts a possible 'Char' as to
-  -- interpret the 'Word8' in this specific occurrence of it.
-  | InvalidWord !Text !Int !w (Maybe Char -> StreamDecode w)
-
-data Progression
-  = WriteAndAdvance Char (Int -> Int)
-  | NeedMore
-  | Invalid
-
-streamDecodeWith :: (Bits w, Num w)
-  => Int
-  -> Bool
-  -> ByteString
-  -> ((Int -> Word8) -> Int -> Int -> Progression)
-  -> StreamDecode w
-streamDecodeWith wordByteSize isUtf bs decodeF = decodeAtOffset Nothing 0 bs mempty
+streamDecodeUtf8With onErr = g empty mempty
   where
-    decodeAtOffset :: (Bits w, Num w) => Maybe Char -> Int -> ByteString -> ByteString -> StreamDecode w
-    decodeAtOffset mStartChar bytePos bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do
-      marr <- A.new len'
-      case mStartChar of
-        Just c -> do
-          d <- unsafeWrite marr 0 $ safe c
-          outer marr len' 0 d
-        _ -> outer marr len' 0 0
-      where
-        index :: Int -> Word8
-        index i
-          | i < len1  = B.index bs1 i
-          | otherwise = B.index bs2 $ i - len1
-
-        len :: Int
-        len = len1 + len2
-        len' :: Int
-        len' = (len `div` wordByteSize) + 4
-
-        queryutf8Optim srcOff dstOff dstLen dst f g
-          -- shortcut for utf-8
-          | wordByteSize == 1
-          , isUtf
-          , srcOff >= len1
-          -- potential valid utf8 content endpoint
-          , utf8End <- len1 + guessUtf8Boundary
-          , srcOff < utf8End
-          -- potential valid utf8 content length
-          , utf8Len <- utf8End - srcOff
-          -- potential endpoint in destination array if copied
-          , dstOff' <- dstOff + utf8Len
-          , dstOff' <= dstLen
-          , bs' <- B.drop (srcOff - len1) $ B.take guessUtf8Boundary bs2
-          , isValidBS bs' = do
-            withBS bs' $ \fp _ -> unsafeIOToST $ unsafeWithForeignPtr fp $ \src ->
-              unsafeSTToIO $ A.copyFromPointer dst dstOff src utf8Len
-            f utf8End dstOff'
-          | otherwise = g
-          where
-            -- We need Data.ByteString.findIndexEnd, but it is unavailable before bytestring-0.10.12.0
-            guessUtf8Boundary :: Int
-            guessUtf8Boundary
-              | len2 >= 1 && w0 <  0x80 = len2     -- last char is ASCII
-              | len2 >= 1 && w0 >= 0xC0 = len2 - 1 -- last char starts a code point
-              | len2 >= 2 && w1 >= 0xC0 = len2 - 2 -- pre-last char starts a code point
-              | len2 >= 3 && w2 >= 0xC0 = len2 - 3
-              | len2 >= 4 && w3 >= 0xC0 = len2 - 4
-              | otherwise = 0
-              where
-                w0 = B.index bs2 (len2 - 1)
-                w1 = B.index bs2 (len2 - 2)
-                w2 = B.index bs2 (len2 - 3)
-                w3 = B.index bs2 (len2 - 4)
-
-        outer :: (Bits w, Num w) => A.MArray s -> Int -> Int -> Int -> ST s (StreamDecode w)
-        outer dst dstLen = inner
-          where
-            inner srcOff dstOff
-              -- finished
-              | len - srcOff < 1 = incomplete
-              -- shortcut for utf-8
-              | otherwise = queryutf8Optim srcOff dstOff dstLen dst inner $
-                if len - srcOff < wordByteSize
-                  -- incomplete code point
-                  then incomplete
-                  else
-                    if dstOff + 4 > dstLen
-                      -- need more space in destination
-                      then do
-                        let dstLen' = dstLen + 4
-                        dst' <- A.resizeM dst dstLen'
-                        outer dst' dstLen' srcOff dstOff
-                      else
-                        case decodeF index len srcOff of
-                          WriteAndAdvance c advance -> do
-                            d <- unsafeWrite dst dstOff c
-                            inner (advance d) $ dstOff + d
-                          NeedMore -> incomplete
-                          Invalid -> invalid
-              where
-                wrapUp f = do
-                  A.shrinkM dst dstOff
-                  arr <- A.unsafeFreeze dst
-                  f $ Text arr 0 dstOff
-                contin off res = wrapUp $ \ t ->
-                  pure . res t $ if off >= len1
-                    then B.drop (off - len1) bs2
-                    else B.drop off $ bs1 `B.append` bs2
-                incomplete =
-                  let bytePos' = bytePos + srcOff in
-                  contin srcOff $ \ t bs' ->
-                    ThusFar t bytePos' bs' $
-                      decodeAtOffset Nothing bytePos' bs'
-                invalid =
-                  let srcOff' = srcOff + wordByteSize
-                      bytesToWord n word =
-                        if n > 0
-                          then bytesToWord (n - 1) $ (fromIntegral . index $ srcOff + wordByteSize - n) .|. (word `shiftL` 8)
-                          else word
-                  in
-                  contin srcOff' $ \ t bs' ->
-                    InvalidWord t (bytePos + srcOff) (bytesToWord wordByteSize 0) $ \ mChar' ->
-                      decodeAtOffset mChar' (bytePos + srcOff') bs' mempty
-
--- | Like 'streamDecodeUtf8With', but instead of accepting an
--- 'OnDecodeError' callback and returning a 'Decoding', it returns a
--- 'StreamDecode'.
-streamDecodeUtf8With' :: ByteString -> StreamDecode Word8
-streamDecodeUtf8With' bs = streamDecodeWith 1 True bs $ \ index len srcOff ->
-  let decodeFrom off = step (off + 1) . utf8DecodeStart $ index off
-        where
-          step i (Incomplete a b)
-            | i < len = step (i + 1) $ utf8DecodeContinue (index i) a b
-          step _ st = st
-  in
-  case decodeFrom srcOff of
-    Accept c -> WriteAndAdvance c (srcOff +)
-    Reject -> Invalid
-    Incomplete{} -> NeedMore
-
--- | Like 'streamDecodeUtf8With'', but for UTF-16 encoding and a
--- 'Bool' argument for whether the encoding is big- or little-endian.
-streamDecodeUtf16With :: Bool -> ByteString -> StreamDecode Word16
-streamDecodeUtf16With isBE bs = streamDecodeWith 2 True bs $ \ index len srcOff ->
-  -- get next Word8 pair
-  let writeAndAdvance c n = WriteAndAdvance c $ const n
-      b0 = index $ if isBE then srcOff else srcOff + 1
-      b1 = index $ if isBE then srcOff + 1 else srcOff
-  in
-  case queryUtf16Bytes b0 of
-    OneWord16 f -> writeAndAdvance (f b1) $ srcOff + 2
-    TwoWord16 g ->
-      if len - srcOff < 4
-        -- not enough Word8s to finish the code point
-        then NeedMore
-        else
-          let b2 = index $ srcOff + (if isBE then 2 else 3)
-              b3 = index $ srcOff + (if isBE then 3 else 2)
-          in
-          case g b2 of
-            Just f' -> writeAndAdvance (f' b1 b3) $ srcOff + 4
-            _ -> Invalid
-    _ -> Invalid
-
--- | Like 'streamDecodeUtf16With', but for UTF-32.
-streamDecodeUtf32With :: Bool -> ByteString -> StreamDecode Word32
-streamDecodeUtf32With isBE bs = streamDecodeWith 4 True bs $ \ index _ srcOff ->
-  -- get next Word8 quartet
-  let writeAndAdvance c n = WriteAndAdvance c $ const n in
-  case queryUtf32Bytes . index $ if isBE then srcOff else srcOff + 3 of
-    Just f -> case f . index $ srcOff + (if isBE then 1 else 2) of
-      Just f' -> case f' . index $ srcOff + (if isBE then 2 else 1) of
-        Just f'' -> writeAndAdvance (f'' . index $ if isBE then srcOff + 3 else srcOff) $ srcOff + 4
-        _ -> Invalid
-      _ -> Invalid
-    _ -> Invalid
+    g t bs0 bs1 =
+      let DecodeResult t' mW bs1' _ = decodeUtf8Chunks bs0 bs1
+          txt = t `append` t'
+      in
+      (case (mW :: Maybe Word8) of
+        Just _ ->
+          g (case onErr "" mW of
+            Just c -> txt `append` T.singleton c
+            _ -> txt) mempty
+        _ -> Some txt bs1' . g empty) bs1'
 
 -- | Decode a 'ByteString' containing UTF-8 encoded text that is known
 -- to be valid.
diff --git a/src/Data/Text/Encoding/Types.hs b/src/Data/Text/Encoding/Types.hs
new file mode 100644
index 00000000..e791a55e
--- /dev/null
+++ b/src/Data/Text/Encoding/Types.hs
@@ -0,0 +1,35 @@
+{-# LANGUAGE CPP, DeriveDataTypeable #-}
+{-# LANGUAGE Safe #-}
+-- |
+-- Module      : Data.Text.Encoding.Error
+-- Copyright   : (c) Bryan O'Sullivan 2009
+--
+-- License     : BSD-style
+-- Maintainer  : bos@serpentine.com
+-- Portability : GHC
+--
+-- Types and functions for dealing with encoding and decoding errors
+-- in Unicode text.
+--
+-- The standard functions for encoding and decoding text are strict,
+-- which is to say that they throw exceptions on invalid input.  This
+-- is often unhelpful on real world input, so alternative functions
+-- exist that accept custom handlers for dealing with invalid inputs.
+-- These 'OnError' handlers are normal Haskell functions.  You can use
+-- one of the presupplied functions in this module, or you can write a
+-- custom handler of your own.
+
+module Data.Text.Encoding.Types
+  ( DecodeResult(..) )
+  where
+
+-- | A decoding result on encoded data.
+data DecodeResult t b w = DecodeResult
+  !t          -- ^ The decoded data up to an incomplete code point at
+              -- the end of the input data, an invalid word, or to the
+              -- end of the input.
+  !(Maybe w)  -- ^ If an invalid code point was encountered.
+  !b          -- ^ The remaining undecoded data. If an invald code
+              -- point was encountered, this is after that code point.
+  !Int        -- ^ Byte position of remaining undecoded data.
+  deriving (Eq, Ord, Show, Read)
diff --git a/src/Data/Text/Internal/Encoding/Utf16.hs b/src/Data/Text/Internal/Encoding/Utf16.hs
index 34bc78dd..2749fa34 100644
--- a/src/Data/Text/Internal/Encoding/Utf16.hs
+++ b/src/Data/Text/Internal/Encoding/Utf16.hs
@@ -59,8 +59,7 @@ data Utf16Result
   | TwoWord16 (Word8 -> Maybe (Word8 -> Word8 -> Char))
   | Invalid16
 
--- queryUtf16Bytes :: Word8 -> (Word8 -> Word8 -> a)
---   -> Utf16Result a
+queryUtf16Bytes :: Word8 -> Utf16Result
 queryUtf16Bytes b0@(W8# w0#) =
   if b0 < 0xD8 || b0 >= 0xE0
     then OneWord16 $ \ (W8# w1#) -> C# (chr# (orI# (word2Int# (shiftL# (word8ToWord# w0#) 8#)) (word2Int# (word8ToWord# w1#))))
diff --git a/src/Data/Text/Lazy/Encoding.hs b/src/Data/Text/Lazy/Encoding.hs
index ad361af5..62964203 100644
--- a/src/Data/Text/Lazy/Encoding.hs
+++ b/src/Data/Text/Lazy/Encoding.hs
@@ -23,6 +23,9 @@ module Data.Text.Lazy.Encoding
     -- ** Total Functions #total#
     -- $total
       decodeLatin1
+    , decodeUtf8Chunk
+    , decodeUtf16Chunk
+    , decodeUtf32Chunk
 
     -- *** Catchable failure
     , decodeUtf8'
@@ -58,8 +61,9 @@ module Data.Text.Lazy.Encoding
 import Control.Exception (evaluate, try)
 import Data.Monoid (Monoid(..))
 import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
+import Data.Text.Encoding.Types (DecodeResult(..))
 import Data.Text.Internal.Lazy (Text(..), chunk, empty, foldrChunks)
-import Data.Word (Word8)
+import Data.Word (Word16, Word32, Word8)
 import qualified Data.ByteString as S
 import qualified Data.ByteString.Builder as B
 import qualified Data.ByteString.Builder.Prim as BP
@@ -104,6 +108,35 @@ decodeASCII = foldr (chunk . TE.decodeASCII) empty . B.toChunks
 decodeLatin1 :: B.ByteString -> Text
 decodeLatin1 = foldr (chunk . TE.decodeLatin1) empty . B.toChunks
 
+decodeChunk :: (S.ByteString -> S.ByteString -> DecodeResult T.Text S.ByteString w)
+  -> B.ByteString
+  -> DecodeResult Text B.ByteString w
+decodeChunk decoder = g id 0 mempty
+  where
+    g tDiff pos sb0 (B.Chunk sb1 lb1) =
+      let DecodeResult t mW sb1' pos' = decoder sb0 sb1
+          pos1 = pos + pos'
+      in
+      case mW of
+        Just _ -> DecodeResult (tDiff (chunk t Empty)) mW (B.chunk sb1' lb1) pos1
+        _ -> (g (tDiff . chunk t) pos1 sb1' lb1)
+    g tDiff pos sb0 _ = DecodeResult (tDiff Empty) Nothing (B.chunk sb0 B.Empty) pos
+
+-- | Decode a 'ByteString' containing UTF-8-encoded text returning a
+-- 'DecodeResult'.
+decodeUtf8Chunk :: B.ByteString -> DecodeResult Text B.ByteString Word8
+decodeUtf8Chunk = decodeChunk TE.decodeUtf8Chunks
+
+-- | Decode a 'ByteString' containing UTF-16-encoded text returning a
+-- 'DecodeResult'.
+decodeUtf16Chunk :: Bool -> B.ByteString -> DecodeResult Text B.ByteString Word16
+decodeUtf16Chunk isBE = decodeChunk $ TE.decodeUtf16Chunks isBE
+
+-- | Decode a 'ByteString' containing UTF-32-encoded text returning a
+-- 'DecodeResult'.
+decodeUtf32Chunk :: Bool -> B.ByteString -> DecodeResult Text B.ByteString Word32
+decodeUtf32Chunk isBE = decodeChunk $ TE.decodeUtf32Chunks isBE
+
 -- | Decode a 'ByteString' containing UTF-8 encoded text.
 decodeUtf8With :: OnDecodeError -> B.ByteString -> Text
 decodeUtf8With onErr (B.Chunk b0 bs0) =
diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs
index e6172b47..54a282fe 100644
--- a/tests/Tests/Properties/Transcoding.hs
+++ b/tests/Tests/Properties/Transcoding.hs
@@ -38,17 +38,51 @@ tl_ascii t   = EL.decodeASCII (EL.encodeUtf8 a) === a
 t_latin1     = E.decodeLatin1 `eq` (T.pack . BC.unpack)
 tl_latin1    = EL.decodeLatin1 `eq` (TL.pack . BLC.unpack)
 
+whenEqProp a b next = if a == b
+  then next
+  else a === b
+
 t_utf8       = (E.decodeUtf8 . E.encodeUtf8) `eq` id
 t_utf8'      = (E.decodeUtf8' . E.encodeUtf8) `eq` (id . Right)
 tl_utf8      = (EL.decodeUtf8 . EL.encodeUtf8) `eq` id
 tl_utf8'     = (EL.decodeUtf8' . EL.encodeUtf8) `eq` (id . Right)
+t_utf8_c     = (\ t ->
+                let E.DecodeResult t' mC bs _ = E.decodeUtf8Chunk $ E.encodeUtf8 t in
+                (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
+tl_utf8_c    = (\ t ->
+                let E.DecodeResult t' mC bs _ = EL.decodeUtf8Chunk $ EL.encodeUtf8 t in
+                (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
 t_utf16LE    = (E.decodeUtf16LE . E.encodeUtf16LE) `eq` id
+t_utf16LE_c  = (\ t ->
+                let E.DecodeResult t' mC bs _ = E.decodeUtf16Chunk False $ E.encodeUtf16LE t in
+                (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
+tl_utf16LE_c = (\ t ->
+                let E.DecodeResult t' mC bs _ = EL.decodeUtf16Chunk False $ EL.encodeUtf16LE t in
+                (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
 tl_utf16LE   = (EL.decodeUtf16LE . EL.encodeUtf16LE) `eq` id
 t_utf16BE    = (E.decodeUtf16BE . E.encodeUtf16BE) `eq` id
+t_utf16BE_c  = (\ t ->
+                let E.DecodeResult t' mC bs _ = E.decodeUtf16Chunk True $ E.encodeUtf16BE t in
+                (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
+tl_utf16BE_c = (\ t ->
+                let E.DecodeResult t' mC bs _ = EL.decodeUtf16Chunk True $ EL.encodeUtf16BE t in
+                (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
 tl_utf16BE   = (EL.decodeUtf16BE . EL.encodeUtf16BE) `eq` id
 t_utf32LE    = (E.decodeUtf32LE . E.encodeUtf32LE) `eq` id
+t_utf32LE_c  = (\ t ->
+                let E.DecodeResult t' mC bs _ = E.decodeUtf32Chunk False $ E.encodeUtf32LE t in
+                (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
+tl_utf32LE_c = (\ t ->
+                let E.DecodeResult t' mC bs _ = EL.decodeUtf32Chunk False $ EL.encodeUtf32LE t in
+                (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
 tl_utf32LE   = (EL.decodeUtf32LE . EL.encodeUtf32LE) `eq` id
 t_utf32BE    = (E.decodeUtf32BE . E.encodeUtf32BE) `eq` id
+t_utf32BE_c  = (\ t ->
+                let E.DecodeResult t' mC bs _ = E.decodeUtf32Chunk True $ E.encodeUtf32BE t in
+                (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
+tl_utf32BE_c  = (\ t ->
+                let E.DecodeResult t' mC bs _ = EL.decodeUtf32Chunk True $ EL.encodeUtf32BE t in
+                (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
 tl_utf32BE   = (EL.decodeUtf32BE . EL.encodeUtf32BE) `eq` id
 
 runBuilder :: B.Builder -> B.ByteString
@@ -127,6 +161,13 @@ t_utf8_err bad de = forAll (Blind <$> genDecodeErr de) $ \(Blind onErr) -> ioPro
       length (show err) >= 0
     Right _  -> counterexample (show (decoded, l)) $ de /= Strict
 
+t_utf8_c_err :: InvalidUtf8 -> Property
+t_utf8_c_err bad =
+  let E.DecodeResult t mW bs _ = E.decodeUtf8Chunk (toByteString bad) in
+  case mW of
+    Just w -> counterexample (show w) True
+    _  -> counterexample (show t) $ B.length bs > 0
+
 t_utf8_err' :: B.ByteString -> Bool
 t_utf8_err' bs = case E.decodeUtf8' bs of
   Left err -> length (show err) >= 0
@@ -168,6 +209,85 @@ genInvalidUTF8 = B.pack <$> oneof [
     ord3_ n = map fromIntegral [(n `shiftR` 12) + 0xE0, ((n `shiftR` 6) .&. 0x3F) + 0x80, (n .&. 0x3F) + 0x80]
     ord4_ n = map fromIntegral [(n `shiftR` 18) + 0xF0, ((n `shiftR` 12) .&. 0x3F) + 0x80, ((n `shiftR` 6) .&. 0x3F) + 0x80, (n .&. 0x3F) + 0x80]
 
+t_chunk_decode_utf8_1 =
+  let decodeResult0 = E.decodeUtf8Chunk $ B.pack [0x68, 0x69, 0x2C, 0x20, 0xe2, 0x98, 0x83, 0x21] in
+  decodeResult0 === (E.DecodeResult "hi, ☃!" Nothing mempty 8)
+t_chunk_decode_utf8_2 =
+  let decodeResult0 = E.decodeUtf8Chunk $ B.pack [97, 0xC2, 97]
+      expectedBs0 = B.singleton 97
+  in
+  whenEqProp decodeResult0 (E.DecodeResult (T.singleton 'a') (Just 0xC2) expectedBs0 2) $
+    let decodeResult1 = E.decodeUtf8Chunk expectedBs0 in
+    decodeResult1 === (E.DecodeResult (T.singleton 'a') Nothing mempty 1)
+t_chunk_decode_utf8_3 =
+  let decodeResult0 = E.decodeUtf8Chunk $ B.pack [104, 105, 32, 0xe2]
+      expectedBs0 = B.singleton 0xe2
+  in -- hi \xe2
+  whenEqProp decodeResult0 (E.DecodeResult "hi " Nothing expectedBs0 3) $
+    let decodeResult1 = E.decodeUtf8Chunk $ expectedBs0 `B.snoc` 0x98
+        expectedBs1 = B.pack [0xe2, 0x98]
+    in
+    whenEqProp decodeResult1 (E.DecodeResult "" Nothing expectedBs1 0) $
+      let decodeResult2 = E.decodeUtf8Chunk $ expectedBs1 `mappend` B.pack [0x83, 32, 0xFF] in
+      decodeResult2 === (E.DecodeResult "☃ " (Just 0xFF) mempty 5)
+t_chunk_decode_utf8_4 =
+  let decodeResult0 = E.decodeUtf8Chunk $ B.pack [104, 105, 32, 0xe2, 0x98, 104]
+      expectedBs0 = B.pack [0x98, 104]
+  in
+  decodeResult0 === (E.DecodeResult "hi " (Just 0xe2) expectedBs0 4)
+
+t_chunk_decode_utf16BE =
+  let decode = E.decodeUtf16Chunk True
+      expectedBs0 = B.pack [0]
+      decodeResult0 = decode expectedBs0 in
+  whenEqProp decodeResult0 (E.DecodeResult T.empty Nothing expectedBs0 0) $
+    let decodeResult1 = decode $ expectedBs0 `B.append` B.pack [104, 0, 105, 0, 32, 0xD8, 0x01]
+        expectedBs1 = B.pack [0xD8, 0x01]
+    in
+    whenEqProp decodeResult1 (E.DecodeResult "hi " Nothing expectedBs1 6) $
+      let decodeResult2 = decode $ expectedBs1 `B.append` B.pack [0xDC, 0x37, 0, 32, 0xDC, 0] in
+      decodeResult2 === (E.DecodeResult "\x10437 " (Just 0xDC00) mempty 8)
+t_chunk_decode_utf16LE =
+  let decode = E.decodeUtf16Chunk False
+      expectedBs0 = B.pack [104]
+      decodeResult0 = decode expectedBs0 in
+  whenEqProp decodeResult0 (E.DecodeResult T.empty Nothing expectedBs0 0) $
+    let decodeResult1 = decode $ expectedBs0 `B.append` B.pack [0, 105, 0, 32, 0, 0x01, 0xD8]
+        expectedBs1 = B.pack [0x01, 0xD8]
+    in
+    whenEqProp decodeResult1 (E.DecodeResult "hi " Nothing expectedBs1 6) $
+      let decodeResult2 = decode $ expectedBs1 `B.append` B.pack [0x37, 0xDC, 32, 0, 0, 0xDC] in
+      decodeResult2 === (E.DecodeResult "\x10437 " (Just 0xDC) mempty 8)
+
+t_chunk_decode_utf32BE =
+  let decode = E.decodeUtf32Chunk True
+      expBs0 = B.pack [0, 0]
+      decodeResult0 = decode $ B.pack [0, 0, 0, 104, 0, 0, 0, 105, 0, 0]
+  in
+  whenEqProp decodeResult0 (E.DecodeResult "hi" Nothing expBs0 8) $
+    let expBs1 = B.pack [0, 0, 0x26]
+        decodeResult1 = decode $ expBs0 `mappend` (B.pack [0, 32, 0, 0, 0x26])
+    in
+    whenEqProp decodeResult1 (E.DecodeResult " " Nothing expBs1 4) $
+      let expBs2 = mempty
+          decodeResult2 = decode $ expBs1 `mappend` (B.pack [0x03, 0, 0, 0, 32, 0, 0, 0xD8, 0])
+      in
+      decodeResult2 === (E.DecodeResult "☃ " (Just 0xD800) expBs2 12)
+t_chunk_decode_utf32LE =
+  let decode = E.decodeUtf32Chunk False
+      expBs0 = B.pack [0x20, 0]
+      decodeResult0 = decode $ B.pack [104, 0, 0, 0, 105, 0, 0, 0, 0x20, 0]
+  in
+  whenEqProp decodeResult0 (E.DecodeResult "hi" Nothing expBs0 8) $
+    let expBs1 = B.pack [0x03, 0x26, 0]
+        decodeResult1 = decode $ expBs0 `mappend` (B.pack [0, 0, 0x03, 0x26, 0])
+    in
+    whenEqProp decodeResult1 (E.DecodeResult " " Nothing expBs1 4) $
+      let expBs2 = mempty
+          decodeResult2 = decode $ expBs1 `mappend` (B.pack [0, 32, 0, 0, 0, 0, 0xD8, 0, 0])
+      in
+      decodeResult2 === (E.DecodeResult "☃ " (Just 0xD80000) expBs2 12)
+
 decodeLL :: BL.ByteString -> TL.Text
 decodeLL = EL.decodeUtf8With E.lenientDecode
 
@@ -180,6 +300,12 @@ t_decode_utf8_lenient :: Property
 t_decode_utf8_lenient = forAllShrinkShow arbitrary shrink (show . BL.toChunks) $ \bs ->
     decodeLL bs === (TL.fromStrict . decodeL . B.concat . BL.toChunks) bs
 
+-- The decoding of lazy bytestrings should not depend on how they are chunked,
+-- and it should behave the same as decoding of strict bytestrings.
+-- t_decode_utf8_chunk :: Property
+-- t_decode_utf8_chunk = forAllShrinkShow arbitrary shrink (show . BL.toChunks) $ \bs ->
+--     EL.decodeUtf8Chunk bs === (TL.fromStrict . E.decodeUtf8Chunk . B.concat . BL.toChunks) bs
+
 -- See http://unicode.org/faq/utf_bom.html#gen8
 -- A sequence such as <110xxxxx2 0xxxxxxx2> is illegal ...
 -- When faced with this illegal byte sequence ... a UTF-8 conformant process
@@ -208,122 +334,6 @@ t_decode_with_error5' = ioProperty $ do
     Left (_ :: E.UnicodeException) -> True
     Right{} -> False
 
-whenEqProp a b next = if a == b
-  then next
-  else a === b
-
-t_decode_utf8_with_error1 =
-  case E.streamDecodeUtf8With' $ B.pack [97, 0xC2, 97] of
-    E.InvalidWord t pos word8 f -> whenEqProp pos 1
-      . whenEqProp word8 0xC2
-        $ case f $ Just 'x' of
-          E.ThusFar x pos' bs _ -> whenEqProp (t `T.append` x) "axa"
-            . whenEqProp bs B.empty $ pos' === 3
-          _ -> counterexample "Should have recovered from the invalid word (\\xC2)" False
-    _ -> counterexample "The second word (\\xC2) should have been invalid" False
-t_decode_utf8_with_error2 =
-  case E.streamDecodeUtf8With' $ B.pack [97, 0xC2, 97] of
-    E.InvalidWord t pos word8 f -> whenEqProp pos 1
-      . whenEqProp word8 0xC2
-        $ case f Nothing of
-          E.ThusFar x pos' bs _ -> whenEqProp (t `T.append` x) "aa"
-            . whenEqProp bs B.empty $ pos' === 3
-          _ -> counterexample "Should have recovered from the invalid word (\\xC2)" False
-    _ -> counterexample "The second word (\\xC2) should have been invalid" False
-t_decode_utf8_with_error3 =
-  case E.streamDecodeUtf8With' $ B.pack [104, 105, 32, 0xe2] of -- hi \xe2
-    E.ThusFar t pos bs f -> whenEqProp pos 3
-      . whenEqProp bs (B.pack [0xe2])
-        $ case f $ B.pack [0x98] of
-          E.ThusFar t' pos' bs' f' -> whenEqProp pos' 3
-            . whenEqProp bs' (B.pack [0xe2, 0x98])
-              $ case f' $ B.pack [0x83, 32, 0xFF] of
-                E.InvalidWord t'' pos'' word8 f'' -> whenEqProp pos'' 7
-                  . whenEqProp word8 0xFF
-                    $ case f'' $ Just 'x' of
-                      E.ThusFar x pos''' bs'' _ -> whenEqProp (t `T.append` t' `T.append` t'' `T.append` x) "hi ☃ x"
-                        . whenEqProp pos''' 8 $ bs'' === B.empty
-                      _ -> counterexample "Should have been decoded text." False
-                _ -> counterexample "Should have been an invalid word." False
-          _ -> counterexample "Should have encountered an incomplete code point." False
-    _ -> counterexample "Should have encountered an incomplete code point." False
-t_decode_utf8_with_error4 =
-  case E.streamDecodeUtf8With' $ B.pack [104, 105, 32, 0xe2, 0x98, 104] of -- not quite "hi ☃", the last byte is wrong
-    E.ThusFar _ _ _ _ -> counterexample "Not incomplete, but an invalid word." False
-    E.InvalidWord t pos word8 _ -> whenEqProp t "hi " . whenEqProp pos 3 $ word8 === 0xe2
-
-t_decode_utf16BE_with_error =
-  case E.streamDecodeUtf16With True $ B.pack [0] of
-    E.ThusFar t pos bs f -> whenEqProp pos 0
-      . whenEqProp bs (B.pack [0])
-        $ case f $ B.pack [104, 0, 105, 0, 32, 0xD8, 0x01] of
-          E.ThusFar t' pos' bs' f' -> whenEqProp pos' 6
-            . whenEqProp bs' (B.pack [0xD8, 0x01])
-              $ case f' $ B.pack [0xDC, 0x37, 0, 32, 0xDC, 0] of
-                E.InvalidWord t'' pos'' word16 f'' -> whenEqProp pos'' 12
-                  . whenEqProp word16 (0xDC `Bits.shiftL` 8)
-                    $ case f'' $ Just 'x' of
-                      E.ThusFar x pos''' bs'' _ -> whenEqProp (t `T.append` t' `T.append` t'' `T.append` x) "hi \x10437 x"
-                        . whenEqProp pos''' 14 $ bs'' === B.empty
-                      _ -> counterexample "Should have been decoded text." False
-                _ -> counterexample "Should have been an invalid word." False
-          _ -> counterexample "Should have encountered an incomplete code point." False
-    _ -> counterexample "Should have encountered an incomplete code point." False
-
-t_decode_utf16LE_with_error =
-  case E.streamDecodeUtf16With False $ B.pack [104] of
-    E.ThusFar t pos bs f -> whenEqProp pos 0
-      . whenEqProp bs (B.pack [104])
-        $ case f $ B.pack [0, 105, 0, 32, 0, 0x01, 0xD8] of
-          E.ThusFar t' pos' bs' f' -> whenEqProp pos' 6
-            . whenEqProp bs' (B.pack [0x01, 0xD8])
-              $ case f' $ B.pack [0x37, 0xDC, 32, 0, 0, 0xDC] of
-                E.InvalidWord t'' pos'' word16 f'' -> whenEqProp pos'' 12
-                  . whenEqProp word16 0xDC
-                    $ case f'' $ Just 'x' of
-                      E.ThusFar x pos''' bs'' _ -> whenEqProp (t `T.append` t' `T.append` t'' `T.append` x) "hi \x10437 x"
-                        . whenEqProp pos''' 14 $ bs'' === B.empty
-                      _ -> counterexample "Should have been decoded text." False
-                _ -> counterexample "Should have been an invalid word." False
-          _ -> counterexample "Should have encountered an incomplete code point." False
-    _ -> counterexample "Should have encountered an incomplete code point." False
-
-t_decode_utf32BE_with_error =
-  case E.streamDecodeUtf32With True $ B.pack [0, 0, 0, 104, 0, 0, 0, 105, 0, 0] of -- hi \xe2
-    E.ThusFar t pos bs f -> whenEqProp pos 8
-      . whenEqProp bs (B.pack [0, 0])
-        $ case f $ B.pack [0, 32, 0, 0, 0x26] of
-          E.ThusFar t' pos' bs' f' -> whenEqProp pos' 12
-            . whenEqProp bs' (B.pack [0, 0, 0x26])
-              $ case f' $ B.pack [0x03, 0, 0, 0, 32, 0, 0, 0xD8, 0] of
-                E.InvalidWord t'' pos'' word32 f'' -> whenEqProp pos'' 20
-                  . whenEqProp word32 0x0000D800
-                    $ case f'' $ Just 'x' of
-                      E.ThusFar x pos''' bs'' _ -> whenEqProp (t `T.append` t' `T.append` t'' `T.append` x) "hi ☃ x"
-                        . whenEqProp pos''' 24 $ bs'' === B.empty
-                      _ -> counterexample "Should have been decoded text." False
-                _ -> counterexample "Should have been an invalid word." False
-          _ -> counterexample "Should have encountered an incomplete code point." False
-    _ -> counterexample "Should have encountered an incomplete code point." False
-
-t_decode_utf32LE_with_error =
-  case E.streamDecodeUtf32With False $ B.pack [104, 0, 0, 0, 105, 0, 0, 0, 32, 0] of -- hi \xe2
-    E.ThusFar t pos bs f -> whenEqProp pos 8
-      . whenEqProp bs (B.pack [32, 0])
-        $ case f $ B.pack [0, 0, 0x03, 0x26, 0] of
-          E.ThusFar t' pos' bs' f' -> whenEqProp pos' 12
-            . whenEqProp bs' (B.pack [0x03, 0x26, 0])
-              $ case f' $ B.pack [0, 32, 0, 0, 0, 0, 0xD8, 0, 0] of
-                E.InvalidWord t'' pos'' word32 f'' -> whenEqProp pos'' 20
-                  . whenEqProp word32 0x00D80000
-                    $ case f'' $ Just 'x' of
-                      E.ThusFar x pos''' bs'' _ -> whenEqProp (t `T.append` t' `T.append` t'' `T.append` x) "hi ☃ x"
-                        . whenEqProp pos''' 24 $ bs'' === B.empty
-                      _ -> counterexample "Should have been decoded text." False
-                _ -> counterexample "Should have been an invalid word." False
-          _ -> counterexample "Should have encountered an incomplete code point." False
-    _ -> counterexample "Should have encountered an incomplete code point." False
-
 t_infix_concat bs1 text bs2 =
   forAll (Blind <$> genDecodeErr Replace) $ \(Blind onErr) ->
   text `T.isInfixOf`
@@ -339,18 +349,28 @@ testTranscoding =
     testProperty "tl_latin1" tl_latin1,
     testProperty "t_utf8" t_utf8,
     testProperty "t_utf8'" t_utf8',
+    testProperty "t_utf8_c" t_utf8_c,
     testProperty "t_utf8_incr" t_utf8_incr,
     testProperty "t_utf8_undecoded" t_utf8_undecoded,
     testProperty "tl_utf8" tl_utf8,
     testProperty "tl_utf8'" tl_utf8',
+    testProperty "tl_utf8_c" tl_utf8_c,
     testProperty "t_utf16LE" t_utf16LE,
+    testProperty "t_utf16LE_c" t_utf16LE_c,
     testProperty "tl_utf16LE" tl_utf16LE,
+    testProperty "tl_utf16LE_c" tl_utf16LE_c,
     testProperty "t_utf16BE" t_utf16BE,
+    testProperty "t_utf16BE_c" t_utf16BE_c,
     testProperty "tl_utf16BE" tl_utf16BE,
+    testProperty "tl_utf16BE_c" tl_utf16BE_c,
     testProperty "t_utf32LE" t_utf32LE,
+    testProperty "t_utf32LE_c" t_utf32LE_c,
     testProperty "tl_utf32LE" tl_utf32LE,
+    testProperty "tl_utf32LE_c" tl_utf32LE_c,
     testProperty "t_utf32BE" t_utf32BE,
+    testProperty "t_utf32BE_c" t_utf32BE_c,
     testProperty "tl_utf32BE" tl_utf32BE,
+    testProperty "tl_utf32BE_c" tl_utf32BE_c,
     testGroup "builder" [
       testProperty "t_encodeUtf8Builder" t_encodeUtf8Builder,
       testProperty "t_encodeUtf8Builder_nonZeroOffset" t_encodeUtf8Builder_nonZeroOffset,
@@ -360,9 +380,18 @@ testTranscoding =
     ],
     testGroup "errors" [
       testProperty "t_utf8_err" t_utf8_err,
+      testProperty "t_utf8_c_err" t_utf8_c_err,
       testProperty "t_utf8_err'" t_utf8_err'
     ],
     testGroup "error recovery" [
+      testProperty "t_chunk_decode_utf8_1" t_chunk_decode_utf8_1,
+      testProperty "t_chunk_decode_utf8_2" t_chunk_decode_utf8_2,
+      testProperty "t_chunk_decode_utf8_3" t_chunk_decode_utf8_3,
+      testProperty "t_chunk_decode_utf8_4" t_chunk_decode_utf8_4,
+      testProperty "t_chunk_decode_utf16BE" t_chunk_decode_utf16BE,
+      testProperty "t_chunk_decode_utf16LE" t_chunk_decode_utf16LE,
+      testProperty "t_chunk_decode_utf32BE" t_chunk_decode_utf32BE,
+      testProperty "t_chunk_decode_utf32LE" t_chunk_decode_utf32LE,
       testProperty "t_decode_utf8_lenient" t_decode_utf8_lenient,
       testProperty "t_decode_with_error2" t_decode_with_error2,
       testProperty "t_decode_with_error3" t_decode_with_error3,
@@ -371,15 +400,6 @@ testTranscoding =
       testProperty "t_decode_with_error3'" t_decode_with_error3',
       testProperty "t_decode_with_error4'" t_decode_with_error4',
       testProperty "t_decode_with_error5'" t_decode_with_error5',
-      testProperty "t_decode_utf8_with_error1" t_decode_utf8_with_error1,
-      testProperty "t_decode_utf8_with_error2" t_decode_utf8_with_error2,
-      testProperty "t_decode_utf8_with_error3" t_decode_utf8_with_error3,
-      testProperty "t_decode_utf8_with_error4" t_decode_utf8_with_error4,
-      testProperty "t_decode_utf16BE_with_error" t_decode_utf16BE_with_error,
-      testProperty "t_decode_utf16LE_with_error" t_decode_utf16LE_with_error,
-      testProperty "t_decode_utf32BE_with_error" t_decode_utf32BE_with_error,
-      testProperty "t_decode_utf32LE_with_error" t_decode_utf32LE_with_error,
       testProperty "t_infix_concat" t_infix_concat
     ]
   ]
-  
\ No newline at end of file
diff --git a/text.cabal b/text.cabal
index 488f1451..956d8ce3 100644
--- a/text.cabal
+++ b/text.cabal
@@ -137,6 +137,7 @@ library
     Data.Text.Array
     Data.Text.Encoding
     Data.Text.Encoding.Error
+    Data.Text.Encoding.Types
     Data.Text.Foreign
     Data.Text.IO
     Data.Text.Internal

From c1544101fdaf16c3e433345380a1ad5407853214 Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Sun, 10 Jul 2022 20:23:51 -0600
Subject: [PATCH 15/87] fix documentation

---
 src/Data/Text/Encoding/Types.hs | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/src/Data/Text/Encoding/Types.hs b/src/Data/Text/Encoding/Types.hs
index e791a55e..f50e4920 100644
--- a/src/Data/Text/Encoding/Types.hs
+++ b/src/Data/Text/Encoding/Types.hs
@@ -1,23 +1,13 @@
-{-# LANGUAGE CPP, DeriveDataTypeable #-}
 {-# LANGUAGE Safe #-}
 -- |
--- Module      : Data.Text.Encoding.Error
+-- Module      : Data.Text.Encoding.Types
 -- Copyright   : (c) Bryan O'Sullivan 2009
 --
 -- License     : BSD-style
 -- Maintainer  : bos@serpentine.com
 -- Portability : GHC
 --
--- Types and functions for dealing with encoding and decoding errors
--- in Unicode text.
---
--- The standard functions for encoding and decoding text are strict,
--- which is to say that they throw exceptions on invalid input.  This
--- is often unhelpful on real world input, so alternative functions
--- exist that accept custom handlers for dealing with invalid inputs.
--- These 'OnError' handlers are normal Haskell functions.  You can use
--- one of the presupplied functions in this module, or you can write a
--- custom handler of your own.
+-- Types to indicate the result of an attempt to decode data.
 
 module Data.Text.Encoding.Types
   ( DecodeResult(..) )

From 8414af3cda4d55d1f2a5efdeb78e7e45a299d101 Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Mon, 11 Jul 2022 19:14:38 -0600
Subject: [PATCH 16/87] Lazy and strict stream decoders built from chunk
 decoders

---
 src/Data/Text/Encoding.hs             |  61 ++++++++---
 src/Data/Text/Encoding/Types.hs       |  19 +++-
 src/Data/Text/Lazy/Encoding.hs        |  38 ++++++-
 tests/Tests/Properties/Transcoding.hs | 144 +++++++++++++++++++++++---
 4 files changed, 227 insertions(+), 35 deletions(-)

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index 4ed1edc8..9d21d55b 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -30,13 +30,13 @@ module Data.Text.Encoding
     -- $total
       decodeLatin1
     , decodeUtf8Lenient
+    , DecodeResult(..)
     , decodeUtf8Chunk
     , decodeUtf8Chunks
     , decodeUtf16Chunk
     , decodeUtf16Chunks
     , decodeUtf32Chunk
     , decodeUtf32Chunks
-    , DecodeResult(..)
 
     -- *** Catchable failure
     , decodeUtf8'
@@ -55,6 +55,11 @@ module Data.Text.Encoding
     , streamDecodeUtf8With
     , Decoding(..)
 
+    , StreamDecodeResult(..)
+    , decodeUtf8Stream
+    , decodeUtf16Stream
+    , decodeUtf32Stream
+
     -- ** Partial Functions
     -- $partial
     , decodeASCII
@@ -86,7 +91,7 @@ import qualified Data.ByteString as B
 import qualified Data.ByteString.Internal as B
 import qualified Data.ByteString.Short.Internal as SBS
 import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode)
-import Data.Text.Encoding.Types (DecodeResult(..))
+import Data.Text.Encoding.Types (DecodeResult(..), StreamDecodeResult(..))
 import Data.Text.Internal (Text(..), empty, append)
 import Data.Text.Internal.Unsafe (unsafeWithForeignPtr)
 import Data.Text.Internal.Unsafe.Char (unsafeWrite)
@@ -227,14 +232,14 @@ data Progression
   | NeedMore
   | Invalid
 
-decodeChunksProxy :: (Bits w, Num w, Storable w) =>
+decodeChunks :: (Bits w, Num w, Storable w) =>
   w -- only used for Storable.sizeOf argument which the function discards
   -> (ByteString -> ByteString -> Int -> Maybe (A.Array, Int))
   -> ((Int -> Word8) -> Int -> Int -> Progression)
   -> ByteString
   -> ByteString
   -> DecodeResult Text ByteString w
-decodeChunksProxy w queryOptimization decodeF bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do
+decodeChunks w queryOptimization decodeF bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do
   marr <- A.new len'
   outer marr len' 0 0
   where
@@ -307,16 +312,16 @@ decodeChunksProxy w queryOptimization decodeF bs1@(B.length -> len1) bs2@(B.leng
               contin srcOff' $ \ t bs' ->
                 DecodeResult t (Just $ bytesToWord wordByteSize 0) bs' srcOff'
 
-decodeChunks :: (Bits w, Num w, Storable w) =>
+decodeChunksProxy :: (Bits w, Num w, Storable w) =>
   (ByteString -> ByteString -> Int -> Maybe (A.Array, Int))
   -> ((Int -> Word8) -> Int -> Int -> Progression)
   -> ByteString
   -> ByteString
   -> DecodeResult Text ByteString w
-decodeChunks = decodeChunksProxy undefined -- This allows Haskell can
+decodeChunksProxy = decodeChunks undefined -- This allows Haskell can
 -- determine the size in bytes of a data type using Storable.sizeOf
 -- so that it doesn't have to be passed as an arugment. Storable.sizeOf
--- discards the actual value without resolving it.
+-- discards the actual value without evaluating it.
 
 queryUtf8DecodeOptimization :: ByteString -> ByteString -> Int -> Maybe (A.Array, Int)
 queryUtf8DecodeOptimization (B.length -> len1) bs2@(B.length -> len2) srcOff
@@ -352,21 +357,21 @@ queryUtf8DecodeOptimization (B.length -> len1) bs2@(B.length -> len2) srcOff
 
 decodeUtf8Base :: (Int -> Word8) -> Int -> Int -> Progression
 decodeUtf8Base index len srcOff =
-  let decodeFrom off = step (off + 1) . utf8DecodeStart $ index off
-        where
-          step i (Incomplete a b)
-            | i < len = step (i + 1) $ utf8DecodeContinue (index i) a b
-          step _ st = st
-  in
   case decodeFrom srcOff of
     Accept c -> WriteAndAdvance c (srcOff +)
     Reject -> Invalid
     Incomplete{} -> NeedMore
+  where
+    decodeFrom off = step (off + 1) . utf8DecodeStart $ index off
+
+    step i (Incomplete a b)
+      | i < len = step (i + 1) $ utf8DecodeContinue (index i) a b
+    step _ st = st
 
 -- | Decode two 'ByteString's containing UTF-8-encoded text as though
 -- they were one continuous 'ByteString' returning a 'DecodeResult'.
 decodeUtf8Chunks :: ByteString -> ByteString -> DecodeResult Text ByteString Word8
-decodeUtf8Chunks = decodeChunks queryUtf8DecodeOptimization decodeUtf8Base
+decodeUtf8Chunks = decodeChunksProxy queryUtf8DecodeOptimization decodeUtf8Base
 
 -- | Decode a 'ByteString' containing UTF-8-encoded text returning a
 -- 'DecodeResult'.
@@ -379,7 +384,7 @@ noOptimization _ _ _ = Nothing
 -- | Decode two 'ByteString's containing UTF-16-encoded text as though
 -- they were one continuous 'ByteString' returning a 'DecodeResult'.
 decodeUtf16Chunks :: Bool -> ByteString -> ByteString -> DecodeResult Text ByteString Word16
-decodeUtf16Chunks isBE = decodeChunks noOptimization $ \ index len srcOff ->
+decodeUtf16Chunks isBE = decodeChunksProxy noOptimization $ \ index len srcOff ->
   -- get next Word8 pair
   let writeAndAdvance c n = WriteAndAdvance c $ const n
       b0 = index $ if isBE then srcOff else srcOff + 1
@@ -408,7 +413,7 @@ decodeUtf16Chunk isBE = decodeUtf16Chunks isBE mempty
 -- | Decode two 'ByteString's containing UTF-16-encoded text as though
 -- they were one continuous 'ByteString' returning a 'DecodeResult'.
 decodeUtf32Chunks :: Bool -> ByteString -> ByteString -> DecodeResult Text ByteString Word32
-decodeUtf32Chunks isBE = decodeChunks noOptimization $ \ index _ srcOff ->
+decodeUtf32Chunks isBE = decodeChunksProxy noOptimization $ \ index _ srcOff ->
   -- get next Word8 quartet
   case (queryUtf32Bytes . index $ if isBE then srcOff else srcOff + 3)
       >>= ($ (index $ srcOff + (if isBE then 1 else 2)))
@@ -559,6 +564,30 @@ streamDecodeUtf8With onErr = g empty mempty
             _ -> txt) mempty
         _ -> Some txt bs1' . g empty) bs1'
 
+decodeStream :: Monoid b => (b -> b -> DecodeResult t b w) -> b -> StreamDecodeResult t b w
+decodeStream chunksDecoder = g 0 mempty
+  where
+    g pos bs0 bs1 =
+      let DecodeResult t mW bs1' pos1 = chunksDecoder bs0 bs1
+          pos' = pos + pos1
+      in
+      StreamDecodeResult t mW bs1' pos' $ \ bs2 -> g pos' bs1' bs2
+
+-- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8-
+-- encoded text.
+decodeUtf8Stream :: ByteString -> StreamDecodeResult Text ByteString Word8
+decodeUtf8Stream = decodeStream decodeUtf8Chunks
+
+-- | Decode, in a stream oriented way, a 'ByteString' containing UTF-16-
+-- encoded text.
+decodeUtf16Stream :: Bool -> ByteString -> StreamDecodeResult Text ByteString Word16
+decodeUtf16Stream = decodeStream . decodeUtf16Chunks
+
+-- | Decode, in a stream oriented way, a 'ByteString' containing UTF-32-
+-- encoded text.
+decodeUtf32Stream :: Bool -> ByteString -> StreamDecodeResult Text ByteString Word32
+decodeUtf32Stream = decodeStream . decodeUtf32Chunks
+
 -- | Decode a 'ByteString' containing UTF-8 encoded text that is known
 -- to be valid.
 --
diff --git a/src/Data/Text/Encoding/Types.hs b/src/Data/Text/Encoding/Types.hs
index f50e4920..06b124a9 100644
--- a/src/Data/Text/Encoding/Types.hs
+++ b/src/Data/Text/Encoding/Types.hs
@@ -10,7 +10,9 @@
 -- Types to indicate the result of an attempt to decode data.
 
 module Data.Text.Encoding.Types
-  ( DecodeResult(..) )
+  ( DecodeResult(..)
+  , StreamDecodeResult(..)
+  )
   where
 
 -- | A decoding result on encoded data.
@@ -23,3 +25,18 @@ data DecodeResult t b w = DecodeResult
               -- point was encountered, this is after that code point.
   !Int        -- ^ Byte position of remaining undecoded data.
   deriving (Eq, Ord, Show, Read)
+
+-- | A decoding result on encoded data.
+data StreamDecodeResult t b w = StreamDecodeResult
+  !t          -- ^ The decoded data up to an incomplete code point at
+              -- the end of the input data, an invalid word, or to the
+              -- end of the input.
+  !(Maybe w)  -- ^ If an invalid code point was encountered.
+  !b          -- ^ The remaining undecoded data. If an invald code
+              -- point was encountered, this is after that code point.
+  !Int        -- ^ Byte position of remaining undecoded data. This is
+              -- treated as if all the data fed to previous invocations
+              -- of the continations where one continuous feed.
+  (b -> StreamDecodeResult t b w) -- ^ Continuation to accept the next
+                                  -- span of data to be decoded with
+                                  -- the remaining unencoded data.
diff --git a/src/Data/Text/Lazy/Encoding.hs b/src/Data/Text/Lazy/Encoding.hs
index 62964203..99626efa 100644
--- a/src/Data/Text/Lazy/Encoding.hs
+++ b/src/Data/Text/Lazy/Encoding.hs
@@ -23,9 +23,14 @@ module Data.Text.Lazy.Encoding
     -- ** Total Functions #total#
     -- $total
       decodeLatin1
+    , DecodeResult(..)
     , decodeUtf8Chunk
     , decodeUtf16Chunk
     , decodeUtf32Chunk
+    , StreamDecodeResult(..)
+    , decodeUtf8Stream
+    , decodeUtf16Stream
+    , decodeUtf32Stream
 
     -- *** Catchable failure
     , decodeUtf8'
@@ -61,7 +66,7 @@ module Data.Text.Lazy.Encoding
 import Control.Exception (evaluate, try)
 import Data.Monoid (Monoid(..))
 import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
-import Data.Text.Encoding.Types (DecodeResult(..))
+import Data.Text.Encoding.Types (DecodeResult(..), StreamDecodeResult(..))
 import Data.Text.Internal.Lazy (Text(..), chunk, empty, foldrChunks)
 import Data.Word (Word16, Word32, Word8)
 import qualified Data.ByteString as S
@@ -130,12 +135,39 @@ decodeUtf8Chunk = decodeChunk TE.decodeUtf8Chunks
 -- | Decode a 'ByteString' containing UTF-16-encoded text returning a
 -- 'DecodeResult'.
 decodeUtf16Chunk :: Bool -> B.ByteString -> DecodeResult Text B.ByteString Word16
-decodeUtf16Chunk isBE = decodeChunk $ TE.decodeUtf16Chunks isBE
+decodeUtf16Chunk = decodeChunk . TE.decodeUtf16Chunks
 
 -- | Decode a 'ByteString' containing UTF-32-encoded text returning a
 -- 'DecodeResult'.
 decodeUtf32Chunk :: Bool -> B.ByteString -> DecodeResult Text B.ByteString Word32
-decodeUtf32Chunk isBE = decodeChunk $ TE.decodeUtf32Chunks isBE
+decodeUtf32Chunk = decodeChunk . TE.decodeUtf32Chunks
+
+decodeStream
+  :: (S.ByteString -> S.ByteString -> DecodeResult T.Text S.ByteString w)
+  -> B.ByteString
+  -> StreamDecodeResult Text B.ByteString w
+decodeStream decoder = g 0 mempty
+  where
+    g pos bs0 bs1 =
+      let DecodeResult t mW bs1' pos1 = decodeChunk decoder $ B.append bs0 bs1
+          pos' = pos + pos1
+      in
+      StreamDecodeResult t mW bs1' pos' $ \ bs2 -> g pos' bs1' bs2
+
+-- | Decode a 'ByteString' containing UTF-8-encoded text returning a
+-- 'StreamDecodeResult'.
+decodeUtf8Stream :: B.ByteString -> StreamDecodeResult Text B.ByteString Word8
+decodeUtf8Stream = decodeStream TE.decodeUtf8Chunks
+
+-- | Decode a 'ByteString' containing UTF-16-encoded text returning a
+-- 'StreamDecodeResult'.
+decodeUtf16Stream :: Bool -> B.ByteString -> StreamDecodeResult Text B.ByteString Word16
+decodeUtf16Stream = decodeStream . TE.decodeUtf16Chunks
+
+-- | Decode a 'ByteString' containing UTF-32-encoded text returning a
+-- 'StreamDecodeResult'.
+decodeUtf32Stream :: Bool -> B.ByteString -> StreamDecodeResult Text B.ByteString Word32
+decodeUtf32Stream = decodeStream . TE.decodeUtf32Chunks
 
 -- | Decode a 'ByteString' containing UTF-8 encoded text.
 decodeUtf8With :: OnDecodeError -> B.ByteString -> Text
diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs
index 54a282fe..81767e75 100644
--- a/tests/Tests/Properties/Transcoding.hs
+++ b/tests/Tests/Properties/Transcoding.hs
@@ -44,46 +44,76 @@ whenEqProp a b next = if a == b
 
 t_utf8       = (E.decodeUtf8 . E.encodeUtf8) `eq` id
 t_utf8'      = (E.decodeUtf8' . E.encodeUtf8) `eq` (id . Right)
-tl_utf8      = (EL.decodeUtf8 . EL.encodeUtf8) `eq` id
-tl_utf8'     = (EL.decodeUtf8' . EL.encodeUtf8) `eq` (id . Right)
 t_utf8_c     = (\ t ->
                 let E.DecodeResult t' mC bs _ = E.decodeUtf8Chunk $ E.encodeUtf8 t in
                 (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
+t_utf8_s     = (\ t ->
+                let E.StreamDecodeResult t' mC bs _ _ = E.decodeUtf8Stream $ E.encodeUtf8 t in
+                (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
+tl_utf8      = (EL.decodeUtf8 . EL.encodeUtf8) `eq` id
+tl_utf8'     = (EL.decodeUtf8' . EL.encodeUtf8) `eq` (id . Right)
 tl_utf8_c    = (\ t ->
                 let E.DecodeResult t' mC bs _ = EL.decodeUtf8Chunk $ EL.encodeUtf8 t in
                 (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
+tl_utf8_s    = (\ t ->
+                let E.StreamDecodeResult t' mC bs _ _ = EL.decodeUtf8Stream $ EL.encodeUtf8 t in
+                (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
 t_utf16LE    = (E.decodeUtf16LE . E.encodeUtf16LE) `eq` id
 t_utf16LE_c  = (\ t ->
                 let E.DecodeResult t' mC bs _ = E.decodeUtf16Chunk False $ E.encodeUtf16LE t in
                 (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
+t_utf16LE_s  = (\ t ->
+                let E.StreamDecodeResult t' mC bs _ _ = E.decodeUtf16Stream False $ E.encodeUtf16LE t in
+                (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
+tl_utf16LE   = (EL.decodeUtf16LE . EL.encodeUtf16LE) `eq` id
 tl_utf16LE_c = (\ t ->
                 let E.DecodeResult t' mC bs _ = EL.decodeUtf16Chunk False $ EL.encodeUtf16LE t in
                 (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
-tl_utf16LE   = (EL.decodeUtf16LE . EL.encodeUtf16LE) `eq` id
+tl_utf16LE_s = (\ t ->
+                let E.StreamDecodeResult t' mC bs _ _ = EL.decodeUtf16Stream False $ EL.encodeUtf16LE t in
+                (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
 t_utf16BE    = (E.decodeUtf16BE . E.encodeUtf16BE) `eq` id
 t_utf16BE_c  = (\ t ->
                 let E.DecodeResult t' mC bs _ = E.decodeUtf16Chunk True $ E.encodeUtf16BE t in
                 (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
+t_utf16BE_s  = (\ t ->
+                let E.StreamDecodeResult t' mC bs _ _ = E.decodeUtf16Stream True $ E.encodeUtf16BE t in
+                (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
+tl_utf16BE   = (EL.decodeUtf16BE . EL.encodeUtf16BE) `eq` id
 tl_utf16BE_c = (\ t ->
                 let E.DecodeResult t' mC bs _ = EL.decodeUtf16Chunk True $ EL.encodeUtf16BE t in
                 (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
-tl_utf16BE   = (EL.decodeUtf16BE . EL.encodeUtf16BE) `eq` id
+tl_utf16BE_s = (\ t ->
+                let E.StreamDecodeResult t' mC bs _ _ = EL.decodeUtf16Stream True $ EL.encodeUtf16BE t in
+                (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
 t_utf32LE    = (E.decodeUtf32LE . E.encodeUtf32LE) `eq` id
 t_utf32LE_c  = (\ t ->
                 let E.DecodeResult t' mC bs _ = E.decodeUtf32Chunk False $ E.encodeUtf32LE t in
                 (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
+t_utf32LE_s  = (\ t ->
+                let E.StreamDecodeResult t' mC bs _ _ = E.decodeUtf32Stream False $ E.encodeUtf32LE t in
+                (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
+tl_utf32LE   = (EL.decodeUtf32LE . EL.encodeUtf32LE) `eq` id
 tl_utf32LE_c = (\ t ->
                 let E.DecodeResult t' mC bs _ = EL.decodeUtf32Chunk False $ EL.encodeUtf32LE t in
                 (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
-tl_utf32LE   = (EL.decodeUtf32LE . EL.encodeUtf32LE) `eq` id
+tl_utf32LE_s = (\ t ->
+                let E.StreamDecodeResult t' mC bs _ _ = EL.decodeUtf32Stream False $ EL.encodeUtf32LE t in
+                (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
 t_utf32BE    = (E.decodeUtf32BE . E.encodeUtf32BE) `eq` id
 t_utf32BE_c  = (\ t ->
                 let E.DecodeResult t' mC bs _ = E.decodeUtf32Chunk True $ E.encodeUtf32BE t in
                 (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
+t_utf32BE_s  = (\ t ->
+                let E.StreamDecodeResult t' mC bs _ _ = E.decodeUtf32Stream True $ E.encodeUtf32BE t in
+                (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
+tl_utf32BE   = (EL.decodeUtf32BE . EL.encodeUtf32BE) `eq` id
 tl_utf32BE_c  = (\ t ->
                 let E.DecodeResult t' mC bs _ = EL.decodeUtf32Chunk True $ EL.encodeUtf32BE t in
                 (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
-tl_utf32BE   = (EL.decodeUtf32BE . EL.encodeUtf32BE) `eq` id
+tl_utf32BE_s  = (\ t ->
+                let E.StreamDecodeResult t' mC bs _ _ = EL.decodeUtf32Stream True $ EL.encodeUtf32BE t in
+                (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
 
 runBuilder :: B.Builder -> B.ByteString
 runBuilder =
@@ -168,6 +198,13 @@ t_utf8_c_err bad =
     Just w -> counterexample (show w) True
     _  -> counterexample (show t) $ B.length bs > 0
 
+t_utf8_s_err :: InvalidUtf8 -> Property
+t_utf8_s_err bad =
+  let E.StreamDecodeResult t mW bs _ _ = E.decodeUtf8Stream (toByteString bad) in
+  case mW of
+    Just w -> counterexample (show w) True
+    _  -> counterexample (show t) $ B.length bs > 0
+
 t_utf8_err' :: B.ByteString -> Bool
 t_utf8_err' bs = case E.decodeUtf8' bs of
   Left err -> length (show err) >= 0
@@ -230,11 +267,6 @@ t_chunk_decode_utf8_3 =
     whenEqProp decodeResult1 (E.DecodeResult "" Nothing expectedBs1 0) $
       let decodeResult2 = E.decodeUtf8Chunk $ expectedBs1 `mappend` B.pack [0x83, 32, 0xFF] in
       decodeResult2 === (E.DecodeResult "☃ " (Just 0xFF) mempty 5)
-t_chunk_decode_utf8_4 =
-  let decodeResult0 = E.decodeUtf8Chunk $ B.pack [104, 105, 32, 0xe2, 0x98, 104]
-      expectedBs0 = B.pack [0x98, 104]
-  in
-  decodeResult0 === (E.DecodeResult "hi " (Just 0xe2) expectedBs0 4)
 
 t_chunk_decode_utf16BE =
   let decode = E.decodeUtf16Chunk True
@@ -288,6 +320,54 @@ t_chunk_decode_utf32LE =
       in
       decodeResult2 === (E.DecodeResult "☃ " (Just 0xD80000) expBs2 12)
 
+t_stream_decode_utf8_1 =
+  let (E.StreamDecodeResult t0 mW0 bs0 pos0 _) = E.decodeUtf8Stream $ B.pack [0x68, 0x69, 0x2C, 0x20, 0xe2, 0x98, 0x83, 0x21] in
+  (t0, mW0, bs0, pos0) === ("hi, ☃!", Nothing, mempty, 8)
+t_stream_decode_utf8_2 =
+  let (E.StreamDecodeResult t0 mW0 bs0 pos0 f0) = E.decodeUtf8Stream $ B.pack [97, 0xC2, 97] in
+  whenEqProp (t0, mW0, bs0, pos0) (T.singleton 'a', Just 0xC2, B.singleton 97, 2) $
+    let (E.StreamDecodeResult t1 mW1 bs1 pos1 _) = f0 mempty in
+    (t1, mW1, bs1, pos1) === (T.singleton 'a', Nothing, mempty, 3)
+t_stream_decode_utf8_3 =
+  let (E.StreamDecodeResult t0 mW0 bs0 pos0 f0) = E.decodeUtf8Stream $ B.pack [104, 105, 32, 0xe2] in
+  whenEqProp (t0, mW0, bs0, pos0) ("hi ", Nothing, B.singleton 0xe2, 3) $
+    let (E.StreamDecodeResult t1 mW1 bs1 pos1 f1) = f0 $ B.singleton 0x98 in
+    whenEqProp (t1, mW1, bs1, pos1) ("", Nothing, B.pack [0xe2, 0x98], 3) $
+      let (E.StreamDecodeResult t2 mW2 bs2 pos2 _) = f1 $ B.pack [0x83, 32, 0xFF] in
+      (t2, mW2, bs2, pos2) === ("☃ ", Just 0xFF, mempty, 8)
+
+t_stream_decode_utf16BE =
+  let expectedBs0 = B.pack [0]
+      (E.StreamDecodeResult t0 mW0 bs0 pos0 f0) = E.decodeUtf16Stream True expectedBs0 in
+  whenEqProp (t0, mW0, bs0, pos0) (T.empty, Nothing, expectedBs0, 0) $
+    let (E.StreamDecodeResult t1 mW1 bs1 pos1 f1) = f0 $ B.pack [104, 0, 105, 0, 32, 0xD8, 0x01] in
+    whenEqProp (t1, mW1, bs1, pos1) ("hi ", Nothing, B.pack [0xD8, 0x01], 6) $
+      let (E.StreamDecodeResult t2 mW2 bs2 pos2 _) = f1 $ B.pack [0xDC, 0x37, 0, 32, 0xDC, 0] in
+      (t2, mW2, bs2, pos2) === ("\x10437 ", Just 0xDC00, mempty, 14)
+t_stream_decode_utf16LE =
+  let expectedBs0 = B.pack [104]
+      (E.StreamDecodeResult t0 mW0 bs0 pos0 f0) = E.decodeUtf16Stream False expectedBs0 in
+  whenEqProp (t0, mW0, bs0, pos0) (T.empty, Nothing, expectedBs0, 0) $
+    let (E.StreamDecodeResult t1 mW1 bs1 pos1 f1) = f0 $ B.pack [0, 105, 0, 32, 0, 0x01, 0xD8] in
+    whenEqProp (t1, mW1, bs1, pos1) ("hi ", Nothing, B.pack [0x01, 0xD8], 6) $
+      let (E.StreamDecodeResult t2 mW2 bs2 pos2 _) = f1 $ B.pack [0x37, 0xDC, 32, 0, 0, 0xDC] in
+      (t2, mW2, bs2, pos2) === ("\x10437 ", Just 0xDC, mempty, 14)
+
+t_stream_decode_utf32BE =
+  let (E.StreamDecodeResult t0 mW0 bs0 pos0 f0) = E.decodeUtf32Stream True $ B.pack [0, 0, 0, 104, 0, 0, 0, 105, 0, 0] in
+  whenEqProp (t0, mW0, bs0, pos0) ("hi", Nothing, B.pack [0, 0], 8) $
+    let (E.StreamDecodeResult t1 mW1 bs1 pos1 f1) = f0 $ B.pack [0, 32, 0, 0, 0x26] in
+    whenEqProp (t1, mW1, bs1, pos1) (" ", Nothing, B.pack [0, 0, 0x26], 12) $
+      let (E.StreamDecodeResult t2 mW2 bs2 pos2 _) = f1 $ B.pack [0x03, 0, 0, 0, 32, 0, 0, 0xD8, 0] in
+      (t2, mW2, bs2, pos2) === ("☃ ", Just 0xD800, mempty, 24)
+t_stream_decode_utf32LE =
+  let (E.StreamDecodeResult t0 mW0 bs0 pos0 f0) = E.decodeUtf32Stream False $ B.pack [104, 0, 0, 0, 105, 0, 0, 0, 0x20, 0] in
+  whenEqProp (t0, mW0, bs0, pos0) ("hi", Nothing, B.pack [0x20, 0], 8) $
+    let (E.StreamDecodeResult t1 mW1 bs1 pos1 f1) = f0 $ B.pack [0, 0, 0x03, 0x26, 0] in
+    whenEqProp (t1, mW1, bs1, pos1) (" ", Nothing, B.pack [0x03, 0x26, 0], 12) $
+      let (E.StreamDecodeResult t2 mW2 bs2 pos2 _) = f1 $ B.pack [0, 32, 0, 0, 0, 0, 0xD8, 0, 0] in
+      (t2, mW2, bs2, pos2) === ("☃ ", Just 0xD80000, mempty, 24)
+
 decodeLL :: BL.ByteString -> TL.Text
 decodeLL = EL.decodeUtf8With E.lenientDecode
 
@@ -300,11 +380,27 @@ t_decode_utf8_lenient :: Property
 t_decode_utf8_lenient = forAllShrinkShow arbitrary shrink (show . BL.toChunks) $ \bs ->
     decodeLL bs === (TL.fromStrict . decodeL . B.concat . BL.toChunks) bs
 
+decodeStream decoder snoc bs =
+  g mempty $ decoder bs
+  where
+    g t0 (E.StreamDecodeResult t mW bs' pos f) =
+      let t0' = t0 `mappend` t in
+      case mW of
+        Just _ -> let mC = E.lenientDecode "" mW in
+          let t' = case mC of
+                Just c -> t0' `snoc` c
+                _ -> t0'
+          in g t' $ f mempty
+        _ -> (t0', bs', pos)
+
 -- The decoding of lazy bytestrings should not depend on how they are chunked,
 -- and it should behave the same as decoding of strict bytestrings.
--- t_decode_utf8_chunk :: Property
--- t_decode_utf8_chunk = forAllShrinkShow arbitrary shrink (show . BL.toChunks) $ \bs ->
---     EL.decodeUtf8Chunk bs === (TL.fromStrict . E.decodeUtf8Chunk . B.concat . BL.toChunks) bs
+t_decode_utf8_stream :: Property
+t_decode_utf8_stream = forAllShrinkShow arbitrary shrink (show . BL.toChunks) $ \bs ->
+    decodeStream EL.decodeUtf8Stream TL.snoc bs ===
+      ( let (st, sb, pos) = decodeStream E.decodeUtf8Stream T.snoc . B.concat $ BL.toChunks bs in
+        (TL.fromStrict st, BL.fromStrict sb, pos)
+      )
 
 -- See http://unicode.org/faq/utf_bom.html#gen8
 -- A sequence such as <110xxxxx2 0xxxxxxx2> is illegal ...
@@ -350,27 +446,37 @@ testTranscoding =
     testProperty "t_utf8" t_utf8,
     testProperty "t_utf8'" t_utf8',
     testProperty "t_utf8_c" t_utf8_c,
+    testProperty "t_utf8_s" t_utf8_s,
     testProperty "t_utf8_incr" t_utf8_incr,
     testProperty "t_utf8_undecoded" t_utf8_undecoded,
     testProperty "tl_utf8" tl_utf8,
     testProperty "tl_utf8'" tl_utf8',
     testProperty "tl_utf8_c" tl_utf8_c,
+    testProperty "tl_utf8_s" tl_utf8_s,
     testProperty "t_utf16LE" t_utf16LE,
     testProperty "t_utf16LE_c" t_utf16LE_c,
+    testProperty "t_utf16LE_s" t_utf16LE_s,
     testProperty "tl_utf16LE" tl_utf16LE,
     testProperty "tl_utf16LE_c" tl_utf16LE_c,
+    testProperty "tl_utf16LE_s" tl_utf16LE_s,
     testProperty "t_utf16BE" t_utf16BE,
     testProperty "t_utf16BE_c" t_utf16BE_c,
+    testProperty "t_utf16BE_s" t_utf16BE_s,
     testProperty "tl_utf16BE" tl_utf16BE,
     testProperty "tl_utf16BE_c" tl_utf16BE_c,
+    testProperty "tl_utf16BE_s" tl_utf16BE_s,
     testProperty "t_utf32LE" t_utf32LE,
     testProperty "t_utf32LE_c" t_utf32LE_c,
+    testProperty "t_utf32LE_s" t_utf32LE_s,
     testProperty "tl_utf32LE" tl_utf32LE,
     testProperty "tl_utf32LE_c" tl_utf32LE_c,
+    testProperty "tl_utf32LE_s" tl_utf32LE_s,
     testProperty "t_utf32BE" t_utf32BE,
     testProperty "t_utf32BE_c" t_utf32BE_c,
+    testProperty "t_utf32BE_s" t_utf32BE_s,
     testProperty "tl_utf32BE" tl_utf32BE,
     testProperty "tl_utf32BE_c" tl_utf32BE_c,
+    testProperty "tl_utf32BE_s" tl_utf32BE_s,
     testGroup "builder" [
       testProperty "t_encodeUtf8Builder" t_encodeUtf8Builder,
       testProperty "t_encodeUtf8Builder_nonZeroOffset" t_encodeUtf8Builder_nonZeroOffset,
@@ -381,18 +487,26 @@ testTranscoding =
     testGroup "errors" [
       testProperty "t_utf8_err" t_utf8_err,
       testProperty "t_utf8_c_err" t_utf8_c_err,
+      testProperty "t_utf8_s_err" t_utf8_s_err,
       testProperty "t_utf8_err'" t_utf8_err'
     ],
     testGroup "error recovery" [
       testProperty "t_chunk_decode_utf8_1" t_chunk_decode_utf8_1,
       testProperty "t_chunk_decode_utf8_2" t_chunk_decode_utf8_2,
       testProperty "t_chunk_decode_utf8_3" t_chunk_decode_utf8_3,
-      testProperty "t_chunk_decode_utf8_4" t_chunk_decode_utf8_4,
       testProperty "t_chunk_decode_utf16BE" t_chunk_decode_utf16BE,
       testProperty "t_chunk_decode_utf16LE" t_chunk_decode_utf16LE,
       testProperty "t_chunk_decode_utf32BE" t_chunk_decode_utf32BE,
       testProperty "t_chunk_decode_utf32LE" t_chunk_decode_utf32LE,
+      testProperty "t_stream_decode_utf8_1" t_stream_decode_utf8_1,
+      testProperty "t_stream_decode_utf8_2" t_stream_decode_utf8_2,
+      testProperty "t_stream_decode_utf8_3" t_stream_decode_utf8_3,
+      testProperty "t_stream_decode_utf16BE" t_stream_decode_utf16BE,
+      testProperty "t_stream_decode_utf16LE" t_stream_decode_utf16LE,
+      testProperty "t_stream_decode_utf32BE" t_stream_decode_utf32BE,
+      testProperty "t_stream_decode_utf32LE" t_stream_decode_utf32LE,
       testProperty "t_decode_utf8_lenient" t_decode_utf8_lenient,
+      testProperty "t_decode_utf8_stream" t_decode_utf8_stream,
       testProperty "t_decode_with_error2" t_decode_with_error2,
       testProperty "t_decode_with_error3" t_decode_with_error3,
       testProperty "t_decode_with_error4" t_decode_with_error4,

From e1eded8e829be5d6aa0a4ab5ba5b4540fdd9b6c9 Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Tue, 12 Jul 2022 20:21:10 -0600
Subject: [PATCH 17/87] removed decodeUtf[X]Stream functions but left the
 function chunksDecoderToStream

---
 src/Data/Text/Encoding.hs             | 221 ++++++++++----------------
 src/Data/Text/Encoding/Types.hs       |  14 ++
 src/Data/Text/Lazy/Encoding.hs        |  83 ++++------
 tests/Tests/Properties/Transcoding.hs | 181 +++++++++------------
 4 files changed, 200 insertions(+), 299 deletions(-)

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index 9d21d55b..41752922 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -31,11 +31,8 @@ module Data.Text.Encoding
       decodeLatin1
     , decodeUtf8Lenient
     , DecodeResult(..)
-    , decodeUtf8Chunk
     , decodeUtf8Chunks
-    , decodeUtf16Chunk
     , decodeUtf16Chunks
-    , decodeUtf32Chunk
     , decodeUtf32Chunks
 
     -- *** Catchable failure
@@ -55,11 +52,6 @@ module Data.Text.Encoding
     , streamDecodeUtf8With
     , Decoding(..)
 
-    , StreamDecodeResult(..)
-    , decodeUtf8Stream
-    , decodeUtf16Stream
-    , decodeUtf32Stream
-
     -- ** Partial Functions
     -- $partial
     , decodeASCII
@@ -91,7 +83,7 @@ import qualified Data.ByteString as B
 import qualified Data.ByteString.Internal as B
 import qualified Data.ByteString.Short.Internal as SBS
 import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode)
-import Data.Text.Encoding.Types (DecodeResult(..), StreamDecodeResult(..))
+import Data.Text.Encoding.Types (DecodeResult(..))
 import Data.Text.Internal (Text(..), empty, append)
 import Data.Text.Internal.Unsafe (unsafeWithForeignPtr)
 import Data.Text.Internal.Unsafe.Char (unsafeWrite)
@@ -239,78 +231,81 @@ decodeChunks :: (Bits w, Num w, Storable w) =>
   -> ByteString
   -> ByteString
   -> DecodeResult Text ByteString w
-decodeChunks w queryOptimization decodeF bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do
-  marr <- A.new len'
-  outer marr len' 0 0
-  where
-    wordByteSize = sizeOf w
-
-    index :: Int -> Word8
-    index i
-      | i < len1  = B.index bs1 i
-      | otherwise = B.index bs2 $ i - len1
-
-    len :: Int
-    len = len1 + len2
-    len' :: Int
-    len' = (len `div` wordByteSize) + 4
-
-    -- outer :: (Bits w, Num w) => A.MArray s -> Int -> Int -> Int -> ST s (DecodeResult Text ByteString w)
-    outer dst dstLen = inner
-      where
-        inner srcOff dstOff
-          -- finished
-          | len - srcOff < 1 = goodSoFar
-          -- shortcut for utf-8
-          | otherwise =
-            case queryOptimization bs1 bs2 srcOff of
-              Just (arr, tLen) ->
-                let minLen = tLen + dstOff in
-                if minLen > dstLen
-                  then do
-                    let dstLen' = minLen + 4
-                    dst' <- A.resizeM dst dstLen'
-                    A.copyI tLen dst' dstOff arr 0
-                    outer dst' dstLen' (srcOff + tLen) minLen
-                  else do
-                    A.copyI tLen dst dstOff arr 0
-                    inner (srcOff + tLen) minLen
-              _ -> if len - srcOff < wordByteSize
-                  -- incomplete code point
-                  then goodSoFar
-                  else
-                    if dstOff + 4 > dstLen
-                      -- need more space in destination
-                      then do
-                        let dstLen' = dstLen + 4
-                        dst' <- A.resizeM dst dstLen'
-                        outer dst' dstLen' srcOff dstOff
-                      else
-                        case decodeF index len srcOff of
-                          WriteAndAdvance c advance -> do
-                            d <- unsafeWrite dst dstOff c
-                            inner (advance d) $ dstOff + d
-                          NeedMore -> goodSoFar
-                          Invalid -> invalid
-          where
-            contin off res = do
-              A.shrinkM dst dstOff
-              arr <- A.unsafeFreeze dst
-              pure . res (Text arr 0 dstOff) $ if off >= len1
-                then B.drop (off - len1) bs2
-                else B.drop off $ bs1 `B.append` bs2
-            goodSoFar =
-              contin srcOff $ \ t bs' ->
-                DecodeResult t Nothing bs' srcOff
-            invalid =
-              let srcOff' = srcOff + wordByteSize
-                  bytesToWord n word =
-                    if n > 0
-                      then bytesToWord (n - 1) $ (fromIntegral . index $ srcOff + wordByteSize - n) .|. (word `shiftL` 8)
-                      else word
-              in
-              contin srcOff' $ \ t bs' ->
-                DecodeResult t (Just $ bytesToWord wordByteSize 0) bs' srcOff'
+decodeChunks w queryOptimization decodeF bs1@(B.length -> len1) bs2@(B.length -> len2)
+  | len2 == 0
+  , len1 > 0 = decodeChunks w queryOptimization decodeF bs2 bs1
+  | otherwise = runST $ do
+    marr <- A.new len'
+    outer marr len' 0 0
+    where
+      wordByteSize = sizeOf w
+
+      index :: Int -> Word8
+      index i
+        | i < len1  = B.index bs1 i
+        | otherwise = B.index bs2 $ i - len1
+
+      len :: Int
+      len = len1 + len2
+      len' :: Int
+      len' = (len `div` wordByteSize) + 4
+
+      -- outer :: (Bits w, Num w) => A.MArray s -> Int -> Int -> Int -> ST s (DecodeResult Text ByteString w)
+      outer dst dstLen = inner
+        where
+          inner srcOff dstOff
+            -- finished
+            | len - srcOff < 1 = goodSoFar
+            -- shortcut for utf-8
+            | otherwise =
+              case queryOptimization bs1 bs2 srcOff of
+                Just (arr, tLen) ->
+                  let minLen = tLen + dstOff in
+                  if minLen > dstLen
+                    then do
+                      let dstLen' = minLen + 4
+                      dst' <- A.resizeM dst dstLen'
+                      A.copyI tLen dst' dstOff arr 0
+                      outer dst' dstLen' (srcOff + tLen) minLen
+                    else do
+                      A.copyI tLen dst dstOff arr 0
+                      inner (srcOff + tLen) minLen
+                _ -> if len - srcOff < wordByteSize
+                    -- incomplete code point
+                    then goodSoFar
+                    else
+                      if dstOff + 4 > dstLen
+                        -- need more space in destination
+                        then do
+                          let dstLen' = dstLen + 4
+                          dst' <- A.resizeM dst dstLen'
+                          outer dst' dstLen' srcOff dstOff
+                        else
+                          case decodeF index len srcOff of
+                            WriteAndAdvance c advance -> do
+                              d <- unsafeWrite dst dstOff c
+                              inner (advance d) $ dstOff + d
+                            NeedMore -> goodSoFar
+                            Invalid -> invalid
+            where
+              contin off res = do
+                A.shrinkM dst dstOff
+                arr <- A.unsafeFreeze dst
+                pure . res (Text arr 0 dstOff) $ if off >= len1
+                  then B.drop (off - len1) bs2
+                  else B.drop off $ bs1 `B.append` bs2
+              goodSoFar =
+                contin srcOff $ \ t bs' ->
+                  DecodeResult t Nothing bs' srcOff
+              invalid =
+                let srcOff' = srcOff + wordByteSize
+                    bytesToWord n word =
+                      if n > 0
+                        then bytesToWord (n - 1) $ (fromIntegral . index $ srcOff + wordByteSize - n) .|. (word `shiftL` 8)
+                        else word
+                in
+                contin srcOff' $ \ t bs' ->
+                  DecodeResult t (Just $ bytesToWord wordByteSize 0) bs' srcOff'
 
 decodeChunksProxy :: (Bits w, Num w, Storable w) =>
   (ByteString -> ByteString -> Int -> Maybe (A.Array, Int))
@@ -318,7 +313,7 @@ decodeChunksProxy :: (Bits w, Num w, Storable w) =>
   -> ByteString
   -> ByteString
   -> DecodeResult Text ByteString w
-decodeChunksProxy = decodeChunks undefined -- This allows Haskell can
+decodeChunksProxy = decodeChunks undefined -- This allows Haskell to
 -- determine the size in bytes of a data type using Storable.sizeOf
 -- so that it doesn't have to be passed as an arugment. Storable.sizeOf
 -- discards the actual value without evaluating it.
@@ -355,28 +350,20 @@ queryUtf8DecodeOptimization (B.length -> len1) bs2@(B.length -> len2) srcOff
         w2 = B.index bs2 (len2 - 3)
         w3 = B.index bs2 (len2 - 4)
 
-decodeUtf8Base :: (Int -> Word8) -> Int -> Int -> Progression
-decodeUtf8Base index len srcOff =
-  case decodeFrom srcOff of
-    Accept c -> WriteAndAdvance c (srcOff +)
-    Reject -> Invalid
-    Incomplete{} -> NeedMore
-  where
-    decodeFrom off = step (off + 1) . utf8DecodeStart $ index off
-
-    step i (Incomplete a b)
-      | i < len = step (i + 1) $ utf8DecodeContinue (index i) a b
-    step _ st = st
-
 -- | Decode two 'ByteString's containing UTF-8-encoded text as though
 -- they were one continuous 'ByteString' returning a 'DecodeResult'.
 decodeUtf8Chunks :: ByteString -> ByteString -> DecodeResult Text ByteString Word8
-decodeUtf8Chunks = decodeChunksProxy queryUtf8DecodeOptimization decodeUtf8Base
+decodeUtf8Chunks = decodeChunksProxy queryUtf8DecodeOptimization $ \ index len srcOff ->
+  let decodeFrom off = step (off + 1) . utf8DecodeStart $ index off
 
--- | Decode a 'ByteString' containing UTF-8-encoded text returning a
--- 'DecodeResult'.
-decodeUtf8Chunk :: ByteString -> DecodeResult Text ByteString Word8
-decodeUtf8Chunk = decodeUtf8Chunks mempty
+      step i (Incomplete a b)
+        | i < len = step (i + 1) $ utf8DecodeContinue (index i) a b
+      step _ st = st
+  in
+  case decodeFrom srcOff of
+    Accept c -> WriteAndAdvance c (srcOff +)
+    Reject -> Invalid
+    Incomplete{} -> NeedMore
 
 noOptimization :: p1 -> p2 -> p3 -> Maybe a
 noOptimization _ _ _ = Nothing
@@ -405,11 +392,6 @@ decodeUtf16Chunks isBE = decodeChunksProxy noOptimization $ \ index len srcOff -
             _ -> Invalid
     _ -> Invalid
 
--- | Decode a 'ByteString' containing UTF-16-encoded text returning a
--- 'DecodeResult'.
-decodeUtf16Chunk :: Bool -> ByteString -> DecodeResult Text ByteString Word16
-decodeUtf16Chunk isBE = decodeUtf16Chunks isBE mempty
-
 -- | Decode two 'ByteString's containing UTF-16-encoded text as though
 -- they were one continuous 'ByteString' returning a 'DecodeResult'.
 decodeUtf32Chunks :: Bool -> ByteString -> ByteString -> DecodeResult Text ByteString Word32
@@ -421,11 +403,6 @@ decodeUtf32Chunks isBE = decodeChunksProxy noOptimization $ \ index _ srcOff ->
     Just f -> WriteAndAdvance (f . index $ if isBE then srcOff + 3 else srcOff) . const $ srcOff + 4
     _ -> Invalid
 
--- | Decode a 'ByteString' containing UTF-32-encoded text returning a
--- 'DecodeResult'.
-decodeUtf32Chunk :: Bool -> ByteString -> DecodeResult Text ByteString Word32
-decodeUtf32Chunk isBE = decodeUtf32Chunks isBE mempty
-
 -- | Decode a 'ByteString' containing 7-bit ASCII encoded text.
 --
 -- This is a total function: On success the decoded 'Text' is within a
@@ -564,30 +541,6 @@ streamDecodeUtf8With onErr = g empty mempty
             _ -> txt) mempty
         _ -> Some txt bs1' . g empty) bs1'
 
-decodeStream :: Monoid b => (b -> b -> DecodeResult t b w) -> b -> StreamDecodeResult t b w
-decodeStream chunksDecoder = g 0 mempty
-  where
-    g pos bs0 bs1 =
-      let DecodeResult t mW bs1' pos1 = chunksDecoder bs0 bs1
-          pos' = pos + pos1
-      in
-      StreamDecodeResult t mW bs1' pos' $ \ bs2 -> g pos' bs1' bs2
-
--- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8-
--- encoded text.
-decodeUtf8Stream :: ByteString -> StreamDecodeResult Text ByteString Word8
-decodeUtf8Stream = decodeStream decodeUtf8Chunks
-
--- | Decode, in a stream oriented way, a 'ByteString' containing UTF-16-
--- encoded text.
-decodeUtf16Stream :: Bool -> ByteString -> StreamDecodeResult Text ByteString Word16
-decodeUtf16Stream = decodeStream . decodeUtf16Chunks
-
--- | Decode, in a stream oriented way, a 'ByteString' containing UTF-32-
--- encoded text.
-decodeUtf32Stream :: Bool -> ByteString -> StreamDecodeResult Text ByteString Word32
-decodeUtf32Stream = decodeStream . decodeUtf32Chunks
-
 -- | Decode a 'ByteString' containing UTF-8 encoded text that is known
 -- to be valid.
 --
diff --git a/src/Data/Text/Encoding/Types.hs b/src/Data/Text/Encoding/Types.hs
index 06b124a9..1f806be9 100644
--- a/src/Data/Text/Encoding/Types.hs
+++ b/src/Data/Text/Encoding/Types.hs
@@ -12,6 +12,7 @@
 module Data.Text.Encoding.Types
   ( DecodeResult(..)
   , StreamDecodeResult(..)
+  , chunksDecoderToStream
   )
   where
 
@@ -40,3 +41,16 @@ data StreamDecodeResult t b w = StreamDecodeResult
   (b -> StreamDecodeResult t b w) -- ^ Continuation to accept the next
                                   -- span of data to be decoded with
                                   -- the remaining unencoded data.
+
+-- | Create a stream decoder from a chunks decoder. The resulting
+-- stream decoder will return a 'StreamDecodeResult' which contains a
+-- continuation function that accepts another section of unencoded
+-- data as a continuation of any remaining unencoded data.
+chunksDecoderToStream :: Monoid b => (b -> b -> DecodeResult t b w) -> b -> StreamDecodeResult t b w
+chunksDecoderToStream chunksDecoder = g 0 mempty
+  where
+    g pos bs0 bs1 =
+      let DecodeResult t mW bs1' pos1 = chunksDecoder bs0 bs1
+          pos' = pos + pos1
+      in
+      StreamDecodeResult t mW bs1' pos' $ \ bs2 -> g pos' bs1' bs2
diff --git a/src/Data/Text/Lazy/Encoding.hs b/src/Data/Text/Lazy/Encoding.hs
index 99626efa..1e1360fa 100644
--- a/src/Data/Text/Lazy/Encoding.hs
+++ b/src/Data/Text/Lazy/Encoding.hs
@@ -24,13 +24,9 @@ module Data.Text.Lazy.Encoding
     -- $total
       decodeLatin1
     , DecodeResult(..)
-    , decodeUtf8Chunk
-    , decodeUtf16Chunk
-    , decodeUtf32Chunk
-    , StreamDecodeResult(..)
-    , decodeUtf8Stream
-    , decodeUtf16Stream
-    , decodeUtf32Stream
+    , decodeUtf8Chunks
+    , decodeUtf16Chunks
+    , decodeUtf32Chunks
 
     -- *** Catchable failure
     , decodeUtf8'
@@ -66,7 +62,7 @@ module Data.Text.Lazy.Encoding
 import Control.Exception (evaluate, try)
 import Data.Monoid (Monoid(..))
 import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
-import Data.Text.Encoding.Types (DecodeResult(..), StreamDecodeResult(..))
+import Data.Text.Encoding.Types (DecodeResult(..))
 import Data.Text.Internal.Lazy (Text(..), chunk, empty, foldrChunks)
 import Data.Word (Word16, Word32, Word8)
 import qualified Data.ByteString as S
@@ -113,61 +109,36 @@ decodeASCII = foldr (chunk . TE.decodeASCII) empty . B.toChunks
 decodeLatin1 :: B.ByteString -> Text
 decodeLatin1 = foldr (chunk . TE.decodeLatin1) empty . B.toChunks
 
-decodeChunk :: (S.ByteString -> S.ByteString -> DecodeResult T.Text S.ByteString w)
+decodeChunks :: (S.ByteString -> S.ByteString -> DecodeResult T.Text S.ByteString w)
+  -> B.ByteString
   -> B.ByteString
   -> DecodeResult Text B.ByteString w
-decodeChunk decoder = g id 0 mempty
+decodeChunks decoder = g id 0 mempty
   where
-    g tDiff pos sb0 (B.Chunk sb1 lb1) =
+    g tDiff pos sb0 (B.Chunk sb1 lb1) lb2 =
       let DecodeResult t mW sb1' pos' = decoder sb0 sb1
           pos1 = pos + pos'
       in
       case mW of
-        Just _ -> DecodeResult (tDiff (chunk t Empty)) mW (B.chunk sb1' lb1) pos1
-        _ -> (g (tDiff . chunk t) pos1 sb1' lb1)
-    g tDiff pos sb0 _ = DecodeResult (tDiff Empty) Nothing (B.chunk sb0 B.Empty) pos
-
--- | Decode a 'ByteString' containing UTF-8-encoded text returning a
--- 'DecodeResult'.
-decodeUtf8Chunk :: B.ByteString -> DecodeResult Text B.ByteString Word8
-decodeUtf8Chunk = decodeChunk TE.decodeUtf8Chunks
-
--- | Decode a 'ByteString' containing UTF-16-encoded text returning a
--- 'DecodeResult'.
-decodeUtf16Chunk :: Bool -> B.ByteString -> DecodeResult Text B.ByteString Word16
-decodeUtf16Chunk = decodeChunk . TE.decodeUtf16Chunks
-
--- | Decode a 'ByteString' containing UTF-32-encoded text returning a
--- 'DecodeResult'.
-decodeUtf32Chunk :: Bool -> B.ByteString -> DecodeResult Text B.ByteString Word32
-decodeUtf32Chunk = decodeChunk . TE.decodeUtf32Chunks
-
-decodeStream
-  :: (S.ByteString -> S.ByteString -> DecodeResult T.Text S.ByteString w)
-  -> B.ByteString
-  -> StreamDecodeResult Text B.ByteString w
-decodeStream decoder = g 0 mempty
-  where
-    g pos bs0 bs1 =
-      let DecodeResult t mW bs1' pos1 = decodeChunk decoder $ B.append bs0 bs1
-          pos' = pos + pos1
-      in
-      StreamDecodeResult t mW bs1' pos' $ \ bs2 -> g pos' bs1' bs2
-
--- | Decode a 'ByteString' containing UTF-8-encoded text returning a
--- 'StreamDecodeResult'.
-decodeUtf8Stream :: B.ByteString -> StreamDecodeResult Text B.ByteString Word8
-decodeUtf8Stream = decodeStream TE.decodeUtf8Chunks
-
--- | Decode a 'ByteString' containing UTF-16-encoded text returning a
--- 'StreamDecodeResult'.
-decodeUtf16Stream :: Bool -> B.ByteString -> StreamDecodeResult Text B.ByteString Word16
-decodeUtf16Stream = decodeStream . TE.decodeUtf16Chunks
-
--- | Decode a 'ByteString' containing UTF-32-encoded text returning a
--- 'StreamDecodeResult'.
-decodeUtf32Stream :: Bool -> B.ByteString -> StreamDecodeResult Text B.ByteString Word32
-decodeUtf32Stream = decodeStream . TE.decodeUtf32Chunks
+        Just _ -> DecodeResult (tDiff $ chunk t Empty) mW (B.chunk sb1' $ lb1 `B.append` lb2) pos1
+        _ -> g (tDiff . chunk t) pos1 sb1' lb1 lb2
+    g tDiff pos sb0 _ (B.Chunk sb1 lb1) = g tDiff pos sb0 (B.Chunk sb1 lb1) mempty
+    g tDiff pos sb0 _ _ = DecodeResult (tDiff Empty) Nothing (B.chunk sb0 mempty) pos
+
+-- | Decode two 'ByteString's containing UTF-8-encoded text as though
+-- they were one continuous 'ByteString' returning a 'DecodeResult'.
+decodeUtf8Chunks :: B.ByteString -> B.ByteString -> DecodeResult Text B.ByteString Word8
+decodeUtf8Chunks = decodeChunks TE.decodeUtf8Chunks
+
+-- | Decode two 'ByteString's containing UTF-16-encoded text as though
+-- they were one continuous 'ByteString' returning a 'DecodeResult'.
+decodeUtf16Chunks :: Bool -> B.ByteString -> B.ByteString -> DecodeResult Text B.ByteString Word16
+decodeUtf16Chunks = decodeChunks . TE.decodeUtf16Chunks
+
+-- | Decode two 'ByteString's containing UTF-32-encoded text as though
+-- they were one continuous 'ByteString' returning a 'DecodeResult'.
+decodeUtf32Chunks :: Bool -> B.ByteString -> B.ByteString -> DecodeResult Text B.ByteString Word32
+decodeUtf32Chunks = decodeChunks . TE.decodeUtf32Chunks
 
 -- | Decode a 'ByteString' containing UTF-8 encoded text.
 decodeUtf8With :: OnDecodeError -> B.ByteString -> Text
diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs
index 81767e75..bc1c36e4 100644
--- a/tests/Tests/Properties/Transcoding.hs
+++ b/tests/Tests/Properties/Transcoding.hs
@@ -24,6 +24,7 @@ import qualified Data.ByteString.Lazy.Char8 as BLC
 import qualified Data.Text as T
 import qualified Data.Text.Encoding as E
 import qualified Data.Text.Encoding.Error as E
+import qualified Data.Text.Encoding.Types as Ty
 import qualified Data.Text.Lazy as TL
 import qualified Data.Text.Lazy.Encoding as EL
 
@@ -45,74 +46,44 @@ whenEqProp a b next = if a == b
 t_utf8       = (E.decodeUtf8 . E.encodeUtf8) `eq` id
 t_utf8'      = (E.decodeUtf8' . E.encodeUtf8) `eq` (id . Right)
 t_utf8_c     = (\ t ->
-                let E.DecodeResult t' mC bs _ = E.decodeUtf8Chunk $ E.encodeUtf8 t in
-                (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
-t_utf8_s     = (\ t ->
-                let E.StreamDecodeResult t' mC bs _ _ = E.decodeUtf8Stream $ E.encodeUtf8 t in
+                let E.DecodeResult t' mC bs _ = E.decodeUtf8Chunks mempty $ E.encodeUtf8 t in
                 (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
 tl_utf8      = (EL.decodeUtf8 . EL.encodeUtf8) `eq` id
 tl_utf8'     = (EL.decodeUtf8' . EL.encodeUtf8) `eq` (id . Right)
 tl_utf8_c    = (\ t ->
-                let E.DecodeResult t' mC bs _ = EL.decodeUtf8Chunk $ EL.encodeUtf8 t in
-                (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
-tl_utf8_s    = (\ t ->
-                let E.StreamDecodeResult t' mC bs _ _ = EL.decodeUtf8Stream $ EL.encodeUtf8 t in
+                let E.DecodeResult t' mC bs _ = EL.decodeUtf8Chunks mempty $ EL.encodeUtf8 t in
                 (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
 t_utf16LE    = (E.decodeUtf16LE . E.encodeUtf16LE) `eq` id
 t_utf16LE_c  = (\ t ->
-                let E.DecodeResult t' mC bs _ = E.decodeUtf16Chunk False $ E.encodeUtf16LE t in
-                (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
-t_utf16LE_s  = (\ t ->
-                let E.StreamDecodeResult t' mC bs _ _ = E.decodeUtf16Stream False $ E.encodeUtf16LE t in
+                let E.DecodeResult t' mC bs _ = E.decodeUtf16Chunks False mempty $ E.encodeUtf16LE t in
                 (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
 tl_utf16LE   = (EL.decodeUtf16LE . EL.encodeUtf16LE) `eq` id
 tl_utf16LE_c = (\ t ->
-                let E.DecodeResult t' mC bs _ = EL.decodeUtf16Chunk False $ EL.encodeUtf16LE t in
-                (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
-tl_utf16LE_s = (\ t ->
-                let E.StreamDecodeResult t' mC bs _ _ = EL.decodeUtf16Stream False $ EL.encodeUtf16LE t in
+                let E.DecodeResult t' mC bs _ = EL.decodeUtf16Chunks False mempty $ EL.encodeUtf16LE t in
                 (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
 t_utf16BE    = (E.decodeUtf16BE . E.encodeUtf16BE) `eq` id
 t_utf16BE_c  = (\ t ->
-                let E.DecodeResult t' mC bs _ = E.decodeUtf16Chunk True $ E.encodeUtf16BE t in
-                (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
-t_utf16BE_s  = (\ t ->
-                let E.StreamDecodeResult t' mC bs _ _ = E.decodeUtf16Stream True $ E.encodeUtf16BE t in
+                let E.DecodeResult t' mC bs _ = E.decodeUtf16Chunks True mempty $ E.encodeUtf16BE t in
                 (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
 tl_utf16BE   = (EL.decodeUtf16BE . EL.encodeUtf16BE) `eq` id
 tl_utf16BE_c = (\ t ->
-                let E.DecodeResult t' mC bs _ = EL.decodeUtf16Chunk True $ EL.encodeUtf16BE t in
-                (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
-tl_utf16BE_s = (\ t ->
-                let E.StreamDecodeResult t' mC bs _ _ = EL.decodeUtf16Stream True $ EL.encodeUtf16BE t in
+                let E.DecodeResult t' mC bs _ = EL.decodeUtf16Chunks True mempty $ EL.encodeUtf16BE t in
                 (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
 t_utf32LE    = (E.decodeUtf32LE . E.encodeUtf32LE) `eq` id
 t_utf32LE_c  = (\ t ->
-                let E.DecodeResult t' mC bs _ = E.decodeUtf32Chunk False $ E.encodeUtf32LE t in
-                (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
-t_utf32LE_s  = (\ t ->
-                let E.StreamDecodeResult t' mC bs _ _ = E.decodeUtf32Stream False $ E.encodeUtf32LE t in
+                let E.DecodeResult t' mC bs _ = E.decodeUtf32Chunks False mempty $ E.encodeUtf32LE t in
                 (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
 tl_utf32LE   = (EL.decodeUtf32LE . EL.encodeUtf32LE) `eq` id
 tl_utf32LE_c = (\ t ->
-                let E.DecodeResult t' mC bs _ = EL.decodeUtf32Chunk False $ EL.encodeUtf32LE t in
-                (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
-tl_utf32LE_s = (\ t ->
-                let E.StreamDecodeResult t' mC bs _ _ = EL.decodeUtf32Stream False $ EL.encodeUtf32LE t in
+                let E.DecodeResult t' mC bs _ = EL.decodeUtf32Chunks False mempty $ EL.encodeUtf32LE t in
                 (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
 t_utf32BE    = (E.decodeUtf32BE . E.encodeUtf32BE) `eq` id
 t_utf32BE_c  = (\ t ->
-                let E.DecodeResult t' mC bs _ = E.decodeUtf32Chunk True $ E.encodeUtf32BE t in
-                (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
-t_utf32BE_s  = (\ t ->
-                let E.StreamDecodeResult t' mC bs _ _ = E.decodeUtf32Stream True $ E.encodeUtf32BE t in
+                let E.DecodeResult t' mC bs _ = E.decodeUtf32Chunks True mempty $ E.encodeUtf32BE t in
                 (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
 tl_utf32BE   = (EL.decodeUtf32BE . EL.encodeUtf32BE) `eq` id
 tl_utf32BE_c  = (\ t ->
-                let E.DecodeResult t' mC bs _ = EL.decodeUtf32Chunk True $ EL.encodeUtf32BE t in
-                (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
-tl_utf32BE_s  = (\ t ->
-                let E.StreamDecodeResult t' mC bs _ _ = EL.decodeUtf32Stream True $ EL.encodeUtf32BE t in
+                let E.DecodeResult t' mC bs _ = EL.decodeUtf32Chunks True mempty $ EL.encodeUtf32BE t in
                 (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
 
 runBuilder :: B.Builder -> B.ByteString
@@ -193,14 +164,7 @@ t_utf8_err bad de = forAll (Blind <$> genDecodeErr de) $ \(Blind onErr) -> ioPro
 
 t_utf8_c_err :: InvalidUtf8 -> Property
 t_utf8_c_err bad =
-  let E.DecodeResult t mW bs _ = E.decodeUtf8Chunk (toByteString bad) in
-  case mW of
-    Just w -> counterexample (show w) True
-    _  -> counterexample (show t) $ B.length bs > 0
-
-t_utf8_s_err :: InvalidUtf8 -> Property
-t_utf8_s_err bad =
-  let E.StreamDecodeResult t mW bs _ _ = E.decodeUtf8Stream (toByteString bad) in
+  let E.DecodeResult t mW bs _ = E.decodeUtf8Chunks mempty $ toByteString bad in
   case mW of
     Just w -> counterexample (show w) True
     _  -> counterexample (show t) $ B.length bs > 0
@@ -247,125 +211,133 @@ genInvalidUTF8 = B.pack <$> oneof [
     ord4_ n = map fromIntegral [(n `shiftR` 18) + 0xF0, ((n `shiftR` 12) .&. 0x3F) + 0x80, ((n `shiftR` 6) .&. 0x3F) + 0x80, (n .&. 0x3F) + 0x80]
 
 t_chunk_decode_utf8_1 =
-  let decodeResult0 = E.decodeUtf8Chunk $ B.pack [0x68, 0x69, 0x2C, 0x20, 0xe2, 0x98, 0x83, 0x21] in
-  decodeResult0 === (E.DecodeResult "hi, ☃!" Nothing mempty 8)
+  let decodeResult0 = E.decodeUtf8Chunks mempty $ B.pack [0x68, 0x69, 0x2C, 0x20, 0xe2, 0x98, 0x83, 0x21] in
+  decodeResult0 === E.DecodeResult "hi, ☃!" Nothing mempty 8
 t_chunk_decode_utf8_2 =
-  let decodeResult0 = E.decodeUtf8Chunk $ B.pack [97, 0xC2, 97]
+  let decode = E.decodeUtf8Chunks mempty
+      decodeResult0 = decode $ B.pack [97, 0xC2, 97]
       expectedBs0 = B.singleton 97
   in
   whenEqProp decodeResult0 (E.DecodeResult (T.singleton 'a') (Just 0xC2) expectedBs0 2) $
-    let decodeResult1 = E.decodeUtf8Chunk expectedBs0 in
-    decodeResult1 === (E.DecodeResult (T.singleton 'a') Nothing mempty 1)
+    let decodeResult1 = decode expectedBs0 in
+    decodeResult1 === E.DecodeResult (T.singleton 'a') Nothing mempty 1
 t_chunk_decode_utf8_3 =
-  let decodeResult0 = E.decodeUtf8Chunk $ B.pack [104, 105, 32, 0xe2]
+  let decode = E.decodeUtf8Chunks
+      decodeResult0 = decode mempty $ B.pack [104, 105, 32, 0xe2]
       expectedBs0 = B.singleton 0xe2
   in -- hi \xe2
   whenEqProp decodeResult0 (E.DecodeResult "hi " Nothing expectedBs0 3) $
-    let decodeResult1 = E.decodeUtf8Chunk $ expectedBs0 `B.snoc` 0x98
+    let decodeResult1 = decode expectedBs0 $ B.singleton 0x98
         expectedBs1 = B.pack [0xe2, 0x98]
     in
     whenEqProp decodeResult1 (E.DecodeResult "" Nothing expectedBs1 0) $
-      let decodeResult2 = E.decodeUtf8Chunk $ expectedBs1 `mappend` B.pack [0x83, 32, 0xFF] in
-      decodeResult2 === (E.DecodeResult "☃ " (Just 0xFF) mempty 5)
+      let decodeResult2 = decode expectedBs1 $ B.pack [0x83, 32, 0xFF] in
+      decodeResult2 === E.DecodeResult "☃ " (Just 0xFF) mempty 5
 
 t_chunk_decode_utf16BE =
-  let decode = E.decodeUtf16Chunk True
+  let decode = E.decodeUtf16Chunks True
       expectedBs0 = B.pack [0]
-      decodeResult0 = decode expectedBs0 in
+      decodeResult0 = decode mempty expectedBs0 in
   whenEqProp decodeResult0 (E.DecodeResult T.empty Nothing expectedBs0 0) $
-    let decodeResult1 = decode $ expectedBs0 `B.append` B.pack [104, 0, 105, 0, 32, 0xD8, 0x01]
+    let decodeResult1 = decode expectedBs0 $ B.pack [104, 0, 105, 0, 32, 0xD8, 0x01]
         expectedBs1 = B.pack [0xD8, 0x01]
     in
     whenEqProp decodeResult1 (E.DecodeResult "hi " Nothing expectedBs1 6) $
-      let decodeResult2 = decode $ expectedBs1 `B.append` B.pack [0xDC, 0x37, 0, 32, 0xDC, 0] in
-      decodeResult2 === (E.DecodeResult "\x10437 " (Just 0xDC00) mempty 8)
+      let decodeResult2 = decode expectedBs1 $ B.pack [0xDC, 0x37, 0, 32, 0xDC, 0] in
+      decodeResult2 === E.DecodeResult "\x10437 " (Just 0xDC00) mempty 8
 t_chunk_decode_utf16LE =
-  let decode = E.decodeUtf16Chunk False
+  let decode = E.decodeUtf16Chunks False
       expectedBs0 = B.pack [104]
-      decodeResult0 = decode expectedBs0 in
+      decodeResult0 = decode mempty expectedBs0 in
   whenEqProp decodeResult0 (E.DecodeResult T.empty Nothing expectedBs0 0) $
-    let decodeResult1 = decode $ expectedBs0 `B.append` B.pack [0, 105, 0, 32, 0, 0x01, 0xD8]
+    let decodeResult1 = decode expectedBs0 $ B.pack [0, 105, 0, 32, 0, 0x01, 0xD8]
         expectedBs1 = B.pack [0x01, 0xD8]
     in
     whenEqProp decodeResult1 (E.DecodeResult "hi " Nothing expectedBs1 6) $
-      let decodeResult2 = decode $ expectedBs1 `B.append` B.pack [0x37, 0xDC, 32, 0, 0, 0xDC] in
-      decodeResult2 === (E.DecodeResult "\x10437 " (Just 0xDC) mempty 8)
+      let decodeResult2 = decode expectedBs1 $ B.pack [0x37, 0xDC, 32, 0, 0, 0xDC] in
+      decodeResult2 === E.DecodeResult "\x10437 " (Just 0xDC) mempty 8
 
 t_chunk_decode_utf32BE =
-  let decode = E.decodeUtf32Chunk True
+  let decode = E.decodeUtf32Chunks True
       expBs0 = B.pack [0, 0]
-      decodeResult0 = decode $ B.pack [0, 0, 0, 104, 0, 0, 0, 105, 0, 0]
+      decodeResult0 = decode mempty $ B.pack [0, 0, 0, 104, 0, 0, 0, 105, 0, 0]
   in
   whenEqProp decodeResult0 (E.DecodeResult "hi" Nothing expBs0 8) $
     let expBs1 = B.pack [0, 0, 0x26]
-        decodeResult1 = decode $ expBs0 `mappend` (B.pack [0, 32, 0, 0, 0x26])
+        decodeResult1 = decode expBs0 $ B.pack [0, 32, 0, 0, 0x26]
     in
     whenEqProp decodeResult1 (E.DecodeResult " " Nothing expBs1 4) $
       let expBs2 = mempty
-          decodeResult2 = decode $ expBs1 `mappend` (B.pack [0x03, 0, 0, 0, 32, 0, 0, 0xD8, 0])
+          decodeResult2 = decode expBs1 $ B.pack [0x03, 0, 0, 0, 32, 0, 0, 0xD8, 0]
       in
-      decodeResult2 === (E.DecodeResult "☃ " (Just 0xD800) expBs2 12)
+      decodeResult2 === E.DecodeResult "☃ " (Just 0xD800) expBs2 12
 t_chunk_decode_utf32LE =
-  let decode = E.decodeUtf32Chunk False
+  let decode = E.decodeUtf32Chunks False
       expBs0 = B.pack [0x20, 0]
-      decodeResult0 = decode $ B.pack [104, 0, 0, 0, 105, 0, 0, 0, 0x20, 0]
+      decodeResult0 = decode mempty $ B.pack [104, 0, 0, 0, 105, 0, 0, 0, 0x20, 0]
   in
   whenEqProp decodeResult0 (E.DecodeResult "hi" Nothing expBs0 8) $
     let expBs1 = B.pack [0x03, 0x26, 0]
-        decodeResult1 = decode $ expBs0 `mappend` (B.pack [0, 0, 0x03, 0x26, 0])
+        decodeResult1 = decode expBs0 $ B.pack [0, 0, 0x03, 0x26, 0]
     in
     whenEqProp decodeResult1 (E.DecodeResult " " Nothing expBs1 4) $
       let expBs2 = mempty
-          decodeResult2 = decode $ expBs1 `mappend` (B.pack [0, 32, 0, 0, 0, 0, 0xD8, 0, 0])
+          decodeResult2 = decode expBs1 $ B.pack [0, 32, 0, 0, 0, 0, 0xD8, 0, 0]
       in
-      decodeResult2 === (E.DecodeResult "☃ " (Just 0xD80000) expBs2 12)
+      decodeResult2 === E.DecodeResult "☃ " (Just 0xD80000) expBs2 12
+
+decodeUtf8StreamS = Ty.chunksDecoderToStream E.decodeUtf8Chunks
 
 t_stream_decode_utf8_1 =
-  let (E.StreamDecodeResult t0 mW0 bs0 pos0 _) = E.decodeUtf8Stream $ B.pack [0x68, 0x69, 0x2C, 0x20, 0xe2, 0x98, 0x83, 0x21] in
+  let (Ty.StreamDecodeResult t0 mW0 bs0 pos0 _) = decodeUtf8StreamS $ B.pack [0x68, 0x69, 0x2C, 0x20, 0xe2, 0x98, 0x83, 0x21] in
   (t0, mW0, bs0, pos0) === ("hi, ☃!", Nothing, mempty, 8)
 t_stream_decode_utf8_2 =
-  let (E.StreamDecodeResult t0 mW0 bs0 pos0 f0) = E.decodeUtf8Stream $ B.pack [97, 0xC2, 97] in
+  let (Ty.StreamDecodeResult t0 mW0 bs0 pos0 f0) = decodeUtf8StreamS $ B.pack [97, 0xC2, 97] in
   whenEqProp (t0, mW0, bs0, pos0) (T.singleton 'a', Just 0xC2, B.singleton 97, 2) $
-    let (E.StreamDecodeResult t1 mW1 bs1 pos1 _) = f0 mempty in
+    let (Ty.StreamDecodeResult t1 mW1 bs1 pos1 _) = f0 mempty in
     (t1, mW1, bs1, pos1) === (T.singleton 'a', Nothing, mempty, 3)
 t_stream_decode_utf8_3 =
-  let (E.StreamDecodeResult t0 mW0 bs0 pos0 f0) = E.decodeUtf8Stream $ B.pack [104, 105, 32, 0xe2] in
+  let (Ty.StreamDecodeResult t0 mW0 bs0 pos0 f0) = decodeUtf8StreamS $ B.pack [104, 105, 32, 0xe2] in
   whenEqProp (t0, mW0, bs0, pos0) ("hi ", Nothing, B.singleton 0xe2, 3) $
-    let (E.StreamDecodeResult t1 mW1 bs1 pos1 f1) = f0 $ B.singleton 0x98 in
+    let (Ty.StreamDecodeResult t1 mW1 bs1 pos1 f1) = f0 $ B.singleton 0x98 in
     whenEqProp (t1, mW1, bs1, pos1) ("", Nothing, B.pack [0xe2, 0x98], 3) $
-      let (E.StreamDecodeResult t2 mW2 bs2 pos2 _) = f1 $ B.pack [0x83, 32, 0xFF] in
+      let (Ty.StreamDecodeResult t2 mW2 bs2 pos2 _) = f1 $ B.pack [0x83, 32, 0xFF] in
       (t2, mW2, bs2, pos2) === ("☃ ", Just 0xFF, mempty, 8)
 
+decodeUtf16StreamS = Ty.chunksDecoderToStream . E.decodeUtf16Chunks
+
 t_stream_decode_utf16BE =
   let expectedBs0 = B.pack [0]
-      (E.StreamDecodeResult t0 mW0 bs0 pos0 f0) = E.decodeUtf16Stream True expectedBs0 in
+      (Ty.StreamDecodeResult t0 mW0 bs0 pos0 f0) = decodeUtf16StreamS True expectedBs0 in
   whenEqProp (t0, mW0, bs0, pos0) (T.empty, Nothing, expectedBs0, 0) $
-    let (E.StreamDecodeResult t1 mW1 bs1 pos1 f1) = f0 $ B.pack [104, 0, 105, 0, 32, 0xD8, 0x01] in
+    let (Ty.StreamDecodeResult t1 mW1 bs1 pos1 f1) = f0 $ B.pack [104, 0, 105, 0, 32, 0xD8, 0x01] in
     whenEqProp (t1, mW1, bs1, pos1) ("hi ", Nothing, B.pack [0xD8, 0x01], 6) $
-      let (E.StreamDecodeResult t2 mW2 bs2 pos2 _) = f1 $ B.pack [0xDC, 0x37, 0, 32, 0xDC, 0] in
+      let (Ty.StreamDecodeResult t2 mW2 bs2 pos2 _) = f1 $ B.pack [0xDC, 0x37, 0, 32, 0xDC, 0] in
       (t2, mW2, bs2, pos2) === ("\x10437 ", Just 0xDC00, mempty, 14)
 t_stream_decode_utf16LE =
   let expectedBs0 = B.pack [104]
-      (E.StreamDecodeResult t0 mW0 bs0 pos0 f0) = E.decodeUtf16Stream False expectedBs0 in
+      (Ty.StreamDecodeResult t0 mW0 bs0 pos0 f0) = decodeUtf16StreamS False expectedBs0 in
   whenEqProp (t0, mW0, bs0, pos0) (T.empty, Nothing, expectedBs0, 0) $
-    let (E.StreamDecodeResult t1 mW1 bs1 pos1 f1) = f0 $ B.pack [0, 105, 0, 32, 0, 0x01, 0xD8] in
+    let (Ty.StreamDecodeResult t1 mW1 bs1 pos1 f1) = f0 $ B.pack [0, 105, 0, 32, 0, 0x01, 0xD8] in
     whenEqProp (t1, mW1, bs1, pos1) ("hi ", Nothing, B.pack [0x01, 0xD8], 6) $
-      let (E.StreamDecodeResult t2 mW2 bs2 pos2 _) = f1 $ B.pack [0x37, 0xDC, 32, 0, 0, 0xDC] in
+      let (Ty.StreamDecodeResult t2 mW2 bs2 pos2 _) = f1 $ B.pack [0x37, 0xDC, 32, 0, 0, 0xDC] in
       (t2, mW2, bs2, pos2) === ("\x10437 ", Just 0xDC, mempty, 14)
 
+decodeUtf32StreamS = Ty.chunksDecoderToStream . E.decodeUtf32Chunks
+
 t_stream_decode_utf32BE =
-  let (E.StreamDecodeResult t0 mW0 bs0 pos0 f0) = E.decodeUtf32Stream True $ B.pack [0, 0, 0, 104, 0, 0, 0, 105, 0, 0] in
+  let (Ty.StreamDecodeResult t0 mW0 bs0 pos0 f0) = decodeUtf32StreamS True $ B.pack [0, 0, 0, 104, 0, 0, 0, 105, 0, 0] in
   whenEqProp (t0, mW0, bs0, pos0) ("hi", Nothing, B.pack [0, 0], 8) $
-    let (E.StreamDecodeResult t1 mW1 bs1 pos1 f1) = f0 $ B.pack [0, 32, 0, 0, 0x26] in
+    let (Ty.StreamDecodeResult t1 mW1 bs1 pos1 f1) = f0 $ B.pack [0, 32, 0, 0, 0x26] in
     whenEqProp (t1, mW1, bs1, pos1) (" ", Nothing, B.pack [0, 0, 0x26], 12) $
-      let (E.StreamDecodeResult t2 mW2 bs2 pos2 _) = f1 $ B.pack [0x03, 0, 0, 0, 32, 0, 0, 0xD8, 0] in
+      let (Ty.StreamDecodeResult t2 mW2 bs2 pos2 _) = f1 $ B.pack [0x03, 0, 0, 0, 32, 0, 0, 0xD8, 0] in
       (t2, mW2, bs2, pos2) === ("☃ ", Just 0xD800, mempty, 24)
 t_stream_decode_utf32LE =
-  let (E.StreamDecodeResult t0 mW0 bs0 pos0 f0) = E.decodeUtf32Stream False $ B.pack [104, 0, 0, 0, 105, 0, 0, 0, 0x20, 0] in
+  let (Ty.StreamDecodeResult t0 mW0 bs0 pos0 f0) = decodeUtf32StreamS False $ B.pack [104, 0, 0, 0, 105, 0, 0, 0, 0x20, 0] in
   whenEqProp (t0, mW0, bs0, pos0) ("hi", Nothing, B.pack [0x20, 0], 8) $
-    let (E.StreamDecodeResult t1 mW1 bs1 pos1 f1) = f0 $ B.pack [0, 0, 0x03, 0x26, 0] in
+    let (Ty.StreamDecodeResult t1 mW1 bs1 pos1 f1) = f0 $ B.pack [0, 0, 0x03, 0x26, 0] in
     whenEqProp (t1, mW1, bs1, pos1) (" ", Nothing, B.pack [0x03, 0x26, 0], 12) $
-      let (E.StreamDecodeResult t2 mW2 bs2 pos2 _) = f1 $ B.pack [0, 32, 0, 0, 0, 0, 0xD8, 0, 0] in
+      let (Ty.StreamDecodeResult t2 mW2 bs2 pos2 _) = f1 $ B.pack [0, 32, 0, 0, 0, 0, 0xD8, 0, 0] in
       (t2, mW2, bs2, pos2) === ("☃ ", Just 0xD80000, mempty, 24)
 
 decodeLL :: BL.ByteString -> TL.Text
@@ -383,7 +355,7 @@ t_decode_utf8_lenient = forAllShrinkShow arbitrary shrink (show . BL.toChunks) $
 decodeStream decoder snoc bs =
   g mempty $ decoder bs
   where
-    g t0 (E.StreamDecodeResult t mW bs' pos f) =
+    g t0 (Ty.StreamDecodeResult t mW bs' pos f) =
       let t0' = t0 `mappend` t in
       case mW of
         Just _ -> let mC = E.lenientDecode "" mW in
@@ -393,12 +365,14 @@ decodeStream decoder snoc bs =
           in g t' $ f mempty
         _ -> (t0', bs', pos)
 
+decodeUtf8StreamL = Ty.chunksDecoderToStream EL.decodeUtf8Chunks
+
 -- The decoding of lazy bytestrings should not depend on how they are chunked,
 -- and it should behave the same as decoding of strict bytestrings.
 t_decode_utf8_stream :: Property
 t_decode_utf8_stream = forAllShrinkShow arbitrary shrink (show . BL.toChunks) $ \bs ->
-    decodeStream EL.decodeUtf8Stream TL.snoc bs ===
-      ( let (st, sb, pos) = decodeStream E.decodeUtf8Stream T.snoc . B.concat $ BL.toChunks bs in
+    decodeStream decodeUtf8StreamL TL.snoc bs ===
+      ( let (st, sb, pos) = decodeStream decodeUtf8StreamS T.snoc . B.concat $ BL.toChunks bs in
         (TL.fromStrict st, BL.fromStrict sb, pos)
       )
 
@@ -446,37 +420,27 @@ testTranscoding =
     testProperty "t_utf8" t_utf8,
     testProperty "t_utf8'" t_utf8',
     testProperty "t_utf8_c" t_utf8_c,
-    testProperty "t_utf8_s" t_utf8_s,
     testProperty "t_utf8_incr" t_utf8_incr,
     testProperty "t_utf8_undecoded" t_utf8_undecoded,
     testProperty "tl_utf8" tl_utf8,
     testProperty "tl_utf8'" tl_utf8',
     testProperty "tl_utf8_c" tl_utf8_c,
-    testProperty "tl_utf8_s" tl_utf8_s,
     testProperty "t_utf16LE" t_utf16LE,
     testProperty "t_utf16LE_c" t_utf16LE_c,
-    testProperty "t_utf16LE_s" t_utf16LE_s,
     testProperty "tl_utf16LE" tl_utf16LE,
     testProperty "tl_utf16LE_c" tl_utf16LE_c,
-    testProperty "tl_utf16LE_s" tl_utf16LE_s,
     testProperty "t_utf16BE" t_utf16BE,
     testProperty "t_utf16BE_c" t_utf16BE_c,
-    testProperty "t_utf16BE_s" t_utf16BE_s,
     testProperty "tl_utf16BE" tl_utf16BE,
     testProperty "tl_utf16BE_c" tl_utf16BE_c,
-    testProperty "tl_utf16BE_s" tl_utf16BE_s,
     testProperty "t_utf32LE" t_utf32LE,
     testProperty "t_utf32LE_c" t_utf32LE_c,
-    testProperty "t_utf32LE_s" t_utf32LE_s,
     testProperty "tl_utf32LE" tl_utf32LE,
     testProperty "tl_utf32LE_c" tl_utf32LE_c,
-    testProperty "tl_utf32LE_s" tl_utf32LE_s,
     testProperty "t_utf32BE" t_utf32BE,
     testProperty "t_utf32BE_c" t_utf32BE_c,
-    testProperty "t_utf32BE_s" t_utf32BE_s,
     testProperty "tl_utf32BE" tl_utf32BE,
     testProperty "tl_utf32BE_c" tl_utf32BE_c,
-    testProperty "tl_utf32BE_s" tl_utf32BE_s,
     testGroup "builder" [
       testProperty "t_encodeUtf8Builder" t_encodeUtf8Builder,
       testProperty "t_encodeUtf8Builder_nonZeroOffset" t_encodeUtf8Builder_nonZeroOffset,
@@ -487,7 +451,6 @@ testTranscoding =
     testGroup "errors" [
       testProperty "t_utf8_err" t_utf8_err,
       testProperty "t_utf8_c_err" t_utf8_c_err,
-      testProperty "t_utf8_s_err" t_utf8_s_err,
       testProperty "t_utf8_err'" t_utf8_err'
     ],
     testGroup "error recovery" [

From ff571c092badbbf13249fdbee7c1cbd703f2b8fe Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Tue, 12 Jul 2022 21:58:34 -0600
Subject: [PATCH 18/87] Added missing lazy decodeAsciiE function. Updated
 documentation including the change log.

---
 src/Data/Text/Encoding.hs             | 25 +++++++++++++--
 src/Data/Text/Encoding/Types.hs       | 10 +++++-
 src/Data/Text/Lazy/Encoding.hs        | 46 +++++++++++++++++++++------
 tests/Tests/Properties/Transcoding.hs |  3 ++
 4 files changed, 71 insertions(+), 13 deletions(-)

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index 41752922..ccc5e935 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -352,7 +352,12 @@ queryUtf8DecodeOptimization (B.length -> len1) bs2@(B.length -> len2) srcOff
 
 -- | Decode two 'ByteString's containing UTF-8-encoded text as though
 -- they were one continuous 'ByteString' returning a 'DecodeResult'.
-decodeUtf8Chunks :: ByteString -> ByteString -> DecodeResult Text ByteString Word8
+--
+-- @since 2.0.0.1
+decodeUtf8Chunks
+  :: ByteString -- ^ The first `ByteString` chunk to decode. Typically this is the unencoded data from the previous call of this function.
+  -> ByteString -- ^ The second `ByteString` chunk to decode.
+  -> DecodeResult Text ByteString Word8
 decodeUtf8Chunks = decodeChunksProxy queryUtf8DecodeOptimization $ \ index len srcOff ->
   let decodeFrom off = step (off + 1) . utf8DecodeStart $ index off
 
@@ -370,7 +375,13 @@ noOptimization _ _ _ = Nothing
 
 -- | Decode two 'ByteString's containing UTF-16-encoded text as though
 -- they were one continuous 'ByteString' returning a 'DecodeResult'.
-decodeUtf16Chunks :: Bool -> ByteString -> ByteString -> DecodeResult Text ByteString Word16
+--
+-- @since 2.0.0.1
+decodeUtf16Chunks
+  :: Bool       -- ^ Indicates whether the encoding is big-endian (`True`) or little-endian (`False`)
+  -> ByteString -- ^ The first `ByteString` chunk to decode. Typically this is the unencoded data from the previous call of this function.
+  -> ByteString -- ^ The second `ByteString` chunk to decode.
+  -> DecodeResult Text ByteString Word16
 decodeUtf16Chunks isBE = decodeChunksProxy noOptimization $ \ index len srcOff ->
   -- get next Word8 pair
   let writeAndAdvance c n = WriteAndAdvance c $ const n
@@ -394,7 +405,13 @@ decodeUtf16Chunks isBE = decodeChunksProxy noOptimization $ \ index len srcOff -
 
 -- | Decode two 'ByteString's containing UTF-16-encoded text as though
 -- they were one continuous 'ByteString' returning a 'DecodeResult'.
-decodeUtf32Chunks :: Bool -> ByteString -> ByteString -> DecodeResult Text ByteString Word32
+--
+-- @since 2.0.0.1
+decodeUtf32Chunks
+  :: Bool       -- ^ Indicates whether the encoding is big-endian (`True`) or little-endian (`False`)
+  -> ByteString -- ^ The first `ByteString` chunk to decode. Typically this is the unencoded data from the previous call of this function.
+  -> ByteString -- ^ The second `ByteString` chunk to decode.
+  -> DecodeResult Text ByteString Word32
 decodeUtf32Chunks isBE = decodeChunksProxy noOptimization $ \ index _ srcOff ->
   -- get next Word8 quartet
   case (queryUtf32Bytes . index $ if isBE then srcOff else srcOff + 3)
@@ -408,6 +425,8 @@ decodeUtf32Chunks isBE = decodeChunksProxy noOptimization $ \ index _ srcOff ->
 -- This is a total function: On success the decoded 'Text' is within a
 -- 'Right' value, and an error ('Left' 'Int') indicates the postion of
 -- the offending 'Word8'.
+--
+-- @since 2.0.0.1
 decodeAsciiE :: ByteString -> Either Int Text
 decodeAsciiE bs = withBS bs $ \fp len -> if len == 0 then Right empty else runST $ do
   asciiPrefixLen <- fmap cSizeToInt $ unsafeIOToST $ unsafeWithForeignPtr fp $ \src ->
diff --git a/src/Data/Text/Encoding/Types.hs b/src/Data/Text/Encoding/Types.hs
index 1f806be9..20113b3d 100644
--- a/src/Data/Text/Encoding/Types.hs
+++ b/src/Data/Text/Encoding/Types.hs
@@ -17,6 +17,8 @@ module Data.Text.Encoding.Types
   where
 
 -- | A decoding result on encoded data.
+--
+-- @since 2.0.0.1
 data DecodeResult t b w = DecodeResult
   !t          -- ^ The decoded data up to an incomplete code point at
               -- the end of the input data, an invalid word, or to the
@@ -28,6 +30,8 @@ data DecodeResult t b w = DecodeResult
   deriving (Eq, Ord, Show, Read)
 
 -- | A decoding result on encoded data.
+--
+-- @since 2.0.0.1
 data StreamDecodeResult t b w = StreamDecodeResult
   !t          -- ^ The decoded data up to an incomplete code point at
               -- the end of the input data, an invalid word, or to the
@@ -46,7 +50,11 @@ data StreamDecodeResult t b w = StreamDecodeResult
 -- stream decoder will return a 'StreamDecodeResult' which contains a
 -- continuation function that accepts another section of unencoded
 -- data as a continuation of any remaining unencoded data.
-chunksDecoderToStream :: Monoid b => (b -> b -> DecodeResult t b w) -> b -> StreamDecodeResult t b w
+--
+-- @since 2.0.0.1
+chunksDecoderToStream :: Monoid b
+  => (b -> b -> DecodeResult t b w) -- ^ Chunk decoder function
+  -> b -> StreamDecodeResult t b w
 chunksDecoderToStream chunksDecoder = g 0 mempty
   where
     g pos bs0 bs1 =
diff --git a/src/Data/Text/Lazy/Encoding.hs b/src/Data/Text/Lazy/Encoding.hs
index 1e1360fa..c598280f 100644
--- a/src/Data/Text/Lazy/Encoding.hs
+++ b/src/Data/Text/Lazy/Encoding.hs
@@ -23,6 +23,7 @@ module Data.Text.Lazy.Encoding
     -- ** Total Functions #total#
     -- $total
       decodeLatin1
+    , decodeAsciiE
     , DecodeResult(..)
     , decodeUtf8Chunks
     , decodeUtf16Chunks
@@ -109,6 +110,16 @@ decodeASCII = foldr (chunk . TE.decodeASCII) empty . B.toChunks
 decodeLatin1 :: B.ByteString -> Text
 decodeLatin1 = foldr (chunk . TE.decodeLatin1) empty . B.toChunks
 
+-- | Decode a 'ByteString' containing 7-bit ASCII encoded text.
+--
+-- This is a total function: On success the decoded 'Text' is within a
+-- 'Right' value, and an error ('Left' 'Int') indicates the postion of
+-- the offending 'Word8'.
+--
+-- @since 2.0.0.1
+decodeAsciiE :: B.ByteString -> Either Int Text
+decodeAsciiE = foldr (\ lb -> (chunk <$> TE.decodeAsciiE lb <*>)) (pure empty) . B.toChunks
+
 decodeChunks :: (S.ByteString -> S.ByteString -> DecodeResult T.Text S.ByteString w)
   -> B.ByteString
   -> B.ByteString
@@ -125,19 +136,36 @@ decodeChunks decoder = g id 0 mempty
     g tDiff pos sb0 _ (B.Chunk sb1 lb1) = g tDiff pos sb0 (B.Chunk sb1 lb1) mempty
     g tDiff pos sb0 _ _ = DecodeResult (tDiff Empty) Nothing (B.chunk sb0 mempty) pos
 
--- | Decode two 'ByteString's containing UTF-8-encoded text as though
--- they were one continuous 'ByteString' returning a 'DecodeResult'.
-decodeUtf8Chunks :: B.ByteString -> B.ByteString -> DecodeResult Text B.ByteString Word8
+-- | Decode two 'B.ByteString's containing UTF-8-encoded text as though
+-- they were one continuous 'B.ByteString' returning a 'DecodeResult'.
+--
+-- @since 2.0.0.1
+decodeUtf8Chunks
+  :: B.ByteString -- ^ The first `B.ByteString` chunk to decode. Typically this is the unencoded data from the previous call of this function.
+  -> B.ByteString -- ^ The second `B.ByteString` chunk to decode.
+  -> DecodeResult Text B.ByteString Word8
 decodeUtf8Chunks = decodeChunks TE.decodeUtf8Chunks
 
--- | Decode two 'ByteString's containing UTF-16-encoded text as though
--- they were one continuous 'ByteString' returning a 'DecodeResult'.
-decodeUtf16Chunks :: Bool -> B.ByteString -> B.ByteString -> DecodeResult Text B.ByteString Word16
+-- | Decode two 'B.ByteString's containing UTF-16-encoded text as though
+-- they were one continuous 'B.ByteString' returning a 'DecodeResult'.
+--
+-- @since 2.0.0.1
+decodeUtf16Chunks
+  :: Bool         -- ^ Indicates whether the encoding is big-endian (`True`) or little-endian (`False`)
+  -> B.ByteString -- ^ The first `B.ByteString` chunk to decode. Typically this is the unencoded data from the previous call of this function.
+  -> B.ByteString -- ^ The second `B.ByteString` chunk to decode.
+  -> DecodeResult Text B.ByteString Word16
 decodeUtf16Chunks = decodeChunks . TE.decodeUtf16Chunks
 
--- | Decode two 'ByteString's containing UTF-32-encoded text as though
--- they were one continuous 'ByteString' returning a 'DecodeResult'.
-decodeUtf32Chunks :: Bool -> B.ByteString -> B.ByteString -> DecodeResult Text B.ByteString Word32
+-- | Decode two 'B.ByteString's containing UTF-32-encoded text as though
+-- they were one continuous 'B.ByteString' returning a 'DecodeResult'.
+--
+-- @since 2.0.0.1
+decodeUtf32Chunks
+  :: Bool         -- ^ Indicates whether the encoding is big-endian (`True`) or little-endian (`False`)
+  -> B.ByteString -- ^ The first `B.ByteString` chunk to decode. Typically this is the unencoded data from the previous call of this function.
+  -> B.ByteString -- ^ The second `B.ByteString` chunk to decode.
+  -> DecodeResult Text B.ByteString Word32
 decodeUtf32Chunks = decodeChunks . TE.decodeUtf32Chunks
 
 -- | Decode a 'ByteString' containing UTF-8 encoded text.
diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs
index bc1c36e4..c71fef67 100644
--- a/tests/Tests/Properties/Transcoding.hs
+++ b/tests/Tests/Properties/Transcoding.hs
@@ -30,6 +30,8 @@ import qualified Data.Text.Lazy.Encoding as EL
 
 t_asciiE t    = E.decodeAsciiE (E.encodeUtf8 a) === Right a
     where a  = T.map (\c -> chr (ord c `mod` 128)) t
+tl_asciiE t    = EL.decodeAsciiE (EL.encodeUtf8 a) === Right a
+    where a  = TL.map (\c -> chr (ord c `mod` 128)) t
 
 t_ascii t    = E.decodeASCII (E.encodeUtf8 a) === a
     where a  = T.map (\c -> chr (ord c `mod` 128)) t
@@ -413,6 +415,7 @@ testTranscoding :: TestTree
 testTranscoding =
   testGroup "transcoding" [
     testProperty "t_asciiE" t_asciiE,
+    testProperty "tl_asciiE" tl_asciiE,
     testProperty "t_ascii" t_ascii,
     testProperty "tl_ascii" tl_ascii,
     testProperty "t_latin1" t_latin1,

From 7d0757afaec4ec694e877687b48a6e95b6c1e86b Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Sat, 16 Jul 2022 17:55:27 -0600
Subject: [PATCH 19/87] created module Data.Text.Encoding.Common and moved the
 contents Types and Error to it. Redesigned chunksDecoderToStream

---
 benchmarks/haskell/Benchmarks/Stream.hs       |   2 +-
 src/Data/Text.hs                              |  18 +-
 src/Data/Text/Encoding.hs                     |   3 +-
 src/Data/Text/Encoding/Common.hs              | 177 ++++++++++++++++++
 src/Data/Text/Encoding/Error.hs               |  97 ++--------
 src/Data/Text/Encoding/Types.hs               |  64 -------
 src/Data/Text/Internal/Encoding/Fusion.hs     |   2 +-
 .../Text/Internal/Lazy/Encoding/Fusion.hs     |   2 +-
 src/Data/Text/Lazy.hs                         |  18 +-
 src/Data/Text/Lazy/Encoding.hs                |   3 +-
 tests/Tests/Properties/Transcoding.hs         | 128 +++++--------
 tests/Tests/QuickCheckUtils.hs                |   2 +-
 tests/Tests/Regressions.hs                    |   2 +-
 text.cabal                                    |   3 +-
 14 files changed, 249 insertions(+), 272 deletions(-)
 create mode 100644 src/Data/Text/Encoding/Common.hs
 delete mode 100644 src/Data/Text/Encoding/Types.hs

diff --git a/benchmarks/haskell/Benchmarks/Stream.hs b/benchmarks/haskell/Benchmarks/Stream.hs
index 1d53c498..539b709c 100644
--- a/benchmarks/haskell/Benchmarks/Stream.hs
+++ b/benchmarks/haskell/Benchmarks/Stream.hs
@@ -21,7 +21,7 @@ import qualified Data.Text.Lazy as TL
 import qualified Data.ByteString.Lazy as BL
 import Data.Text.Internal.Fusion.Types (Step (..), Stream (..))
 import qualified Data.Text.Encoding as T
-import qualified Data.Text.Encoding.Error as E
+import qualified Data.Text.Encoding.Common as E
 import qualified Data.Text.Internal.Encoding.Fusion as T
 import qualified Data.Text.Internal.Encoding.Fusion.Common as F
 import qualified Data.Text.Internal.Fusion as T
diff --git a/src/Data/Text.hs b/src/Data/Text.hs
index 5176a0ab..c1eda582 100644
--- a/src/Data/Text.hs
+++ b/src/Data/Text.hs
@@ -355,21 +355,10 @@ instance Monoid Text where
     mappend = (<>)
     mconcat = concat
 
--- | Performs replacement on invalid scalar values:
---
--- >>> :set -XOverloadedStrings
--- >>> "\55555" :: Text
--- "\65533"
 instance IsString Text where
     fromString = pack
 
--- | Performs replacement on invalid scalar values:
---
--- >>> :set -XOverloadedLists
--- >>> ['\55555'] :: Text
--- "\65533"
---
--- @since 1.2.0.0
+-- | @since 1.2.0.0
 instance Exts.IsList Text where
     type Item Text = Char
     fromList       = pack
@@ -456,10 +445,7 @@ compareText (Text arrA offA lenA) (Text arrB offB lenB) =
 -- * Conversion to/from 'Text'
 
 -- | /O(n)/ Convert a 'String' into a 'Text'.
--- Performs replacement on invalid scalar values, so @'unpack' . 'pack'@ is not 'id':
---
--- >>> unpack (pack "\55555")
--- "\65533"
+-- Performs replacement on invalid scalar values.
 pack :: String -> Text
 pack = unstream . S.map safe . S.streamList
 {-# INLINE [1] pack #-}
diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index ccc5e935..b8e9ffb1 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -82,8 +82,7 @@ import Data.ByteString (ByteString)
 import qualified Data.ByteString as B
 import qualified Data.ByteString.Internal as B
 import qualified Data.ByteString.Short.Internal as SBS
-import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode)
-import Data.Text.Encoding.Types (DecodeResult(..))
+import Data.Text.Encoding.Common (DecodeResult(..), OnDecodeError, UnicodeException, strictDecode, lenientDecode)
 import Data.Text.Internal (Text(..), empty, append)
 import Data.Text.Internal.Unsafe (unsafeWithForeignPtr)
 import Data.Text.Internal.Unsafe.Char (unsafeWrite)
diff --git a/src/Data/Text/Encoding/Common.hs b/src/Data/Text/Encoding/Common.hs
new file mode 100644
index 00000000..3212e315
--- /dev/null
+++ b/src/Data/Text/Encoding/Common.hs
@@ -0,0 +1,177 @@
+{-# LANGUAGE CPP, DeriveDataTypeable #-}
+{-# LANGUAGE Safe #-}
+-- |
+-- Module      : Data.Text.Encoding.Common
+-- Copyright   : (c) Bryan O'Sullivan 2009
+--
+-- License     : BSD-style
+-- Maintainer  : bos@serpentine.com
+-- Portability : GHC
+--
+-- Common functions and types for both lazy and strict encoding and
+-- decoding including error handling.
+--
+-- The standard functions for encoding and decoding text are strict,
+-- which is to say that they throw exceptions on invalid input.  This
+-- is often unhelpful on real world input, so alternative functions
+-- exist that accept custom handlers for dealing with invalid inputs.
+-- These 'OnError' handlers are normal Haskell functions.  You can use
+-- one of the presupplied functions in this module, or you can write a
+-- custom handler of your own.
+
+module Data.Text.Encoding.Common
+  (
+  -- * Full-service result types and functions
+    DecodeResult(..)
+  , DecodeResultHandler
+  , chunksDecoderToStream
+  -- * Error handling types
+  , UnicodeException(..)
+  , OnError
+  , OnDecodeError
+  , OnEncodeError
+  -- * Useful error handling functions
+  , lenientDecode
+  , strictDecode
+  , strictEncode
+  , ignore
+  , replace
+  )
+  where
+
+import Control.DeepSeq (NFData (..))
+import Control.Exception (Exception, throw)
+import Data.Typeable (Typeable)
+import Data.Word (Word8)
+import Numeric (showHex)
+
+-- | A decoding result on encoded data.
+--
+-- @since 2.0.0.1
+data DecodeResult t b w = DecodeResult
+  !t          -- ^ The decoded data up to an incomplete code point at
+              -- the end of the input data, an invalid word, or to the
+              -- end of the input.
+  !(Maybe w)  -- ^ If an invalid code point was encountered.
+  !b          -- ^ The remaining undecoded data. If an invald code
+              -- point was encountered, this is after that code point.
+  !Int        -- ^ Byte position of remaining undecoded data.
+  deriving (Eq, Ord, Show, Read)
+
+-- | The funtion type for handling successful decodings.
+--
+-- @since 2.0.0.1
+type DecodeResultHandler t w b b' r
+  = t           -- ^ The decoded data up to an incomplete code point at
+                -- the end of the input data, an invalid word, or to
+                -- the end of the input.
+  -> Maybe w    -- ^ If an invalid code point was encountered.
+  -> b          -- ^ The remaining undecoded data. If an invald code
+                -- point was encountered, this is after that code
+                -- point.
+  -> Int        -- ^ Byte position of remaining undecoded data. This
+                -- is treated as if all the data fed to previous
+                -- invocations of the continations were one continuous
+                -- feed.
+  -> (b' -> r)  -- ^ Continuation to accept the next span of data to be
+                -- decoded with the remaining unencoded data.
+  -> r          -- ^ Result of the continuation.
+
+-- | Create a stream decoder from a chunks decoder. The resulting
+-- stream decoder accepts a 'DecodeResultHandler' to process the decode
+-- result. The continuations accept another section of unencoded data
+-- as a continuation of any remaining unencoded data.
+--
+-- @since 2.0.0.1
+chunksDecoderToStream :: Monoid b
+  => (b -> b' -> DecodeResult t b w)  -- ^ Chunks decoder
+  -> b'                               -- ^ Encoded data
+  -> DecodeResultHandler t w b b' r   -- ^ Result continuation
+  -> r                                -- ^ Continuation result
+chunksDecoderToStream chunksDecoder bs f =
+  g mempty bs 0
+  where
+    g bs0 bs1 pos0 =
+      let DecodeResult t mW bs1' pos1 = chunksDecoder bs0 bs1
+          pos = pos0 + pos1
+      in
+      f t mW bs1' pos $ \ bs2 -> g bs1' bs2 pos
+
+-- | Function type for handling a coding error.  It is supplied with
+-- two inputs:
+--
+-- * A 'String' that describes the error.
+--
+-- * The input value that caused the error.  If the error arose
+--   because the end of input was reached or could not be identified
+--   precisely, this value will be 'Nothing'.
+--
+-- If the handler returns a value wrapped with 'Just', that value will
+-- be used in the output as the replacement for the invalid input.  If
+-- it returns 'Nothing', no value will be used in the output.
+--
+-- Should the handler need to abort processing, it should use 'error'
+-- or 'throw' an exception (preferably a 'UnicodeException').  It may
+-- use the description provided to construct a more helpful error
+-- report.
+type OnError a b = String -> Maybe a -> Maybe b
+
+-- | A handler for a decoding error.
+type OnDecodeError = OnError Word8 Char
+
+-- | A handler for an encoding error.
+{-# DEPRECATED OnEncodeError "This exception is never used in practice, and will be removed." #-}
+type OnEncodeError = OnError Char Word8
+
+-- | An exception type for representing Unicode encoding errors.
+data UnicodeException =
+    DecodeError String (Maybe Word8)
+    -- ^ Could not decode a byte sequence because it was invalid under
+    -- the given encoding, or ran out of input in mid-decode.
+  | EncodeError String (Maybe Char)
+    -- ^ Tried to encode a character that could not be represented
+    -- under the given encoding, or ran out of input in mid-encode.
+    deriving (Eq, Typeable)
+
+{-# DEPRECATED EncodeError "This constructor is never used, and will be removed." #-}
+
+showUnicodeException :: UnicodeException -> String
+showUnicodeException (DecodeError desc (Just w))
+    = "Cannot decode byte '\\x" ++ showHex w ("': " ++ desc)
+showUnicodeException (DecodeError desc Nothing)
+    = "Cannot decode input: " ++ desc
+showUnicodeException (EncodeError desc (Just c))
+    = "Cannot encode character '\\x" ++ showHex (fromEnum c) ("': " ++ desc)
+showUnicodeException (EncodeError desc Nothing)
+    = "Cannot encode input: " ++ desc
+
+instance Show UnicodeException where
+    show = showUnicodeException
+
+instance Exception UnicodeException
+
+instance NFData UnicodeException where
+    rnf (DecodeError desc w) = rnf desc `seq` rnf w `seq` ()
+    rnf (EncodeError desc c) = rnf desc `seq` rnf c `seq` ()
+
+-- | Throw a 'UnicodeException' if decoding fails.
+strictDecode :: OnDecodeError
+strictDecode desc c = throw (DecodeError desc c)
+
+-- | Replace an invalid input byte with the Unicode replacement
+-- character U+FFFD.
+lenientDecode :: OnDecodeError
+lenientDecode _ _ = Just '\xfffd'
+
+-- | Throw a 'UnicodeException' if encoding fails.
+{-# DEPRECATED strictEncode "This function always throws an exception, and will be removed." #-}
+strictEncode :: OnEncodeError
+strictEncode desc c = throw (EncodeError desc c)
+
+-- | Ignore an invalid input, substituting nothing in the output.
+ignore :: OnError a b
+ignore _ _ = Nothing
+
+-- | Replace an invalid input with a valid output.
+replace :: b -> OnError a b
+replace c _ _ = Just c
diff --git a/src/Data/Text/Encoding/Error.hs b/src/Data/Text/Encoding/Error.hs
index ea9e0997..569fafcf 100644
--- a/src/Data/Text/Encoding/Error.hs
+++ b/src/Data/Text/Encoding/Error.hs
@@ -1,4 +1,3 @@
-{-# LANGUAGE CPP, DeriveDataTypeable #-}
 {-# LANGUAGE Safe #-}
 -- |
 -- Module      : Data.Text.Encoding.Error
@@ -34,87 +33,15 @@ module Data.Text.Encoding.Error
     , replace
     ) where
 
-import Control.DeepSeq (NFData (..))
-import Control.Exception (Exception, throw)
-import Data.Typeable (Typeable)
-import Data.Word (Word8)
-import Numeric (showHex)
-
--- | Function type for handling a coding error.  It is supplied with
--- two inputs:
---
--- * A 'String' that describes the error.
---
--- * The input value that caused the error.  If the error arose
---   because the end of input was reached or could not be identified
---   precisely, this value will be 'Nothing'.
---
--- If the handler returns a value wrapped with 'Just', that value will
--- be used in the output as the replacement for the invalid input.  If
--- it returns 'Nothing', no value will be used in the output.
---
--- Should the handler need to abort processing, it should use 'error'
--- or 'throw' an exception (preferably a 'UnicodeException').  It may
--- use the description provided to construct a more helpful error
--- report.
-type OnError a b = String -> Maybe a -> Maybe b
-
--- | A handler for a decoding error.
-type OnDecodeError = OnError Word8 Char
-
--- | A handler for an encoding error.
-{-# DEPRECATED OnEncodeError "This exception is never used in practice, and will be removed." #-}
-type OnEncodeError = OnError Char Word8
-
--- | An exception type for representing Unicode encoding errors.
-data UnicodeException =
-    DecodeError String (Maybe Word8)
-    -- ^ Could not decode a byte sequence because it was invalid under
-    -- the given encoding, or ran out of input in mid-decode.
-  | EncodeError String (Maybe Char)
-    -- ^ Tried to encode a character that could not be represented
-    -- under the given encoding, or ran out of input in mid-encode.
-    deriving (Eq, Typeable)
-
-{-# DEPRECATED EncodeError "This constructor is never used, and will be removed." #-}
-
-showUnicodeException :: UnicodeException -> String
-showUnicodeException (DecodeError desc (Just w))
-    = "Cannot decode byte '\\x" ++ showHex w ("': " ++ desc)
-showUnicodeException (DecodeError desc Nothing)
-    = "Cannot decode input: " ++ desc
-showUnicodeException (EncodeError desc (Just c))
-    = "Cannot encode character '\\x" ++ showHex (fromEnum c) ("': " ++ desc)
-showUnicodeException (EncodeError desc Nothing)
-    = "Cannot encode input: " ++ desc
-
-instance Show UnicodeException where
-    show = showUnicodeException
-
-instance Exception UnicodeException
-
-instance NFData UnicodeException where
-    rnf (DecodeError desc w) = rnf desc `seq` rnf w `seq` ()
-    rnf (EncodeError desc c) = rnf desc `seq` rnf c `seq` ()
-
--- | Throw a 'UnicodeException' if decoding fails.
-strictDecode :: OnDecodeError
-strictDecode desc c = throw (DecodeError desc c)
-
--- | Replace an invalid input byte with the Unicode replacement
--- character U+FFFD.
-lenientDecode :: OnDecodeError
-lenientDecode _ _ = Just '\xfffd'
-
--- | Throw a 'UnicodeException' if encoding fails.
-{-# DEPRECATED strictEncode "This function always throws an exception, and will be removed." #-}
-strictEncode :: OnEncodeError
-strictEncode desc c = throw (EncodeError desc c)
-
--- | Ignore an invalid input, substituting nothing in the output.
-ignore :: OnError a b
-ignore _ _ = Nothing
-
--- | Replace an invalid input with a valid output.
-replace :: b -> OnError a b
-replace c _ _ = Just c
+import Data.Text.Encoding.Common
+  ( UnicodeException(..)
+  , OnError
+  , OnDecodeError
+  , OnEncodeError
+  -- * Useful error handling functions
+  , lenientDecode
+  , strictDecode
+  , strictEncode
+  , ignore
+  , replace
+  )
diff --git a/src/Data/Text/Encoding/Types.hs b/src/Data/Text/Encoding/Types.hs
deleted file mode 100644
index 20113b3d..00000000
--- a/src/Data/Text/Encoding/Types.hs
+++ /dev/null
@@ -1,64 +0,0 @@
-{-# LANGUAGE Safe #-}
--- |
--- Module      : Data.Text.Encoding.Types
--- Copyright   : (c) Bryan O'Sullivan 2009
---
--- License     : BSD-style
--- Maintainer  : bos@serpentine.com
--- Portability : GHC
---
--- Types to indicate the result of an attempt to decode data.
-
-module Data.Text.Encoding.Types
-  ( DecodeResult(..)
-  , StreamDecodeResult(..)
-  , chunksDecoderToStream
-  )
-  where
-
--- | A decoding result on encoded data.
---
--- @since 2.0.0.1
-data DecodeResult t b w = DecodeResult
-  !t          -- ^ The decoded data up to an incomplete code point at
-              -- the end of the input data, an invalid word, or to the
-              -- end of the input.
-  !(Maybe w)  -- ^ If an invalid code point was encountered.
-  !b          -- ^ The remaining undecoded data. If an invald code
-              -- point was encountered, this is after that code point.
-  !Int        -- ^ Byte position of remaining undecoded data.
-  deriving (Eq, Ord, Show, Read)
-
--- | A decoding result on encoded data.
---
--- @since 2.0.0.1
-data StreamDecodeResult t b w = StreamDecodeResult
-  !t          -- ^ The decoded data up to an incomplete code point at
-              -- the end of the input data, an invalid word, or to the
-              -- end of the input.
-  !(Maybe w)  -- ^ If an invalid code point was encountered.
-  !b          -- ^ The remaining undecoded data. If an invald code
-              -- point was encountered, this is after that code point.
-  !Int        -- ^ Byte position of remaining undecoded data. This is
-              -- treated as if all the data fed to previous invocations
-              -- of the continations where one continuous feed.
-  (b -> StreamDecodeResult t b w) -- ^ Continuation to accept the next
-                                  -- span of data to be decoded with
-                                  -- the remaining unencoded data.
-
--- | Create a stream decoder from a chunks decoder. The resulting
--- stream decoder will return a 'StreamDecodeResult' which contains a
--- continuation function that accepts another section of unencoded
--- data as a continuation of any remaining unencoded data.
---
--- @since 2.0.0.1
-chunksDecoderToStream :: Monoid b
-  => (b -> b -> DecodeResult t b w) -- ^ Chunk decoder function
-  -> b -> StreamDecodeResult t b w
-chunksDecoderToStream chunksDecoder = g 0 mempty
-  where
-    g pos bs0 bs1 =
-      let DecodeResult t mW bs1' pos1 = chunksDecoder bs0 bs1
-          pos' = pos + pos1
-      in
-      StreamDecodeResult t mW bs1' pos' $ \ bs2 -> g pos' bs1' bs2
diff --git a/src/Data/Text/Internal/Encoding/Fusion.hs b/src/Data/Text/Internal/Encoding/Fusion.hs
index aa8f0d02..aa3019ab 100644
--- a/src/Data/Text/Internal/Encoding/Fusion.hs
+++ b/src/Data/Text/Internal/Encoding/Fusion.hs
@@ -41,7 +41,7 @@ import Data.Bits (shiftL, shiftR)
 import Data.ByteString.Internal (ByteString(..), mallocByteString, memcpy)
 import Data.Text.Internal.Fusion (Step(..), Stream(..))
 import Data.Text.Internal.Fusion.Size
-import Data.Text.Encoding.Error
+import Data.Text.Encoding.Common
 import Data.Text.Internal.Encoding.Fusion.Common
 import Data.Text.Internal.Unsafe.Char (unsafeChr8, unsafeChr16, unsafeChr32)
 import Data.Text.Internal.Unsafe (unsafeWithForeignPtr)
diff --git a/src/Data/Text/Internal/Lazy/Encoding/Fusion.hs b/src/Data/Text/Internal/Lazy/Encoding/Fusion.hs
index 69149779..5ebad0ad 100644
--- a/src/Data/Text/Internal/Lazy/Encoding/Fusion.hs
+++ b/src/Data/Text/Internal/Lazy/Encoding/Fusion.hs
@@ -38,7 +38,7 @@ import qualified Data.ByteString as B
 import qualified Data.ByteString.Unsafe as B
 import Data.Text.Internal.ByteStringCompat
 import Data.Text.Internal.Encoding.Fusion.Common
-import Data.Text.Encoding.Error
+import Data.Text.Encoding.Common
 import Data.Text.Internal.Fusion (Step(..), Stream(..))
 import Data.Text.Internal.Fusion.Size
 import Data.Text.Internal.Unsafe.Char (unsafeChr8, unsafeChr16, unsafeChr32)
diff --git a/src/Data/Text/Lazy.hs b/src/Data/Text/Lazy.hs
index 39834a8b..9e2638eb 100644
--- a/src/Data/Text/Lazy.hs
+++ b/src/Data/Text/Lazy.hs
@@ -316,21 +316,10 @@ instance Monoid Text where
     mappend = (<>)
     mconcat = concat
 
--- | Performs replacement on invalid scalar values:
---
--- >>> :set -XOverloadedStrings
--- >>> "\55555" :: Data.Text.Lazy.Text
--- "\65533"
 instance IsString Text where
     fromString = pack
 
--- | Performs replacement on invalid scalar values:
---
--- >>> :set -XOverloadedLists
--- >>> ['\55555'] :: Data.Text.Lazy.Text
--- "\65533"
---
--- @since 1.2.0.0
+-- | @since 1.2.0.0
 instance Exts.IsList Text where
     type Item Text = Char
     fromList       = pack
@@ -383,10 +372,7 @@ textDataType = mkDataType "Data.Text.Lazy.Text" [packConstr]
 
 -- | /O(n)/ Convert a 'String' into a 'Text'.
 --
--- Performs replacement on invalid scalar values, so @'unpack' . 'pack'@ is not 'id':
---
--- >>> Data.Text.Lazy.unpack (Data.Text.Lazy.pack "\55555")
--- "\65533"
+-- Performs replacement on invalid scalar values.
 pack ::
 #if defined(ASSERTS)
   HasCallStack =>
diff --git a/src/Data/Text/Lazy/Encoding.hs b/src/Data/Text/Lazy/Encoding.hs
index c598280f..25f655c2 100644
--- a/src/Data/Text/Lazy/Encoding.hs
+++ b/src/Data/Text/Lazy/Encoding.hs
@@ -62,8 +62,7 @@ module Data.Text.Lazy.Encoding
 
 import Control.Exception (evaluate, try)
 import Data.Monoid (Monoid(..))
-import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
-import Data.Text.Encoding.Types (DecodeResult(..))
+import Data.Text.Encoding.Common (DecodeResult(..), OnDecodeError, UnicodeException, strictDecode)
 import Data.Text.Internal.Lazy (Text(..), chunk, empty, foldrChunks)
 import Data.Word (Word16, Word32, Word8)
 import qualified Data.ByteString as S
diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs
index c71fef67..9b36bfde 100644
--- a/tests/Tests/Properties/Transcoding.hs
+++ b/tests/Tests/Properties/Transcoding.hs
@@ -1,13 +1,17 @@
 -- | Tests for encoding and decoding
 
-{-# LANGUAGE OverloadedStrings, ScopedTypeVariables #-}
+{-# LANGUAGE OverloadedStrings, ScopedTypeVariables, FlexibleContexts #-}
 {-# OPTIONS_GHC -fno-warn-missing-signatures #-}
 module Tests.Properties.Transcoding
     ( testTranscoding
     ) where
 
+import Control.Monad.Writer (runWriterT, tell)
+import Data.Functor.Identity (runIdentity)
+-- import Data.Monoid (Endo(appEndo))
 import Data.Bits ((.&.), shiftR)
 import Data.Char (chr, ord)
+import Data.Maybe (fromMaybe)
 import Test.QuickCheck hiding ((.&.))
 import Test.Tasty (TestTree, testGroup)
 import Test.Tasty.QuickCheck (testProperty)
@@ -23,8 +27,7 @@ import qualified Data.ByteString.Lazy as BL
 import qualified Data.ByteString.Lazy.Char8 as BLC
 import qualified Data.Text as T
 import qualified Data.Text.Encoding as E
-import qualified Data.Text.Encoding.Error as E
-import qualified Data.Text.Encoding.Types as Ty
+import qualified Data.Text.Encoding.Common as C
 import qualified Data.Text.Lazy as TL
 import qualified Data.Text.Lazy.Encoding as EL
 
@@ -288,65 +291,11 @@ t_chunk_decode_utf32LE =
       in
       decodeResult2 === E.DecodeResult "☃ " (Just 0xD80000) expBs2 12
 
-decodeUtf8StreamS = Ty.chunksDecoderToStream E.decodeUtf8Chunks
-
-t_stream_decode_utf8_1 =
-  let (Ty.StreamDecodeResult t0 mW0 bs0 pos0 _) = decodeUtf8StreamS $ B.pack [0x68, 0x69, 0x2C, 0x20, 0xe2, 0x98, 0x83, 0x21] in
-  (t0, mW0, bs0, pos0) === ("hi, ☃!", Nothing, mempty, 8)
-t_stream_decode_utf8_2 =
-  let (Ty.StreamDecodeResult t0 mW0 bs0 pos0 f0) = decodeUtf8StreamS $ B.pack [97, 0xC2, 97] in
-  whenEqProp (t0, mW0, bs0, pos0) (T.singleton 'a', Just 0xC2, B.singleton 97, 2) $
-    let (Ty.StreamDecodeResult t1 mW1 bs1 pos1 _) = f0 mempty in
-    (t1, mW1, bs1, pos1) === (T.singleton 'a', Nothing, mempty, 3)
-t_stream_decode_utf8_3 =
-  let (Ty.StreamDecodeResult t0 mW0 bs0 pos0 f0) = decodeUtf8StreamS $ B.pack [104, 105, 32, 0xe2] in
-  whenEqProp (t0, mW0, bs0, pos0) ("hi ", Nothing, B.singleton 0xe2, 3) $
-    let (Ty.StreamDecodeResult t1 mW1 bs1 pos1 f1) = f0 $ B.singleton 0x98 in
-    whenEqProp (t1, mW1, bs1, pos1) ("", Nothing, B.pack [0xe2, 0x98], 3) $
-      let (Ty.StreamDecodeResult t2 mW2 bs2 pos2 _) = f1 $ B.pack [0x83, 32, 0xFF] in
-      (t2, mW2, bs2, pos2) === ("☃ ", Just 0xFF, mempty, 8)
-
-decodeUtf16StreamS = Ty.chunksDecoderToStream . E.decodeUtf16Chunks
-
-t_stream_decode_utf16BE =
-  let expectedBs0 = B.pack [0]
-      (Ty.StreamDecodeResult t0 mW0 bs0 pos0 f0) = decodeUtf16StreamS True expectedBs0 in
-  whenEqProp (t0, mW0, bs0, pos0) (T.empty, Nothing, expectedBs0, 0) $
-    let (Ty.StreamDecodeResult t1 mW1 bs1 pos1 f1) = f0 $ B.pack [104, 0, 105, 0, 32, 0xD8, 0x01] in
-    whenEqProp (t1, mW1, bs1, pos1) ("hi ", Nothing, B.pack [0xD8, 0x01], 6) $
-      let (Ty.StreamDecodeResult t2 mW2 bs2 pos2 _) = f1 $ B.pack [0xDC, 0x37, 0, 32, 0xDC, 0] in
-      (t2, mW2, bs2, pos2) === ("\x10437 ", Just 0xDC00, mempty, 14)
-t_stream_decode_utf16LE =
-  let expectedBs0 = B.pack [104]
-      (Ty.StreamDecodeResult t0 mW0 bs0 pos0 f0) = decodeUtf16StreamS False expectedBs0 in
-  whenEqProp (t0, mW0, bs0, pos0) (T.empty, Nothing, expectedBs0, 0) $
-    let (Ty.StreamDecodeResult t1 mW1 bs1 pos1 f1) = f0 $ B.pack [0, 105, 0, 32, 0, 0x01, 0xD8] in
-    whenEqProp (t1, mW1, bs1, pos1) ("hi ", Nothing, B.pack [0x01, 0xD8], 6) $
-      let (Ty.StreamDecodeResult t2 mW2 bs2 pos2 _) = f1 $ B.pack [0x37, 0xDC, 32, 0, 0, 0xDC] in
-      (t2, mW2, bs2, pos2) === ("\x10437 ", Just 0xDC, mempty, 14)
-
-decodeUtf32StreamS = Ty.chunksDecoderToStream . E.decodeUtf32Chunks
-
-t_stream_decode_utf32BE =
-  let (Ty.StreamDecodeResult t0 mW0 bs0 pos0 f0) = decodeUtf32StreamS True $ B.pack [0, 0, 0, 104, 0, 0, 0, 105, 0, 0] in
-  whenEqProp (t0, mW0, bs0, pos0) ("hi", Nothing, B.pack [0, 0], 8) $
-    let (Ty.StreamDecodeResult t1 mW1 bs1 pos1 f1) = f0 $ B.pack [0, 32, 0, 0, 0x26] in
-    whenEqProp (t1, mW1, bs1, pos1) (" ", Nothing, B.pack [0, 0, 0x26], 12) $
-      let (Ty.StreamDecodeResult t2 mW2 bs2 pos2 _) = f1 $ B.pack [0x03, 0, 0, 0, 32, 0, 0, 0xD8, 0] in
-      (t2, mW2, bs2, pos2) === ("☃ ", Just 0xD800, mempty, 24)
-t_stream_decode_utf32LE =
-  let (Ty.StreamDecodeResult t0 mW0 bs0 pos0 f0) = decodeUtf32StreamS False $ B.pack [104, 0, 0, 0, 105, 0, 0, 0, 0x20, 0] in
-  whenEqProp (t0, mW0, bs0, pos0) ("hi", Nothing, B.pack [0x20, 0], 8) $
-    let (Ty.StreamDecodeResult t1 mW1 bs1 pos1 f1) = f0 $ B.pack [0, 0, 0x03, 0x26, 0] in
-    whenEqProp (t1, mW1, bs1, pos1) (" ", Nothing, B.pack [0x03, 0x26, 0], 12) $
-      let (Ty.StreamDecodeResult t2 mW2 bs2 pos2 _) = f1 $ B.pack [0, 32, 0, 0, 0, 0, 0xD8, 0, 0] in
-      (t2, mW2, bs2, pos2) === ("☃ ", Just 0xD80000, mempty, 24)
-
 decodeLL :: BL.ByteString -> TL.Text
-decodeLL = EL.decodeUtf8With E.lenientDecode
+decodeLL = EL.decodeUtf8With C.lenientDecode
 
 decodeL :: B.ByteString -> T.Text
-decodeL = E.decodeUtf8With E.lenientDecode
+decodeL = E.decodeUtf8With C.lenientDecode
 
 -- The lenient decoding of lazy bytestrings should not depend on how they are chunked,
 -- and it should behave the same as decoding of strict bytestrings.
@@ -354,20 +303,42 @@ t_decode_utf8_lenient :: Property
 t_decode_utf8_lenient = forAllShrinkShow arbitrary shrink (show . BL.toChunks) $ \bs ->
     decodeLL bs === (TL.fromStrict . decodeL . B.concat . BL.toChunks) bs
 
-decodeStream decoder snoc bs =
-  g mempty $ decoder bs
-  where
-    g t0 (Ty.StreamDecodeResult t mW bs' pos f) =
-      let t0' = t0 `mappend` t in
-      case mW of
-        Just _ -> let mC = E.lenientDecode "" mW in
-          let t' = case mC of
-                Just c -> t0' `snoc` c
-                _ -> t0'
-          in g t' $ f mempty
-        _ -> (t0', bs', pos)
-
-decodeUtf8StreamL = Ty.chunksDecoderToStream EL.decodeUtf8Chunks
+decodeStream decoder snoc bs = runIdentity $ do
+  ((b, p), tDiff) <- runWriterT $ decoder bs $ \ t mW b p f ->
+    case mW of
+      Just w ->
+        let t' = (case C.lenientDecode "" $ Just w of
+              Just c -> t `snoc` c
+              _ -> t)
+        in do
+          tell (t' <>)
+          f mempty
+      _ -> do
+      tell (t <>)
+      pure (b, p)
+  pure (tDiff mempty, b, p)
+
+decodeUtf8StreamL = C.chunksDecoderToStream EL.decodeUtf8Chunks
+
+decodeUtf8StreamS = C.chunksDecoderToStream E.decodeUtf8Chunks
+
+-- The results of the stream decoder on a strict bytestring should
+-- match the results of a non-stream decoder on a lazy bytestring.
+t_decode_utf8_lenient_stream :: Property
+t_decode_utf8_lenient_stream = forAllShrinkShow arbitrary shrink (show . BL.toChunks) $ \bs ->
+  let (st, sb, _) = decodeStream decodeUtf8StreamS T.snoc . B.concat $ BL.toChunks bs
+      st' = B.foldr (\ w st'' -> st'' `T.snoc` (fromMaybe 'a' . C.lenientDecode "" $ Just w)) st sb
+  in
+  decodeLL bs === TL.fromStrict st'
+
+-- The results of the non-stream decoder on a strict bytestring should
+-- match the results of a stream decoder on a lazy bytestring.
+t_decode_utf8_stream_lenient :: Property
+t_decode_utf8_stream_lenient = forAllShrinkShow arbitrary shrink (show . BL.toChunks) $ \bs ->
+  let (lt, lb, _) = decodeStream decodeUtf8StreamL TL.snoc bs
+      lt' = BL.foldr (\ w lt'' -> lt'' `TL.snoc` (fromMaybe 'a' . C.lenientDecode "" $ Just w)) lt lb
+  in
+  lt' === (TL.fromStrict . decodeL . B.concat . BL.toChunks) bs
 
 -- The decoding of lazy bytestrings should not depend on how they are chunked,
 -- and it should behave the same as decoding of strict bytestrings.
@@ -403,7 +374,7 @@ t_decode_with_error4' =
 t_decode_with_error5' = ioProperty $ do
   ret <- Exception.try $ Exception.evaluate $ E.streamDecodeUtf8 (B.pack [0x81])
   pure $ case ret of
-    Left (_ :: E.UnicodeException) -> True
+    Left (_ :: C.UnicodeException) -> True
     Right{} -> False
 
 t_infix_concat bs1 text bs2 =
@@ -464,14 +435,9 @@ testTranscoding =
       testProperty "t_chunk_decode_utf16LE" t_chunk_decode_utf16LE,
       testProperty "t_chunk_decode_utf32BE" t_chunk_decode_utf32BE,
       testProperty "t_chunk_decode_utf32LE" t_chunk_decode_utf32LE,
-      testProperty "t_stream_decode_utf8_1" t_stream_decode_utf8_1,
-      testProperty "t_stream_decode_utf8_2" t_stream_decode_utf8_2,
-      testProperty "t_stream_decode_utf8_3" t_stream_decode_utf8_3,
-      testProperty "t_stream_decode_utf16BE" t_stream_decode_utf16BE,
-      testProperty "t_stream_decode_utf16LE" t_stream_decode_utf16LE,
-      testProperty "t_stream_decode_utf32BE" t_stream_decode_utf32BE,
-      testProperty "t_stream_decode_utf32LE" t_stream_decode_utf32LE,
       testProperty "t_decode_utf8_lenient" t_decode_utf8_lenient,
+      testProperty "t_decode_utf8_stream_lenient" t_decode_utf8_stream_lenient,
+      testProperty "t_decode_utf8_lenient_stream" t_decode_utf8_lenient_stream,
       testProperty "t_decode_utf8_stream" t_decode_utf8_stream,
       testProperty "t_decode_with_error2" t_decode_with_error2,
       testProperty "t_decode_with_error3" t_decode_with_error3,
diff --git a/tests/Tests/QuickCheckUtils.hs b/tests/Tests/QuickCheckUtils.hs
index 8f36f7ec..a6d72529 100644
--- a/tests/Tests/QuickCheckUtils.hs
+++ b/tests/Tests/QuickCheckUtils.hs
@@ -42,7 +42,7 @@ import Tests.Utils
 import qualified Data.ByteString as B
 import qualified Data.ByteString.Lazy as BL
 import qualified Data.Text as T
-import qualified Data.Text.Encoding.Error as T
+import qualified Data.Text.Encoding.Common as T
 import qualified Data.Text.Internal.Fusion as TF
 import qualified Data.Text.Internal.Fusion.Common as TF
 import qualified Data.Text.Internal.Lazy as TL
diff --git a/tests/Tests/Regressions.hs b/tests/Tests/Regressions.hs
index cf011680..7828bc75 100644
--- a/tests/Tests/Regressions.hs
+++ b/tests/Tests/Regressions.hs
@@ -21,7 +21,7 @@ import qualified Data.ByteString.Lazy as LB
 import qualified Data.Text as T
 import qualified Data.Text.Array as TA
 import qualified Data.Text.Encoding as TE
-import qualified Data.Text.Encoding.Error as E
+import qualified Data.Text.Encoding.Common as E
 import qualified Data.Text.Internal as T
 import qualified Data.Text.IO as T
 import qualified Data.Text.Lazy as LT
diff --git a/text.cabal b/text.cabal
index 956d8ce3..7def5192 100644
--- a/text.cabal
+++ b/text.cabal
@@ -137,7 +137,7 @@ library
     Data.Text.Array
     Data.Text.Encoding
     Data.Text.Encoding.Error
-    Data.Text.Encoding.Types
+    Data.Text.Encoding.Common
     Data.Text.Foreign
     Data.Text.IO
     Data.Text.Internal
@@ -255,6 +255,7 @@ test-suite tests
     deepseq,
     directory,
     ghc-prim,
+    mtl,
     tasty,
     tasty-hunit,
     tasty-quickcheck,

From 270a6055f2430cba38d9ff6312550461e05d288e Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Sat, 16 Jul 2022 19:46:22 -0600
Subject: [PATCH 20/87] replace <> with mappend

---
 tests/Tests/Properties/Transcoding.hs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs
index 9b36bfde..4c68f90a 100644
--- a/tests/Tests/Properties/Transcoding.hs
+++ b/tests/Tests/Properties/Transcoding.hs
@@ -311,10 +311,10 @@ decodeStream decoder snoc bs = runIdentity $ do
               Just c -> t `snoc` c
               _ -> t)
         in do
-          tell (t' <>)
+          tell (mappend t')
           f mempty
       _ -> do
-      tell (t <>)
+      tell (mappend t)
       pure (b, p)
   pure (tDiff mempty, b, p)
 

From 135e7f5f3d56e15e00dc9bb7be0cbc09d23f13c6 Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Sun, 17 Jul 2022 14:31:43 -0600
Subject: [PATCH 21/87] fix haddocks, remove commented code, remove mtl
 dependecy for tests

---
 changelog.md                          |  2 --
 src/Data/Text/Encoding.hs             |  8 ++++----
 src/Data/Text/Encoding/Common.hs      |  8 ++++----
 src/Data/Text/Encoding/Error.hs       |  1 -
 src/Data/Text/Internal/Builder.hs     | 11 -----------
 src/Data/Text/Lazy/Encoding.hs        |  8 ++++----
 tests/Tests/Properties/Transcoding.hs | 15 +++++----------
 text.cabal                            |  1 -
 8 files changed, 17 insertions(+), 37 deletions(-)

diff --git a/changelog.md b/changelog.md
index 4dc76498..b0347e05 100644
--- a/changelog.md
+++ b/changelog.md
@@ -1,5 +1,3 @@
-### 2.0.1
-
 * Improve portability of C and C++ code.
 * [Make `Lift` instance more efficient](https://github.com/haskell/text/pull/413)
 * [Make `toCaseFold` idempotent](https://github.com/haskell/text/pull/402)
diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index b8e9ffb1..f67ca72c 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -352,7 +352,7 @@ queryUtf8DecodeOptimization (B.length -> len1) bs2@(B.length -> len2) srcOff
 -- | Decode two 'ByteString's containing UTF-8-encoded text as though
 -- they were one continuous 'ByteString' returning a 'DecodeResult'.
 --
--- @since 2.0.0.1
+-- @since 2.0.1
 decodeUtf8Chunks
   :: ByteString -- ^ The first `ByteString` chunk to decode. Typically this is the unencoded data from the previous call of this function.
   -> ByteString -- ^ The second `ByteString` chunk to decode.
@@ -375,7 +375,7 @@ noOptimization _ _ _ = Nothing
 -- | Decode two 'ByteString's containing UTF-16-encoded text as though
 -- they were one continuous 'ByteString' returning a 'DecodeResult'.
 --
--- @since 2.0.0.1
+-- @since 2.0.1
 decodeUtf16Chunks
   :: Bool       -- ^ Indicates whether the encoding is big-endian (`True`) or little-endian (`False`)
   -> ByteString -- ^ The first `ByteString` chunk to decode. Typically this is the unencoded data from the previous call of this function.
@@ -405,7 +405,7 @@ decodeUtf16Chunks isBE = decodeChunksProxy noOptimization $ \ index len srcOff -
 -- | Decode two 'ByteString's containing UTF-16-encoded text as though
 -- they were one continuous 'ByteString' returning a 'DecodeResult'.
 --
--- @since 2.0.0.1
+-- @since 2.0.1
 decodeUtf32Chunks
   :: Bool       -- ^ Indicates whether the encoding is big-endian (`True`) or little-endian (`False`)
   -> ByteString -- ^ The first `ByteString` chunk to decode. Typically this is the unencoded data from the previous call of this function.
@@ -425,7 +425,7 @@ decodeUtf32Chunks isBE = decodeChunksProxy noOptimization $ \ index _ srcOff ->
 -- 'Right' value, and an error ('Left' 'Int') indicates the postion of
 -- the offending 'Word8'.
 --
--- @since 2.0.0.1
+-- @since 2.0.1
 decodeAsciiE :: ByteString -> Either Int Text
 decodeAsciiE bs = withBS bs $ \fp len -> if len == 0 then Right empty else runST $ do
   asciiPrefixLen <- fmap cSizeToInt $ unsafeIOToST $ unsafeWithForeignPtr fp $ \src ->
diff --git a/src/Data/Text/Encoding/Common.hs b/src/Data/Text/Encoding/Common.hs
index 3212e315..d46c40ee 100644
--- a/src/Data/Text/Encoding/Common.hs
+++ b/src/Data/Text/Encoding/Common.hs
@@ -47,12 +47,12 @@ import Numeric (showHex)
 
 -- | A decoding result on encoded data.
 --
--- @since 2.0.0.1
+-- @since 2.0.1
 data DecodeResult t b w = DecodeResult
   !t          -- ^ The decoded data up to an incomplete code point at
               -- the end of the input data, an invalid word, or to the
               -- end of the input.
-  !(Maybe w)  -- ^ If an invalid code point was encountered.
+  (Maybe w)  -- ^ If an invalid code point was encountered.
   !b          -- ^ The remaining undecoded data. If an invald code
               -- point was encountered, this is after that code point.
   !Int        -- ^ Byte position of remaining undecoded data.
@@ -60,7 +60,7 @@ data DecodeResult t b w = DecodeResult
 
 -- | The funtion type for handling successful decodings.
 --
--- @since 2.0.0.1
+-- @since 2.0.1
 type DecodeResultHandler t w b b' r
   = t           -- ^ The decoded data up to an incomplete code point at
                 -- the end of the input data, an invalid word, or to
@@ -82,7 +82,7 @@ type DecodeResultHandler t w b b' r
 -- result. The continuations accept another section of unencoded data
 -- as a continuation of any remaining unencoded data.
 --
--- @since 2.0.0.1
+-- @since 2.0.1
 chunksDecoderToStream :: Monoid b
   => (b -> b' -> DecodeResult t b w)  -- ^ Chunks decoder
   -> b'                               -- ^ Encoded data
diff --git a/src/Data/Text/Encoding/Error.hs b/src/Data/Text/Encoding/Error.hs
index 569fafcf..83fa3a76 100644
--- a/src/Data/Text/Encoding/Error.hs
+++ b/src/Data/Text/Encoding/Error.hs
@@ -38,7 +38,6 @@ import Data.Text.Encoding.Common
   , OnError
   , OnDecodeError
   , OnEncodeError
-  -- * Useful error handling functions
   , lenientDecode
   , strictDecode
   , strictEncode
diff --git a/src/Data/Text/Internal/Builder.hs b/src/Data/Text/Internal/Builder.hs
index 6269c597..590c6528 100644
--- a/src/Data/Text/Internal/Builder.hs
+++ b/src/Data/Text/Internal/Builder.hs
@@ -108,11 +108,6 @@ instance Monoid Builder where
    mconcat = foldr mappend Data.Monoid.mempty
    {-# INLINE mconcat #-}
 
--- | Performs replacement on invalid scalar values:
---
--- >>> :set -XOverloadedStrings
--- >>> "\55555" :: Builder
--- "\65533"
 instance String.IsString Builder where
     fromString = fromString
     {-# INLINE fromString #-}
@@ -186,12 +181,6 @@ fromText t@(Text arr off l)
 --
 --  * @'toLazyText' ('fromString' s) = 'L.fromChunks' [S.pack s]@
 --
--- Performs replacement on invalid scalar values:
---
--- >>> fromString "\55555"
--- "\65533"
---
--- @since 1.2.0.0
 fromString :: String -> Builder
 fromString str = Builder $ \k (Buffer p0 o0 u0 l0) ->
     let loop !marr !o !u !l [] = k (Buffer marr o u l)
diff --git a/src/Data/Text/Lazy/Encoding.hs b/src/Data/Text/Lazy/Encoding.hs
index 25f655c2..14fa6ef8 100644
--- a/src/Data/Text/Lazy/Encoding.hs
+++ b/src/Data/Text/Lazy/Encoding.hs
@@ -115,7 +115,7 @@ decodeLatin1 = foldr (chunk . TE.decodeLatin1) empty . B.toChunks
 -- 'Right' value, and an error ('Left' 'Int') indicates the postion of
 -- the offending 'Word8'.
 --
--- @since 2.0.0.1
+-- @since 2.0.1
 decodeAsciiE :: B.ByteString -> Either Int Text
 decodeAsciiE = foldr (\ lb -> (chunk <$> TE.decodeAsciiE lb <*>)) (pure empty) . B.toChunks
 
@@ -138,7 +138,7 @@ decodeChunks decoder = g id 0 mempty
 -- | Decode two 'B.ByteString's containing UTF-8-encoded text as though
 -- they were one continuous 'B.ByteString' returning a 'DecodeResult'.
 --
--- @since 2.0.0.1
+-- @since 2.0.1
 decodeUtf8Chunks
   :: B.ByteString -- ^ The first `B.ByteString` chunk to decode. Typically this is the unencoded data from the previous call of this function.
   -> B.ByteString -- ^ The second `B.ByteString` chunk to decode.
@@ -148,7 +148,7 @@ decodeUtf8Chunks = decodeChunks TE.decodeUtf8Chunks
 -- | Decode two 'B.ByteString's containing UTF-16-encoded text as though
 -- they were one continuous 'B.ByteString' returning a 'DecodeResult'.
 --
--- @since 2.0.0.1
+-- @since 2.0.1
 decodeUtf16Chunks
   :: Bool         -- ^ Indicates whether the encoding is big-endian (`True`) or little-endian (`False`)
   -> B.ByteString -- ^ The first `B.ByteString` chunk to decode. Typically this is the unencoded data from the previous call of this function.
@@ -159,7 +159,7 @@ decodeUtf16Chunks = decodeChunks . TE.decodeUtf16Chunks
 -- | Decode two 'B.ByteString's containing UTF-32-encoded text as though
 -- they were one continuous 'B.ByteString' returning a 'DecodeResult'.
 --
--- @since 2.0.0.1
+-- @since 2.0.1
 decodeUtf32Chunks
   :: Bool         -- ^ Indicates whether the encoding is big-endian (`True`) or little-endian (`False`)
   -> B.ByteString -- ^ The first `B.ByteString` chunk to decode. Typically this is the unencoded data from the previous call of this function.
diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs
index 4c68f90a..0cbc0141 100644
--- a/tests/Tests/Properties/Transcoding.hs
+++ b/tests/Tests/Properties/Transcoding.hs
@@ -6,9 +6,6 @@ module Tests.Properties.Transcoding
     ( testTranscoding
     ) where
 
-import Control.Monad.Writer (runWriterT, tell)
-import Data.Functor.Identity (runIdentity)
--- import Data.Monoid (Endo(appEndo))
 import Data.Bits ((.&.), shiftR)
 import Data.Char (chr, ord)
 import Data.Maybe (fromMaybe)
@@ -303,20 +300,18 @@ t_decode_utf8_lenient :: Property
 t_decode_utf8_lenient = forAllShrinkShow arbitrary shrink (show . BL.toChunks) $ \bs ->
     decodeLL bs === (TL.fromStrict . decodeL . B.concat . BL.toChunks) bs
 
-decodeStream decoder snoc bs = runIdentity $ do
-  ((b, p), tDiff) <- runWriterT $ decoder bs $ \ t mW b p f ->
+decodeStream decoder snoc bs =
+  case decoder bs $ \ t mW b p f ->
     case mW of
       Just w ->
         let t' = (case C.lenientDecode "" $ Just w of
               Just c -> t `snoc` c
               _ -> t)
         in do
-          tell (mappend t')
+          (mappend t', ())
           f mempty
-      _ -> do
-      tell (mappend t)
-      pure (b, p)
-  pure (tDiff mempty, b, p)
+      _ -> (mappend t, (b, p)) of
+    (tDiff, (b, p)) -> (tDiff mempty, b, p)
 
 decodeUtf8StreamL = C.chunksDecoderToStream EL.decodeUtf8Chunks
 
diff --git a/text.cabal b/text.cabal
index 7def5192..cabb1582 100644
--- a/text.cabal
+++ b/text.cabal
@@ -255,7 +255,6 @@ test-suite tests
     deepseq,
     directory,
     ghc-prim,
-    mtl,
     tasty,
     tasty-hunit,
     tasty-quickcheck,

From 74520741a084c28f285676ea055624cd0aacddbe Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Sun, 17 Jul 2022 14:51:16 -0600
Subject: [PATCH 22/87] remove unused extension from
 Tests.Properties.Transcoding

---
 tests/Tests/Properties/Transcoding.hs | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs
index 0cbc0141..0fcdb7ef 100644
--- a/tests/Tests/Properties/Transcoding.hs
+++ b/tests/Tests/Properties/Transcoding.hs
@@ -1,6 +1,6 @@
 -- | Tests for encoding and decoding
 
-{-# LANGUAGE OverloadedStrings, ScopedTypeVariables, FlexibleContexts #-}
+{-# LANGUAGE OverloadedStrings, ScopedTypeVariables #-}
 {-# OPTIONS_GHC -fno-warn-missing-signatures #-}
 module Tests.Properties.Transcoding
     ( testTranscoding
@@ -307,9 +307,9 @@ decodeStream decoder snoc bs =
         let t' = (case C.lenientDecode "" $ Just w of
               Just c -> t `snoc` c
               _ -> t)
-        in do
-          (mappend t', ())
-          f mempty
+        in
+        case f mempty of
+          (tDiff, bp) -> (mappend t' `mappend` tDiff, bp)
       _ -> (mappend t, (b, p)) of
     (tDiff, (b, p)) -> (tDiff mempty, b, p)
 

From 438ab9b728841f39737114f75550591b80232d7d Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Sun, 17 Jul 2022 15:27:24 -0600
Subject: [PATCH 23/87] Accommodating older versions of Haddock

---
 src/Data/Text/Encoding/Common.hs | 50 +++++++++++++++-----------------
 1 file changed, 23 insertions(+), 27 deletions(-)

diff --git a/src/Data/Text/Encoding/Common.hs b/src/Data/Text/Encoding/Common.hs
index d46c40ee..dfd9f05d 100644
--- a/src/Data/Text/Encoding/Common.hs
+++ b/src/Data/Text/Encoding/Common.hs
@@ -45,42 +45,38 @@ import Data.Typeable (Typeable)
 import Data.Word (Word8)
 import Numeric (showHex)
 
--- | A decoding result on encoded data.
+-- | A decoding result on encoded data. It contains:
+--
+--   1. the decoded data up to an incomplete code point at the end of
+--      the input data, an invalid word, or to the end of the input,
+--   2. the invalid code point if one was encountered,
+--   3. the remaining undecoded data, which is either an incomplete
+--      code point, the data after the invalid code point, or empty,
+--      and
+--   4. the byte position of remaining undecoded data.
 --
 -- @since 2.0.1
-data DecodeResult t b w = DecodeResult
-  !t          -- ^ The decoded data up to an incomplete code point at
-              -- the end of the input data, an invalid word, or to the
-              -- end of the input.
-  (Maybe w)  -- ^ If an invalid code point was encountered.
-  !b          -- ^ The remaining undecoded data. If an invald code
-              -- point was encountered, this is after that code point.
-  !Int        -- ^ Byte position of remaining undecoded data.
+data DecodeResult t b w = DecodeResult !t !(Maybe w) !b !Int
   deriving (Eq, Ord, Show, Read)
 
--- | The funtion type for handling successful decodings.
+-- | The funtion type for handling successful decodings. The arguments
+-- passed to the handler are:
+--
+--   1. the decoded data up to an incomplete code point at the end of
+--      the input data, an invalid word, or to the end of the input,
+--   2. the invalid code point if one was encountered,
+--   3. the remaining undecoded data, which is either an incomplete
+--      code point, the data after the invalid code point, or empty,
+--   4. the byte position of remaining undecoded data, and
+--   5. a continuation function that accepts data that as treated as
+--      a continuation of any remaining unencoded data.
 --
 -- @since 2.0.1
-type DecodeResultHandler t w b b' r
-  = t           -- ^ The decoded data up to an incomplete code point at
-                -- the end of the input data, an invalid word, or to
-                -- the end of the input.
-  -> Maybe w    -- ^ If an invalid code point was encountered.
-  -> b          -- ^ The remaining undecoded data. If an invald code
-                -- point was encountered, this is after that code
-                -- point.
-  -> Int        -- ^ Byte position of remaining undecoded data. This
-                -- is treated as if all the data fed to previous
-                -- invocations of the continations were one continuous
-                -- feed.
-  -> (b' -> r)  -- ^ Continuation to accept the next span of data to be
-                -- decoded with the remaining unencoded data.
-  -> r          -- ^ Result of the continuation.
+type DecodeResultHandler t w b b' r = t -> Maybe w -> b -> Int -> (b' -> r) -> r
 
 -- | Create a stream decoder from a chunks decoder. The resulting
 -- stream decoder accepts a 'DecodeResultHandler' to process the decode
--- result. The continuations accept another section of unencoded data
--- as a continuation of any remaining unencoded data.
+-- result.
 --
 -- @since 2.0.1
 chunksDecoderToStream :: Monoid b

From 623eff363708f8f60663f66a702b5d4b9b6fe387 Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Wed, 20 Jul 2022 07:45:29 -0600
Subject: [PATCH 24/87] Removed double copy of UTF-8 optimization. Refactored
 UTF-16 and UTF-32 code point inspectors

---
 src/Data/Text/Encoding.hs                | 167 +++++++++++++----------
 src/Data/Text/Internal/Encoding/Utf16.hs |  47 +++----
 src/Data/Text/Internal/Encoding/Utf32.hs |  36 +++--
 3 files changed, 135 insertions(+), 115 deletions(-)

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index f67ca72c..496f66de 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -4,6 +4,7 @@
 {-# LANGUAGE TypeApplications #-}
 {-# LANGUAGE ScopedTypeVariables #-}
 {-# LANGUAGE ViewPatterns #-}
+{-# LANGUAGE TupleSections #-}
 -- |
 -- Module      : Data.Text.Encoding
 -- Copyright   : (c) 2009, 2010, 2011 Bryan O'Sullivan,
@@ -76,7 +77,7 @@ module Data.Text.Encoding
 import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
 
 import Control.Exception (evaluate, try)
-import Control.Monad.ST (runST)
+import Control.Monad.ST (ST, runST)
 import Data.Bits (Bits, shiftL, shiftR, (.&.), (.|.))
 import Data.ByteString (ByteString)
 import qualified Data.ByteString as B
@@ -218,22 +219,29 @@ isValidBS bs = start 0
 #endif
 #endif
 
-data Progression
-  = WriteAndAdvance Char (Int -> Int)
-  | NeedMore
-  | Invalid
-
-decodeChunks :: (Bits w, Num w, Storable w) =>
-  w -- only used for Storable.sizeOf argument which the function discards
-  -> (ByteString -> ByteString -> Int -> Maybe (A.Array, Int))
+decodeChunks :: (Bits w, Num w, Storable w)
+  => w
+  -> ( Int
+    -> ByteString
+    -> Int
+    -> A.MArray s
+    -> Int
+    -> Int
+    -> ST
+        s
+        (Maybe
+            ((A.MArray s
+              -> Int -> Int -> Int -> ST s (DecodeResult Text ByteString w))
+            -> ST s (DecodeResult Text ByteString w)))
+    )
   -> ((Int -> Word8) -> Int -> Int -> Progression)
   -> ByteString
   -> ByteString
-  -> DecodeResult Text ByteString w
+  -> ST s (DecodeResult Text ByteString w)
 decodeChunks w queryOptimization decodeF bs1@(B.length -> len1) bs2@(B.length -> len2)
   | len2 == 0
   , len1 > 0 = decodeChunks w queryOptimization decodeF bs2 bs1
-  | otherwise = runST $ do
+  | otherwise = do
     marr <- A.new len'
     outer marr len' 0 0
     where
@@ -249,43 +257,33 @@ decodeChunks w queryOptimization decodeF bs1@(B.length -> len1) bs2@(B.length ->
       len' :: Int
       len' = (len `div` wordByteSize) + 4
 
-      -- outer :: (Bits w, Num w) => A.MArray s -> Int -> Int -> Int -> ST s (DecodeResult Text ByteString w)
       outer dst dstLen = inner
         where
           inner srcOff dstOff
             -- finished
             | len - srcOff < 1 = goodSoFar
             -- shortcut for utf-8
-            | otherwise =
-              case queryOptimization bs1 bs2 srcOff of
-                Just (arr, tLen) ->
-                  let minLen = tLen + dstOff in
-                  if minLen > dstLen
-                    then do
-                      let dstLen' = minLen + 4
-                      dst' <- A.resizeM dst dstLen'
-                      A.copyI tLen dst' dstOff arr 0
-                      outer dst' dstLen' (srcOff + tLen) minLen
-                    else do
-                      A.copyI tLen dst dstOff arr 0
-                      inner (srcOff + tLen) minLen
+            | otherwise = do
+              mOuterArgs <- queryOptimization len1 bs2 srcOff dst dstLen dstOff
+              case mOuterArgs of
+                Just outerArgs -> outerArgs outer
                 _ -> if len - srcOff < wordByteSize
-                    -- incomplete code point
-                    then goodSoFar
-                    else
-                      if dstOff + 4 > dstLen
-                        -- need more space in destination
-                        then do
-                          let dstLen' = dstLen + 4
-                          dst' <- A.resizeM dst dstLen'
-                          outer dst' dstLen' srcOff dstOff
-                        else
-                          case decodeF index len srcOff of
-                            WriteAndAdvance c advance -> do
-                              d <- unsafeWrite dst dstOff c
-                              inner (advance d) $ dstOff + d
-                            NeedMore -> goodSoFar
-                            Invalid -> invalid
+                  -- incomplete code point
+                  then goodSoFar
+                  else
+                    if dstOff + 4 > dstLen
+                      -- need more space in destination
+                      then do
+                        let dstLen' = dstLen + 4
+                        dst' <- A.resizeM dst dstLen'
+                        outer dst' dstLen' srcOff dstOff
+                      else
+                        case decodeF index len srcOff of
+                          WriteAndAdvance c advance -> do
+                            d <- unsafeWrite dst dstOff c
+                            inner (advance d) $ dstOff + d
+                          NeedMore -> goodSoFar
+                          Invalid -> invalid
             where
               contin off res = do
                 A.shrinkM dst dstOff
@@ -306,19 +304,37 @@ decodeChunks w queryOptimization decodeF bs1@(B.length -> len1) bs2@(B.length ->
                 contin srcOff' $ \ t bs' ->
                   DecodeResult t (Just $ bytesToWord wordByteSize 0) bs' srcOff'
 
-decodeChunksProxy :: (Bits w, Num w, Storable w) =>
-  (ByteString -> ByteString -> Int -> Maybe (A.Array, Int))
+decodeChunksProxy :: (Bits w, Num w, Storable w)
+  => ( Int
+    -> ByteString
+    -> Int
+    -> A.MArray s
+    -> Int
+    -> Int
+    -> ST s (Maybe (
+        (A.MArray s -> Int -> Int -> Int -> ST s (DecodeResult Text ByteString w))
+        -> ST s (DecodeResult Text ByteString w)
+        )
+      )
+    )
   -> ((Int -> Word8) -> Int -> Int -> Progression)
   -> ByteString
   -> ByteString
-  -> DecodeResult Text ByteString w
+  -> ST s (DecodeResult Text ByteString w)
 decodeChunksProxy = decodeChunks undefined -- This allows Haskell to
 -- determine the size in bytes of a data type using Storable.sizeOf
 -- so that it doesn't have to be passed as an arugment. Storable.sizeOf
 -- discards the actual value without evaluating it.
 
-queryUtf8DecodeOptimization :: ByteString -> ByteString -> Int -> Maybe (A.Array, Int)
-queryUtf8DecodeOptimization (B.length -> len1) bs2@(B.length -> len2) srcOff
+queryUtf8DecodeOptimization
+  :: Int
+  -> ByteString
+  -> Int
+  -> A.MArray s
+  -> Int
+  -> Int
+  -> ST s (Maybe ((A.MArray s -> Int -> Int -> Int -> t) -> t))
+queryUtf8DecodeOptimization len1 bs2@(B.length -> len2) srcOff dst dstLen dstOff
   | srcOff >= len1
   -- potential valid utf8 content endpoint
   , utf8End <- len1 + guessUtf8Boundary
@@ -326,13 +342,18 @@ queryUtf8DecodeOptimization (B.length -> len1) bs2@(B.length -> len2) srcOff
   -- potential valid utf8 content length
   , utf8Len <- utf8End - srcOff
   , bs' <- B.drop (srcOff - len1) $ B.take guessUtf8Boundary bs2
-  , isValidBS bs' = Just (runST $ do
-    marr <- A.new utf8Len
+  , isValidBS bs'
+  , minLen <- utf8Len + dstOff = do
+    (dst', dstLen') <-
+      if minLen > dstLen
+        then
+          let newLen = minLen + 4 in
+          (, newLen) <$> A.resizeM dst newLen
+        else pure (dst, dstLen)
     withBS bs' $ \fp _ -> unsafeIOToST $ unsafeWithForeignPtr fp $ \src ->
-      unsafeSTToIO $ A.copyFromPointer marr 0 src utf8Len
-    arr <- A.unsafeFreeze marr
-    pure $ (arr, utf8Len))
-  | otherwise = Nothing
+      unsafeSTToIO $ A.copyFromPointer dst' dstOff src utf8Len
+    pure . Just $ \ f -> f dst' dstLen' (srcOff + utf8Len) minLen
+  | otherwise = pure Nothing
   where
     -- We need Data.ByteString.findIndexEnd, but it is unavailable before bytestring-0.10.12.0
     guessUtf8Boundary :: Int
@@ -357,7 +378,7 @@ decodeUtf8Chunks
   :: ByteString -- ^ The first `ByteString` chunk to decode. Typically this is the unencoded data from the previous call of this function.
   -> ByteString -- ^ The second `ByteString` chunk to decode.
   -> DecodeResult Text ByteString Word8
-decodeUtf8Chunks = decodeChunksProxy queryUtf8DecodeOptimization $ \ index len srcOff ->
+decodeUtf8Chunks bs1 bs2 = runST $ decodeChunksProxy queryUtf8DecodeOptimization (\ index len srcOff ->
   let decodeFrom off = step (off + 1) . utf8DecodeStart $ index off
 
       step i (Incomplete a b)
@@ -367,10 +388,10 @@ decodeUtf8Chunks = decodeChunksProxy queryUtf8DecodeOptimization $ \ index len s
   case decodeFrom srcOff of
     Accept c -> WriteAndAdvance c (srcOff +)
     Reject -> Invalid
-    Incomplete{} -> NeedMore
+    Incomplete{} -> NeedMore) bs1 bs2
 
-noOptimization :: p1 -> p2 -> p3 -> Maybe a
-noOptimization _ _ _ = Nothing
+noOptimization :: Applicative f => p0 -> p1 -> p2 -> p3 -> p4 -> p5 -> f (Maybe a)
+noOptimization _ _ _ _ _ _ = pure Nothing
 
 -- | Decode two 'ByteString's containing UTF-16-encoded text as though
 -- they were one continuous 'ByteString' returning a 'DecodeResult'.
@@ -381,14 +402,14 @@ decodeUtf16Chunks
   -> ByteString -- ^ The first `ByteString` chunk to decode. Typically this is the unencoded data from the previous call of this function.
   -> ByteString -- ^ The second `ByteString` chunk to decode.
   -> DecodeResult Text ByteString Word16
-decodeUtf16Chunks isBE = decodeChunksProxy noOptimization $ \ index len srcOff ->
+decodeUtf16Chunks isBE bs1 bs2 = runST $ decodeChunksProxy noOptimization (\ index len srcOff ->
   -- get next Word8 pair
   let writeAndAdvance c n = WriteAndAdvance c $ const n
       b0 = index $ if isBE then srcOff else srcOff + 1
       b1 = index $ if isBE then srcOff + 1 else srcOff
   in
-  case queryUtf16Bytes b0 of
-    OneWord16 f -> writeAndAdvance (f b1) $ srcOff + 2
+  case queryUtf16Bytes b0 b1 of
+    OneWord16 c -> writeAndAdvance c $ srcOff + 2
     TwoWord16 g ->
       if len - srcOff < 4
         -- not enough Word8s to finish the code point
@@ -397,10 +418,10 @@ decodeUtf16Chunks isBE = decodeChunksProxy noOptimization $ \ index len srcOff -
           let b2 = index $ srcOff + (if isBE then 2 else 3)
               b3 = index $ srcOff + (if isBE then 3 else 2)
           in
-          case g b2 of
-            Just f' -> writeAndAdvance (f' b1 b3) $ srcOff + 4
+          case g b2 b3 of
+            Just c -> writeAndAdvance c $ srcOff + 4
             _ -> Invalid
-    _ -> Invalid
+    _ -> Invalid) bs1 bs2
 
 -- | Decode two 'ByteString's containing UTF-16-encoded text as though
 -- they were one continuous 'ByteString' returning a 'DecodeResult'.
@@ -411,19 +432,25 @@ decodeUtf32Chunks
   -> ByteString -- ^ The first `ByteString` chunk to decode. Typically this is the unencoded data from the previous call of this function.
   -> ByteString -- ^ The second `ByteString` chunk to decode.
   -> DecodeResult Text ByteString Word32
-decodeUtf32Chunks isBE = decodeChunksProxy noOptimization $ \ index _ srcOff ->
+decodeUtf32Chunks isBE bs1 bs2 = runST $ decodeChunksProxy noOptimization (\ index _ srcOff ->
   -- get next Word8 quartet
-  case (queryUtf32Bytes . index $ if isBE then srcOff else srcOff + 3)
-      >>= ($ (index $ srcOff + (if isBE then 1 else 2)))
-      >>= ($ (index $ srcOff + (if isBE then 2 else 1))) of
-    Just f -> WriteAndAdvance (f . index $ if isBE then srcOff + 3 else srcOff) . const $ srcOff + 4
-    _ -> Invalid
+  case queryUtf32Bytes (index $ if isBE then srcOff else srcOff + 3)
+      (index $ srcOff + (if isBE then 1 else 2))
+      (index $ srcOff + (if isBE then 2 else 1))
+      (index $ if isBE then srcOff + 3 else srcOff) of
+    Just c -> WriteAndAdvance c . const $ srcOff + 4
+    _ -> Invalid) bs1 bs2
+
+data Progression
+  = WriteAndAdvance Char (Int -> Int)
+  | NeedMore
+  | Invalid
 
 -- | Decode a 'ByteString' containing 7-bit ASCII encoded text.
 --
 -- This is a total function: On success the decoded 'Text' is within a
--- 'Right' value, and an error ('Left' 'Int') indicates the postion of
--- the offending 'Word8'.
+-- 'Right' value, and an error @('Left' 'Int')@ indicates the position
+-- of the offending 'Word8'.
 --
 -- @since 2.0.1
 decodeAsciiE :: ByteString -> Either Int Text
diff --git a/src/Data/Text/Internal/Encoding/Utf16.hs b/src/Data/Text/Internal/Encoding/Utf16.hs
index 2749fa34..45b0eeb3 100644
--- a/src/Data/Text/Internal/Encoding/Utf16.hs
+++ b/src/Data/Text/Internal/Encoding/Utf16.hs
@@ -55,31 +55,28 @@ validate2 x1 x2 = x1 >= 0xD800 && x1 <= 0xDBFF &&
 {-# INLINE validate2 #-}
 
 data Utf16Result
-  = OneWord16 (Word8 -> Char)
-  | TwoWord16 (Word8 -> Maybe (Word8 -> Word8 -> Char))
+  = OneWord16 Char
+  | TwoWord16 (Word8 -> Word8 -> Maybe Char)
   | Invalid16
 
-queryUtf16Bytes :: Word8 -> Utf16Result
-queryUtf16Bytes b0@(W8# w0#) =
-  if b0 < 0xD8 || b0 >= 0xE0
-    then OneWord16 $ \ (W8# w1#) -> C# (chr# (orI# (word2Int# (shiftL# (word8ToWord# w0#) 8#)) (word2Int# (word8ToWord# w1#))))
-    else
-      -- 110110xx: start of surrogate pair
-      if b0 .&. 0xFC == 0xD8
-        then TwoWord16 $ \ b2@(W8# w2#) ->
-          if b2 .&. 0xFC == 0xDC
-            -- valid surrogate
-            then Just $ \ (W8# w1#) (W8# w3#) ->
-              C# (chr# (
-                (orI#
-                  (orI#
-                    (orI#
-                      (word2Int# (shiftL# (int2Word# (andI# 0x3# (word2Int# (word8ToWord# w0#)))) 18#))
-                      (word2Int# (shiftL# (word8ToWord# w1#) 10#))
-                    )
-                    (word2Int# (shiftL# (int2Word# (andI# 0x3# (word2Int# (word8ToWord# w2#)))) 8#)))
-                  (word2Int# (word8ToWord# w3#))) +# 0x10000#
-              ))
-            else Nothing
-        else Invalid16
+queryUtf16Bytes :: Word8 -> Word8 -> Utf16Result
+queryUtf16Bytes b0@(W8# w0#) (W8# w1#)
+  | b0 < 0xD8 || b0 >= 0xE0 = OneWord16 $ C# (chr# (orI# (word2Int# (shiftL# (word8ToWord# w0#) 8#)) (word2Int# (word8ToWord# w1#))))
+  -- 110110xx: start of surrogate pair
+  | b0 .&. 0xFC == 0xD8 = TwoWord16 $ \ b2@(W8# w2#) (W8# w3#) ->
+    if b2 .&. 0xFC == 0xDC
+      -- valid surrogate
+      then Just $
+        C# (chr# (
+          (orI#
+            (orI#
+              (orI#
+                (word2Int# (shiftL# (int2Word# (andI# 0x3# (word2Int# (word8ToWord# w0#)))) 18#))
+                (word2Int# (shiftL# (word8ToWord# w1#) 10#))
+              )
+              (word2Int# (shiftL# (int2Word# (andI# 0x3# (word2Int# (word8ToWord# w2#)))) 8#)))
+            (word2Int# (word8ToWord# w3#))) +# 0x10000#
+        ))
+      else Nothing
+  | otherwise = Invalid16
 {-# INLINE queryUtf16Bytes #-}
diff --git a/src/Data/Text/Internal/Encoding/Utf32.hs b/src/Data/Text/Internal/Encoding/Utf32.hs
index 517ff05a..40a0e25d 100644
--- a/src/Data/Text/Internal/Encoding/Utf32.hs
+++ b/src/Data/Text/Internal/Encoding/Utf32.hs
@@ -35,24 +35,20 @@ validate    :: Word32 -> Bool
 validate x1 = x1 < 0xD800 || (x1 > 0xDFFF && x1 <= 0x10FFFF)
 {-# INLINE validate #-}
 
-queryUtf32Bytes :: (Eq a, Num a) => a -> Maybe (Word8 -> Maybe (Word8 -> Maybe (Word8 -> Char)))
-queryUtf32Bytes b0 =
-  if b0 == 0
-    then Just $ \ b1@(W8# w1#) ->
-      if b1 < 0x11
-        then Just $ \ b2@(W8# w2#) ->
-          if b1 > 0 || b2 < 0xD8 || b2 >= 0xE0
-            then Just $ \ (W8# w3#) ->
-              C# (chr#
-                  (orI#
-                    (orI#
-                      (word2Int# (shiftL# (word8ToWord# w1#) 16#))
-                      (word2Int# (shiftL# (word8ToWord# w2#) 8#))
-                    )
-                    (word2Int# (word8ToWord# w3#))
-                  )
-                )
-            else Nothing
-        else Nothing
-    else Nothing
+queryUtf32Bytes :: (Eq a, Num a) => a -> Word8 -> Word8 -> Word8 -> Maybe Char
+queryUtf32Bytes b0 b1@(W8# w1#) b2@(W8# w2#) (W8# w3#)
+  | b0 == 0
+  , b1 < 0x11
+  , b1 > 0 || b2 < 0xD8 || b2 >= 0xE0 =
+    Just $
+      C# (chr#
+          (orI#
+            (orI#
+              (word2Int# (shiftL# (word8ToWord# w1#) 16#))
+              (word2Int# (shiftL# (word8ToWord# w2#) 8#))
+            )
+            (word2Int# (word8ToWord# w3#))
+          )
+        )
+  | otherwise = Nothing
 {-# INLINE queryUtf32Bytes #-}

From 30a3ae1079331ca76fcbd492bbaa24252794143d Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Wed, 20 Jul 2022 22:22:52 -0600
Subject: [PATCH 25/87] Clarify DecodeResult documentation. Simplify
 WriteAndProgress data constructor.

---
 src/Data/Text.hs                      | 18 ++++++++-
 src/Data/Text/Encoding.hs             | 56 +++++++++++++--------------
 src/Data/Text/Encoding/Common.hs      | 40 +------------------
 src/Data/Text/Internal/Builder.hs     | 11 ++++++
 src/Data/Text/Lazy.hs                 | 18 ++++++++-
 tests/Tests/Properties/Transcoding.hs | 48 -----------------------
 6 files changed, 71 insertions(+), 120 deletions(-)

diff --git a/src/Data/Text.hs b/src/Data/Text.hs
index c1eda582..5176a0ab 100644
--- a/src/Data/Text.hs
+++ b/src/Data/Text.hs
@@ -355,10 +355,21 @@ instance Monoid Text where
     mappend = (<>)
     mconcat = concat
 
+-- | Performs replacement on invalid scalar values:
+--
+-- >>> :set -XOverloadedStrings
+-- >>> "\55555" :: Text
+-- "\65533"
 instance IsString Text where
     fromString = pack
 
--- | @since 1.2.0.0
+-- | Performs replacement on invalid scalar values:
+--
+-- >>> :set -XOverloadedLists
+-- >>> ['\55555'] :: Text
+-- "\65533"
+--
+-- @since 1.2.0.0
 instance Exts.IsList Text where
     type Item Text = Char
     fromList       = pack
@@ -445,7 +456,10 @@ compareText (Text arrA offA lenA) (Text arrB offB lenB) =
 -- * Conversion to/from 'Text'
 
 -- | /O(n)/ Convert a 'String' into a 'Text'.
--- Performs replacement on invalid scalar values.
+-- Performs replacement on invalid scalar values, so @'unpack' . 'pack'@ is not 'id':
+--
+-- >>> unpack (pack "\55555")
+-- "\65533"
 pack :: String -> Text
 pack = unstream . S.map safe . S.streamList
 {-# INLINE [1] pack #-}
diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index 496f66de..af27cc50 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -74,10 +74,9 @@ module Data.Text.Encoding
     , encodeUtf8BuilderEscaped
     ) where
 
-import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
-
 import Control.Exception (evaluate, try)
 import Control.Monad.ST (ST, runST)
+import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
 import Data.Bits (Bits, shiftL, shiftR, (.&.), (.|.))
 import Data.ByteString (ByteString)
 import qualified Data.ByteString as B
@@ -219,20 +218,23 @@ isValidBS bs = start 0
 #endif
 #endif
 
+data Progression
+  = WriteAndAdvance Char Int
+  | NeedMore
+  | Invalid
+
 decodeChunks :: (Bits w, Num w, Storable w)
   => w
-  -> ( Int
+  -> ( ByteString
     -> ByteString
     -> Int
     -> A.MArray s
     -> Int
     -> Int
-    -> ST
-        s
-        (Maybe
-            ((A.MArray s
-              -> Int -> Int -> Int -> ST s (DecodeResult Text ByteString w))
-            -> ST s (DecodeResult Text ByteString w)))
+    -> ST s (Maybe (
+        (A.MArray s -> Int -> Int -> Int -> ST s (DecodeResult Text ByteString w))
+        -> ST s (DecodeResult Text ByteString w))
+      )
     )
   -> ((Int -> Word8) -> Int -> Int -> Progression)
   -> ByteString
@@ -264,7 +266,7 @@ decodeChunks w queryOptimization decodeF bs1@(B.length -> len1) bs2@(B.length ->
             | len - srcOff < 1 = goodSoFar
             -- shortcut for utf-8
             | otherwise = do
-              mOuterArgs <- queryOptimization len1 bs2 srcOff dst dstLen dstOff
+              mOuterArgs <- queryOptimization bs1 bs2 srcOff dst dstLen dstOff
               case mOuterArgs of
                 Just outerArgs -> outerArgs outer
                 _ -> if len - srcOff < wordByteSize
@@ -279,9 +281,9 @@ decodeChunks w queryOptimization decodeF bs1@(B.length -> len1) bs2@(B.length ->
                         outer dst' dstLen' srcOff dstOff
                       else
                         case decodeF index len srcOff of
-                          WriteAndAdvance c advance -> do
+                          WriteAndAdvance c srcOff' -> do
                             d <- unsafeWrite dst dstOff c
-                            inner (advance d) $ dstOff + d
+                            inner srcOff' $ dstOff + d
                           NeedMore -> goodSoFar
                           Invalid -> invalid
             where
@@ -305,7 +307,7 @@ decodeChunks w queryOptimization decodeF bs1@(B.length -> len1) bs2@(B.length ->
                   DecodeResult t (Just $ bytesToWord wordByteSize 0) bs' srcOff'
 
 decodeChunksProxy :: (Bits w, Num w, Storable w)
-  => ( Int
+  => ( ByteString
     -> ByteString
     -> Int
     -> A.MArray s
@@ -327,14 +329,14 @@ decodeChunksProxy = decodeChunks undefined -- This allows Haskell to
 -- discards the actual value without evaluating it.
 
 queryUtf8DecodeOptimization
-  :: Int
+  :: ByteString
   -> ByteString
   -> Int
   -> A.MArray s
   -> Int
   -> Int
   -> ST s (Maybe ((A.MArray s -> Int -> Int -> Int -> t) -> t))
-queryUtf8DecodeOptimization len1 bs2@(B.length -> len2) srcOff dst dstLen dstOff
+queryUtf8DecodeOptimization (B.length -> len1) bs2@(B.length -> len2) srcOff dst dstLen dstOff
   | srcOff >= len1
   -- potential valid utf8 content endpoint
   , utf8End <- len1 + guessUtf8Boundary
@@ -379,14 +381,13 @@ decodeUtf8Chunks
   -> ByteString -- ^ The second `ByteString` chunk to decode.
   -> DecodeResult Text ByteString Word8
 decodeUtf8Chunks bs1 bs2 = runST $ decodeChunksProxy queryUtf8DecodeOptimization (\ index len srcOff ->
-  let decodeFrom off = step (off + 1) . utf8DecodeStart $ index off
-
-      step i (Incomplete a b)
+  let step i (Incomplete a b)
         | i < len = step (i + 1) $ utf8DecodeContinue (index i) a b
-      step _ st = st
+      step i st = (st, i)
+      (dr, srcOff') = step (srcOff + 1) . utf8DecodeStart $ index srcOff
   in
-  case decodeFrom srcOff of
-    Accept c -> WriteAndAdvance c (srcOff +)
+  case dr of
+    Accept c -> WriteAndAdvance c srcOff'
     Reject -> Invalid
     Incomplete{} -> NeedMore) bs1 bs2
 
@@ -404,12 +405,12 @@ decodeUtf16Chunks
   -> DecodeResult Text ByteString Word16
 decodeUtf16Chunks isBE bs1 bs2 = runST $ decodeChunksProxy noOptimization (\ index len srcOff ->
   -- get next Word8 pair
-  let writeAndAdvance c n = WriteAndAdvance c $ const n
+  let writeAndAdvance c n = WriteAndAdvance c $ srcOff + n
       b0 = index $ if isBE then srcOff else srcOff + 1
       b1 = index $ if isBE then srcOff + 1 else srcOff
   in
   case queryUtf16Bytes b0 b1 of
-    OneWord16 c -> writeAndAdvance c $ srcOff + 2
+    OneWord16 c -> writeAndAdvance c 2
     TwoWord16 g ->
       if len - srcOff < 4
         -- not enough Word8s to finish the code point
@@ -419,7 +420,7 @@ decodeUtf16Chunks isBE bs1 bs2 = runST $ decodeChunksProxy noOptimization (\ ind
               b3 = index $ srcOff + (if isBE then 3 else 2)
           in
           case g b2 b3 of
-            Just c -> writeAndAdvance c $ srcOff + 4
+            Just c -> writeAndAdvance c 4
             _ -> Invalid
     _ -> Invalid) bs1 bs2
 
@@ -438,14 +439,9 @@ decodeUtf32Chunks isBE bs1 bs2 = runST $ decodeChunksProxy noOptimization (\ ind
       (index $ srcOff + (if isBE then 1 else 2))
       (index $ srcOff + (if isBE then 2 else 1))
       (index $ if isBE then srcOff + 3 else srcOff) of
-    Just c -> WriteAndAdvance c . const $ srcOff + 4
+    Just c -> WriteAndAdvance c $ srcOff + 4
     _ -> Invalid) bs1 bs2
 
-data Progression
-  = WriteAndAdvance Char (Int -> Int)
-  | NeedMore
-  | Invalid
-
 -- | Decode a 'ByteString' containing 7-bit ASCII encoded text.
 --
 -- This is a total function: On success the decoded 'Text' is within a
diff --git a/src/Data/Text/Encoding/Common.hs b/src/Data/Text/Encoding/Common.hs
index dfd9f05d..385c6025 100644
--- a/src/Data/Text/Encoding/Common.hs
+++ b/src/Data/Text/Encoding/Common.hs
@@ -21,10 +21,8 @@
 
 module Data.Text.Encoding.Common
   (
-  -- * Full-service result types and functions
+  -- * Full-service result type
     DecodeResult(..)
-  , DecodeResultHandler
-  , chunksDecoderToStream
   -- * Error handling types
   , UnicodeException(..)
   , OnError
@@ -49,7 +47,7 @@ import Numeric (showHex)
 --
 --   1. the decoded data up to an incomplete code point at the end of
 --      the input data, an invalid word, or to the end of the input,
---   2. the invalid code point if one was encountered,
+--   2. the first word of an invalid code point if one was encountered,
 --   3. the remaining undecoded data, which is either an incomplete
 --      code point, the data after the invalid code point, or empty,
 --      and
@@ -59,40 +57,6 @@ import Numeric (showHex)
 data DecodeResult t b w = DecodeResult !t !(Maybe w) !b !Int
   deriving (Eq, Ord, Show, Read)
 
--- | The funtion type for handling successful decodings. The arguments
--- passed to the handler are:
---
---   1. the decoded data up to an incomplete code point at the end of
---      the input data, an invalid word, or to the end of the input,
---   2. the invalid code point if one was encountered,
---   3. the remaining undecoded data, which is either an incomplete
---      code point, the data after the invalid code point, or empty,
---   4. the byte position of remaining undecoded data, and
---   5. a continuation function that accepts data that as treated as
---      a continuation of any remaining unencoded data.
---
--- @since 2.0.1
-type DecodeResultHandler t w b b' r = t -> Maybe w -> b -> Int -> (b' -> r) -> r
-
--- | Create a stream decoder from a chunks decoder. The resulting
--- stream decoder accepts a 'DecodeResultHandler' to process the decode
--- result.
---
--- @since 2.0.1
-chunksDecoderToStream :: Monoid b
-  => (b -> b' -> DecodeResult t b w)  -- ^ Chunks decoder
-  -> b'                               -- ^ Encoded data
-  -> DecodeResultHandler t w b b' r   -- ^ Result continuation
-  -> r                                -- ^ Continuation result
-chunksDecoderToStream chunksDecoder bs f =
-  g mempty bs 0
-  where
-    g bs0 bs1 pos0 =
-      let DecodeResult t mW bs1' pos1 = chunksDecoder bs0 bs1
-          pos = pos0 + pos1
-      in
-      f t mW bs1' pos $ \ bs2 -> g bs1' bs2 pos
-
 -- | Function type for handling a coding error.  It is supplied with
 -- two inputs:
 --
diff --git a/src/Data/Text/Internal/Builder.hs b/src/Data/Text/Internal/Builder.hs
index 590c6528..6269c597 100644
--- a/src/Data/Text/Internal/Builder.hs
+++ b/src/Data/Text/Internal/Builder.hs
@@ -108,6 +108,11 @@ instance Monoid Builder where
    mconcat = foldr mappend Data.Monoid.mempty
    {-# INLINE mconcat #-}
 
+-- | Performs replacement on invalid scalar values:
+--
+-- >>> :set -XOverloadedStrings
+-- >>> "\55555" :: Builder
+-- "\65533"
 instance String.IsString Builder where
     fromString = fromString
     {-# INLINE fromString #-}
@@ -181,6 +186,12 @@ fromText t@(Text arr off l)
 --
 --  * @'toLazyText' ('fromString' s) = 'L.fromChunks' [S.pack s]@
 --
+-- Performs replacement on invalid scalar values:
+--
+-- >>> fromString "\55555"
+-- "\65533"
+--
+-- @since 1.2.0.0
 fromString :: String -> Builder
 fromString str = Builder $ \k (Buffer p0 o0 u0 l0) ->
     let loop !marr !o !u !l [] = k (Buffer marr o u l)
diff --git a/src/Data/Text/Lazy.hs b/src/Data/Text/Lazy.hs
index 9e2638eb..39834a8b 100644
--- a/src/Data/Text/Lazy.hs
+++ b/src/Data/Text/Lazy.hs
@@ -316,10 +316,21 @@ instance Monoid Text where
     mappend = (<>)
     mconcat = concat
 
+-- | Performs replacement on invalid scalar values:
+--
+-- >>> :set -XOverloadedStrings
+-- >>> "\55555" :: Data.Text.Lazy.Text
+-- "\65533"
 instance IsString Text where
     fromString = pack
 
--- | @since 1.2.0.0
+-- | Performs replacement on invalid scalar values:
+--
+-- >>> :set -XOverloadedLists
+-- >>> ['\55555'] :: Data.Text.Lazy.Text
+-- "\65533"
+--
+-- @since 1.2.0.0
 instance Exts.IsList Text where
     type Item Text = Char
     fromList       = pack
@@ -372,7 +383,10 @@ textDataType = mkDataType "Data.Text.Lazy.Text" [packConstr]
 
 -- | /O(n)/ Convert a 'String' into a 'Text'.
 --
--- Performs replacement on invalid scalar values.
+-- Performs replacement on invalid scalar values, so @'unpack' . 'pack'@ is not 'id':
+--
+-- >>> Data.Text.Lazy.unpack (Data.Text.Lazy.pack "\55555")
+-- "\65533"
 pack ::
 #if defined(ASSERTS)
   HasCallStack =>
diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs
index 0fcdb7ef..580d15cc 100644
--- a/tests/Tests/Properties/Transcoding.hs
+++ b/tests/Tests/Properties/Transcoding.hs
@@ -8,7 +8,6 @@ module Tests.Properties.Transcoding
 
 import Data.Bits ((.&.), shiftR)
 import Data.Char (chr, ord)
-import Data.Maybe (fromMaybe)
 import Test.QuickCheck hiding ((.&.))
 import Test.Tasty (TestTree, testGroup)
 import Test.Tasty.QuickCheck (testProperty)
@@ -300,50 +299,6 @@ t_decode_utf8_lenient :: Property
 t_decode_utf8_lenient = forAllShrinkShow arbitrary shrink (show . BL.toChunks) $ \bs ->
     decodeLL bs === (TL.fromStrict . decodeL . B.concat . BL.toChunks) bs
 
-decodeStream decoder snoc bs =
-  case decoder bs $ \ t mW b p f ->
-    case mW of
-      Just w ->
-        let t' = (case C.lenientDecode "" $ Just w of
-              Just c -> t `snoc` c
-              _ -> t)
-        in
-        case f mempty of
-          (tDiff, bp) -> (mappend t' `mappend` tDiff, bp)
-      _ -> (mappend t, (b, p)) of
-    (tDiff, (b, p)) -> (tDiff mempty, b, p)
-
-decodeUtf8StreamL = C.chunksDecoderToStream EL.decodeUtf8Chunks
-
-decodeUtf8StreamS = C.chunksDecoderToStream E.decodeUtf8Chunks
-
--- The results of the stream decoder on a strict bytestring should
--- match the results of a non-stream decoder on a lazy bytestring.
-t_decode_utf8_lenient_stream :: Property
-t_decode_utf8_lenient_stream = forAllShrinkShow arbitrary shrink (show . BL.toChunks) $ \bs ->
-  let (st, sb, _) = decodeStream decodeUtf8StreamS T.snoc . B.concat $ BL.toChunks bs
-      st' = B.foldr (\ w st'' -> st'' `T.snoc` (fromMaybe 'a' . C.lenientDecode "" $ Just w)) st sb
-  in
-  decodeLL bs === TL.fromStrict st'
-
--- The results of the non-stream decoder on a strict bytestring should
--- match the results of a stream decoder on a lazy bytestring.
-t_decode_utf8_stream_lenient :: Property
-t_decode_utf8_stream_lenient = forAllShrinkShow arbitrary shrink (show . BL.toChunks) $ \bs ->
-  let (lt, lb, _) = decodeStream decodeUtf8StreamL TL.snoc bs
-      lt' = BL.foldr (\ w lt'' -> lt'' `TL.snoc` (fromMaybe 'a' . C.lenientDecode "" $ Just w)) lt lb
-  in
-  lt' === (TL.fromStrict . decodeL . B.concat . BL.toChunks) bs
-
--- The decoding of lazy bytestrings should not depend on how they are chunked,
--- and it should behave the same as decoding of strict bytestrings.
-t_decode_utf8_stream :: Property
-t_decode_utf8_stream = forAllShrinkShow arbitrary shrink (show . BL.toChunks) $ \bs ->
-    decodeStream decodeUtf8StreamL TL.snoc bs ===
-      ( let (st, sb, pos) = decodeStream decodeUtf8StreamS T.snoc . B.concat $ BL.toChunks bs in
-        (TL.fromStrict st, BL.fromStrict sb, pos)
-      )
-
 -- See http://unicode.org/faq/utf_bom.html#gen8
 -- A sequence such as <110xxxxx2 0xxxxxxx2> is illegal ...
 -- When faced with this illegal byte sequence ... a UTF-8 conformant process
@@ -431,9 +386,6 @@ testTranscoding =
       testProperty "t_chunk_decode_utf32BE" t_chunk_decode_utf32BE,
       testProperty "t_chunk_decode_utf32LE" t_chunk_decode_utf32LE,
       testProperty "t_decode_utf8_lenient" t_decode_utf8_lenient,
-      testProperty "t_decode_utf8_stream_lenient" t_decode_utf8_stream_lenient,
-      testProperty "t_decode_utf8_lenient_stream" t_decode_utf8_lenient_stream,
-      testProperty "t_decode_utf8_stream" t_decode_utf8_stream,
       testProperty "t_decode_with_error2" t_decode_with_error2,
       testProperty "t_decode_with_error3" t_decode_with_error3,
       testProperty "t_decode_with_error4" t_decode_with_error4,

From 0b4bceaf04da164a547f42301766c38be7d4d8bb Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Sat, 23 Jul 2022 13:19:13 -0600
Subject: [PATCH 26/87] Replaced decodeAsciiE with decodeAsciiChunks (lazy and
 strict)

---
 changelog.md                          |   2 +-
 src/Data/Text/Encoding.hs             | 327 +++++++++++++-------------
 src/Data/Text/Encoding/Common.hs      |   2 +-
 src/Data/Text/Lazy/Encoding.hs        |  44 ++--
 tests/Tests/Properties/Transcoding.hs |  56 ++++-
 5 files changed, 240 insertions(+), 191 deletions(-)

diff --git a/changelog.md b/changelog.md
index b0347e05..f198612a 100644
--- a/changelog.md
+++ b/changelog.md
@@ -207,7 +207,7 @@
 * Bumped lower bound on deepseq to 1.4 for compatibility with the
   upcoming GHC 7.10
 
-#### 1.2.0.1
+#### 1.2.0.2
 
 * Fixed a buffer overflow in rendering of large Integers
   (https://github.com/bos/text/issues/99)
diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index af27cc50..799fedcb 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -32,6 +32,7 @@ module Data.Text.Encoding
       decodeLatin1
     , decodeUtf8Lenient
     , DecodeResult(..)
+    , decodeAsciiChunks
     , decodeUtf8Chunks
     , decodeUtf16Chunks
     , decodeUtf32Chunks
@@ -40,7 +41,6 @@ module Data.Text.Encoding
     , decodeUtf8'
 
     -- *** Controllable error handling
-    , decodeAsciiE
     , decodeUtf8With
     , decodeUtf16LEWith
     , decodeUtf16BEWith
@@ -81,7 +81,6 @@ import Data.Bits (Bits, shiftL, shiftR, (.&.), (.|.))
 import Data.ByteString (ByteString)
 import qualified Data.ByteString as B
 import qualified Data.ByteString.Internal as B
-import qualified Data.ByteString.Short.Internal as SBS
 import Data.Text.Encoding.Common (DecodeResult(..), OnDecodeError, UnicodeException, strictDecode, lenientDecode)
 import Data.Text.Internal (Text(..), empty, append)
 import Data.Text.Internal.Unsafe (unsafeWithForeignPtr)
@@ -147,13 +146,11 @@ import Data.Text.Internal.Encoding.Utf8 (CodePoint(..))
 -- anything except ASCII and copies buffer or throws an error otherwise.
 --
 decodeASCII :: ByteString -> Text
-decodeASCII bs = withBS bs $ \fp len -> if len == 0 then empty else runST $ do
-  asciiPrefixLen <- fmap cSizeToInt $ unsafeIOToST $ unsafeWithForeignPtr fp $ \src ->
-    c_is_ascii src (src `plusPtr` len)
-  if asciiPrefixLen == len
-  then let !(SBS.SBS arr) = SBS.toShort bs in
-        return (Text (A.ByteArray arr) 0 len)
-  else error $ "decodeASCII: detected non-ASCII codepoint at " ++ show asciiPrefixLen
+decodeASCII bs = case decodeAsciiChunks bs mempty of
+  DecodeResult t mW _ pos ->
+    case mW of
+      Just _ -> error $ "decodeASCII: detected non-ASCII codepoint at " ++ show (pos - 1)
+      _ -> t
 
 -- | Decode a 'ByteString' containing Latin-1 (aka ISO-8859-1) encoded text.
 --
@@ -171,7 +168,7 @@ decodeLatin1 ::
 decodeLatin1 bs = withBS bs $ \fp len -> runST $ do
   dst <- A.new (2 * len)
   let inner srcOff dstOff = if srcOff >= len then return dstOff else do
-        asciiPrefixLen <- fmap cSizeToInt $ unsafeIOToST $ unsafeWithForeignPtr fp $ \src ->
+        asciiPrefixLen <- fmap fromIntegral . unsafeIOToST . unsafeWithForeignPtr fp $ \src ->
           c_is_ascii (src `plusPtr` srcOff) (src `plusPtr` len)
         if asciiPrefixLen == 0
         then do
@@ -192,45 +189,23 @@ decodeLatin1 bs = withBS bs $ \fp len -> runST $ do
 foreign import ccall unsafe "_hs_text_is_ascii" c_is_ascii
     :: Ptr Word8 -> Ptr Word8 -> IO CSize
 
-isValidBS :: ByteString -> Bool
-#ifdef SIMDUTF
-isValidBS bs = withBS bs $ \fp len -> unsafeDupablePerformIO $
-  unsafeWithForeignPtr fp $ \ptr -> (/= 0) <$> c_is_valid_utf8 ptr (fromIntegral len)
-#else
-#if MIN_VERSION_bytestring(0,11,2)
-isValidBS = B.isValidUtf8
-#else
-isValidBS bs = start 0
-  where
-    start ix
-      | ix >= B.length bs = True
-      | otherwise = case utf8DecodeStart (B.unsafeIndex bs ix) of
-        Accept{} -> start (ix + 1)
-        Reject{} -> False
-        Incomplete st _ -> step (ix + 1) st
-    step ix st
-      | ix >= B.length bs = False
-      -- We do not use decoded code point, so passing a dummy value to save an argument.
-      | otherwise = case utf8DecodeContinue (B.unsafeIndex bs ix) st (CodePoint 0) of
-        Accept{} -> start (ix + 1)
-        Reject{} -> False
-        Incomplete st' _ -> step (ix + 1) st'
-#endif
-#endif
-
 data Progression
   = WriteAndAdvance Char Int
   | NeedMore
   | Invalid
 
-decodeChunks :: (Bits w, Num w, Storable w)
+decodeChunks ::
+#if defined(ASSERTS)
+  HasCallStack =>
+#endif
+  (Bits w, Num w, Storable w)
   => w
-  -> ( ByteString
+  -> ( Int
     -> ByteString
-    -> Int
     -> A.MArray s
     -> Int
     -> Int
+    -> Int
     -> ST s (Maybe (
         (A.MArray s -> Int -> Int -> Int -> ST s (DecodeResult Text ByteString w))
         -> ST s (DecodeResult Text ByteString w))
@@ -240,33 +215,32 @@ decodeChunks :: (Bits w, Num w, Storable w)
   -> ByteString
   -> ByteString
   -> ST s (DecodeResult Text ByteString w)
-decodeChunks w queryOptimization decodeF bs1@(B.length -> len1) bs2@(B.length -> len2)
-  | len2 == 0
-  , len1 > 0 = decodeChunks w queryOptimization decodeF bs2 bs1
-  | otherwise = do
-    marr <- A.new len'
-    outer marr len' 0 0
-    where
-      wordByteSize = sizeOf w
-
-      index :: Int -> Word8
-      index i
-        | i < len1  = B.index bs1 i
-        | otherwise = B.index bs2 $ i - len1
-
-      len :: Int
-      len = len1 + len2
-      len' :: Int
-      len' = (len `div` wordByteSize) + 4
-
-      outer dst dstLen = inner
-        where
-          inner srcOff dstOff
-            -- finished
-            | len - srcOff < 1 = goodSoFar
-            -- shortcut for utf-8
-            | otherwise = do
-              mOuterArgs <- queryOptimization bs1 bs2 srcOff dst dstLen dstOff
+decodeChunks w bulkDecodeF incrementalDecodeF bs1@(B.length -> len1) bs2@(B.length -> len2) = do
+  marr <- A.new len'
+  outer marr len' 0 0
+  where
+    wordByteSize = sizeOf w
+
+    index :: Int -> Word8
+    index i
+      | i < len1  = B.index bs1 i
+      | otherwise = B.index bs2 $ i - len1
+
+    len :: Int
+    len = len1 + len2
+    len' :: Int
+    len' = len `div` wordByteSize + 4
+
+    outer dst dstLen = inner
+      where
+        inner srcOff dstOff
+          | srcOff < len1 = decode 0 bs1
+          | srcOff < len = decode len1 bs2
+          -- finished (for now)
+          | otherwise = goodSoFar
+          where
+            decode bsOff bs = do
+              mOuterArgs <- bulkDecodeF bsOff bs dst dstLen srcOff dstOff
               case mOuterArgs of
                 Just outerArgs -> outerArgs outer
                 _ -> if len - srcOff < wordByteSize
@@ -280,39 +254,37 @@ decodeChunks w queryOptimization decodeF bs1@(B.length -> len1) bs2@(B.length ->
                         dst' <- A.resizeM dst dstLen'
                         outer dst' dstLen' srcOff dstOff
                       else
-                        case decodeF index len srcOff of
+                        case incrementalDecodeF index len srcOff of
                           WriteAndAdvance c srcOff' -> do
                             d <- unsafeWrite dst dstOff c
                             inner srcOff' $ dstOff + d
                           NeedMore -> goodSoFar
                           Invalid -> invalid
-            where
-              contin off res = do
-                A.shrinkM dst dstOff
-                arr <- A.unsafeFreeze dst
-                pure . res (Text arr 0 dstOff) $ if off >= len1
-                  then B.drop (off - len1) bs2
-                  else B.drop off $ bs1 `B.append` bs2
-              goodSoFar =
-                contin srcOff $ \ t bs' ->
-                  DecodeResult t Nothing bs' srcOff
-              invalid =
-                let srcOff' = srcOff + wordByteSize
-                    bytesToWord n word =
-                      if n > 0
-                        then bytesToWord (n - 1) $ (fromIntegral . index $ srcOff + wordByteSize - n) .|. (word `shiftL` 8)
-                        else word
-                in
-                contin srcOff' $ \ t bs' ->
-                  DecodeResult t (Just $ bytesToWord wordByteSize 0) bs' srcOff'
+            contin off res = do
+              A.shrinkM dst dstOff
+              arr <- A.unsafeFreeze dst
+              pure . res (Text arr 0 dstOff) $ if off >= len1
+                then B.drop (off - len1) bs2
+                else B.drop off $ bs1 `B.append` bs2
+            goodSoFar =
+              contin srcOff $ \ t bs' ->
+                DecodeResult t Nothing bs' srcOff
+            invalid =
+              let srcOff' = srcOff + wordByteSize
+                  bytesToWord n word
+                    | n > 0 = bytesToWord (n - 1) $ (fromIntegral . index $ srcOff + wordByteSize - n) .|. (word `shiftL` 8)
+                    | otherwise = word
+              in
+              contin srcOff' $ \ t bs' ->
+                DecodeResult t (Just $ bytesToWord wordByteSize 0) bs' srcOff'
 
 decodeChunksProxy :: (Bits w, Num w, Storable w)
-  => ( ByteString
+  => ( Int
     -> ByteString
-    -> Int
     -> A.MArray s
     -> Int
     -> Int
+    -> Int
     -> ST s (Maybe (
         (A.MArray s -> Int -> Int -> Int -> ST s (DecodeResult Text ByteString w))
         -> ST s (DecodeResult Text ByteString w)
@@ -328,82 +300,140 @@ decodeChunksProxy = decodeChunks undefined -- This allows Haskell to
 -- so that it doesn't have to be passed as an arugment. Storable.sizeOf
 -- discards the actual value without evaluating it.
 
-queryUtf8DecodeOptimization
-  :: ByteString
-  -> ByteString
+bulkCharCopy
+  :: Ptr Word8
   -> Int
   -> A.MArray s
   -> Int
   -> Int
+  -> IO (A.MArray s, Int, Int)
+bulkCharCopy src srcLen dst dstLen dstOff = do
+  let minLen = dstOff + srcLen
+  (dst', dstLen') <-
+    if minLen > dstLen
+      then
+        let newLen = minLen + 4 in
+        (, newLen) <$> (unsafeSTToIO $ A.resizeM dst newLen)
+      else pure (dst, dstLen)
+  unsafeSTToIO $ A.copyFromPointer dst' dstOff src srcLen
+  pure (dst', dstLen', minLen)
+
+-- | Decode two 'ByteString's containing ASCII as though they were one
+-- continuous 'ByteString' returning a 'DecodeResult'.
+--
+-- @since 2.0.2
+decodeAsciiChunks
+  :: ByteString -- ^ The first 'ByteString' chunk to decode. Typically this is the unencoded data from the previous call of this function.
+  -> ByteString -- ^ The second 'ByteString' chunk to decode.
+  -> DecodeResult Text ByteString Word8
+decodeAsciiChunks bs1 bs2 = runST $ decodeChunksProxy (
+  \ bsOff bs dst dstLen srcOff dstOff -> withBS bs $ \ fp len ->
+    unsafeIOToST . unsafeWithForeignPtr fp $ \src -> do
+      let srcOff' = srcOff - bsOff
+      asciiPrefixLen <- fmap fromIntegral . c_is_ascii (src `plusPtr` srcOff') $ src `plusPtr` len
+      if asciiPrefixLen == 0
+        then pure Nothing
+        else do
+          (dst', dstLen', dstOff') <- bulkCharCopy src asciiPrefixLen dst dstLen dstOff
+          pure . Just $ \ f -> f dst' dstLen' (srcOff' + asciiPrefixLen + bsOff) dstOff'
+  ) (\ _ _ _ -> Invalid) bs1 bs2
+
+decodeUtf8Bulk
+  :: Int
+  -> ByteString
+  -> A.MArray s
+  -> Int
+  -> Int
+  -> Int
   -> ST s (Maybe ((A.MArray s -> Int -> Int -> Int -> t) -> t))
-queryUtf8DecodeOptimization (B.length -> len1) bs2@(B.length -> len2) srcOff dst dstLen dstOff
-  | srcOff >= len1
+decodeUtf8Bulk bsOff bs dst dstLen srcOff dstOff
   -- potential valid utf8 content endpoint
-  , utf8End <- len1 + guessUtf8Boundary
+  | utf8End <- bsOff + guessUtf8Boundary bs
+  -- not yet reached the UTF-8 boundary
   , srcOff < utf8End
   -- potential valid utf8 content length
   , utf8Len <- utf8End - srcOff
-  , bs' <- B.drop (srcOff - len1) $ B.take guessUtf8Boundary bs2
-  , isValidBS bs'
-  , minLen <- utf8Len + dstOff = do
-    (dst', dstLen') <-
-      if minLen > dstLen
-        then
-          let newLen = minLen + 4 in
-          (, newLen) <$> A.resizeM dst newLen
-        else pure (dst, dstLen)
-    withBS bs' $ \fp _ -> unsafeIOToST $ unsafeWithForeignPtr fp $ \src ->
-      unsafeSTToIO $ A.copyFromPointer dst' dstOff src utf8Len
-    pure . Just $ \ f -> f dst' dstLen' (srcOff + utf8Len) minLen
+  -- this ByteString chunk's offset
+  , srcOff2 <- srcOff - bsOff
+  -- check for UTF-8 compliance
+  , isValidBS bs srcOff2 utf8Len = do
+    (dst', dstLen', dstOff') <- withBS bs $ \ fp _ ->
+      unsafeIOToST $ unsafeWithForeignPtr fp $ \ src ->
+        bulkCharCopy (src `plusPtr` srcOff2) utf8Len dst dstLen dstOff
+    pure . Just $ \ f -> f dst' dstLen' (srcOff + utf8Len) dstOff'
   | otherwise = pure Nothing
   where
     -- We need Data.ByteString.findIndexEnd, but it is unavailable before bytestring-0.10.12.0
-    guessUtf8Boundary :: Int
-    guessUtf8Boundary
-      | len2 >= 1 && w0 <  0x80 = len2     -- last char is ASCII
-      | len2 >= 1 && w0 >= 0xC0 = len2 - 1 -- last char starts a code point
-      | len2 >= 2 && w1 >= 0xC0 = len2 - 2 -- pre-last char starts a code point
-      | len2 >= 3 && w2 >= 0xC0 = len2 - 3
-      | len2 >= 4 && w3 >= 0xC0 = len2 - 4
-      | otherwise = 0
+    guessUtf8Boundary :: ByteString -> Int
+    guessUtf8Boundary bs'@(B.length -> len)
+      | len >= 1 && w 1 <  0x80 = len     -- last char is ASCII
+      | len >= 1 && w 1 >= 0xc0 = len - 1 -- last char starts a two-(or more-)word code point
+      | len >= 2 && w 2 >= 0xe0 = len - 2 -- pre-last char starts a three- or four-word code point
+      | len >= 3 && w 3 >= 0xf0 = len - 3 -- third to last char starts a four-word code point
+      | otherwise = 0 -- gonna have to resolve this with an incremental approach
       where
-        w0 = B.index bs2 (len2 - 1)
-        w1 = B.index bs2 (len2 - 2)
-        w2 = B.index bs2 (len2 - 3)
-        w3 = B.index bs2 (len2 - 4)
+        w n = B.index bs' (len - n)
+
+    isValidBS :: ByteString -> Int -> Int -> Bool
+    isValidBS bs' off count = if off + count > B.length bs'
+      then False
+      else
+#ifdef SIMDUTF
+        withBS bs' $ \ fp _ -> unsafeDupablePerformIO $
+          unsafeWithForeignPtr fp $ \ ptr -> (/= 0) <$> c_is_valid_utf8 (ptr `plusPtr` off) (fromIntegral count)
+#else
+#if MIN_VERSION_bytestring(0,11,2)
+        B.isValidUtf8 (B.take count $ B.drop off bs')
+#else
+        start off
+      where
+        start ix
+          | ix >= count = True
+          | otherwise = case utf8DecodeStart (B.unsafeIndex bs' ix) of
+            Accept{} -> start (ix + 1)
+            Reject{} -> False
+            Incomplete st _ -> step (ix + 1) st
+        step ix st
+          | ix >= count = False
+          -- We do not use decoded code point, so passing a dummy value to save an argument.
+          | otherwise = case utf8DecodeContinue (B.unsafeIndex bs' ix) st (CodePoint 0) of
+            Accept{} -> start (ix + 1)
+            Reject{} -> False
+            Incomplete st' _ -> step (ix + 1) st'
+#endif
+#endif
 
 -- | Decode two 'ByteString's containing UTF-8-encoded text as though
 -- they were one continuous 'ByteString' returning a 'DecodeResult'.
 --
--- @since 2.0.1
+-- @since 2.0.2
 decodeUtf8Chunks
-  :: ByteString -- ^ The first `ByteString` chunk to decode. Typically this is the unencoded data from the previous call of this function.
-  -> ByteString -- ^ The second `ByteString` chunk to decode.
+  :: ByteString -- ^ The first 'ByteString' chunk to decode. Typically this is the unencoded data from the previous call of this function.
+  -> ByteString -- ^ The second 'ByteString' chunk to decode.
   -> DecodeResult Text ByteString Word8
-decodeUtf8Chunks bs1 bs2 = runST $ decodeChunksProxy queryUtf8DecodeOptimization (\ index len srcOff ->
+decodeUtf8Chunks bs1 bs2 = runST $ decodeChunksProxy decodeUtf8Bulk (\ index len srcOff ->
   let step i (Incomplete a b)
         | i < len = step (i + 1) $ utf8DecodeContinue (index i) a b
       step i st = (st, i)
-      (dr, srcOff') = step (srcOff + 1) . utf8DecodeStart $ index srcOff
   in
-  case dr of
-    Accept c -> WriteAndAdvance c srcOff'
-    Reject -> Invalid
-    Incomplete{} -> NeedMore) bs1 bs2
+  case step (srcOff + 1) . utf8DecodeStart $ index srcOff of
+    (Accept c, srcOff') -> WriteAndAdvance c srcOff'
+    (Reject, _) -> Invalid
+    (Incomplete{}, _) -> NeedMore) bs1 bs2
 
-noOptimization :: Applicative f => p0 -> p1 -> p2 -> p3 -> p4 -> p5 -> f (Maybe a)
-noOptimization _ _ _ _ _ _ = pure Nothing
+noBulkDecodeOp :: Applicative f => p0 -> p1 -> p2 -> p3 -> p4 -> p5 -> f (Maybe a)
+noBulkDecodeOp _ _ _ _ _ _ = pure Nothing
 
 -- | Decode two 'ByteString's containing UTF-16-encoded text as though
 -- they were one continuous 'ByteString' returning a 'DecodeResult'.
 --
--- @since 2.0.1
+-- @since 2.0.2
 decodeUtf16Chunks
-  :: Bool       -- ^ Indicates whether the encoding is big-endian (`True`) or little-endian (`False`)
-  -> ByteString -- ^ The first `ByteString` chunk to decode. Typically this is the unencoded data from the previous call of this function.
-  -> ByteString -- ^ The second `ByteString` chunk to decode.
+  :: Bool       -- ^ Indicates whether the encoding is big-endian ('True') or little-endian ('False')
+  -> ByteString -- ^ The first 'ByteString' chunk to decode. Typically this is the unencoded data from the previous call of this function.
+  -> ByteString -- ^ The second 'ByteString' chunk to decode.
   -> DecodeResult Text ByteString Word16
-decodeUtf16Chunks isBE bs1 bs2 = runST $ decodeChunksProxy noOptimization (\ index len srcOff ->
+decodeUtf16Chunks isBE bs1 bs2 = runST $ decodeChunksProxy noBulkDecodeOp (\ index len srcOff ->
   -- get next Word8 pair
   let writeAndAdvance c n = WriteAndAdvance c $ srcOff + n
       b0 = index $ if isBE then srcOff else srcOff + 1
@@ -427,13 +457,13 @@ decodeUtf16Chunks isBE bs1 bs2 = runST $ decodeChunksProxy noOptimization (\ ind
 -- | Decode two 'ByteString's containing UTF-16-encoded text as though
 -- they were one continuous 'ByteString' returning a 'DecodeResult'.
 --
--- @since 2.0.1
+-- @since 2.0.2
 decodeUtf32Chunks
-  :: Bool       -- ^ Indicates whether the encoding is big-endian (`True`) or little-endian (`False`)
-  -> ByteString -- ^ The first `ByteString` chunk to decode. Typically this is the unencoded data from the previous call of this function.
-  -> ByteString -- ^ The second `ByteString` chunk to decode.
+  :: Bool       -- ^ Indicates whether the encoding is big-endian ('True') or little-endian ('False')
+  -> ByteString -- ^ The first 'ByteString' chunk to decode. Typically this is the unencoded data from the previous call of this function.
+  -> ByteString -- ^ The second 'ByteString' chunk to decode.
   -> DecodeResult Text ByteString Word32
-decodeUtf32Chunks isBE bs1 bs2 = runST $ decodeChunksProxy noOptimization (\ index _ srcOff ->
+decodeUtf32Chunks isBE bs1 bs2 = runST $ decodeChunksProxy noBulkDecodeOp (\ index _ srcOff ->
   -- get next Word8 quartet
   case queryUtf32Bytes (index $ if isBE then srcOff else srcOff + 3)
       (index $ srcOff + (if isBE then 1 else 2))
@@ -442,22 +472,6 @@ decodeUtf32Chunks isBE bs1 bs2 = runST $ decodeChunksProxy noOptimization (\ ind
     Just c -> WriteAndAdvance c $ srcOff + 4
     _ -> Invalid) bs1 bs2
 
--- | Decode a 'ByteString' containing 7-bit ASCII encoded text.
---
--- This is a total function: On success the decoded 'Text' is within a
--- 'Right' value, and an error @('Left' 'Int')@ indicates the position
--- of the offending 'Word8'.
---
--- @since 2.0.1
-decodeAsciiE :: ByteString -> Either Int Text
-decodeAsciiE bs = withBS bs $ \fp len -> if len == 0 then Right empty else runST $ do
-  asciiPrefixLen <- fmap cSizeToInt $ unsafeIOToST $ unsafeWithForeignPtr fp $ \src ->
-    c_is_ascii src (src `plusPtr` len)
-  pure $ if asciiPrefixLen == len
-  then let !(SBS.SBS arr) = SBS.toShort bs in
-        Right (Text (A.ByteArray arr) 0 len)
-  else Left asciiPrefixLen
-
 -- | Decode a 'ByteString' containing UTF-8 encoded text.
 --
 -- Surrogate code points in replacement character returned by 'OnDecodeError'
@@ -783,9 +797,6 @@ encodeUtf32BE :: Text -> ByteString
 encodeUtf32BE txt = E.unstream (E.restreamUtf32BE (F.stream txt))
 {-# INLINE encodeUtf32BE #-}
 
-cSizeToInt :: CSize -> Int
-cSizeToInt = fromIntegral
-
 #ifdef SIMDUTF
 foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8
     :: Ptr Word8 -> CSize -> IO CInt
diff --git a/src/Data/Text/Encoding/Common.hs b/src/Data/Text/Encoding/Common.hs
index 385c6025..0f3bd2db 100644
--- a/src/Data/Text/Encoding/Common.hs
+++ b/src/Data/Text/Encoding/Common.hs
@@ -53,7 +53,7 @@ import Numeric (showHex)
 --      and
 --   4. the byte position of remaining undecoded data.
 --
--- @since 2.0.1
+-- @since 2.0.2
 data DecodeResult t b w = DecodeResult !t !(Maybe w) !b !Int
   deriving (Eq, Ord, Show, Read)
 
diff --git a/src/Data/Text/Lazy/Encoding.hs b/src/Data/Text/Lazy/Encoding.hs
index 14fa6ef8..992654fa 100644
--- a/src/Data/Text/Lazy/Encoding.hs
+++ b/src/Data/Text/Lazy/Encoding.hs
@@ -23,8 +23,8 @@ module Data.Text.Lazy.Encoding
     -- ** Total Functions #total#
     -- $total
       decodeLatin1
-    , decodeAsciiE
     , DecodeResult(..)
+    , decodeAsciiChunks
     , decodeUtf8Chunks
     , decodeUtf16Chunks
     , decodeUtf32Chunks
@@ -109,16 +109,6 @@ decodeASCII = foldr (chunk . TE.decodeASCII) empty . B.toChunks
 decodeLatin1 :: B.ByteString -> Text
 decodeLatin1 = foldr (chunk . TE.decodeLatin1) empty . B.toChunks
 
--- | Decode a 'ByteString' containing 7-bit ASCII encoded text.
---
--- This is a total function: On success the decoded 'Text' is within a
--- 'Right' value, and an error ('Left' 'Int') indicates the postion of
--- the offending 'Word8'.
---
--- @since 2.0.1
-decodeAsciiE :: B.ByteString -> Either Int Text
-decodeAsciiE = foldr (\ lb -> (chunk <$> TE.decodeAsciiE lb <*>)) (pure empty) . B.toChunks
-
 decodeChunks :: (S.ByteString -> S.ByteString -> DecodeResult T.Text S.ByteString w)
   -> B.ByteString
   -> B.ByteString
@@ -135,35 +125,45 @@ decodeChunks decoder = g id 0 mempty
     g tDiff pos sb0 _ (B.Chunk sb1 lb1) = g tDiff pos sb0 (B.Chunk sb1 lb1) mempty
     g tDiff pos sb0 _ _ = DecodeResult (tDiff Empty) Nothing (B.chunk sb0 mempty) pos
 
+-- | Decode two 'ByteString's containing ASCII as though they were one
+-- continuous 'ByteString' returning a 'DecodeResult'.
+--
+-- @since 2.0.2
+decodeAsciiChunks
+  :: B.ByteString -- ^ The first 'B.ByteString' chunk to decode. Typically this is the unencoded data from the previous call of this function.
+  -> B.ByteString -- ^ The second 'B.ByteString' chunk to decode.
+  -> DecodeResult Text B.ByteString Word8
+decodeAsciiChunks = decodeChunks TE.decodeAsciiChunks
+
 -- | Decode two 'B.ByteString's containing UTF-8-encoded text as though
 -- they were one continuous 'B.ByteString' returning a 'DecodeResult'.
 --
--- @since 2.0.1
+-- @since 2.0.2
 decodeUtf8Chunks
-  :: B.ByteString -- ^ The first `B.ByteString` chunk to decode. Typically this is the unencoded data from the previous call of this function.
-  -> B.ByteString -- ^ The second `B.ByteString` chunk to decode.
+  :: B.ByteString -- ^ The first 'B.ByteString' chunk to decode. Typically this is the unencoded data from the previous call of this function.
+  -> B.ByteString -- ^ The second 'B.ByteString' chunk to decode.
   -> DecodeResult Text B.ByteString Word8
 decodeUtf8Chunks = decodeChunks TE.decodeUtf8Chunks
 
 -- | Decode two 'B.ByteString's containing UTF-16-encoded text as though
 -- they were one continuous 'B.ByteString' returning a 'DecodeResult'.
 --
--- @since 2.0.1
+-- @since 2.0.2
 decodeUtf16Chunks
-  :: Bool         -- ^ Indicates whether the encoding is big-endian (`True`) or little-endian (`False`)
-  -> B.ByteString -- ^ The first `B.ByteString` chunk to decode. Typically this is the unencoded data from the previous call of this function.
-  -> B.ByteString -- ^ The second `B.ByteString` chunk to decode.
+  :: Bool         -- ^ Indicates whether the encoding is big-endian ('True') or little-endian ('False')
+  -> B.ByteString -- ^ The first 'B.ByteString' chunk to decode. Typically this is the unencoded data from the previous call of this function.
+  -> B.ByteString -- ^ The second 'B.ByteString' chunk to decode.
   -> DecodeResult Text B.ByteString Word16
 decodeUtf16Chunks = decodeChunks . TE.decodeUtf16Chunks
 
 -- | Decode two 'B.ByteString's containing UTF-32-encoded text as though
 -- they were one continuous 'B.ByteString' returning a 'DecodeResult'.
 --
--- @since 2.0.1
+-- @since 2.0.2
 decodeUtf32Chunks
-  :: Bool         -- ^ Indicates whether the encoding is big-endian (`True`) or little-endian (`False`)
-  -> B.ByteString -- ^ The first `B.ByteString` chunk to decode. Typically this is the unencoded data from the previous call of this function.
-  -> B.ByteString -- ^ The second `B.ByteString` chunk to decode.
+  :: Bool         -- ^ Indicates whether the encoding is big-endian ('True') or little-endian ('False')
+  -> B.ByteString -- ^ The first 'B.ByteString' chunk to decode. Typically this is the unencoded data from the previous call of this function.
+  -> B.ByteString -- ^ The second 'B.ByteString' chunk to decode.
   -> DecodeResult Text B.ByteString Word32
 decodeUtf32Chunks = decodeChunks . TE.decodeUtf32Chunks
 
diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs
index 580d15cc..b61e603c 100644
--- a/tests/Tests/Properties/Transcoding.hs
+++ b/tests/Tests/Properties/Transcoding.hs
@@ -13,6 +13,7 @@ import Test.Tasty (TestTree, testGroup)
 import Test.Tasty.QuickCheck (testProperty)
 import Tests.QuickCheckUtils
 import qualified Control.Exception as Exception
+import qualified Control.Monad.Fix as F
 import qualified Data.Bits as Bits (shiftL, shiftR)
 import qualified Data.ByteString as B
 import qualified Data.ByteString.Builder as B
@@ -27,11 +28,6 @@ import qualified Data.Text.Encoding.Common as C
 import qualified Data.Text.Lazy as TL
 import qualified Data.Text.Lazy.Encoding as EL
 
-t_asciiE t    = E.decodeAsciiE (E.encodeUtf8 a) === Right a
-    where a  = T.map (\c -> chr (ord c `mod` 128)) t
-tl_asciiE t    = EL.decodeAsciiE (EL.encodeUtf8 a) === Right a
-    where a  = TL.map (\c -> chr (ord c `mod` 128)) t
-
 t_ascii t    = E.decodeASCII (E.encodeUtf8 a) === a
     where a  = T.map (\c -> chr (ord c `mod` 128)) t
 tl_ascii t   = EL.decodeASCII (EL.encodeUtf8 a) === a
@@ -211,9 +207,33 @@ genInvalidUTF8 = B.pack <$> oneof [
     ord3_ n = map fromIntegral [(n `shiftR` 12) + 0xE0, ((n `shiftR` 6) .&. 0x3F) + 0x80, (n .&. 0x3F) + 0x80]
     ord4_ n = map fromIntegral [(n `shiftR` 18) + 0xF0, ((n `shiftR` 12) .&. 0x3F) + 0x80, ((n `shiftR` 6) .&. 0x3F) + 0x80, (n .&. 0x3F) + 0x80]
 
+t_chunk_decode_ascii_1 =
+  let bs1 = B.pack [0x68, 0x69, 0x2c, 0x20, 0x83,
+        0x68, 0x65, 0x6c, 0x6c]
+      bs2 = B.pack [0x6f, 0x2c, 0x20, 0x94,
+        0x68, 0x6f, 0x77, 0x20, 0xcc,
+        0x61, 0x72, 0x65, 0x20, 0x79, 0x61, 0x3f]
+      decode = E.decodeAsciiChunks
+      decodeResult0 = decode bs1 mempty
+      expectedBs0 = B.pack [0x68, 0x65, 0x6c, 0x6c]
+  in
+  whenEqProp decodeResult0 (E.DecodeResult "hi, " (Just 0x83) expectedBs0 5) $
+    let decodeResult1 = decode expectedBs0 bs2
+        expectedBs1 = B.pack [0x68, 0x6f, 0x77, 0x20, 0xcc,
+          0x61, 0x72, 0x65, 0x20, 0x79, 0x61, 0x3f]
+    in
+    whenEqProp decodeResult1 (E.DecodeResult "hello, " (Just 0x94) expectedBs1 8) $
+      let decodeResult2 = decode expectedBs1 mempty
+          expectedBs2 = B.pack [0x61, 0x72, 0x65, 0x20, 0x79, 0x61, 0x3f]
+      in
+      whenEqProp decodeResult2 (E.DecodeResult "how " (Just 0xcc) expectedBs2 5) $
+        decode expectedBs2 mempty === E.DecodeResult "are ya?" Nothing mempty 7
+
 t_chunk_decode_utf8_1 =
-  let decodeResult0 = E.decodeUtf8Chunks mempty $ B.pack [0x68, 0x69, 0x2C, 0x20, 0xe2, 0x98, 0x83, 0x21] in
-  decodeResult0 === E.DecodeResult "hi, ☃!" Nothing mempty 8
+  let decodeResult0 = E.decodeUtf8Chunks mempty $ B.pack [0x68, 0x69, 0x2C, 0x20, 0xC4, 0x89, 0x2C, 0x20,
+        0xe2, 0x98, 0x83, 0x2C, 0x20, 0x61, 0x6e, 0x64,
+        0x20, 0xF0, 0x90, 0x90, 0xB7, 0x21] in
+  decodeResult0 === E.DecodeResult "hi, ĉ, ☃, and \x10437!" Nothing mempty 22
 t_chunk_decode_utf8_2 =
   let decode = E.decodeUtf8Chunks mempty
       decodeResult0 = decode $ B.pack [97, 0xC2, 97]
@@ -235,6 +255,24 @@ t_chunk_decode_utf8_3 =
       let decodeResult2 = decode expectedBs1 $ B.pack [0x83, 32, 0xFF] in
       decodeResult2 === E.DecodeResult "☃ " (Just 0xFF) mempty 5
 
+-- test multi-word code points split across bytestring chunks
+t_chunk_decode_utf8_4 =
+  F.fix (\ f bs s ->
+    case s of
+      (input, expected) : s' -> whenEqProp (E.decodeUtf8Chunks bs input) expected $
+        case expected of
+          E.DecodeResult _ _ bs' _ -> f bs' s'
+      _ -> counterexample "" True
+    ) mempty
+    [ (B.pack [0x68, 0x69, 0x2C, 0x20, 0xC4], E.DecodeResult "hi, " Nothing (B.singleton 0xc4) 4)
+    , (B.pack [0x89, 0x2C, 0x20, 0xe2], E.DecodeResult "ĉ, " Nothing (B.singleton 0xe2) 4)
+    , (B.singleton 0x98, E.DecodeResult "" Nothing (B.pack [0xe2, 0x98]) 0)
+    , (B.pack [0x83, 0x2C, 0x20, 0x61, 0x6e, 0x64, 0x20, 0xF0], E.DecodeResult "☃, and " Nothing (B.singleton 0xF0) 9)
+    , (B.singleton 0x90, E.DecodeResult "" Nothing (B.pack [0xF0, 0x90]) 0)
+    , (B.singleton 0x90, E.DecodeResult "" Nothing (B.pack [0xF0, 0x90, 0x90]) 0)
+    , (B.pack [0xB7, 0x21], E.DecodeResult "\x10437!" Nothing mempty 5)
+    ]
+
 t_chunk_decode_utf16BE =
   let decode = E.decodeUtf16Chunks True
       expectedBs0 = B.pack [0]
@@ -335,8 +373,6 @@ t_infix_concat bs1 text bs2 =
 testTranscoding :: TestTree
 testTranscoding =
   testGroup "transcoding" [
-    testProperty "t_asciiE" t_asciiE,
-    testProperty "tl_asciiE" tl_asciiE,
     testProperty "t_ascii" t_ascii,
     testProperty "tl_ascii" tl_ascii,
     testProperty "t_latin1" t_latin1,
@@ -381,6 +417,8 @@ testTranscoding =
       testProperty "t_chunk_decode_utf8_1" t_chunk_decode_utf8_1,
       testProperty "t_chunk_decode_utf8_2" t_chunk_decode_utf8_2,
       testProperty "t_chunk_decode_utf8_3" t_chunk_decode_utf8_3,
+      testProperty "t_chunk_decode_utf8_4" t_chunk_decode_utf8_4,
+      testProperty "t_chunk_decode_ascii_1" t_chunk_decode_ascii_1,
       testProperty "t_chunk_decode_utf16BE" t_chunk_decode_utf16BE,
       testProperty "t_chunk_decode_utf16LE" t_chunk_decode_utf16LE,
       testProperty "t_chunk_decode_utf32BE" t_chunk_decode_utf32BE,

From e1b25698894db4c0f2565b5aee71102860949b4a Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Sat, 23 Jul 2022 13:26:32 -0600
Subject: [PATCH 27/87] minor mistake in calculating inital array length for
 text array in decodeChunks

---
 src/Data/Text/Encoding.hs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index 799fedcb..85c192dc 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -229,7 +229,7 @@ decodeChunks w bulkDecodeF incrementalDecodeF bs1@(B.length -> len1) bs2@(B.leng
     len :: Int
     len = len1 + len2
     len' :: Int
-    len' = len `div` wordByteSize + 4
+    len' = (len `div` wordByteSize) + 4
 
     outer dst dstLen = inner
       where

From c5594dc4f5b02c40c62ab98f98a972e2274b8ebf Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Sat, 23 Jul 2022 16:30:51 -0600
Subject: [PATCH 28/87] Fixed default isValidBS function

---
 src/Data/Text/Encoding.hs | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index 85c192dc..433b3348 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -366,13 +366,19 @@ decodeUtf8Bulk bsOff bs dst dstLen srcOff dstOff
     -- We need Data.ByteString.findIndexEnd, but it is unavailable before bytestring-0.10.12.0
     guessUtf8Boundary :: ByteString -> Int
     guessUtf8Boundary bs'@(B.length -> len)
-      | len >= 1 && w 1 <  0x80 = len     -- last char is ASCII
-      | len >= 1 && w 1 >= 0xc0 = len - 1 -- last char starts a two-(or more-)word code point
-      | len >= 2 && w 2 >= 0xe0 = len - 2 -- pre-last char starts a three- or four-word code point
-      | len >= 3 && w 3 >= 0xf0 = len - 3 -- third to last char starts a four-word code point
-      | otherwise = 0 -- gonna have to resolve this with an incremental approach
+      | len >= 1 && wr 1 < 0x80 -- last char is ASCII
+      , wb 2 0xe0 0xc0          -- last two chars are a two-word code point
+      , wb 3 0xf0 0xe0          -- last three chars are a three-word code point
+      , wb 2 0xf8 0xf0          -- last four chars are a four-word code point
+        = len
+      | w 1 0xc0 = len - 1      -- last char starts a two-(or more-)word code point
+      | w 2 0xe0 = len - 2      -- pre-last char starts a three- or four-word code point
+      | w 3 0xf0 = len - 3      -- third to last char starts a four-word code point
+      | otherwise = 0           -- gonna have to resolve this with an incremental approach
       where
-        w n = B.index bs' (len - n)
+        wr n = B.index bs' (len - n)
+        w n bt = len >= n && wr n >= bt
+        wb n mask bt = len >= n && wr n .&. mask == bt
 
     isValidBS :: ByteString -> Int -> Int -> Bool
     isValidBS bs' off count = if off + count > B.length bs'
@@ -388,13 +394,13 @@ decodeUtf8Bulk bsOff bs dst dstLen srcOff dstOff
         start off
       where
         start ix
-          | ix >= count = True
+          | ix >= off + count = True
           | otherwise = case utf8DecodeStart (B.unsafeIndex bs' ix) of
             Accept{} -> start (ix + 1)
             Reject{} -> False
             Incomplete st _ -> step (ix + 1) st
         step ix st
-          | ix >= count = False
+          | ix >= off + count = False
           -- We do not use decoded code point, so passing a dummy value to save an argument.
           | otherwise = case utf8DecodeContinue (B.unsafeIndex bs' ix) st (CodePoint 0) of
             Accept{} -> start (ix + 1)

From 0fefb3a4470ca1c234783f980462c124ad57f3b9 Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Sat, 23 Jul 2022 19:56:05 -0600
Subject: [PATCH 29/87] Corrected logic for UTF-8 boundary checking

---
 src/Data/Text/Encoding.hs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index 433b3348..91f77509 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -367,9 +367,9 @@ decodeUtf8Bulk bsOff bs dst dstLen srcOff dstOff
     guessUtf8Boundary :: ByteString -> Int
     guessUtf8Boundary bs'@(B.length -> len)
       | len >= 1 && wr 1 < 0x80 -- last char is ASCII
-      , wb 2 0xe0 0xc0          -- last two chars are a two-word code point
-      , wb 3 0xf0 0xe0          -- last three chars are a three-word code point
-      , wb 2 0xf8 0xf0          -- last four chars are a four-word code point
+        || wb 2 0xe0 0xc0       -- last two chars are a two-word code point
+        || wb 3 0xf0 0xe0       -- last three chars are a three-word code point
+        || wb 2 0xf8 0xf0       -- last four chars are a four-word code point
         = len
       | w 1 0xc0 = len - 1      -- last char starts a two-(or more-)word code point
       | w 2 0xe0 = len - 2      -- pre-last char starts a three- or four-word code point

From d09864938ecbac981d6d5e2d59039bc6dd759f69 Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Sat, 23 Jul 2022 20:49:35 -0600
Subject: [PATCH 30/87] Cleaned up decodeChunks tests

---
 tests/Tests/Properties/Transcoding.hs | 219 +++++++++++++-------------
 1 file changed, 106 insertions(+), 113 deletions(-)

diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs
index b61e603c..8aab9ae3 100644
--- a/tests/Tests/Properties/Transcoding.hs
+++ b/tests/Tests/Properties/Transcoding.hs
@@ -13,7 +13,6 @@ import Test.Tasty (TestTree, testGroup)
 import Test.Tasty.QuickCheck (testProperty)
 import Tests.QuickCheckUtils
 import qualified Control.Exception as Exception
-import qualified Control.Monad.Fix as F
 import qualified Data.Bits as Bits (shiftL, shiftR)
 import qualified Data.ByteString as B
 import qualified Data.ByteString.Builder as B
@@ -207,123 +206,117 @@ genInvalidUTF8 = B.pack <$> oneof [
     ord3_ n = map fromIntegral [(n `shiftR` 12) + 0xE0, ((n `shiftR` 6) .&. 0x3F) + 0x80, (n .&. 0x3F) + 0x80]
     ord4_ n = map fromIntegral [(n `shiftR` 18) + 0xF0, ((n `shiftR` 12) .&. 0x3F) + 0x80, ((n `shiftR` 6) .&. 0x3F) + 0x80, (n .&. 0x3F) + 0x80]
 
-t_chunk_decode_ascii_1 =
-  let bs1 = B.pack [0x68, 0x69, 0x2c, 0x20, 0x83,
-        0x68, 0x65, 0x6c, 0x6c]
-      bs2 = B.pack [0x6f, 0x2c, 0x20, 0x94,
+-- test multi-word code points split across bytestring chunks
+chunksTests decodeF insExpectedOuts =
+  f mempty insExpectedOuts
+  where
+    f bs s =
+      case s of
+        (input, expected) : s' -> whenEqProp (decodeF bs input) expected $
+          case expected of
+            E.DecodeResult _ _ bs' _ -> f bs' s'
+        _ -> counterexample "" True
+
+t_chunk_decode_ascii_1 = chunksTests E.decodeAsciiChunks
+  [ ( B.pack [0x68, 0x69, 0x2c, 0x20, 0x83, 0x68, 0x65, 0x6c, 0x6c]
+    , E.DecodeResult "hi, " (Just 0x83) (B.pack [0x68, 0x65, 0x6c, 0x6c]) 5
+    )
+  , ( B.pack [0x6f, 0x2c, 0x20, 0x94,
         0x68, 0x6f, 0x77, 0x20, 0xcc,
         0x61, 0x72, 0x65, 0x20, 0x79, 0x61, 0x3f]
-      decode = E.decodeAsciiChunks
-      decodeResult0 = decode bs1 mempty
-      expectedBs0 = B.pack [0x68, 0x65, 0x6c, 0x6c]
-  in
-  whenEqProp decodeResult0 (E.DecodeResult "hi, " (Just 0x83) expectedBs0 5) $
-    let decodeResult1 = decode expectedBs0 bs2
-        expectedBs1 = B.pack [0x68, 0x6f, 0x77, 0x20, 0xcc,
-          0x61, 0x72, 0x65, 0x20, 0x79, 0x61, 0x3f]
-    in
-    whenEqProp decodeResult1 (E.DecodeResult "hello, " (Just 0x94) expectedBs1 8) $
-      let decodeResult2 = decode expectedBs1 mempty
-          expectedBs2 = B.pack [0x61, 0x72, 0x65, 0x20, 0x79, 0x61, 0x3f]
-      in
-      whenEqProp decodeResult2 (E.DecodeResult "how " (Just 0xcc) expectedBs2 5) $
-        decode expectedBs2 mempty === E.DecodeResult "are ya?" Nothing mempty 7
-
-t_chunk_decode_utf8_1 =
-  let decodeResult0 = E.decodeUtf8Chunks mempty $ B.pack [0x68, 0x69, 0x2C, 0x20, 0xC4, 0x89, 0x2C, 0x20,
-        0xe2, 0x98, 0x83, 0x2C, 0x20, 0x61, 0x6e, 0x64,
-        0x20, 0xF0, 0x90, 0x90, 0xB7, 0x21] in
-  decodeResult0 === E.DecodeResult "hi, ĉ, ☃, and \x10437!" Nothing mempty 22
-t_chunk_decode_utf8_2 =
-  let decode = E.decodeUtf8Chunks mempty
-      decodeResult0 = decode $ B.pack [97, 0xC2, 97]
-      expectedBs0 = B.singleton 97
-  in
-  whenEqProp decodeResult0 (E.DecodeResult (T.singleton 'a') (Just 0xC2) expectedBs0 2) $
-    let decodeResult1 = decode expectedBs0 in
-    decodeResult1 === E.DecodeResult (T.singleton 'a') Nothing mempty 1
-t_chunk_decode_utf8_3 =
-  let decode = E.decodeUtf8Chunks
-      decodeResult0 = decode mempty $ B.pack [104, 105, 32, 0xe2]
-      expectedBs0 = B.singleton 0xe2
-  in -- hi \xe2
-  whenEqProp decodeResult0 (E.DecodeResult "hi " Nothing expectedBs0 3) $
-    let decodeResult1 = decode expectedBs0 $ B.singleton 0x98
-        expectedBs1 = B.pack [0xe2, 0x98]
-    in
-    whenEqProp decodeResult1 (E.DecodeResult "" Nothing expectedBs1 0) $
-      let decodeResult2 = decode expectedBs1 $ B.pack [0x83, 32, 0xFF] in
-      decodeResult2 === E.DecodeResult "☃ " (Just 0xFF) mempty 5
+    , E.DecodeResult "hello, " (Just 0x94) (B.pack [0x68, 0x6f, 0x77, 0x20, 0xcc,
+          0x61, 0x72, 0x65, 0x20, 0x79, 0x61, 0x3f]) 8
+    )
+  , ( mempty
+    , E.DecodeResult "how " (Just 0xcc) (B.pack [0x61, 0x72, 0x65, 0x20, 0x79, 0x61, 0x3f]) 5
+    )
+  , ( mempty
+    , E.DecodeResult "are ya?" Nothing mempty 7
+    )
+  ]
 
+t_chunk_decode_utf8_1 = chunksTests E.decodeUtf8Chunks
+  [ ( B.pack [0x68, 0x69, 0x2C, 0x20, 0xC4, 0x89, 0x2C, 0x20,
+        0xe2, 0x98, 0x83, 0x2C, 0x20, 0x61, 0x6e, 0x64,
+        0x20, 0xF0, 0x90, 0x90, 0xB7, 0x21]
+    , E.DecodeResult "hi, ĉ, ☃, and \x10437!" Nothing mempty 22
+    )
+  ]
+t_chunk_decode_utf8_2 = chunksTests E.decodeUtf8Chunks
+  [ ( B.pack [97, 0xC2, 97]
+    , E.DecodeResult (T.singleton 'a') (Just 0xC2) (B.singleton 97) 2
+    )
+  , ( mempty
+    , E.DecodeResult (T.singleton 'a') Nothing mempty 1
+    )
+  ]
+t_chunk_decode_utf8_3 = chunksTests E.decodeUtf8Chunks
+  [ ( B.pack [104, 105, 32, 0xe2]
+    , E.DecodeResult "hi " Nothing (B.singleton 0xe2) 3
+    )
+  , ( B.singleton 0x98
+    , E.DecodeResult "" Nothing (B.pack [0xe2, 0x98]) 0
+    )
+  , ( B.pack [0x83, 32, 0xFF]
+    , E.DecodeResult "☃ " (Just 0xFF) mempty 5
+    )
+  ]
 -- test multi-word code points split across bytestring chunks
-t_chunk_decode_utf8_4 =
-  F.fix (\ f bs s ->
-    case s of
-      (input, expected) : s' -> whenEqProp (E.decodeUtf8Chunks bs input) expected $
-        case expected of
-          E.DecodeResult _ _ bs' _ -> f bs' s'
-      _ -> counterexample "" True
-    ) mempty
-    [ (B.pack [0x68, 0x69, 0x2C, 0x20, 0xC4], E.DecodeResult "hi, " Nothing (B.singleton 0xc4) 4)
-    , (B.pack [0x89, 0x2C, 0x20, 0xe2], E.DecodeResult "ĉ, " Nothing (B.singleton 0xe2) 4)
-    , (B.singleton 0x98, E.DecodeResult "" Nothing (B.pack [0xe2, 0x98]) 0)
-    , (B.pack [0x83, 0x2C, 0x20, 0x61, 0x6e, 0x64, 0x20, 0xF0], E.DecodeResult "☃, and " Nothing (B.singleton 0xF0) 9)
-    , (B.singleton 0x90, E.DecodeResult "" Nothing (B.pack [0xF0, 0x90]) 0)
-    , (B.singleton 0x90, E.DecodeResult "" Nothing (B.pack [0xF0, 0x90, 0x90]) 0)
-    , (B.pack [0xB7, 0x21], E.DecodeResult "\x10437!" Nothing mempty 5)
-    ]
+t_chunk_decode_utf8_4 = chunksTests E.decodeUtf8Chunks
+  [ (B.pack [0x68, 0x69, 0x2C, 0x20, 0xC4], E.DecodeResult "hi, " Nothing (B.singleton 0xc4) 4)
+  , (B.pack [0x89, 0x2C, 0x20, 0xe2], E.DecodeResult "ĉ, " Nothing (B.singleton 0xe2) 4)
+  , (B.singleton 0x98, E.DecodeResult "" Nothing (B.pack [0xe2, 0x98]) 0)
+  , (B.pack [0x83, 0x2C, 0x20, 0x61, 0x6e, 0x64, 0x20, 0xF0], E.DecodeResult "☃, and " Nothing (B.singleton 0xF0) 9)
+  , (B.singleton 0x90, E.DecodeResult "" Nothing (B.pack [0xF0, 0x90]) 0)
+  , (B.singleton 0x90, E.DecodeResult "" Nothing (B.pack [0xF0, 0x90, 0x90]) 0)
+  , (B.pack [0xB7, 0x21], E.DecodeResult "\x10437!" Nothing mempty 5)
+  ]
 
-t_chunk_decode_utf16BE =
-  let decode = E.decodeUtf16Chunks True
-      expectedBs0 = B.pack [0]
-      decodeResult0 = decode mempty expectedBs0 in
-  whenEqProp decodeResult0 (E.DecodeResult T.empty Nothing expectedBs0 0) $
-    let decodeResult1 = decode expectedBs0 $ B.pack [104, 0, 105, 0, 32, 0xD8, 0x01]
-        expectedBs1 = B.pack [0xD8, 0x01]
-    in
-    whenEqProp decodeResult1 (E.DecodeResult "hi " Nothing expectedBs1 6) $
-      let decodeResult2 = decode expectedBs1 $ B.pack [0xDC, 0x37, 0, 32, 0xDC, 0] in
-      decodeResult2 === E.DecodeResult "\x10437 " (Just 0xDC00) mempty 8
-t_chunk_decode_utf16LE =
-  let decode = E.decodeUtf16Chunks False
-      expectedBs0 = B.pack [104]
-      decodeResult0 = decode mempty expectedBs0 in
-  whenEqProp decodeResult0 (E.DecodeResult T.empty Nothing expectedBs0 0) $
-    let decodeResult1 = decode expectedBs0 $ B.pack [0, 105, 0, 32, 0, 0x01, 0xD8]
-        expectedBs1 = B.pack [0x01, 0xD8]
-    in
-    whenEqProp decodeResult1 (E.DecodeResult "hi " Nothing expectedBs1 6) $
-      let decodeResult2 = decode expectedBs1 $ B.pack [0x37, 0xDC, 32, 0, 0, 0xDC] in
-      decodeResult2 === E.DecodeResult "\x10437 " (Just 0xDC) mempty 8
-
-t_chunk_decode_utf32BE =
-  let decode = E.decodeUtf32Chunks True
-      expBs0 = B.pack [0, 0]
-      decodeResult0 = decode mempty $ B.pack [0, 0, 0, 104, 0, 0, 0, 105, 0, 0]
-  in
-  whenEqProp decodeResult0 (E.DecodeResult "hi" Nothing expBs0 8) $
-    let expBs1 = B.pack [0, 0, 0x26]
-        decodeResult1 = decode expBs0 $ B.pack [0, 32, 0, 0, 0x26]
-    in
-    whenEqProp decodeResult1 (E.DecodeResult " " Nothing expBs1 4) $
-      let expBs2 = mempty
-          decodeResult2 = decode expBs1 $ B.pack [0x03, 0, 0, 0, 32, 0, 0, 0xD8, 0]
-      in
-      decodeResult2 === E.DecodeResult "☃ " (Just 0xD800) expBs2 12
-t_chunk_decode_utf32LE =
-  let decode = E.decodeUtf32Chunks False
-      expBs0 = B.pack [0x20, 0]
-      decodeResult0 = decode mempty $ B.pack [104, 0, 0, 0, 105, 0, 0, 0, 0x20, 0]
-  in
-  whenEqProp decodeResult0 (E.DecodeResult "hi" Nothing expBs0 8) $
-    let expBs1 = B.pack [0x03, 0x26, 0]
-        decodeResult1 = decode expBs0 $ B.pack [0, 0, 0x03, 0x26, 0]
-    in
-    whenEqProp decodeResult1 (E.DecodeResult " " Nothing expBs1 4) $
-      let expBs2 = mempty
-          decodeResult2 = decode expBs1 $ B.pack [0, 32, 0, 0, 0, 0, 0xD8, 0, 0]
-      in
-      decodeResult2 === E.DecodeResult "☃ " (Just 0xD80000) expBs2 12
+t_chunk_decode_utf16BE = chunksTests (E.decodeUtf16Chunks True)
+  [ ( B.pack [0]
+    , E.DecodeResult T.empty Nothing (B.pack [0]) 0
+    )
+  , ( B.pack [104, 0, 105, 0, 32, 0xD8, 0x01]
+    , E.DecodeResult "hi " Nothing (B.pack [0xD8, 0x01]) 6
+    )
+  , ( B.pack [0xDC, 0x37, 0, 32, 0xDC, 0]
+    , E.DecodeResult "\x10437 " (Just 0xDC00) mempty 8
+    )
+  ]
+t_chunk_decode_utf16LE = chunksTests (E.decodeUtf16Chunks False)
+  [ ( B.pack [104]
+    , E.DecodeResult T.empty Nothing (B.pack [104]) 0
+    )
+  , ( B.pack [0, 105, 0, 32, 0, 0x01, 0xD8]
+    , E.DecodeResult "hi " Nothing (B.pack [0x01, 0xD8]) 6
+    )
+  , ( B.pack [0x37, 0xDC, 32, 0, 0, 0xDC]
+    , E.DecodeResult "\x10437 " (Just 0xDC) mempty 8
+    )
+  ]
+
+t_chunk_decode_utf32BE = chunksTests (E.decodeUtf32Chunks True)
+  [ ( B.pack [0, 0, 0, 104, 0, 0, 0, 105, 0, 0]
+    , E.DecodeResult "hi" Nothing (B.pack [0, 0]) 8
+    )
+  , ( B.pack [0, 32, 0, 0, 0x26]
+    , E.DecodeResult " " Nothing (B.pack [0, 0, 0x26]) 4
+    )
+  , ( B.pack [0x03, 0, 0, 0, 32, 0, 0, 0xD8, 0]
+    , E.DecodeResult "☃ " (Just 0xD800) mempty 12
+    )
+  ]
+t_chunk_decode_utf32LE = chunksTests (E.decodeUtf32Chunks False)
+  [ ( B.pack [104, 0, 0, 0, 105, 0, 0, 0, 0x20, 0]
+    , E.DecodeResult "hi" Nothing (B.pack [0x20, 0]) 8
+    )
+  , ( B.pack [0, 0, 0x03, 0x26, 0]
+    , E.DecodeResult " " Nothing (B.pack [0x03, 0x26, 0]) 4
+    )
+  , ( B.pack [0, 32, 0, 0, 0, 0, 0xD8, 0, 0]
+    , E.DecodeResult "☃ " (Just 0xD80000) mempty 12
+    )
+  ]
 
 decodeLL :: BL.ByteString -> TL.Text
 decodeLL = EL.decodeUtf8With C.lenientDecode

From 048327910d8f196a45c609bdbc377b1be4f3252e Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Tue, 26 Jul 2022 16:32:07 -0600
Subject: [PATCH 31/87] Refactored chunk and ASCII decoders

---
 src/Data/Text/Encoding.hs               | 338 +++++++++---------------
 src/Data/Text/Internal/Encoding/Utf8.hs |  28 +-
 src/Data/Text/Lazy/Encoding.hs          |  48 ++--
 tests/Tests/Properties/Transcoding.hs   |  47 ++--
 4 files changed, 190 insertions(+), 271 deletions(-)

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index 91f77509..cbe5fea6 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -31,8 +31,8 @@ module Data.Text.Encoding
     -- $total
       decodeLatin1
     , decodeUtf8Lenient
+    , decodeAsciiPrefix
     , DecodeResult(..)
-    , decodeAsciiChunks
     , decodeUtf8Chunks
     , decodeUtf16Chunks
     , decodeUtf32Chunks
@@ -75,7 +75,8 @@ module Data.Text.Encoding
     ) where
 
 import Control.Exception (evaluate, try)
-import Control.Monad.ST (ST, runST)
+import Control.Monad (when)
+import Control.Monad.ST (runST)
 import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
 import Data.Bits (Bits, shiftL, shiftR, (.&.), (.|.))
 import Data.ByteString (ByteString)
@@ -97,24 +98,18 @@ import qualified Data.ByteString.Builder as B
 import qualified Data.ByteString.Builder.Internal as B hiding (empty, append)
 import qualified Data.ByteString.Builder.Prim as BP
 import qualified Data.ByteString.Builder.Prim.Internal as BP
-import Data.Text.Internal.Encoding.Utf8 (utf8DecodeStart, utf8DecodeContinue, DecoderResult(..))
+import Data.Text.Internal.Encoding.Utf8 (utf8DecodeStart, utf8DecodeContinue, DetectUtf8Result(..))
 import Data.Text.Internal.Encoding.Utf16 (Utf16Result(..), queryUtf16Bytes)
 import Data.Text.Internal.Encoding.Utf32 (queryUtf32Bytes)
 import qualified Data.Text.Array as A
 import qualified Data.Text.Internal.Encoding.Fusion as E
 import qualified Data.Text.Internal.Fusion as F
 import Data.Text.Internal.ByteStringCompat
+import Numeric (showHex)
 #if defined(ASSERTS)
 import GHC.Stack (HasCallStack)
 #endif
 
-#ifdef SIMDUTF
-import Foreign.C.Types (CInt(..))
-#elif !MIN_VERSION_bytestring(0,11,2)
-import qualified Data.ByteString.Unsafe as B
-import Data.Text.Internal.Encoding.Utf8 (CodePoint(..))
-#endif
-
 -- $strict
 --
 -- All of the single-parameter functions for decoding bytestrings
@@ -146,11 +141,10 @@ import Data.Text.Internal.Encoding.Utf8 (CodePoint(..))
 -- anything except ASCII and copies buffer or throws an error otherwise.
 --
 decodeASCII :: ByteString -> Text
-decodeASCII bs = case decodeAsciiChunks bs mempty of
-  DecodeResult t mW _ pos ->
-    case mW of
-      Just _ -> error $ "decodeASCII: detected non-ASCII codepoint at " ++ show (pos - 1)
-      _ -> t
+decodeASCII bs@(B.length -> len) = case decodeAsciiPrefix bs of
+  (t, Nothing) -> t
+  (_, Just (w, (B.length -> len'))) ->
+    error $ "decodeASCII: detected non-ASCII codepoint (\\x" ++ showHex w (") at " ++ show (len - len' - 1))
 
 -- | Decode a 'ByteString' containing Latin-1 (aka ISO-8859-1) encoded text.
 --
@@ -189,6 +183,83 @@ decodeLatin1 bs = withBS bs $ \fp len -> runST $ do
 foreign import ccall unsafe "_hs_text_is_ascii" c_is_ascii
     :: Ptr Word8 -> Ptr Word8 -> IO CSize
 
+-- | Decode a 'ByteString' containing ASCII.
+--
+-- This is a total function. The 'ByteString' is decoded until either
+-- the end is reached or it errors with the first non-ASCII 'Word8' is
+-- encountered. In either case the function will return what 'Text' was
+-- decoded. On error, the non-ASCII 'Word8' is also returned followed
+-- by the rest of the undecoded 'ByteString'.
+--
+-- @since 2.0.2
+decodeAsciiPrefix
+  :: ByteString
+  -> (Text, Maybe (Word8, ByteString))
+decodeAsciiPrefix bs = if B.null bs
+  then (empty, Nothing)
+  else runST $ withBS bs $ \ fp len ->
+    unsafeIOToST . unsafeWithForeignPtr fp $ \src -> do
+      asciiPrefixLen <- fmap fromIntegral . c_is_ascii src $ src `plusPtr` len
+      (, if asciiPrefixLen < len
+        then Just (B.index bs asciiPrefixLen, B.drop (asciiPrefixLen + 1) bs)
+        else Nothing) <$> if asciiPrefixLen == 0
+          then pure empty
+          else unsafeSTToIO $ do
+            dst <- A.new asciiPrefixLen
+            A.copyFromPointer dst 0 src asciiPrefixLen
+            arr <- A.unsafeFreeze dst
+            pure $ Text arr 0 asciiPrefixLen
+
+-- | Decode two 'ByteString's containing UTF-8-encoded text as though
+-- they were one continuous 'ByteString' returning a 'DecodeResult'.
+--
+-- @since 2.0.2
+decodeUtf8Chunks
+  :: ByteString -- ^ The first 'ByteString' chunk to decode. Typically this is the undecoded data from the previous call of this function.
+  -> ByteString -- ^ The second 'ByteString' chunk to decode.
+  -> DecodeResult Text ByteString Word8
+decodeUtf8Chunks bs1@(B.length -> len1) bs2@(B.length -> len2) =
+  let len = len1 + len2 in
+  if len == 0
+    then DecodeResult empty Nothing bs1 0
+    else
+      let index i
+            | i < len1  = B.index bs1 i
+            | otherwise = B.index bs2 $ i - len1
+          step i _ Reject = (Reject, i)
+          step i i' (Incomplete a) =
+            if i' < len
+              then step i (i' + 1) $ utf8DecodeContinue (index i') a
+              else (Incomplete a, i)
+          step _ i' st@_ =
+            if i' < len
+              then step i' (i' + 1) . utf8DecodeStart $ index i'
+              else (st, i')
+          (st', t) = case step 0 1 . utf8DecodeStart $ index 0 of
+            (st, count) ->
+              (st, ) $ if count > 0
+                then runST $ do
+                  dst <- A.new count
+                  let crossesBSBoundary = count > len1
+                      copyBSRange bs count' dst' dstOff' = withBS bs $ \ fp _ ->
+                        unsafeIOToST . unsafeWithForeignPtr fp $ \ src ->
+                          unsafeSTToIO $ A.copyFromPointer dst' dstOff' src count'
+                  copyBSRange bs1 (if crossesBSBoundary then len1 else count) dst 0
+                  when crossesBSBoundary $ copyBSRange bs2 (count - len1) dst len1
+                  arr <- A.unsafeFreeze dst
+                  pure $ Text arr 0 count
+                else empty
+          decodeResult off mErr = DecodeResult t mErr (if off >= len1
+              then B.drop (off - len1) bs2
+              else B.drop off bs1 `B.append` bs2) off
+          tLen = case t of Text _ _ len' -> len'
+      in
+      case st' of
+        Reject ->
+          let tLen' = tLen + 1 in
+          decodeResult tLen' . Just $ index tLen
+        _ -> decodeResult tLen Nothing
+
 data Progression
   = WriteAndAdvance Char Int
   | NeedMore
@@ -200,22 +271,11 @@ decodeChunks ::
 #endif
   (Bits w, Num w, Storable w)
   => w
-  -> ( Int
-    -> ByteString
-    -> A.MArray s
-    -> Int
-    -> Int
-    -> Int
-    -> ST s (Maybe (
-        (A.MArray s -> Int -> Int -> Int -> ST s (DecodeResult Text ByteString w))
-        -> ST s (DecodeResult Text ByteString w))
-      )
-    )
   -> ((Int -> Word8) -> Int -> Int -> Progression)
   -> ByteString
   -> ByteString
-  -> ST s (DecodeResult Text ByteString w)
-decodeChunks w bulkDecodeF incrementalDecodeF bs1@(B.length -> len1) bs2@(B.length -> len2) = do
+  -> DecodeResult Text ByteString w
+decodeChunks w transcodeF bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do
   marr <- A.new len'
   outer marr len' 0 0
   where
@@ -234,212 +294,59 @@ decodeChunks w bulkDecodeF incrementalDecodeF bs1@(B.length -> len1) bs2@(B.leng
     outer dst dstLen = inner
       where
         inner srcOff dstOff
-          | srcOff < len1 = decode 0 bs1
-          | srcOff < len = decode len1 bs2
+          | srcOff < len
+          , len >= srcOff + wordByteSize =
+            if dstOff + 4 > dstLen
+              -- need more space in destination
+              then do
+                let dstLen' = dstLen + 4
+                dst' <- A.resizeM dst dstLen'
+                outer dst' dstLen' srcOff dstOff
+              else
+                case transcodeF index len srcOff of
+                  WriteAndAdvance c srcOff' -> do
+                    d <- unsafeWrite dst dstOff c
+                    inner srcOff' $ dstOff + d
+                  NeedMore -> goodSoFar
+                  Invalid ->
+                    let srcOff' = srcOff + wordByteSize
+                        bytesToWord n word
+                          | n > 0 = bytesToWord (n - 1) $ (fromIntegral . index $ srcOff + wordByteSize - n) .|. (word `shiftL` 8)
+                          | otherwise = word
+                    in
+                    wrapUp srcOff' . Just $ bytesToWord wordByteSize 0
           -- finished (for now)
           | otherwise = goodSoFar
           where
-            decode bsOff bs = do
-              mOuterArgs <- bulkDecodeF bsOff bs dst dstLen srcOff dstOff
-              case mOuterArgs of
-                Just outerArgs -> outerArgs outer
-                _ -> if len - srcOff < wordByteSize
-                  -- incomplete code point
-                  then goodSoFar
-                  else
-                    if dstOff + 4 > dstLen
-                      -- need more space in destination
-                      then do
-                        let dstLen' = dstLen + 4
-                        dst' <- A.resizeM dst dstLen'
-                        outer dst' dstLen' srcOff dstOff
-                      else
-                        case incrementalDecodeF index len srcOff of
-                          WriteAndAdvance c srcOff' -> do
-                            d <- unsafeWrite dst dstOff c
-                            inner srcOff' $ dstOff + d
-                          NeedMore -> goodSoFar
-                          Invalid -> invalid
-            contin off res = do
+            wrapUp off mW = do
               A.shrinkM dst dstOff
               arr <- A.unsafeFreeze dst
-              pure . res (Text arr 0 dstOff) $ if off >= len1
+              pure $ DecodeResult (Text arr 0 dstOff) mW (if off >= len1
                 then B.drop (off - len1) bs2
-                else B.drop off $ bs1 `B.append` bs2
+                else B.drop off $ bs1 `B.append` bs2) off
             goodSoFar =
-              contin srcOff $ \ t bs' ->
-                DecodeResult t Nothing bs' srcOff
-            invalid =
-              let srcOff' = srcOff + wordByteSize
-                  bytesToWord n word
-                    | n > 0 = bytesToWord (n - 1) $ (fromIntegral . index $ srcOff + wordByteSize - n) .|. (word `shiftL` 8)
-                    | otherwise = word
-              in
-              contin srcOff' $ \ t bs' ->
-                DecodeResult t (Just $ bytesToWord wordByteSize 0) bs' srcOff'
+              wrapUp srcOff Nothing
 
 decodeChunksProxy :: (Bits w, Num w, Storable w)
-  => ( Int
-    -> ByteString
-    -> A.MArray s
-    -> Int
-    -> Int
-    -> Int
-    -> ST s (Maybe (
-        (A.MArray s -> Int -> Int -> Int -> ST s (DecodeResult Text ByteString w))
-        -> ST s (DecodeResult Text ByteString w)
-        )
-      )
-    )
-  -> ((Int -> Word8) -> Int -> Int -> Progression)
+  => ((Int -> Word8) -> Int -> Int -> Progression)
   -> ByteString
   -> ByteString
-  -> ST s (DecodeResult Text ByteString w)
+  -> DecodeResult Text ByteString w
 decodeChunksProxy = decodeChunks undefined -- This allows Haskell to
 -- determine the size in bytes of a data type using Storable.sizeOf
 -- so that it doesn't have to be passed as an arugment. Storable.sizeOf
 -- discards the actual value without evaluating it.
 
-bulkCharCopy
-  :: Ptr Word8
-  -> Int
-  -> A.MArray s
-  -> Int
-  -> Int
-  -> IO (A.MArray s, Int, Int)
-bulkCharCopy src srcLen dst dstLen dstOff = do
-  let minLen = dstOff + srcLen
-  (dst', dstLen') <-
-    if minLen > dstLen
-      then
-        let newLen = minLen + 4 in
-        (, newLen) <$> (unsafeSTToIO $ A.resizeM dst newLen)
-      else pure (dst, dstLen)
-  unsafeSTToIO $ A.copyFromPointer dst' dstOff src srcLen
-  pure (dst', dstLen', minLen)
-
--- | Decode two 'ByteString's containing ASCII as though they were one
--- continuous 'ByteString' returning a 'DecodeResult'.
---
--- @since 2.0.2
-decodeAsciiChunks
-  :: ByteString -- ^ The first 'ByteString' chunk to decode. Typically this is the unencoded data from the previous call of this function.
-  -> ByteString -- ^ The second 'ByteString' chunk to decode.
-  -> DecodeResult Text ByteString Word8
-decodeAsciiChunks bs1 bs2 = runST $ decodeChunksProxy (
-  \ bsOff bs dst dstLen srcOff dstOff -> withBS bs $ \ fp len ->
-    unsafeIOToST . unsafeWithForeignPtr fp $ \src -> do
-      let srcOff' = srcOff - bsOff
-      asciiPrefixLen <- fmap fromIntegral . c_is_ascii (src `plusPtr` srcOff') $ src `plusPtr` len
-      if asciiPrefixLen == 0
-        then pure Nothing
-        else do
-          (dst', dstLen', dstOff') <- bulkCharCopy src asciiPrefixLen dst dstLen dstOff
-          pure . Just $ \ f -> f dst' dstLen' (srcOff' + asciiPrefixLen + bsOff) dstOff'
-  ) (\ _ _ _ -> Invalid) bs1 bs2
-
-decodeUtf8Bulk
-  :: Int
-  -> ByteString
-  -> A.MArray s
-  -> Int
-  -> Int
-  -> Int
-  -> ST s (Maybe ((A.MArray s -> Int -> Int -> Int -> t) -> t))
-decodeUtf8Bulk bsOff bs dst dstLen srcOff dstOff
-  -- potential valid utf8 content endpoint
-  | utf8End <- bsOff + guessUtf8Boundary bs
-  -- not yet reached the UTF-8 boundary
-  , srcOff < utf8End
-  -- potential valid utf8 content length
-  , utf8Len <- utf8End - srcOff
-  -- this ByteString chunk's offset
-  , srcOff2 <- srcOff - bsOff
-  -- check for UTF-8 compliance
-  , isValidBS bs srcOff2 utf8Len = do
-    (dst', dstLen', dstOff') <- withBS bs $ \ fp _ ->
-      unsafeIOToST $ unsafeWithForeignPtr fp $ \ src ->
-        bulkCharCopy (src `plusPtr` srcOff2) utf8Len dst dstLen dstOff
-    pure . Just $ \ f -> f dst' dstLen' (srcOff + utf8Len) dstOff'
-  | otherwise = pure Nothing
-  where
-    -- We need Data.ByteString.findIndexEnd, but it is unavailable before bytestring-0.10.12.0
-    guessUtf8Boundary :: ByteString -> Int
-    guessUtf8Boundary bs'@(B.length -> len)
-      | len >= 1 && wr 1 < 0x80 -- last char is ASCII
-        || wb 2 0xe0 0xc0       -- last two chars are a two-word code point
-        || wb 3 0xf0 0xe0       -- last three chars are a three-word code point
-        || wb 2 0xf8 0xf0       -- last four chars are a four-word code point
-        = len
-      | w 1 0xc0 = len - 1      -- last char starts a two-(or more-)word code point
-      | w 2 0xe0 = len - 2      -- pre-last char starts a three- or four-word code point
-      | w 3 0xf0 = len - 3      -- third to last char starts a four-word code point
-      | otherwise = 0           -- gonna have to resolve this with an incremental approach
-      where
-        wr n = B.index bs' (len - n)
-        w n bt = len >= n && wr n >= bt
-        wb n mask bt = len >= n && wr n .&. mask == bt
-
-    isValidBS :: ByteString -> Int -> Int -> Bool
-    isValidBS bs' off count = if off + count > B.length bs'
-      then False
-      else
-#ifdef SIMDUTF
-        withBS bs' $ \ fp _ -> unsafeDupablePerformIO $
-          unsafeWithForeignPtr fp $ \ ptr -> (/= 0) <$> c_is_valid_utf8 (ptr `plusPtr` off) (fromIntegral count)
-#else
-#if MIN_VERSION_bytestring(0,11,2)
-        B.isValidUtf8 (B.take count $ B.drop off bs')
-#else
-        start off
-      where
-        start ix
-          | ix >= off + count = True
-          | otherwise = case utf8DecodeStart (B.unsafeIndex bs' ix) of
-            Accept{} -> start (ix + 1)
-            Reject{} -> False
-            Incomplete st _ -> step (ix + 1) st
-        step ix st
-          | ix >= off + count = False
-          -- We do not use decoded code point, so passing a dummy value to save an argument.
-          | otherwise = case utf8DecodeContinue (B.unsafeIndex bs' ix) st (CodePoint 0) of
-            Accept{} -> start (ix + 1)
-            Reject{} -> False
-            Incomplete st' _ -> step (ix + 1) st'
-#endif
-#endif
-
--- | Decode two 'ByteString's containing UTF-8-encoded text as though
--- they were one continuous 'ByteString' returning a 'DecodeResult'.
---
--- @since 2.0.2
-decodeUtf8Chunks
-  :: ByteString -- ^ The first 'ByteString' chunk to decode. Typically this is the unencoded data from the previous call of this function.
-  -> ByteString -- ^ The second 'ByteString' chunk to decode.
-  -> DecodeResult Text ByteString Word8
-decodeUtf8Chunks bs1 bs2 = runST $ decodeChunksProxy decodeUtf8Bulk (\ index len srcOff ->
-  let step i (Incomplete a b)
-        | i < len = step (i + 1) $ utf8DecodeContinue (index i) a b
-      step i st = (st, i)
-  in
-  case step (srcOff + 1) . utf8DecodeStart $ index srcOff of
-    (Accept c, srcOff') -> WriteAndAdvance c srcOff'
-    (Reject, _) -> Invalid
-    (Incomplete{}, _) -> NeedMore) bs1 bs2
-
-noBulkDecodeOp :: Applicative f => p0 -> p1 -> p2 -> p3 -> p4 -> p5 -> f (Maybe a)
-noBulkDecodeOp _ _ _ _ _ _ = pure Nothing
-
 -- | Decode two 'ByteString's containing UTF-16-encoded text as though
 -- they were one continuous 'ByteString' returning a 'DecodeResult'.
 --
 -- @since 2.0.2
 decodeUtf16Chunks
   :: Bool       -- ^ Indicates whether the encoding is big-endian ('True') or little-endian ('False')
-  -> ByteString -- ^ The first 'ByteString' chunk to decode. Typically this is the unencoded data from the previous call of this function.
+  -> ByteString -- ^ The first 'ByteString' chunk to decode. Typically this is the undecoded data from the previous call of this function.
   -> ByteString -- ^ The second 'ByteString' chunk to decode.
   -> DecodeResult Text ByteString Word16
-decodeUtf16Chunks isBE bs1 bs2 = runST $ decodeChunksProxy noBulkDecodeOp (\ index len srcOff ->
+decodeUtf16Chunks isBE bs1 bs2 = decodeChunksProxy (\ index len srcOff ->
   -- get next Word8 pair
   let writeAndAdvance c n = WriteAndAdvance c $ srcOff + n
       b0 = index $ if isBE then srcOff else srcOff + 1
@@ -466,10 +373,10 @@ decodeUtf16Chunks isBE bs1 bs2 = runST $ decodeChunksProxy noBulkDecodeOp (\ ind
 -- @since 2.0.2
 decodeUtf32Chunks
   :: Bool       -- ^ Indicates whether the encoding is big-endian ('True') or little-endian ('False')
-  -> ByteString -- ^ The first 'ByteString' chunk to decode. Typically this is the unencoded data from the previous call of this function.
+  -> ByteString -- ^ The first 'ByteString' chunk to decode. Typically this is the undecoded data from the previous call of this function.
   -> ByteString -- ^ The second 'ByteString' chunk to decode.
   -> DecodeResult Text ByteString Word32
-decodeUtf32Chunks isBE bs1 bs2 = runST $ decodeChunksProxy noBulkDecodeOp (\ index _ srcOff ->
+decodeUtf32Chunks isBE bs1 bs2 = decodeChunksProxy (\ index _ srcOff ->
   -- get next Word8 quartet
   case queryUtf32Bytes (index $ if isBE then srcOff else srcOff + 3)
       (index $ srcOff + (if isBE then 1 else 2))
@@ -488,7 +395,7 @@ decodeUtf8With ::
 #endif
   OnDecodeError -> ByteString -> Text
 decodeUtf8With onErr bs = case streamDecodeUtf8With onErr bs of
-  Some t unencoded _ -> codePointToInvalid unencoded t
+  Some t undecoded _ -> codePointToInvalid undecoded t
   where
     codePointToInvalid bs' txt =
       case B.uncons bs' of
@@ -802,8 +709,3 @@ encodeUtf32LE txt = E.unstream (E.restreamUtf32LE (F.stream txt))
 encodeUtf32BE :: Text -> ByteString
 encodeUtf32BE txt = E.unstream (E.restreamUtf32BE (F.stream txt))
 {-# INLINE encodeUtf32BE #-}
-
-#ifdef SIMDUTF
-foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8
-    :: Ptr Word8 -> CSize -> IO CInt
-#endif
diff --git a/src/Data/Text/Internal/Encoding/Utf8.hs b/src/Data/Text/Internal/Encoding/Utf8.hs
index bcf6a778..e9cc9e68 100644
--- a/src/Data/Text/Internal/Encoding/Utf8.hs
+++ b/src/Data/Text/Internal/Encoding/Utf8.hs
@@ -34,7 +34,7 @@ module Data.Text.Internal.Encoding.Utf8
     , validate3
     , validate4
     -- * Naive decoding
-    , DecoderResult(..)
+    , DetectUtf8Result(..)
     , DecoderState(..)
     , CodePoint(..)
     , utf8DecodeStart
@@ -46,7 +46,7 @@ import Control.Exception (assert)
 import GHC.Stack (HasCallStack)
 #endif
 import Data.Bits (Bits(..), FiniteBits(..))
-import Data.Char (ord, chr)
+import Data.Char (ord)
 import GHC.Exts
 import GHC.Word (Word8(..))
 
@@ -264,29 +264,27 @@ updateState (ByteClass c) (DecoderState s) = DecoderState (W8# el#)
 newtype CodePoint = CodePoint Int
 
 -- | @since 2.0
-data DecoderResult
-  = Accept !Char
-  | Incomplete !DecoderState !CodePoint
+data DetectUtf8Result
+  = Accept
+  | Incomplete !DecoderState
   | Reject
 
 -- | @since 2.0
-utf8DecodeStart :: Word8 -> DecoderResult
+utf8DecodeStart :: Word8 -> DetectUtf8Result
 utf8DecodeStart !w
-  | st == utf8AcceptState = Accept (chr (word8ToInt w))
+  | st == utf8AcceptState = Accept
   | st == utf8RejectState = Reject
-  | otherwise             = Incomplete st (CodePoint cp)
+  | otherwise             = Incomplete st
   where
-    cl@(ByteClass cl') = byteToClass w
+    cl = byteToClass w
     st = updateState cl utf8AcceptState
-    cp = word8ToInt $ (0xff `unsafeShiftR` word8ToInt cl') .&. w
 
 -- | @since 2.0
-utf8DecodeContinue :: Word8 -> DecoderState -> CodePoint -> DecoderResult
-utf8DecodeContinue !w !st (CodePoint !cp)
-  | st' == utf8AcceptState = Accept (chr cp')
+utf8DecodeContinue :: Word8 -> DecoderState -> DetectUtf8Result
+utf8DecodeContinue !w !st
+  | st' == utf8AcceptState = Accept
   | st' == utf8RejectState = Reject
-  | otherwise              = Incomplete st' (CodePoint cp')
+  | otherwise              = Incomplete st'
   where
     cl  = byteToClass w
     st' = updateState cl st
-    cp' = (cp `shiftL` 6) .|. word8ToInt (w .&. 0x3f)
diff --git a/src/Data/Text/Lazy/Encoding.hs b/src/Data/Text/Lazy/Encoding.hs
index 992654fa..91f36dc8 100644
--- a/src/Data/Text/Lazy/Encoding.hs
+++ b/src/Data/Text/Lazy/Encoding.hs
@@ -23,8 +23,8 @@ module Data.Text.Lazy.Encoding
     -- ** Total Functions #total#
     -- $total
       decodeLatin1
+    , decodeAsciiPrefix
     , DecodeResult(..)
-    , decodeAsciiChunks
     , decodeUtf8Chunks
     , decodeUtf16Chunks
     , decodeUtf32Chunks
@@ -100,15 +100,35 @@ import Data.Text.Unsafe (unsafeDupablePerformIO)
 -- (preferably not at all). See "Data.Text.Lazy.Encoding#g:total" for better
 -- solutions.
 
--- | Decode a 'ByteString' containing 7-bit ASCII
+-- | Decode a 'B.ByteString' containing 7-bit ASCII
 -- encoded text.
 decodeASCII :: B.ByteString -> Text
 decodeASCII = foldr (chunk . TE.decodeASCII) empty . B.toChunks
 
--- | Decode a 'ByteString' containing Latin-1 (aka ISO-8859-1) encoded text.
+-- | Decode a 'B.ByteString' containing Latin-1 (aka ISO-8859-1) encoded text.
 decodeLatin1 :: B.ByteString -> Text
 decodeLatin1 = foldr (chunk . TE.decodeLatin1) empty . B.toChunks
 
+-- | Decode a 'B.ByteString' containing ASCII.
+--
+-- This is a total function. The 'B.ByteString' is decoded until either
+-- the end is reached or it errors with the first non-ASCII 'Word8' is
+-- encountered. In either case the function will return what 'Text' was
+-- decoded. On error, the non-ASCII 'Word8' is also returned followed
+-- by the rest of the undecoded 'B.ByteString'.
+--
+-- @since 2.0.2
+decodeAsciiPrefix
+  :: B.ByteString
+  -> (Text, Maybe (Word8, B.ByteString))
+decodeAsciiPrefix = g id
+  where
+    g tDiff (B.Chunk sb lb) =
+      case TE.decodeAsciiPrefix sb of
+        (t, Nothing) -> g (tDiff . chunk t) lb
+        (t, Just (w, sb')) -> (tDiff $ chunk t Empty, Just (w, B.chunk sb' lb))
+    g tDiff _ = (tDiff Empty, Nothing)
+
 decodeChunks :: (S.ByteString -> S.ByteString -> DecodeResult T.Text S.ByteString w)
   -> B.ByteString
   -> B.ByteString
@@ -125,22 +145,12 @@ decodeChunks decoder = g id 0 mempty
     g tDiff pos sb0 _ (B.Chunk sb1 lb1) = g tDiff pos sb0 (B.Chunk sb1 lb1) mempty
     g tDiff pos sb0 _ _ = DecodeResult (tDiff Empty) Nothing (B.chunk sb0 mempty) pos
 
--- | Decode two 'ByteString's containing ASCII as though they were one
--- continuous 'ByteString' returning a 'DecodeResult'.
---
--- @since 2.0.2
-decodeAsciiChunks
-  :: B.ByteString -- ^ The first 'B.ByteString' chunk to decode. Typically this is the unencoded data from the previous call of this function.
-  -> B.ByteString -- ^ The second 'B.ByteString' chunk to decode.
-  -> DecodeResult Text B.ByteString Word8
-decodeAsciiChunks = decodeChunks TE.decodeAsciiChunks
-
 -- | Decode two 'B.ByteString's containing UTF-8-encoded text as though
 -- they were one continuous 'B.ByteString' returning a 'DecodeResult'.
 --
 -- @since 2.0.2
 decodeUtf8Chunks
-  :: B.ByteString -- ^ The first 'B.ByteString' chunk to decode. Typically this is the unencoded data from the previous call of this function.
+  :: B.ByteString -- ^ The first 'B.ByteString' chunk to decode. Typically this is the undecoded data from the previous call of this function.
   -> B.ByteString -- ^ The second 'B.ByteString' chunk to decode.
   -> DecodeResult Text B.ByteString Word8
 decodeUtf8Chunks = decodeChunks TE.decodeUtf8Chunks
@@ -151,7 +161,7 @@ decodeUtf8Chunks = decodeChunks TE.decodeUtf8Chunks
 -- @since 2.0.2
 decodeUtf16Chunks
   :: Bool         -- ^ Indicates whether the encoding is big-endian ('True') or little-endian ('False')
-  -> B.ByteString -- ^ The first 'B.ByteString' chunk to decode. Typically this is the unencoded data from the previous call of this function.
+  -> B.ByteString -- ^ The first 'B.ByteString' chunk to decode. Typically this is the undecoded data from the previous call of this function.
   -> B.ByteString -- ^ The second 'B.ByteString' chunk to decode.
   -> DecodeResult Text B.ByteString Word16
 decodeUtf16Chunks = decodeChunks . TE.decodeUtf16Chunks
@@ -162,12 +172,12 @@ decodeUtf16Chunks = decodeChunks . TE.decodeUtf16Chunks
 -- @since 2.0.2
 decodeUtf32Chunks
   :: Bool         -- ^ Indicates whether the encoding is big-endian ('True') or little-endian ('False')
-  -> B.ByteString -- ^ The first 'B.ByteString' chunk to decode. Typically this is the unencoded data from the previous call of this function.
+  -> B.ByteString -- ^ The first 'B.ByteString' chunk to decode. Typically this is the undecoded data from the previous call of this function.
   -> B.ByteString -- ^ The second 'B.ByteString' chunk to decode.
   -> DecodeResult Text B.ByteString Word32
 decodeUtf32Chunks = decodeChunks . TE.decodeUtf32Chunks
 
--- | Decode a 'ByteString' containing UTF-8 encoded text.
+-- | Decode a 'B.ByteString' containing UTF-8 encoded text.
 decodeUtf8With :: OnDecodeError -> B.ByteString -> Text
 decodeUtf8With onErr (B.Chunk b0 bs0) =
     case TE.streamDecodeUtf8With onErr b0 of
@@ -188,7 +198,7 @@ decodeUtf8With onErr (B.Chunk b0 bs0) =
     desc = "Data.Text.Lazy.Encoding.decodeUtf8With: Invalid UTF-8 stream"
 decodeUtf8With _ _ = empty
 
--- | Decode a 'ByteString' containing UTF-8 encoded text that is known
+-- | Decode a 'B.ByteString' containing UTF-8 encoded text that is known
 -- to be valid.
 --
 -- If the input contains any invalid UTF-8 data, an exception will be
@@ -199,7 +209,7 @@ decodeUtf8 :: B.ByteString -> Text
 decodeUtf8 = decodeUtf8With strictDecode
 {-# INLINE[0] decodeUtf8 #-}
 
--- | Decode a 'ByteString' containing UTF-8 encoded text..
+-- | Decode a 'B.ByteString' containing UTF-8 encoded text..
 --
 -- If the input contains any invalid UTF-8 data, the relevant
 -- exception will be returned, otherwise the decoded text.
diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs
index 8aab9ae3..d8d0ce58 100644
--- a/tests/Tests/Properties/Transcoding.hs
+++ b/tests/Tests/Properties/Transcoding.hs
@@ -206,6 +206,33 @@ genInvalidUTF8 = B.pack <$> oneof [
     ord3_ n = map fromIntegral [(n `shiftR` 12) + 0xE0, ((n `shiftR` 6) .&. 0x3F) + 0x80, (n .&. 0x3F) + 0x80]
     ord4_ n = map fromIntegral [(n `shiftR` 18) + 0xF0, ((n `shiftR` 12) .&. 0x3F) + 0x80, ((n `shiftR` 6) .&. 0x3F) + 0x80, (n .&. 0x3F) + 0x80]
 
+t_prefix_decode_ascii_1 =
+  let g bs es is =
+        case es of
+          expected : es' -> whenEqProp (E.decodeAsciiPrefix bs) expected $
+            case expected of
+              (_, Just (_, bs')) -> g bs' es' is
+              _ -> f (is, es')
+          _ -> counterexample ("More input than expected: " ++ show is) False
+      f s = case s of
+        (input : is, es) -> g input es is
+        (_, es@(_ : _)) -> counterexample ("More expected output than input: " ++ show es) False
+        _ -> counterexample "" True
+  in
+  f ( [ B.pack [0x68, 0x69, 0x2c, 0x20, 0x83, 0x68, 0x65, 0x6c, 0x6c]
+      , B.pack [0x6f, 0x2c, 0x20, 0x94,
+            0x68, 0x6f, 0x77, 0x20, 0xcc,
+            0x61, 0x72, 0x65, 0x20, 0x79, 0x61, 0x3f]
+      ]
+    , [ ("hi, ", Just (0x83, B.pack [0x68, 0x65, 0x6c, 0x6c]))
+      , ("hell", Nothing)
+      , ("o, ", Just (0x94, B.pack [0x68, 0x6f, 0x77, 0x20, 0xcc,
+              0x61, 0x72, 0x65, 0x20, 0x79, 0x61, 0x3f]))
+      , ("how ", Just (0xcc, B.pack [0x61, 0x72, 0x65, 0x20, 0x79, 0x61, 0x3f]))
+      , ("are ya?", Nothing)
+      ]
+    )
+
 -- test multi-word code points split across bytestring chunks
 chunksTests decodeF insExpectedOuts =
   f mempty insExpectedOuts
@@ -217,24 +244,6 @@ chunksTests decodeF insExpectedOuts =
             E.DecodeResult _ _ bs' _ -> f bs' s'
         _ -> counterexample "" True
 
-t_chunk_decode_ascii_1 = chunksTests E.decodeAsciiChunks
-  [ ( B.pack [0x68, 0x69, 0x2c, 0x20, 0x83, 0x68, 0x65, 0x6c, 0x6c]
-    , E.DecodeResult "hi, " (Just 0x83) (B.pack [0x68, 0x65, 0x6c, 0x6c]) 5
-    )
-  , ( B.pack [0x6f, 0x2c, 0x20, 0x94,
-        0x68, 0x6f, 0x77, 0x20, 0xcc,
-        0x61, 0x72, 0x65, 0x20, 0x79, 0x61, 0x3f]
-    , E.DecodeResult "hello, " (Just 0x94) (B.pack [0x68, 0x6f, 0x77, 0x20, 0xcc,
-          0x61, 0x72, 0x65, 0x20, 0x79, 0x61, 0x3f]) 8
-    )
-  , ( mempty
-    , E.DecodeResult "how " (Just 0xcc) (B.pack [0x61, 0x72, 0x65, 0x20, 0x79, 0x61, 0x3f]) 5
-    )
-  , ( mempty
-    , E.DecodeResult "are ya?" Nothing mempty 7
-    )
-  ]
-
 t_chunk_decode_utf8_1 = chunksTests E.decodeUtf8Chunks
   [ ( B.pack [0x68, 0x69, 0x2C, 0x20, 0xC4, 0x89, 0x2C, 0x20,
         0xe2, 0x98, 0x83, 0x2C, 0x20, 0x61, 0x6e, 0x64,
@@ -411,7 +420,7 @@ testTranscoding =
       testProperty "t_chunk_decode_utf8_2" t_chunk_decode_utf8_2,
       testProperty "t_chunk_decode_utf8_3" t_chunk_decode_utf8_3,
       testProperty "t_chunk_decode_utf8_4" t_chunk_decode_utf8_4,
-      testProperty "t_chunk_decode_ascii_1" t_chunk_decode_ascii_1,
+      testProperty "t_prefix_decode_ascii_1" t_prefix_decode_ascii_1,
       testProperty "t_chunk_decode_utf16BE" t_chunk_decode_utf16BE,
       testProperty "t_chunk_decode_utf16LE" t_chunk_decode_utf16LE,
       testProperty "t_chunk_decode_utf32BE" t_chunk_decode_utf32BE,

From 82ab16a4c05dff0feb93d3338a3398842af938df Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Tue, 26 Jul 2022 16:56:09 -0600
Subject: [PATCH 32/87] version typo

---
 changelog.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/changelog.md b/changelog.md
index f198612a..b0347e05 100644
--- a/changelog.md
+++ b/changelog.md
@@ -207,7 +207,7 @@
 * Bumped lower bound on deepseq to 1.4 for compatibility with the
   upcoming GHC 7.10
 
-#### 1.2.0.2
+#### 1.2.0.1
 
 * Fixed a buffer overflow in rendering of large Integers
   (https://github.com/bos/text/issues/99)

From 0b6e5c583124bce1816a7ac188fd0f80ceedcf84 Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Wed, 27 Jul 2022 16:42:54 -0600
Subject: [PATCH 33/87] rename from 'Decode' to 'Detect'

---
 changelog.md                            | 12 +++++++
 src/Data/Text/Encoding.hs               | 42 ++++++++++++-------------
 src/Data/Text/Internal/Encoding/Utf8.hs | 30 +++++++++---------
 3 files changed, 47 insertions(+), 37 deletions(-)

diff --git a/changelog.md b/changelog.md
index b0347e05..2f529fc6 100644
--- a/changelog.md
+++ b/changelog.md
@@ -1,3 +1,15 @@
+### 2.0.2
+
+* A new suite of total decoders have been added in `Data.Text.Encoding`
+  and `Data.Text.Lazy.Encoding` that allow decoding to be aborted on
+  errors without the need to raise an `error` and `catch` it elsewhere:
+  * `decodeAsciiPrefix`
+  * `decodeUtf8Chunks`
+  * `decodeUtf16Chunks`
+  * `decodeUtf32Chunks`
+
+### 2.0.1
+
 * Improve portability of C and C++ code.
 * [Make `Lift` instance more efficient](https://github.com/haskell/text/pull/413)
 * [Make `toCaseFold` idempotent](https://github.com/haskell/text/pull/402)
diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index cbe5fea6..600ca7b6 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -98,7 +98,7 @@ import qualified Data.ByteString.Builder as B
 import qualified Data.ByteString.Builder.Internal as B hiding (empty, append)
 import qualified Data.ByteString.Builder.Prim as BP
 import qualified Data.ByteString.Builder.Prim.Internal as BP
-import Data.Text.Internal.Encoding.Utf8 (utf8DecodeStart, utf8DecodeContinue, DetectUtf8Result(..))
+import Data.Text.Internal.Encoding.Utf8 (utf8DetectStart, utf8DetectContinue, DetectUtf8Result(..))
 import Data.Text.Internal.Encoding.Utf16 (Utf16Result(..), queryUtf16Bytes)
 import Data.Text.Internal.Encoding.Utf32 (queryUtf32Bytes)
 import qualified Data.Text.Array as A
@@ -161,24 +161,22 @@ decodeLatin1 ::
   ByteString -> Text
 decodeLatin1 bs = withBS bs $ \fp len -> runST $ do
   dst <- A.new (2 * len)
-  let inner srcOff dstOff = if srcOff >= len then return dstOff else do
-        asciiPrefixLen <- fmap fromIntegral . unsafeIOToST . unsafeWithForeignPtr fp $ \src ->
-          c_is_ascii (src `plusPtr` srcOff) (src `plusPtr` len)
-        if asciiPrefixLen == 0
-        then do
-          byte <- unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> peekByteOff src srcOff
-          A.unsafeWrite dst dstOff (0xC0 + (byte `shiftR` 6))
-          A.unsafeWrite dst (dstOff + 1) (0x80 + (byte .&. 0x3F))
-          inner (srcOff + 1) (dstOff + 2)
-        else do
-          unsafeIOToST $ unsafeWithForeignPtr fp $ \src ->
-            unsafeSTToIO $ A.copyFromPointer dst dstOff (src `plusPtr` srcOff) asciiPrefixLen
-          inner (srcOff + asciiPrefixLen) (dstOff + asciiPrefixLen)
-
-  actualLen <- inner 0 0
-  dst' <- A.resizeM dst actualLen
-  arr <- A.unsafeFreeze dst'
-  return $ Text arr 0 actualLen
+  unsafeIOToST . unsafeWithForeignPtr fp $ \ src -> do
+    let inner srcOff dstOff = if srcOff >= len then pure dstOff else do
+          let src' = src `plusPtr` srcOff
+          asciiPrefixLen <- fmap fromIntegral . c_is_ascii src' $ src `plusPtr` len
+          if asciiPrefixLen == 0
+          then do
+            byte <- peekByteOff src srcOff
+            unsafeSTToIO $ A.unsafeWrite dst dstOff (0xC0 + (byte `shiftR` 6)) *>
+              A.unsafeWrite dst (dstOff + 1) (0x80 + (byte .&. 0x3F))
+            inner (srcOff + 1) (dstOff + 2)
+          else do
+            unsafeSTToIO $ A.copyFromPointer dst dstOff src' asciiPrefixLen
+            inner (srcOff + asciiPrefixLen) (dstOff + asciiPrefixLen)
+    actualLen <- inner 0 0
+    arr <- unsafeSTToIO $ A.resizeM dst actualLen >>= A.unsafeFreeze
+    pure $ Text arr 0 actualLen
 
 foreign import ccall unsafe "_hs_text_is_ascii" c_is_ascii
     :: Ptr Word8 -> Ptr Word8 -> IO CSize
@@ -229,13 +227,13 @@ decodeUtf8Chunks bs1@(B.length -> len1) bs2@(B.length -> len2) =
           step i _ Reject = (Reject, i)
           step i i' (Incomplete a) =
             if i' < len
-              then step i (i' + 1) $ utf8DecodeContinue (index i') a
+              then step i (i' + 1) $ utf8DetectContinue (index i') a
               else (Incomplete a, i)
           step _ i' st@_ =
             if i' < len
-              then step i' (i' + 1) . utf8DecodeStart $ index i'
+              then step i' (i' + 1) . utf8DetectStart $ index i'
               else (st, i')
-          (st', t) = case step 0 1 . utf8DecodeStart $ index 0 of
+          (st', t) = case step 0 1 . utf8DetectStart $ index 0 of
             (st, count) ->
               (st, ) $ if count > 0
                 then runST $ do
diff --git a/src/Data/Text/Internal/Encoding/Utf8.hs b/src/Data/Text/Internal/Encoding/Utf8.hs
index e9cc9e68..f5b96fb6 100644
--- a/src/Data/Text/Internal/Encoding/Utf8.hs
+++ b/src/Data/Text/Internal/Encoding/Utf8.hs
@@ -35,10 +35,10 @@ module Data.Text.Internal.Encoding.Utf8
     , validate4
     -- * Naive decoding
     , DetectUtf8Result(..)
-    , DecoderState(..)
+    , DetectState(..)
     , CodePoint(..)
-    , utf8DecodeStart
-    , utf8DecodeContinue
+    , utf8DetectStart
+    , utf8DetectContinue
     ) where
 
 #if defined(ASSERTS)
@@ -243,17 +243,17 @@ byteToClass n = ByteClass (W8# el#)
     table# :: Addr#
     table# = "\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\b\b\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\n\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\EOT\ETX\ETX\v\ACK\ACK\ACK\ENQ\b\b\b\b\b\b\b\b\b\b\b"#
 
-newtype DecoderState = DecoderState Word8
+newtype DetectState = DetectState Word8
   deriving (Eq)
 
-utf8AcceptState :: DecoderState
-utf8AcceptState = DecoderState 0
+utf8AcceptState :: DetectState
+utf8AcceptState = DetectState 0
 
-utf8RejectState :: DecoderState
-utf8RejectState = DecoderState 12
+utf8RejectState :: DetectState
+utf8RejectState = DetectState 12
 
-updateState :: ByteClass -> DecoderState -> DecoderState
-updateState (ByteClass c) (DecoderState s) = DecoderState (W8# el#)
+updateState :: ByteClass -> DetectState -> DetectState
+updateState (ByteClass c) (DetectState s) = DetectState (W8# el#)
   where
     !(I# n#) = word8ToInt (c + s)
     el# = indexWord8OffAddr# table# n#
@@ -266,12 +266,12 @@ newtype CodePoint = CodePoint Int
 -- | @since 2.0
 data DetectUtf8Result
   = Accept
-  | Incomplete !DecoderState
+  | Incomplete !DetectState
   | Reject
 
 -- | @since 2.0
-utf8DecodeStart :: Word8 -> DetectUtf8Result
-utf8DecodeStart !w
+utf8DetectStart :: Word8 -> DetectUtf8Result
+utf8DetectStart !w
   | st == utf8AcceptState = Accept
   | st == utf8RejectState = Reject
   | otherwise             = Incomplete st
@@ -280,8 +280,8 @@ utf8DecodeStart !w
     st = updateState cl utf8AcceptState
 
 -- | @since 2.0
-utf8DecodeContinue :: Word8 -> DecoderState -> DetectUtf8Result
-utf8DecodeContinue !w !st
+utf8DetectContinue :: Word8 -> DetectState -> DetectUtf8Result
+utf8DetectContinue !w !st
   | st' == utf8AcceptState = Accept
   | st' == utf8RejectState = Reject
   | otherwise              = Incomplete st'

From 63790fc26e9a9dbdc645af95f827c7bf361e92bd Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Wed, 27 Jul 2022 16:50:11 -0600
Subject: [PATCH 34/87] Remove unused newtype CodePoint

---
 src/Data/Text/Internal/Encoding/Utf8.hs | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/Data/Text/Internal/Encoding/Utf8.hs b/src/Data/Text/Internal/Encoding/Utf8.hs
index f5b96fb6..15796870 100644
--- a/src/Data/Text/Internal/Encoding/Utf8.hs
+++ b/src/Data/Text/Internal/Encoding/Utf8.hs
@@ -36,7 +36,6 @@ module Data.Text.Internal.Encoding.Utf8
     -- * Naive decoding
     , DetectUtf8Result(..)
     , DetectState(..)
-    , CodePoint(..)
     , utf8DetectStart
     , utf8DetectContinue
     ) where
@@ -261,8 +260,6 @@ updateState (ByteClass c) (DetectState s) = DetectState (W8# el#)
     table# :: Addr#
     table# = "\NUL\f\CAN$<`T\f\f\f0H\f\f\f\f\f\f\f\f\f\f\f\f\f\NUL\f\f\f\f\f\NUL\f\NUL\f\f\f\CAN\f\f\f\f\f\CAN\f\CAN\f\f\f\f\f\f\f\f\f\CAN\f\f\f\f\f\CAN\f\f\f\f\f\f\f\CAN\f\f\f\f\f\f\f\f\f$\f$\f\f\f$\f\f\f\f\f$\f$\f\f\f$\f\f\f\f\f\f\f\f\f\f"#
 
-newtype CodePoint = CodePoint Int
-
 -- | @since 2.0
 data DetectUtf8Result
   = Accept

From f4bf3a7edb1075484d490935aba3576c1cb4b020 Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Wed, 27 Jul 2022 17:19:05 -0600
Subject: [PATCH 35/87] Added missing error message in streamDecodeUtf8With

---
 src/Data/Text/Encoding.hs | 146 +++++++++++++++++++-------------------
 1 file changed, 72 insertions(+), 74 deletions(-)

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index 600ca7b6..3a76664f 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -219,44 +219,42 @@ decodeUtf8Chunks
 decodeUtf8Chunks bs1@(B.length -> len1) bs2@(B.length -> len2) =
   let len = len1 + len2 in
   if len == 0
-    then DecodeResult empty Nothing bs1 0
-    else
-      let index i
-            | i < len1  = B.index bs1 i
-            | otherwise = B.index bs2 $ i - len1
-          step i _ Reject = (Reject, i)
-          step i i' (Incomplete a) =
-            if i' < len
-              then step i (i' + 1) $ utf8DetectContinue (index i') a
-              else (Incomplete a, i)
-          step _ i' st@_ =
-            if i' < len
-              then step i' (i' + 1) . utf8DetectStart $ index i'
-              else (st, i')
-          (st', t) = case step 0 1 . utf8DetectStart $ index 0 of
-            (st, count) ->
-              (st, ) $ if count > 0
-                then runST $ do
-                  dst <- A.new count
-                  let crossesBSBoundary = count > len1
-                      copyBSRange bs count' dst' dstOff' = withBS bs $ \ fp _ ->
-                        unsafeIOToST . unsafeWithForeignPtr fp $ \ src ->
-                          unsafeSTToIO $ A.copyFromPointer dst' dstOff' src count'
-                  copyBSRange bs1 (if crossesBSBoundary then len1 else count) dst 0
-                  when crossesBSBoundary $ copyBSRange bs2 (count - len1) dst len1
-                  arr <- A.unsafeFreeze dst
-                  pure $ Text arr 0 count
-                else empty
-          decodeResult off mErr = DecodeResult t mErr (if off >= len1
-              then B.drop (off - len1) bs2
-              else B.drop off bs1 `B.append` bs2) off
-          tLen = case t of Text _ _ len' -> len'
-      in
-      case st' of
-        Reject ->
-          let tLen' = tLen + 1 in
-          decodeResult tLen' . Just $ index tLen
-        _ -> decodeResult tLen Nothing
+  then DecodeResult empty Nothing bs1 0
+  else
+    let index i
+          | i < len1  = B.index bs1 i
+          | otherwise = B.index bs2 $ i - len1
+        step i _ Reject = (Reject, i)
+        step i i' (Incomplete a)
+          | i' < len = step i (i' + 1) $ utf8DetectContinue (index i') a
+          | otherwise = (Incomplete a, i)
+        step _ i' _
+          | i' < len = step i' (i' + 1) . utf8DetectStart $ index i'
+          | otherwise = (Accept, i')
+        (st', t) = case step 0 1 . utf8DetectStart $ index 0 of
+          (st, count) ->
+            (st, ) $ if count > 0
+              then runST $ do
+                dst <- A.new count
+                let crossesBSBoundary = count > len1
+                    copyBSRange bs count' dst' dstOff' = withBS bs $ \ fp _ ->
+                      unsafeIOToST . unsafeWithForeignPtr fp $ \ src ->
+                        unsafeSTToIO $ A.copyFromPointer dst' dstOff' src count'
+                copyBSRange bs1 (if crossesBSBoundary then len1 else count) dst 0
+                when crossesBSBoundary $ copyBSRange bs2 (count - len1) dst len1
+                arr <- A.unsafeFreeze dst
+                pure $ Text arr 0 count
+              else empty
+        decodeResult off mErr = DecodeResult t mErr (if off >= len1
+            then B.drop (off - len1) bs2
+            else B.drop off bs1 `B.append` bs2) off
+        tLen = case t of Text _ _ len' -> len'
+    in
+    case st' of
+      Reject ->
+        let tLen' = tLen + 1 in
+        decodeResult tLen' . Just $ index tLen
+      _ -> decodeResult tLen Nothing
 
 data Progression
   = WriteAndAdvance Char Int
@@ -295,24 +293,24 @@ decodeChunks w transcodeF bs1@(B.length -> len1) bs2@(B.length -> len2) = runST
           | srcOff < len
           , len >= srcOff + wordByteSize =
             if dstOff + 4 > dstLen
-              -- need more space in destination
-              then do
-                let dstLen' = dstLen + 4
-                dst' <- A.resizeM dst dstLen'
-                outer dst' dstLen' srcOff dstOff
-              else
-                case transcodeF index len srcOff of
-                  WriteAndAdvance c srcOff' -> do
-                    d <- unsafeWrite dst dstOff c
-                    inner srcOff' $ dstOff + d
-                  NeedMore -> goodSoFar
-                  Invalid ->
-                    let srcOff' = srcOff + wordByteSize
-                        bytesToWord n word
-                          | n > 0 = bytesToWord (n - 1) $ (fromIntegral . index $ srcOff + wordByteSize - n) .|. (word `shiftL` 8)
-                          | otherwise = word
-                    in
-                    wrapUp srcOff' . Just $ bytesToWord wordByteSize 0
+            -- need more space in destination
+            then do
+              let dstLen' = dstLen + 4
+              dst' <- A.resizeM dst dstLen'
+              outer dst' dstLen' srcOff dstOff
+            else
+              case transcodeF index len srcOff of
+                WriteAndAdvance c srcOff' -> do
+                  d <- unsafeWrite dst dstOff c
+                  inner srcOff' $ dstOff + d
+                NeedMore -> goodSoFar
+                Invalid ->
+                  let srcOff' = srcOff + wordByteSize
+                      bytesToWord n word
+                        | n > 0 = bytesToWord (n - 1) $ (fromIntegral . index $ srcOff + wordByteSize - n) .|. (word `shiftL` 8)
+                        | otherwise = word
+                  in
+                  wrapUp srcOff' . Just $ bytesToWord wordByteSize 0
           -- finished (for now)
           | otherwise = goodSoFar
           where
@@ -354,15 +352,15 @@ decodeUtf16Chunks isBE bs1 bs2 = decodeChunksProxy (\ index len srcOff ->
     OneWord16 c -> writeAndAdvance c 2
     TwoWord16 g ->
       if len - srcOff < 4
-        -- not enough Word8s to finish the code point
-        then NeedMore
-        else
-          let b2 = index $ srcOff + (if isBE then 2 else 3)
-              b3 = index $ srcOff + (if isBE then 3 else 2)
-          in
-          case g b2 b3 of
-            Just c -> writeAndAdvance c 4
-            _ -> Invalid
+      -- not enough Word8s to finish the code point
+      then NeedMore
+      else
+        let b2 = index $ srcOff + (if isBE then 2 else 3)
+            b3 = index $ srcOff + (if isBE then 3 else 2)
+        in
+        case g b2 b3 of
+          Just c -> writeAndAdvance c 4
+          _ -> Invalid
     _ -> Invalid) bs1 bs2
 
 -- | Decode two 'ByteString's containing UTF-16-encoded text as though
@@ -497,15 +495,15 @@ streamDecodeUtf8With ::
 streamDecodeUtf8With onErr = g empty mempty
   where
     g t bs0 bs1 =
-      let DecodeResult t' mW bs1' _ = decodeUtf8Chunks bs0 bs1
-          txt = t `append` t'
-      in
-      (case (mW :: Maybe Word8) of
-        Just _ ->
-          g (case onErr "" mW of
-            Just c -> txt `append` T.singleton c
-            _ -> txt) mempty
-        _ -> Some txt bs1' . g empty) bs1'
+      case decodeUtf8Chunks bs0 bs1 of
+        DecodeResult t' mW bs1' _ ->
+          case t `append` t' of
+            txt -> (case mW of
+              Just _ ->
+                g (case onErr "Data.Text.Internal.Encoding: Invalid UTF-8 stream" mW of
+                  Just c -> txt `append` T.singleton c
+                  _ -> txt) mempty
+              _ -> Some txt bs1' . g empty) bs1'
 
 -- | Decode a 'ByteString' containing UTF-8 encoded text that is known
 -- to be valid.

From 18edcb2c2301a02bec6bf7c8c2d9745f7e4c64d6 Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Thu, 4 Aug 2022 17:51:28 -0600
Subject: [PATCH 36/87] Reintroduce simdutf

---
 src/Data/Text/Encoding.hs               | 151 ++++++++++++++++++------
 src/Data/Text/Internal/Encoding/Utf8.hs |   3 +-
 2 files changed, 114 insertions(+), 40 deletions(-)

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index 3a76664f..5cd5906f 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -110,6 +110,12 @@ import Numeric (showHex)
 import GHC.Stack (HasCallStack)
 #endif
 
+#ifdef SIMDUTF
+import Foreign.C.Types (CInt(..))
+#elif !MIN_VERSION_bytestring(0,11,2)
+import qualified Data.ByteString.Unsafe as B
+#endif
+
 -- $strict
 --
 -- All of the single-parameter functions for decoding bytestrings
@@ -208,53 +214,120 @@ decodeAsciiPrefix bs = if B.null bs
             arr <- A.unsafeFreeze dst
             pure $ Text arr 0 asciiPrefixLen
 
+#ifdef SIMDUTF
+foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8
+    :: Ptr Word8 -> CSize -> IO CInt
+#endif
+
 -- | Decode two 'ByteString's containing UTF-8-encoded text as though
 -- they were one continuous 'ByteString' returning a 'DecodeResult'.
 --
 -- @since 2.0.2
-decodeUtf8Chunks
-  :: ByteString -- ^ The first 'ByteString' chunk to decode. Typically this is the undecoded data from the previous call of this function.
+decodeUtf8Chunks ::
+#if defined(ASSERTS)
+  HasCallStack =>
+#endif
+     ByteString -- ^ The first 'ByteString' chunk to decode. Typically this is the undecoded data from the previous call of this function.
   -> ByteString -- ^ The second 'ByteString' chunk to decode.
   -> DecodeResult Text ByteString Word8
 decodeUtf8Chunks bs1@(B.length -> len1) bs2@(B.length -> len2) =
-  let len = len1 + len2 in
-  if len == 0
-  then DecodeResult empty Nothing bs1 0
+  let len = len1 + len2
+      index i
+        | i < len1  = B.index bs1 i
+        | otherwise = B.index bs2 $ i - len1
+      guessUtf8Boundary len'
+        | wi 3 0xf0 = len' - 3  -- third to last char starts a four-byte code point
+        | wi 2 0xe0 = len' - 2  -- pre-last char starts a three-or-four-byte code point
+        | wi 1 0xc0 = len' - 1  -- last char starts a two-(or more-)byte code point
+        | wc 4 0xf8 0xf0 ||     -- last four bytes are a four-byte code point
+          wc 3 0xf0 0xe0 ||     -- last three bytes are a three-byte code point
+          wc 2 0xe0 0xc0 ||     -- last two bytes are a two-byte code point
+          w 1 (< 0x80) = len'   -- last char is ASCII
+        | otherwise = 0
+        where
+          w n test = len' >= n && test (index $ len' - n)
+          wc n mask word8 = w n $ (word8 ==) . (mask .&.)
+          wi n word8 = w n (>= word8)
+      bs1Utf8Boundary = guessUtf8Boundary len1
+      bs2Utf8Boundary = guessUtf8Boundary len
+      isValidBS :: Int -> Int -> ByteString -> Bool
+#ifdef SIMDUTF
+      isValidBS off bLen bs = withBS bs $ \ fp _ -> unsafeDupablePerformIO $
+        unsafeWithForeignPtr fp $ \ptr -> (/= 0) <$> c_is_valid_utf8 (ptr `plusPtr` off) (fromIntegral bLen)
+#else
+#if MIN_VERSION_bytestring(0,11,2)
+      isValidBS off bLen = B.isValidUtf8 . B.take bLen . B.drop off
+#else
+      isValidBS off bLen bs = start off
+        where
+          start ix
+            | ix >= off + bLen = True
+            | otherwise = case utf8DetectStart (B.unsafeIndex bs ix) of
+              Accept -> start (ix + 1)
+              Reject -> False
+              Incomplete st -> step (ix + 1) st
+          step ix st
+            | ix >= off + bLen = False
+            | otherwise = case utf8DetectContinue (B.unsafeIndex bs ix) st of
+              Accept -> start (ix + 1)
+              Reject -> False
+              Incomplete st' -> step (ix + 1) st'
+#endif
+#endif
+      bsToText bs count dst dstOff =
+        when (count > 0) . withBS bs $ \ fp _ ->
+          unsafeIOToST . unsafeWithForeignPtr fp $ \ src ->
+            unsafeSTToIO $ A.copyFromPointer dst dstOff src count
+      chunksToText utf8Len
+        | utf8Len > 0 = runST $ do
+          dst <- A.new utf8Len
+          let crossesBsBoundary = utf8Len > len1
+          bsToText bs1 (if crossesBsBoundary then len1 else utf8Len) dst 0
+          when crossesBsBoundary $ bsToText bs2 (utf8Len - len1) dst len1
+          arr <- A.unsafeFreeze dst
+          pure $ Text arr 0 utf8Len
+        | otherwise = empty
+      decodeResult isErr off =
+        let off' = if isErr then off + 1 else off in
+        DecodeResult
+          (chunksToText off)
+          (if isErr then Just $ index off else Nothing)
+          (if off' < len1
+            then B.drop off' bs1 `B.append` bs2
+            else B.drop (off' - len1) bs2)
+          off'
+      countValidUtf8 i _ Reject = decodeResult True i
+      countValidUtf8 i i' (Incomplete a)
+        | i' < len = countValidUtf8 i (i' + 1) $ utf8DetectContinue (index i') a
+        | otherwise = decodeResult False i
+      countValidUtf8 _ i' _
+        | i' < len = countValidUtf8 i' (i' + 1) . utf8DetectStart $ index i'
+        | otherwise = decodeResult False i'
+      wrapUpBs1 off = countValidUtf8 off off Accept
+      wrapUpBs2 off =
+        wrapUpBs1 $ if bs2Utf8Boundary > off && isValidBS (off - len1) (bs2Utf8Boundary - off) bs2
+          then bs2Utf8Boundary
+          else off
+  in
+  if bs1Utf8Boundary > 0
+  then
+    if isValidBS 0 bs1Utf8Boundary bs1
+    then
+      if bs1Utf8Boundary < len1
+      then
+        let checkCodePointAccrossBoundary Reject _ = decodeResult True bs1Utf8Boundary
+            checkCodePointAccrossBoundary (Incomplete a) off = if off < len
+              then checkCodePointAccrossBoundary (utf8DetectContinue (index off) a) $ off + 1
+              else decodeResult False bs1Utf8Boundary
+            checkCodePointAccrossBoundary Accept off = wrapUpBs2 off
+        in
+        checkCodePointAccrossBoundary (utf8DetectStart $ index bs1Utf8Boundary) $ bs1Utf8Boundary + 1
+      else wrapUpBs2 len1
+    else wrapUpBs1 0
   else
-    let index i
-          | i < len1  = B.index bs1 i
-          | otherwise = B.index bs2 $ i - len1
-        step i _ Reject = (Reject, i)
-        step i i' (Incomplete a)
-          | i' < len = step i (i' + 1) $ utf8DetectContinue (index i') a
-          | otherwise = (Incomplete a, i)
-        step _ i' _
-          | i' < len = step i' (i' + 1) . utf8DetectStart $ index i'
-          | otherwise = (Accept, i')
-        (st', t) = case step 0 1 . utf8DetectStart $ index 0 of
-          (st, count) ->
-            (st, ) $ if count > 0
-              then runST $ do
-                dst <- A.new count
-                let crossesBSBoundary = count > len1
-                    copyBSRange bs count' dst' dstOff' = withBS bs $ \ fp _ ->
-                      unsafeIOToST . unsafeWithForeignPtr fp $ \ src ->
-                        unsafeSTToIO $ A.copyFromPointer dst' dstOff' src count'
-                copyBSRange bs1 (if crossesBSBoundary then len1 else count) dst 0
-                when crossesBSBoundary $ copyBSRange bs2 (count - len1) dst len1
-                arr <- A.unsafeFreeze dst
-                pure $ Text arr 0 count
-              else empty
-        decodeResult off mErr = DecodeResult t mErr (if off >= len1
-            then B.drop (off - len1) bs2
-            else B.drop off bs1 `B.append` bs2) off
-        tLen = case t of Text _ _ len' -> len'
-    in
-    case st' of
-      Reject ->
-        let tLen' = tLen + 1 in
-        decodeResult tLen' . Just $ index tLen
-      _ -> decodeResult tLen Nothing
+    (if len1 > 0
+    then wrapUpBs1
+    else wrapUpBs2) 0
 
 data Progression
   = WriteAndAdvance Char Int
diff --git a/src/Data/Text/Internal/Encoding/Utf8.hs b/src/Data/Text/Internal/Encoding/Utf8.hs
index 15796870..17ee7cfb 100644
--- a/src/Data/Text/Internal/Encoding/Utf8.hs
+++ b/src/Data/Text/Internal/Encoding/Utf8.hs
@@ -243,7 +243,7 @@ byteToClass n = ByteClass (W8# el#)
     table# = "\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\b\b\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\n\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\EOT\ETX\ETX\v\ACK\ACK\ACK\ENQ\b\b\b\b\b\b\b\b\b\b\b"#
 
 newtype DetectState = DetectState Word8
-  deriving (Eq)
+  deriving (Eq, Ord, Show, Read)
 
 utf8AcceptState :: DetectState
 utf8AcceptState = DetectState 0
@@ -265,6 +265,7 @@ data DetectUtf8Result
   = Accept
   | Incomplete !DetectState
   | Reject
+  deriving (Eq, Ord, Show, Read)
 
 -- | @since 2.0
 utf8DetectStart :: Word8 -> DetectUtf8Result

From d69964fddc2f21957d9b5b5571c2e103b9f25065 Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Mon, 8 Aug 2022 20:59:52 -0600
Subject: [PATCH 37/87] Realized the otherwise scenario in guessUtf8Boundary
 was acting like a magic number and obfuscating genuine 0 values

---
 src/Data/Text/Encoding.hs | 62 +++++++++++++++++++++------------------
 1 file changed, 33 insertions(+), 29 deletions(-)

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index 5cd5906f..c4878b5c 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -236,14 +236,14 @@ decodeUtf8Chunks bs1@(B.length -> len1) bs2@(B.length -> len2) =
         | i < len1  = B.index bs1 i
         | otherwise = B.index bs2 $ i - len1
       guessUtf8Boundary len'
-        | wi 3 0xf0 = len' - 3  -- third to last char starts a four-byte code point
-        | wi 2 0xe0 = len' - 2  -- pre-last char starts a three-or-four-byte code point
-        | wi 1 0xc0 = len' - 1  -- last char starts a two-(or more-)byte code point
-        | wc 4 0xf8 0xf0 ||     -- last four bytes are a four-byte code point
-          wc 3 0xf0 0xe0 ||     -- last three bytes are a three-byte code point
-          wc 2 0xe0 0xc0 ||     -- last two bytes are a two-byte code point
-          w 1 (< 0x80) = len'   -- last char is ASCII
-        | otherwise = 0
+        | wi 3 0xf0 = Just $ len' - 3 -- third to last char starts a four-byte code point
+        | wi 2 0xe0 = Just $ len' - 2 -- pre-last char starts a three-or-four-byte code point
+        | wi 1 0xc0 = Just $ len' - 1 -- last char starts a two-(or more-)byte code point
+        | wc 4 0xf8 0xf0 ||           -- last four bytes are a four-byte code point
+          wc 3 0xf0 0xe0 ||           -- last three bytes are a three-byte code point
+          wc 2 0xe0 0xc0 ||           -- last two bytes are a two-byte code point
+          w 1 (< 0x80) = Just $ len'  -- last char is ASCII
+        | otherwise = Nothing         -- no clue
         where
           w n test = len' >= n && test (index $ len' - n)
           wc n mask word8 = w n $ (word8 ==) . (mask .&.)
@@ -254,8 +254,7 @@ decodeUtf8Chunks bs1@(B.length -> len1) bs2@(B.length -> len2) =
 #ifdef SIMDUTF
       isValidBS off bLen bs = withBS bs $ \ fp _ -> unsafeDupablePerformIO $
         unsafeWithForeignPtr fp $ \ptr -> (/= 0) <$> c_is_valid_utf8 (ptr `plusPtr` off) (fromIntegral bLen)
-#else
-#if MIN_VERSION_bytestring(0,11,2)
+#elif MIN_VERSION_bytestring(0,11,2)
       isValidBS off bLen = B.isValidUtf8 . B.take bLen . B.drop off
 #else
       isValidBS off bLen bs = start off
@@ -272,7 +271,6 @@ decodeUtf8Chunks bs1@(B.length -> len1) bs2@(B.length -> len2) =
               Accept -> start (ix + 1)
               Reject -> False
               Incomplete st' -> step (ix + 1) st'
-#endif
 #endif
       bsToText bs count dst dstOff =
         when (count > 0) . withBS bs $ \ fp _ ->
@@ -304,27 +302,33 @@ decodeUtf8Chunks bs1@(B.length -> len1) bs2@(B.length -> len2) =
         | i' < len = countValidUtf8 i' (i' + 1) . utf8DetectStart $ index i'
         | otherwise = decodeResult False i'
       wrapUpBs1 off = countValidUtf8 off off Accept
-      wrapUpBs2 off =
-        wrapUpBs1 $ if bs2Utf8Boundary > off && isValidBS (off - len1) (bs2Utf8Boundary - off) bs2
-          then bs2Utf8Boundary
+      wrapUpBs2 off = wrapUpBs1 $
+        case bs2Utf8Boundary of
+        Just n -> if n > off && isValidBS (off - len1) (n - off) bs2
+          then n
           else off
+        _ -> off
   in
-  if bs1Utf8Boundary > 0
-  then
-    if isValidBS 0 bs1Utf8Boundary bs1
+  case bs1Utf8Boundary of
+  Just n ->
+    let checkCodePointAccrossBoundary Reject _ = decodeResult True n
+        checkCodePointAccrossBoundary (Incomplete a) off
+          | off < len =
+            checkCodePointAccrossBoundary (utf8DetectContinue (index off) a) $ off + 1
+          | otherwise = decodeResult False n
+        checkCodePointAccrossBoundary Accept off = wrapUpBs2 off
+        spanChunks
+          | n < len1 =
+            checkCodePointAccrossBoundary (utf8DetectStart $ index n) $ n + 1
+          | otherwise = wrapUpBs2 len1
+    in
+    if n > 0
     then
-      if bs1Utf8Boundary < len1
-      then
-        let checkCodePointAccrossBoundary Reject _ = decodeResult True bs1Utf8Boundary
-            checkCodePointAccrossBoundary (Incomplete a) off = if off < len
-              then checkCodePointAccrossBoundary (utf8DetectContinue (index off) a) $ off + 1
-              else decodeResult False bs1Utf8Boundary
-            checkCodePointAccrossBoundary Accept off = wrapUpBs2 off
-        in
-        checkCodePointAccrossBoundary (utf8DetectStart $ index bs1Utf8Boundary) $ bs1Utf8Boundary + 1
-      else wrapUpBs2 len1
-    else wrapUpBs1 0
-  else
+      if isValidBS 0 n bs1
+      then spanChunks
+      else wrapUpBs1 0
+    else spanChunks
+  _ ->
     (if len1 > 0
     then wrapUpBs1
     else wrapUpBs2) 0

From 0db3c8abc5400b7999dbaedaade31b501d2a01c3 Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Mon, 8 Aug 2022 22:09:02 -0600
Subject: [PATCH 38/87] A little clean up.

---
 src/Data/Text/Encoding.hs | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index c4878b5c..b2e46478 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -242,14 +242,12 @@ decodeUtf8Chunks bs1@(B.length -> len1) bs2@(B.length -> len2) =
         | wc 4 0xf8 0xf0 ||           -- last four bytes are a four-byte code point
           wc 3 0xf0 0xe0 ||           -- last three bytes are a three-byte code point
           wc 2 0xe0 0xc0 ||           -- last two bytes are a two-byte code point
-          w 1 (< 0x80) = Just $ len'  -- last char is ASCII
+          w 1 (< 0x80) = Just len'    -- last char is ASCII
         | otherwise = Nothing         -- no clue
         where
           w n test = len' >= n && test (index $ len' - n)
           wc n mask word8 = w n $ (word8 ==) . (mask .&.)
           wi n word8 = w n (>= word8)
-      bs1Utf8Boundary = guessUtf8Boundary len1
-      bs2Utf8Boundary = guessUtf8Boundary len
       isValidBS :: Int -> Int -> ByteString -> Bool
 #ifdef SIMDUTF
       isValidBS off bLen bs = withBS bs $ \ fp _ -> unsafeDupablePerformIO $
@@ -303,13 +301,13 @@ decodeUtf8Chunks bs1@(B.length -> len1) bs2@(B.length -> len2) =
         | otherwise = decodeResult False i'
       wrapUpBs1 off = countValidUtf8 off off Accept
       wrapUpBs2 off = wrapUpBs1 $
-        case bs2Utf8Boundary of
+        case guessUtf8Boundary len of
         Just n -> if n > off && isValidBS (off - len1) (n - off) bs2
           then n
           else off
         _ -> off
   in
-  case bs1Utf8Boundary of
+  case guessUtf8Boundary len1 of
   Just n ->
     let checkCodePointAccrossBoundary Reject _ = decodeResult True n
         checkCodePointAccrossBoundary (Incomplete a) off
@@ -338,11 +336,7 @@ data Progression
   | NeedMore
   | Invalid
 
-decodeChunks ::
-#if defined(ASSERTS)
-  HasCallStack =>
-#endif
-  (Bits w, Num w, Storable w)
+decodeChunks :: (Bits w, Num w, Storable w)
   => w
   -> ((Int -> Word8) -> Int -> Int -> Progression)
   -> ByteString
@@ -414,8 +408,11 @@ decodeChunksProxy = decodeChunks undefined -- This allows Haskell to
 -- they were one continuous 'ByteString' returning a 'DecodeResult'.
 --
 -- @since 2.0.2
-decodeUtf16Chunks
-  :: Bool       -- ^ Indicates whether the encoding is big-endian ('True') or little-endian ('False')
+decodeUtf16Chunks ::
+#if defined(ASSERTS)
+  HasCallStack =>
+#endif
+  Bool          -- ^ Indicates whether the encoding is big-endian ('True') or little-endian ('False')
   -> ByteString -- ^ The first 'ByteString' chunk to decode. Typically this is the undecoded data from the previous call of this function.
   -> ByteString -- ^ The second 'ByteString' chunk to decode.
   -> DecodeResult Text ByteString Word16
@@ -444,8 +441,11 @@ decodeUtf16Chunks isBE bs1 bs2 = decodeChunksProxy (\ index len srcOff ->
 -- they were one continuous 'ByteString' returning a 'DecodeResult'.
 --
 -- @since 2.0.2
-decodeUtf32Chunks
-  :: Bool       -- ^ Indicates whether the encoding is big-endian ('True') or little-endian ('False')
+decodeUtf32Chunks ::
+#if defined(ASSERTS)
+  HasCallStack =>
+#endif
+  Bool          -- ^ Indicates whether the encoding is big-endian ('True') or little-endian ('False')
   -> ByteString -- ^ The first 'ByteString' chunk to decode. Typically this is the undecoded data from the previous call of this function.
   -> ByteString -- ^ The second 'ByteString' chunk to decode.
   -> DecodeResult Text ByteString Word32

From c48ea341cfd03ee562a0668d4a28f91bed16f89a Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Wed, 10 Aug 2022 12:44:51 -0600
Subject: [PATCH 39/87] move test utility function whenEqProp

---
 tests/Tests/Properties/Transcoding.hs | 4 ----
 tests/Tests/QuickCheckUtils.hs        | 8 ++++++++
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs
index d8d0ce58..bf4d6589 100644
--- a/tests/Tests/Properties/Transcoding.hs
+++ b/tests/Tests/Properties/Transcoding.hs
@@ -35,10 +35,6 @@ tl_ascii t   = EL.decodeASCII (EL.encodeUtf8 a) === a
 t_latin1     = E.decodeLatin1 `eq` (T.pack . BC.unpack)
 tl_latin1    = EL.decodeLatin1 `eq` (TL.pack . BLC.unpack)
 
-whenEqProp a b next = if a == b
-  then next
-  else a === b
-
 t_utf8       = (E.decodeUtf8 . E.encodeUtf8) `eq` id
 t_utf8'      = (E.decodeUtf8' . E.encodeUtf8) `eq` (id . Right)
 t_utf8_c     = (\ t ->
diff --git a/tests/Tests/QuickCheckUtils.hs b/tests/Tests/QuickCheckUtils.hs
index a6d72529..a09b65b1 100644
--- a/tests/Tests/QuickCheckUtils.hs
+++ b/tests/Tests/QuickCheckUtils.hs
@@ -27,6 +27,8 @@ module Tests.QuickCheckUtils
     , eqPSqrt
 
     , write_read
+
+    , whenEqProp
     ) where
 
 import Control.Arrow ((***))
@@ -284,3 +286,9 @@ newtype SkewedBool = Skewed { getSkewed :: Bool }
 
 instance Arbitrary SkewedBool where
   arbitrary = Skewed <$> frequency [(1, pure False), (5, pure True)]
+
+-- like Control.Monad.when, but with properties instead of monad values
+whenEqProp :: (Eq a, Show a) => a -> a -> Property -> Property
+whenEqProp a b next = if a == b
+  then next
+  else a === b

From 1c5f61adadc5fb4e438995b9bdf23be36e146a6f Mon Sep 17 00:00:00 2001
From: Li-yao Xia <lysxia@gmail.com>
Date: Sun, 28 Aug 2022 17:56:34 +0100
Subject: [PATCH 40/87] Refactor decodeAsciiPrefix

---
 src/Data/Text/Encoding.hs | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index b2e46478..b67daf3f 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -201,18 +201,20 @@ decodeAsciiPrefix
   -> (Text, Maybe (Word8, ByteString))
 decodeAsciiPrefix bs = if B.null bs
   then (empty, Nothing)
-  else runST $ withBS bs $ \ fp len ->
-    unsafeIOToST . unsafeWithForeignPtr fp $ \src -> do
+  else unsafeDupablePerformIO $ withBS bs $ \ fp len ->
+    unsafeWithForeignPtr fp $ \src -> do
       asciiPrefixLen <- fmap fromIntegral . c_is_ascii src $ src `plusPtr` len
-      (, if asciiPrefixLen < len
-        then Just (B.index bs asciiPrefixLen, B.drop (asciiPrefixLen + 1) bs)
-        else Nothing) <$> if asciiPrefixLen == 0
-          then pure empty
-          else unsafeSTToIO $ do
-            dst <- A.new asciiPrefixLen
-            A.copyFromPointer dst 0 src asciiPrefixLen
-            arr <- A.unsafeFreeze dst
-            pure $ Text arr 0 asciiPrefixLen
+      let !prefix = if asciiPrefixLen == 0
+            then empty
+            else runST $ do
+              dst <- A.new asciiPrefixLen
+              A.copyFromPointer dst 0 src asciiPrefixLen
+              arr <- A.unsafeFreeze dst
+              pure $ Text arr 0 asciiPrefixLen
+      let suffix = if asciiPrefixLen < len
+            then Just (B.index bs asciiPrefixLen, B.drop (asciiPrefixLen + 1) bs)
+            else Nothing
+      pure (prefix, suffix)
 
 #ifdef SIMDUTF
 foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8

From 1ae9ede3f42781f8e5cc69b707c2a65e77cbdcdd Mon Sep 17 00:00:00 2001
From: Li-yao Xia <lysxia@gmail.com>
Date: Sun, 28 Aug 2022 19:05:05 +0100
Subject: [PATCH 41/87] Stylistic nits in decodeUtf8Chunks

---
 src/Data/Text/Encoding.hs | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index b67daf3f..43bab2d3 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -279,9 +279,11 @@ decodeUtf8Chunks bs1@(B.length -> len1) bs2@(B.length -> len2) =
       chunksToText utf8Len
         | utf8Len > 0 = runST $ do
           dst <- A.new utf8Len
-          let crossesBsBoundary = utf8Len > len1
-          bsToText bs1 (if crossesBsBoundary then len1 else utf8Len) dst 0
-          when crossesBsBoundary $ bsToText bs2 (utf8Len - len1) dst len1
+          if utf8Len > len1 then do
+            bsToText bs1 len1 dst 0
+            bsToText bs2 (utf8Len - len1) dst len1
+          else
+            bsToText bs1 utf8Len dst 0
           arr <- A.unsafeFreeze dst
           pure $ Text arr 0 utf8Len
         | otherwise = empty
@@ -298,7 +300,7 @@ decodeUtf8Chunks bs1@(B.length -> len1) bs2@(B.length -> len2) =
       countValidUtf8 i i' (Incomplete a)
         | i' < len = countValidUtf8 i (i' + 1) $ utf8DetectContinue (index i') a
         | otherwise = decodeResult False i
-      countValidUtf8 _ i' _
+      countValidUtf8 _ i' Accept
         | i' < len = countValidUtf8 i' (i' + 1) . utf8DetectStart $ index i'
         | otherwise = decodeResult False i'
       wrapUpBs1 off = countValidUtf8 off off Accept
@@ -322,12 +324,9 @@ decodeUtf8Chunks bs1@(B.length -> len1) bs2@(B.length -> len2) =
             checkCodePointAccrossBoundary (utf8DetectStart $ index n) $ n + 1
           | otherwise = wrapUpBs2 len1
     in
-    if n > 0
-    then
-      if isValidBS 0 n bs1
-      then spanChunks
-      else wrapUpBs1 0
-    else spanChunks
+    if n == 0 || isValidBS 0 n bs1
+    then spanChunks
+    else wrapUpBs1 0
   _ ->
     (if len1 > 0
     then wrapUpBs1

From 0a1110128b5554411fd232d51044ec8422203c17 Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Sun, 2 Oct 2022 09:53:37 -0600
Subject: [PATCH 42/87] Back-to-the-drawing-board redesign of a UTF-8 decoder
 with more caller control.

---
 benchmarks/haskell/Benchmarks/Stream.hs       |   2 +-
 src/Data/Text/Encoding.hs                     | 669 ++++++++++--------
 src/Data/Text/Encoding/Common.hs              | 137 ----
 src/Data/Text/Encoding/Error.hs               |  96 ++-
 src/Data/Text/Internal/Encoding/Fusion.hs     |   2 +-
 src/Data/Text/Internal/Encoding/Utf16.hs      |  34 +-
 src/Data/Text/Internal/Encoding/Utf32.hs      |  28 -
 src/Data/Text/Internal/Encoding/Utf8.hs       |  54 +-
 .../Text/Internal/Lazy/Encoding/Fusion.hs     |   2 +-
 src/Data/Text/Lazy/Encoding.hs                |  87 +--
 tests/Tests/Properties/Transcoding.hs         | 186 +----
 tests/Tests/QuickCheckUtils.hs                |  10 +-
 tests/Tests/Regressions.hs                    |   2 +-
 text.cabal                                    |   1 -
 14 files changed, 506 insertions(+), 804 deletions(-)
 delete mode 100644 src/Data/Text/Encoding/Common.hs

diff --git a/benchmarks/haskell/Benchmarks/Stream.hs b/benchmarks/haskell/Benchmarks/Stream.hs
index 539b709c..1d53c498 100644
--- a/benchmarks/haskell/Benchmarks/Stream.hs
+++ b/benchmarks/haskell/Benchmarks/Stream.hs
@@ -21,7 +21,7 @@ import qualified Data.Text.Lazy as TL
 import qualified Data.ByteString.Lazy as BL
 import Data.Text.Internal.Fusion.Types (Step (..), Stream (..))
 import qualified Data.Text.Encoding as T
-import qualified Data.Text.Encoding.Common as E
+import qualified Data.Text.Encoding.Error as E
 import qualified Data.Text.Internal.Encoding.Fusion as T
 import qualified Data.Text.Internal.Encoding.Fusion.Common as F
 import qualified Data.Text.Internal.Fusion as T
diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index 43bab2d3..db0ef035 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -4,7 +4,6 @@
 {-# LANGUAGE TypeApplications #-}
 {-# LANGUAGE ScopedTypeVariables #-}
 {-# LANGUAGE ViewPatterns #-}
-{-# LANGUAGE TupleSections #-}
 -- |
 -- Module      : Data.Text.Encoding
 -- Copyright   : (c) 2009, 2010, 2011 Bryan O'Sullivan,
@@ -32,10 +31,12 @@ module Data.Text.Encoding
       decodeLatin1
     , decodeUtf8Lenient
     , decodeAsciiPrefix
-    , DecodeResult(..)
-    , decodeUtf8Chunks
-    , decodeUtf16Chunks
-    , decodeUtf32Chunks
+    , Utf8DecodeState
+    , startUtf8State
+    , outAvailableUtf8Text
+    , decodeNextUtf8Chunk
+    , decodeUtf8Chunk
+    , recoverFromUtf8Error
 
     -- *** Catchable failure
     , decodeUtf8'
@@ -49,7 +50,6 @@ module Data.Text.Encoding
 
     -- *** Stream oriented decoding
     -- $stream
-    , streamDecodeUtf8
     , streamDecodeUtf8With
     , Decoding(..)
 
@@ -62,6 +62,9 @@ module Data.Text.Encoding
     , decodeUtf32LE
     , decodeUtf32BE
 
+    -- *** Stream oriented decoding
+    , streamDecodeUtf8
+
     -- * Encoding Text to ByteStrings
     , encodeUtf8
     , encodeUtf16LE
@@ -74,38 +77,34 @@ module Data.Text.Encoding
     , encodeUtf8BuilderEscaped
     ) where
 
+import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
+
 import Control.Exception (evaluate, try)
-import Control.Monad (when)
 import Control.Monad.ST (runST)
-import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
-import Data.Bits (Bits, shiftL, shiftR, (.&.), (.|.))
+import Data.Bits (shiftR, (.&.))
 import Data.ByteString (ByteString)
 import qualified Data.ByteString as B
 import qualified Data.ByteString.Internal as B
-import Data.Text.Encoding.Common (DecodeResult(..), OnDecodeError, UnicodeException, strictDecode, lenientDecode)
+import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode)
 import Data.Text.Internal (Text(..), empty, append)
 import Data.Text.Internal.Unsafe (unsafeWithForeignPtr)
-import Data.Text.Internal.Unsafe.Char (unsafeWrite)
 import Data.Text.Show as T (singleton)
 import Data.Text.Unsafe (unsafeDupablePerformIO)
-import Data.Word (Word8, Word16, Word32)
+import Data.Word (Word8)
 import Foreign.C.Types (CSize(..))
 import Foreign.Ptr (Ptr, minusPtr, plusPtr)
-import Foreign.Storable (Storable(..), poke, peekByteOff)
+import Foreign.Storable (poke, peekByteOff)
 import GHC.Exts (byteArrayContents#, unsafeCoerce#)
 import GHC.ForeignPtr (ForeignPtr(..), ForeignPtrContents(PlainPtr))
 import qualified Data.ByteString.Builder as B
 import qualified Data.ByteString.Builder.Internal as B hiding (empty, append)
 import qualified Data.ByteString.Builder.Prim as BP
 import qualified Data.ByteString.Builder.Prim.Internal as BP
-import Data.Text.Internal.Encoding.Utf8 (utf8DetectStart, utf8DetectContinue, DetectUtf8Result(..))
-import Data.Text.Internal.Encoding.Utf16 (Utf16Result(..), queryUtf16Bytes)
-import Data.Text.Internal.Encoding.Utf32 (queryUtf32Bytes)
+import Data.Text.Internal.Encoding.Utf8 (Utf8CodePointState, utf8StartState, updateUtf8State, isUtf8StateIsComplete)
 import qualified Data.Text.Array as A
 import qualified Data.Text.Internal.Encoding.Fusion as E
 import qualified Data.Text.Internal.Fusion as F
 import Data.Text.Internal.ByteStringCompat
-import Numeric (showHex)
 #if defined(ASSERTS)
 import GHC.Stack (HasCallStack)
 #endif
@@ -147,10 +146,10 @@ import qualified Data.ByteString.Unsafe as B
 -- anything except ASCII and copies buffer or throws an error otherwise.
 --
 decodeASCII :: ByteString -> Text
-decodeASCII bs@(B.length -> len) = case decodeAsciiPrefix bs of
-  (t, Nothing) -> t
-  (_, Just (w, (B.length -> len'))) ->
-    error $ "decodeASCII: detected non-ASCII codepoint (\\x" ++ showHex w (") at " ++ show (len - len' - 1))
+decodeASCII bs =
+  case decodeAsciiPrefix bs of
+    (_, Just errPos) -> error $ "decodeASCII: detected non-ASCII codepoint at " ++ show errPos
+    (t, Nothing) -> t
 
 -- | Decode a 'ByteString' containing Latin-1 (aka ISO-8859-1) encoded text.
 --
@@ -167,22 +166,24 @@ decodeLatin1 ::
   ByteString -> Text
 decodeLatin1 bs = withBS bs $ \fp len -> runST $ do
   dst <- A.new (2 * len)
-  unsafeIOToST . unsafeWithForeignPtr fp $ \ src -> do
-    let inner srcOff dstOff = if srcOff >= len then pure dstOff else do
-          let src' = src `plusPtr` srcOff
-          asciiPrefixLen <- fmap fromIntegral . c_is_ascii src' $ src `plusPtr` len
-          if asciiPrefixLen == 0
-          then do
-            byte <- peekByteOff src srcOff
-            unsafeSTToIO $ A.unsafeWrite dst dstOff (0xC0 + (byte `shiftR` 6)) *>
-              A.unsafeWrite dst (dstOff + 1) (0x80 + (byte .&. 0x3F))
-            inner (srcOff + 1) (dstOff + 2)
-          else do
-            unsafeSTToIO $ A.copyFromPointer dst dstOff src' asciiPrefixLen
-            inner (srcOff + asciiPrefixLen) (dstOff + asciiPrefixLen)
-    actualLen <- inner 0 0
-    arr <- unsafeSTToIO $ A.resizeM dst actualLen >>= A.unsafeFreeze
-    pure $ Text arr 0 actualLen
+  let inner srcOff dstOff = if srcOff >= len then return dstOff else do
+        asciiPrefixLen <- fmap cSizeToInt $ unsafeIOToST $ unsafeWithForeignPtr fp $ \src ->
+          c_is_ascii (src `plusPtr` srcOff) (src `plusPtr` len)
+        if asciiPrefixLen == 0
+        then do
+          byte <- unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> peekByteOff src srcOff
+          A.unsafeWrite dst dstOff (0xC0 + (byte `shiftR` 6))
+          A.unsafeWrite dst (dstOff + 1) (0x80 + (byte .&. 0x3F))
+          inner (srcOff + 1) (dstOff + 2)
+        else do
+          unsafeIOToST $ unsafeWithForeignPtr fp $ \src ->
+            unsafeSTToIO $ A.copyFromPointer dst dstOff (src `plusPtr` srcOff) asciiPrefixLen
+          inner (srcOff + asciiPrefixLen) (dstOff + asciiPrefixLen)
+
+  actualLen <- inner 0 0
+  dst' <- A.resizeM dst actualLen
+  arr <- A.unsafeFreeze dst'
+  return $ Text arr 0 actualLen
 
 foreign import ccall unsafe "_hs_text_is_ascii" c_is_ascii
     :: Ptr Word8 -> Ptr Word8 -> IO CSize
@@ -192,13 +193,12 @@ foreign import ccall unsafe "_hs_text_is_ascii" c_is_ascii
 -- This is a total function. The 'ByteString' is decoded until either
 -- the end is reached or it errors with the first non-ASCII 'Word8' is
 -- encountered. In either case the function will return what 'Text' was
--- decoded. On error, the non-ASCII 'Word8' is also returned followed
--- by the rest of the undecoded 'ByteString'.
+-- decoded. On error, the index of the non-ASCII 'Word8' is also returned.
 --
 -- @since 2.0.2
 decodeAsciiPrefix
   :: ByteString
-  -> (Text, Maybe (Word8, ByteString))
+  -> (Text, Maybe (Word8, Int))
 decodeAsciiPrefix bs = if B.null bs
   then (empty, Nothing)
   else unsafeDupablePerformIO $ withBS bs $ \ fp len ->
@@ -212,7 +212,7 @@ decodeAsciiPrefix bs = if B.null bs
               arr <- A.unsafeFreeze dst
               pure $ Text arr 0 asciiPrefixLen
       let suffix = if asciiPrefixLen < len
-            then Just (B.index bs asciiPrefixLen, B.drop (asciiPrefixLen + 1) bs)
+            then Just (B.index bs asciiPrefixLen, asciiPrefixLen)
             else Nothing
       pure (prefix, suffix)
 
@@ -221,243 +221,321 @@ foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8
     :: Ptr Word8 -> CSize -> IO CInt
 #endif
 
--- | Decode two 'ByteString's containing UTF-8-encoded text as though
--- they were one continuous 'ByteString' returning a 'DecodeResult'.
+-- | A value that represents the state of a UTF-8 decoding process potentionally
+-- across multiple 'ByteString's.
 --
 -- @since 2.0.2
-decodeUtf8Chunks ::
-#if defined(ASSERTS)
-  HasCallStack =>
-#endif
-     ByteString -- ^ The first 'ByteString' chunk to decode. Typically this is the undecoded data from the previous call of this function.
-  -> ByteString -- ^ The second 'ByteString' chunk to decode.
-  -> DecodeResult Text ByteString Word8
-decodeUtf8Chunks bs1@(B.length -> len1) bs2@(B.length -> len2) =
-  let len = len1 + len2
-      index i
-        | i < len1  = B.index bs1 i
-        | otherwise = B.index bs2 $ i - len1
-      guessUtf8Boundary len'
-        | wi 3 0xf0 = Just $ len' - 3 -- third to last char starts a four-byte code point
-        | wi 2 0xe0 = Just $ len' - 2 -- pre-last char starts a three-or-four-byte code point
-        | wi 1 0xc0 = Just $ len' - 1 -- last char starts a two-(or more-)byte code point
-        | wc 4 0xf8 0xf0 ||           -- last four bytes are a four-byte code point
-          wc 3 0xf0 0xe0 ||           -- last three bytes are a three-byte code point
-          wc 2 0xe0 0xc0 ||           -- last two bytes are a two-byte code point
-          w 1 (< 0x80) = Just len'    -- last char is ASCII
-        | otherwise = Nothing         -- no clue
-        where
-          w n test = len' >= n && test (index $ len' - n)
-          wc n mask word8 = w n $ (word8 ==) . (mask .&.)
-          wi n word8 = w n (>= word8)
-      isValidBS :: Int -> Int -> ByteString -> Bool
+data Utf8DecodeState = Utf8DecodeState
+  (Maybe (Utf8CodePointState, Int))
+  [ByteString]
+  Int
+  [Either (Text, Int) (ByteString, Int, Int, Int)]
+  Int
+  deriving (Show)
+
+-- | This represents the begining state of a UTF-8 decoding process.
+--
+-- @since 2.0.2
+startUtf8State :: Utf8DecodeState
+startUtf8State = Utf8DecodeState (Just (utf8StartState, 0)) [] 0 [] 0
+
+-- | Takes whatever data has been decoded thus far and spits it out as a `Text`
+-- value and a `Utf8DecodeState` value that no longer references the decoded
+-- data. This function operates on error states, but does not clear the error.
+-- (See 'recoverFromUtf8Error'.)
+--
+-- @since 2.0.2
+outAvailableUtf8Text :: Utf8DecodeState -> (Text, Utf8DecodeState)
+outAvailableUtf8Text (Utf8DecodeState mCpSt bss bs1Off dataStack tLen) =
+  if tLen > 0
+  then runST $ do
+    dst <- A.new tLen
+    mapM_ (\ dat ->
+      case dat of
+        Left ((Text arr0 off utf8Len), dstOff) -> A.copyI utf8Len dst dstOff arr0 off
+        Right (bs, bsOff, utf8Len, dstOff) ->
+          withBS bs $ \ fp _ ->
+            unsafeIOToST . unsafeWithForeignPtr fp $ \ src ->
+              unsafeSTToIO $ A.copyFromPointer dst dstOff (src `plusPtr` bsOff) utf8Len
+      ) dataStack
+    arr <- A.unsafeFreeze dst
+    pure (Text arr 0 tLen, Utf8DecodeState mCpSt bss bs1Off [] 0)
+  else (empty, Utf8DecodeState mCpSt bss bs1Off dataStack tLen)
+
+getCodePointStateOrError :: Utf8DecodeState -> Either Int Int
+getCodePointStateOrError (Utf8DecodeState mCpSt (bs1@(B.length -> len1) : bss') bs1Off _ _) =
+  case mCpSt of
+    Nothing ->
+      let (lenInit, _) = foldr
+            (\ bs@(B.length -> len') (lenInit', (lenN', _)) ->
+              (lenInit' + lenN', (len', bs))) (0, (len1 - bs1Off, bs1)) bss'
+      in
+      Left (bs1Off - lenInit)
+    Just (_, cpLen) -> Right cpLen
+getCodePointStateOrError _ = Right 0
+
+decodeUtf8Chunks :: Utf8DecodeState -> (Either Int Int, Utf8DecodeState)
+decodeUtf8Chunks st@(Utf8DecodeState _ [] _ _ _) = (getCodePointStateOrError st, st)
+decodeUtf8Chunks st@(Utf8DecodeState Nothing _ _ _ _) = (getCodePointStateOrError st, st)
+decodeUtf8Chunks
+  st@(Utf8DecodeState
+    (Just (cpSt, cpPos))
+    bss@(bs1@(B.length -> len1) : bss')
+    bs1Off
+    dataStack
+    tLen
+  )
+  =
+  {-
+bs1Off = 0  cpPos = Δbs1Off
+   |    bs1Off     |   boundary = Δbs1Off
+   v       v   bsX v           v
+  |. . bs1 . .|. .|. . .bsN. . .|
+  ^-----------^   ^-------------^
+      len1            lenN
+          ^---^   ^-----------^
+          len1_   isValidBS span
+          ^-------^
+          lenInit
+          ^---------------------^
+                len
+  --}
+  let len1_ = len1 - bs1Off -- the length of the trailing portion of the first bytesting that's to be evaluated.
+      (lenInit, (lenN, bsN)) = foldr
+        (\ bs@(B.length -> len') (lenInit', (lenN', _)) ->
+          (lenInit' + lenN', (len', bs))) (0, (len1_, bs1)) bss'
+      len = lenInit + lenN
+  in
+  if len == cpPos
+  then (getCodePointStateOrError st, st)
+  else
+    let index i =
+          if i < len1_
+          then B.index bs1 (i + bs1Off)
+          else
+            let index' i' (bs@(B.length -> len0) : bss'') =
+                  if i' < len0
+                  then B.index bs i'
+                  else index' (i' - len0) bss''
+                index' i' _ = B.index bsN i'
+            in
+            index' (i - len1_) bss'
+        guessUtf8Boundary
+          | wi 3 0xf0 = Just $ len - 3  -- third to last char starts a four-byte code point
+          | wi 2 0xe0 = Just $ len - 2  -- pre-last char starts a three-or-four-byte code point
+          | wi 1 0xc2 = Just $ len - 1  -- last char starts a two-(or more-)byte code point
+          | wc 4 0xf8 0xf0 ||           -- last four bytes are a four-byte code point
+            wc 3 0xf0 0xe0 ||           -- last three bytes are a three-byte code point
+            wc 2 0xe0 0xc0 ||           -- last two bytes are a two-byte code point
+            w 1 (< 0x80) = Just len     -- last char is ASCII
+          | otherwise = Nothing         -- no clue
+          where
+            w n test = len >= n && test (index $ len - n)
+            wc n mask word8 = w n $ (word8 ==) . (mask .&.)
+            wi n word8 = w n (>= word8)
+        -- queue the available valid data, trim the fat, and leave a spot for the code point state/error.
+        stackValidUtf8 wordCount mCps =
+          let bs1Off' = bs1Off + wordCount
+              (bs1Off''', bss'''', dataStack'') =
+                if wordCount < len1_
+                then (bs1Off', bss, Right (bs1, bs1Off, wordCount, tLen) : dataStack)
+                else
+                  let stackValidUtf8' wordCount' bs1Off'' tLen' bss''@(bs'@(B.length -> len') : bss''') dataStack' =
+                        if wordCount' < len'
+                        then (bs1Off'', bss'', Right (bs', 0, wordCount', tLen') : dataStack')
+                        else stackValidUtf8' (wordCount' - len') (bs1Off'' - len') (tLen' + len') bss''' (Right (bs', 0, len', tLen') : dataStack')
+                      stackValidUtf8' _ _ _ _ dataStack' = (0, [], dataStack')
+                  in
+                  stackValidUtf8' (wordCount - len1_) (bs1Off' - len1) (tLen + len1_) bss' (Right (bs1, bs1Off, len1_, tLen) : dataStack)
+              st' = Utf8DecodeState mCps bss'''' bs1Off''' dataStack'' $ tLen + wordCount
+          in
+          (getCodePointStateOrError st', st')
+        huntDownError off ndx cps =
+          if ndx < len
+          then
+            case updateUtf8State (index ndx) cps of
+              Just cps' ->
+                let ndx' = ndx + 1 in
+                huntDownError (
+                  if isUtf8StateIsComplete cps'
+                  then ndx'
+                  else off
+                ) ndx' cps'
+              Nothing -> stackValidUtf8 off Nothing
+          else stackValidUtf8 off $ Just (cps, ndx - off)
+    in
+    -- did we find the boundary?
+    case guessUtf8Boundary of
+      -- yes
+      Just boundary ->
+        -- are we before it?
+        if cpPos < boundary
+        -- yes: let's check this incomplete code point before checking the rest up to the boundary
+        then
+          let checkIncompleteCodePoint cpSt' cpPos'
+                -- a complete code point
+                | isUtf8StateIsComplete cpSt' =
+                  let getEndState ndx cpSt''
+                        | ndx < len =
+                          case updateUtf8State (index ndx) cpSt'' of
+                            Nothing -> Nothing
+                            Just cpSt''' -> getEndState (ndx + 1) cpSt'''
+                        | otherwise = Just (cpSt'', ndx - boundary)
+                      soFarSoGood =
+                        stackValidUtf8 boundary $ getEndState boundary cpSt'
+                  in
+                  -- are we at the boundary?
+                  if boundary == cpPos'
+                  -- yes: get the state of the last code point
+                  then soFarSoGood
+                  -- no:
+                  else
+                    -- are we before bsN?
+                    if cpPos' < lenInit
+                    -- yes
+                    then
+                      -- keep walking the data until we get to bsN or an error
+                      case updateUtf8State (index cpPos') cpSt' of
+                        Just cpSt'' -> checkIncompleteCodePoint cpSt'' (cpPos' + 1)
+                        Nothing -> (Left (-lenInit), Utf8DecodeState Nothing bss bs1Off dataStack tLen)
+                    -- no: we're in bsN
+                    else let
+                          off = (if lenInit > 0
+                            then cpPos' - lenInit
+                            else cpPos' + bs1Off)
+                      in
+                      -- is the rest of the bytestring valid utf-8 up to the boundary?
+                      if (
 #ifdef SIMDUTF
-      isValidBS off bLen bs = withBS bs $ \ fp _ -> unsafeDupablePerformIO $
-        unsafeWithForeignPtr fp $ \ptr -> (/= 0) <$> c_is_valid_utf8 (ptr `plusPtr` off) (fromIntegral bLen)
+                          withBS bsN $ \ fp _ -> unsafeDupablePerformIO $
+                            unsafeWithForeignPtr fp $ \ptr -> (/= 0) <$>
+                              c_is_valid_utf8 (plusPtr ptr off) (fromIntegral $ boundary - cpPos')
 #elif MIN_VERSION_bytestring(0,11,2)
-      isValidBS off bLen = B.isValidUtf8 . B.take bLen . B.drop off
+                          B.isValidUtf8 . B.take (boundary - cpPos') $ B.drop off bsN
 #else
-      isValidBS off bLen bs = start off
-        where
-          start ix
-            | ix >= off + bLen = True
-            | otherwise = case utf8DetectStart (B.unsafeIndex bs ix) of
-              Accept -> start (ix + 1)
-              Reject -> False
-              Incomplete st -> step (ix + 1) st
-          step ix st
-            | ix >= off + bLen = False
-            | otherwise = case utf8DetectContinue (B.unsafeIndex bs ix) st of
-              Accept -> start (ix + 1)
-              Reject -> False
-              Incomplete st' -> step (ix + 1) st'
+                          let bLen = boundary - cpPos'
+                              step ndx cps
+                                | ndx < off + bLen =
+                                  case updateUtf8State (B.unsafeIndex bsN ndx) cps of
+                                  Just cps' -> step (ndx + 1) cps'
+                                  Nothing -> False
+                                | otherwise = isUtf8StateIsComplete cps
+                          in
+                          step off utf8StartState
 #endif
-      bsToText bs count dst dstOff =
-        when (count > 0) . withBS bs $ \ fp _ ->
-          unsafeIOToST . unsafeWithForeignPtr fp $ \ src ->
-            unsafeSTToIO $ A.copyFromPointer dst dstOff src count
-      chunksToText utf8Len
-        | utf8Len > 0 = runST $ do
-          dst <- A.new utf8Len
-          if utf8Len > len1 then do
-            bsToText bs1 len1 dst 0
-            bsToText bs2 (utf8Len - len1) dst len1
-          else
-            bsToText bs1 utf8Len dst 0
-          arr <- A.unsafeFreeze dst
-          pure $ Text arr 0 utf8Len
-        | otherwise = empty
-      decodeResult isErr off =
-        let off' = if isErr then off + 1 else off in
-        DecodeResult
-          (chunksToText off)
-          (if isErr then Just $ index off else Nothing)
-          (if off' < len1
-            then B.drop off' bs1 `B.append` bs2
-            else B.drop (off' - len1) bs2)
-          off'
-      countValidUtf8 i _ Reject = decodeResult True i
-      countValidUtf8 i i' (Incomplete a)
-        | i' < len = countValidUtf8 i (i' + 1) $ utf8DetectContinue (index i') a
-        | otherwise = decodeResult False i
-      countValidUtf8 _ i' Accept
-        | i' < len = countValidUtf8 i' (i' + 1) . utf8DetectStart $ index i'
-        | otherwise = decodeResult False i'
-      wrapUpBs1 off = countValidUtf8 off off Accept
-      wrapUpBs2 off = wrapUpBs1 $
-        case guessUtf8Boundary len of
-        Just n -> if n > off && isValidBS (off - len1) (n - off) bs2
-          then n
-          else off
-        _ -> off
-  in
-  case guessUtf8Boundary len1 of
-  Just n ->
-    let checkCodePointAccrossBoundary Reject _ = decodeResult True n
-        checkCodePointAccrossBoundary (Incomplete a) off
-          | off < len =
-            checkCodePointAccrossBoundary (utf8DetectContinue (index off) a) $ off + 1
-          | otherwise = decodeResult False n
-        checkCodePointAccrossBoundary Accept off = wrapUpBs2 off
-        spanChunks
-          | n < len1 =
-            checkCodePointAccrossBoundary (utf8DetectStart $ index n) $ n + 1
-          | otherwise = wrapUpBs2 len1
-    in
-    if n == 0 || isValidBS 0 n bs1
-    then spanChunks
-    else wrapUpBs1 0
-  _ ->
-    (if len1 > 0
-    then wrapUpBs1
-    else wrapUpBs2) 0
-
-data Progression
-  = WriteAndAdvance Char Int
-  | NeedMore
-  | Invalid
-
-decodeChunks :: (Bits w, Num w, Storable w)
-  => w
-  -> ((Int -> Word8) -> Int -> Int -> Progression)
-  -> ByteString
-  -> ByteString
-  -> DecodeResult Text ByteString w
-decodeChunks w transcodeF bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do
-  marr <- A.new len'
-  outer marr len' 0 0
-  where
-    wordByteSize = sizeOf w
-
-    index :: Int -> Word8
-    index i
-      | i < len1  = B.index bs1 i
-      | otherwise = B.index bs2 $ i - len1
-
-    len :: Int
-    len = len1 + len2
-    len' :: Int
-    len' = (len `div` wordByteSize) + 4
-
-    outer dst dstLen = inner
-      where
-        inner srcOff dstOff
-          | srcOff < len
-          , len >= srcOff + wordByteSize =
-            if dstOff + 4 > dstLen
-            -- need more space in destination
-            then do
-              let dstLen' = dstLen + 4
-              dst' <- A.resizeM dst dstLen'
-              outer dst' dstLen' srcOff dstOff
-            else
-              case transcodeF index len srcOff of
-                WriteAndAdvance c srcOff' -> do
-                  d <- unsafeWrite dst dstOff c
-                  inner srcOff' $ dstOff + d
-                NeedMore -> goodSoFar
-                Invalid ->
-                  let srcOff' = srcOff + wordByteSize
-                      bytesToWord n word
-                        | n > 0 = bytesToWord (n - 1) $ (fromIntegral . index $ srcOff + wordByteSize - n) .|. (word `shiftL` 8)
-                        | otherwise = word
-                  in
-                  wrapUp srcOff' . Just $ bytesToWord wordByteSize 0
-          -- finished (for now)
-          | otherwise = goodSoFar
-          where
-            wrapUp off mW = do
-              A.shrinkM dst dstOff
-              arr <- A.unsafeFreeze dst
-              pure $ DecodeResult (Text arr 0 dstOff) mW (if off >= len1
-                then B.drop (off - len1) bs2
-                else B.drop off $ bs1 `B.append` bs2) off
-            goodSoFar =
-              wrapUp srcOff Nothing
-
-decodeChunksProxy :: (Bits w, Num w, Storable w)
-  => ((Int -> Word8) -> Int -> Int -> Progression)
-  -> ByteString
-  -> ByteString
-  -> DecodeResult Text ByteString w
-decodeChunksProxy = decodeChunks undefined -- This allows Haskell to
--- determine the size in bytes of a data type using Storable.sizeOf
--- so that it doesn't have to be passed as an arugment. Storable.sizeOf
--- discards the actual value without evaluating it.
-
--- | Decode two 'ByteString's containing UTF-16-encoded text as though
--- they were one continuous 'ByteString' returning a 'DecodeResult'.
+                        )
+                      -- Yes
+                      then soFarSoGood
+                      -- No
+                      else huntDownError cpPos' cpPos' cpSt'
+                -- We're mid code point
+                | otherwise =
+                  if cpPos' < len
+                  then
+                    -- try to complete the code point
+                    case updateUtf8State (index cpPos') cpSt' of
+                      Just cpSt'' -> checkIncompleteCodePoint cpSt'' (cpPos' + 1)
+                      -- just enough additional data to find an error with the code point
+                      Nothing -> (Left (-lenInit), Utf8DecodeState Nothing bss bs1Off dataStack tLen)
+                  else
+                    -- didn't get enough additional data to complete the code point
+                    (Right cpPos', Utf8DecodeState (Just (cpSt', cpPos')) bss bs1Off dataStack tLen)
+          in
+          checkIncompleteCodePoint cpSt cpPos
+        -- no, we're past the boundary
+        else
+          -- the code point is the only thing that (potentially) changes
+          let getEndCodePointState cpPos' cpSt'
+                | cpPos' < len =
+                  case updateUtf8State (index cpPos') cpSt' of
+                    Nothing -> Nothing
+                    Just cpSt'' -> getEndCodePointState (cpPos' + 1) cpSt''
+                | otherwise = Just (cpSt', cpPos' - boundary)
+              mCpStLen = getEndCodePointState cpPos cpSt
+          in
+          ( case mCpStLen of
+            Nothing -> Left (-lenInit)
+            Just _ -> Right len
+          , Utf8DecodeState mCpStLen bss bs1Off dataStack tLen
+          )
+      -- no: there's an error
+      Nothing -> huntDownError 0 cpPos cpSt
+
+-- | Decodes a UTF-8 'ByteString' in the context of what has already been
+-- decoded which is represented by the 'Utf8DecodeState' value. Returned is the
+-- new decode state and either ('Right') the number of 'Word8's that make up the
+-- incomplete code point at the end of the input, or ('Left') the start position
+-- of an invalid code point that was encountered. The position is relative to
+-- the start of the input 'ByteString'.
+--
+-- If the previous 'ByteString' ended with an incomplete code point, the
+-- beginning of the input data will be treated as a continuation of the code
+-- point. NOTE: That in this case if the input causes the previous incomplete
+-- code point to be invalid, the returned error ('Left') position value will be
+-- negative.
+--
+-- If decoding the last 'ByteString' resulted in a error. The input is ignored,
+-- and the state value is returned unchanged. Error states can be handled with
+-- 'recoverFromUtf8Error'.
 --
 -- @since 2.0.2
-decodeUtf16Chunks ::
+decodeNextUtf8Chunk ::
 #if defined(ASSERTS)
   HasCallStack =>
 #endif
-  Bool          -- ^ Indicates whether the encoding is big-endian ('True') or little-endian ('False')
-  -> ByteString -- ^ The first 'ByteString' chunk to decode. Typically this is the undecoded data from the previous call of this function.
-  -> ByteString -- ^ The second 'ByteString' chunk to decode.
-  -> DecodeResult Text ByteString Word16
-decodeUtf16Chunks isBE bs1 bs2 = decodeChunksProxy (\ index len srcOff ->
-  -- get next Word8 pair
-  let writeAndAdvance c n = WriteAndAdvance c $ srcOff + n
-      b0 = index $ if isBE then srcOff else srcOff + 1
-      b1 = index $ if isBE then srcOff + 1 else srcOff
-  in
-  case queryUtf16Bytes b0 b1 of
-    OneWord16 c -> writeAndAdvance c 2
-    TwoWord16 g ->
-      if len - srcOff < 4
-      -- not enough Word8s to finish the code point
-      then NeedMore
-      else
-        let b2 = index $ srcOff + (if isBE then 2 else 3)
-            b3 = index $ srcOff + (if isBE then 3 else 2)
-        in
-        case g b2 b3 of
-          Just c -> writeAndAdvance c 4
-          _ -> Invalid
-    _ -> Invalid) bs1 bs2
-
--- | Decode two 'ByteString's containing UTF-16-encoded text as though
--- they were one continuous 'ByteString' returning a 'DecodeResult'.
+ ByteString -> Utf8DecodeState -> (Either Int Int, Utf8DecodeState)
+decodeNextUtf8Chunk _ st@(Utf8DecodeState Nothing _ _ _ _) = (getCodePointStateOrError st, st)
+decodeNextUtf8Chunk bs@(B.length -> len) st@(Utf8DecodeState mCpSt bss bs1Off dataStack tLen)
+  | len == 0 = (getCodePointStateOrError st, st)
+  | otherwise = decodeUtf8Chunks $ Utf8DecodeState mCpSt (bss ++ [bs]) bs1Off dataStack tLen
+
+-- | Decodes a 'ByteString' from a clean state.
+--
+-- @decodeUtf8Chunk bs = 'decodeNextUtf8Chunk' bs 'startUtf8State'@
 --
 -- @since 2.0.2
-decodeUtf32Chunks ::
+decodeUtf8Chunk :: ByteString -> (Either Int Int, Utf8DecodeState)
+decodeUtf8Chunk = flip decodeNextUtf8Chunk startUtf8State
+
+-- | If the 'Utf8DecodeState' value indicates an error state, the 'Word8' that
+-- the state value point to is replaced with the input 'Text' value which may
+-- be empty. Decoding resumes after the text is inserted and produces the result
+-- described by 'decodeNextUtf8Chunk'.
+--
+-- If not in an error state, the 'Text' is inserted at the end of the data, but
+-- before an incomplete code point at the end of the last input 'ByteString'.
+--
+-- @since 2.0.2
+recoverFromUtf8Error ::
 #if defined(ASSERTS)
   HasCallStack =>
 #endif
-  Bool          -- ^ Indicates whether the encoding is big-endian ('True') or little-endian ('False')
-  -> ByteString -- ^ The first 'ByteString' chunk to decode. Typically this is the undecoded data from the previous call of this function.
-  -> ByteString -- ^ The second 'ByteString' chunk to decode.
-  -> DecodeResult Text ByteString Word32
-decodeUtf32Chunks isBE bs1 bs2 = decodeChunksProxy (\ index _ srcOff ->
-  -- get next Word8 quartet
-  case queryUtf32Bytes (index $ if isBE then srcOff else srcOff + 3)
-      (index $ srcOff + (if isBE then 1 else 2))
-      (index $ srcOff + (if isBE then 2 else 1))
-      (index $ if isBE then srcOff + 3 else srcOff) of
-    Just c -> WriteAndAdvance c $ srcOff + 4
-    _ -> Invalid) bs1 bs2
+ Text -> Utf8DecodeState -> (Either Int Int, Utf8DecodeState)
+recoverFromUtf8Error t@(Text _ _ utf8Len) (Utf8DecodeState mCpSt bss@((B.length -> len) : bss') bs1Off dataStack tLen) =
+  let dammit mCpSt' bss'' bs1Off' = decodeUtf8Chunks . Utf8DecodeState mCpSt' bss'' bs1Off'
+        ( if utf8Len > 0
+          then Left (t, tLen) : dataStack
+          else dataStack
+        ) $ tLen + utf8Len
+  in
+  case mCpSt of
+    Nothing ->
+      let g = dammit (Just (utf8StartState, 0))
+          bs1Off' = bs1Off + 1
+      in
+      if bs1Off' == len
+      then g bss' 0
+      else g bss bs1Off'
+    Just _ -> dammit mCpSt bss bs1Off
+recoverFromUtf8Error t@(Text _ _ utf8Len) (Utf8DecodeState _ _ _ dataStack tLen) =
+  decodeUtf8Chunks $ Utf8DecodeState
+    (Just (utf8StartState, 0))
+    []
+    0
+    (if utf8Len > 0
+      then Left (t, tLen) : dataStack
+      else dataStack
+    ) $ tLen + utf8Len
 
 -- | Decode a 'ByteString' containing UTF-8 encoded text.
 --
@@ -468,17 +546,46 @@ decodeUtf8With ::
   HasCallStack =>
 #endif
   OnDecodeError -> ByteString -> Text
-decodeUtf8With onErr bs = case streamDecodeUtf8With onErr bs of
-  Some t undecoded _ -> codePointToInvalid undecoded t
+decodeUtf8With onErr bs
+  | B.null undecoded = txt
+  | otherwise = txt `append` (case onErr desc (Just (B.head undecoded)) of
+    Nothing -> txt'
+    Just c  -> T.singleton c `append` txt')
   where
-    codePointToInvalid bs' txt =
-      case B.uncons bs' of
-        Just (x, bs'') -> codePointToInvalid bs'' $ case onErr desc $ Just x of
-          Just c -> append txt $ T.singleton c
-          _ -> txt
-        _ -> txt
+    (txt, undecoded) = decodeUtf8With2 onErr mempty bs
+    txt' = decodeUtf8With onErr (B.tail undecoded)
+    desc = "Data.Text.Internal.Encoding: Invalid UTF-8 stream"
 
-    desc = "Data.Text.Internal.Encoding: Incomplete UTF-8 code point"
+-- | Decode two consecutive bytestrings, returning Text and undecoded remainder.
+decodeUtf8With2 ::
+#if defined(ASSERTS)
+  HasCallStack =>
+#endif
+  OnDecodeError -> ByteString -> ByteString -> (Text, ByteString)
+decodeUtf8With2 onErr bs1@(B.length -> len1) bs2@(B.length -> len2) =
+  let g res isSecondBs =
+        case res of
+          ((Left pos), st) ->
+            g ( recoverFromUtf8Error
+                ( case onErr "Data.Text.Internal.Encoding: Invalid UTF-8 code point" . Just $
+                    if pos >= 0
+                    then B.index (if isSecondBs then bs2 else bs1) pos
+                    else B.index bs1 (len1 + pos) of
+                  Just c -> T.singleton c
+                  Nothing -> empty
+                ) st
+              ) isSecondBs
+          ((Right cpLen), st) ->
+            if isSecondBs
+            then
+              ( fst $ outAvailableUtf8Text st
+              , if cpLen > len2
+                then B.drop (len1 + len2 - cpLen) bs1 `B.append` bs2
+                else B.drop (len2 - cpLen) bs2
+              )
+            else g (decodeNextUtf8Chunk bs2 st) True
+  in
+  g (decodeUtf8Chunk bs1) False
 
 -- $stream
 --
@@ -561,7 +668,7 @@ streamDecodeUtf8 ::
   ByteString -> Decoding
 streamDecodeUtf8 = streamDecodeUtf8With strictDecode
 
--- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8
+-- | Decode, in a stream oriented way, a lazy 'ByteString' containing UTF-8
 -- encoded text.
 --
 -- @since 1.0.0.0
@@ -570,18 +677,11 @@ streamDecodeUtf8With ::
   HasCallStack =>
 #endif
   OnDecodeError -> ByteString -> Decoding
-streamDecodeUtf8With onErr = g empty mempty
+streamDecodeUtf8With onErr = go mempty
   where
-    g t bs0 bs1 =
-      case decodeUtf8Chunks bs0 bs1 of
-        DecodeResult t' mW bs1' _ ->
-          case t `append` t' of
-            txt -> (case mW of
-              Just _ ->
-                g (case onErr "Data.Text.Internal.Encoding: Invalid UTF-8 stream" mW of
-                  Just c -> txt `append` T.singleton c
-                  _ -> txt) mempty
-              _ -> Some txt bs1' . g empty) bs1'
+    go bs1 bs2 = Some txt undecoded (go undecoded)
+      where
+        (txt, undecoded) = decodeUtf8With2 onErr bs1 bs2
 
 -- | Decode a 'ByteString' containing UTF-8 encoded text that is known
 -- to be valid.
@@ -783,3 +883,6 @@ encodeUtf32LE txt = E.unstream (E.restreamUtf32LE (F.stream txt))
 encodeUtf32BE :: Text -> ByteString
 encodeUtf32BE txt = E.unstream (E.restreamUtf32BE (F.stream txt))
 {-# INLINE encodeUtf32BE #-}
+
+cSizeToInt :: CSize -> Int
+cSizeToInt = fromIntegral
diff --git a/src/Data/Text/Encoding/Common.hs b/src/Data/Text/Encoding/Common.hs
deleted file mode 100644
index 0f3bd2db..00000000
--- a/src/Data/Text/Encoding/Common.hs
+++ /dev/null
@@ -1,137 +0,0 @@
-{-# LANGUAGE CPP, DeriveDataTypeable #-}
-{-# LANGUAGE Safe #-}
--- |
--- Module      : Data.Text.Encoding.Common
--- Copyright   : (c) Bryan O'Sullivan 2009
---
--- License     : BSD-style
--- Maintainer  : bos@serpentine.com
--- Portability : GHC
---
--- Common functions and types for both lazy and strict encoding and
--- decoding including error handling.
---
--- The standard functions for encoding and decoding text are strict,
--- which is to say that they throw exceptions on invalid input.  This
--- is often unhelpful on real world input, so alternative functions
--- exist that accept custom handlers for dealing with invalid inputs.
--- These 'OnError' handlers are normal Haskell functions.  You can use
--- one of the presupplied functions in this module, or you can write a
--- custom handler of your own.
-
-module Data.Text.Encoding.Common
-  (
-  -- * Full-service result type
-    DecodeResult(..)
-  -- * Error handling types
-  , UnicodeException(..)
-  , OnError
-  , OnDecodeError
-  , OnEncodeError
-  -- * Useful error handling functions
-  , lenientDecode
-  , strictDecode
-  , strictEncode
-  , ignore
-  , replace
-  )
-  where
-
-import Control.DeepSeq (NFData (..))
-import Control.Exception (Exception, throw)
-import Data.Typeable (Typeable)
-import Data.Word (Word8)
-import Numeric (showHex)
-
--- | A decoding result on encoded data. It contains:
---
---   1. the decoded data up to an incomplete code point at the end of
---      the input data, an invalid word, or to the end of the input,
---   2. the first word of an invalid code point if one was encountered,
---   3. the remaining undecoded data, which is either an incomplete
---      code point, the data after the invalid code point, or empty,
---      and
---   4. the byte position of remaining undecoded data.
---
--- @since 2.0.2
-data DecodeResult t b w = DecodeResult !t !(Maybe w) !b !Int
-  deriving (Eq, Ord, Show, Read)
-
--- | Function type for handling a coding error.  It is supplied with
--- two inputs:
---
--- * A 'String' that describes the error.
---
--- * The input value that caused the error.  If the error arose
---   because the end of input was reached or could not be identified
---   precisely, this value will be 'Nothing'.
---
--- If the handler returns a value wrapped with 'Just', that value will
--- be used in the output as the replacement for the invalid input.  If
--- it returns 'Nothing', no value will be used in the output.
---
--- Should the handler need to abort processing, it should use 'error'
--- or 'throw' an exception (preferably a 'UnicodeException').  It may
--- use the description provided to construct a more helpful error
--- report.
-type OnError a b = String -> Maybe a -> Maybe b
-
--- | A handler for a decoding error.
-type OnDecodeError = OnError Word8 Char
-
--- | A handler for an encoding error.
-{-# DEPRECATED OnEncodeError "This exception is never used in practice, and will be removed." #-}
-type OnEncodeError = OnError Char Word8
-
--- | An exception type for representing Unicode encoding errors.
-data UnicodeException =
-    DecodeError String (Maybe Word8)
-    -- ^ Could not decode a byte sequence because it was invalid under
-    -- the given encoding, or ran out of input in mid-decode.
-  | EncodeError String (Maybe Char)
-    -- ^ Tried to encode a character that could not be represented
-    -- under the given encoding, or ran out of input in mid-encode.
-    deriving (Eq, Typeable)
-
-{-# DEPRECATED EncodeError "This constructor is never used, and will be removed." #-}
-
-showUnicodeException :: UnicodeException -> String
-showUnicodeException (DecodeError desc (Just w))
-    = "Cannot decode byte '\\x" ++ showHex w ("': " ++ desc)
-showUnicodeException (DecodeError desc Nothing)
-    = "Cannot decode input: " ++ desc
-showUnicodeException (EncodeError desc (Just c))
-    = "Cannot encode character '\\x" ++ showHex (fromEnum c) ("': " ++ desc)
-showUnicodeException (EncodeError desc Nothing)
-    = "Cannot encode input: " ++ desc
-
-instance Show UnicodeException where
-    show = showUnicodeException
-
-instance Exception UnicodeException
-
-instance NFData UnicodeException where
-    rnf (DecodeError desc w) = rnf desc `seq` rnf w `seq` ()
-    rnf (EncodeError desc c) = rnf desc `seq` rnf c `seq` ()
-
--- | Throw a 'UnicodeException' if decoding fails.
-strictDecode :: OnDecodeError
-strictDecode desc c = throw (DecodeError desc c)
-
--- | Replace an invalid input byte with the Unicode replacement
--- character U+FFFD.
-lenientDecode :: OnDecodeError
-lenientDecode _ _ = Just '\xfffd'
-
--- | Throw a 'UnicodeException' if encoding fails.
-{-# DEPRECATED strictEncode "This function always throws an exception, and will be removed." #-}
-strictEncode :: OnEncodeError
-strictEncode desc c = throw (EncodeError desc c)
-
--- | Ignore an invalid input, substituting nothing in the output.
-ignore :: OnError a b
-ignore _ _ = Nothing
-
--- | Replace an invalid input with a valid output.
-replace :: b -> OnError a b
-replace c _ _ = Just c
diff --git a/src/Data/Text/Encoding/Error.hs b/src/Data/Text/Encoding/Error.hs
index 83fa3a76..ea9e0997 100644
--- a/src/Data/Text/Encoding/Error.hs
+++ b/src/Data/Text/Encoding/Error.hs
@@ -1,3 +1,4 @@
+{-# LANGUAGE CPP, DeriveDataTypeable #-}
 {-# LANGUAGE Safe #-}
 -- |
 -- Module      : Data.Text.Encoding.Error
@@ -33,14 +34,87 @@ module Data.Text.Encoding.Error
     , replace
     ) where
 
-import Data.Text.Encoding.Common
-  ( UnicodeException(..)
-  , OnError
-  , OnDecodeError
-  , OnEncodeError
-  , lenientDecode
-  , strictDecode
-  , strictEncode
-  , ignore
-  , replace
-  )
+import Control.DeepSeq (NFData (..))
+import Control.Exception (Exception, throw)
+import Data.Typeable (Typeable)
+import Data.Word (Word8)
+import Numeric (showHex)
+
+-- | Function type for handling a coding error.  It is supplied with
+-- two inputs:
+--
+-- * A 'String' that describes the error.
+--
+-- * The input value that caused the error.  If the error arose
+--   because the end of input was reached or could not be identified
+--   precisely, this value will be 'Nothing'.
+--
+-- If the handler returns a value wrapped with 'Just', that value will
+-- be used in the output as the replacement for the invalid input.  If
+-- it returns 'Nothing', no value will be used in the output.
+--
+-- Should the handler need to abort processing, it should use 'error'
+-- or 'throw' an exception (preferably a 'UnicodeException').  It may
+-- use the description provided to construct a more helpful error
+-- report.
+type OnError a b = String -> Maybe a -> Maybe b
+
+-- | A handler for a decoding error.
+type OnDecodeError = OnError Word8 Char
+
+-- | A handler for an encoding error.
+{-# DEPRECATED OnEncodeError "This exception is never used in practice, and will be removed." #-}
+type OnEncodeError = OnError Char Word8
+
+-- | An exception type for representing Unicode encoding errors.
+data UnicodeException =
+    DecodeError String (Maybe Word8)
+    -- ^ Could not decode a byte sequence because it was invalid under
+    -- the given encoding, or ran out of input in mid-decode.
+  | EncodeError String (Maybe Char)
+    -- ^ Tried to encode a character that could not be represented
+    -- under the given encoding, or ran out of input in mid-encode.
+    deriving (Eq, Typeable)
+
+{-# DEPRECATED EncodeError "This constructor is never used, and will be removed." #-}
+
+showUnicodeException :: UnicodeException -> String
+showUnicodeException (DecodeError desc (Just w))
+    = "Cannot decode byte '\\x" ++ showHex w ("': " ++ desc)
+showUnicodeException (DecodeError desc Nothing)
+    = "Cannot decode input: " ++ desc
+showUnicodeException (EncodeError desc (Just c))
+    = "Cannot encode character '\\x" ++ showHex (fromEnum c) ("': " ++ desc)
+showUnicodeException (EncodeError desc Nothing)
+    = "Cannot encode input: " ++ desc
+
+instance Show UnicodeException where
+    show = showUnicodeException
+
+instance Exception UnicodeException
+
+instance NFData UnicodeException where
+    rnf (DecodeError desc w) = rnf desc `seq` rnf w `seq` ()
+    rnf (EncodeError desc c) = rnf desc `seq` rnf c `seq` ()
+
+-- | Throw a 'UnicodeException' if decoding fails.
+strictDecode :: OnDecodeError
+strictDecode desc c = throw (DecodeError desc c)
+
+-- | Replace an invalid input byte with the Unicode replacement
+-- character U+FFFD.
+lenientDecode :: OnDecodeError
+lenientDecode _ _ = Just '\xfffd'
+
+-- | Throw a 'UnicodeException' if encoding fails.
+{-# DEPRECATED strictEncode "This function always throws an exception, and will be removed." #-}
+strictEncode :: OnEncodeError
+strictEncode desc c = throw (EncodeError desc c)
+
+-- | Ignore an invalid input, substituting nothing in the output.
+ignore :: OnError a b
+ignore _ _ = Nothing
+
+-- | Replace an invalid input with a valid output.
+replace :: b -> OnError a b
+replace c _ _ = Just c
diff --git a/src/Data/Text/Internal/Encoding/Fusion.hs b/src/Data/Text/Internal/Encoding/Fusion.hs
index aa3019ab..aa8f0d02 100644
--- a/src/Data/Text/Internal/Encoding/Fusion.hs
+++ b/src/Data/Text/Internal/Encoding/Fusion.hs
@@ -41,7 +41,7 @@ import Data.Bits (shiftL, shiftR)
 import Data.ByteString.Internal (ByteString(..), mallocByteString, memcpy)
 import Data.Text.Internal.Fusion (Step(..), Stream(..))
 import Data.Text.Internal.Fusion.Size
-import Data.Text.Encoding.Common
+import Data.Text.Encoding.Error
 import Data.Text.Internal.Encoding.Fusion.Common
 import Data.Text.Internal.Unsafe.Char (unsafeChr8, unsafeChr16, unsafeChr32)
 import Data.Text.Internal.Unsafe (unsafeWithForeignPtr)
diff --git a/src/Data/Text/Internal/Encoding/Utf16.hs b/src/Data/Text/Internal/Encoding/Utf16.hs
index 45b0eeb3..4fe11a62 100644
--- a/src/Data/Text/Internal/Encoding/Utf16.hs
+++ b/src/Data/Text/Internal/Encoding/Utf16.hs
@@ -23,17 +23,14 @@ module Data.Text.Internal.Encoding.Utf16
       chr2
     , validate1
     , validate2
-    , Utf16Result(..)
-    , queryUtf16Bytes
     ) where
 
-import Data.Bits ((.&.))
 import GHC.Exts
-import GHC.Word (Word16(..), Word8(..))
+import GHC.Word (Word16(..))
 
 #if !MIN_VERSION_base(4,16,0)
 -- harmless to import, except for warnings that it is unused.
-import Data.Text.Internal.PrimCompat ( word16ToWord#, word8ToWord# )
+import Data.Text.Internal.PrimCompat ( word16ToWord# )
 #endif
 
 chr2 :: Word16 -> Word16 -> Char
@@ -53,30 +50,3 @@ validate2       ::  Word16 -> Word16 -> Bool
 validate2 x1 x2 = x1 >= 0xD800 && x1 <= 0xDBFF &&
                   x2 >= 0xDC00 && x2 <= 0xDFFF
 {-# INLINE validate2 #-}
-
-data Utf16Result
-  = OneWord16 Char
-  | TwoWord16 (Word8 -> Word8 -> Maybe Char)
-  | Invalid16
-
-queryUtf16Bytes :: Word8 -> Word8 -> Utf16Result
-queryUtf16Bytes b0@(W8# w0#) (W8# w1#)
-  | b0 < 0xD8 || b0 >= 0xE0 = OneWord16 $ C# (chr# (orI# (word2Int# (shiftL# (word8ToWord# w0#) 8#)) (word2Int# (word8ToWord# w1#))))
-  -- 110110xx: start of surrogate pair
-  | b0 .&. 0xFC == 0xD8 = TwoWord16 $ \ b2@(W8# w2#) (W8# w3#) ->
-    if b2 .&. 0xFC == 0xDC
-      -- valid surrogate
-      then Just $
-        C# (chr# (
-          (orI#
-            (orI#
-              (orI#
-                (word2Int# (shiftL# (int2Word# (andI# 0x3# (word2Int# (word8ToWord# w0#)))) 18#))
-                (word2Int# (shiftL# (word8ToWord# w1#) 10#))
-              )
-              (word2Int# (shiftL# (int2Word# (andI# 0x3# (word2Int# (word8ToWord# w2#)))) 8#)))
-            (word2Int# (word8ToWord# w3#))) +# 0x10000#
-        ))
-      else Nothing
-  | otherwise = Invalid16
-{-# INLINE queryUtf16Bytes #-}
diff --git a/src/Data/Text/Internal/Encoding/Utf32.hs b/src/Data/Text/Internal/Encoding/Utf32.hs
index 40a0e25d..4e8e9b46 100644
--- a/src/Data/Text/Internal/Encoding/Utf32.hs
+++ b/src/Data/Text/Internal/Encoding/Utf32.hs
@@ -1,5 +1,3 @@
-{-# LANGUAGE CPP, MagicHash #-}
-
 -- |
 -- Module      : Data.Text.Internal.Encoding.Utf32
 -- Copyright   : (c) 2008, 2009 Tom Harper,
@@ -19,36 +17,10 @@
 module Data.Text.Internal.Encoding.Utf32
     (
       validate
-    , queryUtf32Bytes
     ) where
 
 import Data.Word (Word32)
-import GHC.Exts
-import GHC.Word (Word8(..))
-
-#if !MIN_VERSION_base(4,16,0)
--- harmless to import, except for warnings that it is unused.
-import Data.Text.Internal.PrimCompat (word8ToWord#)
-#endif
 
 validate    :: Word32 -> Bool
 validate x1 = x1 < 0xD800 || (x1 > 0xDFFF && x1 <= 0x10FFFF)
 {-# INLINE validate #-}
-
-queryUtf32Bytes :: (Eq a, Num a) => a -> Word8 -> Word8 -> Word8 -> Maybe Char
-queryUtf32Bytes b0 b1@(W8# w1#) b2@(W8# w2#) (W8# w3#)
-  | b0 == 0
-  , b1 < 0x11
-  , b1 > 0 || b2 < 0xD8 || b2 >= 0xE0 =
-    Just $
-      C# (chr#
-          (orI#
-            (orI#
-              (word2Int# (shiftL# (word8ToWord# w1#) 16#))
-              (word2Int# (shiftL# (word8ToWord# w2#) 8#))
-            )
-            (word2Int# (word8ToWord# w3#))
-          )
-        )
-  | otherwise = Nothing
-{-# INLINE queryUtf32Bytes #-}
diff --git a/src/Data/Text/Internal/Encoding/Utf8.hs b/src/Data/Text/Internal/Encoding/Utf8.hs
index 17ee7cfb..70f518df 100644
--- a/src/Data/Text/Internal/Encoding/Utf8.hs
+++ b/src/Data/Text/Internal/Encoding/Utf8.hs
@@ -34,10 +34,10 @@ module Data.Text.Internal.Encoding.Utf8
     , validate3
     , validate4
     -- * Naive decoding
-    , DetectUtf8Result(..)
-    , DetectState(..)
-    , utf8DetectStart
-    , utf8DetectContinue
+    , Utf8CodePointState
+    , utf8StartState
+    , updateUtf8State
+    , isUtf8StateIsComplete
     ) where
 
 #if defined(ASSERTS)
@@ -242,17 +242,14 @@ byteToClass n = ByteClass (W8# el#)
     table# :: Addr#
     table# = "\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\b\b\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\n\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\EOT\ETX\ETX\v\ACK\ACK\ACK\ENQ\b\b\b\b\b\b\b\b\b\b\b"#
 
-newtype DetectState = DetectState Word8
+newtype Utf8CodePointState = Utf8CodePointState Word8
   deriving (Eq, Ord, Show, Read)
 
-utf8AcceptState :: DetectState
-utf8AcceptState = DetectState 0
+utf8StartState :: Utf8CodePointState
+utf8StartState = Utf8CodePointState 0
 
-utf8RejectState :: DetectState
-utf8RejectState = DetectState 12
-
-updateState :: ByteClass -> DetectState -> DetectState
-updateState (ByteClass c) (DetectState s) = DetectState (W8# el#)
+transitionUtf8State :: ByteClass -> Utf8CodePointState -> Utf8CodePointState
+transitionUtf8State (ByteClass c) (Utf8CodePointState s) = Utf8CodePointState (W8# el#)
   where
     !(I# n#) = word8ToInt (c + s)
     el# = indexWord8OffAddr# table# n#
@@ -260,29 +257,12 @@ updateState (ByteClass c) (DetectState s) = DetectState (W8# el#)
     table# :: Addr#
     table# = "\NUL\f\CAN$<`T\f\f\f0H\f\f\f\f\f\f\f\f\f\f\f\f\f\NUL\f\f\f\f\f\NUL\f\NUL\f\f\f\CAN\f\f\f\f\f\CAN\f\CAN\f\f\f\f\f\f\f\f\f\CAN\f\f\f\f\f\CAN\f\f\f\f\f\f\f\CAN\f\f\f\f\f\f\f\f\f$\f$\f\f\f$\f\f\f\f\f$\f$\f\f\f$\f\f\f\f\f\f\f\f\f\f"#
 
--- | @since 2.0
-data DetectUtf8Result
-  = Accept
-  | Incomplete !DetectState
-  | Reject
-  deriving (Eq, Ord, Show, Read)
-
--- | @since 2.0
-utf8DetectStart :: Word8 -> DetectUtf8Result
-utf8DetectStart !w
-  | st == utf8AcceptState = Accept
-  | st == utf8RejectState = Reject
-  | otherwise             = Incomplete st
-  where
-    cl = byteToClass w
-    st = updateState cl utf8AcceptState
+updateUtf8State :: Word8 -> Utf8CodePointState -> Maybe Utf8CodePointState
+updateUtf8State w st = case transitionUtf8State (byteToClass w) st of
+  Utf8CodePointState 12 -> Nothing
+  st' -> Just st'
 
--- | @since 2.0
-utf8DetectContinue :: Word8 -> DetectState -> DetectUtf8Result
-utf8DetectContinue !w !st
-  | st' == utf8AcceptState = Accept
-  | st' == utf8RejectState = Reject
-  | otherwise              = Incomplete st'
-  where
-    cl  = byteToClass w
-    st' = updateState cl st
+isUtf8StateIsComplete :: Utf8CodePointState -> Bool
+isUtf8StateIsComplete (Utf8CodePointState s)
+  | s == 0 = True
+  | otherwise = False
diff --git a/src/Data/Text/Internal/Lazy/Encoding/Fusion.hs b/src/Data/Text/Internal/Lazy/Encoding/Fusion.hs
index 5ebad0ad..69149779 100644
--- a/src/Data/Text/Internal/Lazy/Encoding/Fusion.hs
+++ b/src/Data/Text/Internal/Lazy/Encoding/Fusion.hs
@@ -38,7 +38,7 @@ import qualified Data.ByteString as B
 import qualified Data.ByteString.Unsafe as B
 import Data.Text.Internal.ByteStringCompat
 import Data.Text.Internal.Encoding.Fusion.Common
-import Data.Text.Encoding.Common
+import Data.Text.Encoding.Error
 import Data.Text.Internal.Fusion (Step(..), Stream(..))
 import Data.Text.Internal.Fusion.Size
 import Data.Text.Internal.Unsafe.Char (unsafeChr8, unsafeChr16, unsafeChr32)
diff --git a/src/Data/Text/Lazy/Encoding.hs b/src/Data/Text/Lazy/Encoding.hs
index 91f36dc8..ad361af5 100644
--- a/src/Data/Text/Lazy/Encoding.hs
+++ b/src/Data/Text/Lazy/Encoding.hs
@@ -23,11 +23,6 @@ module Data.Text.Lazy.Encoding
     -- ** Total Functions #total#
     -- $total
       decodeLatin1
-    , decodeAsciiPrefix
-    , DecodeResult(..)
-    , decodeUtf8Chunks
-    , decodeUtf16Chunks
-    , decodeUtf32Chunks
 
     -- *** Catchable failure
     , decodeUtf8'
@@ -62,9 +57,9 @@ module Data.Text.Lazy.Encoding
 
 import Control.Exception (evaluate, try)
 import Data.Monoid (Monoid(..))
-import Data.Text.Encoding.Common (DecodeResult(..), OnDecodeError, UnicodeException, strictDecode)
+import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
 import Data.Text.Internal.Lazy (Text(..), chunk, empty, foldrChunks)
-import Data.Word (Word16, Word32, Word8)
+import Data.Word (Word8)
 import qualified Data.ByteString as S
 import qualified Data.ByteString.Builder as B
 import qualified Data.ByteString.Builder.Prim as BP
@@ -100,84 +95,16 @@ import Data.Text.Unsafe (unsafeDupablePerformIO)
 -- (preferably not at all). See "Data.Text.Lazy.Encoding#g:total" for better
 -- solutions.
 
--- | Decode a 'B.ByteString' containing 7-bit ASCII
+-- | Decode a 'ByteString' containing 7-bit ASCII
 -- encoded text.
 decodeASCII :: B.ByteString -> Text
 decodeASCII = foldr (chunk . TE.decodeASCII) empty . B.toChunks
 
--- | Decode a 'B.ByteString' containing Latin-1 (aka ISO-8859-1) encoded text.
+-- | Decode a 'ByteString' containing Latin-1 (aka ISO-8859-1) encoded text.
 decodeLatin1 :: B.ByteString -> Text
 decodeLatin1 = foldr (chunk . TE.decodeLatin1) empty . B.toChunks
 
--- | Decode a 'B.ByteString' containing ASCII.
---
--- This is a total function. The 'B.ByteString' is decoded until either
--- the end is reached or it errors with the first non-ASCII 'Word8' is
--- encountered. In either case the function will return what 'Text' was
--- decoded. On error, the non-ASCII 'Word8' is also returned followed
--- by the rest of the undecoded 'B.ByteString'.
---
--- @since 2.0.2
-decodeAsciiPrefix
-  :: B.ByteString
-  -> (Text, Maybe (Word8, B.ByteString))
-decodeAsciiPrefix = g id
-  where
-    g tDiff (B.Chunk sb lb) =
-      case TE.decodeAsciiPrefix sb of
-        (t, Nothing) -> g (tDiff . chunk t) lb
-        (t, Just (w, sb')) -> (tDiff $ chunk t Empty, Just (w, B.chunk sb' lb))
-    g tDiff _ = (tDiff Empty, Nothing)
-
-decodeChunks :: (S.ByteString -> S.ByteString -> DecodeResult T.Text S.ByteString w)
-  -> B.ByteString
-  -> B.ByteString
-  -> DecodeResult Text B.ByteString w
-decodeChunks decoder = g id 0 mempty
-  where
-    g tDiff pos sb0 (B.Chunk sb1 lb1) lb2 =
-      let DecodeResult t mW sb1' pos' = decoder sb0 sb1
-          pos1 = pos + pos'
-      in
-      case mW of
-        Just _ -> DecodeResult (tDiff $ chunk t Empty) mW (B.chunk sb1' $ lb1 `B.append` lb2) pos1
-        _ -> g (tDiff . chunk t) pos1 sb1' lb1 lb2
-    g tDiff pos sb0 _ (B.Chunk sb1 lb1) = g tDiff pos sb0 (B.Chunk sb1 lb1) mempty
-    g tDiff pos sb0 _ _ = DecodeResult (tDiff Empty) Nothing (B.chunk sb0 mempty) pos
-
--- | Decode two 'B.ByteString's containing UTF-8-encoded text as though
--- they were one continuous 'B.ByteString' returning a 'DecodeResult'.
---
--- @since 2.0.2
-decodeUtf8Chunks
-  :: B.ByteString -- ^ The first 'B.ByteString' chunk to decode. Typically this is the undecoded data from the previous call of this function.
-  -> B.ByteString -- ^ The second 'B.ByteString' chunk to decode.
-  -> DecodeResult Text B.ByteString Word8
-decodeUtf8Chunks = decodeChunks TE.decodeUtf8Chunks
-
--- | Decode two 'B.ByteString's containing UTF-16-encoded text as though
--- they were one continuous 'B.ByteString' returning a 'DecodeResult'.
---
--- @since 2.0.2
-decodeUtf16Chunks
-  :: Bool         -- ^ Indicates whether the encoding is big-endian ('True') or little-endian ('False')
-  -> B.ByteString -- ^ The first 'B.ByteString' chunk to decode. Typically this is the undecoded data from the previous call of this function.
-  -> B.ByteString -- ^ The second 'B.ByteString' chunk to decode.
-  -> DecodeResult Text B.ByteString Word16
-decodeUtf16Chunks = decodeChunks . TE.decodeUtf16Chunks
-
--- | Decode two 'B.ByteString's containing UTF-32-encoded text as though
--- they were one continuous 'B.ByteString' returning a 'DecodeResult'.
---
--- @since 2.0.2
-decodeUtf32Chunks
-  :: Bool         -- ^ Indicates whether the encoding is big-endian ('True') or little-endian ('False')
-  -> B.ByteString -- ^ The first 'B.ByteString' chunk to decode. Typically this is the undecoded data from the previous call of this function.
-  -> B.ByteString -- ^ The second 'B.ByteString' chunk to decode.
-  -> DecodeResult Text B.ByteString Word32
-decodeUtf32Chunks = decodeChunks . TE.decodeUtf32Chunks
-
--- | Decode a 'B.ByteString' containing UTF-8 encoded text.
+-- | Decode a 'ByteString' containing UTF-8 encoded text.
 decodeUtf8With :: OnDecodeError -> B.ByteString -> Text
 decodeUtf8With onErr (B.Chunk b0 bs0) =
     case TE.streamDecodeUtf8With onErr b0 of
@@ -198,7 +125,7 @@ decodeUtf8With onErr (B.Chunk b0 bs0) =
     desc = "Data.Text.Lazy.Encoding.decodeUtf8With: Invalid UTF-8 stream"
 decodeUtf8With _ _ = empty
 
--- | Decode a 'B.ByteString' containing UTF-8 encoded text that is known
+-- | Decode a 'ByteString' containing UTF-8 encoded text that is known
 -- to be valid.
 --
 -- If the input contains any invalid UTF-8 data, an exception will be
@@ -209,7 +136,7 @@ decodeUtf8 :: B.ByteString -> Text
 decodeUtf8 = decodeUtf8With strictDecode
 {-# INLINE[0] decodeUtf8 #-}
 
--- | Decode a 'B.ByteString' containing UTF-8 encoded text..
+-- | Decode a 'ByteString' containing UTF-8 encoded text..
 --
 -- If the input contains any invalid UTF-8 data, the relevant
 -- exception will be returned, otherwise the decoded text.
diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs
index bf4d6589..0fabf62e 100644
--- a/tests/Tests/Properties/Transcoding.hs
+++ b/tests/Tests/Properties/Transcoding.hs
@@ -23,7 +23,7 @@ import qualified Data.ByteString.Lazy as BL
 import qualified Data.ByteString.Lazy.Char8 as BLC
 import qualified Data.Text as T
 import qualified Data.Text.Encoding as E
-import qualified Data.Text.Encoding.Common as C
+import qualified Data.Text.Encoding.Error as E
 import qualified Data.Text.Lazy as TL
 import qualified Data.Text.Lazy.Encoding as EL
 
@@ -37,46 +37,16 @@ tl_latin1    = EL.decodeLatin1 `eq` (TL.pack . BLC.unpack)
 
 t_utf8       = (E.decodeUtf8 . E.encodeUtf8) `eq` id
 t_utf8'      = (E.decodeUtf8' . E.encodeUtf8) `eq` (id . Right)
-t_utf8_c     = (\ t ->
-                let E.DecodeResult t' mC bs _ = E.decodeUtf8Chunks mempty $ E.encodeUtf8 t in
-                (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
 tl_utf8      = (EL.decodeUtf8 . EL.encodeUtf8) `eq` id
 tl_utf8'     = (EL.decodeUtf8' . EL.encodeUtf8) `eq` (id . Right)
-tl_utf8_c    = (\ t ->
-                let E.DecodeResult t' mC bs _ = EL.decodeUtf8Chunks mempty $ EL.encodeUtf8 t in
-                (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
 t_utf16LE    = (E.decodeUtf16LE . E.encodeUtf16LE) `eq` id
-t_utf16LE_c  = (\ t ->
-                let E.DecodeResult t' mC bs _ = E.decodeUtf16Chunks False mempty $ E.encodeUtf16LE t in
-                (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
 tl_utf16LE   = (EL.decodeUtf16LE . EL.encodeUtf16LE) `eq` id
-tl_utf16LE_c = (\ t ->
-                let E.DecodeResult t' mC bs _ = EL.decodeUtf16Chunks False mempty $ EL.encodeUtf16LE t in
-                (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
 t_utf16BE    = (E.decodeUtf16BE . E.encodeUtf16BE) `eq` id
-t_utf16BE_c  = (\ t ->
-                let E.DecodeResult t' mC bs _ = E.decodeUtf16Chunks True mempty $ E.encodeUtf16BE t in
-                (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
 tl_utf16BE   = (EL.decodeUtf16BE . EL.encodeUtf16BE) `eq` id
-tl_utf16BE_c = (\ t ->
-                let E.DecodeResult t' mC bs _ = EL.decodeUtf16Chunks True mempty $ EL.encodeUtf16BE t in
-                (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
 t_utf32LE    = (E.decodeUtf32LE . E.encodeUtf32LE) `eq` id
-t_utf32LE_c  = (\ t ->
-                let E.DecodeResult t' mC bs _ = E.decodeUtf32Chunks False mempty $ E.encodeUtf32LE t in
-                (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
 tl_utf32LE   = (EL.decodeUtf32LE . EL.encodeUtf32LE) `eq` id
-tl_utf32LE_c = (\ t ->
-                let E.DecodeResult t' mC bs _ = EL.decodeUtf32Chunks False mempty $ EL.encodeUtf32LE t in
-                (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
 t_utf32BE    = (E.decodeUtf32BE . E.encodeUtf32BE) `eq` id
-t_utf32BE_c  = (\ t ->
-                let E.DecodeResult t' mC bs _ = E.decodeUtf32Chunks True mempty $ E.encodeUtf32BE t in
-                (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
 tl_utf32BE   = (EL.decodeUtf32BE . EL.encodeUtf32BE) `eq` id
-tl_utf32BE_c  = (\ t ->
-                let E.DecodeResult t' mC bs _ = EL.decodeUtf32Chunks True mempty $ EL.encodeUtf32BE t in
-                (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty))
 
 runBuilder :: B.Builder -> B.ByteString
 runBuilder =
@@ -154,13 +124,6 @@ t_utf8_err bad de = forAll (Blind <$> genDecodeErr de) $ \(Blind onErr) -> ioPro
       length (show err) >= 0
     Right _  -> counterexample (show (decoded, l)) $ de /= Strict
 
-t_utf8_c_err :: InvalidUtf8 -> Property
-t_utf8_c_err bad =
-  let E.DecodeResult t mW bs _ = E.decodeUtf8Chunks mempty $ toByteString bad in
-  case mW of
-    Just w -> counterexample (show w) True
-    _  -> counterexample (show t) $ B.length bs > 0
-
 t_utf8_err' :: B.ByteString -> Bool
 t_utf8_err' bs = case E.decodeUtf8' bs of
   Left err -> length (show err) >= 0
@@ -202,132 +165,11 @@ genInvalidUTF8 = B.pack <$> oneof [
     ord3_ n = map fromIntegral [(n `shiftR` 12) + 0xE0, ((n `shiftR` 6) .&. 0x3F) + 0x80, (n .&. 0x3F) + 0x80]
     ord4_ n = map fromIntegral [(n `shiftR` 18) + 0xF0, ((n `shiftR` 12) .&. 0x3F) + 0x80, ((n `shiftR` 6) .&. 0x3F) + 0x80, (n .&. 0x3F) + 0x80]
 
-t_prefix_decode_ascii_1 =
-  let g bs es is =
-        case es of
-          expected : es' -> whenEqProp (E.decodeAsciiPrefix bs) expected $
-            case expected of
-              (_, Just (_, bs')) -> g bs' es' is
-              _ -> f (is, es')
-          _ -> counterexample ("More input than expected: " ++ show is) False
-      f s = case s of
-        (input : is, es) -> g input es is
-        (_, es@(_ : _)) -> counterexample ("More expected output than input: " ++ show es) False
-        _ -> counterexample "" True
-  in
-  f ( [ B.pack [0x68, 0x69, 0x2c, 0x20, 0x83, 0x68, 0x65, 0x6c, 0x6c]
-      , B.pack [0x6f, 0x2c, 0x20, 0x94,
-            0x68, 0x6f, 0x77, 0x20, 0xcc,
-            0x61, 0x72, 0x65, 0x20, 0x79, 0x61, 0x3f]
-      ]
-    , [ ("hi, ", Just (0x83, B.pack [0x68, 0x65, 0x6c, 0x6c]))
-      , ("hell", Nothing)
-      , ("o, ", Just (0x94, B.pack [0x68, 0x6f, 0x77, 0x20, 0xcc,
-              0x61, 0x72, 0x65, 0x20, 0x79, 0x61, 0x3f]))
-      , ("how ", Just (0xcc, B.pack [0x61, 0x72, 0x65, 0x20, 0x79, 0x61, 0x3f]))
-      , ("are ya?", Nothing)
-      ]
-    )
-
--- test multi-word code points split across bytestring chunks
-chunksTests decodeF insExpectedOuts =
-  f mempty insExpectedOuts
-  where
-    f bs s =
-      case s of
-        (input, expected) : s' -> whenEqProp (decodeF bs input) expected $
-          case expected of
-            E.DecodeResult _ _ bs' _ -> f bs' s'
-        _ -> counterexample "" True
-
-t_chunk_decode_utf8_1 = chunksTests E.decodeUtf8Chunks
-  [ ( B.pack [0x68, 0x69, 0x2C, 0x20, 0xC4, 0x89, 0x2C, 0x20,
-        0xe2, 0x98, 0x83, 0x2C, 0x20, 0x61, 0x6e, 0x64,
-        0x20, 0xF0, 0x90, 0x90, 0xB7, 0x21]
-    , E.DecodeResult "hi, ĉ, ☃, and \x10437!" Nothing mempty 22
-    )
-  ]
-t_chunk_decode_utf8_2 = chunksTests E.decodeUtf8Chunks
-  [ ( B.pack [97, 0xC2, 97]
-    , E.DecodeResult (T.singleton 'a') (Just 0xC2) (B.singleton 97) 2
-    )
-  , ( mempty
-    , E.DecodeResult (T.singleton 'a') Nothing mempty 1
-    )
-  ]
-t_chunk_decode_utf8_3 = chunksTests E.decodeUtf8Chunks
-  [ ( B.pack [104, 105, 32, 0xe2]
-    , E.DecodeResult "hi " Nothing (B.singleton 0xe2) 3
-    )
-  , ( B.singleton 0x98
-    , E.DecodeResult "" Nothing (B.pack [0xe2, 0x98]) 0
-    )
-  , ( B.pack [0x83, 32, 0xFF]
-    , E.DecodeResult "☃ " (Just 0xFF) mempty 5
-    )
-  ]
--- test multi-word code points split across bytestring chunks
-t_chunk_decode_utf8_4 = chunksTests E.decodeUtf8Chunks
-  [ (B.pack [0x68, 0x69, 0x2C, 0x20, 0xC4], E.DecodeResult "hi, " Nothing (B.singleton 0xc4) 4)
-  , (B.pack [0x89, 0x2C, 0x20, 0xe2], E.DecodeResult "ĉ, " Nothing (B.singleton 0xe2) 4)
-  , (B.singleton 0x98, E.DecodeResult "" Nothing (B.pack [0xe2, 0x98]) 0)
-  , (B.pack [0x83, 0x2C, 0x20, 0x61, 0x6e, 0x64, 0x20, 0xF0], E.DecodeResult "☃, and " Nothing (B.singleton 0xF0) 9)
-  , (B.singleton 0x90, E.DecodeResult "" Nothing (B.pack [0xF0, 0x90]) 0)
-  , (B.singleton 0x90, E.DecodeResult "" Nothing (B.pack [0xF0, 0x90, 0x90]) 0)
-  , (B.pack [0xB7, 0x21], E.DecodeResult "\x10437!" Nothing mempty 5)
-  ]
-
-t_chunk_decode_utf16BE = chunksTests (E.decodeUtf16Chunks True)
-  [ ( B.pack [0]
-    , E.DecodeResult T.empty Nothing (B.pack [0]) 0
-    )
-  , ( B.pack [104, 0, 105, 0, 32, 0xD8, 0x01]
-    , E.DecodeResult "hi " Nothing (B.pack [0xD8, 0x01]) 6
-    )
-  , ( B.pack [0xDC, 0x37, 0, 32, 0xDC, 0]
-    , E.DecodeResult "\x10437 " (Just 0xDC00) mempty 8
-    )
-  ]
-t_chunk_decode_utf16LE = chunksTests (E.decodeUtf16Chunks False)
-  [ ( B.pack [104]
-    , E.DecodeResult T.empty Nothing (B.pack [104]) 0
-    )
-  , ( B.pack [0, 105, 0, 32, 0, 0x01, 0xD8]
-    , E.DecodeResult "hi " Nothing (B.pack [0x01, 0xD8]) 6
-    )
-  , ( B.pack [0x37, 0xDC, 32, 0, 0, 0xDC]
-    , E.DecodeResult "\x10437 " (Just 0xDC) mempty 8
-    )
-  ]
-
-t_chunk_decode_utf32BE = chunksTests (E.decodeUtf32Chunks True)
-  [ ( B.pack [0, 0, 0, 104, 0, 0, 0, 105, 0, 0]
-    , E.DecodeResult "hi" Nothing (B.pack [0, 0]) 8
-    )
-  , ( B.pack [0, 32, 0, 0, 0x26]
-    , E.DecodeResult " " Nothing (B.pack [0, 0, 0x26]) 4
-    )
-  , ( B.pack [0x03, 0, 0, 0, 32, 0, 0, 0xD8, 0]
-    , E.DecodeResult "☃ " (Just 0xD800) mempty 12
-    )
-  ]
-t_chunk_decode_utf32LE = chunksTests (E.decodeUtf32Chunks False)
-  [ ( B.pack [104, 0, 0, 0, 105, 0, 0, 0, 0x20, 0]
-    , E.DecodeResult "hi" Nothing (B.pack [0x20, 0]) 8
-    )
-  , ( B.pack [0, 0, 0x03, 0x26, 0]
-    , E.DecodeResult " " Nothing (B.pack [0x03, 0x26, 0]) 4
-    )
-  , ( B.pack [0, 32, 0, 0, 0, 0, 0xD8, 0, 0]
-    , E.DecodeResult "☃ " (Just 0xD80000) mempty 12
-    )
-  ]
-
 decodeLL :: BL.ByteString -> TL.Text
-decodeLL = EL.decodeUtf8With C.lenientDecode
+decodeLL = EL.decodeUtf8With E.lenientDecode
 
 decodeL :: B.ByteString -> T.Text
-decodeL = E.decodeUtf8With C.lenientDecode
+decodeL = E.decodeUtf8With E.lenientDecode
 
 -- The lenient decoding of lazy bytestrings should not depend on how they are chunked,
 -- and it should behave the same as decoding of strict bytestrings.
@@ -360,7 +202,7 @@ t_decode_with_error4' =
 t_decode_with_error5' = ioProperty $ do
   ret <- Exception.try $ Exception.evaluate $ E.streamDecodeUtf8 (B.pack [0x81])
   pure $ case ret of
-    Left (_ :: C.UnicodeException) -> True
+    Left (_ :: E.UnicodeException) -> True
     Right{} -> False
 
 t_infix_concat bs1 text bs2 =
@@ -377,28 +219,18 @@ testTranscoding =
     testProperty "tl_latin1" tl_latin1,
     testProperty "t_utf8" t_utf8,
     testProperty "t_utf8'" t_utf8',
-    testProperty "t_utf8_c" t_utf8_c,
     testProperty "t_utf8_incr" t_utf8_incr,
     testProperty "t_utf8_undecoded" t_utf8_undecoded,
     testProperty "tl_utf8" tl_utf8,
     testProperty "tl_utf8'" tl_utf8',
-    testProperty "tl_utf8_c" tl_utf8_c,
     testProperty "t_utf16LE" t_utf16LE,
-    testProperty "t_utf16LE_c" t_utf16LE_c,
     testProperty "tl_utf16LE" tl_utf16LE,
-    testProperty "tl_utf16LE_c" tl_utf16LE_c,
     testProperty "t_utf16BE" t_utf16BE,
-    testProperty "t_utf16BE_c" t_utf16BE_c,
     testProperty "tl_utf16BE" tl_utf16BE,
-    testProperty "tl_utf16BE_c" tl_utf16BE_c,
     testProperty "t_utf32LE" t_utf32LE,
-    testProperty "t_utf32LE_c" t_utf32LE_c,
     testProperty "tl_utf32LE" tl_utf32LE,
-    testProperty "tl_utf32LE_c" tl_utf32LE_c,
     testProperty "t_utf32BE" t_utf32BE,
-    testProperty "t_utf32BE_c" t_utf32BE_c,
     testProperty "tl_utf32BE" tl_utf32BE,
-    testProperty "tl_utf32BE_c" tl_utf32BE_c,
     testGroup "builder" [
       testProperty "t_encodeUtf8Builder" t_encodeUtf8Builder,
       testProperty "t_encodeUtf8Builder_nonZeroOffset" t_encodeUtf8Builder_nonZeroOffset,
@@ -408,19 +240,9 @@ testTranscoding =
     ],
     testGroup "errors" [
       testProperty "t_utf8_err" t_utf8_err,
-      testProperty "t_utf8_c_err" t_utf8_c_err,
       testProperty "t_utf8_err'" t_utf8_err'
     ],
     testGroup "error recovery" [
-      testProperty "t_chunk_decode_utf8_1" t_chunk_decode_utf8_1,
-      testProperty "t_chunk_decode_utf8_2" t_chunk_decode_utf8_2,
-      testProperty "t_chunk_decode_utf8_3" t_chunk_decode_utf8_3,
-      testProperty "t_chunk_decode_utf8_4" t_chunk_decode_utf8_4,
-      testProperty "t_prefix_decode_ascii_1" t_prefix_decode_ascii_1,
-      testProperty "t_chunk_decode_utf16BE" t_chunk_decode_utf16BE,
-      testProperty "t_chunk_decode_utf16LE" t_chunk_decode_utf16LE,
-      testProperty "t_chunk_decode_utf32BE" t_chunk_decode_utf32BE,
-      testProperty "t_chunk_decode_utf32LE" t_chunk_decode_utf32LE,
       testProperty "t_decode_utf8_lenient" t_decode_utf8_lenient,
       testProperty "t_decode_with_error2" t_decode_with_error2,
       testProperty "t_decode_with_error3" t_decode_with_error3,
diff --git a/tests/Tests/QuickCheckUtils.hs b/tests/Tests/QuickCheckUtils.hs
index a09b65b1..8f36f7ec 100644
--- a/tests/Tests/QuickCheckUtils.hs
+++ b/tests/Tests/QuickCheckUtils.hs
@@ -27,8 +27,6 @@ module Tests.QuickCheckUtils
     , eqPSqrt
 
     , write_read
-
-    , whenEqProp
     ) where
 
 import Control.Arrow ((***))
@@ -44,7 +42,7 @@ import Tests.Utils
 import qualified Data.ByteString as B
 import qualified Data.ByteString.Lazy as BL
 import qualified Data.Text as T
-import qualified Data.Text.Encoding.Common as T
+import qualified Data.Text.Encoding.Error as T
 import qualified Data.Text.Internal.Fusion as TF
 import qualified Data.Text.Internal.Fusion.Common as TF
 import qualified Data.Text.Internal.Lazy as TL
@@ -286,9 +284,3 @@ newtype SkewedBool = Skewed { getSkewed :: Bool }
 
 instance Arbitrary SkewedBool where
   arbitrary = Skewed <$> frequency [(1, pure False), (5, pure True)]
-
--- like Control.Monad.when, but with properties instead of monad values
-whenEqProp :: (Eq a, Show a) => a -> a -> Property -> Property
-whenEqProp a b next = if a == b
-  then next
-  else a === b
diff --git a/tests/Tests/Regressions.hs b/tests/Tests/Regressions.hs
index 7828bc75..cf011680 100644
--- a/tests/Tests/Regressions.hs
+++ b/tests/Tests/Regressions.hs
@@ -21,7 +21,7 @@ import qualified Data.ByteString.Lazy as LB
 import qualified Data.Text as T
 import qualified Data.Text.Array as TA
 import qualified Data.Text.Encoding as TE
-import qualified Data.Text.Encoding.Common as E
+import qualified Data.Text.Encoding.Error as E
 import qualified Data.Text.Internal as T
 import qualified Data.Text.IO as T
 import qualified Data.Text.Lazy as LT
diff --git a/text.cabal b/text.cabal
index cabb1582..488f1451 100644
--- a/text.cabal
+++ b/text.cabal
@@ -137,7 +137,6 @@ library
     Data.Text.Array
     Data.Text.Encoding
     Data.Text.Encoding.Error
-    Data.Text.Encoding.Common
     Data.Text.Foreign
     Data.Text.IO
     Data.Text.Internal

From 423ef2baac797bb737b99c320c0c7b70018f8bba Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Sun, 2 Oct 2022 10:10:54 -0600
Subject: [PATCH 43/87] Update changelog

---
 changelog.md | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/changelog.md b/changelog.md
index 2f529fc6..1097f8af 100644
--- a/changelog.md
+++ b/changelog.md
@@ -1,12 +1,13 @@
 ### 2.0.2
 
 * A new suite of total decoders have been added in `Data.Text.Encoding`
-  and `Data.Text.Lazy.Encoding` that allow decoding to be aborted on
-  errors without the need to raise an `error` and `catch` it elsewhere:
+  that allow decoding to be aborted on errors without the need to raise
+  an `error` and `catch` it elsewhere:
   * `decodeAsciiPrefix`
-  * `decodeUtf8Chunks`
-  * `decodeUtf16Chunks`
-  * `decodeUtf32Chunks`
+  * `decodeUtf8Chunk`
+  * `decodeNextUtf8Chunk`
+  * `recoverFromUtf8Error`
+  * `outAvailableUtf8Text`
 
 ### 2.0.1
 

From d2aa0312739ec1284621730b417ffd3a522c4e80 Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Wed, 5 Oct 2022 19:46:02 -0600
Subject: [PATCH 44/87] When possible copy whole bytestring at once regardless
 of code point boundaries

---
 changelog.md                            |   6 +-
 src/Data/Text/Encoding.hs               | 150 +++++++++++++++---------
 src/Data/Text/Internal/Encoding/Utf8.hs |   4 +-
 tests/Tests/Properties/Transcoding.hs   |   4 +
 4 files changed, 101 insertions(+), 63 deletions(-)

diff --git a/changelog.md b/changelog.md
index 1097f8af..c2c72420 100644
--- a/changelog.md
+++ b/changelog.md
@@ -1,8 +1,8 @@
 ### 2.0.2
 
-* A new suite of total decoders have been added in `Data.Text.Encoding`
-  that allow decoding to be aborted on errors without the need to raise
-  an `error` and `catch` it elsewhere:
+* A suite of functions have been added in `Data.Text.Encoding` that
+  allow decoding to be aborted on errors without the need to raise an
+  `error` and `catch` it elsewhere:
   * `decodeAsciiPrefix`
   * `decodeUtf8Chunk`
   * `decodeNextUtf8Chunk`
diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index db0ef035..5fe6b98a 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -80,6 +80,7 @@ module Data.Text.Encoding
 import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
 
 import Control.Exception (evaluate, try)
+import Control.Monad (when)
 import Control.Monad.ST (runST)
 import Data.Bits (shiftR, (.&.))
 import Data.ByteString (ByteString)
@@ -226,10 +227,24 @@ foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8
 --
 -- @since 2.0.2
 data Utf8DecodeState = Utf8DecodeState
-  (Maybe (Utf8CodePointState, Int))
+  -- Code point decode state or error
+  (Maybe
+      -- self-explanatory (I hope)
+    ( Utf8CodePointState
+      -- Count of Word8s that have been evaluated so far for this code point.
+      -- The first word is specified by Position indicator below
+    , Int
+    ))
+  -- ByteStrings containing data whose evaluations are unfinished
   [ByteString]
+  -- Postion within the lead ByteString of either unfinished evaluated data or
+  -- the first word of an invalid code point.
   Int
-  [Either (Text, Int) (ByteString, Int, Int, Int)]
+  -- the first non-listed Word8 in the lead ByteString.
+  Int
+  -- Queued text data.
+  [Either Text (ByteString, Int, Int)]
+  -- Word8 length of listed data.
   Int
   deriving (Show)
 
@@ -237,7 +252,7 @@ data Utf8DecodeState = Utf8DecodeState
 --
 -- @since 2.0.2
 startUtf8State :: Utf8DecodeState
-startUtf8State = Utf8DecodeState (Just (utf8StartState, 0)) [] 0 [] 0
+startUtf8State = Utf8DecodeState (Just (utf8StartState, 0)) [] 0 0 [] 0
 
 -- | Takes whatever data has been decoded thus far and spits it out as a `Text`
 -- value and a `Utf8DecodeState` value that no longer references the decoded
@@ -246,24 +261,36 @@ startUtf8State = Utf8DecodeState (Just (utf8StartState, 0)) [] 0 [] 0
 --
 -- @since 2.0.2
 outAvailableUtf8Text :: Utf8DecodeState -> (Text, Utf8DecodeState)
-outAvailableUtf8Text (Utf8DecodeState mCpSt bss bs1Off dataStack tLen) =
-  if tLen > 0
+outAvailableUtf8Text st@(Utf8DecodeState mCpSt bss bs1Off tbpPos dataStack tLen) =
+  let tbpLen = bs1Off - tbpPos
+      totalLen = tLen + tbpLen
+  in
+  if totalLen > 0
   then runST $ do
-    dst <- A.new tLen
-    mapM_ (\ dat ->
-      case dat of
-        Left ((Text arr0 off utf8Len), dstOff) -> A.copyI utf8Len dst dstOff arr0 off
-        Right (bs, bsOff, utf8Len, dstOff) ->
-          withBS bs $ \ fp _ ->
-            unsafeIOToST . unsafeWithForeignPtr fp $ \ src ->
-              unsafeSTToIO $ A.copyFromPointer dst dstOff (src `plusPtr` bsOff) utf8Len
-      ) dataStack
+    dst <- A.new totalLen
+    when (tbpLen > 0) . withBS (head bss) $ \ fp _ ->
+      unsafeIOToST . unsafeWithForeignPtr fp $ \ src ->
+        unsafeSTToIO $ A.copyFromPointer dst tLen (src `plusPtr` tbpPos) tbpLen
+    let g (dat : dataStack') tLen' =
+            (case dat of
+              Left (Text arr0 off utf8Len) -> do
+                let dstOff = tLen' - utf8Len
+                A.copyI utf8Len dst dstOff arr0 off
+                pure dstOff
+              Right (bs, bsOff, utf8Len) -> do
+                let dstOff = tLen' - utf8Len
+                withBS bs $ \ fp _ ->
+                  unsafeIOToST . unsafeWithForeignPtr fp $ \ src ->
+                    unsafeSTToIO $ A.copyFromPointer dst dstOff (src `plusPtr` bsOff) utf8Len
+                pure dstOff) >>= g dataStack'
+        g _ _ = pure ()
+    g dataStack tLen
     arr <- A.unsafeFreeze dst
-    pure (Text arr 0 tLen, Utf8DecodeState mCpSt bss bs1Off [] 0)
-  else (empty, Utf8DecodeState mCpSt bss bs1Off dataStack tLen)
+    pure (Text arr 0 totalLen, Utf8DecodeState mCpSt bss bs1Off bs1Off [] 0)
+  else (empty, st)
 
 getCodePointStateOrError :: Utf8DecodeState -> Either Int Int
-getCodePointStateOrError (Utf8DecodeState mCpSt (bs1@(B.length -> len1) : bss') bs1Off _ _) =
+getCodePointStateOrError (Utf8DecodeState mCpSt (bs1@(B.length -> len1) : bss') bs1Off _ _ _) =
   case mCpSt of
     Nothing ->
       let (lenInit, _) = foldr
@@ -275,13 +302,14 @@ getCodePointStateOrError (Utf8DecodeState mCpSt (bs1@(B.length -> len1) : bss')
 getCodePointStateOrError _ = Right 0
 
 decodeUtf8Chunks :: Utf8DecodeState -> (Either Int Int, Utf8DecodeState)
-decodeUtf8Chunks st@(Utf8DecodeState _ [] _ _ _) = (getCodePointStateOrError st, st)
-decodeUtf8Chunks st@(Utf8DecodeState Nothing _ _ _ _) = (getCodePointStateOrError st, st)
+decodeUtf8Chunks st@(Utf8DecodeState _ [] _ _ _ _) = (getCodePointStateOrError st, st)
+decodeUtf8Chunks st@(Utf8DecodeState Nothing _ _ _ _ _) = (getCodePointStateOrError st, st)
 decodeUtf8Chunks
   st@(Utf8DecodeState
     (Just (cpSt, cpPos))
     bss@(bs1@(B.length -> len1) : bss')
     bs1Off
+    tbpPos
     dataStack
     tLen
   )
@@ -300,7 +328,7 @@ bs1Off = 0  cpPos = Δbs1Off
           ^---------------------^
                 len
   --}
-  let len1_ = len1 - bs1Off -- the length of the trailing portion of the first bytesting that's to be evaluated.
+  let len1_ = len1 - bs1Off -- the length of the trailing portion of the first bytestring that's to be evaluated.
       (lenInit, (lenN, bsN)) = foldr
         (\ bs@(B.length -> len') (lenInit', (lenN', _)) ->
           (lenInit' + lenN', (len', bs))) (0, (len1_, bs1)) bss'
@@ -333,21 +361,22 @@ bs1Off = 0  cpPos = Δbs1Off
             w n test = len >= n && test (index $ len - n)
             wc n mask word8 = w n $ (word8 ==) . (mask .&.)
             wi n word8 = w n (>= word8)
-        -- queue the available valid data, trim the fat, and leave a spot for the code point state/error.
-        stackValidUtf8 wordCount mCps =
+        -- push the available valid data on to the list, and remove completely evaulated bytestrings.
+        pushValidUtf8 wordCount mCps =
           let bs1Off' = bs1Off + wordCount
-              (bs1Off''', bss'''', dataStack'') =
+              (bss'''', bs1Off''', tbpPos'', dataStack'', tLen'') =
                 if wordCount < len1_
-                then (bs1Off', bss, Right (bs1, bs1Off, wordCount, tLen) : dataStack)
+                then (bss, bs1Off', tbpPos, dataStack, tLen)
                 else
-                  let stackValidUtf8' wordCount' bs1Off'' tLen' bss''@(bs'@(B.length -> len') : bss''') dataStack' =
+                  let pushValidUtf8' wordCount' bss''@(bs'@(B.length -> len') : bss''') bs1Off'' tbpPos' dataStack' tLen' =
                         if wordCount' < len'
-                        then (bs1Off'', bss'', Right (bs', 0, wordCount', tLen') : dataStack')
-                        else stackValidUtf8' (wordCount' - len') (bs1Off'' - len') (tLen' + len') bss''' (Right (bs', 0, len', tLen') : dataStack')
-                      stackValidUtf8' _ _ _ _ dataStack' = (0, [], dataStack')
+                        then (bss'', bs1Off'', tbpPos', dataStack', tLen')
+                        else pushValidUtf8' (wordCount' - len') bss''' (bs1Off'' - len') 0 (Right (bs', 0, len') : dataStack') (tLen' + len')
+                      pushValidUtf8' _ _ bs1Off'' tbpPos' dataStack' tLen' = ([], bs1Off'', tbpPos', dataStack', tLen')
+                      bs1WordCount = bs1Off - tbpPos + len1_
                   in
-                  stackValidUtf8' (wordCount - len1_) (bs1Off' - len1) (tLen + len1_) bss' (Right (bs1, bs1Off, len1_, tLen) : dataStack)
-              st' = Utf8DecodeState mCps bss'''' bs1Off''' dataStack'' $ tLen + wordCount
+                  pushValidUtf8' (wordCount - len1_) bss' (bs1Off' - len1) 0 (Right (bs1, tbpPos, bs1WordCount) : dataStack) (tLen + bs1WordCount)
+              st' = Utf8DecodeState mCps bss'''' bs1Off''' tbpPos'' dataStack'' tLen''
           in
           (getCodePointStateOrError st', st')
         huntDownError off ndx cps =
@@ -361,8 +390,8 @@ bs1Off = 0  cpPos = Δbs1Off
                   then ndx'
                   else off
                 ) ndx' cps'
-              Nothing -> stackValidUtf8 off Nothing
-          else stackValidUtf8 off $ Just (cps, ndx - off)
+              Nothing -> pushValidUtf8 off Nothing
+          else pushValidUtf8 off $ Just (cps, ndx - off)
     in
     -- did we find the boundary?
     case guessUtf8Boundary of
@@ -382,7 +411,7 @@ bs1Off = 0  cpPos = Δbs1Off
                             Just cpSt''' -> getEndState (ndx + 1) cpSt'''
                         | otherwise = Just (cpSt'', ndx - boundary)
                       soFarSoGood =
-                        stackValidUtf8 boundary $ getEndState boundary cpSt'
+                        pushValidUtf8 boundary $ getEndState boundary cpSt'
                   in
                   -- are we at the boundary?
                   if boundary == cpPos'
@@ -397,7 +426,7 @@ bs1Off = 0  cpPos = Δbs1Off
                       -- keep walking the data until we get to bsN or an error
                       case updateUtf8State (index cpPos') cpSt' of
                         Just cpSt'' -> checkIncompleteCodePoint cpSt'' (cpPos' + 1)
-                        Nothing -> (Left (-lenInit), Utf8DecodeState Nothing bss bs1Off dataStack tLen)
+                        Nothing -> (Left (-lenInit), Utf8DecodeState Nothing bss bs1Off tbpPos dataStack tLen)
                     -- no: we're in bsN
                     else let
                           off = (if lenInit > 0
@@ -436,10 +465,10 @@ bs1Off = 0  cpPos = Δbs1Off
                     case updateUtf8State (index cpPos') cpSt' of
                       Just cpSt'' -> checkIncompleteCodePoint cpSt'' (cpPos' + 1)
                       -- just enough additional data to find an error with the code point
-                      Nothing -> (Left (-lenInit), Utf8DecodeState Nothing bss bs1Off dataStack tLen)
+                      Nothing -> (Left (-lenInit), Utf8DecodeState Nothing bss bs1Off tbpPos dataStack tLen)
                   else
                     -- didn't get enough additional data to complete the code point
-                    (Right cpPos', Utf8DecodeState (Just (cpSt', cpPos')) bss bs1Off dataStack tLen)
+                    (Right cpPos', Utf8DecodeState (Just (cpSt', cpPos')) bss bs1Off tbpPos dataStack tLen)
           in
           checkIncompleteCodePoint cpSt cpPos
         -- no, we're past the boundary
@@ -456,7 +485,7 @@ bs1Off = 0  cpPos = Δbs1Off
           ( case mCpStLen of
             Nothing -> Left (-lenInit)
             Just _ -> Right len
-          , Utf8DecodeState mCpStLen bss bs1Off dataStack tLen
+          , Utf8DecodeState mCpStLen bss bs1Off tbpPos dataStack tLen
           )
       -- no: there's an error
       Nothing -> huntDownError 0 cpPos cpSt
@@ -484,10 +513,10 @@ decodeNextUtf8Chunk ::
   HasCallStack =>
 #endif
  ByteString -> Utf8DecodeState -> (Either Int Int, Utf8DecodeState)
-decodeNextUtf8Chunk _ st@(Utf8DecodeState Nothing _ _ _ _) = (getCodePointStateOrError st, st)
-decodeNextUtf8Chunk bs@(B.length -> len) st@(Utf8DecodeState mCpSt bss bs1Off dataStack tLen)
+decodeNextUtf8Chunk _ st@(Utf8DecodeState Nothing _ _ _ _ _) = (getCodePointStateOrError st, st)
+decodeNextUtf8Chunk bs@(B.length -> len) st@(Utf8DecodeState mCpSt bss bs1Off tbpPos dataStack tLen)
   | len == 0 = (getCodePointStateOrError st, st)
-  | otherwise = decodeUtf8Chunks $ Utf8DecodeState mCpSt (bss ++ [bs]) bs1Off dataStack tLen
+  | otherwise = decodeUtf8Chunks $ Utf8DecodeState mCpSt (bss ++ [bs]) bs1Off tbpPos dataStack tLen
 
 -- | Decodes a 'ByteString' from a clean state.
 --
@@ -511,29 +540,36 @@ recoverFromUtf8Error ::
   HasCallStack =>
 #endif
  Text -> Utf8DecodeState -> (Either Int Int, Utf8DecodeState)
-recoverFromUtf8Error t@(Text _ _ utf8Len) (Utf8DecodeState mCpSt bss@((B.length -> len) : bss') bs1Off dataStack tLen) =
-  let dammit mCpSt' bss'' bs1Off' = decodeUtf8Chunks . Utf8DecodeState mCpSt' bss'' bs1Off'
-        ( if utf8Len > 0
-          then Left (t, tLen) : dataStack
-          else dataStack
-        ) $ tLen + utf8Len
+recoverFromUtf8Error t@(Text _ _ utf8Len) (Utf8DecodeState mCpSt bss@(bs1@(B.length -> len) : bss') bs1Off tbpPos dataStack tLen) =
+  let tbpLen = bs1Off - tbpPos
+      (dataStack', tLen') =
+        if tbpLen > 0
+        then (Right (bs1, tbpPos, tbpLen) : dataStack, tLen + tbpLen)
+        else (dataStack, tLen)
+      (dataStack'', tLen'') =
+        if utf8Len > 0
+        then (Left t : dataStack', tLen' + utf8Len)
+        else (dataStack', tLen')
+      (mCpSt'', bss'', bs1Off'') =
+        case mCpSt of
+          Just _ -> (mCpSt, bss, bs1Off)
+          Nothing ->
+            let bs1Off' = bs1Off + 1
+                mCpSt' = Just (utf8StartState, 0)
+            in
+            if bs1Off' == len
+            then (mCpSt', bss', 0)
+            else (mCpSt', bss, bs1Off')
   in
-  case mCpSt of
-    Nothing ->
-      let g = dammit (Just (utf8StartState, 0))
-          bs1Off' = bs1Off + 1
-      in
-      if bs1Off' == len
-      then g bss' 0
-      else g bss bs1Off'
-    Just _ -> dammit mCpSt bss bs1Off
-recoverFromUtf8Error t@(Text _ _ utf8Len) (Utf8DecodeState _ _ _ dataStack tLen) =
+  decodeUtf8Chunks $ Utf8DecodeState mCpSt'' bss'' bs1Off'' bs1Off'' dataStack'' tLen''
+recoverFromUtf8Error t@(Text _ _ utf8Len) (Utf8DecodeState _ _ _ _ dataStack tLen) =
   decodeUtf8Chunks $ Utf8DecodeState
     (Just (utf8StartState, 0))
     []
     0
+    0
     (if utf8Len > 0
-      then Left (t, tLen) : dataStack
+      then Left t : dataStack
       else dataStack
     ) $ tLen + utf8Len
 
diff --git a/src/Data/Text/Internal/Encoding/Utf8.hs b/src/Data/Text/Internal/Encoding/Utf8.hs
index 70f518df..5d4aaebd 100644
--- a/src/Data/Text/Internal/Encoding/Utf8.hs
+++ b/src/Data/Text/Internal/Encoding/Utf8.hs
@@ -263,6 +263,4 @@ updateUtf8State w st = case transitionUtf8State (byteToClass w) st of
   st' -> Just st'
 
 isUtf8StateIsComplete :: Utf8CodePointState -> Bool
-isUtf8StateIsComplete (Utf8CodePointState s)
-  | s == 0 = True
-  | otherwise = False
+isUtf8StateIsComplete (Utf8CodePointState s) = s == 0
diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs
index 0fabf62e..81047d8a 100644
--- a/tests/Tests/Properties/Transcoding.hs
+++ b/tests/Tests/Properties/Transcoding.hs
@@ -205,6 +205,10 @@ t_decode_with_error5' = ioProperty $ do
     Left (_ :: E.UnicodeException) -> True
     Right{} -> False
 
+whenEqProp a b next = if a == b
+  then next
+  else a === b
+
 t_infix_concat bs1 text bs2 =
   forAll (Blind <$> genDecodeErr Replace) $ \(Blind onErr) ->
   text `T.isInfixOf`

From ac6a31cfc0afa047e9ba61d23da42b5ee4073a59 Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Wed, 5 Oct 2022 22:56:43 -0600
Subject: [PATCH 45/87] More test cases

---
 tests/Tests/Properties/Transcoding.hs | 21 ++++++++++++++++++---
 tests/Tests/QuickCheckUtils.hs        |  7 +++++++
 2 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs
index 81047d8a..99e6618b 100644
--- a/tests/Tests/Properties/Transcoding.hs
+++ b/tests/Tests/Properties/Transcoding.hs
@@ -35,6 +35,7 @@ tl_ascii t   = EL.decodeASCII (EL.encodeUtf8 a) === a
 t_latin1     = E.decodeLatin1 `eq` (T.pack . BC.unpack)
 tl_latin1    = EL.decodeLatin1 `eq` (TL.pack . BLC.unpack)
 
+t_utf8_c     = (fst . E.outAvailableUtf8Text . snd . E.decodeUtf8Chunk . E.encodeUtf8) `eq` id
 t_utf8       = (E.decodeUtf8 . E.encodeUtf8) `eq` id
 t_utf8'      = (E.decodeUtf8' . E.encodeUtf8) `eq` (id . Right)
 tl_utf8      = (EL.decodeUtf8 . EL.encodeUtf8) `eq` id
@@ -205,9 +206,21 @@ t_decode_with_error5' = ioProperty $ do
     Left (_ :: E.UnicodeException) -> True
     Right{} -> False
 
-whenEqProp a b next = if a == b
-  then next
-  else a === b
+t_decode_chunk =
+  case E.decodeUtf8Chunk $ B.pack [0xC2, 97] of
+    (result, st) -> whenEqProp result (Left 0) $
+      case E.decodeNextUtf8Chunk (B.pack [0x63, 0x63, 0x63, 0x63, 0x63, 0x63]) st of
+        (result', st') -> whenEqProp result' result $
+          case E.recoverFromUtf8Error "bbbb" st' of
+            (result'', st'') -> whenEqProp result'' (Right 0) $
+              case E.decodeNextUtf8Chunk (B.pack [0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0xc2]) st'' of
+                (result''', st''') -> whenEqProp result''' (Right 1) $
+                  case E.outAvailableUtf8Text st''' of
+                    (t, st'''') -> whenEqProp t "bbbbadddddd" $
+                      case E.decodeNextUtf8Chunk (B.singleton 0x80) st'''' of
+                        (result'''', st''''') -> whenEqProp result'''' (Right 0) $
+                          case E.outAvailableUtf8Text st''''' of
+                            (t', _) -> t' === "\x80"
 
 t_infix_concat bs1 text bs2 =
   forAll (Blind <$> genDecodeErr Replace) $ \(Blind onErr) ->
@@ -221,6 +234,7 @@ testTranscoding =
     testProperty "tl_ascii" tl_ascii,
     testProperty "t_latin1" t_latin1,
     testProperty "tl_latin1" tl_latin1,
+    testProperty "t_utf8_c" t_utf8_c,
     testProperty "t_utf8" t_utf8,
     testProperty "t_utf8'" t_utf8',
     testProperty "t_utf8_incr" t_utf8_incr,
@@ -255,6 +269,7 @@ testTranscoding =
       testProperty "t_decode_with_error3'" t_decode_with_error3',
       testProperty "t_decode_with_error4'" t_decode_with_error4',
       testProperty "t_decode_with_error5'" t_decode_with_error5',
+      testProperty "t_decode_chunk" t_decode_chunk,
       testProperty "t_infix_concat" t_infix_concat
     ]
   ]
diff --git a/tests/Tests/QuickCheckUtils.hs b/tests/Tests/QuickCheckUtils.hs
index 8f36f7ec..9834e92c 100644
--- a/tests/Tests/QuickCheckUtils.hs
+++ b/tests/Tests/QuickCheckUtils.hs
@@ -27,6 +27,7 @@ module Tests.QuickCheckUtils
     , eqPSqrt
 
     , write_read
+    , whenEqProp
     ) where
 
 import Control.Arrow ((***))
@@ -284,3 +285,9 @@ newtype SkewedBool = Skewed { getSkewed :: Bool }
 
 instance Arbitrary SkewedBool where
   arbitrary = Skewed <$> frequency [(1, pure False), (5, pure True)]
+
+-- like 'when' but with 'Property' instead of a monad.
+whenEqProp :: (Eq a, Show a) => a -> a -> Property -> Property
+whenEqProp a b next = if a == b
+  then next
+  else a === b

From 340934e7056393b86f2391360f6571ed818d6478 Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Mon, 10 Oct 2022 22:01:19 -0600
Subject: [PATCH 46/87] start of decomposition of decodeUtf8Chunks

---
 src/Data/Text/Encoding.hs | 130 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 125 insertions(+), 5 deletions(-)

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index 5fe6b98a..d09f47cc 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -217,11 +217,6 @@ decodeAsciiPrefix bs = if B.null bs
             else Nothing
       pure (prefix, suffix)
 
-#ifdef SIMDUTF
-foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8
-    :: Ptr Word8 -> CSize -> IO CInt
-#endif
-
 -- | A value that represents the state of a UTF-8 decoding process potentionally
 -- across multiple 'ByteString's.
 --
@@ -301,6 +296,131 @@ getCodePointStateOrError (Utf8DecodeState mCpSt (bs1@(B.length -> len1) : bss')
     Just (_, cpLen) -> Right cpLen
 getCodePointStateOrError _ = Right 0
 
+#ifdef SIMDUTF
+foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8
+    :: Ptr Word8 -> CSize -> IO CInt
+#endif
+
+{-
+parseUtf8ChunkFrom chunk from = (n, ms)
+
+n is the end index of the longest prefix of chunk that is valid UTF-8 starting
+from index 'from' where the length of the prefix = n - from
+
+ms
+* When ms = Nothing, there is an error: the bytes from index n and beyond are
+  not part of a valid UTF-8 code point.
+* When ms = Just s, all of the remaining bytes from index n and beyond are the
+  beginning of an incomplete UTF-8 code point, and s is the corresponding
+  intermediate decoding state, which can be used to parse the next chunk with
+  `parseUtf8Chunk`
+-}
+parseUtf8ChunkFrom :: ByteString -> Int -> (Int, Maybe Utf8CodePointState)
+parseUtf8ChunkFrom bs@(B.length -> len) pos =
+  let w n test = (len - pos) >= n && test (B.index bs $ len - n)
+      wi n word8 = w n (>= word8)
+      wc n mask word8 = w n $ (word8 ==) . (mask .&.)
+      guessUtf8Boundary
+        | wi 3 0xf0 = Just $ len - 3  -- third to last char starts a four-byte code point
+        | wi 2 0xe0 = Just $ len - 2  -- pre-last char starts a three-or-four-byte code point
+        | wi 1 0xc2 = Just $ len - 1  -- last char starts a two-(or more-)byte code point
+        | wc 4 0xf8 0xf0 ||           -- last four bytes are a four-byte code point
+          wc 3 0xf0 0xe0 ||           -- last three bytes are a three-byte code point
+          wc 2 0xe0 0xc0 ||           -- last two bytes are a two-byte code point
+          w 1 (< 0x80) = Just len     -- last char is ASCII
+        | otherwise = Nothing         -- no clue
+      huntDownError ndx0 ndx s =
+        -- if ndx < len
+        -- then
+          case updateUtf8State (B.index bs ndx) s of
+            Just s' ->
+              let ndx' = ndx + 1 in
+              huntDownError (
+                if isUtf8StateIsComplete s'
+                then ndx'
+                else ndx0
+              ) ndx' s'
+            Nothing -> (ndx0, Nothing)
+        -- else (ndx0, Just s)
+  in
+  case guessUtf8Boundary of
+    Just boundary ->
+      let getEndState ndx s
+            | ndx < len =
+              case updateUtf8State (B.index bs ndx) s of
+                Nothing -> Nothing
+                Just s' -> getEndState (ndx + 1) s'
+            | otherwise = Just s
+      in
+      if pos < boundary
+      then
+        -- is the rest of the bytestring valid utf-8 up to the boundary?
+        if (
+#ifdef SIMDUTF
+            withBS bs $ \ fp _ -> unsafeDupablePerformIO $
+              unsafeWithForeignPtr fp $ \ptr -> (/= 0) <$>
+                c_is_valid_utf8 (plusPtr ptr pos) (fromIntegral $ boundary - pos)
+#elif MIN_VERSION_bytestring(0,11,2)
+            B.isValidUtf8 . B.take (boundary - pos) $ B.drop pos bs
+#else
+            let bLen = boundary - pos
+                step ndx s
+                  | ndx < pos + bLen =
+                    case updateUtf8State (B.unsafeIndex bs ndx) s of
+                    Just s' -> step (ndx + 1) s'
+                    Nothing -> False
+                  | otherwise = isUtf8StateIsComplete s
+            in
+            step pos utf8StartState
+#endif
+          )
+        -- Yes
+        then (pos, getEndState pos utf8StartState)
+        -- No
+        else huntDownError pos pos utf8StartState
+      else (pos, getEndState pos utf8StartState)
+    Nothing -> huntDownError pos pos utf8StartState
+
+{-
+parseUtf8Chunk chunk s = (n, ms)
+
+n
+* When n >= 0, n is the length of the longest prefix of chunk that is valid
+  UTF-8 starting from state s (i.e., n points after the end of a full codepoint,
+  to the beginning of an incomplete codepoint).
+* When n > 0, the starting code point from the previous input is either still
+  incomplete with the additonal chunk or in error.
+
+ms
+* When ms = Nothing, there is an error: the bytes from index n and beyond are
+  not part of a valid UTF-8 code point.
+* When ms = Just s', all of the remaining bytes from index n and beyond are the
+  beginning of an incomplete UTF-8 code point, and s' is the corresponding
+  intermediate decoding state, which can be used to parse the next chunk with
+  `parseUtf8Chunk`
+-}
+parseUtf8Chunk :: ByteString -> Utf8CodePointState -> (Int, Maybe Utf8CodePointState)
+parseUtf8Chunk bs@(B.length -> len) s =
+  let g pos s' =
+        -- first things first. let's try to get to the start of the next code point
+        if isUtf8StateIsComplete s'
+        -- found the beginning of the next code point, hand this off to someone else
+        then parseUtf8ChunkFrom bs pos
+        -- no, code point is not complete yet
+        else
+          -- walk the rest of the code point until error, complete, or no more data
+          if pos < len
+          then
+            case updateUtf8State (B.index bs pos) s' of
+              -- error
+              Nothing -> (-1, Nothing)
+              -- keep going
+              Just s'' -> g (pos + 1) s''
+          -- no more data
+          else (-1, Just s')
+  in
+  g 0 s
+
 decodeUtf8Chunks :: Utf8DecodeState -> (Either Int Int, Utf8DecodeState)
 decodeUtf8Chunks st@(Utf8DecodeState _ [] _ _ _ _) = (getCodePointStateOrError st, st)
 decodeUtf8Chunks st@(Utf8DecodeState Nothing _ _ _ _ _) = (getCodePointStateOrError st, st)

From 05d898eee7f3dc24833c101281bd7e73e2a88f5f Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Tue, 11 Oct 2022 21:44:21 -0600
Subject: [PATCH 47/87] little refactorin' on the prototype.

---
 src/Data/Text/Encoding.hs | 108 +++++++++++++++-----------------------
 1 file changed, 42 insertions(+), 66 deletions(-)

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index d09f47cc..a4db6d29 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -302,10 +302,9 @@ foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8
 #endif
 
 {-
-parseUtf8ChunkFrom chunk from = (n, ms)
+parseUtf8Chunk chunk = (n, ms)
 
-n is the end index of the longest prefix of chunk that is valid UTF-8 starting
-from index 'from' where the length of the prefix = n - from
+n is the end index of the longest prefix of chunk that is valid UTF-8
 
 ms
 * When ms = Nothing, there is an error: the bytes from index n and beyond are
@@ -313,76 +312,52 @@ ms
 * When ms = Just s, all of the remaining bytes from index n and beyond are the
   beginning of an incomplete UTF-8 code point, and s is the corresponding
   intermediate decoding state, which can be used to parse the next chunk with
-  `parseUtf8Chunk`
+  `parseUtf8NextChunk`
 -}
-parseUtf8ChunkFrom :: ByteString -> Int -> (Int, Maybe Utf8CodePointState)
-parseUtf8ChunkFrom bs@(B.length -> len) pos =
-  let w n test = (len - pos) >= n && test (B.index bs $ len - n)
-      wi n word8 = w n (>= word8)
-      wc n mask word8 = w n $ (word8 ==) . (mask .&.)
+parseUtf8Chunk :: ByteString -> (Int, Maybe Utf8CodePointState)
+parseUtf8Chunk bs@(B.length -> len)
+#if defined(SIMDUTF) || MIN_VERSION_bytestring(0,11,2)
+  | guessUtf8Boundary > 0 &&
+    -- the rest of the bytestring valid utf-8 up to the boundary
+    (
+#ifdef SIMDUTF
+      withBS bs $ \ fp _ -> unsafeDupablePerformIO $
+        unsafeWithForeignPtr fp $ \ptr -> (/= 0) <$>
+          c_is_valid_utf8 ptr (fromIntegral guessUtf8Boundary)
+#else
+      B.isValidUtf8 $ B.take guessUtf8Boundary bs
+#endif
+    ) = getEndState guessUtf8Boundary
+    -- No
+  | otherwise = getEndState 0
+    where
+      getEndState ndx = parseUtf8 ndx ndx utf8StartState
+      w n word8 = len >= n && word8 <= (B.index bs $ len - n)
       guessUtf8Boundary
-        | wi 3 0xf0 = Just $ len - 3  -- third to last char starts a four-byte code point
-        | wi 2 0xe0 = Just $ len - 2  -- pre-last char starts a three-or-four-byte code point
-        | wi 1 0xc2 = Just $ len - 1  -- last char starts a two-(or more-)byte code point
-        | wc 4 0xf8 0xf0 ||           -- last four bytes are a four-byte code point
-          wc 3 0xf0 0xe0 ||           -- last three bytes are a three-byte code point
-          wc 2 0xe0 0xc0 ||           -- last two bytes are a two-byte code point
-          w 1 (< 0x80) = Just len     -- last char is ASCII
-        | otherwise = Nothing         -- no clue
-      huntDownError ndx0 ndx s =
-        -- if ndx < len
-        -- then
+        | w 3 0xf0 = len - 3  -- third to last char starts a four-byte code point
+        | w 2 0xe0 = len - 2  -- pre-last char starts a three-or-four-byte code point
+        | w 1 0xc2 = len - 1  -- last char starts a two-(or more-)byte code point
+        | otherwise = len
+#else
+  = parseUtf8 0 0 utf8StartState
+    where
+#endif
+      parseUtf8 ndx0 ndx s =
+        if ndx < len
+        then
           case updateUtf8State (B.index bs ndx) s of
             Just s' ->
               let ndx' = ndx + 1 in
-              huntDownError (
+              parseUtf8 (
                 if isUtf8StateIsComplete s'
                 then ndx'
                 else ndx0
               ) ndx' s'
             Nothing -> (ndx0, Nothing)
-        -- else (ndx0, Just s)
-  in
-  case guessUtf8Boundary of
-    Just boundary ->
-      let getEndState ndx s
-            | ndx < len =
-              case updateUtf8State (B.index bs ndx) s of
-                Nothing -> Nothing
-                Just s' -> getEndState (ndx + 1) s'
-            | otherwise = Just s
-      in
-      if pos < boundary
-      then
-        -- is the rest of the bytestring valid utf-8 up to the boundary?
-        if (
-#ifdef SIMDUTF
-            withBS bs $ \ fp _ -> unsafeDupablePerformIO $
-              unsafeWithForeignPtr fp $ \ptr -> (/= 0) <$>
-                c_is_valid_utf8 (plusPtr ptr pos) (fromIntegral $ boundary - pos)
-#elif MIN_VERSION_bytestring(0,11,2)
-            B.isValidUtf8 . B.take (boundary - pos) $ B.drop pos bs
-#else
-            let bLen = boundary - pos
-                step ndx s
-                  | ndx < pos + bLen =
-                    case updateUtf8State (B.unsafeIndex bs ndx) s of
-                    Just s' -> step (ndx + 1) s'
-                    Nothing -> False
-                  | otherwise = isUtf8StateIsComplete s
-            in
-            step pos utf8StartState
-#endif
-          )
-        -- Yes
-        then (pos, getEndState pos utf8StartState)
-        -- No
-        else huntDownError pos pos utf8StartState
-      else (pos, getEndState pos utf8StartState)
-    Nothing -> huntDownError pos pos utf8StartState
-
+        else (ndx0, Just s)
+  
 {-
-parseUtf8Chunk chunk s = (n, ms)
+parseUtf8NextChunk chunk s = (n, ms)
 
 n
 * When n >= 0, n is the length of the longest prefix of chunk that is valid
@@ -397,15 +372,16 @@ ms
 * When ms = Just s', all of the remaining bytes from index n and beyond are the
   beginning of an incomplete UTF-8 code point, and s' is the corresponding
   intermediate decoding state, which can be used to parse the next chunk with
-  `parseUtf8Chunk`
+  `parseUtf8NextChunk`
 -}
-parseUtf8Chunk :: ByteString -> Utf8CodePointState -> (Int, Maybe Utf8CodePointState)
-parseUtf8Chunk bs@(B.length -> len) s =
+parseUtf8NextChunk :: ByteString -> Utf8CodePointState -> (Int, Maybe Utf8CodePointState)
+parseUtf8NextChunk bs@(B.length -> len) s =
   let g pos s' =
         -- first things first. let's try to get to the start of the next code point
         if isUtf8StateIsComplete s'
         -- found the beginning of the next code point, hand this off to someone else
-        then parseUtf8ChunkFrom bs pos
+        then case parseUtf8Chunk $ B.drop pos bs of
+          (len', s'') -> (pos + len', s'')
         -- no, code point is not complete yet
         else
           -- walk the rest of the code point until error, complete, or no more data

From 344c5c1db1b063537c9ea50171fe0e57855ce618 Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Sun, 16 Oct 2022 19:10:21 -0600
Subject: [PATCH 48/87] parseUtf8Chunk adjustments

---
 src/Data/Text/Encoding.hs             | 219 ++++++++++++++------------
 tests/Tests/Properties/Transcoding.hs |  70 ++++++++
 2 files changed, 184 insertions(+), 105 deletions(-)

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index a4db6d29..cf2b9f50 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -22,13 +22,15 @@
 -- <http://hackage.haskell.org/package/text-icu text-icu package>.
 
 module Data.Text.Encoding
-    (
+    ( Utf8ParseState
+    , parseUtf8Chunk
+    , parseUtf8NextChunk
     -- * Decoding ByteStrings to Text
     -- $strict
 
     -- ** Total Functions #total#
     -- $total
-      decodeLatin1
+    , decodeLatin1
     , decodeUtf8Lenient
     , decodeAsciiPrefix
     , Utf8DecodeState
@@ -217,6 +219,114 @@ decodeAsciiPrefix bs = if B.null bs
             else Nothing
       pure (prefix, suffix)
 
+#ifdef SIMDUTF
+foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8
+    :: Ptr Word8 -> CSize -> IO CInt
+#endif
+
+data Utf8ParseState = Utf8ParseState [ByteString] Utf8CodePointState
+  deriving (Eq, Ord, Show, Read)
+
+{-
+`parseUtf8Chunk chunk = (n, es)`
+
+n is the end index of the longest prefix of chunk that is valid UTF-8
+
+es
+* When `es = Left p`, there is an error: the bytes from index n and beyond are
+  not part of a valid UTF-8 code point, and is the index of the start of the
+  next (possibly valid) code point. `p - n` is the number of invalid bytes.
+* When `es = Right s`, all of the remaining bytes from index n and beyond are the
+  beginning of an incomplete UTF-8 code point, and s is the corresponding
+  intermediate decoding state, which can be used to parse the next chunk with
+  `parseUtf8NextChunk`
+-}
+parseUtf8Chunk :: ByteString -> (Int, Either Int Utf8ParseState)
+parseUtf8Chunk bs@(B.length -> len)
+#if defined(SIMDUTF) || MIN_VERSION_bytestring(0,11,2)
+  | guessUtf8Boundary > 0 &&
+    -- the rest of the bytestring valid utf-8 up to the boundary
+    (
+#ifdef SIMDUTF
+      withBS bs $ \ fp _ -> unsafeDupablePerformIO $
+        unsafeWithForeignPtr fp $ \ptr -> (/= 0) <$>
+          c_is_valid_utf8 ptr (fromIntegral guessUtf8Boundary)
+#else
+      B.isValidUtf8 $ B.take guessUtf8Boundary bs
+#endif
+    ) = getEndState guessUtf8Boundary
+    -- No
+  | otherwise = getEndState 0
+    where
+      getEndState ndx = parseUtf8 ndx ndx utf8StartState
+      w n word8 = len >= n && word8 <= (B.index bs $ len - n)
+      guessUtf8Boundary
+        | w 3 0xf0 = len - 3  -- third to last char starts a four-byte code point
+        | w 2 0xe0 = len - 2  -- pre-last char starts a three-or-four-byte code point
+        | w 1 0xc2 = len - 1  -- last char starts a two-(or more-)byte code point
+        | otherwise = len
+#else
+  = parseUtf8 0 0 utf8StartState
+    where
+#endif
+      parseUtf8 ndx0 ndx s =
+        if ndx < len
+        then
+          let ndx' = ndx + 1 in
+          case updateUtf8State (B.index bs ndx) s of
+            Just s' ->
+              parseUtf8 (
+                if isUtf8StateIsComplete s'
+                then ndx'
+                else ndx0
+              ) ndx' s'
+            Nothing -> (ndx0, Left $ if ndx == ndx0 then ndx' else ndx)
+        else (ndx0, Right $ Utf8ParseState [B.drop ndx0 bs] s)
+
+{-
+parseUtf8NextChunk chunk s = (n, es)
+
+n
+* When n >= 0, n is the length of the longest prefix of chunk that is valid
+  UTF-8 starting from state s (i.e., n points after the end of a full codepoint,
+  to the beginning of an incomplete codepoint).
+* When n > 0, the starting code point from the previous input is either still
+  incomplete with the additonal chunk or in error.
+
+es
+* When es = Left p, there is an error: the bytes from index n and beyond are
+  not part of a valid UTF-8 code point, and is the index of the start of the
+  next (possibly valid) code point. `p - n` is the number of invalid bytes.
+* When es = Right s', all of the remaining bytes from index n and beyond are the
+  beginning of an incomplete UTF-8 code point, and s' is the corresponding
+  intermediate decoding state, which can be used to parse the next chunk with
+  `parseUtf8NextChunk`
+-}
+parseUtf8NextChunk :: ByteString -> Utf8ParseState -> (Int, Either Int Utf8ParseState)
+parseUtf8NextChunk bs@(B.length -> len) st@(Utf8ParseState lead s)
+  | len > 0 =
+    let g pos s' =
+          -- first things first. let's try to get to the start of the next code point
+          if isUtf8StateIsComplete s'
+          -- found the beginning of the next code point, hand this off to someone else
+          then case parseUtf8Chunk $ B.drop pos bs of
+            (len', mS) -> (pos + len', mS)
+          -- no, code point is not complete yet
+          else
+            -- walk the rest of the code point until error, complete, or no more data
+            if pos < len
+            then
+              case updateUtf8State (B.index bs pos) s' of
+                -- error
+                Nothing -> (leadPos, Left pos)
+                -- keep going
+                Just s'' -> g (pos + 1) s''
+            -- no more data
+            else (leadPos, Right $ Utf8ParseState (lead ++ [bs]) s')
+    in g 0 s
+  | otherwise = (leadPos, Right st)
+    where leadPos = -(foldr (\ bs' len' -> len' + B.length bs') 0 lead)
+
 -- | A value that represents the state of a UTF-8 decoding process potentionally
 -- across multiple 'ByteString's.
 --
@@ -296,107 +406,6 @@ getCodePointStateOrError (Utf8DecodeState mCpSt (bs1@(B.length -> len1) : bss')
     Just (_, cpLen) -> Right cpLen
 getCodePointStateOrError _ = Right 0
 
-#ifdef SIMDUTF
-foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8
-    :: Ptr Word8 -> CSize -> IO CInt
-#endif
-
-{-
-parseUtf8Chunk chunk = (n, ms)
-
-n is the end index of the longest prefix of chunk that is valid UTF-8
-
-ms
-* When ms = Nothing, there is an error: the bytes from index n and beyond are
-  not part of a valid UTF-8 code point.
-* When ms = Just s, all of the remaining bytes from index n and beyond are the
-  beginning of an incomplete UTF-8 code point, and s is the corresponding
-  intermediate decoding state, which can be used to parse the next chunk with
-  `parseUtf8NextChunk`
--}
-parseUtf8Chunk :: ByteString -> (Int, Maybe Utf8CodePointState)
-parseUtf8Chunk bs@(B.length -> len)
-#if defined(SIMDUTF) || MIN_VERSION_bytestring(0,11,2)
-  | guessUtf8Boundary > 0 &&
-    -- the rest of the bytestring valid utf-8 up to the boundary
-    (
-#ifdef SIMDUTF
-      withBS bs $ \ fp _ -> unsafeDupablePerformIO $
-        unsafeWithForeignPtr fp $ \ptr -> (/= 0) <$>
-          c_is_valid_utf8 ptr (fromIntegral guessUtf8Boundary)
-#else
-      B.isValidUtf8 $ B.take guessUtf8Boundary bs
-#endif
-    ) = getEndState guessUtf8Boundary
-    -- No
-  | otherwise = getEndState 0
-    where
-      getEndState ndx = parseUtf8 ndx ndx utf8StartState
-      w n word8 = len >= n && word8 <= (B.index bs $ len - n)
-      guessUtf8Boundary
-        | w 3 0xf0 = len - 3  -- third to last char starts a four-byte code point
-        | w 2 0xe0 = len - 2  -- pre-last char starts a three-or-four-byte code point
-        | w 1 0xc2 = len - 1  -- last char starts a two-(or more-)byte code point
-        | otherwise = len
-#else
-  = parseUtf8 0 0 utf8StartState
-    where
-#endif
-      parseUtf8 ndx0 ndx s =
-        if ndx < len
-        then
-          case updateUtf8State (B.index bs ndx) s of
-            Just s' ->
-              let ndx' = ndx + 1 in
-              parseUtf8 (
-                if isUtf8StateIsComplete s'
-                then ndx'
-                else ndx0
-              ) ndx' s'
-            Nothing -> (ndx0, Nothing)
-        else (ndx0, Just s)
-  
-{-
-parseUtf8NextChunk chunk s = (n, ms)
-
-n
-* When n >= 0, n is the length of the longest prefix of chunk that is valid
-  UTF-8 starting from state s (i.e., n points after the end of a full codepoint,
-  to the beginning of an incomplete codepoint).
-* When n > 0, the starting code point from the previous input is either still
-  incomplete with the additonal chunk or in error.
-
-ms
-* When ms = Nothing, there is an error: the bytes from index n and beyond are
-  not part of a valid UTF-8 code point.
-* When ms = Just s', all of the remaining bytes from index n and beyond are the
-  beginning of an incomplete UTF-8 code point, and s' is the corresponding
-  intermediate decoding state, which can be used to parse the next chunk with
-  `parseUtf8NextChunk`
--}
-parseUtf8NextChunk :: ByteString -> Utf8CodePointState -> (Int, Maybe Utf8CodePointState)
-parseUtf8NextChunk bs@(B.length -> len) s =
-  let g pos s' =
-        -- first things first. let's try to get to the start of the next code point
-        if isUtf8StateIsComplete s'
-        -- found the beginning of the next code point, hand this off to someone else
-        then case parseUtf8Chunk $ B.drop pos bs of
-          (len', s'') -> (pos + len', s'')
-        -- no, code point is not complete yet
-        else
-          -- walk the rest of the code point until error, complete, or no more data
-          if pos < len
-          then
-            case updateUtf8State (B.index bs pos) s' of
-              -- error
-              Nothing -> (-1, Nothing)
-              -- keep going
-              Just s'' -> g (pos + 1) s''
-          -- no more data
-          else (-1, Just s')
-  in
-  g 0 s
-
 decodeUtf8Chunks :: Utf8DecodeState -> (Either Int Int, Utf8DecodeState)
 decodeUtf8Chunks st@(Utf8DecodeState _ [] _ _ _ _) = (getCodePointStateOrError st, st)
 decodeUtf8Chunks st@(Utf8DecodeState Nothing _ _ _ _ _) = (getCodePointStateOrError st, st)
@@ -587,7 +596,7 @@ bs1Off = 0  cpPos = Δbs1Off
       Nothing -> huntDownError 0 cpPos cpSt
 
 -- | Decodes a UTF-8 'ByteString' in the context of what has already been
--- decoded which is represented by the 'Utf8DecodeState' value. Returned is the
+-- decoded which is represented by the 'Utf8ParseState value. Returned is the
 -- new decode state and either ('Right') the number of 'Word8's that make up the
 -- incomplete code point at the end of the input, or ('Left') the start position
 -- of an invalid code point that was encountered. The position is relative to
@@ -622,7 +631,7 @@ decodeNextUtf8Chunk bs@(B.length -> len) st@(Utf8DecodeState mCpSt bss bs1Off tb
 decodeUtf8Chunk :: ByteString -> (Either Int Int, Utf8DecodeState)
 decodeUtf8Chunk = flip decodeNextUtf8Chunk startUtf8State
 
--- | If the 'Utf8DecodeState' value indicates an error state, the 'Word8' that
+-- | If the 'Utf8ParseState value indicates an error state, the 'Word8' that
 -- the state value point to is replaced with the input 'Text' value which may
 -- be empty. Decoding resumes after the text is inserted and produces the result
 -- described by 'decodeNextUtf8Chunk'.
diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs
index 99e6618b..aec54ec0 100644
--- a/tests/Tests/Properties/Transcoding.hs
+++ b/tests/Tests/Properties/Transcoding.hs
@@ -8,6 +8,7 @@ module Tests.Properties.Transcoding
 
 import Data.Bits ((.&.), shiftR)
 import Data.Char (chr, ord)
+import Data.Either (isLeft, isRight)
 import Test.QuickCheck hiding ((.&.))
 import Test.Tasty (TestTree, testGroup)
 import Test.Tasty.QuickCheck (testProperty)
@@ -35,6 +36,62 @@ tl_ascii t   = EL.decodeASCII (EL.encodeUtf8 a) === a
 t_latin1     = E.decodeLatin1 `eq` (T.pack . BC.unpack)
 tl_latin1    = EL.decodeLatin1 `eq` (TL.pack . BLC.unpack)
 
+t_p_utf8_1   = case E.parseUtf8Chunk (B.pack [0x63]) of
+  (result, st) -> whenEqProp result 1 . property $ isRight st
+t_p_utf8_2   = case E.parseUtf8Chunk (B.pack [0x63, 0x63, 0x63]) of
+  (result, st) -> whenEqProp result 3 . property $ isRight st
+t_p_utf8_3   = case E.parseUtf8Chunk (B.pack [0x63, 0x63, 0xc2, 0x80, 0x63]) of
+  (result, st) -> whenEqProp result 5 . property $ isRight st
+t_p_utf8_4   = case E.parseUtf8Chunk (B.pack [0x63, 0xe1, 0x80, 0x80, 0x63]) of
+  (result, st) -> whenEqProp result 5 . property $ isRight st
+t_p_utf8_5   = case E.parseUtf8Chunk (B.pack [0xF0, 0x90, 0x80, 0x80, 0x63]) of
+  (result, st) -> whenEqProp result 5 . property $ isRight st
+t_p_utf8_6   = case E.parseUtf8Chunk (B.pack [0x63, 0x63, 0xF0, 0x90, 0x80]) of
+  (result, st) -> whenEqProp result 2 . property $ isRight st
+t_p_utf8_7   = case E.parseUtf8Chunk (B.pack [0x63, 0x63, 0x63, 0xF0, 0x90]) of
+  (result, st) -> whenEqProp result 3 . property $ isRight st
+t_p_utf8_8   = case E.parseUtf8Chunk (B.pack [0xF0, 0x90, 0x80, 0x63, 0x63]) of
+  (result, st) -> whenEqProp result 0 $ st === Left 3
+t_p_utf8_9   = case E.parseUtf8Chunk (B.pack [0x63, 0x63, 0x80, 0x63, 0x63]) of
+  (result, st) -> whenEqProp result 2 $ st === Left 3
+t_p_utf8_0   = case E.parseUtf8Chunk (B.pack [0x63, 0x63, 0xe1, 0x63, 0x63]) of
+  (result, st) -> whenEqProp result 2 $ st === Left 3
+
+t_pn_utf8_1   = case E.parseUtf8Chunk (B.pack [0xF0, 0x90, 0x80]) of
+  (result0, mS) -> whenEqProp result0 0 $
+    case mS of
+      Left _ -> counterexample (show mS) False
+      Right s -> case E.parseUtf8NextChunk (B.pack [0x80]) s of
+        (result1, mS1) -> whenEqProp result1 1 $
+          if isLeft mS1
+          then counterexample (show mS1) False
+          else case E.parseUtf8NextChunk (B.pack [0x7f]) s of
+            (result2, mS2) -> whenEqProp result2 (-3) $ mS2 === Left 0
+t_pn_utf8_2   = case E.parseUtf8Chunk (B.pack [0xF0]) of
+  (result0, mS0) -> whenEqProp result0 0 $
+    case mS0 of
+      Left _ -> counterexample (show mS0) False
+      Right s0 -> case E.parseUtf8NextChunk (B.pack [0x7f]) s0 of
+        (result1, mS1) -> whenEqProp result1 (-1) .
+          whenEqProp mS1 (Left 0) $
+          case E.parseUtf8NextChunk (B.pack [0x90]) s0 of
+            (result2, mS2) -> whenEqProp result2 (-1) $
+              case mS2 of
+                Left _ -> counterexample (show mS2) False
+                Right s1 -> case E.parseUtf8NextChunk (B.pack [0x7f]) s1 of
+                  (result3, mS3) -> whenEqProp result3 (-2) .
+                    whenEqProp mS3 (Left 0) $
+                    case E.parseUtf8NextChunk (B.pack [0x80]) s1 of
+                      (result4, mS4) -> whenEqProp result4 (-2) $
+                        case mS4 of
+                          Left _ -> counterexample (show mS3) False
+                          Right s2 -> case E.parseUtf8NextChunk (B.pack [0x7f]) s2 of
+                            (result5, mS5) -> whenEqProp result5 (-3) .
+                              whenEqProp mS5 (Left 0) $
+                              case E.parseUtf8NextChunk (B.pack [0x80]) s2 of
+                                (result6, mS6) -> whenEqProp result6 1 $
+                                  property $ isRight mS6
+
 t_utf8_c     = (fst . E.outAvailableUtf8Text . snd . E.decodeUtf8Chunk . E.encodeUtf8) `eq` id
 t_utf8       = (E.decodeUtf8 . E.encodeUtf8) `eq` id
 t_utf8'      = (E.decodeUtf8' . E.encodeUtf8) `eq` (id . Right)
@@ -234,6 +291,19 @@ testTranscoding =
     testProperty "tl_ascii" tl_ascii,
     testProperty "t_latin1" t_latin1,
     testProperty "tl_latin1" tl_latin1,
+    testProperty "t_p_utf8_1" t_p_utf8_1,
+    testProperty "t_p_utf8_2" t_p_utf8_2,
+    testProperty "t_p_utf8_3" t_p_utf8_3,
+    testProperty "t_p_utf8_4" t_p_utf8_4,
+    testProperty "t_p_utf8_5" t_p_utf8_5,
+    testProperty "t_p_utf8_6" t_p_utf8_6,
+    testProperty "t_p_utf8_7" t_p_utf8_7,
+    testProperty "t_p_utf8_8" t_p_utf8_8,
+    testProperty "t_p_utf8_9" t_p_utf8_9,
+    testProperty "t_p_utf8_0" t_p_utf8_0,
+    testProperty "t_pn_utf8_1" t_pn_utf8_1,
+    testProperty "t_pn_utf8_2" t_pn_utf8_2,
+    -- testProperty "t_pn_utf8_3" t_pn_utf8_3,
     testProperty "t_utf8_c" t_utf8_c,
     testProperty "t_utf8" t_utf8,
     testProperty "t_utf8'" t_utf8',

From dd1f57d8262165a31f0cadeac6b85d21c6ba8bc0 Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Sun, 16 Oct 2022 22:51:06 -0600
Subject: [PATCH 49/87] a little more prototypin' UTF-8 parser/decoder...

---
 src/Data/Text/Encoding.hs | 117 +++++++++++++++++++++++++++++++-------
 1 file changed, 95 insertions(+), 22 deletions(-)

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index cf2b9f50..e7e07459 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -23,8 +23,17 @@
 
 module Data.Text.Encoding
     ( Utf8ParseState
+    , partialCodePoint
+    , codePointState
     , parseUtf8Chunk
     , parseUtf8NextChunk
+    , TextDataStack
+    , dataStack
+    , stackLen
+    , startUtf8ParseState
+    , injectText
+    , stackToText
+
     -- * Decoding ByteStrings to Text
     -- $strict
 
@@ -224,9 +233,15 @@ foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8
     :: Ptr Word8 -> CSize -> IO CInt
 #endif
 
-data Utf8ParseState = Utf8ParseState [ByteString] Utf8CodePointState
+data Utf8ParseState = Utf8ParseState
+  { partialCodePoint :: [ByteString]
+  , codePointState :: Utf8CodePointState
+  }
   deriving (Eq, Ord, Show, Read)
 
+startUtf8ParseState ::Utf8ParseState 
+startUtf8ParseState = Utf8ParseState [] utf8StartState
+
 {-
 `parseUtf8Chunk chunk = (n, es)`
 
@@ -269,9 +284,8 @@ parseUtf8Chunk bs@(B.length -> len)
   = parseUtf8 0 0 utf8StartState
     where
 #endif
-      parseUtf8 ndx0 ndx s =
-        if ndx < len
-        then
+      parseUtf8 ndx0 ndx s
+        | ndx < len =
           let ndx' = ndx + 1 in
           case updateUtf8State (B.index bs ndx) s of
             Just s' ->
@@ -281,7 +295,7 @@ parseUtf8Chunk bs@(B.length -> len)
                 else ndx0
               ) ndx' s'
             Nothing -> (ndx0, Left $ if ndx == ndx0 then ndx' else ndx)
-        else (ndx0, Right $ Utf8ParseState [B.drop ndx0 bs] s)
+        | otherwise = (ndx0, Right $ Utf8ParseState [B.drop ndx0 bs] s)
 
 {-
 parseUtf8NextChunk chunk s = (n, es)
@@ -305,28 +319,87 @@ es
 parseUtf8NextChunk :: ByteString -> Utf8ParseState -> (Int, Either Int Utf8ParseState)
 parseUtf8NextChunk bs@(B.length -> len) st@(Utf8ParseState lead s)
   | len > 0 =
-    let g pos s' =
+    let g pos s'
           -- first things first. let's try to get to the start of the next code point
-          if isUtf8StateIsComplete s'
-          -- found the beginning of the next code point, hand this off to someone else
-          then case parseUtf8Chunk $ B.drop pos bs of
-            (len', mS) -> (pos + len', mS)
-          -- no, code point is not complete yet
-          else
-            -- walk the rest of the code point until error, complete, or no more data
-            if pos < len
-            then
-              case updateUtf8State (B.index bs pos) s' of
-                -- error
-                Nothing -> (leadPos, Left pos)
-                -- keep going
-                Just s'' -> g (pos + 1) s''
-            -- no more data
-            else (leadPos, Right $ Utf8ParseState (lead ++ [bs]) s')
+          | isUtf8StateIsComplete s' =
+            -- found the beginning of the next code point, hand this off to someone else
+            case parseUtf8Chunk $ B.drop pos bs of
+              (len', mS) -> (pos + len', mS)
+          -- code point is not complete yet
+          -- walk the rest of the code point until error, complete, or no more data
+          | pos < len =
+            case updateUtf8State (B.index bs pos) s' of
+              -- error
+              Nothing -> (leadPos, Left pos)
+              -- keep going
+              Just s'' -> g (pos + 1) s''
+          -- no more data
+          | otherwise = (leadPos, Right $ Utf8ParseState (lead ++ [bs]) s')
     in g 0 s
   | otherwise = (leadPos, Right st)
     where leadPos = -(foldr (\ bs' len' -> len' + B.length bs') 0 lead)
 
+data TextDataStack = TextDataStack
+  { dataStack :: [Either Text ByteString]
+  , stackLen :: Int
+  }
+  deriving Show
+
+injectText :: Text -> TextDataStack -> TextDataStack
+injectText t@(Text _ _ tLen) tds@(TextDataStack stack sLen) =
+  if tLen > 0
+  then TextDataStack (Left t : stack) $ sLen + tLen
+  else tds
+
+stackToText :: TextDataStack -> Text
+stackToText (TextDataStack stack sLen)
+  | sLen > 0 = runST $
+    do
+      dst <- A.new sLen
+      let g (dat : dataStack') tLen' =
+              (case dat of
+                Left (Text arr0 off utf8Len) -> do
+                  let dstOff = tLen' - utf8Len
+                  A.copyI utf8Len dst dstOff arr0 off
+                  pure dstOff
+                Right bs@(B.length -> utf8Len) -> do
+                  let dstOff = tLen' - utf8Len
+                  withBS bs $ \ fp _ ->
+                    unsafeIOToST . unsafeWithForeignPtr fp $ \ src ->
+                      unsafeSTToIO $ A.copyFromPointer dst dstOff src utf8Len
+                  pure dstOff) >>= g dataStack'
+          g _ _ = pure ()
+      g stack sLen
+      arr <- A.unsafeFreeze dst
+      pure $ Text arr 0 sLen
+  | otherwise = empty
+
+decodeNextUtf8Chunk'
+  :: ByteString
+  -> Utf8ParseState
+  -> TextDataStack
+  -> (Either (Int, ByteString) Utf8ParseState, TextDataStack)
+decodeNextUtf8Chunk' bs s tds =
+  case parseUtf8NextChunk bs s of
+    (len, res) ->
+      let stackedData'
+            | len >= 0 =
+              let stackedData@(TextDataStack stack' sLen') =
+                    foldr (\ bs'@(B.length -> bLen) (TextDataStack stack sLen) ->
+                      TextDataStack (Right bs' : stack) $ sLen + bLen) tds $ partialCodePoint s
+              in
+              if len > 0
+              then TextDataStack (Right (B.take len bs) : stack') $ sLen' + len
+              else stackedData
+            | otherwise = tds
+      in
+      case res of
+        Left pos -> (Left (pos, B.drop pos bs), stackedData')
+        Right s' -> (Right s', stackedData')
+
+decodeUtf8Chunk' :: ByteString -> (Either (Int, ByteString) Utf8ParseState, TextDataStack)
+decodeUtf8Chunk' bs = decodeNextUtf8Chunk' bs startUtf8ParseState $ TextDataStack [] 0
+
 -- | A value that represents the state of a UTF-8 decoding process potentionally
 -- across multiple 'ByteString's.
 --

From ee53c15fb3e6762eacee011b0cd37ceff06c0f42 Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Sat, 22 Oct 2022 23:07:01 -0600
Subject: [PATCH 50/87] getting there...

---
 src/Data/Text/Encoding.hs             | 503 ++++----------------------
 tests/Tests/Properties/Transcoding.hs |  62 ++--
 2 files changed, 117 insertions(+), 448 deletions(-)

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index e7e07459..1c4298c3 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -26,12 +26,13 @@ module Data.Text.Encoding
     , partialCodePoint
     , codePointState
     , parseUtf8Chunk
-    , parseUtf8NextChunk
+    , parseNextUtf8Chunk
+    , startUtf8ParseState
     , TextDataStack
     , dataStack
     , stackLen
-    , startUtf8ParseState
-    , injectText
+    , emptyStack
+    , pushText
     , stackToText
 
     -- * Decoding ByteStrings to Text
@@ -42,12 +43,8 @@ module Data.Text.Encoding
     , decodeLatin1
     , decodeUtf8Lenient
     , decodeAsciiPrefix
-    , Utf8DecodeState
-    , startUtf8State
-    , outAvailableUtf8Text
     , decodeNextUtf8Chunk
     , decodeUtf8Chunk
-    , recoverFromUtf8Error
 
     -- *** Catchable failure
     , decodeUtf8'
@@ -91,14 +88,13 @@ module Data.Text.Encoding
 import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
 
 import Control.Exception (evaluate, try)
-import Control.Monad (when)
 import Control.Monad.ST (runST)
 import Data.Bits (shiftR, (.&.))
 import Data.ByteString (ByteString)
 import qualified Data.ByteString as B
 import qualified Data.ByteString.Internal as B
 import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode)
-import Data.Text.Internal (Text(..), empty, append)
+import Data.Text.Internal (Text(..), empty)
 import Data.Text.Internal.Unsafe (unsafeWithForeignPtr)
 import Data.Text.Show as T (singleton)
 import Data.Text.Unsafe (unsafeDupablePerformIO)
@@ -254,7 +250,7 @@ es
 * When `es = Right s`, all of the remaining bytes from index n and beyond are the
   beginning of an incomplete UTF-8 code point, and s is the corresponding
   intermediate decoding state, which can be used to parse the next chunk with
-  `parseUtf8NextChunk`
+  `parseNextUtf8Chunk`
 -}
 parseUtf8Chunk :: ByteString -> (Int, Either Int Utf8ParseState)
 parseUtf8Chunk bs@(B.length -> len)
@@ -295,10 +291,10 @@ parseUtf8Chunk bs@(B.length -> len)
                 else ndx0
               ) ndx' s'
             Nothing -> (ndx0, Left $ if ndx == ndx0 then ndx' else ndx)
-        | otherwise = (ndx0, Right $ Utf8ParseState [B.drop ndx0 bs] s)
+        | otherwise = (ndx0, Right $ Utf8ParseState (if ndx0 < len then [B.drop ndx0 bs] else []) s)
 
 {-
-parseUtf8NextChunk chunk s = (n, es)
+parseNextUtf8Chunk chunk s = (n, es)
 
 n
 * When n >= 0, n is the length of the longest prefix of chunk that is valid
@@ -314,17 +310,17 @@ es
 * When es = Right s', all of the remaining bytes from index n and beyond are the
   beginning of an incomplete UTF-8 code point, and s' is the corresponding
   intermediate decoding state, which can be used to parse the next chunk with
-  `parseUtf8NextChunk`
+  `parseNextUtf8Chunk`
 -}
-parseUtf8NextChunk :: ByteString -> Utf8ParseState -> (Int, Either Int Utf8ParseState)
-parseUtf8NextChunk bs@(B.length -> len) st@(Utf8ParseState lead s)
+parseNextUtf8Chunk :: ByteString -> Utf8ParseState -> (Int, Either Int Utf8ParseState)
+parseNextUtf8Chunk bs@(B.length -> len) st@(Utf8ParseState lead s)
   | len > 0 =
     let g pos s'
           -- first things first. let's try to get to the start of the next code point
           | isUtf8StateIsComplete s' =
             -- found the beginning of the next code point, hand this off to someone else
             case parseUtf8Chunk $ B.drop pos bs of
-              (len', mS) -> (pos + len', mS)
+              (len', mS) -> (pos + len', case mS of Left p -> Left (p + pos); _ -> mS)
           -- code point is not complete yet
           -- walk the rest of the code point until error, complete, or no more data
           | pos < len =
@@ -345,8 +341,11 @@ data TextDataStack = TextDataStack
   }
   deriving Show
 
-injectText :: Text -> TextDataStack -> TextDataStack
-injectText t@(Text _ _ tLen) tds@(TextDataStack stack sLen) =
+emptyStack :: TextDataStack
+emptyStack = TextDataStack [] 0
+
+pushText :: Text -> TextDataStack -> TextDataStack
+pushText t@(Text _ _ tLen) tds@(TextDataStack stack sLen) =
   if tLen > 0
   then TextDataStack (Left t : stack) $ sLen + tLen
   else tds
@@ -374,382 +373,37 @@ stackToText (TextDataStack stack sLen)
       pure $ Text arr 0 sLen
   | otherwise = empty
 
-decodeNextUtf8Chunk'
+decodeNextUtf8Chunk
   :: ByteString
   -> Utf8ParseState
   -> TextDataStack
-  -> (Either (Int, ByteString) Utf8ParseState, TextDataStack)
-decodeNextUtf8Chunk' bs s tds =
-  case parseUtf8NextChunk bs s of
+  -> ((Int, Either (Int, ByteString) Utf8ParseState), TextDataStack)
+decodeNextUtf8Chunk bs s tds =
+  case parseNextUtf8Chunk bs s of
     (len, res) ->
       let stackedData'
             | len >= 0 =
               let stackedData@(TextDataStack stack' sLen') =
-                    foldr (\ bs'@(B.length -> bLen) (TextDataStack stack sLen) ->
-                      TextDataStack (Right bs' : stack) $ sLen + bLen) tds $ partialCodePoint s
+                    foldl (\ tds'@(TextDataStack stack sLen) bs'@(B.length -> bLen) ->
+                      if bLen > 0
+                      then TextDataStack (Right bs' : stack) $ sLen + bLen
+                      else tds'
+                      ) tds $ partialCodePoint s
               in
               if len > 0
               then TextDataStack (Right (B.take len bs) : stack') $ sLen' + len
               else stackedData
             | otherwise = tds
       in
-      case res of
-        Left pos -> (Left (pos, B.drop pos bs), stackedData')
-        Right s' -> (Right s', stackedData')
+      ( ( len
+        , case res of
+            Left pos -> Left (pos, B.drop pos bs)
+            Right s' -> Right s'
+        )
+      , stackedData')
 
-decodeUtf8Chunk' :: ByteString -> (Either (Int, ByteString) Utf8ParseState, TextDataStack)
-decodeUtf8Chunk' bs = decodeNextUtf8Chunk' bs startUtf8ParseState $ TextDataStack [] 0
-
--- | A value that represents the state of a UTF-8 decoding process potentionally
--- across multiple 'ByteString's.
---
--- @since 2.0.2
-data Utf8DecodeState = Utf8DecodeState
-  -- Code point decode state or error
-  (Maybe
-      -- self-explanatory (I hope)
-    ( Utf8CodePointState
-      -- Count of Word8s that have been evaluated so far for this code point.
-      -- The first word is specified by Position indicator below
-    , Int
-    ))
-  -- ByteStrings containing data whose evaluations are unfinished
-  [ByteString]
-  -- Postion within the lead ByteString of either unfinished evaluated data or
-  -- the first word of an invalid code point.
-  Int
-  -- the first non-listed Word8 in the lead ByteString.
-  Int
-  -- Queued text data.
-  [Either Text (ByteString, Int, Int)]
-  -- Word8 length of listed data.
-  Int
-  deriving (Show)
-
--- | This represents the begining state of a UTF-8 decoding process.
---
--- @since 2.0.2
-startUtf8State :: Utf8DecodeState
-startUtf8State = Utf8DecodeState (Just (utf8StartState, 0)) [] 0 0 [] 0
-
--- | Takes whatever data has been decoded thus far and spits it out as a `Text`
--- value and a `Utf8DecodeState` value that no longer references the decoded
--- data. This function operates on error states, but does not clear the error.
--- (See 'recoverFromUtf8Error'.)
---
--- @since 2.0.2
-outAvailableUtf8Text :: Utf8DecodeState -> (Text, Utf8DecodeState)
-outAvailableUtf8Text st@(Utf8DecodeState mCpSt bss bs1Off tbpPos dataStack tLen) =
-  let tbpLen = bs1Off - tbpPos
-      totalLen = tLen + tbpLen
-  in
-  if totalLen > 0
-  then runST $ do
-    dst <- A.new totalLen
-    when (tbpLen > 0) . withBS (head bss) $ \ fp _ ->
-      unsafeIOToST . unsafeWithForeignPtr fp $ \ src ->
-        unsafeSTToIO $ A.copyFromPointer dst tLen (src `plusPtr` tbpPos) tbpLen
-    let g (dat : dataStack') tLen' =
-            (case dat of
-              Left (Text arr0 off utf8Len) -> do
-                let dstOff = tLen' - utf8Len
-                A.copyI utf8Len dst dstOff arr0 off
-                pure dstOff
-              Right (bs, bsOff, utf8Len) -> do
-                let dstOff = tLen' - utf8Len
-                withBS bs $ \ fp _ ->
-                  unsafeIOToST . unsafeWithForeignPtr fp $ \ src ->
-                    unsafeSTToIO $ A.copyFromPointer dst dstOff (src `plusPtr` bsOff) utf8Len
-                pure dstOff) >>= g dataStack'
-        g _ _ = pure ()
-    g dataStack tLen
-    arr <- A.unsafeFreeze dst
-    pure (Text arr 0 totalLen, Utf8DecodeState mCpSt bss bs1Off bs1Off [] 0)
-  else (empty, st)
-
-getCodePointStateOrError :: Utf8DecodeState -> Either Int Int
-getCodePointStateOrError (Utf8DecodeState mCpSt (bs1@(B.length -> len1) : bss') bs1Off _ _ _) =
-  case mCpSt of
-    Nothing ->
-      let (lenInit, _) = foldr
-            (\ bs@(B.length -> len') (lenInit', (lenN', _)) ->
-              (lenInit' + lenN', (len', bs))) (0, (len1 - bs1Off, bs1)) bss'
-      in
-      Left (bs1Off - lenInit)
-    Just (_, cpLen) -> Right cpLen
-getCodePointStateOrError _ = Right 0
-
-decodeUtf8Chunks :: Utf8DecodeState -> (Either Int Int, Utf8DecodeState)
-decodeUtf8Chunks st@(Utf8DecodeState _ [] _ _ _ _) = (getCodePointStateOrError st, st)
-decodeUtf8Chunks st@(Utf8DecodeState Nothing _ _ _ _ _) = (getCodePointStateOrError st, st)
-decodeUtf8Chunks
-  st@(Utf8DecodeState
-    (Just (cpSt, cpPos))
-    bss@(bs1@(B.length -> len1) : bss')
-    bs1Off
-    tbpPos
-    dataStack
-    tLen
-  )
-  =
-  {-
-bs1Off = 0  cpPos = Δbs1Off
-   |    bs1Off     |   boundary = Δbs1Off
-   v       v   bsX v           v
-  |. . bs1 . .|. .|. . .bsN. . .|
-  ^-----------^   ^-------------^
-      len1            lenN
-          ^---^   ^-----------^
-          len1_   isValidBS span
-          ^-------^
-          lenInit
-          ^---------------------^
-                len
-  --}
-  let len1_ = len1 - bs1Off -- the length of the trailing portion of the first bytestring that's to be evaluated.
-      (lenInit, (lenN, bsN)) = foldr
-        (\ bs@(B.length -> len') (lenInit', (lenN', _)) ->
-          (lenInit' + lenN', (len', bs))) (0, (len1_, bs1)) bss'
-      len = lenInit + lenN
-  in
-  if len == cpPos
-  then (getCodePointStateOrError st, st)
-  else
-    let index i =
-          if i < len1_
-          then B.index bs1 (i + bs1Off)
-          else
-            let index' i' (bs@(B.length -> len0) : bss'') =
-                  if i' < len0
-                  then B.index bs i'
-                  else index' (i' - len0) bss''
-                index' i' _ = B.index bsN i'
-            in
-            index' (i - len1_) bss'
-        guessUtf8Boundary
-          | wi 3 0xf0 = Just $ len - 3  -- third to last char starts a four-byte code point
-          | wi 2 0xe0 = Just $ len - 2  -- pre-last char starts a three-or-four-byte code point
-          | wi 1 0xc2 = Just $ len - 1  -- last char starts a two-(or more-)byte code point
-          | wc 4 0xf8 0xf0 ||           -- last four bytes are a four-byte code point
-            wc 3 0xf0 0xe0 ||           -- last three bytes are a three-byte code point
-            wc 2 0xe0 0xc0 ||           -- last two bytes are a two-byte code point
-            w 1 (< 0x80) = Just len     -- last char is ASCII
-          | otherwise = Nothing         -- no clue
-          where
-            w n test = len >= n && test (index $ len - n)
-            wc n mask word8 = w n $ (word8 ==) . (mask .&.)
-            wi n word8 = w n (>= word8)
-        -- push the available valid data on to the list, and remove completely evaulated bytestrings.
-        pushValidUtf8 wordCount mCps =
-          let bs1Off' = bs1Off + wordCount
-              (bss'''', bs1Off''', tbpPos'', dataStack'', tLen'') =
-                if wordCount < len1_
-                then (bss, bs1Off', tbpPos, dataStack, tLen)
-                else
-                  let pushValidUtf8' wordCount' bss''@(bs'@(B.length -> len') : bss''') bs1Off'' tbpPos' dataStack' tLen' =
-                        if wordCount' < len'
-                        then (bss'', bs1Off'', tbpPos', dataStack', tLen')
-                        else pushValidUtf8' (wordCount' - len') bss''' (bs1Off'' - len') 0 (Right (bs', 0, len') : dataStack') (tLen' + len')
-                      pushValidUtf8' _ _ bs1Off'' tbpPos' dataStack' tLen' = ([], bs1Off'', tbpPos', dataStack', tLen')
-                      bs1WordCount = bs1Off - tbpPos + len1_
-                  in
-                  pushValidUtf8' (wordCount - len1_) bss' (bs1Off' - len1) 0 (Right (bs1, tbpPos, bs1WordCount) : dataStack) (tLen + bs1WordCount)
-              st' = Utf8DecodeState mCps bss'''' bs1Off''' tbpPos'' dataStack'' tLen''
-          in
-          (getCodePointStateOrError st', st')
-        huntDownError off ndx cps =
-          if ndx < len
-          then
-            case updateUtf8State (index ndx) cps of
-              Just cps' ->
-                let ndx' = ndx + 1 in
-                huntDownError (
-                  if isUtf8StateIsComplete cps'
-                  then ndx'
-                  else off
-                ) ndx' cps'
-              Nothing -> pushValidUtf8 off Nothing
-          else pushValidUtf8 off $ Just (cps, ndx - off)
-    in
-    -- did we find the boundary?
-    case guessUtf8Boundary of
-      -- yes
-      Just boundary ->
-        -- are we before it?
-        if cpPos < boundary
-        -- yes: let's check this incomplete code point before checking the rest up to the boundary
-        then
-          let checkIncompleteCodePoint cpSt' cpPos'
-                -- a complete code point
-                | isUtf8StateIsComplete cpSt' =
-                  let getEndState ndx cpSt''
-                        | ndx < len =
-                          case updateUtf8State (index ndx) cpSt'' of
-                            Nothing -> Nothing
-                            Just cpSt''' -> getEndState (ndx + 1) cpSt'''
-                        | otherwise = Just (cpSt'', ndx - boundary)
-                      soFarSoGood =
-                        pushValidUtf8 boundary $ getEndState boundary cpSt'
-                  in
-                  -- are we at the boundary?
-                  if boundary == cpPos'
-                  -- yes: get the state of the last code point
-                  then soFarSoGood
-                  -- no:
-                  else
-                    -- are we before bsN?
-                    if cpPos' < lenInit
-                    -- yes
-                    then
-                      -- keep walking the data until we get to bsN or an error
-                      case updateUtf8State (index cpPos') cpSt' of
-                        Just cpSt'' -> checkIncompleteCodePoint cpSt'' (cpPos' + 1)
-                        Nothing -> (Left (-lenInit), Utf8DecodeState Nothing bss bs1Off tbpPos dataStack tLen)
-                    -- no: we're in bsN
-                    else let
-                          off = (if lenInit > 0
-                            then cpPos' - lenInit
-                            else cpPos' + bs1Off)
-                      in
-                      -- is the rest of the bytestring valid utf-8 up to the boundary?
-                      if (
-#ifdef SIMDUTF
-                          withBS bsN $ \ fp _ -> unsafeDupablePerformIO $
-                            unsafeWithForeignPtr fp $ \ptr -> (/= 0) <$>
-                              c_is_valid_utf8 (plusPtr ptr off) (fromIntegral $ boundary - cpPos')
-#elif MIN_VERSION_bytestring(0,11,2)
-                          B.isValidUtf8 . B.take (boundary - cpPos') $ B.drop off bsN
-#else
-                          let bLen = boundary - cpPos'
-                              step ndx cps
-                                | ndx < off + bLen =
-                                  case updateUtf8State (B.unsafeIndex bsN ndx) cps of
-                                  Just cps' -> step (ndx + 1) cps'
-                                  Nothing -> False
-                                | otherwise = isUtf8StateIsComplete cps
-                          in
-                          step off utf8StartState
-#endif
-                        )
-                      -- Yes
-                      then soFarSoGood
-                      -- No
-                      else huntDownError cpPos' cpPos' cpSt'
-                -- We're mid code point
-                | otherwise =
-                  if cpPos' < len
-                  then
-                    -- try to complete the code point
-                    case updateUtf8State (index cpPos') cpSt' of
-                      Just cpSt'' -> checkIncompleteCodePoint cpSt'' (cpPos' + 1)
-                      -- just enough additional data to find an error with the code point
-                      Nothing -> (Left (-lenInit), Utf8DecodeState Nothing bss bs1Off tbpPos dataStack tLen)
-                  else
-                    -- didn't get enough additional data to complete the code point
-                    (Right cpPos', Utf8DecodeState (Just (cpSt', cpPos')) bss bs1Off tbpPos dataStack tLen)
-          in
-          checkIncompleteCodePoint cpSt cpPos
-        -- no, we're past the boundary
-        else
-          -- the code point is the only thing that (potentially) changes
-          let getEndCodePointState cpPos' cpSt'
-                | cpPos' < len =
-                  case updateUtf8State (index cpPos') cpSt' of
-                    Nothing -> Nothing
-                    Just cpSt'' -> getEndCodePointState (cpPos' + 1) cpSt''
-                | otherwise = Just (cpSt', cpPos' - boundary)
-              mCpStLen = getEndCodePointState cpPos cpSt
-          in
-          ( case mCpStLen of
-            Nothing -> Left (-lenInit)
-            Just _ -> Right len
-          , Utf8DecodeState mCpStLen bss bs1Off tbpPos dataStack tLen
-          )
-      -- no: there's an error
-      Nothing -> huntDownError 0 cpPos cpSt
-
--- | Decodes a UTF-8 'ByteString' in the context of what has already been
--- decoded which is represented by the 'Utf8ParseState value. Returned is the
--- new decode state and either ('Right') the number of 'Word8's that make up the
--- incomplete code point at the end of the input, or ('Left') the start position
--- of an invalid code point that was encountered. The position is relative to
--- the start of the input 'ByteString'.
---
--- If the previous 'ByteString' ended with an incomplete code point, the
--- beginning of the input data will be treated as a continuation of the code
--- point. NOTE: That in this case if the input causes the previous incomplete
--- code point to be invalid, the returned error ('Left') position value will be
--- negative.
---
--- If decoding the last 'ByteString' resulted in a error. The input is ignored,
--- and the state value is returned unchanged. Error states can be handled with
--- 'recoverFromUtf8Error'.
---
--- @since 2.0.2
-decodeNextUtf8Chunk ::
-#if defined(ASSERTS)
-  HasCallStack =>
-#endif
- ByteString -> Utf8DecodeState -> (Either Int Int, Utf8DecodeState)
-decodeNextUtf8Chunk _ st@(Utf8DecodeState Nothing _ _ _ _ _) = (getCodePointStateOrError st, st)
-decodeNextUtf8Chunk bs@(B.length -> len) st@(Utf8DecodeState mCpSt bss bs1Off tbpPos dataStack tLen)
-  | len == 0 = (getCodePointStateOrError st, st)
-  | otherwise = decodeUtf8Chunks $ Utf8DecodeState mCpSt (bss ++ [bs]) bs1Off tbpPos dataStack tLen
-
--- | Decodes a 'ByteString' from a clean state.
---
--- @decodeUtf8Chunk bs = 'decodeNextUtf8Chunk' bs 'startUtf8State'@
---
--- @since 2.0.2
-decodeUtf8Chunk :: ByteString -> (Either Int Int, Utf8DecodeState)
-decodeUtf8Chunk = flip decodeNextUtf8Chunk startUtf8State
-
--- | If the 'Utf8ParseState value indicates an error state, the 'Word8' that
--- the state value point to is replaced with the input 'Text' value which may
--- be empty. Decoding resumes after the text is inserted and produces the result
--- described by 'decodeNextUtf8Chunk'.
---
--- If not in an error state, the 'Text' is inserted at the end of the data, but
--- before an incomplete code point at the end of the last input 'ByteString'.
---
--- @since 2.0.2
-recoverFromUtf8Error ::
-#if defined(ASSERTS)
-  HasCallStack =>
-#endif
- Text -> Utf8DecodeState -> (Either Int Int, Utf8DecodeState)
-recoverFromUtf8Error t@(Text _ _ utf8Len) (Utf8DecodeState mCpSt bss@(bs1@(B.length -> len) : bss') bs1Off tbpPos dataStack tLen) =
-  let tbpLen = bs1Off - tbpPos
-      (dataStack', tLen') =
-        if tbpLen > 0
-        then (Right (bs1, tbpPos, tbpLen) : dataStack, tLen + tbpLen)
-        else (dataStack, tLen)
-      (dataStack'', tLen'') =
-        if utf8Len > 0
-        then (Left t : dataStack', tLen' + utf8Len)
-        else (dataStack', tLen')
-      (mCpSt'', bss'', bs1Off'') =
-        case mCpSt of
-          Just _ -> (mCpSt, bss, bs1Off)
-          Nothing ->
-            let bs1Off' = bs1Off + 1
-                mCpSt' = Just (utf8StartState, 0)
-            in
-            if bs1Off' == len
-            then (mCpSt', bss', 0)
-            else (mCpSt', bss, bs1Off')
-  in
-  decodeUtf8Chunks $ Utf8DecodeState mCpSt'' bss'' bs1Off'' bs1Off'' dataStack'' tLen''
-recoverFromUtf8Error t@(Text _ _ utf8Len) (Utf8DecodeState _ _ _ _ dataStack tLen) =
-  decodeUtf8Chunks $ Utf8DecodeState
-    (Just (utf8StartState, 0))
-    []
-    0
-    0
-    (if utf8Len > 0
-      then Left t : dataStack
-      else dataStack
-    ) $ tLen + utf8Len
+decodeUtf8Chunk :: ByteString -> ((Int, Either (Int, ByteString) Utf8ParseState), TextDataStack)
+decodeUtf8Chunk bs = decodeNextUtf8Chunk bs startUtf8ParseState emptyStack
 
 -- | Decode a 'ByteString' containing UTF-8 encoded text.
 --
@@ -760,46 +414,21 @@ decodeUtf8With ::
   HasCallStack =>
 #endif
   OnDecodeError -> ByteString -> Text
-decodeUtf8With onErr bs
-  | B.null undecoded = txt
-  | otherwise = txt `append` (case onErr desc (Just (B.head undecoded)) of
-    Nothing -> txt'
-    Just c  -> T.singleton c `append` txt')
-  where
-    (txt, undecoded) = decodeUtf8With2 onErr mempty bs
-    txt' = decodeUtf8With onErr (B.tail undecoded)
-    desc = "Data.Text.Internal.Encoding: Invalid UTF-8 stream"
-
--- | Decode two consecutive bytestrings, returning Text and undecoded remainder.
-decodeUtf8With2 ::
-#if defined(ASSERTS)
-  HasCallStack =>
-#endif
-  OnDecodeError -> ByteString -> ByteString -> (Text, ByteString)
-decodeUtf8With2 onErr bs1@(B.length -> len1) bs2@(B.length -> len2) =
-  let g res isSecondBs =
+decodeUtf8With onErr bs =
+  let handleErr errPos errEndPos bs' tds
+        | errPos < errEndPos = handleErr (errPos + 1) errEndPos bs' $
+          case onErr "Data.Text.Internal.Encoding: Invalid UTF-8 stream" . Just $ B.index bs' errPos of
+            Just c -> pushText (T.singleton c) tds
+            Nothing -> tds
+        | otherwise = tds
+      g bs'@(B.length -> bLen) res =
         case res of
-          ((Left pos), st) ->
-            g ( recoverFromUtf8Error
-                ( case onErr "Data.Text.Internal.Encoding: Invalid UTF-8 code point" . Just $
-                    if pos >= 0
-                    then B.index (if isSecondBs then bs2 else bs1) pos
-                    else B.index bs1 (len1 + pos) of
-                  Just c -> T.singleton c
-                  Nothing -> empty
-                ) st
-              ) isSecondBs
-          ((Right cpLen), st) ->
-            if isSecondBs
-            then
-              ( fst $ outAvailableUtf8Text st
-              , if cpLen > len2
-                then B.drop (len1 + len2 - cpLen) bs1 `B.append` bs2
-                else B.drop (len2 - cpLen) bs2
-              )
-            else g (decodeNextUtf8Chunk bs2 st) True
+          ((len, eS), tds) ->
+            case eS of
+              Left (pos, bs'') -> g bs'' . decodeNextUtf8Chunk bs'' startUtf8ParseState $ handleErr len pos bs' tds
+              Right _ -> stackToText $ handleErr len bLen bs' tds
   in
-  g (decodeUtf8Chunk bs1) False
+  g bs $ decodeUtf8Chunk bs
 
 -- $stream
 --
@@ -891,11 +520,39 @@ streamDecodeUtf8With ::
   HasCallStack =>
 #endif
   OnDecodeError -> ByteString -> Decoding
-streamDecodeUtf8With onErr = go mempty
-  where
-    go bs1 bs2 = Some txt undecoded (go undecoded)
-      where
-        (txt, undecoded) = decodeUtf8With2 onErr bs1 bs2
+-- streamDecodeUtf8With onErr = go mempty
+--   where
+--     go bs1 bs2 = Some txt undecoded (go undecoded)
+--       where
+--         (txt, undecoded) = decodeUtf8With2 onErr bs1 bs2
+streamDecodeUtf8With onErr bs =
+  let handleErr errPos errEndPos bss tds
+        | errPos < errEndPos =
+          let errPos' = errPos + 1
+              (bs'@(B.length -> len) : bss') = bss
+              h = if errPos' < len
+                then handleErr errPos' errEndPos bss
+                else handleErr 0 (errEndPos - len) bss'
+          in
+          h $
+            case onErr "Data.Text.Internal.Encoding: Invalid UTF-8 stream" . Just $ B.index bs' errPos of
+              Just c -> pushText (T.singleton c) tds
+              Nothing -> tds
+        | otherwise = tds
+      g bs' s tds =
+        case decodeNextUtf8Chunk bs' s tds of
+          ((len, eS), tds') ->
+            case eS of
+              Left (pos, bs'') -> g bs'' startUtf8ParseState $
+                ( if len < 0
+                  then handleErr 0 (pos - len) $ partialCodePoint s ++ [B.take pos bs']
+                  else handleErr len pos [bs']
+                ) tds'
+              Right s' -> let bss' = partialCodePoint s' in
+                Some (stackToText tds') (B.concat bss') $ \ bs'' ->
+                  g bs'' s' emptyStack
+  in
+  g bs startUtf8ParseState emptyStack
 
 -- | Decode a 'ByteString' containing UTF-8 encoded text that is known
 -- to be valid.
diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs
index aec54ec0..2d5395a0 100644
--- a/tests/Tests/Properties/Transcoding.hs
+++ b/tests/Tests/Properties/Transcoding.hs
@@ -25,6 +25,7 @@ import qualified Data.ByteString.Lazy.Char8 as BLC
 import qualified Data.Text as T
 import qualified Data.Text.Encoding as E
 import qualified Data.Text.Encoding.Error as E
+import Data.Text.Internal.Encoding.Utf8 (isUtf8StateIsComplete)
 import qualified Data.Text.Lazy as TL
 import qualified Data.Text.Lazy.Encoding as EL
 
@@ -61,38 +62,46 @@ t_pn_utf8_1   = case E.parseUtf8Chunk (B.pack [0xF0, 0x90, 0x80]) of
   (result0, mS) -> whenEqProp result0 0 $
     case mS of
       Left _ -> counterexample (show mS) False
-      Right s -> case E.parseUtf8NextChunk (B.pack [0x80]) s of
+      Right s -> case E.parseNextUtf8Chunk (B.pack [0x80]) s of
         (result1, mS1) -> whenEqProp result1 1 $
           if isLeft mS1
           then counterexample (show mS1) False
-          else case E.parseUtf8NextChunk (B.pack [0x7f]) s of
+          else case E.parseNextUtf8Chunk (B.pack [0x7f]) s of
             (result2, mS2) -> whenEqProp result2 (-3) $ mS2 === Left 0
 t_pn_utf8_2   = case E.parseUtf8Chunk (B.pack [0xF0]) of
   (result0, mS0) -> whenEqProp result0 0 $
     case mS0 of
       Left _ -> counterexample (show mS0) False
-      Right s0 -> case E.parseUtf8NextChunk (B.pack [0x7f]) s0 of
+      Right s0 -> case E.parseNextUtf8Chunk (B.pack [0x7f]) s0 of
         (result1, mS1) -> whenEqProp result1 (-1) .
           whenEqProp mS1 (Left 0) $
-          case E.parseUtf8NextChunk (B.pack [0x90]) s0 of
+          case E.parseNextUtf8Chunk (B.pack [0x90]) s0 of
             (result2, mS2) -> whenEqProp result2 (-1) $
               case mS2 of
                 Left _ -> counterexample (show mS2) False
-                Right s1 -> case E.parseUtf8NextChunk (B.pack [0x7f]) s1 of
+                Right s1 -> case E.parseNextUtf8Chunk (B.pack [0x7f]) s1 of
                   (result3, mS3) -> whenEqProp result3 (-2) .
                     whenEqProp mS3 (Left 0) $
-                    case E.parseUtf8NextChunk (B.pack [0x80]) s1 of
+                    case E.parseNextUtf8Chunk (B.pack [0x80]) s1 of
                       (result4, mS4) -> whenEqProp result4 (-2) $
                         case mS4 of
                           Left _ -> counterexample (show mS3) False
-                          Right s2 -> case E.parseUtf8NextChunk (B.pack [0x7f]) s2 of
+                          Right s2 -> case E.parseNextUtf8Chunk (B.pack [0x7f]) s2 of
                             (result5, mS5) -> whenEqProp result5 (-3) .
                               whenEqProp mS5 (Left 0) $
-                              case E.parseUtf8NextChunk (B.pack [0x80]) s2 of
+                              case E.parseNextUtf8Chunk (B.pack [0x80]) s2 of
                                 (result6, mS6) -> whenEqProp result6 1 $
                                   property $ isRight mS6
+t_pn_utf8_3 = case E.parseUtf8Chunk $ B.pack [0xc2] of
+  (len1, eS1) -> whenEqProp len1 0 $ case eS1 of
+    Left _ -> counterexample (show eS1) False
+    Right s1 -> whenEqProp (E.partialCodePoint s1) [B.pack [0xc2]] $
+      if isUtf8StateIsComplete $ E.codePointState s1
+      then counterexample (show $ E.codePointState s1) False
+      else case E.parseNextUtf8Chunk (B.pack [0x80, 0x80]) s1 of
+        (len2, eS2) -> whenEqProp len2 1 $ eS2 === Left 2
 
-t_utf8_c     = (fst . E.outAvailableUtf8Text . snd . E.decodeUtf8Chunk . E.encodeUtf8) `eq` id
+t_utf8_c     = (E.stackToText . snd . E.decodeUtf8Chunk . E.encodeUtf8) `eq` id
 t_utf8       = (E.decodeUtf8 . E.encodeUtf8) `eq` id
 t_utf8'      = (E.decodeUtf8' . E.encodeUtf8) `eq` (id . Right)
 tl_utf8      = (EL.decodeUtf8 . EL.encodeUtf8) `eq` id
@@ -248,6 +257,11 @@ t_decode_with_error3 =
 t_decode_with_error4 =
   E.decodeUtf8With (\_ _ -> Just 'x') (B.pack [0xF0, 97, 97, 97]) === "xaaa"
 
+t_decode_with_error1' =
+  case E.streamDecodeUtf8With (\_ _ -> Just 'x') (B.pack [0xc2]) of
+    E.Some x1 bs1 f1 -> whenEqProp x1 "" . whenEqProp bs1 (B.pack [0xc2]) $
+      case f1 $ B.pack [0x80, 0x80] of
+        E.Some x2 bs2 _ -> whenEqProp x2 "\x80x" $ bs2 === mempty
 t_decode_with_error2' =
   case E.streamDecodeUtf8With (\_ _ -> Just 'x') (B.pack [0xC2, 97]) of
     E.Some x _ _ -> x === "xa"
@@ -263,21 +277,18 @@ t_decode_with_error5' = ioProperty $ do
     Left (_ :: E.UnicodeException) -> True
     Right{} -> False
 
-t_decode_chunk =
-  case E.decodeUtf8Chunk $ B.pack [0xC2, 97] of
-    (result, st) -> whenEqProp result (Left 0) $
-      case E.decodeNextUtf8Chunk (B.pack [0x63, 0x63, 0x63, 0x63, 0x63, 0x63]) st of
-        (result', st') -> whenEqProp result' result $
-          case E.recoverFromUtf8Error "bbbb" st' of
-            (result'', st'') -> whenEqProp result'' (Right 0) $
-              case E.decodeNextUtf8Chunk (B.pack [0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0xc2]) st'' of
-                (result''', st''') -> whenEqProp result''' (Right 1) $
-                  case E.outAvailableUtf8Text st''' of
-                    (t, st'''') -> whenEqProp t "bbbbadddddd" $
-                      case E.decodeNextUtf8Chunk (B.singleton 0x80) st'''' of
-                        (result'''', st''''') -> whenEqProp result'''' (Right 0) $
-                          case E.outAvailableUtf8Text st''''' of
-                            (t', _) -> t' === "\x80"
+t_decode_chunk = case E.decodeUtf8Chunk $ B.pack [0xc2] of
+  ((len1, eS1), tds1) -> whenEqProp (len1, E.dataStack tds1, E.stackLen tds1) (0, [], 0) $
+    case eS1 of
+      Left _ -> counterexample (show eS1) False
+      Right s1 -> whenEqProp (E.partialCodePoint s1) [B.pack [0xc2]] $
+        if isUtf8StateIsComplete $ E.codePointState s1
+        then counterexample (show $ E.codePointState s1) False
+        else case E.decodeNextUtf8Chunk (B.pack [0x80, 0x80]) s1 tds1 of
+          ((len2, eS2), tds2) -> whenEqProp (len2, E.dataStack tds2, E.stackLen tds2) (1, [Right $ B.pack [0x80], Right $ B.pack [0xc2]], 2) $
+            case eS2 of
+              Right _ -> counterexample (show eS2) False
+              Left res -> res === (2, mempty)
 
 t_infix_concat bs1 text bs2 =
   forAll (Blind <$> genDecodeErr Replace) $ \(Blind onErr) ->
@@ -303,7 +314,7 @@ testTranscoding =
     testProperty "t_p_utf8_0" t_p_utf8_0,
     testProperty "t_pn_utf8_1" t_pn_utf8_1,
     testProperty "t_pn_utf8_2" t_pn_utf8_2,
-    -- testProperty "t_pn_utf8_3" t_pn_utf8_3,
+    testProperty "t_pn_utf8_3" t_pn_utf8_3,
     testProperty "t_utf8_c" t_utf8_c,
     testProperty "t_utf8" t_utf8,
     testProperty "t_utf8'" t_utf8',
@@ -335,6 +346,7 @@ testTranscoding =
       testProperty "t_decode_with_error2" t_decode_with_error2,
       testProperty "t_decode_with_error3" t_decode_with_error3,
       testProperty "t_decode_with_error4" t_decode_with_error4,
+      testProperty "t_decode_with_error1'" t_decode_with_error1',
       testProperty "t_decode_with_error2'" t_decode_with_error2',
       testProperty "t_decode_with_error3'" t_decode_with_error3',
       testProperty "t_decode_with_error4'" t_decode_with_error4',

From c971a88653c85a409d8c856438b4f33f50f618fe Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Sun, 23 Oct 2022 23:18:55 -0600
Subject: [PATCH 51/87] A little refactorin'

---
 src/Data/Text/Encoding.hs      | 188 ++++++++++++++++++---------------
 src/Data/Text/Lazy/Encoding.hs |  32 +++---
 2 files changed, 119 insertions(+), 101 deletions(-)

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index 1c4298c3..f7bf8997 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -28,12 +28,6 @@ module Data.Text.Encoding
     , parseUtf8Chunk
     , parseNextUtf8Chunk
     , startUtf8ParseState
-    , TextDataStack
-    , dataStack
-    , stackLen
-    , emptyStack
-    , pushText
-    , stackToText
 
     -- * Decoding ByteStrings to Text
     -- $strict
@@ -41,15 +35,22 @@ module Data.Text.Encoding
     -- ** Total Functions #total#
     -- $total
     , decodeLatin1
-    , decodeUtf8Lenient
     , decodeAsciiPrefix
+    , TextDataStack
+    , dataStack
+    , stackLen
+    , emptyStack
+    , pushText
+    , stackToText
     , decodeNextUtf8Chunk
     , decodeUtf8Chunk
+    , decodeUtf8Lenient
 
     -- *** Catchable failure
     , decodeUtf8'
 
     -- *** Controllable error handling
+    , handleUtf8Err
     , decodeUtf8With
     , decodeUtf16LEWith
     , decodeUtf16BEWith
@@ -147,8 +148,38 @@ import qualified Data.ByteString.Unsafe as B
 -- (preferably not at all). See "Data.Text.Encoding#g:total" for better
 -- solutions.
 
--- | Decode a 'ByteString' containing 7-bit ASCII
--- encoded text.
+-- | Decode a 'ByteString' containing 7-bit ASCII encoded text.
+--
+-- This is a total function. The 'ByteString' is decoded until either
+-- the end is reached or it errors with the first non-ASCII 'Word8' is
+-- encountered. In either case the function will return the 'Text'
+-- value of the longest prefix that is valid ASCII. On error, the index
+-- of the non-ASCII 'Word8' is also returned.
+--
+-- @since 2.0.2
+decodeAsciiPrefix ::
+#if defined(ASSERTS)
+  HasCallStack =>
+#endif
+  ByteString -> (Text, Maybe (Word8, Int))
+decodeAsciiPrefix bs = if B.null bs
+  then (empty, Nothing)
+  else unsafeDupablePerformIO $ withBS bs $ \ fp len ->
+    unsafeWithForeignPtr fp $ \src -> do
+      asciiPrefixLen <- fmap fromIntegral . c_is_ascii src $ src `plusPtr` len
+      let !prefix = if asciiPrefixLen == 0
+            then empty
+            else runST $ do
+              dst <- A.new asciiPrefixLen
+              A.copyFromPointer dst 0 src asciiPrefixLen
+              arr <- A.unsafeFreeze dst
+              pure $ Text arr 0 asciiPrefixLen
+      let suffix = if asciiPrefixLen < len
+            then Just (B.index bs asciiPrefixLen, asciiPrefixLen)
+            else Nothing
+      pure (prefix, suffix)
+
+-- | Decode a 'ByteString' containing 7-bit ASCII encoded text.
 --
 -- This is a partial function: it checks that input does not contain
 -- anything except ASCII and copies buffer or throws an error otherwise.
@@ -175,7 +206,7 @@ decodeLatin1 ::
 decodeLatin1 bs = withBS bs $ \fp len -> runST $ do
   dst <- A.new (2 * len)
   let inner srcOff dstOff = if srcOff >= len then return dstOff else do
-        asciiPrefixLen <- fmap cSizeToInt $ unsafeIOToST $ unsafeWithForeignPtr fp $ \src ->
+        asciiPrefixLen <- fmap fromIntegral $ unsafeIOToST $ unsafeWithForeignPtr fp $ \src ->
           c_is_ascii (src `plusPtr` srcOff) (src `plusPtr` len)
         if asciiPrefixLen == 0
         then do
@@ -196,39 +227,6 @@ decodeLatin1 bs = withBS bs $ \fp len -> runST $ do
 foreign import ccall unsafe "_hs_text_is_ascii" c_is_ascii
     :: Ptr Word8 -> Ptr Word8 -> IO CSize
 
--- | Decode a 'ByteString' containing ASCII.
---
--- This is a total function. The 'ByteString' is decoded until either
--- the end is reached or it errors with the first non-ASCII 'Word8' is
--- encountered. In either case the function will return what 'Text' was
--- decoded. On error, the index of the non-ASCII 'Word8' is also returned.
---
--- @since 2.0.2
-decodeAsciiPrefix
-  :: ByteString
-  -> (Text, Maybe (Word8, Int))
-decodeAsciiPrefix bs = if B.null bs
-  then (empty, Nothing)
-  else unsafeDupablePerformIO $ withBS bs $ \ fp len ->
-    unsafeWithForeignPtr fp $ \src -> do
-      asciiPrefixLen <- fmap fromIntegral . c_is_ascii src $ src `plusPtr` len
-      let !prefix = if asciiPrefixLen == 0
-            then empty
-            else runST $ do
-              dst <- A.new asciiPrefixLen
-              A.copyFromPointer dst 0 src asciiPrefixLen
-              arr <- A.unsafeFreeze dst
-              pure $ Text arr 0 asciiPrefixLen
-      let suffix = if asciiPrefixLen < len
-            then Just (B.index bs asciiPrefixLen, asciiPrefixLen)
-            else Nothing
-      pure (prefix, suffix)
-
-#ifdef SIMDUTF
-foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8
-    :: Ptr Word8 -> CSize -> IO CInt
-#endif
-
 data Utf8ParseState = Utf8ParseState
   { partialCodePoint :: [ByteString]
   , codePointState :: Utf8CodePointState
@@ -238,6 +236,11 @@ data Utf8ParseState = Utf8ParseState
 startUtf8ParseState ::Utf8ParseState 
 startUtf8ParseState = Utf8ParseState [] utf8StartState
 
+#ifdef SIMDUTF
+foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8
+    :: Ptr Word8 -> CSize -> IO CInt
+#endif
+
 {-
 `parseUtf8Chunk chunk = (n, es)`
 
@@ -252,7 +255,11 @@ es
   intermediate decoding state, which can be used to parse the next chunk with
   `parseNextUtf8Chunk`
 -}
-parseUtf8Chunk :: ByteString -> (Int, Either Int Utf8ParseState)
+parseUtf8Chunk ::
+#if defined(ASSERTS)
+  HasCallStack =>
+#endif
+  ByteString -> (Int, Either Int Utf8ParseState)
 parseUtf8Chunk bs@(B.length -> len)
 #if defined(SIMDUTF) || MIN_VERSION_bytestring(0,11,2)
   | guessUtf8Boundary > 0 &&
@@ -312,7 +319,11 @@ es
   intermediate decoding state, which can be used to parse the next chunk with
   `parseNextUtf8Chunk`
 -}
-parseNextUtf8Chunk :: ByteString -> Utf8ParseState -> (Int, Either Int Utf8ParseState)
+parseNextUtf8Chunk ::
+#if defined(ASSERTS)
+  HasCallStack =>
+#endif
+  ByteString -> Utf8ParseState -> (Int, Either Int Utf8ParseState)
 parseNextUtf8Chunk bs@(B.length -> len) st@(Utf8ParseState lead s)
   | len > 0 =
     let g pos s'
@@ -350,7 +361,11 @@ pushText t@(Text _ _ tLen) tds@(TextDataStack stack sLen) =
   then TextDataStack (Left t : stack) $ sLen + tLen
   else tds
 
-stackToText :: TextDataStack -> Text
+stackToText ::
+#if defined(ASSERTS)
+  HasCallStack =>
+#endif
+  TextDataStack -> Text
 stackToText (TextDataStack stack sLen)
   | sLen > 0 = runST $
     do
@@ -373,8 +388,11 @@ stackToText (TextDataStack stack sLen)
       pure $ Text arr 0 sLen
   | otherwise = empty
 
-decodeNextUtf8Chunk
-  :: ByteString
+decodeNextUtf8Chunk ::
+#if defined(ASSERTS)
+  HasCallStack =>
+#endif
+  ByteString
   -> Utf8ParseState
   -> TextDataStack
   -> ((Int, Either (Int, ByteString) Utf8ParseState), TextDataStack)
@@ -405,6 +423,38 @@ decodeNextUtf8Chunk bs s tds =
 decodeUtf8Chunk :: ByteString -> ((Int, Either (Int, ByteString) Utf8ParseState), TextDataStack)
 decodeUtf8Chunk bs = decodeNextUtf8Chunk bs startUtf8ParseState emptyStack
 
+handleUtf8Err
+  :: OnDecodeError
+  -> String
+  -> Int
+  -> Int
+  -> Utf8ParseState
+  -> ByteString
+  -> TextDataStack
+  -> TextDataStack
+handleUtf8Err onErr errMsg len pos s bs tds =
+  let h errPos errEndPos bss tds'
+        | errPos < errEndPos =
+          let errPos' = errPos + 1 in
+          case bss of
+            bs'@(B.length -> len') : bss' ->
+              ( if errPos' < len'
+                then h errPos' errEndPos bss
+                else h 0 (errEndPos - len') bss'
+              ) $ case onErr errMsg . Just $ B.index bs' errPos of
+                Just c -> pushText (T.singleton c) tds'
+                Nothing -> tds'
+            [] -> tds'
+        | otherwise = tds'
+  in
+  ( if len < 0
+    then h 0 (pos - len) $ partialCodePoint s ++ [B.take pos bs]
+    else h len pos [bs]
+  ) tds
+
+invalidUtf8Msg :: String
+invalidUtf8Msg = "Data.Text.Internal.Encoding: Invalid UTF-8 stream"
+
 -- | Decode a 'ByteString' containing UTF-8 encoded text.
 --
 -- Surrogate code points in replacement character returned by 'OnDecodeError'
@@ -415,18 +465,13 @@ decodeUtf8With ::
 #endif
   OnDecodeError -> ByteString -> Text
 decodeUtf8With onErr bs =
-  let handleErr errPos errEndPos bs' tds
-        | errPos < errEndPos = handleErr (errPos + 1) errEndPos bs' $
-          case onErr "Data.Text.Internal.Encoding: Invalid UTF-8 stream" . Just $ B.index bs' errPos of
-            Just c -> pushText (T.singleton c) tds
-            Nothing -> tds
-        | otherwise = tds
-      g bs'@(B.length -> bLen) res =
+  let g bs'@(B.length -> bLen) res =
         case res of
           ((len, eS), tds) ->
+            let h msg pos s = handleUtf8Err onErr msg len pos s bs' tds in
             case eS of
-              Left (pos, bs'') -> g bs'' . decodeNextUtf8Chunk bs'' startUtf8ParseState $ handleErr len pos bs' tds
-              Right _ -> stackToText $ handleErr len bLen bs' tds
+              Left (pos, bs'') -> g bs'' . decodeNextUtf8Chunk bs'' startUtf8ParseState $ h invalidUtf8Msg pos startUtf8ParseState
+              Right s -> stackToText $ h "Data.Text.Internal.Encoding: Incomplete UTF-8 code point" bLen s
   in
   g bs $ decodeUtf8Chunk bs
 
@@ -520,34 +565,12 @@ streamDecodeUtf8With ::
   HasCallStack =>
 #endif
   OnDecodeError -> ByteString -> Decoding
--- streamDecodeUtf8With onErr = go mempty
---   where
---     go bs1 bs2 = Some txt undecoded (go undecoded)
---       where
---         (txt, undecoded) = decodeUtf8With2 onErr bs1 bs2
 streamDecodeUtf8With onErr bs =
-  let handleErr errPos errEndPos bss tds
-        | errPos < errEndPos =
-          let errPos' = errPos + 1
-              (bs'@(B.length -> len) : bss') = bss
-              h = if errPos' < len
-                then handleErr errPos' errEndPos bss
-                else handleErr 0 (errEndPos - len) bss'
-          in
-          h $
-            case onErr "Data.Text.Internal.Encoding: Invalid UTF-8 stream" . Just $ B.index bs' errPos of
-              Just c -> pushText (T.singleton c) tds
-              Nothing -> tds
-        | otherwise = tds
-      g bs' s tds =
+  let g bs' s tds =
         case decodeNextUtf8Chunk bs' s tds of
           ((len, eS), tds') ->
             case eS of
-              Left (pos, bs'') -> g bs'' startUtf8ParseState $
-                ( if len < 0
-                  then handleErr 0 (pos - len) $ partialCodePoint s ++ [B.take pos bs']
-                  else handleErr len pos [bs']
-                ) tds'
+              Left (pos, bs'') -> g bs'' startUtf8ParseState $ handleUtf8Err onErr invalidUtf8Msg len pos s bs' tds'
               Right s' -> let bss' = partialCodePoint s' in
                 Some (stackToText tds') (B.concat bss') $ \ bs'' ->
                   g bs'' s' emptyStack
@@ -754,6 +777,3 @@ encodeUtf32LE txt = E.unstream (E.restreamUtf32LE (F.stream txt))
 encodeUtf32BE :: Text -> ByteString
 encodeUtf32BE txt = E.unstream (E.restreamUtf32BE (F.stream txt))
 {-# INLINE encodeUtf32BE #-}
-
-cSizeToInt :: CSize -> Int
-cSizeToInt = fromIntegral
diff --git a/src/Data/Text/Lazy/Encoding.hs b/src/Data/Text/Lazy/Encoding.hs
index ad361af5..19f605a7 100644
--- a/src/Data/Text/Lazy/Encoding.hs
+++ b/src/Data/Text/Lazy/Encoding.hs
@@ -1,5 +1,6 @@
 {-# LANGUAGE BangPatterns,CPP #-}
 {-# LANGUAGE Trustworthy #-}
+{-# LANGUAGE ViewPatterns #-}
 
 -- |
 -- Module      : Data.Text.Lazy.Encoding
@@ -65,7 +66,6 @@ import qualified Data.ByteString.Builder as B
 import qualified Data.ByteString.Builder.Prim as BP
 import qualified Data.ByteString.Lazy as B
 import qualified Data.ByteString.Lazy.Internal as B
-import qualified Data.Text as T
 import qualified Data.Text.Encoding as TE
 import qualified Data.Text.Internal.Lazy.Encoding.Fusion as E
 import qualified Data.Text.Internal.Lazy.Fusion as F
@@ -107,22 +107,20 @@ decodeLatin1 = foldr (chunk . TE.decodeLatin1) empty . B.toChunks
 -- | Decode a 'ByteString' containing UTF-8 encoded text.
 decodeUtf8With :: OnDecodeError -> B.ByteString -> Text
 decodeUtf8With onErr (B.Chunk b0 bs0) =
-    case TE.streamDecodeUtf8With onErr b0 of
-      TE.Some t l f -> chunk t (go f l bs0)
-  where
-    go f0 _ (B.Chunk b bs) =
-      case f0 b of
-        TE.Some t l f -> chunk t (go f l bs)
-    go _ l _
-      | S.null l  = empty
-      | otherwise =
-        let !t = T.pack (skipBytes l)
-            skipBytes = S.foldr (\x s' ->
-                  case onErr desc (Just x) of
-                    Just c -> c : s'
-                    Nothing -> s') [] in
-        Chunk t Empty
-    desc = "Data.Text.Lazy.Encoding.decodeUtf8With: Invalid UTF-8 stream"
+  let g bs@(S.length -> bLen) lbs s tds diffText =
+        case TE.decodeNextUtf8Chunk bs s tds of
+          ((len, eS), tds') ->
+            let h errMsg pos s' = TE.handleUtf8Err onErr errMsg len pos s' bs tds' in
+            case eS of
+              Left (pos, bs') -> g bs' lbs TE.startUtf8ParseState
+                (h "Data.Text.Internal.Encoding: Invalid UTF-8 stream" pos s) diffText
+              Right s' ->
+                case lbs of
+                  B.Chunk bs' lbs' ->
+                    g bs' lbs' s' TE.emptyStack $ diffText . chunk (TE.stackToText tds')
+                  B.Empty -> diffText $ chunk (TE.stackToText $ h "Data.Text.Internal.Encoding: Incomplete UTF-8 code point" bLen s') Empty
+  in
+  g b0 bs0 TE.startUtf8ParseState TE.emptyStack id
 decodeUtf8With _ _ = empty
 
 -- | Decode a 'ByteString' containing UTF-8 encoded text that is known

From d63effad3ff2c01fa829a9c0dc9e98b427df356d Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Tue, 25 Oct 2022 22:54:29 -0600
Subject: [PATCH 52/87] Documenting new functions.

---
 changelog.md                            |  16 +-
 src/Data/Text/Encoding.hs               | 193 ++++++++++++++----------
 src/Data/Text/Internal/Encoding/Utf8.hs |   2 +-
 src/Data/Text/Lazy/Encoding.hs          |   4 +-
 tests/Tests/Properties/Transcoding.hs   |  44 +++---
 5 files changed, 154 insertions(+), 105 deletions(-)

diff --git a/changelog.md b/changelog.md
index c2c72420..752c7fbb 100644
--- a/changelog.md
+++ b/changelog.md
@@ -4,10 +4,22 @@
   allow decoding to be aborted on errors without the need to raise an
   `error` and `catch` it elsewhere:
   * `decodeAsciiPrefix`
+  * `TextDataStack`
+  * `dataStack`
+  * `stackLen`
+  * `emptyStack`
   * `decodeUtf8Chunk`
   * `decodeNextUtf8Chunk`
-  * `recoverFromUtf8Error`
-  * `outAvailableUtf8Text`
+  * `pushText`
+  * `stackToText`
+
+* Added functions to validate `ByteString`s that represent encoded text:
+  * `Utf8ValidState`
+  * `partialUtf8CodePoint`
+  * `utf8CodePointState`
+  * `validateUtf8Chunk`
+  * `validateNextUtf8Chunk`
+  * `startUtf8ValidState`
 
 ### 2.0.1
 
diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index f7bf8997..d3d0396f 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -22,12 +22,15 @@
 -- <http://hackage.haskell.org/package/text-icu text-icu package>.
 
 module Data.Text.Encoding
-    ( Utf8ParseState
-    , partialCodePoint
-    , codePointState
-    , parseUtf8Chunk
-    , parseNextUtf8Chunk
-    , startUtf8ParseState
+    (
+    -- * ByteString validation
+    -- $validation
+      Utf8ValidState
+    , partialUtf8CodePoint
+    , utf8CodePointState
+    , validateUtf8Chunk
+    , validateNextUtf8Chunk
+    , startUtf8ValidState
 
     -- * Decoding ByteStrings to Text
     -- $strict
@@ -124,6 +127,9 @@ import Foreign.C.Types (CInt(..))
 import qualified Data.ByteString.Unsafe as B
 #endif
 
+-- $validation
+-- These functions are for validating 'ByteString's as encoded text.
+
 -- $strict
 --
 -- All of the single-parameter functions for decoding bytestrings
@@ -227,40 +233,53 @@ decodeLatin1 bs = withBS bs $ \fp len -> runST $ do
 foreign import ccall unsafe "_hs_text_is_ascii" c_is_ascii
     :: Ptr Word8 -> Ptr Word8 -> IO CSize
 
-data Utf8ParseState = Utf8ParseState
-  { partialCodePoint :: [ByteString]
-  , codePointState :: Utf8CodePointState
+-- | This data type represents the state of a 'ByteString' representing
+-- UTF-8-encoded text. It consists of a value representing whether or
+-- not the last byte is a complete code point, and on incompletion what
+-- the 1 to 3 end bytes are that make up the incomplete code point.
+data Utf8ValidState = Utf8ValidState
+  { -- | Get the incomplete UTF-8 code point of the 'ByteString's that
+    -- have been validated thus far.
+    partialUtf8CodePoint :: [ByteString]
+    -- | Get the current UTF-8 code point state of the 'ByteString's
+    -- that have been validated thus far.
+  , utf8CodePointState :: Utf8CodePointState
   }
-  deriving (Eq, Ord, Show, Read)
+  deriving (Eq, Ord, Show)
 
-startUtf8ParseState ::Utf8ParseState 
-startUtf8ParseState = Utf8ParseState [] utf8StartState
+-- | This represtents the starting state of a UTF-8 validation check.
+startUtf8ValidState :: Utf8ValidState 
+startUtf8ValidState = Utf8ValidState [] utf8StartState
 
 #ifdef SIMDUTF
 foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8
     :: Ptr Word8 -> CSize -> IO CInt
 #endif
 
-{-
-`parseUtf8Chunk chunk = (n, es)`
-
-n is the end index of the longest prefix of chunk that is valid UTF-8
-
-es
-* When `es = Left p`, there is an error: the bytes from index n and beyond are
-  not part of a valid UTF-8 code point, and is the index of the start of the
-  next (possibly valid) code point. `p - n` is the number of invalid bytes.
-* When `es = Right s`, all of the remaining bytes from index n and beyond are the
-  beginning of an incomplete UTF-8 code point, and s is the corresponding
-  intermediate decoding state, which can be used to parse the next chunk with
-  `parseNextUtf8Chunk`
--}
-parseUtf8Chunk ::
+-- | Validate a 'ByteString' as a UTF-8-encoded text.
+--
+-- @validateUtf8Chunk chunk = (n, es)@
+--
+-- This function returns two values:
+--
+-- * The value 'n' indicates the longest prefix of the 'ByteString'
+--   that is valid UTF-8-encoded data.
+-- * The value 'es' indicates whether the 'ByteString'
+--
+--     * (@Left p@) contains an invalid code point and where the next
+--       (potentially valid) code point begins, so that @p - n@ is the
+--       number of invalid bytes, or
+--     * (@Right s@) is valid, and all of the remaining bytes starting
+--       at inbex 'n' are the beginning of an incomplete UTF-8 code
+--       point, and 's' is the resulting 'Utf8ValidState' value, which
+--       can be used to validate against a following 'ByteString' with
+--       'validateNextUtf8Chunk'.
+validateUtf8Chunk ::
 #if defined(ASSERTS)
   HasCallStack =>
 #endif
-  ByteString -> (Int, Either Int Utf8ParseState)
-parseUtf8Chunk bs@(B.length -> len)
+  ByteString -> (Int, Either Int Utf8ValidState)
+validateUtf8Chunk bs@(B.length -> len)
 #if defined(SIMDUTF) || MIN_VERSION_bytestring(0,11,2)
   | guessUtf8Boundary > 0 &&
     -- the rest of the bytestring valid utf-8 up to the boundary
@@ -276,7 +295,7 @@ parseUtf8Chunk bs@(B.length -> len)
     -- No
   | otherwise = getEndState 0
     where
-      getEndState ndx = parseUtf8 ndx ndx utf8StartState
+      getEndState ndx = validateUtf8 ndx ndx utf8StartState
       w n word8 = len >= n && word8 <= (B.index bs $ len - n)
       guessUtf8Boundary
         | w 3 0xf0 = len - 3  -- third to last char starts a four-byte code point
@@ -284,53 +303,56 @@ parseUtf8Chunk bs@(B.length -> len)
         | w 1 0xc2 = len - 1  -- last char starts a two-(or more-)byte code point
         | otherwise = len
 #else
-  = parseUtf8 0 0 utf8StartState
+  = validateUtf8 0 0 utf8StartState
     where
 #endif
-      parseUtf8 ndx0 ndx s
+      validateUtf8 ndx0 ndx s
         | ndx < len =
           let ndx' = ndx + 1 in
           case updateUtf8State (B.index bs ndx) s of
             Just s' ->
-              parseUtf8 (
+              validateUtf8 (
                 if isUtf8StateIsComplete s'
                 then ndx'
                 else ndx0
               ) ndx' s'
             Nothing -> (ndx0, Left $ if ndx == ndx0 then ndx' else ndx)
-        | otherwise = (ndx0, Right $ Utf8ParseState (if ndx0 < len then [B.drop ndx0 bs] else []) s)
-
-{-
-parseNextUtf8Chunk chunk s = (n, es)
-
-n
-* When n >= 0, n is the length of the longest prefix of chunk that is valid
-  UTF-8 starting from state s (i.e., n points after the end of a full codepoint,
-  to the beginning of an incomplete codepoint).
-* When n > 0, the starting code point from the previous input is either still
-  incomplete with the additonal chunk or in error.
-
-es
-* When es = Left p, there is an error: the bytes from index n and beyond are
-  not part of a valid UTF-8 code point, and is the index of the start of the
-  next (possibly valid) code point. `p - n` is the number of invalid bytes.
-* When es = Right s', all of the remaining bytes from index n and beyond are the
-  beginning of an incomplete UTF-8 code point, and s' is the corresponding
-  intermediate decoding state, which can be used to parse the next chunk with
-  `parseNextUtf8Chunk`
--}
-parseNextUtf8Chunk ::
+        | otherwise = (ndx0, Right $ Utf8ValidState (if ndx0 < len then [B.drop ndx0 bs] else []) s)
+
+-- | Validate a 'ByteString' as a contiuation of UTF-8-encoded text.
+--
+-- @validateNextUtf8Chunk chunk s = (n, es)@
+--
+-- This function returns two values:
+--
+-- * The value 'n' indicates the end position of longest prefix of the
+--   'ByteString' that is valid UTF-8-encoded data from the starting
+--   state 's'. If 's' contains an incomplete code point, the input
+--   'ByteString' is considered a continuation. As a result 'n' will be
+--   negative if the code point is still incomplete or is proven to be
+--   invalid.
+--   
+-- * The value 'es' indicates whether the 'ByteString'
+--
+--     * (@Left p@) contains an invalid code point and where the next
+--       (potentially valid) code point begins, so that @p - n@ is the
+--       number of invalid bytes, or
+--     * (@Right s'@) is valid, and all of the remaining bytes starting
+--       at inbex 'n' are the beginning of an incomplete UTF-8 code
+--       point, and `s'` is the resulting 'Utf8ValidState' value, which
+--       can be used to validate against a following 'ByteString'.
+validateNextUtf8Chunk ::
 #if defined(ASSERTS)
   HasCallStack =>
 #endif
-  ByteString -> Utf8ParseState -> (Int, Either Int Utf8ParseState)
-parseNextUtf8Chunk bs@(B.length -> len) st@(Utf8ParseState lead s)
+  ByteString -> Utf8ValidState -> (Int, Either Int Utf8ValidState)
+validateNextUtf8Chunk bs@(B.length -> len) st@(Utf8ValidState lead s)
   | len > 0 =
     let g pos s'
           -- first things first. let's try to get to the start of the next code point
           | isUtf8StateIsComplete s' =
             -- found the beginning of the next code point, hand this off to someone else
-            case parseUtf8Chunk $ B.drop pos bs of
+            case validateUtf8Chunk $ B.drop pos bs of
               (len', mS) -> (pos + len', case mS of Left p -> Left (p + pos); _ -> mS)
           -- code point is not complete yet
           -- walk the rest of the code point until error, complete, or no more data
@@ -341,26 +363,32 @@ parseNextUtf8Chunk bs@(B.length -> len) st@(Utf8ParseState lead s)
               -- keep going
               Just s'' -> g (pos + 1) s''
           -- no more data
-          | otherwise = (leadPos, Right $ Utf8ParseState (lead ++ [bs]) s')
+          | otherwise = (leadPos, Right $ Utf8ValidState (lead ++ [bs]) s')
     in g 0 s
   | otherwise = (leadPos, Right st)
     where leadPos = -(foldr (\ bs' len' -> len' + B.length bs') 0 lead)
 
+-- | Validated UTF-8 data to be converted into a 'Text' value.
 data TextDataStack = TextDataStack
-  { dataStack :: [Either Text ByteString]
+  { -- | Returns a list of 'Text' and UTF-8-valid 'ByteString' values.
+    dataStack :: [Either Text ByteString]
+    -- | Returns total number of UTF-8 valid bytes in the stack.
   , stackLen :: Int
   }
   deriving Show
 
+-- | Empty stack
 emptyStack :: TextDataStack
 emptyStack = TextDataStack [] 0
 
+-- | Push a text value onto the stack
 pushText :: Text -> TextDataStack -> TextDataStack
 pushText t@(Text _ _ tLen) tds@(TextDataStack stack sLen) =
   if tLen > 0
   then TextDataStack (Left t : stack) $ sLen + tLen
   else tds
 
+-- | Create a 'Text' value from the contents of a stack.
 stackToText ::
 #if defined(ASSERTS)
   HasCallStack =>
@@ -388,16 +416,17 @@ stackToText (TextDataStack stack sLen)
       pure $ Text arr 0 sLen
   | otherwise = empty
 
+-- | Decode a 'ByteString' in the context of what has been already been decoded.
 decodeNextUtf8Chunk ::
 #if defined(ASSERTS)
   HasCallStack =>
 #endif
   ByteString
-  -> Utf8ParseState
+  -> Utf8ValidState
   -> TextDataStack
-  -> ((Int, Either (Int, ByteString) Utf8ParseState), TextDataStack)
+  -> ((Int, Either (Int, ByteString) Utf8ValidState), TextDataStack)
 decodeNextUtf8Chunk bs s tds =
-  case parseNextUtf8Chunk bs s of
+  case validateNextUtf8Chunk bs s of
     (len, res) ->
       let stackedData'
             | len >= 0 =
@@ -406,7 +435,7 @@ decodeNextUtf8Chunk bs s tds =
                       if bLen > 0
                       then TextDataStack (Right bs' : stack) $ sLen + bLen
                       else tds'
-                      ) tds $ partialCodePoint s
+                      ) tds $ partialUtf8CodePoint s
               in
               if len > 0
               then TextDataStack (Right (B.take len bs) : stack') $ sLen' + len
@@ -420,36 +449,44 @@ decodeNextUtf8Chunk bs s tds =
         )
       , stackedData')
 
-decodeUtf8Chunk :: ByteString -> ((Int, Either (Int, ByteString) Utf8ParseState), TextDataStack)
-decodeUtf8Chunk bs = decodeNextUtf8Chunk bs startUtf8ParseState emptyStack
+-- | Decode a 'ByteString'.
+--
+-- @decodeUtf8Chunk bs = 'decodeNextUtf8Chunk' bs 'startUtf8ValidState' 'emptyStack'@
+decodeUtf8Chunk :: ByteString -> ((Int, Either (Int, ByteString) Utf8ValidState), TextDataStack)
+decodeUtf8Chunk bs = decodeNextUtf8Chunk bs startUtf8ValidState emptyStack
 
+-- | Call an error handler with the give 'String' message for each byte
+-- in given 'ByteString' and lead data in the given 'Utf8ValidState'
+-- value. The bytes are the positions from 'errStart' (inclusive) to
+-- 'errEnd' (exclusive). Any substite characters are pushed onto the
+-- supplied 'TextDataStack' argument.
 handleUtf8Err
   :: OnDecodeError
   -> String
   -> Int
   -> Int
-  -> Utf8ParseState
+  -> Utf8ValidState
   -> ByteString
   -> TextDataStack
   -> TextDataStack
-handleUtf8Err onErr errMsg len pos s bs tds =
+handleUtf8Err onErr errMsg errStart errEnd s bs tds =
   let h errPos errEndPos bss tds'
         | errPos < errEndPos =
           let errPos' = errPos + 1 in
           case bss of
-            bs'@(B.length -> len') : bss' ->
-              ( if errPos' < len'
+            bs'@(B.length -> len) : bss' ->
+              ( if errPos' < len
                 then h errPos' errEndPos bss
-                else h 0 (errEndPos - len') bss'
+                else h 0 (errEndPos - len) bss'
               ) $ case onErr errMsg . Just $ B.index bs' errPos of
                 Just c -> pushText (T.singleton c) tds'
                 Nothing -> tds'
             [] -> tds'
         | otherwise = tds'
   in
-  ( if len < 0
-    then h 0 (pos - len) $ partialCodePoint s ++ [B.take pos bs]
-    else h len pos [bs]
+  ( if errStart < 0
+    then h 0 (errEnd - errStart) $ partialUtf8CodePoint s ++ [B.take errEnd bs]
+    else h errStart errEnd [bs]
   ) tds
 
 invalidUtf8Msg :: String
@@ -470,7 +507,7 @@ decodeUtf8With onErr bs =
           ((len, eS), tds) ->
             let h msg pos s = handleUtf8Err onErr msg len pos s bs' tds in
             case eS of
-              Left (pos, bs'') -> g bs'' . decodeNextUtf8Chunk bs'' startUtf8ParseState $ h invalidUtf8Msg pos startUtf8ParseState
+              Left (pos, bs'') -> g bs'' . decodeNextUtf8Chunk bs'' startUtf8ValidState $ h invalidUtf8Msg pos startUtf8ValidState
               Right s -> stackToText $ h "Data.Text.Internal.Encoding: Incomplete UTF-8 code point" bLen s
   in
   g bs $ decodeUtf8Chunk bs
@@ -570,12 +607,12 @@ streamDecodeUtf8With onErr bs =
         case decodeNextUtf8Chunk bs' s tds of
           ((len, eS), tds') ->
             case eS of
-              Left (pos, bs'') -> g bs'' startUtf8ParseState $ handleUtf8Err onErr invalidUtf8Msg len pos s bs' tds'
-              Right s' -> let bss' = partialCodePoint s' in
+              Left (pos, bs'') -> g bs'' startUtf8ValidState $ handleUtf8Err onErr invalidUtf8Msg len pos s bs' tds'
+              Right s' -> let bss' = partialUtf8CodePoint s' in
                 Some (stackToText tds') (B.concat bss') $ \ bs'' ->
                   g bs'' s' emptyStack
   in
-  g bs startUtf8ParseState emptyStack
+  g bs startUtf8ValidState emptyStack
 
 -- | Decode a 'ByteString' containing UTF-8 encoded text that is known
 -- to be valid.
diff --git a/src/Data/Text/Internal/Encoding/Utf8.hs b/src/Data/Text/Internal/Encoding/Utf8.hs
index 5d4aaebd..148995a8 100644
--- a/src/Data/Text/Internal/Encoding/Utf8.hs
+++ b/src/Data/Text/Internal/Encoding/Utf8.hs
@@ -243,7 +243,7 @@ byteToClass n = ByteClass (W8# el#)
     table# = "\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\b\b\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\n\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\EOT\ETX\ETX\v\ACK\ACK\ACK\ENQ\b\b\b\b\b\b\b\b\b\b\b"#
 
 newtype Utf8CodePointState = Utf8CodePointState Word8
-  deriving (Eq, Ord, Show, Read)
+  deriving (Eq, Ord, Show)
 
 utf8StartState :: Utf8CodePointState
 utf8StartState = Utf8CodePointState 0
diff --git a/src/Data/Text/Lazy/Encoding.hs b/src/Data/Text/Lazy/Encoding.hs
index 19f605a7..5b5e8bb9 100644
--- a/src/Data/Text/Lazy/Encoding.hs
+++ b/src/Data/Text/Lazy/Encoding.hs
@@ -112,7 +112,7 @@ decodeUtf8With onErr (B.Chunk b0 bs0) =
           ((len, eS), tds') ->
             let h errMsg pos s' = TE.handleUtf8Err onErr errMsg len pos s' bs tds' in
             case eS of
-              Left (pos, bs') -> g bs' lbs TE.startUtf8ParseState
+              Left (pos, bs') -> g bs' lbs TE.startUtf8ValidState
                 (h "Data.Text.Internal.Encoding: Invalid UTF-8 stream" pos s) diffText
               Right s' ->
                 case lbs of
@@ -120,7 +120,7 @@ decodeUtf8With onErr (B.Chunk b0 bs0) =
                     g bs' lbs' s' TE.emptyStack $ diffText . chunk (TE.stackToText tds')
                   B.Empty -> diffText $ chunk (TE.stackToText $ h "Data.Text.Internal.Encoding: Incomplete UTF-8 code point" bLen s') Empty
   in
-  g b0 bs0 TE.startUtf8ParseState TE.emptyStack id
+  g b0 bs0 TE.startUtf8ValidState TE.emptyStack id
 decodeUtf8With _ _ = empty
 
 -- | Decode a 'ByteString' containing UTF-8 encoded text that is known
diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs
index 2d5395a0..82419614 100644
--- a/tests/Tests/Properties/Transcoding.hs
+++ b/tests/Tests/Properties/Transcoding.hs
@@ -37,68 +37,68 @@ tl_ascii t   = EL.decodeASCII (EL.encodeUtf8 a) === a
 t_latin1     = E.decodeLatin1 `eq` (T.pack . BC.unpack)
 tl_latin1    = EL.decodeLatin1 `eq` (TL.pack . BLC.unpack)
 
-t_p_utf8_1   = case E.parseUtf8Chunk (B.pack [0x63]) of
+t_p_utf8_1   = case E.validateUtf8Chunk (B.pack [0x63]) of
   (result, st) -> whenEqProp result 1 . property $ isRight st
-t_p_utf8_2   = case E.parseUtf8Chunk (B.pack [0x63, 0x63, 0x63]) of
+t_p_utf8_2   = case E.validateUtf8Chunk (B.pack [0x63, 0x63, 0x63]) of
   (result, st) -> whenEqProp result 3 . property $ isRight st
-t_p_utf8_3   = case E.parseUtf8Chunk (B.pack [0x63, 0x63, 0xc2, 0x80, 0x63]) of
+t_p_utf8_3   = case E.validateUtf8Chunk (B.pack [0x63, 0x63, 0xc2, 0x80, 0x63]) of
   (result, st) -> whenEqProp result 5 . property $ isRight st
-t_p_utf8_4   = case E.parseUtf8Chunk (B.pack [0x63, 0xe1, 0x80, 0x80, 0x63]) of
+t_p_utf8_4   = case E.validateUtf8Chunk (B.pack [0x63, 0xe1, 0x80, 0x80, 0x63]) of
   (result, st) -> whenEqProp result 5 . property $ isRight st
-t_p_utf8_5   = case E.parseUtf8Chunk (B.pack [0xF0, 0x90, 0x80, 0x80, 0x63]) of
+t_p_utf8_5   = case E.validateUtf8Chunk (B.pack [0xF0, 0x90, 0x80, 0x80, 0x63]) of
   (result, st) -> whenEqProp result 5 . property $ isRight st
-t_p_utf8_6   = case E.parseUtf8Chunk (B.pack [0x63, 0x63, 0xF0, 0x90, 0x80]) of
+t_p_utf8_6   = case E.validateUtf8Chunk (B.pack [0x63, 0x63, 0xF0, 0x90, 0x80]) of
   (result, st) -> whenEqProp result 2 . property $ isRight st
-t_p_utf8_7   = case E.parseUtf8Chunk (B.pack [0x63, 0x63, 0x63, 0xF0, 0x90]) of
+t_p_utf8_7   = case E.validateUtf8Chunk (B.pack [0x63, 0x63, 0x63, 0xF0, 0x90]) of
   (result, st) -> whenEqProp result 3 . property $ isRight st
-t_p_utf8_8   = case E.parseUtf8Chunk (B.pack [0xF0, 0x90, 0x80, 0x63, 0x63]) of
+t_p_utf8_8   = case E.validateUtf8Chunk (B.pack [0xF0, 0x90, 0x80, 0x63, 0x63]) of
   (result, st) -> whenEqProp result 0 $ st === Left 3
-t_p_utf8_9   = case E.parseUtf8Chunk (B.pack [0x63, 0x63, 0x80, 0x63, 0x63]) of
+t_p_utf8_9   = case E.validateUtf8Chunk (B.pack [0x63, 0x63, 0x80, 0x63, 0x63]) of
   (result, st) -> whenEqProp result 2 $ st === Left 3
-t_p_utf8_0   = case E.parseUtf8Chunk (B.pack [0x63, 0x63, 0xe1, 0x63, 0x63]) of
+t_p_utf8_0   = case E.validateUtf8Chunk (B.pack [0x63, 0x63, 0xe1, 0x63, 0x63]) of
   (result, st) -> whenEqProp result 2 $ st === Left 3
 
-t_pn_utf8_1   = case E.parseUtf8Chunk (B.pack [0xF0, 0x90, 0x80]) of
+t_pn_utf8_1   = case E.validateUtf8Chunk (B.pack [0xF0, 0x90, 0x80]) of
   (result0, mS) -> whenEqProp result0 0 $
     case mS of
       Left _ -> counterexample (show mS) False
-      Right s -> case E.parseNextUtf8Chunk (B.pack [0x80]) s of
+      Right s -> case E.validateNextUtf8Chunk (B.pack [0x80]) s of
         (result1, mS1) -> whenEqProp result1 1 $
           if isLeft mS1
           then counterexample (show mS1) False
-          else case E.parseNextUtf8Chunk (B.pack [0x7f]) s of
+          else case E.validateNextUtf8Chunk (B.pack [0x7f]) s of
             (result2, mS2) -> whenEqProp result2 (-3) $ mS2 === Left 0
-t_pn_utf8_2   = case E.parseUtf8Chunk (B.pack [0xF0]) of
+t_pn_utf8_2   = case E.validateUtf8Chunk (B.pack [0xF0]) of
   (result0, mS0) -> whenEqProp result0 0 $
     case mS0 of
       Left _ -> counterexample (show mS0) False
-      Right s0 -> case E.parseNextUtf8Chunk (B.pack [0x7f]) s0 of
+      Right s0 -> case E.validateNextUtf8Chunk (B.pack [0x7f]) s0 of
         (result1, mS1) -> whenEqProp result1 (-1) .
           whenEqProp mS1 (Left 0) $
-          case E.parseNextUtf8Chunk (B.pack [0x90]) s0 of
+          case E.validateNextUtf8Chunk (B.pack [0x90]) s0 of
             (result2, mS2) -> whenEqProp result2 (-1) $
               case mS2 of
                 Left _ -> counterexample (show mS2) False
-                Right s1 -> case E.parseNextUtf8Chunk (B.pack [0x7f]) s1 of
+                Right s1 -> case E.validateNextUtf8Chunk (B.pack [0x7f]) s1 of
                   (result3, mS3) -> whenEqProp result3 (-2) .
                     whenEqProp mS3 (Left 0) $
-                    case E.parseNextUtf8Chunk (B.pack [0x80]) s1 of
+                    case E.validateNextUtf8Chunk (B.pack [0x80]) s1 of
                       (result4, mS4) -> whenEqProp result4 (-2) $
                         case mS4 of
                           Left _ -> counterexample (show mS3) False
-                          Right s2 -> case E.parseNextUtf8Chunk (B.pack [0x7f]) s2 of
+                          Right s2 -> case E.validateNextUtf8Chunk (B.pack [0x7f]) s2 of
                             (result5, mS5) -> whenEqProp result5 (-3) .
                               whenEqProp mS5 (Left 0) $
-                              case E.parseNextUtf8Chunk (B.pack [0x80]) s2 of
+                              case E.validateNextUtf8Chunk (B.pack [0x80]) s2 of
                                 (result6, mS6) -> whenEqProp result6 1 $
                                   property $ isRight mS6
-t_pn_utf8_3 = case E.parseUtf8Chunk $ B.pack [0xc2] of
+t_pn_utf8_3 = case E.validateUtf8Chunk $ B.pack [0xc2] of
   (len1, eS1) -> whenEqProp len1 0 $ case eS1 of
     Left _ -> counterexample (show eS1) False
     Right s1 -> whenEqProp (E.partialCodePoint s1) [B.pack [0xc2]] $
       if isUtf8StateIsComplete $ E.codePointState s1
       then counterexample (show $ E.codePointState s1) False
-      else case E.parseNextUtf8Chunk (B.pack [0x80, 0x80]) s1 of
+      else case E.validateNextUtf8Chunk (B.pack [0x80, 0x80]) s1 of
         (len2, eS2) -> whenEqProp len2 1 $ eS2 === Left 2
 
 t_utf8_c     = (E.stackToText . snd . E.decodeUtf8Chunk . E.encodeUtf8) `eq` id

From 1f18883ea000df72f0976c95aa7ae75d804cfc2e Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Tue, 25 Oct 2022 23:10:12 -0600
Subject: [PATCH 53/87] Check tests before pushing...

---
 tests/Tests/Properties/Transcoding.hs | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs
index 82419614..3b9c908f 100644
--- a/tests/Tests/Properties/Transcoding.hs
+++ b/tests/Tests/Properties/Transcoding.hs
@@ -95,9 +95,9 @@ t_pn_utf8_2   = case E.validateUtf8Chunk (B.pack [0xF0]) of
 t_pn_utf8_3 = case E.validateUtf8Chunk $ B.pack [0xc2] of
   (len1, eS1) -> whenEqProp len1 0 $ case eS1 of
     Left _ -> counterexample (show eS1) False
-    Right s1 -> whenEqProp (E.partialCodePoint s1) [B.pack [0xc2]] $
-      if isUtf8StateIsComplete $ E.codePointState s1
-      then counterexample (show $ E.codePointState s1) False
+    Right s1 -> whenEqProp (E.partialUtf8CodePoint s1) [B.pack [0xc2]] $
+      if isUtf8StateIsComplete $ E.utf8CodePointState s1
+      then counterexample (show $ E.utf8CodePointState s1) False
       else case E.validateNextUtf8Chunk (B.pack [0x80, 0x80]) s1 of
         (len2, eS2) -> whenEqProp len2 1 $ eS2 === Left 2
 
@@ -281,9 +281,9 @@ t_decode_chunk = case E.decodeUtf8Chunk $ B.pack [0xc2] of
   ((len1, eS1), tds1) -> whenEqProp (len1, E.dataStack tds1, E.stackLen tds1) (0, [], 0) $
     case eS1 of
       Left _ -> counterexample (show eS1) False
-      Right s1 -> whenEqProp (E.partialCodePoint s1) [B.pack [0xc2]] $
-        if isUtf8StateIsComplete $ E.codePointState s1
-        then counterexample (show $ E.codePointState s1) False
+      Right s1 -> whenEqProp (E.partialUtf8CodePoint s1) [B.pack [0xc2]] $
+        if isUtf8StateIsComplete $ E.utf8CodePointState s1
+        then counterexample (show $ E.utf8CodePointState s1) False
         else case E.decodeNextUtf8Chunk (B.pack [0x80, 0x80]) s1 tds1 of
           ((len2, eS2), tds2) -> whenEqProp (len2, E.dataStack tds2, E.stackLen tds2) (1, [Right $ B.pack [0x80], Right $ B.pack [0xc2]], 2) $
             case eS2 of

From 8dc84fe7263a6b0713f672db97d3e2d929e72a1a Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Mon, 31 Oct 2022 19:39:22 -0600
Subject: [PATCH 54/87] Finish documentation... for now.

---
 src/Data/Text/Encoding.hs | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index d3d0396f..711457bc 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -388,7 +388,7 @@ pushText t@(Text _ _ tLen) tds@(TextDataStack stack sLen) =
   then TextDataStack (Left t : stack) $ sLen + tLen
   else tds
 
--- | Create a 'Text' value from the contents of a stack.
+-- | Create a 'Text' value from the contents of a 'TextDataStack'.
 stackToText ::
 #if defined(ASSERTS)
   HasCallStack =>
@@ -417,6 +417,14 @@ stackToText (TextDataStack stack sLen)
   | otherwise = empty
 
 -- | Decode a 'ByteString' in the context of what has been already been decoded.
+--
+-- The 'ByteString' is validated against the 'Utf8ValidState' using the rules
+-- governing 'validateNextUtf8Chunk'. The longest valid UTF-8 prefix is added
+-- to the input 'TextDataStack' which is returned with the end position of the
+-- valid prefix, and either the resulting 'Utf8ValidState'
+-- (@Right Utf8ValidState@) or the position of the of the first (potentially)
+-- valid byte after the invalid bytes with remainder of the 'ByteString'
+-- (@Left (Int, ByteString)@).
 decodeNextUtf8Chunk ::
 #if defined(ASSERTS)
   HasCallStack =>
@@ -447,9 +455,11 @@ decodeNextUtf8Chunk bs s tds =
             Left pos -> Left (pos, B.drop pos bs)
             Right s' -> Right s'
         )
-      , stackedData')
+      , stackedData'
+      )
 
--- | Decode a 'ByteString'.
+-- | Decode a 'ByteString' against a start 'Utf8ValidState' with an empty
+-- 'TextDataStack'.
 --
 -- @decodeUtf8Chunk bs = 'decodeNextUtf8Chunk' bs 'startUtf8ValidState' 'emptyStack'@
 decodeUtf8Chunk :: ByteString -> ((Int, Either (Int, ByteString) Utf8ValidState), TextDataStack)

From 37d19dee870067541cf60db7b387a85c4d9d9728 Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Sun, 13 Nov 2022 22:40:54 -0700
Subject: [PATCH 55/87] minor changes

---
 src/Data/Text/Encoding.hs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index 711457bc..0af27b8d 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -193,7 +193,7 @@ decodeAsciiPrefix bs = if B.null bs
 decodeASCII :: ByteString -> Text
 decodeASCII bs =
   case decodeAsciiPrefix bs of
-    (_, Just errPos) -> error $ "decodeASCII: detected non-ASCII codepoint at " ++ show errPos
+    (_, Just (word, errPos)) -> error $ "decodeASCII: detected non-ASCII codepoint " ++ show word ++ " at position " ++ show errPos
     (t, Nothing) -> t
 
 -- | Decode a 'ByteString' containing Latin-1 (aka ISO-8859-1) encoded text.
@@ -306,7 +306,7 @@ validateUtf8Chunk bs@(B.length -> len)
   = validateUtf8 0 0 utf8StartState
     where
 #endif
-      validateUtf8 ndx0 ndx s
+      validateUtf8 !ndx0 ndx s
         | ndx < len =
           let ndx' = ndx + 1 in
           case updateUtf8State (B.index bs ndx) s of

From 1e3db5b546b133b8829f32ad785def66ae25922a Mon Sep 17 00:00:00 2001
From: Sledge <sledged@gmail.com>
Date: Fri, 30 Dec 2022 10:18:36 -0700
Subject: [PATCH 56/87] Update src/Data/Text/Encoding.hs

typo

Co-authored-by: Xia Li-yao <Lysxia@users.noreply.github.com>
---
 src/Data/Text/Encoding.hs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index 0af27b8d..bc9778c3 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -338,7 +338,7 @@ validateUtf8Chunk bs@(B.length -> len)
 --       (potentially valid) code point begins, so that @p - n@ is the
 --       number of invalid bytes, or
 --     * (@Right s'@) is valid, and all of the remaining bytes starting
---       at inbex 'n' are the beginning of an incomplete UTF-8 code
+--       at index 'n' are the beginning of an incomplete UTF-8 code
 --       point, and `s'` is the resulting 'Utf8ValidState' value, which
 --       can be used to validate against a following 'ByteString'.
 validateNextUtf8Chunk ::

From 90a0c5bca12aaadf93764c7a28aefd9044444ae5 Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Sat, 14 Jan 2023 11:11:24 -0700
Subject: [PATCH 57/87] encode partial codepoint in Word32

---
 changelog.md                          |   2 +
 src/Data/Text/Encoding.hs             | 157 ++++++++++++++++----------
 tests/Tests/Properties/Transcoding.hs |  47 +++++++-
 3 files changed, 144 insertions(+), 62 deletions(-)

diff --git a/changelog.md b/changelog.md
index 752c7fbb..9ea8beef 100644
--- a/changelog.md
+++ b/changelog.md
@@ -16,6 +16,8 @@
 * Added functions to validate `ByteString`s that represent encoded text:
   * `Utf8ValidState`
   * `partialUtf8CodePoint`
+  * `partUtf8CPLen`
+  * `wordAtPartUft8CP`
   * `utf8CodePointState`
   * `validateUtf8Chunk`
   * `validateNextUtf8Chunk`
diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index bc9778c3..69a10658 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -27,6 +27,8 @@ module Data.Text.Encoding
     -- $validation
       Utf8ValidState
     , partialUtf8CodePoint
+    , partUtf8CPLen
+    , wordAtPartUft8CP
     , utf8CodePointState
     , validateUtf8Chunk
     , validateNextUtf8Chunk
@@ -89,34 +91,35 @@ module Data.Text.Encoding
     , encodeUtf8BuilderEscaped
     ) where
 
-import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
-
 import Control.Exception (evaluate, try)
-import Control.Monad.ST (runST)
-import Data.Bits (shiftR, (.&.))
+import Control.Monad.ST (ST, runST)
+import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
+import Data.Bifunctor (Bifunctor(first))
+import Data.Bits (shiftL, shiftR, (.|.), (.&.))
 import Data.ByteString (ByteString)
-import qualified Data.ByteString as B
-import qualified Data.ByteString.Internal as B
+import Data.Maybe (fromJust)
 import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode)
 import Data.Text.Internal (Text(..), empty)
 import Data.Text.Internal.Unsafe (unsafeWithForeignPtr)
 import Data.Text.Show as T (singleton)
 import Data.Text.Unsafe (unsafeDupablePerformIO)
-import Data.Word (Word8)
+import Data.Word (Word32, Word8)
 import Foreign.C.Types (CSize(..))
 import Foreign.Ptr (Ptr, minusPtr, plusPtr)
 import Foreign.Storable (poke, peekByteOff)
 import GHC.Exts (byteArrayContents#, unsafeCoerce#)
 import GHC.ForeignPtr (ForeignPtr(..), ForeignPtrContents(PlainPtr))
+import Data.Text.Internal.ByteStringCompat (withBS)
+import Data.Text.Internal.Encoding.Utf8 (Utf8CodePointState, utf8StartState, updateUtf8State, isUtf8StateIsComplete)
+import qualified Data.ByteString as B
+import qualified Data.ByteString.Internal as B
 import qualified Data.ByteString.Builder as B
 import qualified Data.ByteString.Builder.Internal as B hiding (empty, append)
 import qualified Data.ByteString.Builder.Prim as BP
 import qualified Data.ByteString.Builder.Prim.Internal as BP
-import Data.Text.Internal.Encoding.Utf8 (Utf8CodePointState, utf8StartState, updateUtf8State, isUtf8StateIsComplete)
 import qualified Data.Text.Array as A
 import qualified Data.Text.Internal.Encoding.Fusion as E
 import qualified Data.Text.Internal.Fusion as F
-import Data.Text.Internal.ByteStringCompat
 #if defined(ASSERTS)
 import GHC.Stack (HasCallStack)
 #endif
@@ -224,7 +227,6 @@ decodeLatin1 bs = withBS bs $ \fp len -> runST $ do
           unsafeIOToST $ unsafeWithForeignPtr fp $ \src ->
             unsafeSTToIO $ A.copyFromPointer dst dstOff (src `plusPtr` srcOff) asciiPrefixLen
           inner (srcOff + asciiPrefixLen) (dstOff + asciiPrefixLen)
-
   actualLen <- inner 0 0
   dst' <- A.resizeM dst actualLen
   arr <- A.unsafeFreeze dst'
@@ -239,17 +241,34 @@ foreign import ccall unsafe "_hs_text_is_ascii" c_is_ascii
 -- the 1 to 3 end bytes are that make up the incomplete code point.
 data Utf8ValidState = Utf8ValidState
   { -- | Get the incomplete UTF-8 code point of the 'ByteString's that
-    -- have been validated thus far.
-    partialUtf8CodePoint :: [ByteString]
+    -- have been validated thus far. The first byte of the 'Word32'
+    -- indicates the number of bytes of the code point are available,
+    -- and is followed by the bytes of the code point.
+    partialUtf8CodePoint :: Word32
     -- | Get the current UTF-8 code point state of the 'ByteString's
     -- that have been validated thus far.
   , utf8CodePointState :: Utf8CodePointState
   }
   deriving (Eq, Ord, Show)
 
+partUtf8CPLen :: Word32 -> Int
+partUtf8CPLen partCP = fromIntegral $ partCP `shiftR` 24
+
+wordAtPartUft8CP :: Int -> Word32 -> Maybe Word8
+wordAtPartUft8CP n partCP
+  | n < partUtf8CPLen partCP && n >= 0 =
+    Just . fromIntegral $ partCP `shiftR` (16 - 8 * n)
+  | otherwise = Nothing
+
 -- | This represtents the starting state of a UTF-8 validation check.
 startUtf8ValidState :: Utf8ValidState 
-startUtf8ValidState = Utf8ValidState [] utf8StartState
+startUtf8ValidState = Utf8ValidState 0 utf8StartState
+
+appendPartialCodePoint :: Word32 -> ByteString -> Word32
+appendPartialCodePoint cp bs@(B.length -> len) =
+  fst $ B.foldl (\ (cp', bsc) word ->
+      (cp' .|. (fromIntegral word `shiftL` (16 - bsc * 8)), bsc + 1)
+    ) (cp + (fromIntegral len `shiftL` 24), partUtf8CPLen cp) bs
 
 #ifdef SIMDUTF
 foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8
@@ -317,7 +336,7 @@ validateUtf8Chunk bs@(B.length -> len)
                 else ndx0
               ) ndx' s'
             Nothing -> (ndx0, Left $ if ndx == ndx0 then ndx' else ndx)
-        | otherwise = (ndx0, Right $ Utf8ValidState (if ndx0 < len then [B.drop ndx0 bs] else []) s)
+        | otherwise = (ndx0, Right $ Utf8ValidState (appendPartialCodePoint 0 $ B.drop ndx0 bs) s)
 
 -- | Validate a 'ByteString' as a contiuation of UTF-8-encoded text.
 --
@@ -353,7 +372,7 @@ validateNextUtf8Chunk bs@(B.length -> len) st@(Utf8ValidState lead s)
           | isUtf8StateIsComplete s' =
             -- found the beginning of the next code point, hand this off to someone else
             case validateUtf8Chunk $ B.drop pos bs of
-              (len', mS) -> (pos + len', case mS of Left p -> Left (p + pos); _ -> mS)
+              (len', mS) -> (pos + len', first (+ pos) mS)
           -- code point is not complete yet
           -- walk the rest of the code point until error, complete, or no more data
           | pos < len =
@@ -363,15 +382,15 @@ validateNextUtf8Chunk bs@(B.length -> len) st@(Utf8ValidState lead s)
               -- keep going
               Just s'' -> g (pos + 1) s''
           -- no more data
-          | otherwise = (leadPos, Right $ Utf8ValidState (lead ++ [bs]) s')
+          | otherwise = (leadPos, Right $ Utf8ValidState (appendPartialCodePoint lead bs) s')
     in g 0 s
   | otherwise = (leadPos, Right st)
-    where leadPos = -(foldr (\ bs' len' -> len' + B.length bs') 0 lead)
+    where leadPos = -(partUtf8CPLen lead)
 
 -- | Validated UTF-8 data to be converted into a 'Text' value.
 data TextDataStack = TextDataStack
   { -- | Returns a list of 'Text' and UTF-8-valid 'ByteString' values.
-    dataStack :: [Either Text ByteString]
+    dataStack :: [Either Text (Either Word32 ByteString)]
     -- | Returns total number of UTF-8 valid bytes in the stack.
   , stackLen :: Int
   }
@@ -388,6 +407,35 @@ pushText t@(Text _ _ tLen) tds@(TextDataStack stack sLen) =
   then TextDataStack (Left t : stack) $ sLen + tLen
   else tds
 
+copyFromStack :: [Either Text (Either Word32 ByteString)] -> Int -> A.MArray s -> ST s ()
+copyFromStack (dat : dataStack') tLen dst =
+  (case dat of
+    Left (Text arr0 off utf8Len) -> do
+      let dstOff = tLen - utf8Len
+      A.copyI utf8Len dst dstOff arr0 off
+      pure dstOff
+    Right encoded ->
+      case encoded of
+        Left partial ->
+          let utf8Len = partUtf8CPLen partial
+              dstOff = tLen - utf8Len
+              g dstOff' =
+                case wordAtPartUft8CP (dstOff' - dstOff) partial of
+                  Just w -> do
+                    A.unsafeWrite dst dstOff' w
+                    g $ dstOff' + 1
+                  Nothing -> pure dstOff
+          in
+          g dstOff
+        Right bs@(B.length -> utf8Len) -> do
+          let dstOff = tLen - utf8Len
+          withBS bs $ \ fp _ ->
+            unsafeIOToST . unsafeWithForeignPtr fp $ \ src ->
+              unsafeSTToIO $ A.copyFromPointer dst dstOff src utf8Len
+          pure dstOff) >>= (\ tLen' -> copyFromStack dataStack' tLen' dst)
+copyFromStack _ _ _ = pure ()
+{-# INLINE copyFromStack #-}
+
 -- | Create a 'Text' value from the contents of a 'TextDataStack'.
 stackToText ::
 #if defined(ASSERTS)
@@ -398,20 +446,7 @@ stackToText (TextDataStack stack sLen)
   | sLen > 0 = runST $
     do
       dst <- A.new sLen
-      let g (dat : dataStack') tLen' =
-              (case dat of
-                Left (Text arr0 off utf8Len) -> do
-                  let dstOff = tLen' - utf8Len
-                  A.copyI utf8Len dst dstOff arr0 off
-                  pure dstOff
-                Right bs@(B.length -> utf8Len) -> do
-                  let dstOff = tLen' - utf8Len
-                  withBS bs $ \ fp _ ->
-                    unsafeIOToST . unsafeWithForeignPtr fp $ \ src ->
-                      unsafeSTToIO $ A.copyFromPointer dst dstOff src utf8Len
-                  pure dstOff) >>= g dataStack'
-          g _ _ = pure ()
-      g stack sLen
+      copyFromStack stack sLen dst
       arr <- A.unsafeFreeze dst
       pure $ Text arr 0 sLen
   | otherwise = empty
@@ -433,20 +468,20 @@ decodeNextUtf8Chunk ::
   -> Utf8ValidState
   -> TextDataStack
   -> ((Int, Either (Int, ByteString) Utf8ValidState), TextDataStack)
-decodeNextUtf8Chunk bs s tds =
+decodeNextUtf8Chunk bs s tds@(TextDataStack stack sLen) =
   case validateNextUtf8Chunk bs s of
     (len, res) ->
       let stackedData'
             | len >= 0 =
-              let stackedData@(TextDataStack stack' sLen') =
-                    foldl (\ tds'@(TextDataStack stack sLen) bs'@(B.length -> bLen) ->
-                      if bLen > 0
-                      then TextDataStack (Right bs' : stack) $ sLen + bLen
-                      else tds'
-                      ) tds $ partialUtf8CodePoint s
+              let partCP = partialUtf8CodePoint s
+                  partLen = partUtf8CPLen partCP
+                  stackedData@(TextDataStack stack' sLen') =
+                    if partLen > 0
+                    then TextDataStack (Right (Left partCP) : stack) $ sLen + partLen
+                    else tds
               in
               if len > 0
-              then TextDataStack (Right (B.take len bs) : stack') $ sLen' + len
+              then TextDataStack (Right (Right $ B.take len bs) : stack') $ sLen' + len
               else stackedData
             | otherwise = tds
       in
@@ -468,7 +503,7 @@ decodeUtf8Chunk bs = decodeNextUtf8Chunk bs startUtf8ValidState emptyStack
 -- | Call an error handler with the give 'String' message for each byte
 -- in given 'ByteString' and lead data in the given 'Utf8ValidState'
 -- value. The bytes are the positions from 'errStart' (inclusive) to
--- 'errEnd' (exclusive). Any substite characters are pushed onto the
+-- 'errEnd' (exclusive). Any substitute characters are pushed onto the
 -- supplied 'TextDataStack' argument.
 handleUtf8Err
   :: OnDecodeError
@@ -480,23 +515,27 @@ handleUtf8Err
   -> TextDataStack
   -> TextDataStack
 handleUtf8Err onErr errMsg errStart errEnd s bs tds =
-  let h errPos errEndPos bss tds'
-        | errPos < errEndPos =
-          let errPos' = errPos + 1 in
-          case bss of
-            bs'@(B.length -> len) : bss' ->
-              ( if errPos' < len
-                then h errPos' errEndPos bss
-                else h 0 (errEndPos - len) bss'
-              ) $ case onErr errMsg . Just $ B.index bs' errPos of
-                Just c -> pushText (T.singleton c) tds'
-                Nothing -> tds'
-            [] -> tds'
+  let h errPos tds'
+        | errPos < errEnd =
+          h (errPos + 1) $
+            case onErr errMsg . Just $ B.index bs errPos of
+              Just c -> pushText (T.singleton c) tds'
+              Nothing -> tds'
         | otherwise = tds'
   in
   ( if errStart < 0
-    then h 0 (errEnd - errStart) $ partialUtf8CodePoint s ++ [B.take errEnd bs]
-    else h errStart errEnd [bs]
+    then
+      let partCP = partialUtf8CodePoint s
+          g pNdx tds' =
+            case wordAtPartUft8CP pNdx partCP of
+              Nothing -> h (partUtf8CPLen partCP + errStart) tds'
+              mW -> g (pNdx + 1) $
+                case onErr errMsg mW of
+                  Just c -> pushText (T.singleton c) tds'
+                  Nothing -> tds'
+      in
+      g 0
+    else h errStart
   ) tds
 
 invalidUtf8Msg :: String
@@ -613,13 +652,17 @@ streamDecodeUtf8With ::
 #endif
   OnDecodeError -> ByteString -> Decoding
 streamDecodeUtf8With onErr bs =
-  let g bs' s tds =
+  let h w32 n ws =
+        if n >= 0
+        then h w32 (n - 1) $ fromJust (wordAtPartUft8CP n w32) : ws
+        else ws
+      g bs' s tds =
         case decodeNextUtf8Chunk bs' s tds of
           ((len, eS), tds') ->
             case eS of
               Left (pos, bs'') -> g bs'' startUtf8ValidState $ handleUtf8Err onErr invalidUtf8Msg len pos s bs' tds'
               Right s' -> let bss' = partialUtf8CodePoint s' in
-                Some (stackToText tds') (B.concat bss') $ \ bs'' ->
+                Some (stackToText tds') (B.pack $ h bss' (partUtf8CPLen bss' - 1) []) $ \ bs'' ->
                   g bs'' s' emptyStack
   in
   g bs startUtf8ValidState emptyStack
diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs
index 9f45638a..5585803e 100644
--- a/tests/Tests/Properties/Transcoding.hs
+++ b/tests/Tests/Properties/Transcoding.hs
@@ -2,10 +2,12 @@
 
 {-# LANGUAGE OverloadedStrings, ScopedTypeVariables #-}
 {-# OPTIONS_GHC -fno-warn-missing-signatures #-}
+{-# LANGUAGE BangPatterns #-}
 module Tests.Properties.Transcoding
     ( testTranscoding
     ) where
 
+-- import Debug.Trace (trace)
 import Prelude hiding (head, tail)
 import Data.Bits ((.&.), shiftR)
 import Data.Char (chr, ord)
@@ -29,6 +31,8 @@ import qualified Data.Text.Encoding.Error as E
 import Data.Text.Internal.Encoding.Utf8 (isUtf8StateIsComplete)
 import qualified Data.Text.Lazy as TL
 import qualified Data.Text.Lazy.Encoding as EL
+-- import qualified Data.Text.Internal.Lazy as BL
+-- import qualified Data.ByteString.Lazy.Internal as BS
 
 t_ascii t    = E.decodeASCII (E.encodeUtf8 a) === a
     where a  = T.map (\c -> chr (ord c `mod` 128)) t
@@ -96,7 +100,7 @@ t_pn_utf8_2   = case E.validateUtf8Chunk (B.pack [0xF0]) of
 t_pn_utf8_3 = case E.validateUtf8Chunk $ B.pack [0xc2] of
   (len1, eS1) -> whenEqProp len1 0 $ case eS1 of
     Left _ -> counterexample (show eS1) False
-    Right s1 -> whenEqProp (E.partialUtf8CodePoint s1) [B.pack [0xc2]] $
+    Right s1 -> whenEqProp (E.partialUtf8CodePoint s1) 0x01c20000 $
       if isUtf8StateIsComplete $ E.utf8CodePointState s1
       then counterexample (show $ E.utf8CodePointState s1) False
       else case E.validateNextUtf8Chunk (B.pack [0x80, 0x80]) s1 of
@@ -245,6 +249,11 @@ t_decode_utf8_lenient :: Property
 t_decode_utf8_lenient = forAllShrinkShow arbitrary shrink (show . BL.toChunks) $ \bs ->
     decodeLL bs === (TL.fromStrict . decodeL . B.concat . BL.toChunks) bs
 
+-- t_decode_utf8_lenient =
+--   let !res = trace "HEREHEREHERE" $
+--         EL.decodeUtf8With E.lenientDecode (BS.Chunk (B.pack [0xe1]) (BS.Chunk (B.pack [0xa0]) BS.Empty)) === TL.fromStrict (T.pack "\xFFFD\xFFFD") in
+--   trace "THERETHERETHERE" res
+
 -- See http://unicode.org/faq/utf_bom.html#gen8
 -- A sequence such as <110xxxxx2 0xxxxxxx2> is illegal ...
 -- When faced with this illegal byte sequence ... a UTF-8 conformant process
@@ -278,19 +287,46 @@ t_decode_with_error5' = ioProperty $ do
     Left (_ :: E.UnicodeException) -> True
     Right{} -> False
 
-t_decode_chunk = case E.decodeUtf8Chunk $ B.pack [0xc2] of
+t_decode_chunk1 = case E.decodeUtf8Chunk $ B.pack [0xc2] of
   ((len1, eS1), tds1) -> whenEqProp (len1, E.dataStack tds1, E.stackLen tds1) (0, [], 0) $
     case eS1 of
       Left _ -> counterexample (show eS1) False
-      Right s1 -> whenEqProp (E.partialUtf8CodePoint s1) [B.pack [0xc2]] $
+      Right s1 -> let partCP = E.partialUtf8CodePoint s1 in
+        whenEqProp partCP 0x01c20000 .
+        whenEqProp (E.partUtf8CPLen partCP) 1 .
+        whenEqProp (E.wordAtPartUft8CP 0 partCP) (Just 0xc2) .
+        whenEqProp (E.wordAtPartUft8CP 1 partCP) Nothing $
         if isUtf8StateIsComplete $ E.utf8CodePointState s1
         then counterexample (show $ E.utf8CodePointState s1) False
         else case E.decodeNextUtf8Chunk (B.pack [0x80, 0x80]) s1 tds1 of
-          ((len2, eS2), tds2) -> whenEqProp (len2, E.dataStack tds2, E.stackLen tds2) (1, [Right $ B.pack [0x80], Right $ B.pack [0xc2]], 2) $
+          ((len2, eS2), tds2) -> whenEqProp (len2, E.dataStack tds2, E.stackLen tds2) (1, [Right $ Right $ B.pack [0x80], Right $ Left 0x1c20000], 2) $
             case eS2 of
               Right _ -> counterexample (show eS2) False
               Left res -> res === (2, mempty)
 
+t_decode_chunk2 = case E.decodeUtf8Chunk $ B.pack [0xf0] of
+  ((len1, eS1), tds1) -> whenEqProp (len1, E.dataStack tds1, E.stackLen tds1) (0, [], 0) $
+    case eS1 of
+      Left _ -> counterexample (show eS1) False
+      Right s1 -> let partCP = E.partialUtf8CodePoint s1 in
+        whenEqProp partCP 0x01f00000 .
+        whenEqProp (E.partUtf8CPLen partCP) 1 .
+        whenEqProp (E.wordAtPartUft8CP 0 partCP) (Just 0xf0) .
+        whenEqProp (E.wordAtPartUft8CP 1 partCP) Nothing $
+        if isUtf8StateIsComplete $ E.utf8CodePointState s1
+        then counterexample (show $ E.utf8CodePointState s1) False
+        else case E.decodeNextUtf8Chunk (B.pack [0x90, 0x80]) s1 tds1 of
+          ((len2, eS2), tds2) -> whenEqProp (len2, E.dataStack tds2, E.stackLen tds2) (-1, [], 0) $
+            case eS2 of
+              Left _ -> counterexample (show eS2) False
+              Right s2 -> let partCP2 = E.partialUtf8CodePoint s2 in
+                whenEqProp partCP2 0x03f09080 .
+                whenEqProp (E.partUtf8CPLen partCP2) 3 .
+                whenEqProp (E.wordAtPartUft8CP 0 partCP2) (Just 0xf0) .
+                whenEqProp (E.wordAtPartUft8CP 1 partCP2) (Just 0x90) .
+                whenEqProp (E.wordAtPartUft8CP 2 partCP2) (Just 0x80) $
+                E.wordAtPartUft8CP 3 partCP2 === Nothing
+
 t_infix_concat bs1 text bs2 =
   forAll (Blind <$> genDecodeErr Replace) $ \(Blind onErr) ->
   text `T.isInfixOf`
@@ -352,7 +388,8 @@ testTranscoding =
       testProperty "t_decode_with_error3'" t_decode_with_error3',
       testProperty "t_decode_with_error4'" t_decode_with_error4',
       testProperty "t_decode_with_error5'" t_decode_with_error5',
-      testProperty "t_decode_chunk" t_decode_chunk,
+      testProperty "t_decode_chunk1" t_decode_chunk1,
+      testProperty "t_decode_chunk2" t_decode_chunk2,
       testProperty "t_infix_concat" t_infix_concat
     ]
   ]

From 2e4988e801cad3cebc34a8fe103eab7ee6f4a1cd Mon Sep 17 00:00:00 2001
From: Andrew Martin <andrew.thaddeus@gmail.com>
Date: Tue, 3 Jan 2023 11:58:43 -0500
Subject: [PATCH 58/87] Rebase against master

---
 .drone.yml                                 | 45 ----------------------
 .github/workflows/emulated.yml             |  2 +-
 .github/workflows/haskell-ci.yml           |  5 ---
 .github/workflows/i386.yml                 | 26 +++++++++++++
 cabal.haskell-ci                           |  2 +-
 changelog.md                               |  4 ++
 src/Data/Text/Internal/ByteStringCompat.hs | 28 --------------
 tests/Tests/Properties/LowLevel.hs         |  3 --
 text.cabal                                 |  9 ++---
 9 files changed, 36 insertions(+), 88 deletions(-)
 delete mode 100644 .drone.yml
 create mode 100644 .github/workflows/i386.yml

diff --git a/.drone.yml b/.drone.yml
deleted file mode 100644
index 633e4dd9..00000000
--- a/.drone.yml
+++ /dev/null
@@ -1,45 +0,0 @@
-kind: pipeline
-name: arm64-ghc9.0
-platform: { os: linux, arch: arm64 }
-steps:
-- name: Test
-  image: haskell:9.0
-  commands:
-    - uname -a # check platform
-    - getconf LONG_BIT # check bitness
-    - cabal update
-    - cabal new-test
----
-kind: pipeline
-name: arm64-ghc9.2
-platform: { os: linux, arch: arm64 }
-steps:
-- name: Test
-  image: haskell:9.2
-  commands:
-    - uname -a # check platform
-    - getconf LONG_BIT # check bitness
-    - cabal update
-    - cabal new-test
----
-kind: pipeline
-name: i386
-platform: { os: linux, arch: amd64 }
-steps:
-- name: Test
-  image: i386/ubuntu
-  commands:
-    - export LC_ALL=C.UTF-8
-    - apt-get update -y
-    - apt-get install -y ghc cabal-install zlib1g-dev libgmp-dev build-essential curl
-    # Build and install a modern cabal, because 1.24
-    # does not support cxx-sources or cxx-options.
-    - cabal --version # 1.24
-    - cabal update
-    - cabal install cabal-install
-    - export PATH="$HOME/.cabal/bin:$PATH"
-    - hash -r
-    # Now with a modern cabal
-    - cabal --version # 3.0
-    - cabal new-update
-    - cabal new-test
diff --git a/.github/workflows/emulated.yml b/.github/workflows/emulated.yml
index 29ba6253..9dae4a25 100644
--- a/.github/workflows/emulated.yml
+++ b/.github/workflows/emulated.yml
@@ -19,7 +19,7 @@ jobs:
     strategy:
       fail-fast: true
       matrix:
-        arch: ['s390x', 'ppc64le']
+        arch: ['s390x', 'ppc64le', 'armv7', 'aarch64']
     steps:
     - uses: actions/checkout@v2
     - uses: uraimo/run-on-arch-action@v2.1.1
diff --git a/.github/workflows/haskell-ci.yml b/.github/workflows/haskell-ci.yml
index 576842ad..c23ba1d3 100644
--- a/.github/workflows/haskell-ci.yml
+++ b/.github/workflows/haskell-ci.yml
@@ -68,11 +68,6 @@ jobs:
             compilerVersion: 8.2.2
             setup-method: ghcup
             allow-failure: false
-          - compiler: ghc-8.0.2
-            compilerKind: ghc
-            compilerVersion: 8.0.2
-            setup-method: ghcup
-            allow-failure: false
       fail-fast: false
     steps:
       - name: apt
diff --git a/.github/workflows/i386.yml b/.github/workflows/i386.yml
new file mode 100644
index 00000000..5ece5ea2
--- /dev/null
+++ b/.github/workflows/i386.yml
@@ -0,0 +1,26 @@
+name: ci-i386
+on:
+  - push
+  - pull_request
+
+defaults:
+  run:
+    shell: bash
+
+jobs:
+  i386:
+    runs-on: ubuntu-latest
+    container:
+      image: i386/ubuntu:bionic
+    steps:
+    - name: Install
+      run: |
+        apt-get update -y
+        apt-get install -y autoconf build-essential zlib1g-dev libgmp-dev curl libncurses5 libtinfo5 libncurses5-dev libtinfo-dev
+        curl --proto '=https' --tlsv1.2 -sSf https://get-ghcup.haskell.org | BOOTSTRAP_HASKELL_NONINTERACTIVE=1 sh
+    - uses: actions/checkout@v1
+    - name: Test
+      run: |
+        source ~/.ghcup/env
+        cabal update
+        cabal test
diff --git a/cabal.haskell-ci b/cabal.haskell-ci
index 9d605ba8..f7425e79 100644
--- a/cabal.haskell-ci
+++ b/cabal.haskell-ci
@@ -1,4 +1,4 @@
-ghcup-jobs: >=8.0
+ghcup-jobs: >=8.2
 docspec: True
 docspec-options: --timeout 2
 haddock: >=8.6
diff --git a/changelog.md b/changelog.md
index 9ea8beef..9bba57f1 100644
--- a/changelog.md
+++ b/changelog.md
@@ -1,3 +1,7 @@
+### Unreleased
+
+* Remove support for GHC 8.0.
+
 ### 2.0.2
 
 * A suite of functions have been added in `Data.Text.Encoding` that
diff --git a/src/Data/Text/Internal/ByteStringCompat.hs b/src/Data/Text/Internal/ByteStringCompat.hs
index 4f8b3949..c5f729b6 100644
--- a/src/Data/Text/Internal/ByteStringCompat.hs
+++ b/src/Data/Text/Internal/ByteStringCompat.hs
@@ -8,13 +8,7 @@ import Data.Word (Word8)
 import Foreign.ForeignPtr (ForeignPtr)
 
 #if !MIN_VERSION_bytestring(0,11,0)
-#if MIN_VERSION_base(4,10,0)
 import GHC.ForeignPtr (plusForeignPtr)
-#else
-import GHC.ForeignPtr (ForeignPtr(ForeignPtr))
-import GHC.Types (Int (..))
-import GHC.Prim (plusAddr#)
-#endif
 #endif
 
 mkBS :: ForeignPtr Word8 -> Int -> ByteString
@@ -32,25 +26,3 @@ withBS (BS !sfp !slen)       kont = kont sfp slen
 withBS (PS !sfp !soff !slen) kont = kont (plusForeignPtr sfp soff) slen
 #endif
 {-# INLINE withBS #-}
-
-#if !MIN_VERSION_bytestring(0,11,0)
-#if !MIN_VERSION_base(4,10,0)
--- |Advances the given address by the given offset in bytes.
---
--- The new 'ForeignPtr' shares the finalizer of the original,
--- equivalent from a finalization standpoint to just creating another
--- reference to the original. That is, the finalizer will not be
--- called before the new 'ForeignPtr' is unreachable, nor will it be
--- called an additional time due to this call, and the finalizer will
--- be called with the same address that it would have had this call
--- not happened, *not* the new address.
-plusForeignPtr :: ForeignPtr a -> Int -> ForeignPtr b
-plusForeignPtr (ForeignPtr addr guts) (I# offset) = ForeignPtr (plusAddr# addr offset) guts
-{-# INLINE [0] plusForeignPtr #-}
-{-# RULES
-"ByteString plusForeignPtr/0" forall fp .
-   plusForeignPtr fp 0 = fp
- #-}
-#endif
-#endif
-
diff --git a/tests/Tests/Properties/LowLevel.hs b/tests/Tests/Properties/LowLevel.hs
index 92c4e209..c3b0a605 100644
--- a/tests/Tests/Properties/LowLevel.hs
+++ b/tests/Tests/Properties/LowLevel.hs
@@ -132,10 +132,7 @@ testLowLevel =
         [ (`hasNoTypes` [''Char, ''[]])
         , (`doesNotUseAnyOf` ['T.pack, 'S.unstream, 'T.map, 'safe, 'S.streamList])
         , (`doesNotUseAnyOf` ['GHC.unpackCString#, 'GHC.unpackCStringUtf8#])
-#if MIN_VERSION_base(4,10,0)
-        -- skip this test for GHC 8.0
         , (`doesNotUseAnyOf` ['T.unpackCString#, 'T.unpackCStringAscii#])
-#endif
         ]
         't_literal_foo)
 #endif
diff --git a/text.cabal b/text.cabal
index 90eca24b..d3176579 100644
--- a/text.cabal
+++ b/text.cabal
@@ -46,7 +46,6 @@ copyright:      2009-2011 Bryan O'Sullivan, 2008-2009 Tom Harper, 2021 Andrew Le
 category:       Data, Text
 build-type:     Simple
 tested-with:
-    GHC == 8.0.2
     GHC == 8.2.2
     GHC == 8.4.4
     GHC == 8.6.5
@@ -117,7 +116,7 @@ library
 
   -- Certain version of GHC crash on Windows, when TemplateHaskell encounters C++.
   -- https://gitlab.haskell.org/ghc/ghc/-/issues/19417
-  if flag(simdutf) && os(windows) && impl(ghc == 8.0.1 || >= 8.8 && < 8.10.5 || == 9.0.1)
+  if flag(simdutf) && os(windows) && impl(ghc >= 8.8 && < 8.10.5 || == 9.0.1)
     build-depends: base < 0
 
   -- For GHC 8.2, 8.6.3 and 8.10.1 even TH + C crash Windows linker.
@@ -187,7 +186,7 @@ library
 
   build-depends:
     array            >= 0.3 && < 0.6,
-    base             >= 4.9 && < 5,
+    base             >= 4.10 && < 5,
     binary           >= 0.5 && < 0.9,
     bytestring       >= 0.10.4 && < 0.12,
     deepseq          >= 1.1 && < 1.5,
@@ -266,9 +265,9 @@ test-suite tests
     transformers,
     text
 
-  -- Plugin infrastructure does not work properly in 8.0.1 and 8.6.1, and
+  -- Plugin infrastructure does not work properly in 8.6.1, and
   -- ghc-9.2.1 library depends on parsec, which causes a circular dependency.
-  if impl(ghc >= 8.0.2 && < 8.6 || >= 8.6.2 && < 9.2 || >= 9.2.2)
+  if impl(ghc >= 8.2.1 && < 8.6 || >= 8.6.2 && < 9.2 || >= 9.2.2)
     build-depends: tasty-inspection-testing
 
   default-language: Haskell2010

From 6e51fddad9f3206a507a0d0de8c8a1d175fcde13 Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Wed, 10 Aug 2022 12:44:51 -0600
Subject: [PATCH 59/87] move test utility function whenEqProp

---
 tests/Tests/QuickCheckUtils.hs | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tests/Tests/QuickCheckUtils.hs b/tests/Tests/QuickCheckUtils.hs
index 9834e92c..8dc24ffd 100644
--- a/tests/Tests/QuickCheckUtils.hs
+++ b/tests/Tests/QuickCheckUtils.hs
@@ -27,6 +27,10 @@ module Tests.QuickCheckUtils
     , eqPSqrt
 
     , write_read
+<<<<<<< HEAD
+=======
+
+>>>>>>> 06137ca (move test utility function whenEqProp)
     , whenEqProp
     ) where
 
@@ -286,7 +290,7 @@ newtype SkewedBool = Skewed { getSkewed :: Bool }
 instance Arbitrary SkewedBool where
   arbitrary = Skewed <$> frequency [(1, pure False), (5, pure True)]
 
--- like 'when' but with 'Property' instead of a monad.
+-- like Control.Monad.when, but with properties instead of monad values
 whenEqProp :: (Eq a, Show a) => a -> a -> Property -> Property
 whenEqProp a b next = if a == b
   then next

From 47b4dca24dbfb6a7a27b8d4a15876556da397b5a Mon Sep 17 00:00:00 2001
From: david-sledge <sledged@gmail.com>
Date: Sat, 14 Jan 2023 12:03:16 -0700
Subject: [PATCH 60/87] remove sponge left in patient

---
 tests/Tests/QuickCheckUtils.hs | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/tests/Tests/QuickCheckUtils.hs b/tests/Tests/QuickCheckUtils.hs
index 8dc24ffd..58ad3c47 100644
--- a/tests/Tests/QuickCheckUtils.hs
+++ b/tests/Tests/QuickCheckUtils.hs
@@ -27,10 +27,7 @@ module Tests.QuickCheckUtils
     , eqPSqrt
 
     , write_read
-<<<<<<< HEAD
-=======
 
->>>>>>> 06137ca (move test utility function whenEqProp)
     , whenEqProp
     ) where
 

From 9fb6205c185b1b79720be31548af3cda6b67ffd3 Mon Sep 17 00:00:00 2001
From: Li-yao Xia <lysxia@gmail.com>
Date: Mon, 30 Jan 2023 20:16:04 +0000
Subject: [PATCH 61/87] Refactor decodeASCII

---
 changelog.md              |  2 +-
 src/Data/Text/Encoding.hs | 69 +++++++++++++++++++++------------------
 2 files changed, 38 insertions(+), 33 deletions(-)

diff --git a/changelog.md b/changelog.md
index 9bba57f1..136d89a3 100644
--- a/changelog.md
+++ b/changelog.md
@@ -7,7 +7,7 @@
 * A suite of functions have been added in `Data.Text.Encoding` that
   allow decoding to be aborted on errors without the need to raise an
   `error` and `catch` it elsewhere:
-  * `decodeAsciiPrefix`
+  * `decodeASCIIPrefix`
   * `TextDataStack`
   * `dataStack`
   * `stackLen`
diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index 69a10658..ee5d1c69 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -40,7 +40,7 @@ module Data.Text.Encoding
     -- ** Total Functions #total#
     -- $total
     , decodeLatin1
-    , decodeAsciiPrefix
+    , decodeASCIIPrefix
     , TextDataStack
     , dataStack
     , stackLen
@@ -97,6 +97,7 @@ import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
 import Data.Bifunctor (Bifunctor(first))
 import Data.Bits (shiftL, shiftR, (.|.), (.&.))
 import Data.ByteString (ByteString)
+import qualified Data.ByteString.Short.Internal as SBS
 import Data.Maybe (fromJust)
 import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode)
 import Data.Text.Internal (Text(..), empty)
@@ -157,47 +158,51 @@ import qualified Data.ByteString.Unsafe as B
 -- (preferably not at all). See "Data.Text.Encoding#g:total" for better
 -- solutions.
 
--- | Decode a 'ByteString' containing 7-bit ASCII encoded text.
+-- | Decode a 'ByteString' containing ASCII text.
+--
+-- This is a total function which returns a pair of the longest ASCII prefix
+-- as 'Text', and the remaining suffix as 'ByteString'.
 --
--- This is a total function. The 'ByteString' is decoded until either
--- the end is reached or it errors with the first non-ASCII 'Word8' is
--- encountered. In either case the function will return the 'Text'
--- value of the longest prefix that is valid ASCII. On error, the index
--- of the non-ASCII 'Word8' is also returned.
+-- Important note: the pair is lazy. This lets you check for errors by testing
+-- whether the second component is empty, without forcing the first component
+-- (which does a copy).
+-- To drop references to the input bytestring, force the prefix
+-- (using 'seq' or @BangPatterns@) and drop references to the suffix.
+--
+-- Properties:
+-- - If @(prefix, suffix) = decodeAsciiPrefix s@, then @'encodeUtf8' prefix <> suffix = s@.
+-- - Either @suffix@ is empty, or @'B.head' suffix > 127@.
 --
 -- @since 2.0.2
-decodeAsciiPrefix ::
-#if defined(ASSERTS)
-  HasCallStack =>
-#endif
-  ByteString -> (Text, Maybe (Word8, Int))
-decodeAsciiPrefix bs = if B.null bs
-  then (empty, Nothing)
-  else unsafeDupablePerformIO $ withBS bs $ \ fp len ->
-    unsafeWithForeignPtr fp $ \src -> do
-      asciiPrefixLen <- fmap fromIntegral . c_is_ascii src $ src `plusPtr` len
-      let !prefix = if asciiPrefixLen == 0
-            then empty
-            else runST $ do
-              dst <- A.new asciiPrefixLen
-              A.copyFromPointer dst 0 src asciiPrefixLen
-              arr <- A.unsafeFreeze dst
-              pure $ Text arr 0 asciiPrefixLen
-      let suffix = if asciiPrefixLen < len
-            then Just (B.index bs asciiPrefixLen, asciiPrefixLen)
-            else Nothing
-      pure (prefix, suffix)
+decodeASCIIPrefix :: ByteString -> (Text, ByteString)
+decodeASCIIPrefix bs = if B.null bs
+  then (empty, B.empty)
+  else
+    let len = asciiPrefixLength bs
+        prefix =
+          let !(SBS.SBS arr) = SBS.toShort (B.take len bs) in
+          Text (A.ByteArray arr) 0 len
+        suffix = B.drop len bs in
+    (prefix, suffix)
+
+-- | Length of the longest ASCII prefix.
+asciiPrefixLength :: ByteString -> Int
+asciiPrefixLength bs = unsafeDupablePerformIO $ withBS bs $ \ fp len ->
+  unsafeWithForeignPtr fp $ \src -> do
+    fromIntegral <$> c_is_ascii src (src `plusPtr` len)
 
 -- | Decode a 'ByteString' containing 7-bit ASCII encoded text.
 --
 -- This is a partial function: it checks that input does not contain
 -- anything except ASCII and copies buffer or throws an error otherwise.
---
 decodeASCII :: ByteString -> Text
 decodeASCII bs =
-  case decodeAsciiPrefix bs of
-    (_, Just (word, errPos)) -> error $ "decodeASCII: detected non-ASCII codepoint " ++ show word ++ " at position " ++ show errPos
-    (t, Nothing) -> t
+  let (prefix, suffix) = decodeASCIIPrefix bs in
+  case B.uncons suffix of
+    Nothing -> prefix
+    Just (word, _) ->
+      let !errPos = B.length bs - B.length suffix in
+      error $ "decodeASCII: detected non-ASCII codepoint " ++ show word ++ " at position " ++ show errPos
 
 -- | Decode a 'ByteString' containing Latin-1 (aka ISO-8859-1) encoded text.
 --

From fe5b4e69b93878ea8930bdf5c0b50dc9cfab1540 Mon Sep 17 00:00:00 2001
From: Li-yao Xia <lysxia@gmail.com>
Date: Tue, 31 Jan 2023 03:57:00 +0000
Subject: [PATCH 62/87] Refactor decodeUtf8 (WIP)

---
 src/Data/Text/Encoding.hs             | 352 +---------------------
 src/Data/Text/Internal/Encoding.hs    | 402 ++++++++++++++++++++++++++
 src/Data/Text/Lazy/Encoding.hs        |   4 +-
 tests/Tests/Properties/Transcoding.hs |  32 +-
 text.cabal                            |   1 +
 5 files changed, 429 insertions(+), 362 deletions(-)
 create mode 100644 src/Data/Text/Internal/Encoding.hs

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index ee5d1c69..e8f2941e 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -25,14 +25,10 @@ module Data.Text.Encoding
     (
     -- * ByteString validation
     -- $validation
-      Utf8ValidState
-    , partialUtf8CodePoint
-    , partUtf8CPLen
-    , wordAtPartUft8CP
-    , utf8CodePointState
+      Utf8State
     , validateUtf8Chunk
     , validateNextUtf8Chunk
-    , startUtf8ValidState
+    , startUtf8State
 
     -- * Decoding ByteStrings to Text
     -- $strict
@@ -92,26 +88,22 @@ module Data.Text.Encoding
     ) where
 
 import Control.Exception (evaluate, try)
-import Control.Monad.ST (ST, runST)
+import Control.Monad.ST (runST)
 import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
-import Data.Bifunctor (Bifunctor(first))
-import Data.Bits (shiftL, shiftR, (.|.), (.&.))
+import Data.Bits (shiftR, (.&.))
 import Data.ByteString (ByteString)
 import qualified Data.ByteString.Short.Internal as SBS
-import Data.Maybe (fromJust)
 import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode)
 import Data.Text.Internal (Text(..), empty)
 import Data.Text.Internal.Unsafe (unsafeWithForeignPtr)
-import Data.Text.Show as T (singleton)
 import Data.Text.Unsafe (unsafeDupablePerformIO)
-import Data.Word (Word32, Word8)
+import Data.Word (Word8)
 import Foreign.C.Types (CSize(..))
 import Foreign.Ptr (Ptr, minusPtr, plusPtr)
 import Foreign.Storable (poke, peekByteOff)
 import GHC.Exts (byteArrayContents#, unsafeCoerce#)
 import GHC.ForeignPtr (ForeignPtr(..), ForeignPtrContents(PlainPtr))
 import Data.Text.Internal.ByteStringCompat (withBS)
-import Data.Text.Internal.Encoding.Utf8 (Utf8CodePointState, utf8StartState, updateUtf8State, isUtf8StateIsComplete)
 import qualified Data.ByteString as B
 import qualified Data.ByteString.Internal as B
 import qualified Data.ByteString.Builder as B
@@ -119,6 +111,7 @@ import qualified Data.ByteString.Builder.Internal as B hiding (empty, append)
 import qualified Data.ByteString.Builder.Prim as BP
 import qualified Data.ByteString.Builder.Prim.Internal as BP
 import qualified Data.Text.Array as A
+import Data.Text.Internal.Encoding
 import qualified Data.Text.Internal.Encoding.Fusion as E
 import qualified Data.Text.Internal.Fusion as F
 #if defined(ASSERTS)
@@ -126,7 +119,6 @@ import GHC.Stack (HasCallStack)
 #endif
 
 #ifdef SIMDUTF
-import Foreign.C.Types (CInt(..))
 #elif !MIN_VERSION_bytestring(0,11,2)
 import qualified Data.ByteString.Unsafe as B
 #endif
@@ -240,332 +232,6 @@ decodeLatin1 bs = withBS bs $ \fp len -> runST $ do
 foreign import ccall unsafe "_hs_text_is_ascii" c_is_ascii
     :: Ptr Word8 -> Ptr Word8 -> IO CSize
 
--- | This data type represents the state of a 'ByteString' representing
--- UTF-8-encoded text. It consists of a value representing whether or
--- not the last byte is a complete code point, and on incompletion what
--- the 1 to 3 end bytes are that make up the incomplete code point.
-data Utf8ValidState = Utf8ValidState
-  { -- | Get the incomplete UTF-8 code point of the 'ByteString's that
-    -- have been validated thus far. The first byte of the 'Word32'
-    -- indicates the number of bytes of the code point are available,
-    -- and is followed by the bytes of the code point.
-    partialUtf8CodePoint :: Word32
-    -- | Get the current UTF-8 code point state of the 'ByteString's
-    -- that have been validated thus far.
-  , utf8CodePointState :: Utf8CodePointState
-  }
-  deriving (Eq, Ord, Show)
-
-partUtf8CPLen :: Word32 -> Int
-partUtf8CPLen partCP = fromIntegral $ partCP `shiftR` 24
-
-wordAtPartUft8CP :: Int -> Word32 -> Maybe Word8
-wordAtPartUft8CP n partCP
-  | n < partUtf8CPLen partCP && n >= 0 =
-    Just . fromIntegral $ partCP `shiftR` (16 - 8 * n)
-  | otherwise = Nothing
-
--- | This represtents the starting state of a UTF-8 validation check.
-startUtf8ValidState :: Utf8ValidState 
-startUtf8ValidState = Utf8ValidState 0 utf8StartState
-
-appendPartialCodePoint :: Word32 -> ByteString -> Word32
-appendPartialCodePoint cp bs@(B.length -> len) =
-  fst $ B.foldl (\ (cp', bsc) word ->
-      (cp' .|. (fromIntegral word `shiftL` (16 - bsc * 8)), bsc + 1)
-    ) (cp + (fromIntegral len `shiftL` 24), partUtf8CPLen cp) bs
-
-#ifdef SIMDUTF
-foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8
-    :: Ptr Word8 -> CSize -> IO CInt
-#endif
-
--- | Validate a 'ByteString' as a UTF-8-encoded text.
---
--- @validateUtf8Chunk chunk = (n, es)@
---
--- This function returns two values:
---
--- * The value 'n' indicates the longest prefix of the 'ByteString'
---   that is valid UTF-8-encoded data.
--- * The value 'es' indicates whether the 'ByteString'
---
---     * (@Left p@) contains an invalid code point and where the next
---       (potentially valid) code point begins, so that @p - n@ is the
---       number of invalid bytes, or
---     * (@Right s@) is valid, and all of the remaining bytes starting
---       at inbex 'n' are the beginning of an incomplete UTF-8 code
---       point, and 's' is the resulting 'Utf8ValidState' value, which
---       can be used to validate against a following 'ByteString' with
---       'validateNextUtf8Chunk'.
-validateUtf8Chunk ::
-#if defined(ASSERTS)
-  HasCallStack =>
-#endif
-  ByteString -> (Int, Either Int Utf8ValidState)
-validateUtf8Chunk bs@(B.length -> len)
-#if defined(SIMDUTF) || MIN_VERSION_bytestring(0,11,2)
-  | guessUtf8Boundary > 0 &&
-    -- the rest of the bytestring valid utf-8 up to the boundary
-    (
-#ifdef SIMDUTF
-      withBS bs $ \ fp _ -> unsafeDupablePerformIO $
-        unsafeWithForeignPtr fp $ \ptr -> (/= 0) <$>
-          c_is_valid_utf8 ptr (fromIntegral guessUtf8Boundary)
-#else
-      B.isValidUtf8 $ B.take guessUtf8Boundary bs
-#endif
-    ) = getEndState guessUtf8Boundary
-    -- No
-  | otherwise = getEndState 0
-    where
-      getEndState ndx = validateUtf8 ndx ndx utf8StartState
-      w n word8 = len >= n && word8 <= (B.index bs $ len - n)
-      guessUtf8Boundary
-        | w 3 0xf0 = len - 3  -- third to last char starts a four-byte code point
-        | w 2 0xe0 = len - 2  -- pre-last char starts a three-or-four-byte code point
-        | w 1 0xc2 = len - 1  -- last char starts a two-(or more-)byte code point
-        | otherwise = len
-#else
-  = validateUtf8 0 0 utf8StartState
-    where
-#endif
-      validateUtf8 !ndx0 ndx s
-        | ndx < len =
-          let ndx' = ndx + 1 in
-          case updateUtf8State (B.index bs ndx) s of
-            Just s' ->
-              validateUtf8 (
-                if isUtf8StateIsComplete s'
-                then ndx'
-                else ndx0
-              ) ndx' s'
-            Nothing -> (ndx0, Left $ if ndx == ndx0 then ndx' else ndx)
-        | otherwise = (ndx0, Right $ Utf8ValidState (appendPartialCodePoint 0 $ B.drop ndx0 bs) s)
-
--- | Validate a 'ByteString' as a contiuation of UTF-8-encoded text.
---
--- @validateNextUtf8Chunk chunk s = (n, es)@
---
--- This function returns two values:
---
--- * The value 'n' indicates the end position of longest prefix of the
---   'ByteString' that is valid UTF-8-encoded data from the starting
---   state 's'. If 's' contains an incomplete code point, the input
---   'ByteString' is considered a continuation. As a result 'n' will be
---   negative if the code point is still incomplete or is proven to be
---   invalid.
---   
--- * The value 'es' indicates whether the 'ByteString'
---
---     * (@Left p@) contains an invalid code point and where the next
---       (potentially valid) code point begins, so that @p - n@ is the
---       number of invalid bytes, or
---     * (@Right s'@) is valid, and all of the remaining bytes starting
---       at index 'n' are the beginning of an incomplete UTF-8 code
---       point, and `s'` is the resulting 'Utf8ValidState' value, which
---       can be used to validate against a following 'ByteString'.
-validateNextUtf8Chunk ::
-#if defined(ASSERTS)
-  HasCallStack =>
-#endif
-  ByteString -> Utf8ValidState -> (Int, Either Int Utf8ValidState)
-validateNextUtf8Chunk bs@(B.length -> len) st@(Utf8ValidState lead s)
-  | len > 0 =
-    let g pos s'
-          -- first things first. let's try to get to the start of the next code point
-          | isUtf8StateIsComplete s' =
-            -- found the beginning of the next code point, hand this off to someone else
-            case validateUtf8Chunk $ B.drop pos bs of
-              (len', mS) -> (pos + len', first (+ pos) mS)
-          -- code point is not complete yet
-          -- walk the rest of the code point until error, complete, or no more data
-          | pos < len =
-            case updateUtf8State (B.index bs pos) s' of
-              -- error
-              Nothing -> (leadPos, Left pos)
-              -- keep going
-              Just s'' -> g (pos + 1) s''
-          -- no more data
-          | otherwise = (leadPos, Right $ Utf8ValidState (appendPartialCodePoint lead bs) s')
-    in g 0 s
-  | otherwise = (leadPos, Right st)
-    where leadPos = -(partUtf8CPLen lead)
-
--- | Validated UTF-8 data to be converted into a 'Text' value.
-data TextDataStack = TextDataStack
-  { -- | Returns a list of 'Text' and UTF-8-valid 'ByteString' values.
-    dataStack :: [Either Text (Either Word32 ByteString)]
-    -- | Returns total number of UTF-8 valid bytes in the stack.
-  , stackLen :: Int
-  }
-  deriving Show
-
--- | Empty stack
-emptyStack :: TextDataStack
-emptyStack = TextDataStack [] 0
-
--- | Push a text value onto the stack
-pushText :: Text -> TextDataStack -> TextDataStack
-pushText t@(Text _ _ tLen) tds@(TextDataStack stack sLen) =
-  if tLen > 0
-  then TextDataStack (Left t : stack) $ sLen + tLen
-  else tds
-
-copyFromStack :: [Either Text (Either Word32 ByteString)] -> Int -> A.MArray s -> ST s ()
-copyFromStack (dat : dataStack') tLen dst =
-  (case dat of
-    Left (Text arr0 off utf8Len) -> do
-      let dstOff = tLen - utf8Len
-      A.copyI utf8Len dst dstOff arr0 off
-      pure dstOff
-    Right encoded ->
-      case encoded of
-        Left partial ->
-          let utf8Len = partUtf8CPLen partial
-              dstOff = tLen - utf8Len
-              g dstOff' =
-                case wordAtPartUft8CP (dstOff' - dstOff) partial of
-                  Just w -> do
-                    A.unsafeWrite dst dstOff' w
-                    g $ dstOff' + 1
-                  Nothing -> pure dstOff
-          in
-          g dstOff
-        Right bs@(B.length -> utf8Len) -> do
-          let dstOff = tLen - utf8Len
-          withBS bs $ \ fp _ ->
-            unsafeIOToST . unsafeWithForeignPtr fp $ \ src ->
-              unsafeSTToIO $ A.copyFromPointer dst dstOff src utf8Len
-          pure dstOff) >>= (\ tLen' -> copyFromStack dataStack' tLen' dst)
-copyFromStack _ _ _ = pure ()
-{-# INLINE copyFromStack #-}
-
--- | Create a 'Text' value from the contents of a 'TextDataStack'.
-stackToText ::
-#if defined(ASSERTS)
-  HasCallStack =>
-#endif
-  TextDataStack -> Text
-stackToText (TextDataStack stack sLen)
-  | sLen > 0 = runST $
-    do
-      dst <- A.new sLen
-      copyFromStack stack sLen dst
-      arr <- A.unsafeFreeze dst
-      pure $ Text arr 0 sLen
-  | otherwise = empty
-
--- | Decode a 'ByteString' in the context of what has been already been decoded.
---
--- The 'ByteString' is validated against the 'Utf8ValidState' using the rules
--- governing 'validateNextUtf8Chunk'. The longest valid UTF-8 prefix is added
--- to the input 'TextDataStack' which is returned with the end position of the
--- valid prefix, and either the resulting 'Utf8ValidState'
--- (@Right Utf8ValidState@) or the position of the of the first (potentially)
--- valid byte after the invalid bytes with remainder of the 'ByteString'
--- (@Left (Int, ByteString)@).
-decodeNextUtf8Chunk ::
-#if defined(ASSERTS)
-  HasCallStack =>
-#endif
-  ByteString
-  -> Utf8ValidState
-  -> TextDataStack
-  -> ((Int, Either (Int, ByteString) Utf8ValidState), TextDataStack)
-decodeNextUtf8Chunk bs s tds@(TextDataStack stack sLen) =
-  case validateNextUtf8Chunk bs s of
-    (len, res) ->
-      let stackedData'
-            | len >= 0 =
-              let partCP = partialUtf8CodePoint s
-                  partLen = partUtf8CPLen partCP
-                  stackedData@(TextDataStack stack' sLen') =
-                    if partLen > 0
-                    then TextDataStack (Right (Left partCP) : stack) $ sLen + partLen
-                    else tds
-              in
-              if len > 0
-              then TextDataStack (Right (Right $ B.take len bs) : stack') $ sLen' + len
-              else stackedData
-            | otherwise = tds
-      in
-      ( ( len
-        , case res of
-            Left pos -> Left (pos, B.drop pos bs)
-            Right s' -> Right s'
-        )
-      , stackedData'
-      )
-
--- | Decode a 'ByteString' against a start 'Utf8ValidState' with an empty
--- 'TextDataStack'.
---
--- @decodeUtf8Chunk bs = 'decodeNextUtf8Chunk' bs 'startUtf8ValidState' 'emptyStack'@
-decodeUtf8Chunk :: ByteString -> ((Int, Either (Int, ByteString) Utf8ValidState), TextDataStack)
-decodeUtf8Chunk bs = decodeNextUtf8Chunk bs startUtf8ValidState emptyStack
-
--- | Call an error handler with the give 'String' message for each byte
--- in given 'ByteString' and lead data in the given 'Utf8ValidState'
--- value. The bytes are the positions from 'errStart' (inclusive) to
--- 'errEnd' (exclusive). Any substitute characters are pushed onto the
--- supplied 'TextDataStack' argument.
-handleUtf8Err
-  :: OnDecodeError
-  -> String
-  -> Int
-  -> Int
-  -> Utf8ValidState
-  -> ByteString
-  -> TextDataStack
-  -> TextDataStack
-handleUtf8Err onErr errMsg errStart errEnd s bs tds =
-  let h errPos tds'
-        | errPos < errEnd =
-          h (errPos + 1) $
-            case onErr errMsg . Just $ B.index bs errPos of
-              Just c -> pushText (T.singleton c) tds'
-              Nothing -> tds'
-        | otherwise = tds'
-  in
-  ( if errStart < 0
-    then
-      let partCP = partialUtf8CodePoint s
-          g pNdx tds' =
-            case wordAtPartUft8CP pNdx partCP of
-              Nothing -> h (partUtf8CPLen partCP + errStart) tds'
-              mW -> g (pNdx + 1) $
-                case onErr errMsg mW of
-                  Just c -> pushText (T.singleton c) tds'
-                  Nothing -> tds'
-      in
-      g 0
-    else h errStart
-  ) tds
-
-invalidUtf8Msg :: String
-invalidUtf8Msg = "Data.Text.Internal.Encoding: Invalid UTF-8 stream"
-
--- | Decode a 'ByteString' containing UTF-8 encoded text.
---
--- Surrogate code points in replacement character returned by 'OnDecodeError'
--- will be automatically remapped to the replacement char @U+FFFD@.
-decodeUtf8With ::
-#if defined(ASSERTS)
-  HasCallStack =>
-#endif
-  OnDecodeError -> ByteString -> Text
-decodeUtf8With onErr bs =
-  let g bs'@(B.length -> bLen) res =
-        case res of
-          ((len, eS), tds) ->
-            let h msg pos s = handleUtf8Err onErr msg len pos s bs' tds in
-            case eS of
-              Left (pos, bs'') -> g bs'' . decodeNextUtf8Chunk bs'' startUtf8ValidState $ h invalidUtf8Msg pos startUtf8ValidState
-              Right s -> stackToText $ h "Data.Text.Internal.Encoding: Incomplete UTF-8 code point" bLen s
-  in
-  g bs $ decodeUtf8Chunk bs
-
 -- $stream
 --
 -- The 'streamDecodeUtf8' and 'streamDecodeUtf8With' functions accept
@@ -659,18 +325,18 @@ streamDecodeUtf8With ::
 streamDecodeUtf8With onErr bs =
   let h w32 n ws =
         if n >= 0
-        then h w32 (n - 1) $ fromJust (wordAtPartUft8CP n w32) : ws
+        then h w32 (n - 1) $ partUtf8CPUnsafeIndex n w32 : ws
         else ws
       g bs' s tds =
         case decodeNextUtf8Chunk bs' s tds of
           ((len, eS), tds') ->
             case eS of
-              Left (pos, bs'') -> g bs'' startUtf8ValidState $ handleUtf8Err onErr invalidUtf8Msg len pos s bs' tds'
+              Left (pos, bs'') -> g bs'' startUtf8State $ handleUtf8Err onErr invalidUtf8Msg len pos s bs' tds'
               Right s' -> let bss' = partialUtf8CodePoint s' in
                 Some (stackToText tds') (B.pack $ h bss' (partUtf8CPLen bss' - 1) []) $ \ bs'' ->
                   g bs'' s' emptyStack
   in
-  g bs startUtf8ValidState emptyStack
+  g bs startUtf8State emptyStack
 
 -- | Decode a 'ByteString' containing UTF-8 encoded text that is known
 -- to be valid.
diff --git a/src/Data/Text/Internal/Encoding.hs b/src/Data/Text/Internal/Encoding.hs
new file mode 100644
index 00000000..a16ac3bd
--- /dev/null
+++ b/src/Data/Text/Internal/Encoding.hs
@@ -0,0 +1,402 @@
+{-# LANGUAGE BangPatterns, CPP, GeneralizedNewtypeDeriving, MagicHash,
+    UnliftedFFITypes #-}
+{-# LANGUAGE Trustworthy #-}
+{-# LANGUAGE TypeApplications #-}
+{-# LANGUAGE ScopedTypeVariables #-}
+{-# LANGUAGE ViewPatterns #-}
+-- |
+module Data.Text.Internal.Encoding where
+
+import Control.Exception (evaluate, try)
+import Control.Monad.ST (ST, runST)
+import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
+import Data.Bifunctor (Bifunctor(first))
+import Data.Bits (shiftL, shiftR, (.&.))
+import Data.ByteString (ByteString)
+import qualified Data.ByteString.Short.Internal as SBS
+import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode)
+import Data.Text.Internal (Text(..), empty)
+import Data.Text.Internal.Unsafe (unsafeWithForeignPtr)
+import Data.Text.Show as T (singleton)
+import Data.Text.Unsafe (unsafeDupablePerformIO)
+import Data.Word (Word32, Word8)
+import Foreign.C.Types (CSize(..))
+import Foreign.Ptr (Ptr, minusPtr, plusPtr)
+import Foreign.Storable (poke, peekByteOff)
+import GHC.Exts (byteArrayContents#, unsafeCoerce#)
+import GHC.ForeignPtr (ForeignPtr(..), ForeignPtrContents(PlainPtr))
+import Data.Text.Internal.ByteStringCompat (withBS)
+import Data.Text.Internal.Encoding.Utf8
+  (Utf8CodePointState, utf8StartState, updateUtf8State, isUtf8StateIsComplete)
+import qualified Data.ByteString as B
+import qualified Data.ByteString.Internal as B
+import qualified Data.ByteString.Builder as B
+import qualified Data.ByteString.Builder.Internal as B hiding (empty, append)
+import qualified Data.ByteString.Builder.Prim as BP
+import qualified Data.ByteString.Builder.Prim.Internal as BP
+import qualified Data.Text.Array as A
+import qualified Data.Text.Internal.Encoding.Fusion as E
+import qualified Data.Text.Internal.Fusion as F
+#if defined(ASSERTS)
+import GHC.Stack (HasCallStack)
+#endif
+
+#ifdef SIMDUTF
+import Foreign.C.Types (CInt(..))
+#elif !MIN_VERSION_bytestring(0,11,2)
+import qualified Data.ByteString.Unsafe as B
+#endif
+
+-- | State of decoding a 'ByteString' in UTF-8.
+-- It consists of a value representing whether or
+-- not the last byte is a complete code point, and on incompletion what
+-- the 1 to 3 end bytes are that make up the incomplete code point.
+data Utf8State = Utf8State
+  { -- | Current UTF-8 code point state of the 'ByteString's
+    -- that have been validated thus far.
+    utf8CodePointState :: Utf8CodePointState
+    -- | Get the incomplete UTF-8 code point of the 'ByteString's that
+    -- have been validated thus far. The first byte of the 'Word32'
+    -- indicates the number of bytes of the code point are available,
+    -- and is followed by the bytes of the code point.
+  , partialUtf8CodePoint :: PartialUtf8CodePoint
+  }
+  deriving (Eq, Ord, Show)
+
+-- | This represtents the starting state of a UTF-8 validation check.
+startUtf8State :: Utf8State 
+startUtf8State = Utf8State utf8StartState partUtf8CPEmpty
+
+-- | Prefix of a valid UTF-8 encoded code point.
+-- This consists of a length (in bytes) between 1 and 3 stored in the most
+-- significant byte, and the actual bytes in the rest of the word.
+newtype PartialUtf8CodePoint = PartialUtf8CodePoint Word32
+  deriving (Eq, Ord, Show)
+
+-- | Empty prefix.
+partUtf8CPEmpty :: PartialUtf8CodePoint
+partUtf8CPEmpty = PartialUtf8CodePoint 0
+
+-- | Length of the partial code point, stored in the most significant byte.
+partUtf8CPLen :: PartialUtf8CodePoint -> Int
+partUtf8CPLen (PartialUtf8CodePoint w) = fromIntegral $ w `shiftR` 24
+
+-- | Get the @n@-th byte, assuming it is within bounds: @0 <= n < partUtf8CPLen c@.
+partUtf8CPUnsafeIndex ::
+#if defined(ASSERTS)
+  HasCallStack =>
+#endif
+  Int -> PartialUtf8CodePoint -> Word8
+partUtf8CPUnsafeIndex n (PartialUtf8CodePoint w) = 
+#if defined(ASSERTS)
+  assert (0 <= n && n < partUtf8CPLen w) $
+#endif
+  fromIntegral $ w `shiftR` (16 - 8 * n)
+
+partUtf8CPUnsafeAppend ::
+#if defined(ASSERTS)
+  HasCallStack =>
+#endif
+  PartialUtf8CodePoint -> ByteString -> PartialUtf8CodePoint
+partUtf8CPUnsafeAppend c@(PartialUtf8CodePoint word) bs =
+#if defined(ASSERTS)
+  assert (lenc + lenbs <= 3) $
+#endif
+  PartialUtf8CodePoint $
+    tryPush 0 $ tryPush 1 $ tryPush 2 $ word + (fromIntegral lenbs `shiftL` 24)
+  where
+    lenc = partUtf8CPLen c
+    lenbs = B.length bs
+    tryPush i w =
+      if i < lenbs
+      then w + (fromIntegral (B.index bs i) `shiftL` fromIntegral (16 - 8 * (lenc + i)))
+      else w
+
+#ifdef SIMDUTF
+foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8
+    :: Ptr Word8 -> CSize -> IO CInt
+#endif
+
+-- | Validate a 'ByteString' as a UTF-8-encoded text.
+--
+-- @validateUtf8Chunk chunk = (n, es)@
+--
+-- This function returns two values:
+--
+-- * The value 'n' indicates the longest prefix of the 'ByteString'
+--   that is valid UTF-8-encoded data.
+-- * The value 'es' indicates whether the 'ByteString'
+--
+--     * (@Left p@) contains an invalid code point and where the next
+--       (potentially valid) code point begins, so that @p - n@ is the
+--       number of invalid bytes, or
+--     * (@Right s@) is valid, and all of the remaining bytes starting
+--       at inbex 'n' are the beginning of an incomplete UTF-8 code
+--       point, and 's' is the resulting 'Utf8State' value, which
+--       can be used to validate against a following 'ByteString' with
+--       'validateNextUtf8Chunk'.
+validateUtf8Chunk ::
+#if defined(ASSERTS)
+  HasCallStack =>
+#endif
+  ByteString -> (Int, Either Int Utf8State)
+validateUtf8Chunk bs@(B.length -> len)
+#if defined(SIMDUTF) || MIN_VERSION_bytestring(0,11,2)
+  | guessUtf8Boundary > 0 &&
+    -- the rest of the bytestring valid utf-8 up to the boundary
+    (
+#ifdef SIMDUTF
+      withBS bs $ \ fp _ -> unsafeDupablePerformIO $
+        unsafeWithForeignPtr fp $ \ptr -> (/= 0) <$>
+          c_is_valid_utf8 ptr (fromIntegral guessUtf8Boundary)
+#else
+      B.isValidUtf8 $ B.take guessUtf8Boundary bs
+#endif
+    ) = getEndState guessUtf8Boundary
+    -- No
+  | otherwise = getEndState 0
+    where
+      getEndState ndx = validateUtf8 ndx ndx utf8StartState
+      w n word8 = len >= n && word8 <= (B.index bs $ len - n)
+      guessUtf8Boundary
+        | w 3 0xf0 = len - 3  -- third to last char starts a four-byte code point
+        | w 2 0xe0 = len - 2  -- pre-last char starts a three-or-four-byte code point
+        | w 1 0xc2 = len - 1  -- last char starts a two-(or more-)byte code point
+        | otherwise = len
+#else
+  = validateUtf8 0 0 utf8StartState
+    where
+#endif
+      validateUtf8 !ndx0 ndx s
+        | ndx < len =
+          let ndx' = ndx + 1 in
+          case updateUtf8State (B.index bs ndx) s of
+            Just s' ->
+              validateUtf8 (
+                if isUtf8StateIsComplete s'
+                then ndx'
+                else ndx0
+              ) ndx' s'
+            Nothing -> (ndx0, Left $ if ndx == ndx0 then ndx' else ndx)
+        | otherwise = (ndx0, Right $ Utf8State s (partUtf8CPUnsafeAppend partUtf8CPEmpty $ B.drop ndx0 bs))
+
+-- | Validate a 'ByteString' as a contiuation of UTF-8-encoded text.
+--
+-- @validateNextUtf8Chunk chunk s = (n, es)@
+--
+-- This function returns two values:
+--
+-- * The value 'n' indicates the end position of longest prefix of the
+--   'ByteString' that is valid UTF-8-encoded data from the starting
+--   state 's'. If 's' contains an incomplete code point, the input
+--   'ByteString' is considered a continuation. As a result 'n' will be
+--   negative if the code point is still incomplete or is proven to be
+--   invalid.
+--   
+-- * The value 'es' indicates whether the 'ByteString'
+--
+--     * (@Left p@) contains an invalid code point and where the next
+--       (potentially valid) code point begins, so that @p - n@ is the
+--       number of invalid bytes, or
+--     * (@Right s'@) is valid, and all of the remaining bytes starting
+--       at index 'n' are the beginning of an incomplete UTF-8 code
+--       point, and `s'` is the resulting 'Utf8State' value, which
+--       can be used to validate against a following 'ByteString'.
+validateNextUtf8Chunk ::
+#if defined(ASSERTS)
+  HasCallStack =>
+#endif
+  ByteString -> Utf8State -> (Int, Either Int Utf8State)
+validateNextUtf8Chunk bs@(B.length -> len) st@(Utf8State s lead)
+  | len > 0 =
+    let g pos s'
+          -- first things first. let's try to get to the start of the next code point
+          | isUtf8StateIsComplete s' =
+            -- found the beginning of the next code point, hand this off to someone else
+            case validateUtf8Chunk $ B.drop pos bs of
+              (len', mS) -> (pos + len', first (+ pos) mS)
+          -- code point is not complete yet
+          -- walk the rest of the code point until error, complete, or no more data
+          | pos < len =
+            case updateUtf8State (B.index bs pos) s' of
+              -- error
+              Nothing -> (leadPos, Left pos)
+              -- keep going
+              Just s'' -> g (pos + 1) s''
+          -- no more data
+          | otherwise = (leadPos, Right $ Utf8State s' (partUtf8CPUnsafeAppend lead bs))
+    in g 0 s
+  | otherwise = (leadPos, Right st)
+    where leadPos = -(partUtf8CPLen lead)
+
+-- | Validated UTF-8 data to be converted into a 'Text' value.
+data TextDataStack = TextDataStack
+  { -- | Returns a list of 'Text' and UTF-8-valid 'ByteString' values.
+    dataStack :: [Either Text (Either PartialUtf8CodePoint ByteString)]
+    -- | Returns total number of UTF-8 valid bytes in the stack.
+  , stackLen :: Int
+  }
+  deriving Show
+
+-- | Empty stack
+emptyStack :: TextDataStack
+emptyStack = TextDataStack [] 0
+
+-- | Push a text value onto the stack
+pushText :: Text -> TextDataStack -> TextDataStack
+pushText t@(Text _ _ tLen) tds@(TextDataStack stack sLen) =
+  if tLen > 0
+  then TextDataStack (Left t : stack) $ sLen + tLen
+  else tds
+
+copyFromStack :: [Either Text (Either PartialUtf8CodePoint ByteString)] -> Int -> A.MArray s -> ST s ()
+copyFromStack (dat : dataStack') tLen dst =
+  (case dat of
+    Left (Text arr0 off utf8Len) -> do
+      let dstOff = tLen - utf8Len
+      A.copyI utf8Len dst dstOff arr0 off
+      pure dstOff
+    Right encoded ->
+      case encoded of
+        Left partial -> do
+          let utf8Len = partUtf8CPLen partial
+              dstOff = tLen - utf8Len
+              g i | i < utf8Len = do
+                    A.unsafeWrite dst (dstOff + i) (partUtf8CPUnsafeIndex i partial)
+                    g (i + 1)
+                  | otherwise = pure ()
+          g dstOff
+          pure utf8Len
+        Right bs@(B.length -> utf8Len) -> do
+          let dstOff = tLen - utf8Len
+          withBS bs $ \ fp _ ->
+            unsafeIOToST . unsafeWithForeignPtr fp $ \ src ->
+              unsafeSTToIO $ A.copyFromPointer dst dstOff src utf8Len
+          pure dstOff) >>= (\ tLen' -> copyFromStack dataStack' tLen' dst)
+copyFromStack _ _ _ = pure ()
+{-# INLINE copyFromStack #-}
+
+-- | Create a 'Text' value from the contents of a 'TextDataStack'.
+stackToText ::
+#if defined(ASSERTS)
+  HasCallStack =>
+#endif
+  TextDataStack -> Text
+stackToText (TextDataStack stack sLen)
+  | sLen > 0 = runST $
+    do
+      dst <- A.new sLen
+      copyFromStack stack sLen dst
+      arr <- A.unsafeFreeze dst
+      pure $ Text arr 0 sLen
+  | otherwise = empty
+
+-- | Decode a 'ByteString' in the context of what has been already been decoded.
+--
+-- The 'ByteString' is validated against the 'Utf8State' using the rules
+-- governing 'validateNextUtf8Chunk'. The longest valid UTF-8 prefix is added
+-- to the input 'TextDataStack' which is returned with the end position of the
+-- valid prefix, and either the resulting 'Utf8State'
+-- (@Right Utf8State@) or the position of the of the first (potentially)
+-- valid byte after the invalid bytes with remainder of the 'ByteString'
+-- (@Left (Int, ByteString)@).
+decodeNextUtf8Chunk ::
+#if defined(ASSERTS)
+  HasCallStack =>
+#endif
+  ByteString
+  -> Utf8State
+  -> TextDataStack
+  -> ((Int, Either (Int, ByteString) Utf8State), TextDataStack)
+decodeNextUtf8Chunk bs s tds@(TextDataStack stack sLen) =
+  case validateNextUtf8Chunk bs s of
+    (len, res) ->
+      let stackedData'
+            | len >= 0 =
+              let partCP = partialUtf8CodePoint s
+                  partLen = partUtf8CPLen partCP
+                  stackedData@(TextDataStack stack' sLen') =
+                    if partLen > 0
+                    then TextDataStack (Right (Left partCP) : stack) $ sLen + partLen
+                    else tds
+              in
+              if len > 0
+              then TextDataStack (Right (Right $ B.take len bs) : stack') $ sLen' + len
+              else stackedData
+            | otherwise = tds
+      in
+      ( ( len
+        , case res of
+            Left pos -> Left (pos, B.drop pos bs)
+            Right s' -> Right s'
+        )
+      , stackedData'
+      )
+
+-- | Decode a 'ByteString' against a start 'Utf8State' with an empty
+-- 'TextDataStack'.
+--
+-- @decodeUtf8Chunk bs = 'decodeNextUtf8Chunk' bs 'startUtf8State' 'emptyStack'@
+decodeUtf8Chunk :: ByteString -> ((Int, Either (Int, ByteString) Utf8State), TextDataStack)
+decodeUtf8Chunk bs = decodeNextUtf8Chunk bs startUtf8State emptyStack
+
+-- | Call an error handler with the give 'String' message for each byte
+-- in given 'ByteString' and lead data in the given 'Utf8State'
+-- value. The bytes are the positions from 'errStart' (inclusive) to
+-- 'errEnd' (exclusive). Any substitute characters are pushed onto the
+-- supplied 'TextDataStack' argument.
+handleUtf8Err
+  :: OnDecodeError
+  -> String
+  -> Int
+  -> Int
+  -> Utf8State
+  -> ByteString
+  -> TextDataStack
+  -> TextDataStack
+handleUtf8Err onErr errMsg errStart errEnd s bs tds =
+  let h errPos tds'
+        | errPos < errEnd =
+          h (errPos + 1) $
+            case onErr errMsg . Just $ B.index bs errPos of
+              Just c -> pushText (T.singleton c) tds'
+              Nothing -> tds'
+        | otherwise = tds'
+  in
+  ( if errStart < 0
+    then
+      let partCP = partialUtf8CodePoint s
+          partCPLen = partUtf8CPLen partCP
+          g i tds'
+            | i < partCPLen = g (i + 1) $
+              case onErr errMsg (Just (partUtf8CPUnsafeIndex i partCP)) of
+                Just c -> pushText (T.singleton c) tds'
+                Nothing -> tds'
+            | otherwise = h (partCPLen + errStart) tds'
+      in
+      g 0
+    else h errStart
+  ) tds
+
+invalidUtf8Msg :: String
+invalidUtf8Msg = "Data.Text.Internal.Encoding: Invalid UTF-8 stream"
+
+-- | Decode a 'ByteString' containing UTF-8 encoded text.
+--
+-- Surrogate code points in replacement character returned by 'OnDecodeError'
+-- will be automatically remapped to the replacement char @U+FFFD@.
+decodeUtf8With ::
+#if defined(ASSERTS)
+  HasCallStack =>
+#endif
+  OnDecodeError -> ByteString -> Text
+decodeUtf8With onErr bs =
+  let g bs' res =
+        case res of
+          ((len, eS), tds) ->
+            let h msg pos s = handleUtf8Err onErr msg len pos s bs' tds in
+            case eS of
+              Left (pos, bs'') -> g bs'' . decodeNextUtf8Chunk bs'' startUtf8State $ h invalidUtf8Msg pos startUtf8State
+              Right s -> stackToText $ h "Data.Text.Internal.Encoding: Incomplete UTF-8 code point" (B.length bs') s
+  in
+  g bs $ decodeUtf8Chunk bs
diff --git a/src/Data/Text/Lazy/Encoding.hs b/src/Data/Text/Lazy/Encoding.hs
index 5b5e8bb9..418e473e 100644
--- a/src/Data/Text/Lazy/Encoding.hs
+++ b/src/Data/Text/Lazy/Encoding.hs
@@ -112,7 +112,7 @@ decodeUtf8With onErr (B.Chunk b0 bs0) =
           ((len, eS), tds') ->
             let h errMsg pos s' = TE.handleUtf8Err onErr errMsg len pos s' bs tds' in
             case eS of
-              Left (pos, bs') -> g bs' lbs TE.startUtf8ValidState
+              Left (pos, bs') -> g bs' lbs TE.startUtf8State
                 (h "Data.Text.Internal.Encoding: Invalid UTF-8 stream" pos s) diffText
               Right s' ->
                 case lbs of
@@ -120,7 +120,7 @@ decodeUtf8With onErr (B.Chunk b0 bs0) =
                     g bs' lbs' s' TE.emptyStack $ diffText . chunk (TE.stackToText tds')
                   B.Empty -> diffText $ chunk (TE.stackToText $ h "Data.Text.Internal.Encoding: Incomplete UTF-8 code point" bLen s') Empty
   in
-  g b0 bs0 TE.startUtf8ValidState TE.emptyStack id
+  g b0 bs0 TE.startUtf8State TE.emptyStack id
 decodeUtf8With _ _ = empty
 
 -- | Decode a 'ByteString' containing UTF-8 encoded text that is known
diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs
index 5585803e..d75cfaaa 100644
--- a/tests/Tests/Properties/Transcoding.hs
+++ b/tests/Tests/Properties/Transcoding.hs
@@ -29,6 +29,7 @@ import qualified Data.Text as T
 import qualified Data.Text.Encoding as E
 import qualified Data.Text.Encoding.Error as E
 import Data.Text.Internal.Encoding.Utf8 (isUtf8StateIsComplete)
+import qualified Data.Text.Internal.Encoding as E
 import qualified Data.Text.Lazy as TL
 import qualified Data.Text.Lazy.Encoding as EL
 -- import qualified Data.Text.Internal.Lazy as BL
@@ -100,11 +101,11 @@ t_pn_utf8_2   = case E.validateUtf8Chunk (B.pack [0xF0]) of
 t_pn_utf8_3 = case E.validateUtf8Chunk $ B.pack [0xc2] of
   (len1, eS1) -> whenEqProp len1 0 $ case eS1 of
     Left _ -> counterexample (show eS1) False
-    Right s1 -> whenEqProp (E.partialUtf8CodePoint s1) 0x01c20000 $
+    Right s1 -> E.partialUtf8CodePoint s1 === E.PartialUtf8CodePoint 0x01c20000 .&&.
       if isUtf8StateIsComplete $ E.utf8CodePointState s1
       then counterexample (show $ E.utf8CodePointState s1) False
       else case E.validateNextUtf8Chunk (B.pack [0x80, 0x80]) s1 of
-        (len2, eS2) -> whenEqProp len2 1 $ eS2 === Left 2
+        (len2, eS2) -> len2 === 1 .&&. eS2 === Left 2
 
 t_utf8_c     = (E.stackToText . snd . E.decodeUtf8Chunk . E.encodeUtf8) `eq` id
 t_utf8       = (E.decodeUtf8 . E.encodeUtf8) `eq` id
@@ -292,14 +293,13 @@ t_decode_chunk1 = case E.decodeUtf8Chunk $ B.pack [0xc2] of
     case eS1 of
       Left _ -> counterexample (show eS1) False
       Right s1 -> let partCP = E.partialUtf8CodePoint s1 in
-        whenEqProp partCP 0x01c20000 .
-        whenEqProp (E.partUtf8CPLen partCP) 1 .
-        whenEqProp (E.wordAtPartUft8CP 0 partCP) (Just 0xc2) .
-        whenEqProp (E.wordAtPartUft8CP 1 partCP) Nothing $
+        partCP === E.PartialUtf8CodePoint 0x01c20000 .&&.
+        E.partUtf8CPLen partCP === 1 .&&.
+        E.partUtf8CPUnsafeIndex 0 partCP === 0xc2 .&&.
         if isUtf8StateIsComplete $ E.utf8CodePointState s1
         then counterexample (show $ E.utf8CodePointState s1) False
         else case E.decodeNextUtf8Chunk (B.pack [0x80, 0x80]) s1 tds1 of
-          ((len2, eS2), tds2) -> whenEqProp (len2, E.dataStack tds2, E.stackLen tds2) (1, [Right $ Right $ B.pack [0x80], Right $ Left 0x1c20000], 2) $
+          ((len2, eS2), tds2) -> whenEqProp (len2, E.dataStack tds2, E.stackLen tds2) (1, [Right $ Right $ B.pack [0x80], Right $ Left (E.PartialUtf8CodePoint 0x1c20000)], 2) $
             case eS2 of
               Right _ -> counterexample (show eS2) False
               Left res -> res === (2, mempty)
@@ -309,10 +309,9 @@ t_decode_chunk2 = case E.decodeUtf8Chunk $ B.pack [0xf0] of
     case eS1 of
       Left _ -> counterexample (show eS1) False
       Right s1 -> let partCP = E.partialUtf8CodePoint s1 in
-        whenEqProp partCP 0x01f00000 .
-        whenEqProp (E.partUtf8CPLen partCP) 1 .
-        whenEqProp (E.wordAtPartUft8CP 0 partCP) (Just 0xf0) .
-        whenEqProp (E.wordAtPartUft8CP 1 partCP) Nothing $
+        partCP === E.PartialUtf8CodePoint 0x01f00000 .&&.
+        E.partUtf8CPLen partCP === 1 .&&.
+        E.partUtf8CPUnsafeIndex 0 partCP === 0xf0 .&&.
         if isUtf8StateIsComplete $ E.utf8CodePointState s1
         then counterexample (show $ E.utf8CodePointState s1) False
         else case E.decodeNextUtf8Chunk (B.pack [0x90, 0x80]) s1 tds1 of
@@ -320,12 +319,11 @@ t_decode_chunk2 = case E.decodeUtf8Chunk $ B.pack [0xf0] of
             case eS2 of
               Left _ -> counterexample (show eS2) False
               Right s2 -> let partCP2 = E.partialUtf8CodePoint s2 in
-                whenEqProp partCP2 0x03f09080 .
-                whenEqProp (E.partUtf8CPLen partCP2) 3 .
-                whenEqProp (E.wordAtPartUft8CP 0 partCP2) (Just 0xf0) .
-                whenEqProp (E.wordAtPartUft8CP 1 partCP2) (Just 0x90) .
-                whenEqProp (E.wordAtPartUft8CP 2 partCP2) (Just 0x80) $
-                E.wordAtPartUft8CP 3 partCP2 === Nothing
+                partCP2 === E.PartialUtf8CodePoint 0x03f09080 .&&.
+                E.partUtf8CPLen partCP2 === 3 .&&.
+                E.partUtf8CPUnsafeIndex 0 partCP2 === 0xf0 .&&.
+                E.partUtf8CPUnsafeIndex 1 partCP2 === 0x90 .&&.
+                E.partUtf8CPUnsafeIndex 2 partCP2 === 0x80
 
 t_infix_concat bs1 text bs2 =
   forAll (Blind <$> genDecodeErr Replace) $ \(Blind onErr) ->
diff --git a/text.cabal b/text.cabal
index 445bb6e5..0d420fa2 100644
--- a/text.cabal
+++ b/text.cabal
@@ -150,6 +150,7 @@ library
     Data.Text.Internal.Builder.RealFloat.Functions
     Data.Text.Internal.ByteStringCompat
     Data.Text.Internal.PrimCompat
+    Data.Text.Internal.Encoding
     Data.Text.Internal.Encoding.Fusion
     Data.Text.Internal.Encoding.Fusion.Common
     Data.Text.Internal.Encoding.Utf16

From 4d9c09091290aeabfb17ff071f255e253297d1b1 Mon Sep 17 00:00:00 2001
From: Li-yao Xia <lysxia@gmail.com>
Date: Tue, 31 Jan 2023 18:07:16 +0000
Subject: [PATCH 63/87] More streaming decodeUtf8 (WIP)

---
 src/Data/Text/Encoding.hs               |  43 ++-
 src/Data/Text/Internal/Encoding.hs      | 383 +++++++++++-------------
 src/Data/Text/Internal/StrictBuilder.hs |   0
 src/Data/Text/Lazy/Encoding.hs          |  24 +-
 4 files changed, 194 insertions(+), 256 deletions(-)
 create mode 100644 src/Data/Text/Internal/StrictBuilder.hs

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index e8f2941e..d24942a1 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -37,13 +37,6 @@ module Data.Text.Encoding
     -- $total
     , decodeLatin1
     , decodeASCIIPrefix
-    , TextDataStack
-    , dataStack
-    , stackLen
-    , emptyStack
-    , pushText
-    , stackToText
-    , decodeNextUtf8Chunk
     , decodeUtf8Chunk
     , decodeUtf8Lenient
 
@@ -51,7 +44,6 @@ module Data.Text.Encoding
     , decodeUtf8'
 
     -- *** Controllable error handling
-    , handleUtf8Err
     , decodeUtf8With
     , decodeUtf16LEWith
     , decodeUtf16BEWith
@@ -114,6 +106,7 @@ import qualified Data.Text.Array as A
 import Data.Text.Internal.Encoding
 import qualified Data.Text.Internal.Encoding.Fusion as E
 import qualified Data.Text.Internal.Fusion as F
+import Data.Text.Show ()
 #if defined(ASSERTS)
 import GHC.Stack (HasCallStack)
 #endif
@@ -322,21 +315,25 @@ streamDecodeUtf8With ::
   HasCallStack =>
 #endif
   OnDecodeError -> ByteString -> Decoding
-streamDecodeUtf8With onErr bs =
-  let h w32 n ws =
-        if n >= 0
-        then h w32 (n - 1) $ partUtf8CPUnsafeIndex n w32 : ws
-        else ws
-      g bs' s tds =
-        case decodeNextUtf8Chunk bs' s tds of
-          ((len, eS), tds') ->
-            case eS of
-              Left (pos, bs'') -> g bs'' startUtf8State $ handleUtf8Err onErr invalidUtf8Msg len pos s bs' tds'
-              Right s' -> let bss' = partialUtf8CodePoint s' in
-                Some (stackToText tds') (B.pack $ h bss' (partUtf8CPLen bss' - 1) []) $ \ bs'' ->
-                  g bs'' s' emptyStack
-  in
-  g bs startUtf8State emptyStack
+streamDecodeUtf8With onErr = loop startUtf8State
+  where
+    loop s chunk =
+      let (builder, undecoded, s') = decodeUtf8With2 (onErr invalidUtf8Msg . Just) chunk s
+      in Some (strictBuilderToText builder) undecoded (loop s')
+
+-- | Decode a 'ByteString' containing UTF-8 encoded text.
+--
+-- Surrogate code points in replacement character returned by 'OnDecodeError'
+-- will be automatically remapped to the replacement char @U+FFFD@.
+decodeUtf8With ::
+#if defined(ASSERTS)
+  HasCallStack =>
+#endif
+  OnDecodeError -> ByteString -> Text
+decodeUtf8With onErr = decodeUtf8With1 (onErr invalidUtf8Msg . Just)
+
+invalidUtf8Msg :: String
+invalidUtf8Msg = "Data.Text.Encoding: Invalid UTF-8 stream"
 
 -- | Decode a 'ByteString' containing UTF-8 encoded text that is known
 -- to be valid.
diff --git a/src/Data/Text/Internal/Encoding.hs b/src/Data/Text/Internal/Encoding.hs
index a16ac3bd..41b69902 100644
--- a/src/Data/Text/Internal/Encoding.hs
+++ b/src/Data/Text/Internal/Encoding.hs
@@ -1,42 +1,28 @@
 {-# LANGUAGE BangPatterns, CPP, GeneralizedNewtypeDeriving, MagicHash,
     UnliftedFFITypes #-}
 {-# LANGUAGE Trustworthy #-}
-{-# LANGUAGE TypeApplications #-}
-{-# LANGUAGE ScopedTypeVariables #-}
+{-# LANGUAGE RankNTypes #-}
 {-# LANGUAGE ViewPatterns #-}
 -- |
 module Data.Text.Internal.Encoding where
 
-import Control.Exception (evaluate, try)
 import Control.Monad.ST (ST, runST)
 import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
-import Data.Bifunctor (Bifunctor(first))
-import Data.Bits (shiftL, shiftR, (.&.))
+import Data.Bits (shiftL, shiftR)
 import Data.ByteString (ByteString)
-import qualified Data.ByteString.Short.Internal as SBS
-import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode)
-import Data.Text.Internal (Text(..), empty)
+import Data.Functor (void)
+import Data.Text.Internal (Text(..), empty, safe)
 import Data.Text.Internal.Unsafe (unsafeWithForeignPtr)
-import Data.Text.Show as T (singleton)
 import Data.Text.Unsafe (unsafeDupablePerformIO)
 import Data.Word (Word32, Word8)
 import Foreign.C.Types (CSize(..))
-import Foreign.Ptr (Ptr, minusPtr, plusPtr)
-import Foreign.Storable (poke, peekByteOff)
-import GHC.Exts (byteArrayContents#, unsafeCoerce#)
-import GHC.ForeignPtr (ForeignPtr(..), ForeignPtrContents(PlainPtr))
+import Foreign.Ptr (Ptr)
 import Data.Text.Internal.ByteStringCompat (withBS)
 import Data.Text.Internal.Encoding.Utf8
-  (Utf8CodePointState, utf8StartState, updateUtf8State, isUtf8StateIsComplete)
+  (Utf8CodePointState, utf8StartState, updateUtf8State, isUtf8StateIsComplete, utf8Length)
 import qualified Data.ByteString as B
-import qualified Data.ByteString.Internal as B
-import qualified Data.ByteString.Builder as B
-import qualified Data.ByteString.Builder.Internal as B hiding (empty, append)
-import qualified Data.ByteString.Builder.Prim as BP
-import qualified Data.ByteString.Builder.Prim.Internal as BP
 import qualified Data.Text.Array as A
-import qualified Data.Text.Internal.Encoding.Fusion as E
-import qualified Data.Text.Internal.Fusion as F
+import qualified Data.Text.Internal.Unsafe.Char as Char
 #if defined(ASSERTS)
 import GHC.Stack (HasCallStack)
 #endif
@@ -86,8 +72,8 @@ partUtf8CPUnsafeIndex ::
 #if defined(ASSERTS)
   HasCallStack =>
 #endif
-  Int -> PartialUtf8CodePoint -> Word8
-partUtf8CPUnsafeIndex n (PartialUtf8CodePoint w) = 
+  PartialUtf8CodePoint -> Int -> Word8
+partUtf8CPUnsafeIndex (PartialUtf8CodePoint w) n =
 #if defined(ASSERTS)
   assert (0 <= n && n < partUtf8CPLen w) $
 #endif
@@ -135,50 +121,57 @@ foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8
 --       point, and 's' is the resulting 'Utf8State' value, which
 --       can be used to validate against a following 'ByteString' with
 --       'validateNextUtf8Chunk'.
-validateUtf8Chunk ::
-#if defined(ASSERTS)
-  HasCallStack =>
-#endif
-  ByteString -> (Int, Either Int Utf8State)
-validateUtf8Chunk bs@(B.length -> len)
+validateUtf8Chunk :: ByteString -> (Int, Maybe Utf8State)
+validateUtf8Chunk = validateUtf8ChunkFrom 0
+
+-- | Add an offset to the index returned by 'validateUtf8Chunk'.
+--
+-- @
+-- validateUtf8ChunkFrom n = first (+ 1) . 'validateUtf8Chunk' . 'B.drop' n
+-- @
+validateUtf8ChunkFrom :: Int -> ByteString -> (Int, Maybe Utf8State)
+validateUtf8ChunkFrom ofs bs
 #if defined(SIMDUTF) || MIN_VERSION_bytestring(0,11,2)
   | guessUtf8Boundary > 0 &&
     -- the rest of the bytestring valid utf-8 up to the boundary
     (
 #ifdef SIMDUTF
-      withBS bs $ \ fp _ -> unsafeDupablePerformIO $
+      withBS (B.drop ofs bs) $ \ fp _ -> unsafeDupablePerformIO $
         unsafeWithForeignPtr fp $ \ptr -> (/= 0) <$>
           c_is_valid_utf8 ptr (fromIntegral guessUtf8Boundary)
 #else
-      B.isValidUtf8 $ B.take guessUtf8Boundary bs
+      B.isValidUtf8 $ B.take guessUtf8Boundary (B.drop ofs bs)
 #endif
-    ) = getEndState guessUtf8Boundary
+    ) = slowValidateUtf8ChunkFrom (ofs + guessUtf8Boundary) bs
     -- No
-  | otherwise = getEndState 0
+  | otherwise = slowValidateUtf8ChunkFrom ofs bs
     where
-      getEndState ndx = validateUtf8 ndx ndx utf8StartState
-      w n word8 = len >= n && word8 <= (B.index bs $ len - n)
+      len = B.length bs
+      isBoundary n word8 = len >= n && word8 <= B.index bs (ofs + len - n)
       guessUtf8Boundary
-        | w 3 0xf0 = len - 3  -- third to last char starts a four-byte code point
-        | w 2 0xe0 = len - 2  -- pre-last char starts a three-or-four-byte code point
-        | w 1 0xc2 = len - 1  -- last char starts a two-(or more-)byte code point
+        | isBoundary 3 0xf0 = len - 3  -- third to last char starts a four-byte code point
+        | isBoundary 2 0xe0 = len - 2  -- pre-last char starts a three-or-four-byte code point
+        | isBoundary 1 0xc2 = len - 1  -- last char starts a two-(or more-)byte code point
         | otherwise = len
 #else
-  = validateUtf8 0 0 utf8StartState
-    where
+  = slowValidateUtf8ChunkFrom ofs bs
 #endif
-      validateUtf8 !ndx0 ndx s
-        | ndx < len =
-          let ndx' = ndx + 1 in
-          case updateUtf8State (B.index bs ndx) s of
+
+-- | A pure Haskell implementation of validateUtf8Chunk.
+--
+-- Ideally the primitives 'B.isValidUtf8' or 'c_is_valid_utf8' should give us
+-- indices to let us avoid this function.
+slowValidateUtf8ChunkFrom :: Int -> ByteString -> (Int, Maybe Utf8State)
+slowValidateUtf8ChunkFrom ofs bs = loop ofs ofs utf8StartState
+  where
+    loop !utf8End i s
+      | i < B.length bs =
+          case updateUtf8State (B.index bs i) s of
             Just s' ->
-              validateUtf8 (
-                if isUtf8StateIsComplete s'
-                then ndx'
-                else ndx0
-              ) ndx' s'
-            Nothing -> (ndx0, Left $ if ndx == ndx0 then ndx' else ndx)
-        | otherwise = (ndx0, Right $ Utf8State s (partUtf8CPUnsafeAppend partUtf8CPEmpty $ B.drop ndx0 bs))
+              let utf8End' = if isUtf8StateIsComplete s' then i + 1 else utf8End
+              in loop (i + 1) utf8End' s'
+            Nothing -> (utf8End, Nothing)
+      | otherwise = (utf8End, Just (Utf8State s (partUtf8CPUnsafeAppend partUtf8CPEmpty (B.drop utf8End bs))))
 
 -- | Validate a 'ByteString' as a contiuation of UTF-8-encoded text.
 --
@@ -202,94 +195,78 @@ validateUtf8Chunk bs@(B.length -> len)
 --       at index 'n' are the beginning of an incomplete UTF-8 code
 --       point, and `s'` is the resulting 'Utf8State' value, which
 --       can be used to validate against a following 'ByteString'.
-validateNextUtf8Chunk ::
-#if defined(ASSERTS)
-  HasCallStack =>
-#endif
-  ByteString -> Utf8State -> (Int, Either Int Utf8State)
-validateNextUtf8Chunk bs@(B.length -> len) st@(Utf8State s lead)
-  | len > 0 =
-    let g pos s'
-          -- first things first. let's try to get to the start of the next code point
-          | isUtf8StateIsComplete s' =
-            -- found the beginning of the next code point, hand this off to someone else
-            case validateUtf8Chunk $ B.drop pos bs of
-              (len', mS) -> (pos + len', first (+ pos) mS)
-          -- code point is not complete yet
-          -- walk the rest of the code point until error, complete, or no more data
-          | pos < len =
-            case updateUtf8State (B.index bs pos) s' of
-              -- error
-              Nothing -> (leadPos, Left pos)
-              -- keep going
-              Just s'' -> g (pos + 1) s''
-          -- no more data
-          | otherwise = (leadPos, Right $ Utf8State s' (partUtf8CPUnsafeAppend lead bs))
-    in g 0 s
-  | otherwise = (leadPos, Right st)
-    where leadPos = -(partUtf8CPLen lead)
+validateNextUtf8Chunk :: ByteString -> Utf8State -> (Int, Maybe Utf8State)
+validateNextUtf8Chunk bs st@(Utf8State s0 part)
+  | len > 0 = loop 0 s0
+  | otherwise = (0, Just st)
+  where
+    len = B.length bs
+    -- Complete an incomplete code point (if there is one)
+    -- and then jump to validateUtf8ChunkFrom
+    loop i s
+      | isUtf8StateIsComplete s = validateUtf8ChunkFrom i bs
+      | i < len =
+        case updateUtf8State (B.index bs i) s of
+          Nothing -> (0, Nothing)
+          Just s' -> loop (i + 1) s'
+      | otherwise = (0, Just (Utf8State s (partUtf8CPUnsafeAppend part bs)))
 
--- | Validated UTF-8 data to be converted into a 'Text' value.
-data TextDataStack = TextDataStack
-  { -- | Returns a list of 'Text' and UTF-8-valid 'ByteString' values.
-    dataStack :: [Either Text (Either PartialUtf8CodePoint ByteString)]
-    -- | Returns total number of UTF-8 valid bytes in the stack.
-  , stackLen :: Int
+-- | Construct an Array. This is currently an internal data structure which is
+-- only used to construct Text (so for example byteStringToStrictBuilder should
+-- only be applied to valid UTF-8 bytestrings).
+data StrictBuilder = StrictBuilder
+  { sbLength :: !Int
+  , sbWrite :: forall s. A.MArray s -> Int -> ST s ()
   }
-  deriving Show
 
--- | Empty stack
-emptyStack :: TextDataStack
-emptyStack = TextDataStack [] 0
+emptyStrictBuilder :: StrictBuilder
+emptyStrictBuilder = StrictBuilder 0 (\_ _ -> pure ())
 
--- | Push a text value onto the stack
-pushText :: Text -> TextDataStack -> TextDataStack
-pushText t@(Text _ _ tLen) tds@(TextDataStack stack sLen) =
-  if tLen > 0
-  then TextDataStack (Left t : stack) $ sLen + tLen
-  else tds
+-- | Right-biased append: run the right action first.  This allows a builder to
+-- run tail-recursively when accumulating text left-to-right.
+appendRStrictBuilder :: StrictBuilder -> StrictBuilder -> StrictBuilder
+appendRStrictBuilder (StrictBuilder n1 write1) (StrictBuilder n2 write2) =
+  StrictBuilder (n1 + n2) (\arr ofs -> do
+    write2 arr (ofs + n1)
+    write1 arr ofs)
 
-copyFromStack :: [Either Text (Either PartialUtf8CodePoint ByteString)] -> Int -> A.MArray s -> ST s ()
-copyFromStack (dat : dataStack') tLen dst =
-  (case dat of
-    Left (Text arr0 off utf8Len) -> do
-      let dstOff = tLen - utf8Len
-      A.copyI utf8Len dst dstOff arr0 off
-      pure dstOff
-    Right encoded ->
-      case encoded of
-        Left partial -> do
-          let utf8Len = partUtf8CPLen partial
-              dstOff = tLen - utf8Len
-              g i | i < utf8Len = do
-                    A.unsafeWrite dst (dstOff + i) (partUtf8CPUnsafeIndex i partial)
-                    g (i + 1)
-                  | otherwise = pure ()
-          g dstOff
-          pure utf8Len
-        Right bs@(B.length -> utf8Len) -> do
-          let dstOff = tLen - utf8Len
-          withBS bs $ \ fp _ ->
-            unsafeIOToST . unsafeWithForeignPtr fp $ \ src ->
-              unsafeSTToIO $ A.copyFromPointer dst dstOff src utf8Len
-          pure dstOff) >>= (\ tLen' -> copyFromStack dataStack' tLen' dst)
-copyFromStack _ _ _ = pure ()
-{-# INLINE copyFromStack #-}
+copyFromByteString :: A.MArray s -> Int -> ByteString -> ST s ()
+copyFromByteString dst ofs src = withBS src $ \ srcFPtr len ->
+  unsafeIOToST $ unsafeWithForeignPtr srcFPtr $ \ srcPtr -> do
+    unsafeSTToIO $ A.copyFromPointer dst ofs srcPtr len
 
--- | Create a 'Text' value from the contents of a 'TextDataStack'.
-stackToText ::
-#if defined(ASSERTS)
-  HasCallStack =>
-#endif
-  TextDataStack -> Text
-stackToText (TextDataStack stack sLen)
-  | sLen > 0 = runST $
-    do
-      dst <- A.new sLen
-      copyFromStack stack sLen dst
-      arr <- A.unsafeFreeze dst
-      pure $ Text arr 0 sLen
-  | otherwise = empty
+byteStringToStrictBuilder :: ByteString -> StrictBuilder
+byteStringToStrictBuilder bs =
+  StrictBuilder (B.length bs) (\arr ofs -> copyFromByteString arr ofs bs)
+
+charToStrictBuilder :: Char -> StrictBuilder
+charToStrictBuilder c =
+  StrictBuilder (utf8Length c) (\arr ofs -> void (Char.unsafeWrite arr ofs (safe c)))
+
+word8ToStrictBuilder :: Word8 -> StrictBuilder
+word8ToStrictBuilder w =
+  StrictBuilder 1 (\arr ofs -> A.unsafeWrite arr ofs w)
+
+partUtf8CPFoldr :: (Word8 -> a -> a) -> a -> PartialUtf8CodePoint -> a
+partUtf8CPFoldr f x0 c = case partUtf8CPLen c of
+    0 -> x0
+    1 -> build 0 x0
+    2 -> build 0 (build 1 x0)
+    _ -> build 0 (build 1 (build 2 x0))
+  where
+    build i x = f (partUtf8CPUnsafeIndex c i) x
+
+partUtf8CPToStrictBuilder :: PartialUtf8CodePoint -> StrictBuilder
+partUtf8CPToStrictBuilder =
+  partUtf8CPFoldr (appendRStrictBuilder . word8ToStrictBuilder) emptyStrictBuilder
+
+strictBuilderToText :: StrictBuilder -> Text
+strictBuilderToText (StrictBuilder 0 _) = empty
+strictBuilderToText (StrictBuilder n write) = runST (do
+  dst <- A.new n
+  write dst 0
+  arr <- A.unsafeFreeze dst
+  pure (Text arr 0 n))
 
 -- | Decode a 'ByteString' in the context of what has been already been decoded.
 --
@@ -300,103 +277,77 @@ stackToText (TextDataStack stack sLen)
 -- (@Right Utf8State@) or the position of the of the first (potentially)
 -- valid byte after the invalid bytes with remainder of the 'ByteString'
 -- (@Left (Int, ByteString)@).
-decodeNextUtf8Chunk ::
-#if defined(ASSERTS)
-  HasCallStack =>
-#endif
-  ByteString
-  -> Utf8State
-  -> TextDataStack
-  -> ((Int, Either (Int, ByteString) Utf8State), TextDataStack)
-decodeNextUtf8Chunk bs s tds@(TextDataStack stack sLen) =
+decodeUtf8Chunk :: ByteString -> Utf8State -> (StrictBuilder, ByteString, Maybe Utf8State)
+decodeUtf8Chunk bs s =
   case validateNextUtf8Chunk bs s of
-    (len, res) ->
-      let stackedData'
-            | len >= 0 =
-              let partCP = partialUtf8CodePoint s
-                  partLen = partUtf8CPLen partCP
-                  stackedData@(TextDataStack stack' sLen') =
-                    if partLen > 0
-                    then TextDataStack (Right (Left partCP) : stack) $ sLen + partLen
-                    else tds
-              in
-              if len > 0
-              then TextDataStack (Right (Right $ B.take len bs) : stack') $ sLen' + len
-              else stackedData
-            | otherwise = tds
-      in
-      ( ( len
-        , case res of
-            Left pos -> Left (pos, B.drop pos bs)
-            Right s' -> Right s'
-        )
-      , stackedData'
-      )
+    (len, s') ->
+      let builder | len == 0 = emptyStrictBuilder
+                  | otherwise = partUtf8CPToStrictBuilder (partialUtf8CodePoint s)
+                    `appendRStrictBuilder` byteStringToStrictBuilder (B.take len bs)
+      in (builder, B.drop len bs, s')
 
 -- | Decode a 'ByteString' against a start 'Utf8State' with an empty
 -- 'TextDataStack'.
 --
 -- @decodeUtf8Chunk bs = 'decodeNextUtf8Chunk' bs 'startUtf8State' 'emptyStack'@
-decodeUtf8Chunk :: ByteString -> ((Int, Either (Int, ByteString) Utf8State), TextDataStack)
-decodeUtf8Chunk bs = decodeNextUtf8Chunk bs startUtf8State emptyStack
+decodeUtf8ChunkStart :: ByteString -> (StrictBuilder, ByteString, Maybe Utf8State)
+decodeUtf8ChunkStart bs = decodeUtf8Chunk bs startUtf8State
 
 -- | Call an error handler with the give 'String' message for each byte
 -- in given 'ByteString' and lead data in the given 'Utf8State'
 -- value. The bytes are the positions from 'errStart' (inclusive) to
 -- 'errEnd' (exclusive). Any substitute characters are pushed onto the
 -- supplied 'TextDataStack' argument.
-handleUtf8Err
-  :: OnDecodeError
-  -> String
-  -> Int
-  -> Int
-  -> Utf8State
-  -> ByteString
-  -> TextDataStack
-  -> TextDataStack
-handleUtf8Err onErr errMsg errStart errEnd s bs tds =
-  let h errPos tds'
-        | errPos < errEnd =
-          h (errPos + 1) $
-            case onErr errMsg . Just $ B.index bs errPos of
-              Just c -> pushText (T.singleton c) tds'
-              Nothing -> tds'
-        | otherwise = tds'
-  in
-  ( if errStart < 0
-    then
-      let partCP = partialUtf8CodePoint s
-          partCPLen = partUtf8CPLen partCP
-          g i tds'
-            | i < partCPLen = g (i + 1) $
-              case onErr errMsg (Just (partUtf8CPUnsafeIndex i partCP)) of
-                Just c -> pushText (T.singleton c) tds'
-                Nothing -> tds'
-            | otherwise = h (partCPLen + errStart) tds'
-      in
-      g 0
-    else h errStart
-  ) tds
+skipIncomplete :: (Word8 -> Maybe Char) -> Utf8State -> StrictBuilder
+skipIncomplete onErr s =
+  partUtf8CPFoldr
+    (appendRStrictBuilder . handleUtf8Error onErr)
+    emptyStrictBuilder (partialUtf8CodePoint s)
 
-invalidUtf8Msg :: String
-invalidUtf8Msg = "Data.Text.Internal.Encoding: Invalid UTF-8 stream"
+handleUtf8Error :: (Word8 -> Maybe Char) -> Word8 -> StrictBuilder
+handleUtf8Error onErr w = case onErr w of
+  Just c -> charToStrictBuilder c
+  Nothing -> emptyStrictBuilder
 
--- | Decode a 'ByteString' containing UTF-8 encoded text.
---
--- Surrogate code points in replacement character returned by 'OnDecodeError'
--- will be automatically remapped to the replacement char @U+FFFD@.
-decodeUtf8With ::
+decodeUtf8With1 ::
+#if defined(ASSERTS)
+  HasCallStack =>
+#endif
+  (Word8 -> Maybe Char) -> ByteString -> Text
+decodeUtf8With1 onErr bs0 = strictBuilderToText $
+    builder `appendRStrictBuilder` skipIncomplete onErr s
+  where
+    (builder, _, s) = decodeUtf8With2 onErr bs0 startUtf8State
+
+-- | Helper for 'decodeUtf8With' and 'streamDecodeUtf8With'.
+-- This uses an 'OnDecodeError' to process bad bytes.
+-- This is not a very pretty legacy API.
+-- See 'decodeUtf8Chunk' for a more flexible alternative.
+decodeUtf8With2 ::
 #if defined(ASSERTS)
   HasCallStack =>
 #endif
-  OnDecodeError -> ByteString -> Text
-decodeUtf8With onErr bs =
-  let g bs' res =
-        case res of
-          ((len, eS), tds) ->
-            let h msg pos s = handleUtf8Err onErr msg len pos s bs' tds in
-            case eS of
-              Left (pos, bs'') -> g bs'' . decodeNextUtf8Chunk bs'' startUtf8State $ h invalidUtf8Msg pos startUtf8State
-              Right s -> stackToText $ h "Data.Text.Internal.Encoding: Incomplete UTF-8 code point" (B.length bs') s
-  in
-  g bs $ decodeUtf8Chunk bs
+  (Word8 -> Maybe Char) -> ByteString -> Utf8State -> (StrictBuilder, ByteString, Utf8State)
+decodeUtf8With2 onErr bs0 s0 = loop bs0 s0 emptyStrictBuilder
+  where
+    loop bs s !builder =
+      let nonEmptyPrefix len = builder
+            `appendRStrictBuilder` partUtf8CPToStrictBuilder (partialUtf8CodePoint s)
+            `appendRStrictBuilder` byteStringToStrictBuilder (B.take len bs)
+      in case validateNextUtf8Chunk bs s of
+        (len, Nothing) ->
+          if len == 0 && partUtf8CPLen (partialUtf8CodePoint s) == 0
+          then
+            -- loop is strict on builder, so if onErr raises an error it will be forced here.
+            let builder' = builder `appendRStrictBuilder` skipIncomplete onErr s
+            in loop bs startUtf8State builder'
+          else
+            let builder' = nonEmptyPrefix len
+                  `appendRStrictBuilder` handleUtf8Error onErr (B.index bs len)
+            in loop (B.drop 1 bs) startUtf8State builder'
+        (len, Just s') ->
+          let builder' =
+                if len == 0
+                then builder `appendRStrictBuilder` skipIncomplete onErr s'
+                else nonEmptyPrefix len `appendRStrictBuilder` skipIncomplete onErr s'
+          in (builder', B.drop len bs, s')
diff --git a/src/Data/Text/Internal/StrictBuilder.hs b/src/Data/Text/Internal/StrictBuilder.hs
new file mode 100644
index 00000000..e69de29b
diff --git a/src/Data/Text/Lazy/Encoding.hs b/src/Data/Text/Lazy/Encoding.hs
index 418e473e..a50f9d9e 100644
--- a/src/Data/Text/Lazy/Encoding.hs
+++ b/src/Data/Text/Lazy/Encoding.hs
@@ -61,12 +61,12 @@ import Data.Monoid (Monoid(..))
 import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
 import Data.Text.Internal.Lazy (Text(..), chunk, empty, foldrChunks)
 import Data.Word (Word8)
-import qualified Data.ByteString as S
 import qualified Data.ByteString.Builder as B
 import qualified Data.ByteString.Builder.Prim as BP
 import qualified Data.ByteString.Lazy as B
 import qualified Data.ByteString.Lazy.Internal as B
 import qualified Data.Text.Encoding as TE
+import qualified Data.Text.Internal.Encoding as TE
 import qualified Data.Text.Internal.Lazy.Encoding.Fusion as E
 import qualified Data.Text.Internal.Lazy.Fusion as F
 import Data.Text.Unsafe (unsafeDupablePerformIO)
@@ -106,22 +106,12 @@ decodeLatin1 = foldr (chunk . TE.decodeLatin1) empty . B.toChunks
 
 -- | Decode a 'ByteString' containing UTF-8 encoded text.
 decodeUtf8With :: OnDecodeError -> B.ByteString -> Text
-decodeUtf8With onErr (B.Chunk b0 bs0) =
-  let g bs@(S.length -> bLen) lbs s tds diffText =
-        case TE.decodeNextUtf8Chunk bs s tds of
-          ((len, eS), tds') ->
-            let h errMsg pos s' = TE.handleUtf8Err onErr errMsg len pos s' bs tds' in
-            case eS of
-              Left (pos, bs') -> g bs' lbs TE.startUtf8State
-                (h "Data.Text.Internal.Encoding: Invalid UTF-8 stream" pos s) diffText
-              Right s' ->
-                case lbs of
-                  B.Chunk bs' lbs' ->
-                    g bs' lbs' s' TE.emptyStack $ diffText . chunk (TE.stackToText tds')
-                  B.Empty -> diffText $ chunk (TE.stackToText $ h "Data.Text.Internal.Encoding: Incomplete UTF-8 code point" bLen s') Empty
-  in
-  g b0 bs0 TE.startUtf8State TE.emptyStack id
-decodeUtf8With _ _ = empty
+decodeUtf8With onErr = loop TE.startUtf8State
+  where
+    loop s (B.Chunk b0 bs0) = case TE.decodeUtf8With2 onErr' b0 s of
+      (builder, _, s') -> Chunk (TE.strictBuilderToText builder) (loop s' bs0)
+    loop s B.Empty = Chunk (TE.strictBuilderToText (TE.skipIncomplete onErr' s)) Empty
+    onErr' = onErr "Data.Text.Internal.Encoding: Invalid UTF-8 stream" . Just
 
 -- | Decode a 'ByteString' containing UTF-8 encoded text that is known
 -- to be valid.

From 1a96f2df03e8da76f5bd7663b98f64636013177e Mon Sep 17 00:00:00 2001
From: Li-yao Xia <lysxia@gmail.com>
Date: Fri, 3 Feb 2023 19:14:15 +0000
Subject: [PATCH 64/87] Fix bugs, fix tests

---
 src/Data/Text/Encoding.hs             |   4 +-
 src/Data/Text/Internal/Encoding.hs    | 151 ++++++++------
 src/Data/Text/Lazy/Encoding.hs        |   8 +-
 tests/Tests/Properties/Transcoding.hs | 285 +++++++++++++-------------
 tests/Tests/QuickCheckUtils.hs        |   8 -
 5 files changed, 237 insertions(+), 219 deletions(-)

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index d24942a1..6e52d24d 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -27,7 +27,7 @@ module Data.Text.Encoding
     -- $validation
       Utf8State
     , validateUtf8Chunk
-    , validateNextUtf8Chunk
+    , validateUtf8More
     , startUtf8State
 
     -- * Decoding ByteStrings to Text
@@ -318,7 +318,7 @@ streamDecodeUtf8With ::
 streamDecodeUtf8With onErr = loop startUtf8State
   where
     loop s chunk =
-      let (builder, undecoded, s') = decodeUtf8With2 (onErr invalidUtf8Msg . Just) chunk s
+      let (builder, undecoded, s') = decodeUtf8With2 (onErr invalidUtf8Msg . Just) s chunk
       in Some (strictBuilderToText builder) undecoded (loop s')
 
 -- | Decode a 'ByteString' containing UTF-8 encoded text.
diff --git a/src/Data/Text/Internal/Encoding.hs b/src/Data/Text/Internal/Encoding.hs
index 41b69902..7ae90d8a 100644
--- a/src/Data/Text/Internal/Encoding.hs
+++ b/src/Data/Text/Internal/Encoding.hs
@@ -8,7 +8,7 @@ module Data.Text.Internal.Encoding where
 
 import Control.Monad.ST (ST, runST)
 import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
-import Data.Bits (shiftL, shiftR)
+import Data.Bits ((.&.), shiftL, shiftR)
 import Data.ByteString (ByteString)
 import Data.Functor (void)
 import Data.Text.Internal (Text(..), empty, safe)
@@ -17,10 +17,12 @@ import Data.Text.Unsafe (unsafeDupablePerformIO)
 import Data.Word (Word32, Word8)
 import Foreign.C.Types (CSize(..))
 import Foreign.Ptr (Ptr)
+import Foreign.Storable (pokeElemOff)
 import Data.Text.Internal.ByteStringCompat (withBS)
 import Data.Text.Internal.Encoding.Utf8
   (Utf8CodePointState, utf8StartState, updateUtf8State, isUtf8StateIsComplete, utf8Length)
 import qualified Data.ByteString as B
+import qualified Data.ByteString.Internal as BI
 import qualified Data.Text.Array as A
 import qualified Data.Text.Internal.Unsafe.Char as Char
 #if defined(ASSERTS)
@@ -40,64 +42,91 @@ import qualified Data.ByteString.Unsafe as B
 data Utf8State = Utf8State
   { -- | Current UTF-8 code point state of the 'ByteString's
     -- that have been validated thus far.
-    utf8CodePointState :: Utf8CodePointState
+    utf8CodePointState :: {-# UNPACK #-} !Utf8CodePointState
     -- | Get the incomplete UTF-8 code point of the 'ByteString's that
     -- have been validated thus far. The first byte of the 'Word32'
     -- indicates the number of bytes of the code point are available,
     -- and is followed by the bytes of the code point.
-  , partialUtf8CodePoint :: PartialUtf8CodePoint
+  , partialUtf8CodePoint :: {-# UNPACK #-} !PartialUtf8CodePoint
   }
   deriving (Eq, Ord, Show)
 
 -- | This represtents the starting state of a UTF-8 validation check.
 startUtf8State :: Utf8State 
-startUtf8State = Utf8State utf8StartState partUtf8CPEmpty
+startUtf8State = Utf8State utf8StartState partUtf8Empty
 
--- | Prefix of a valid UTF-8 encoded code point.
+-- | Prefix of a valid UTF-8 encoded code point encoded in 4 bytes.
 -- This consists of a length (in bytes) between 1 and 3 stored in the most
 -- significant byte, and the actual bytes in the rest of the word.
+--
+-- All of its operations are the functions below.
+-- The constructor should never be used outside of those.
 newtype PartialUtf8CodePoint = PartialUtf8CodePoint Word32
   deriving (Eq, Ord, Show)
 
 -- | Empty prefix.
-partUtf8CPEmpty :: PartialUtf8CodePoint
-partUtf8CPEmpty = PartialUtf8CodePoint 0
+partUtf8Empty :: PartialUtf8CodePoint
+partUtf8Empty = PartialUtf8CodePoint 0
 
 -- | Length of the partial code point, stored in the most significant byte.
-partUtf8CPLen :: PartialUtf8CodePoint -> Int
-partUtf8CPLen (PartialUtf8CodePoint w) = fromIntegral $ w `shiftR` 24
+partUtf8Len :: PartialUtf8CodePoint -> Int
+partUtf8Len (PartialUtf8CodePoint w) = fromIntegral $ w `shiftR` 24
+
+partUtf8CompleteLen :: PartialUtf8CodePoint -> Int
+partUtf8CompleteLen c@(PartialUtf8CodePoint w)
+  | partUtf8Len c == 0 = 0
+  | 0xf0 <= firstByte = 4
+  | 0xe0 <= firstByte = 3
+  | 0xc2 <= firstByte = 2
+  | otherwise = 0
+  where
+    firstByte = (w `shiftR` 16) .&. 255
 
--- | Get the @n@-th byte, assuming it is within bounds: @0 <= n < partUtf8CPLen c@.
-partUtf8CPUnsafeIndex ::
+-- | Get the @n@-th byte, assuming it is within bounds: @0 <= n < partUtf8Len c@.
+partUtf8UnsafeIndex ::
 #if defined(ASSERTS)
   HasCallStack =>
 #endif
   PartialUtf8CodePoint -> Int -> Word8
-partUtf8CPUnsafeIndex (PartialUtf8CodePoint w) n =
+partUtf8UnsafeIndex (PartialUtf8CodePoint w) n =
 #if defined(ASSERTS)
-  assert (0 <= n && n < partUtf8CPLen w) $
+  assert (0 <= n && n < partUtf8Len w) $
 #endif
   fromIntegral $ w `shiftR` (16 - 8 * n)
 
-partUtf8CPUnsafeAppend ::
+partUtf8UnsafeAppend ::
 #if defined(ASSERTS)
   HasCallStack =>
 #endif
   PartialUtf8CodePoint -> ByteString -> PartialUtf8CodePoint
-partUtf8CPUnsafeAppend c@(PartialUtf8CodePoint word) bs =
+partUtf8UnsafeAppend c@(PartialUtf8CodePoint word) bs =
 #if defined(ASSERTS)
   assert (lenc + lenbs <= 3) $
 #endif
   PartialUtf8CodePoint $
     tryPush 0 $ tryPush 1 $ tryPush 2 $ word + (fromIntegral lenbs `shiftL` 24)
   where
-    lenc = partUtf8CPLen c
+    lenc = partUtf8Len c
     lenbs = B.length bs
     tryPush i w =
       if i < lenbs
       then w + (fromIntegral (B.index bs i) `shiftL` fromIntegral (16 - 8 * (lenc + i)))
       else w
 
+-- | This avoids recursion to unfold to straightline code.
+partUtf8Foldr :: (Word8 -> a -> a) -> a -> PartialUtf8CodePoint -> a
+partUtf8Foldr f x0 c = case partUtf8Len c of
+    0 -> x0
+    1 -> build 0 x0
+    2 -> build 0 (build 1 x0)
+    _ -> build 0 (build 1 (build 2 x0))
+  where
+    build i x = f (partUtf8UnsafeIndex c i) x
+
+partUtf8ToByteString :: PartialUtf8CodePoint -> B.ByteString
+partUtf8ToByteString c = BI.unsafeCreate (partUtf8Len c) $ \ptr ->
+  partUtf8Foldr (\w k i -> pokeElemOff ptr i w >> k (i+1)) (\_ -> pure ()) c 0
+
 #ifdef SIMDUTF
 foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8
     :: Ptr Word8 -> CSize -> IO CInt
@@ -105,7 +134,7 @@ foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8
 
 -- | Validate a 'ByteString' as a UTF-8-encoded text.
 --
--- @validateUtf8Chunk chunk = (n, es)@
+-- @validateUtf8More chunk = (n, es)@
 --
 -- This function returns two values:
 --
@@ -124,10 +153,10 @@ foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8
 validateUtf8Chunk :: ByteString -> (Int, Maybe Utf8State)
 validateUtf8Chunk = validateUtf8ChunkFrom 0
 
--- | Add an offset to the index returned by 'validateUtf8Chunk'.
+-- | Add an offset to the index returned by 'validateUtf8More'.
 --
 -- @
--- validateUtf8ChunkFrom n = first (+ 1) . 'validateUtf8Chunk' . 'B.drop' n
+-- validateUtf8ChunkFrom n = first (+ 1) . 'validateUtf8More' . 'B.drop' n
 -- @
 validateUtf8ChunkFrom :: Int -> ByteString -> (Int, Maybe Utf8State)
 validateUtf8ChunkFrom ofs bs
@@ -146,7 +175,7 @@ validateUtf8ChunkFrom ofs bs
     -- No
   | otherwise = slowValidateUtf8ChunkFrom ofs bs
     where
-      len = B.length bs
+      len = B.length bs - ofs
       isBoundary n word8 = len >= n && word8 <= B.index bs (ofs + len - n)
       guessUtf8Boundary
         | isBoundary 3 0xf0 = len - 3  -- third to last char starts a four-byte code point
@@ -157,7 +186,7 @@ validateUtf8ChunkFrom ofs bs
   = slowValidateUtf8ChunkFrom ofs bs
 #endif
 
--- | A pure Haskell implementation of validateUtf8Chunk.
+-- | A pure Haskell implementation of validateUtf8More.
 --
 -- Ideally the primitives 'B.isValidUtf8' or 'c_is_valid_utf8' should give us
 -- indices to let us avoid this function.
@@ -169,9 +198,9 @@ slowValidateUtf8ChunkFrom ofs bs = loop ofs ofs utf8StartState
           case updateUtf8State (B.index bs i) s of
             Just s' ->
               let utf8End' = if isUtf8StateIsComplete s' then i + 1 else utf8End
-              in loop (i + 1) utf8End' s'
+              in loop utf8End' (i + 1) s'
             Nothing -> (utf8End, Nothing)
-      | otherwise = (utf8End, Just (Utf8State s (partUtf8CPUnsafeAppend partUtf8CPEmpty (B.drop utf8End bs))))
+      | otherwise = (utf8End, Just (Utf8State s (partUtf8UnsafeAppend partUtf8Empty (B.drop utf8End bs))))
 
 -- | Validate a 'ByteString' as a contiuation of UTF-8-encoded text.
 --
@@ -195,27 +224,27 @@ slowValidateUtf8ChunkFrom ofs bs = loop ofs ofs utf8StartState
 --       at index 'n' are the beginning of an incomplete UTF-8 code
 --       point, and `s'` is the resulting 'Utf8State' value, which
 --       can be used to validate against a following 'ByteString'.
-validateNextUtf8Chunk :: ByteString -> Utf8State -> (Int, Maybe Utf8State)
-validateNextUtf8Chunk bs st@(Utf8State s0 part)
+validateUtf8More :: Utf8State -> ByteString -> (Int, Maybe Utf8State)
+validateUtf8More st@(Utf8State s0 part) bs
   | len > 0 = loop 0 s0
   | otherwise = (0, Just st)
   where
     len = B.length bs
     -- Complete an incomplete code point (if there is one)
     -- and then jump to validateUtf8ChunkFrom
-    loop i s
+    loop !i s
       | isUtf8StateIsComplete s = validateUtf8ChunkFrom i bs
       | i < len =
         case updateUtf8State (B.index bs i) s of
           Nothing -> (0, Nothing)
           Just s' -> loop (i + 1) s'
-      | otherwise = (0, Just (Utf8State s (partUtf8CPUnsafeAppend part bs)))
+      | otherwise = (0, Just (Utf8State s (partUtf8UnsafeAppend part bs)))
 
 -- | Construct an Array. This is currently an internal data structure which is
 -- only used to construct Text (so for example byteStringToStrictBuilder should
 -- only be applied to valid UTF-8 bytestrings).
 data StrictBuilder = StrictBuilder
-  { sbLength :: !Int
+  { sbLength :: {-# UNPACK #-} !Int
   , sbWrite :: forall s. A.MArray s -> Int -> ST s ()
   }
 
@@ -247,18 +276,9 @@ word8ToStrictBuilder :: Word8 -> StrictBuilder
 word8ToStrictBuilder w =
   StrictBuilder 1 (\arr ofs -> A.unsafeWrite arr ofs w)
 
-partUtf8CPFoldr :: (Word8 -> a -> a) -> a -> PartialUtf8CodePoint -> a
-partUtf8CPFoldr f x0 c = case partUtf8CPLen c of
-    0 -> x0
-    1 -> build 0 x0
-    2 -> build 0 (build 1 x0)
-    _ -> build 0 (build 1 (build 2 x0))
-  where
-    build i x = f (partUtf8CPUnsafeIndex c i) x
-
-partUtf8CPToStrictBuilder :: PartialUtf8CodePoint -> StrictBuilder
-partUtf8CPToStrictBuilder =
-  partUtf8CPFoldr (appendRStrictBuilder . word8ToStrictBuilder) emptyStrictBuilder
+partUtf8ToStrictBuilder :: PartialUtf8CodePoint -> StrictBuilder
+partUtf8ToStrictBuilder =
+  partUtf8Foldr (appendRStrictBuilder . word8ToStrictBuilder) emptyStrictBuilder
 
 strictBuilderToText :: StrictBuilder -> Text
 strictBuilderToText (StrictBuilder 0 _) = empty
@@ -277,21 +297,21 @@ strictBuilderToText (StrictBuilder n write) = runST (do
 -- (@Right Utf8State@) or the position of the of the first (potentially)
 -- valid byte after the invalid bytes with remainder of the 'ByteString'
 -- (@Left (Int, ByteString)@).
-decodeUtf8Chunk :: ByteString -> Utf8State -> (StrictBuilder, ByteString, Maybe Utf8State)
-decodeUtf8Chunk bs s =
-  case validateNextUtf8Chunk bs s of
+decodeUtf8More :: Utf8State -> ByteString -> (StrictBuilder, ByteString, Maybe Utf8State)
+decodeUtf8More s bs =
+  case validateUtf8More s bs of
     (len, s') ->
       let builder | len == 0 = emptyStrictBuilder
-                  | otherwise = partUtf8CPToStrictBuilder (partialUtf8CodePoint s)
+                  | otherwise = partUtf8ToStrictBuilder (partialUtf8CodePoint s)
                     `appendRStrictBuilder` byteStringToStrictBuilder (B.take len bs)
       in (builder, B.drop len bs, s')
 
 -- | Decode a 'ByteString' against a start 'Utf8State' with an empty
 -- 'TextDataStack'.
 --
--- @decodeUtf8Chunk bs = 'decodeNextUtf8Chunk' bs 'startUtf8State' 'emptyStack'@
-decodeUtf8ChunkStart :: ByteString -> (StrictBuilder, ByteString, Maybe Utf8State)
-decodeUtf8ChunkStart bs = decodeUtf8Chunk bs startUtf8State
+-- @decodeUtf8More bs = 'decodeNextUtf8Chunk' bs 'startUtf8State' 'emptyStack'@
+decodeUtf8Chunk :: ByteString -> (StrictBuilder, ByteString, Maybe Utf8State)
+decodeUtf8Chunk = decodeUtf8More startUtf8State
 
 -- | Call an error handler with the give 'String' message for each byte
 -- in given 'ByteString' and lead data in the given 'Utf8State'
@@ -300,7 +320,7 @@ decodeUtf8ChunkStart bs = decodeUtf8Chunk bs startUtf8State
 -- supplied 'TextDataStack' argument.
 skipIncomplete :: (Word8 -> Maybe Char) -> Utf8State -> StrictBuilder
 skipIncomplete onErr s =
-  partUtf8CPFoldr
+  partUtf8Foldr
     (appendRStrictBuilder . handleUtf8Error onErr)
     emptyStrictBuilder (partialUtf8CodePoint s)
 
@@ -309,6 +329,7 @@ handleUtf8Error onErr w = case onErr w of
   Just c -> charToStrictBuilder c
   Nothing -> emptyStrictBuilder
 
+-- | Helper for 'decodeUtfWith'.
 decodeUtf8With1 ::
 #if defined(ASSERTS)
   HasCallStack =>
@@ -317,37 +338,37 @@ decodeUtf8With1 ::
 decodeUtf8With1 onErr bs0 = strictBuilderToText $
     builder `appendRStrictBuilder` skipIncomplete onErr s
   where
-    (builder, _, s) = decodeUtf8With2 onErr bs0 startUtf8State
+    (builder, _, s) = decodeUtf8With2 onErr startUtf8State bs0
 
 -- | Helper for 'decodeUtf8With' and 'streamDecodeUtf8With'.
 -- This uses an 'OnDecodeError' to process bad bytes.
 -- This is not a very pretty legacy API.
--- See 'decodeUtf8Chunk' for a more flexible alternative.
+-- See 'decodeUtf8More' for a more flexible alternative.
 decodeUtf8With2 ::
 #if defined(ASSERTS)
   HasCallStack =>
 #endif
-  (Word8 -> Maybe Char) -> ByteString -> Utf8State -> (StrictBuilder, ByteString, Utf8State)
-decodeUtf8With2 onErr bs0 s0 = loop bs0 s0 emptyStrictBuilder
+  (Word8 -> Maybe Char) -> Utf8State -> ByteString -> (StrictBuilder, ByteString, Utf8State)
+decodeUtf8With2 onErr s0 bs = loop s0 0 emptyStrictBuilder
   where
-    loop bs s !builder =
+    loop s i !builder =
       let nonEmptyPrefix len = builder
-            `appendRStrictBuilder` partUtf8CPToStrictBuilder (partialUtf8CodePoint s)
-            `appendRStrictBuilder` byteStringToStrictBuilder (B.take len bs)
-      in case validateNextUtf8Chunk bs s of
+            `appendRStrictBuilder` partUtf8ToStrictBuilder (partialUtf8CodePoint s)
+            `appendRStrictBuilder` byteStringToStrictBuilder (B.take len (B.drop i bs))
+      in case validateUtf8More s (B.drop i bs) of
         (len, Nothing) ->
-          if len == 0 && partUtf8CPLen (partialUtf8CodePoint s) == 0
+          if len == 0 && utf8CodePointState s /= utf8StartState
           then
             -- loop is strict on builder, so if onErr raises an error it will be forced here.
             let builder' = builder `appendRStrictBuilder` skipIncomplete onErr s
-            in loop bs startUtf8State builder'
+            in loop startUtf8State i builder'
           else
             let builder' = nonEmptyPrefix len
-                  `appendRStrictBuilder` handleUtf8Error onErr (B.index bs len)
-            in loop (B.drop 1 bs) startUtf8State builder'
+                  `appendRStrictBuilder` handleUtf8Error onErr (B.index bs (i + len))
+            in loop startUtf8State (i + len + 1) builder'
         (len, Just s') ->
-          let builder' =
-                if len == 0
-                then builder `appendRStrictBuilder` skipIncomplete onErr s'
-                else nonEmptyPrefix len `appendRStrictBuilder` skipIncomplete onErr s'
-          in (builder', B.drop len bs, s')
+          let builder' = if len == 0 then builder else nonEmptyPrefix len
+              undecoded = if B.length bs - i - len == partUtf8Len (partialUtf8CodePoint s')
+                then B.drop (i + len) bs  -- Reuse bs if possible
+                else partUtf8ToByteString (partialUtf8CodePoint s')
+          in (builder', undecoded, s')
diff --git a/src/Data/Text/Lazy/Encoding.hs b/src/Data/Text/Lazy/Encoding.hs
index a50f9d9e..1def3f51 100644
--- a/src/Data/Text/Lazy/Encoding.hs
+++ b/src/Data/Text/Lazy/Encoding.hs
@@ -108,9 +108,11 @@ decodeLatin1 = foldr (chunk . TE.decodeLatin1) empty . B.toChunks
 decodeUtf8With :: OnDecodeError -> B.ByteString -> Text
 decodeUtf8With onErr = loop TE.startUtf8State
   where
-    loop s (B.Chunk b0 bs0) = case TE.decodeUtf8With2 onErr' b0 s of
-      (builder, _, s') -> Chunk (TE.strictBuilderToText builder) (loop s' bs0)
-    loop s B.Empty = Chunk (TE.strictBuilderToText (TE.skipIncomplete onErr' s)) Empty
+    chunkb builder t | TE.sbLength builder == 0 = t
+                    | otherwise = Chunk (TE.strictBuilderToText builder) t
+    loop s (B.Chunk b bs) = case TE.decodeUtf8With2 onErr' s b of
+      (builder, _, s') -> chunkb builder (loop s' bs)
+    loop s B.Empty = chunkb (TE.skipIncomplete onErr' s) Empty
     onErr' = onErr "Data.Text.Internal.Encoding: Invalid UTF-8 stream" . Just
 
 -- | Decode a 'ByteString' containing UTF-8 encoded text that is known
diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs
index d75cfaaa..8e7a614e 100644
--- a/tests/Tests/Properties/Transcoding.hs
+++ b/tests/Tests/Properties/Transcoding.hs
@@ -11,10 +11,11 @@ module Tests.Properties.Transcoding
 import Prelude hiding (head, tail)
 import Data.Bits ((.&.), shiftR)
 import Data.Char (chr, ord)
-import Data.Either (isLeft, isRight)
+import Data.Functor (void)
 import Test.QuickCheck hiding ((.&.))
 import Test.Tasty (TestTree, testGroup)
 import Test.Tasty.QuickCheck (testProperty)
+import Test.Tasty.HUnit ((@?=), assertBool, assertFailure, testCase)
 import Tests.QuickCheckUtils
 import qualified Control.Exception as Exception
 import qualified Data.Bits as Bits (shiftL, shiftR)
@@ -28,12 +29,10 @@ import qualified Data.ByteString.Lazy.Char8 as BLC
 import qualified Data.Text as T
 import qualified Data.Text.Encoding as E
 import qualified Data.Text.Encoding.Error as E
-import Data.Text.Internal.Encoding.Utf8 (isUtf8StateIsComplete)
 import qualified Data.Text.Internal.Encoding as E
 import qualified Data.Text.Lazy as TL
 import qualified Data.Text.Lazy.Encoding as EL
--- import qualified Data.Text.Internal.Lazy as BL
--- import qualified Data.ByteString.Lazy.Internal as BS
+import Data.Word (Word8)
 
 t_ascii t    = E.decodeASCII (E.encodeUtf8 a) === a
     where a  = T.map (\c -> chr (ord c `mod` 128)) t
@@ -43,71 +42,69 @@ tl_ascii t   = EL.decodeASCII (EL.encodeUtf8 a) === a
 t_latin1     = E.decodeLatin1 `eq` (T.pack . BC.unpack)
 tl_latin1    = EL.decodeLatin1 `eq` (TL.pack . BLC.unpack)
 
-t_p_utf8_1   = case E.validateUtf8Chunk (B.pack [0x63]) of
-  (result, st) -> whenEqProp result 1 . property $ isRight st
-t_p_utf8_2   = case E.validateUtf8Chunk (B.pack [0x63, 0x63, 0x63]) of
-  (result, st) -> whenEqProp result 3 . property $ isRight st
-t_p_utf8_3   = case E.validateUtf8Chunk (B.pack [0x63, 0x63, 0xc2, 0x80, 0x63]) of
-  (result, st) -> whenEqProp result 5 . property $ isRight st
-t_p_utf8_4   = case E.validateUtf8Chunk (B.pack [0x63, 0xe1, 0x80, 0x80, 0x63]) of
-  (result, st) -> whenEqProp result 5 . property $ isRight st
-t_p_utf8_5   = case E.validateUtf8Chunk (B.pack [0xF0, 0x90, 0x80, 0x80, 0x63]) of
-  (result, st) -> whenEqProp result 5 . property $ isRight st
-t_p_utf8_6   = case E.validateUtf8Chunk (B.pack [0x63, 0x63, 0xF0, 0x90, 0x80]) of
-  (result, st) -> whenEqProp result 2 . property $ isRight st
-t_p_utf8_7   = case E.validateUtf8Chunk (B.pack [0x63, 0x63, 0x63, 0xF0, 0x90]) of
-  (result, st) -> whenEqProp result 3 . property $ isRight st
-t_p_utf8_8   = case E.validateUtf8Chunk (B.pack [0xF0, 0x90, 0x80, 0x63, 0x63]) of
-  (result, st) -> whenEqProp result 0 $ st === Left 3
-t_p_utf8_9   = case E.validateUtf8Chunk (B.pack [0x63, 0x63, 0x80, 0x63, 0x63]) of
-  (result, st) -> whenEqProp result 2 $ st === Left 3
-t_p_utf8_0   = case E.validateUtf8Chunk (B.pack [0x63, 0x63, 0xe1, 0x63, 0x63]) of
-  (result, st) -> whenEqProp result 2 $ st === Left 3
-
-t_pn_utf8_1   = case E.validateUtf8Chunk (B.pack [0xF0, 0x90, 0x80]) of
-  (result0, mS) -> whenEqProp result0 0 $
-    case mS of
-      Left _ -> counterexample (show mS) False
-      Right s -> case E.validateNextUtf8Chunk (B.pack [0x80]) s of
-        (result1, mS1) -> whenEqProp result1 1 $
-          if isLeft mS1
-          then counterexample (show mS1) False
-          else case E.validateNextUtf8Chunk (B.pack [0x7f]) s of
-            (result2, mS2) -> whenEqProp result2 (-3) $ mS2 === Left 0
-t_pn_utf8_2   = case E.validateUtf8Chunk (B.pack [0xF0]) of
-  (result0, mS0) -> whenEqProp result0 0 $
-    case mS0 of
-      Left _ -> counterexample (show mS0) False
-      Right s0 -> case E.validateNextUtf8Chunk (B.pack [0x7f]) s0 of
-        (result1, mS1) -> whenEqProp result1 (-1) .
-          whenEqProp mS1 (Left 0) $
-          case E.validateNextUtf8Chunk (B.pack [0x90]) s0 of
-            (result2, mS2) -> whenEqProp result2 (-1) $
-              case mS2 of
-                Left _ -> counterexample (show mS2) False
-                Right s1 -> case E.validateNextUtf8Chunk (B.pack [0x7f]) s1 of
-                  (result3, mS3) -> whenEqProp result3 (-2) .
-                    whenEqProp mS3 (Left 0) $
-                    case E.validateNextUtf8Chunk (B.pack [0x80]) s1 of
-                      (result4, mS4) -> whenEqProp result4 (-2) $
-                        case mS4 of
-                          Left _ -> counterexample (show mS3) False
-                          Right s2 -> case E.validateNextUtf8Chunk (B.pack [0x7f]) s2 of
-                            (result5, mS5) -> whenEqProp result5 (-3) .
-                              whenEqProp mS5 (Left 0) $
-                              case E.validateNextUtf8Chunk (B.pack [0x80]) s2 of
-                                (result6, mS6) -> whenEqProp result6 1 $
-                                  property $ isRight mS6
-t_pn_utf8_3 = case E.validateUtf8Chunk $ B.pack [0xc2] of
-  (len1, eS1) -> whenEqProp len1 0 $ case eS1 of
-    Left _ -> counterexample (show eS1) False
-    Right s1 -> E.partialUtf8CodePoint s1 === E.PartialUtf8CodePoint 0x01c20000 .&&.
-      if isUtf8StateIsComplete $ E.utf8CodePointState s1
-      then counterexample (show $ E.utf8CodePointState s1) False
-      else case E.validateNextUtf8Chunk (B.pack [0x80, 0x80]) s1 of
-        (len2, eS2) -> len2 === 1 .&&. eS2 === Left 2
-
-t_utf8_c     = (E.stackToText . snd . E.decodeUtf8Chunk . E.encodeUtf8) `eq` id
+t_p_utf8_1   = testValidateUtf8_ [0x63] 1
+t_p_utf8_2   = testValidateUtf8_ [0x63, 0x63, 0x63] 3
+t_p_utf8_3   = testValidateUtf8_ [0x63, 0x63, 0xc2, 0x80, 0x63] 5
+t_p_utf8_4   = testValidateUtf8_ [0x63, 0xe1, 0x80, 0x80, 0x63] 5
+t_p_utf8_5   = testValidateUtf8_ [0xF0, 0x90, 0x80, 0x80, 0x63] 5
+t_p_utf8_6   = testValidateUtf8_ [0x63, 0x63, 0xF0, 0x90, 0x80] 2
+t_p_utf8_7   = testValidateUtf8_ [0x63, 0x63, 0x63, 0xF0, 0x90] 3
+t_p_utf8_8   = testValidateUtf8Fail [0xF0, 0x90, 0x80, 0x63, 0x63] 0
+t_p_utf8_9   = testValidateUtf8Fail [0x63, 0x63, 0x80, 0x63, 0x63] 2
+t_p_utf8_0   = testValidateUtf8Fail [0x63, 0x63, 0xe1, 0x63, 0x63] 2
+
+testValidateUtf8With ::
+  (B.ByteString -> (Int, Maybe E.Utf8State)) ->
+  (Maybe E.Utf8State -> IO r) ->
+  [Word8] -> Int -> IO r
+testValidateUtf8With validate k xs expectedLen = case validate (B.pack xs) of
+  (len, s) -> do
+    len @?= expectedLen
+    k s
+
+expectJust :: Maybe a -> IO a
+expectJust Nothing = assertFailure "Unexpected Nothing"
+expectJust (Just s) = pure s
+
+expectNothing :: Maybe a -> IO ()
+expectNothing Nothing = pure ()
+expectNothing (Just _) = assertFailure "Unexpected Just"
+
+testValidateUtf8 :: [Word8] -> Int -> IO E.Utf8State
+testValidateUtf8 = testValidateUtf8With E.validateUtf8Chunk expectJust
+
+testValidateUtf8_ :: [Word8] -> Int -> IO ()
+testValidateUtf8_ = testValidateUtf8With E.validateUtf8Chunk (void . expectJust)
+
+testValidateUtf8Fail :: [Word8] -> Int -> IO ()
+testValidateUtf8Fail = testValidateUtf8With E.validateUtf8Chunk expectNothing
+
+testValidateUtf8More :: E.Utf8State -> [Word8] -> Int -> IO E.Utf8State
+testValidateUtf8More s =  testValidateUtf8With (E.validateUtf8More s) expectJust
+
+testValidateUtf8MoreFail :: E.Utf8State -> [Word8] -> Int -> IO ()
+testValidateUtf8MoreFail s = testValidateUtf8With (E.validateUtf8More s) expectNothing
+
+t_pn_utf8_1 = do
+  s <- testValidateUtf8 [0xF0, 0x90, 0x80] 0
+  _ <- testValidateUtf8More s [0x80] 1
+  testValidateUtf8MoreFail s [0x7f] 0
+t_pn_utf8_2 = do
+  s0 <- testValidateUtf8 [0xF0] 0
+  testValidateUtf8MoreFail s0 [0x7f] 0
+  s1 <- testValidateUtf8More s0 [0x90] 0
+  testValidateUtf8MoreFail s1 [0x7f] 0
+  s2 <- testValidateUtf8More s1 [0x80] 0
+  testValidateUtf8MoreFail s2 [0x7f] 0
+  _ <- testValidateUtf8More s2 [0x80] 1
+  pure ()
+t_pn_utf8_3 = do
+  s1 <- testValidateUtf8 [0xc2] 0
+  let c = E.partialUtf8CodePoint s1
+  assertBool "PartialUtf8 must be partial" $ E.partUtf8Len c < E.partUtf8CompleteLen c
+  testValidateUtf8MoreFail s1 [0x80, 0x80] 1
+
+t_utf8_c     = (E.strictBuilderToText . fst3 . E.decodeUtf8Chunk . E.encodeUtf8) `eq` id
 t_utf8       = (E.decodeUtf8 . E.encodeUtf8) `eq` id
 t_utf8'      = (E.decodeUtf8' . E.encodeUtf8) `eq` (id . Right)
 tl_utf8      = (EL.decodeUtf8 . EL.encodeUtf8) `eq` id
@@ -121,6 +118,9 @@ tl_utf32LE   = (EL.decodeUtf32LE . EL.encodeUtf32LE) `eq` id
 t_utf32BE    = (E.decodeUtf32BE . E.encodeUtf32BE) `eq` id
 tl_utf32BE   = (EL.decodeUtf32BE . EL.encodeUtf32BE) `eq` id
 
+fst3 :: (a, b, c) -> a
+fst3 (a, _, _) = a
+
 runBuilder :: B.Builder -> B.ByteString
 runBuilder =
   -- Use smallish buffers to exercise bufferFull case as well
@@ -268,62 +268,63 @@ t_decode_with_error3 =
 t_decode_with_error4 =
   E.decodeUtf8With (\_ _ -> Just 'x') (B.pack [0xF0, 97, 97, 97]) === "xaaa"
 
-t_decode_with_error1' =
-  case E.streamDecodeUtf8With (\_ _ -> Just 'x') (B.pack [0xc2]) of
-    E.Some x1 bs1 f1 -> whenEqProp x1 "" . whenEqProp bs1 (B.pack [0xc2]) $
-      case f1 $ B.pack [0x80, 0x80] of
-        E.Some x2 bs2 _ -> whenEqProp x2 "\x80x" $ bs2 === mempty
+t_decode_with_error1' = do
+  E.Some x1 bs1 f1 <- pure $ E.streamDecodeUtf8With (\_ _ -> Just 'x') (B.pack [0xc2])
+  x1 @?= ""
+  bs1 @?= B.pack [0xc2]
+  E.Some x2 bs2 _ <- pure $ f1 $ B.pack [0x80, 0x80]
+  x2 @?= "\x80x"
+  bs2 @?= mempty
 t_decode_with_error2' =
   case E.streamDecodeUtf8With (\_ _ -> Just 'x') (B.pack [0xC2, 97]) of
-    E.Some x _ _ -> x === "xa"
+    E.Some x _ _ -> x @?= "xa"
 t_decode_with_error3' =
   case E.streamDecodeUtf8With (\_ _ -> Just 'x') (B.pack [0xC2, 97, 97]) of
-    E.Some x _ _ -> x === "xaa"
+    E.Some x _ _ -> x @?= "xaa"
 t_decode_with_error4' =
   case E.streamDecodeUtf8With (\_ _ -> Just 'x') (B.pack [0xC2, 97, 97, 97]) of
-    E.Some x _ _ -> x === "xaaa"
-t_decode_with_error5' = ioProperty $ do
+    E.Some x _ _ -> x @?= "xaaa"
+t_decode_with_error5' = do
   ret <- Exception.try $ Exception.evaluate $ E.streamDecodeUtf8 (B.pack [0x81])
-  pure $ case ret of
-    Left (_ :: E.UnicodeException) -> True
-    Right{} -> False
-
-t_decode_chunk1 = case E.decodeUtf8Chunk $ B.pack [0xc2] of
-  ((len1, eS1), tds1) -> whenEqProp (len1, E.dataStack tds1, E.stackLen tds1) (0, [], 0) $
-    case eS1 of
-      Left _ -> counterexample (show eS1) False
-      Right s1 -> let partCP = E.partialUtf8CodePoint s1 in
-        partCP === E.PartialUtf8CodePoint 0x01c20000 .&&.
-        E.partUtf8CPLen partCP === 1 .&&.
-        E.partUtf8CPUnsafeIndex 0 partCP === 0xc2 .&&.
-        if isUtf8StateIsComplete $ E.utf8CodePointState s1
-        then counterexample (show $ E.utf8CodePointState s1) False
-        else case E.decodeNextUtf8Chunk (B.pack [0x80, 0x80]) s1 tds1 of
-          ((len2, eS2), tds2) -> whenEqProp (len2, E.dataStack tds2, E.stackLen tds2) (1, [Right $ Right $ B.pack [0x80], Right $ Left (E.PartialUtf8CodePoint 0x1c20000)], 2) $
-            case eS2 of
-              Right _ -> counterexample (show eS2) False
-              Left res -> res === (2, mempty)
-
-t_decode_chunk2 = case E.decodeUtf8Chunk $ B.pack [0xf0] of
-  ((len1, eS1), tds1) -> whenEqProp (len1, E.dataStack tds1, E.stackLen tds1) (0, [], 0) $
-    case eS1 of
-      Left _ -> counterexample (show eS1) False
-      Right s1 -> let partCP = E.partialUtf8CodePoint s1 in
-        partCP === E.PartialUtf8CodePoint 0x01f00000 .&&.
-        E.partUtf8CPLen partCP === 1 .&&.
-        E.partUtf8CPUnsafeIndex 0 partCP === 0xf0 .&&.
-        if isUtf8StateIsComplete $ E.utf8CodePointState s1
-        then counterexample (show $ E.utf8CodePointState s1) False
-        else case E.decodeNextUtf8Chunk (B.pack [0x90, 0x80]) s1 tds1 of
-          ((len2, eS2), tds2) -> whenEqProp (len2, E.dataStack tds2, E.stackLen tds2) (-1, [], 0) $
-            case eS2 of
-              Left _ -> counterexample (show eS2) False
-              Right s2 -> let partCP2 = E.partialUtf8CodePoint s2 in
-                partCP2 === E.PartialUtf8CodePoint 0x03f09080 .&&.
-                E.partUtf8CPLen partCP2 === 3 .&&.
-                E.partUtf8CPUnsafeIndex 0 partCP2 === 0xf0 .&&.
-                E.partUtf8CPUnsafeIndex 1 partCP2 === 0x90 .&&.
-                E.partUtf8CPUnsafeIndex 2 partCP2 === 0x80
+  case ret of
+    Left (_ :: E.UnicodeException) -> pure ()
+    Right{} -> assertFailure "Unexpected success"
+
+testDecodeUtf8With :: (Maybe E.Utf8State -> IO r) -> E.Utf8State -> [Word8] -> T.Text -> IO r
+testDecodeUtf8With k s xs expected =
+  let xs' = B.pack xs in
+  case E.decodeUtf8More s xs' of
+    (prefix, bs, s') -> do
+      let txt = E.strictBuilderToText prefix
+      txt @?= expected
+      if T.null txt then
+        bs @?= xs'
+      else
+        let c = E.partUtf8ToByteString (E.partialUtf8CodePoint s)
+        in E.encodeUtf8 txt <> bs @?= c <> xs'
+      k s'
+
+testDecodeUtf8 :: E.Utf8State -> [Word8] -> T.Text -> IO E.Utf8State
+testDecodeUtf8 = testDecodeUtf8With (\ms -> case ms of
+  Just s -> pure s
+  Nothing -> assertFailure "Unexpected failure")
+
+testDecodeUtf8Fail :: E.Utf8State -> [Word8] -> T.Text -> IO ()
+testDecodeUtf8Fail = testDecodeUtf8With (\ms -> case ms of
+  Just _ -> assertFailure "Unexpected failure"
+  Nothing -> pure ())
+
+t_decode_chunk1 = do
+  s1 <- testDecodeUtf8 E.startUtf8State [0xc2] ""
+  let c1 = E.partialUtf8CodePoint s1
+  E.partUtf8Len c1 @?= 1
+  testDecodeUtf8Fail s1 [0x80, 0x80] "\128"
+
+t_decode_chunk2 = do
+  s1 <- testDecodeUtf8 E.startUtf8State [0xf0] ""
+  s2 <- testDecodeUtf8 s1 [0x90, 0x80] ""
+  _  <- testDecodeUtf8 s2 [0x80, 0x41] "\65536A"
+  pure ()
 
 t_infix_concat bs1 text bs2 =
   forAll (Blind <$> genDecodeErr Replace) $ \(Blind onErr) ->
@@ -337,24 +338,8 @@ testTranscoding =
     testProperty "tl_ascii" tl_ascii,
     testProperty "t_latin1" t_latin1,
     testProperty "tl_latin1" tl_latin1,
-    testProperty "t_p_utf8_1" t_p_utf8_1,
-    testProperty "t_p_utf8_2" t_p_utf8_2,
-    testProperty "t_p_utf8_3" t_p_utf8_3,
-    testProperty "t_p_utf8_4" t_p_utf8_4,
-    testProperty "t_p_utf8_5" t_p_utf8_5,
-    testProperty "t_p_utf8_6" t_p_utf8_6,
-    testProperty "t_p_utf8_7" t_p_utf8_7,
-    testProperty "t_p_utf8_8" t_p_utf8_8,
-    testProperty "t_p_utf8_9" t_p_utf8_9,
-    testProperty "t_p_utf8_0" t_p_utf8_0,
-    testProperty "t_pn_utf8_1" t_pn_utf8_1,
-    testProperty "t_pn_utf8_2" t_pn_utf8_2,
-    testProperty "t_pn_utf8_3" t_pn_utf8_3,
-    testProperty "t_utf8_c" t_utf8_c,
     testProperty "t_utf8" t_utf8,
     testProperty "t_utf8'" t_utf8',
-    testProperty "t_utf8_incr" t_utf8_incr,
-    testProperty "t_utf8_undecoded" t_utf8_undecoded,
     testProperty "tl_utf8" tl_utf8,
     testProperty "tl_utf8'" tl_utf8',
     testProperty "t_utf16LE" t_utf16LE,
@@ -381,13 +366,31 @@ testTranscoding =
       testProperty "t_decode_with_error2" t_decode_with_error2,
       testProperty "t_decode_with_error3" t_decode_with_error3,
       testProperty "t_decode_with_error4" t_decode_with_error4,
-      testProperty "t_decode_with_error1'" t_decode_with_error1',
-      testProperty "t_decode_with_error2'" t_decode_with_error2',
-      testProperty "t_decode_with_error3'" t_decode_with_error3',
-      testProperty "t_decode_with_error4'" t_decode_with_error4',
-      testProperty "t_decode_with_error5'" t_decode_with_error5',
-      testProperty "t_decode_chunk1" t_decode_chunk1,
-      testProperty "t_decode_chunk2" t_decode_chunk2,
       testProperty "t_infix_concat" t_infix_concat
+    ],
+    testGroup "streaming" [
+      testProperty "t_utf8_undecoded" t_utf8_undecoded,
+      testProperty "t_utf8_incr" t_utf8_incr,
+      testProperty "t_utf8_c" t_utf8_c,
+      testCase "t_p_utf8_1" t_p_utf8_1,
+      testCase "t_p_utf8_2" t_p_utf8_2,
+      testCase "t_p_utf8_3" t_p_utf8_3,
+      testCase "t_p_utf8_4" t_p_utf8_4,
+      testCase "t_p_utf8_5" t_p_utf8_5,
+      testCase "t_p_utf8_6" t_p_utf8_6,
+      testCase "t_p_utf8_7" t_p_utf8_7,
+      testCase "t_p_utf8_8" t_p_utf8_8,
+      testCase "t_p_utf8_9" t_p_utf8_9,
+      testCase "t_p_utf8_0" t_p_utf8_0,
+      testCase "t_pn_utf8_1" t_pn_utf8_1,
+      testCase "t_pn_utf8_2" t_pn_utf8_2,
+      testCase "t_pn_utf8_3" t_pn_utf8_3,
+      testCase "t_decode_chunk1" t_decode_chunk1,
+      testCase "t_decode_chunk2" t_decode_chunk2
+      testCase "t_decode_with_error1'" t_decode_with_error1',
+      testCase "t_decode_with_error2'" t_decode_with_error2',
+      testCase "t_decode_with_error3'" t_decode_with_error3',
+      testCase "t_decode_with_error4'" t_decode_with_error4',
+      testCase "t_decode_with_error5'" t_decode_with_error5',
     ]
   ]
diff --git a/tests/Tests/QuickCheckUtils.hs b/tests/Tests/QuickCheckUtils.hs
index 58ad3c47..8f36f7ec 100644
--- a/tests/Tests/QuickCheckUtils.hs
+++ b/tests/Tests/QuickCheckUtils.hs
@@ -27,8 +27,6 @@ module Tests.QuickCheckUtils
     , eqPSqrt
 
     , write_read
-
-    , whenEqProp
     ) where
 
 import Control.Arrow ((***))
@@ -286,9 +284,3 @@ newtype SkewedBool = Skewed { getSkewed :: Bool }
 
 instance Arbitrary SkewedBool where
   arbitrary = Skewed <$> frequency [(1, pure False), (5, pure True)]
-
--- like Control.Monad.when, but with properties instead of monad values
-whenEqProp :: (Eq a, Show a) => a -> a -> Property -> Property
-whenEqProp a b next = if a == b
-  then next
-  else a === b

From 6a65ed66a159340a4351383dad33ea772abca359 Mon Sep 17 00:00:00 2001
From: Li-yao Xia <lysxia@gmail.com>
Date: Fri, 3 Feb 2023 22:10:21 +0000
Subject: [PATCH 65/87] Rework docs

---
 src/Data/Text/Encoding.hs             |  44 +++--
 src/Data/Text/Internal/Encoding.hs    | 255 ++++++++++++++++++--------
 tests/Tests/Properties/Transcoding.hs |   4 +-
 3 files changed, 208 insertions(+), 95 deletions(-)

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index 6e52d24d..9d7ed6d0 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -23,24 +23,14 @@
 
 module Data.Text.Encoding
     (
-    -- * ByteString validation
-    -- $validation
-      Utf8State
-    , validateUtf8Chunk
-    , validateUtf8More
-    , startUtf8State
-
     -- * Decoding ByteStrings to Text
     -- $strict
 
     -- ** Total Functions #total#
     -- $total
-    , decodeLatin1
+      decodeLatin1
     , decodeASCIIPrefix
-    , decodeUtf8Chunk
     , decodeUtf8Lenient
-
-    -- *** Catchable failure
     , decodeUtf8'
 
     -- *** Controllable error handling
@@ -55,6 +45,15 @@ module Data.Text.Encoding
     , streamDecodeUtf8With
     , Decoding(..)
 
+    -- *** Incremental decoding
+    -- $incremental
+    , decodeUtf8Chunk
+    , decodeUtf8More
+    , Utf8State
+    , startUtf8State
+    , StrictBuilder()
+    , strictBuilderToText
+
     -- ** Partial Functions
     -- $partial
     , decodeASCII
@@ -77,6 +76,11 @@ module Data.Text.Encoding
     -- * Encoding Text using ByteString Builders
     , encodeUtf8Builder
     , encodeUtf8BuilderEscaped
+
+    -- * ByteString validation
+    -- $validation
+    , validateUtf8Chunk
+    , validateUtf8More
     ) where
 
 import Control.Exception (evaluate, try)
@@ -154,7 +158,8 @@ import qualified Data.ByteString.Unsafe as B
 -- To drop references to the input bytestring, force the prefix
 -- (using 'seq' or @BangPatterns@) and drop references to the suffix.
 --
--- Properties:
+-- === Properties
+--
 -- - If @(prefix, suffix) = decodeAsciiPrefix s@, then @'encodeUtf8' prefix <> suffix = s@.
 -- - Either @suffix@ is empty, or @'B.head' suffix > 127@.
 --
@@ -535,3 +540,18 @@ encodeUtf32LE txt = E.unstream (E.restreamUtf32LE (F.stream txt))
 encodeUtf32BE :: Text -> ByteString
 encodeUtf32BE txt = E.unstream (E.restreamUtf32BE (F.stream txt))
 {-# INLINE encodeUtf32BE #-}
+
+-- $incremental
+-- The functions 'decodeUtf8Chunk' and 'decodeUtf8More' provide more
+-- control for error-handling and streaming.
+--
+-- - You get an UTF-8 prefix of the given 'ByteString' up to the next error.
+--   For example this lets you insert or delete arbitrary text, or do some
+--   stateful operations, before resuming.
+--   In contrast, the older stream-oriented interface only lets you substitute
+--   a single fixed 'Char' for each invalid byte in 'OnDecodeError'.
+-- - The prefix is encoded as a 'StrictBuilder', so you can accumulate chunks
+--   before doing the copying work to construct a 'Text', or you can
+--   output decoded fragments immediately as a lazy 'Data.Text.Lazy.Text'.
+--
+-- For even lower-level primitives, see 'validateUtf8Chunk' and 'validateUtf8More'.
diff --git a/src/Data/Text/Internal/Encoding.hs b/src/Data/Text/Internal/Encoding.hs
index 7ae90d8a..fc8be607 100644
--- a/src/Data/Text/Internal/Encoding.hs
+++ b/src/Data/Text/Internal/Encoding.hs
@@ -3,7 +3,12 @@
 {-# LANGUAGE Trustworthy #-}
 {-# LANGUAGE RankNTypes #-}
 {-# LANGUAGE ViewPatterns #-}
+
 -- |
+-- Internals of "Data.Text.Encoding".
+--
+-- If you'd like to depend on something from here that's not in "Data.Text.Encoding",
+-- please request to have it exported!
 module Data.Text.Internal.Encoding where
 
 import Control.Monad.ST (ST, runST)
@@ -11,6 +16,7 @@ import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
 import Data.Bits ((.&.), shiftL, shiftR)
 import Data.ByteString (ByteString)
 import Data.Functor (void)
+import Data.Semigroup (Semigroup(..))
 import Data.Text.Internal (Text(..), empty, safe)
 import Data.Text.Internal.Unsafe (unsafeWithForeignPtr)
 import Data.Text.Unsafe (unsafeDupablePerformIO)
@@ -36,31 +42,46 @@ import qualified Data.ByteString.Unsafe as B
 #endif
 
 -- | State of decoding a 'ByteString' in UTF-8.
--- It consists of a value representing whether or
--- not the last byte is a complete code point, and on incompletion what
--- the 1 to 3 end bytes are that make up the incomplete code point.
+-- Enables stream decoding ('validateUtf8Chunk', 'validateUtf8More',
+-- 'decodeUtf8Chunk', 'decodeUtf8More').
+
+-- Internal invariant:
+-- the first component is the initial state if and only if
+-- the second component is empty.
+--
+-- @
+-- 'utf9CodePointState' s = 'utf8StartState'
+-- <=>
+-- 'partialUtf8CodePoint' s = 'PartialUtf8CodePoint' 0
+-- @
+--
+-- @since 2.0.2
 data Utf8State = Utf8State
-  { -- | Current UTF-8 code point state of the 'ByteString's
-    -- that have been validated thus far.
+  { -- | State of the UTF-8 state machine.
     utf8CodePointState :: {-# UNPACK #-} !Utf8CodePointState
-    -- | Get the incomplete UTF-8 code point of the 'ByteString's that
-    -- have been validated thus far. The first byte of the 'Word32'
-    -- indicates the number of bytes of the code point are available,
-    -- and is followed by the bytes of the code point.
+    -- | Bytes of the currently incomplete code point (if any).
   , partialUtf8CodePoint :: {-# UNPACK #-} !PartialUtf8CodePoint
   }
   deriving (Eq, Ord, Show)
 
--- | This represtents the starting state of a UTF-8 validation check.
+-- | Initial 'Utf8State'.
+--
+-- @since 2.0.2
 startUtf8State :: Utf8State 
 startUtf8State = Utf8State utf8StartState partUtf8Empty
 
--- | Prefix of a valid UTF-8 encoded code point encoded in 4 bytes.
--- This consists of a length (in bytes) between 1 and 3 stored in the most
--- significant byte, and the actual bytes in the rest of the word.
+-- | Prefix of a UTF-8 code point encoded in 4 bytes,
+-- possibly empty.
+--
+-- - The most significant byte contains the number of bytes,
+--   between 0 and 3.
+-- - The remaining bytes hold the incomplete code point.
+-- - Unused bytes must be 0.
 --
--- All of its operations are the functions below.
+-- All of operations available on it are the functions below.
 -- The constructor should never be used outside of those.
+--
+-- @since 2.0.2
 newtype PartialUtf8CodePoint = PartialUtf8CodePoint Word32
   deriving (Eq, Ord, Show)
 
@@ -72,6 +93,8 @@ partUtf8Empty = PartialUtf8CodePoint 0
 partUtf8Len :: PartialUtf8CodePoint -> Int
 partUtf8Len (PartialUtf8CodePoint w) = fromIntegral $ w `shiftR` 24
 
+-- | Length of the code point once completed (it is known in the first byte).
+-- 0 if empty.
 partUtf8CompleteLen :: PartialUtf8CodePoint -> Int
 partUtf8CompleteLen c@(PartialUtf8CodePoint w)
   | partUtf8Len c == 0 = 0
@@ -83,6 +106,8 @@ partUtf8CompleteLen c@(PartialUtf8CodePoint w)
     firstByte = (w `shiftR` 16) .&. 255
 
 -- | Get the @n@-th byte, assuming it is within bounds: @0 <= n < partUtf8Len c@.
+--
+-- Unsafe: no bounds checking.
 partUtf8UnsafeIndex ::
 #if defined(ASSERTS)
   HasCallStack =>
@@ -94,6 +119,9 @@ partUtf8UnsafeIndex (PartialUtf8CodePoint w) n =
 #endif
   fromIntegral $ w `shiftR` (16 - 8 * n)
 
+-- | Append some bytes.
+--
+-- Unsafe: no bounds checking.
 partUtf8UnsafeAppend ::
 #if defined(ASSERTS)
   HasCallStack =>
@@ -113,7 +141,7 @@ partUtf8UnsafeAppend c@(PartialUtf8CodePoint word) bs =
       then w + (fromIntegral (B.index bs i) `shiftL` fromIntegral (16 - 8 * (lenc + i)))
       else w
 
--- | This avoids recursion to unfold to straightline code.
+-- | Fold a 'PartialUtf8CodePoint'. This avoids recursion so it unfolds to straightline code.
 partUtf8Foldr :: (Word8 -> a -> a) -> a -> PartialUtf8CodePoint -> a
 partUtf8Foldr f x0 c = case partUtf8Len c of
     0 -> x0
@@ -123,6 +151,7 @@ partUtf8Foldr f x0 c = case partUtf8Len c of
   where
     build i x = f (partUtf8UnsafeIndex c i) x
 
+-- | Convert 'PartialUtf8CodePoint' to 'ByteString'.
 partUtf8ToByteString :: PartialUtf8CodePoint -> B.ByteString
 partUtf8ToByteString c = BI.unsafeCreate (partUtf8Len c) $ \ptr ->
   partUtf8Foldr (\w k i -> pokeElemOff ptr i w >> k (i+1)) (\_ -> pure ()) c 0
@@ -132,31 +161,37 @@ foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8
     :: Ptr Word8 -> CSize -> IO CInt
 #endif
 
--- | Validate a 'ByteString' as a UTF-8-encoded text.
+-- | Validate a 'ByteString' as UTF-8-encoded text. To be continued using 'validateUtf8More'.
+--
+-- See also 'validateUtf8More' for details on the result of this function.
 --
--- @validateUtf8More chunk = (n, es)@
+-- @
+-- 'validateUtf8Chunk' = 'validateUtf8More' 'startUtf8State'
+-- @
+--
+-- === Properties
+--
+-- Given:
+--
+-- @
+-- 'validateUtf8Chunk' chunk = (n, ms)
+-- @
 --
--- This function returns two values:
+-- - The prefix is valid UTF-8. In particular, it should be accepted
+--   by this validation:
 --
--- * The value 'n' indicates the longest prefix of the 'ByteString'
---   that is valid UTF-8-encoded data.
--- * The value 'es' indicates whether the 'ByteString'
+--   @
+--   'validateUtf8Chunk' ('Data.ByteString.take' n chunk) = (n, Just 'startUtf8State')
+--   @
 --
---     * (@Left p@) contains an invalid code point and where the next
---       (potentially valid) code point begins, so that @p - n@ is the
---       number of invalid bytes, or
---     * (@Right s@) is valid, and all of the remaining bytes starting
---       at inbex 'n' are the beginning of an incomplete UTF-8 code
---       point, and 's' is the resulting 'Utf8State' value, which
---       can be used to validate against a following 'ByteString' with
---       'validateNextUtf8Chunk'.
+-- @since 2.0.2
 validateUtf8Chunk :: ByteString -> (Int, Maybe Utf8State)
 validateUtf8Chunk = validateUtf8ChunkFrom 0
 
 -- | Add an offset to the index returned by 'validateUtf8More'.
 --
 -- @
--- validateUtf8ChunkFrom n = first (+ 1) . 'validateUtf8More' . 'B.drop' n
+-- validateUtf8ChunkFrom n = first (+ n) . 'validateUtf8More' . 'B.drop' n
 -- @
 validateUtf8ChunkFrom :: Int -> ByteString -> (Int, Maybe Utf8State)
 validateUtf8ChunkFrom ofs bs
@@ -202,32 +237,51 @@ slowValidateUtf8ChunkFrom ofs bs = loop ofs ofs utf8StartState
             Nothing -> (utf8End, Nothing)
       | otherwise = (utf8End, Just (Utf8State s (partUtf8UnsafeAppend partUtf8Empty (B.drop utf8End bs))))
 
--- | Validate a 'ByteString' as a contiuation of UTF-8-encoded text.
+-- | Validate another 'ByteString' chunk in an ongoing stream of UTF-8-encoded text.
 --
--- @validateNextUtf8Chunk chunk s = (n, es)@
+-- Returns a pair:
 --
--- This function returns two values:
+-- 1. The first component @n@ is the end position, relative to the current
+--    chunk, of the longest prefix of the accumulated bytestring which is valid UTF-8.
+--    @n@ may be negative: that happens when an incomplete code point started in
+--    a previous chunk and is not completed by the current chunk (either
+--    that code point is still incomplete, or it is broken by an invalid byte).
 --
--- * The value 'n' indicates the end position of longest prefix of the
---   'ByteString' that is valid UTF-8-encoded data from the starting
---   state 's'. If 's' contains an incomplete code point, the input
---   'ByteString' is considered a continuation. As a result 'n' will be
---   negative if the code point is still incomplete or is proven to be
---   invalid.
---   
--- * The value 'es' indicates whether the 'ByteString'
+-- 2. The second component @ms@ indicates the following:
+--
+--     - if @ms = Nothing@, the remainder of the chunk contains an invalid byte,
+--       within four bytes from position @n@;
+--     - if @ms = Just s'@, you can carry on validating another chunk
+--       by calling 'validateUtf8More' with the new state @s'@.
+--
+-- === Properties
+--
+-- Given:
+--
+-- @
+-- 'validateUtf8More' s chunk = (n, ms)
+-- @
 --
---     * (@Left p@) contains an invalid code point and where the next
---       (potentially valid) code point begins, so that @p - n@ is the
---       number of invalid bytes, or
---     * (@Right s'@) is valid, and all of the remaining bytes starting
---       at index 'n' are the beginning of an incomplete UTF-8 code
---       point, and `s'` is the resulting 'Utf8State' value, which
---       can be used to validate against a following 'ByteString'.
+-- - If the chunk is invalid, it cannot be extended to be valid.
+--
+--   @
+--   ms = Nothing
+--   ==> 'validateUtf8More' s (chunk '<>' more) = (n, Nothing)
+--   @
+--
+-- - Validating two chunks sequentially is the same as validating them
+--   together at once:
+--
+--   @
+--   ms = Just s'
+--   ==> 'validateUtf8More' s (chunk '<>' more) = 'Data.Bifunctor.first' ('Data.ByteString.length' chunk '+') ('validateUtf8More' s' more)
+--   @
+--
+-- @since 2.0.2
 validateUtf8More :: Utf8State -> ByteString -> (Int, Maybe Utf8State)
 validateUtf8More st@(Utf8State s0 part) bs
   | len > 0 = loop 0 s0
-  | otherwise = (0, Just st)
+  | otherwise = (- partUtf8Len part, Just st)
   where
     len = B.length bs
     -- Complete an incomplete code point (if there is one)
@@ -238,21 +292,33 @@ validateUtf8More st@(Utf8State s0 part) bs
         case updateUtf8State (B.index bs i) s of
           Nothing -> (0, Nothing)
           Just s' -> loop (i + 1) s'
-      | otherwise = (0, Just (Utf8State s (partUtf8UnsafeAppend part bs)))
+      | otherwise = (- partUtf8Len part, Just (Utf8State s (partUtf8UnsafeAppend part bs)))
 
--- | Construct an Array. This is currently an internal data structure which is
--- only used to construct Text (so for example byteStringToStrictBuilder should
--- only be applied to valid UTF-8 bytestrings).
+-- | A delayed representation of strict 'Text'.
+
+-- For internal purposes, this is instead used as a delayed 'Array':
+-- it may not actually be valid 'Text' (e.g., 'word8ToStrictBuilder',
+-- 'byteStringToStrictBuilder', 'partUtf8ToStrictBuilder').
+--
+-- @since 2.0.2
 data StrictBuilder = StrictBuilder
   { sbLength :: {-# UNPACK #-} !Int
   , sbWrite :: forall s. A.MArray s -> Int -> ST s ()
   }
 
+-- | Concatenation of 'StrictBuilder' is right-biased:
+-- the right builder will be run first. This allows a builder to
+-- run tail-recursively when it was accumulated left-to-right.
+instance Semigroup StrictBuilder where
+  (<>) = appendRStrictBuilder
+
+instance Monoid StrictBuilder where
+  mempty = emptyStrictBuilder
+  mappend = (<>)
+
 emptyStrictBuilder :: StrictBuilder
 emptyStrictBuilder = StrictBuilder 0 (\_ _ -> pure ())
 
--- | Right-biased append: run the right action first.  This allows a builder to
--- run tail-recursively when accumulating text left-to-right.
 appendRStrictBuilder :: StrictBuilder -> StrictBuilder -> StrictBuilder
 appendRStrictBuilder (StrictBuilder n1 write1) (StrictBuilder n2 write2) =
   StrictBuilder (n1 + n2) (\arr ofs -> do
@@ -278,8 +344,11 @@ word8ToStrictBuilder w =
 
 partUtf8ToStrictBuilder :: PartialUtf8CodePoint -> StrictBuilder
 partUtf8ToStrictBuilder =
-  partUtf8Foldr (appendRStrictBuilder . word8ToStrictBuilder) emptyStrictBuilder
+  partUtf8Foldr ((<>) . word8ToStrictBuilder) emptyStrictBuilder
 
+-- | Use 'StrictBuilder' to build 'Text'.
+--
+-- @since 2.0.2
 strictBuilderToText :: StrictBuilder -> Text
 strictBuilderToText (StrictBuilder 0 _) = empty
 strictBuilderToText (StrictBuilder n write) = runST (do
@@ -288,28 +357,50 @@ strictBuilderToText (StrictBuilder n write) = runST (do
   arr <- A.unsafeFreeze dst
   pure (Text arr 0 n))
 
--- | Decode a 'ByteString' in the context of what has been already been decoded.
+-- | Decode another chunk in an ongoing UTF-8 stream.
+--
+-- Returns a triple:
 --
--- The 'ByteString' is validated against the 'Utf8State' using the rules
--- governing 'validateNextUtf8Chunk'. The longest valid UTF-8 prefix is added
--- to the input 'TextDataStack' which is returned with the end position of the
--- valid prefix, and either the resulting 'Utf8State'
--- (@Right Utf8State@) or the position of the of the first (potentially)
--- valid byte after the invalid bytes with remainder of the 'ByteString'
--- (@Left (Int, ByteString)@).
+-- 1. A 'StrictBuilder' for the decoded chunk of text. You can accumulate
+--    chunks with @('<>')@ or output them with 'strictBuilderToText'.
+-- 2. The undecoded remainder of the given chunk, for diagnosing errors
+--    and resuming (presumably after skipping some bytes).
+-- 3. 'Just' the new state, or 'Nothing' if an invalid byte was encountered
+--    (it will be within the first 4 bytes of the undecoded remainder).
+--
+-- @since 2.0.2
 decodeUtf8More :: Utf8State -> ByteString -> (StrictBuilder, ByteString, Maybe Utf8State)
 decodeUtf8More s bs =
   case validateUtf8More s bs of
     (len, s') ->
-      let builder | len == 0 = emptyStrictBuilder
+      let builder | len <= 0 = emptyStrictBuilder
                   | otherwise = partUtf8ToStrictBuilder (partialUtf8CodePoint s)
-                    `appendRStrictBuilder` byteStringToStrictBuilder (B.take len bs)
+                    <> byteStringToStrictBuilder (B.take len bs)
       in (builder, B.drop len bs, s')
 
--- | Decode a 'ByteString' against a start 'Utf8State' with an empty
--- 'TextDataStack'.
+-- | Decode a chunk of UTF-8 text. To be continued with 'decodeUtf8More'.
+--
+-- See 'decodeUtf8More' for details on the result.
+--
+-- === Properties
 --
--- @decodeUtf8More bs = 'decodeNextUtf8Chunk' bs 'startUtf8State' 'emptyStack'@
+-- @
+-- 'decodeUtf8Chunk' = 'decodeUtf8More' 'startUtf8State'
+-- @
+--
+-- Given:
+--
+-- @
+-- 'decodeUtf8Chunk' chunk = (builder, rest, ms)
+-- @
+--
+-- @builder@ is a prefix and @rest@ is a suffix of @chunk@.
+--
+-- @
+-- 'Data.Text.Encoding.encodeUtf8' ('strictBuilderToText' builder) '<>' rest = chunk
+-- @
+--
+-- @since 2.0.2
 decodeUtf8Chunk :: ByteString -> (StrictBuilder, ByteString, Maybe Utf8State)
 decodeUtf8Chunk = decodeUtf8More startUtf8State
 
@@ -321,7 +412,7 @@ decodeUtf8Chunk = decodeUtf8More startUtf8State
 skipIncomplete :: (Word8 -> Maybe Char) -> Utf8State -> StrictBuilder
 skipIncomplete onErr s =
   partUtf8Foldr
-    (appendRStrictBuilder . handleUtf8Error onErr)
+    ((<>) . handleUtf8Error onErr)
     emptyStrictBuilder (partialUtf8CodePoint s)
 
 handleUtf8Error :: (Word8 -> Maybe Char) -> Word8 -> StrictBuilder
@@ -336,14 +427,16 @@ decodeUtf8With1 ::
 #endif
   (Word8 -> Maybe Char) -> ByteString -> Text
 decodeUtf8With1 onErr bs0 = strictBuilderToText $
-    builder `appendRStrictBuilder` skipIncomplete onErr s
+    builder <> skipIncomplete onErr s
   where
     (builder, _, s) = decodeUtf8With2 onErr startUtf8State bs0
 
--- | Helper for 'decodeUtf8With' and 'streamDecodeUtf8With'.
--- This uses an 'OnDecodeError' to process bad bytes.
--- This is not a very pretty legacy API.
--- See 'decodeUtf8More' for a more flexible alternative.
+-- | Helper for 'Data.Text.Encoding.decodeUtf8With',
+-- 'Data.Text.Encoding.streamDecodeUtf8With', and lazy
+-- 'Data.Text.Lazy.Encoding.decodeUtf8With',
+-- which use an 'OnDecodeError' to process bad bytes.
+--
+-- See 'decodeUtf8Chunk' for a more flexible alternative.
 decodeUtf8With2 ::
 #if defined(ASSERTS)
   HasCallStack =>
@@ -353,21 +446,21 @@ decodeUtf8With2 onErr s0 bs = loop s0 0 emptyStrictBuilder
   where
     loop s i !builder =
       let nonEmptyPrefix len = builder
-            `appendRStrictBuilder` partUtf8ToStrictBuilder (partialUtf8CodePoint s)
-            `appendRStrictBuilder` byteStringToStrictBuilder (B.take len (B.drop i bs))
+            <> partUtf8ToStrictBuilder (partialUtf8CodePoint s)
+            <> byteStringToStrictBuilder (B.take len (B.drop i bs))
       in case validateUtf8More s (B.drop i bs) of
         (len, Nothing) ->
-          if len == 0 && utf8CodePointState s /= utf8StartState
+          if len < 0
           then
             -- loop is strict on builder, so if onErr raises an error it will be forced here.
-            let builder' = builder `appendRStrictBuilder` skipIncomplete onErr s
+            let builder' = builder <> skipIncomplete onErr s
             in loop startUtf8State i builder'
           else
             let builder' = nonEmptyPrefix len
-                  `appendRStrictBuilder` handleUtf8Error onErr (B.index bs (i + len))
+                  <> handleUtf8Error onErr (B.index bs (i + len))
             in loop startUtf8State (i + len + 1) builder'
         (len, Just s') ->
-          let builder' = if len == 0 then builder else nonEmptyPrefix len
+          let builder' = if len <= 0 then builder else nonEmptyPrefix len
               undecoded = if B.length bs - i - len == partUtf8Len (partialUtf8CodePoint s')
                 then B.drop (i + len) bs  -- Reuse bs if possible
                 else partUtf8ToByteString (partialUtf8CodePoint s')
diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs
index 8e7a614e..d3e85ca9 100644
--- a/tests/Tests/Properties/Transcoding.hs
+++ b/tests/Tests/Properties/Transcoding.hs
@@ -386,11 +386,11 @@ testTranscoding =
       testCase "t_pn_utf8_2" t_pn_utf8_2,
       testCase "t_pn_utf8_3" t_pn_utf8_3,
       testCase "t_decode_chunk1" t_decode_chunk1,
-      testCase "t_decode_chunk2" t_decode_chunk2
+      testCase "t_decode_chunk2" t_decode_chunk2,
       testCase "t_decode_with_error1'" t_decode_with_error1',
       testCase "t_decode_with_error2'" t_decode_with_error2',
       testCase "t_decode_with_error3'" t_decode_with_error3',
       testCase "t_decode_with_error4'" t_decode_with_error4',
-      testCase "t_decode_with_error5'" t_decode_with_error5',
+      testCase "t_decode_with_error5'" t_decode_with_error5'
     ]
   ]

From 27d23210c10004e0fcefb8846ea8f16173459687 Mon Sep 17 00:00:00 2001
From: Li-yao Xia <lysxia@gmail.com>
Date: Fri, 3 Feb 2023 23:08:40 +0000
Subject: [PATCH 66/87] Fix tests

---
 src/Data/Text/Encoding.hs             |  2 +-
 src/Data/Text/Internal/Encoding.hs    | 11 +++++++----
 tests/Tests/Properties/Transcoding.hs | 12 ++++++------
 3 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index 9d7ed6d0..65a1ce82 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -547,7 +547,7 @@ encodeUtf32BE txt = E.unstream (E.restreamUtf32BE (F.stream txt))
 --
 -- - You get an UTF-8 prefix of the given 'ByteString' up to the next error.
 --   For example this lets you insert or delete arbitrary text, or do some
---   stateful operations, before resuming.
+--   stateful operations before resuming, such as keeping track of error locations.
 --   In contrast, the older stream-oriented interface only lets you substitute
 --   a single fixed 'Char' for each invalid byte in 'OnDecodeError'.
 -- - The prefix is encoded as a 'StrictBuilder', so you can accumulate chunks
diff --git a/src/Data/Text/Internal/Encoding.hs b/src/Data/Text/Internal/Encoding.hs
index fc8be607..58c79bc3 100644
--- a/src/Data/Text/Internal/Encoding.hs
+++ b/src/Data/Text/Internal/Encoding.hs
@@ -11,6 +11,9 @@
 -- please request to have it exported!
 module Data.Text.Internal.Encoding where
 
+#if defined(ASSERTS)
+import Control.Exception (assert)
+#endif
 import Control.Monad.ST (ST, runST)
 import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
 import Data.Bits ((.&.), shiftL, shiftR)
@@ -113,9 +116,9 @@ partUtf8UnsafeIndex ::
   HasCallStack =>
 #endif
   PartialUtf8CodePoint -> Int -> Word8
-partUtf8UnsafeIndex (PartialUtf8CodePoint w) n =
+partUtf8UnsafeIndex _c@(PartialUtf8CodePoint w) n =
 #if defined(ASSERTS)
-  assert (0 <= n && n < partUtf8Len w) $
+  assert (0 <= n && n < partUtf8Len _c) $
 #endif
   fromIntegral $ w `shiftR` (16 - 8 * n)
 
@@ -290,7 +293,7 @@ validateUtf8More st@(Utf8State s0 part) bs
       | isUtf8StateIsComplete s = validateUtf8ChunkFrom i bs
       | i < len =
         case updateUtf8State (B.index bs i) s of
-          Nothing -> (0, Nothing)
+          Nothing -> (- partUtf8Len part, Nothing)
           Just s' -> loop (i + 1) s'
       | otherwise = (- partUtf8Len part, Just (Utf8State s (partUtf8UnsafeAppend part bs)))
 
@@ -461,7 +464,7 @@ decodeUtf8With2 onErr s0 bs = loop s0 0 emptyStrictBuilder
             in loop startUtf8State (i + len + 1) builder'
         (len, Just s') ->
           let builder' = if len <= 0 then builder else nonEmptyPrefix len
-              undecoded = if B.length bs - i - len == partUtf8Len (partialUtf8CodePoint s')
+              undecoded = if B.length bs >= partUtf8Len (partialUtf8CodePoint s')
                 then B.drop (i + len) bs  -- Reuse bs if possible
                 else partUtf8ToByteString (partialUtf8CodePoint s')
           in (builder', undecoded, s')
diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs
index d3e85ca9..038b6506 100644
--- a/tests/Tests/Properties/Transcoding.hs
+++ b/tests/Tests/Properties/Transcoding.hs
@@ -88,14 +88,14 @@ testValidateUtf8MoreFail s = testValidateUtf8With (E.validateUtf8More s) expectN
 t_pn_utf8_1 = do
   s <- testValidateUtf8 [0xF0, 0x90, 0x80] 0
   _ <- testValidateUtf8More s [0x80] 1
-  testValidateUtf8MoreFail s [0x7f] 0
+  testValidateUtf8MoreFail s [0x7f] (-3)
 t_pn_utf8_2 = do
   s0 <- testValidateUtf8 [0xF0] 0
-  testValidateUtf8MoreFail s0 [0x7f] 0
-  s1 <- testValidateUtf8More s0 [0x90] 0
-  testValidateUtf8MoreFail s1 [0x7f] 0
-  s2 <- testValidateUtf8More s1 [0x80] 0
-  testValidateUtf8MoreFail s2 [0x7f] 0
+  testValidateUtf8MoreFail s0 [0x7f] (-1)
+  s1 <- testValidateUtf8More s0 [0x90] (-1)
+  testValidateUtf8MoreFail s1 [0x7f] (-2)
+  s2 <- testValidateUtf8More s1 [0x80] (-2)
+  testValidateUtf8MoreFail s2 [0x7f] (-3)
   _ <- testValidateUtf8More s2 [0x80] 1
   pure ()
 t_pn_utf8_3 = do

From 4479e6e054eb7b584f5a1aa35f9edd5a86168e70 Mon Sep 17 00:00:00 2001
From: Li-yao Xia <lysxia@gmail.com>
Date: Fri, 3 Feb 2023 23:46:57 +0000
Subject: [PATCH 67/87] Space

---
 src/Data/Text/Internal/Encoding.hs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Data/Text/Internal/Encoding.hs b/src/Data/Text/Internal/Encoding.hs
index 58c79bc3..de149119 100644
--- a/src/Data/Text/Internal/Encoding.hs
+++ b/src/Data/Text/Internal/Encoding.hs
@@ -70,7 +70,7 @@ data Utf8State = Utf8State
 -- | Initial 'Utf8State'.
 --
 -- @since 2.0.2
-startUtf8State :: Utf8State 
+startUtf8State :: Utf8State
 startUtf8State = Utf8State utf8StartState partUtf8Empty
 
 -- | Prefix of a UTF-8 code point encoded in 4 bytes,

From 01d8824a998aa1f67961b4920dd5cf86a833f1bf Mon Sep 17 00:00:00 2001
From: Li-yao Xia <lysxia@gmail.com>
Date: Sat, 4 Feb 2023 00:20:30 +0000
Subject: [PATCH 68/87] Fix test for old base

---
 tests/Tests/Properties/Transcoding.hs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs
index 038b6506..7f02c0f4 100644
--- a/tests/Tests/Properties/Transcoding.hs
+++ b/tests/Tests/Properties/Transcoding.hs
@@ -301,7 +301,7 @@ testDecodeUtf8With k s xs expected =
         bs @?= xs'
       else
         let c = E.partUtf8ToByteString (E.partialUtf8CodePoint s)
-        in E.encodeUtf8 txt <> bs @?= c <> xs'
+        in E.encodeUtf8 txt `B.append` bs @?= c `B.append` xs'
       k s'
 
 testDecodeUtf8 :: E.Utf8State -> [Word8] -> T.Text -> IO E.Utf8State

From 25859ad34f9a95a8377c87155d867cc1ec144ee1 Mon Sep 17 00:00:00 2001
From: Li-yao Xia <lysxia@gmail.com>
Date: Sat, 4 Feb 2023 00:28:41 +0000
Subject: [PATCH 69/87] Add textToStrictBuilder

---
 src/Data/Text/Encoding.hs             |  3 ++-
 src/Data/Text/Internal/Encoding.hs    | 24 +++++++++++++++++-------
 tests/Tests/Properties/Transcoding.hs |  6 ++++++
 3 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index 65a1ce82..6a7a614f 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -45,7 +45,7 @@ module Data.Text.Encoding
     , streamDecodeUtf8With
     , Decoding(..)
 
-    -- *** Incremental decoding
+    -- *** Incremental UTF-8 decoding
     -- $incremental
     , decodeUtf8Chunk
     , decodeUtf8More
@@ -53,6 +53,7 @@ module Data.Text.Encoding
     , startUtf8State
     , StrictBuilder()
     , strictBuilderToText
+    , textToStrictBuilder
 
     -- ** Partial Functions
     -- $partial
diff --git a/src/Data/Text/Internal/Encoding.hs b/src/Data/Text/Internal/Encoding.hs
index de149119..7d89719f 100644
--- a/src/Data/Text/Internal/Encoding.hs
+++ b/src/Data/Text/Internal/Encoding.hs
@@ -324,9 +324,9 @@ emptyStrictBuilder = StrictBuilder 0 (\_ _ -> pure ())
 
 appendRStrictBuilder :: StrictBuilder -> StrictBuilder -> StrictBuilder
 appendRStrictBuilder (StrictBuilder n1 write1) (StrictBuilder n2 write2) =
-  StrictBuilder (n1 + n2) (\arr ofs -> do
-    write2 arr (ofs + n1)
-    write1 arr ofs)
+  StrictBuilder (n1 + n2) (\dst ofs -> do
+    write2 dst (ofs + n1)
+    write1 dst ofs)
 
 copyFromByteString :: A.MArray s -> Int -> ByteString -> ST s ()
 copyFromByteString dst ofs src = withBS src $ \ srcFPtr len ->
@@ -335,15 +335,15 @@ copyFromByteString dst ofs src = withBS src $ \ srcFPtr len ->
 
 byteStringToStrictBuilder :: ByteString -> StrictBuilder
 byteStringToStrictBuilder bs =
-  StrictBuilder (B.length bs) (\arr ofs -> copyFromByteString arr ofs bs)
+  StrictBuilder (B.length bs) (\dst ofs -> copyFromByteString dst ofs bs)
 
 charToStrictBuilder :: Char -> StrictBuilder
 charToStrictBuilder c =
-  StrictBuilder (utf8Length c) (\arr ofs -> void (Char.unsafeWrite arr ofs (safe c)))
+  StrictBuilder (utf8Length c) (\dst ofs -> void (Char.unsafeWrite dst ofs (safe c)))
 
 word8ToStrictBuilder :: Word8 -> StrictBuilder
 word8ToStrictBuilder w =
-  StrictBuilder 1 (\arr ofs -> A.unsafeWrite arr ofs w)
+  StrictBuilder 1 (\dst ofs -> A.unsafeWrite dst ofs w)
 
 partUtf8ToStrictBuilder :: PartialUtf8CodePoint -> StrictBuilder
 partUtf8ToStrictBuilder =
@@ -360,6 +360,13 @@ strictBuilderToText (StrictBuilder n write) = runST (do
   arr <- A.unsafeFreeze dst
   pure (Text arr 0 n))
 
+-- | Copy 'Text' in a 'StrictBuilder'
+--
+-- @since 2.0.2
+textToStrictBuilder :: Text -> StrictBuilder
+textToStrictBuilder (Text src srcOfs n) = StrictBuilder n (\dst dstOfs ->
+  A.copyI n dst dstOfs src srcOfs)
+
 -- | Decode another chunk in an ongoing UTF-8 stream.
 --
 -- Returns a triple:
@@ -455,8 +462,11 @@ decodeUtf8With2 onErr s0 bs = loop s0 0 emptyStrictBuilder
         (len, Nothing) ->
           if len < 0
           then
-            -- loop is strict on builder, so if onErr raises an error it will be forced here.
+            -- If the first byte cannot complete the partial code point in s,
+            -- retry from startUtf8State.
             let builder' = builder <> skipIncomplete onErr s
+            -- Note: loop is strict on builder, so if onErr raises an error it will
+            -- be forced here, short-circuiting the loop as desired.
             in loop startUtf8State i builder'
           else
             let builder' = nonEmptyPrefix len
diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs
index 7f02c0f4..57f09139 100644
--- a/tests/Tests/Properties/Transcoding.hs
+++ b/tests/Tests/Properties/Transcoding.hs
@@ -331,6 +331,9 @@ t_infix_concat bs1 text bs2 =
   text `T.isInfixOf`
     E.decodeUtf8With onErr (B.concat [bs1, E.encodeUtf8 text, bs2])
 
+t_textToStrictBuilder =
+  (E.strictBuilderToText . E.textToStrictBuilder) `eq` id
+
 testTranscoding :: TestTree
 testTranscoding =
   testGroup "transcoding" [
@@ -392,5 +395,8 @@ testTranscoding =
       testCase "t_decode_with_error3'" t_decode_with_error3',
       testCase "t_decode_with_error4'" t_decode_with_error4',
       testCase "t_decode_with_error5'" t_decode_with_error5'
+    ],
+    testGroup "strictBuilder" [
+      testProperty "textToStrictBuilder" t_textToStrictBuilder
     ]
   ]

From ace7c61764fa859a825a58b9a327f7239098733d Mon Sep 17 00:00:00 2001
From: Li-yao Xia <lysxia@gmail.com>
Date: Sat, 4 Feb 2023 00:28:50 +0000
Subject: [PATCH 70/87] Update changelog.md

---
 changelog.md | 30 ++++++++++++------------------
 1 file changed, 12 insertions(+), 18 deletions(-)

diff --git a/changelog.md b/changelog.md
index 136d89a3..e63283c1 100644
--- a/changelog.md
+++ b/changelog.md
@@ -4,28 +4,22 @@
 
 ### 2.0.2
 
-* A suite of functions have been added in `Data.Text.Encoding` that
-  allow decoding to be aborted on errors without the need to raise an
-  `error` and `catch` it elsewhere:
+* Add decoding functions in `Data.Text.Encoding` that allow
+  more control for error handling and for how to allocate text.
+  (https://github.com/haskell/text/pull/448 Thanks to David Sledge)
   * `decodeASCIIPrefix`
-  * `TextDataStack`
-  * `dataStack`
-  * `stackLen`
-  * `emptyStack`
   * `decodeUtf8Chunk`
-  * `decodeNextUtf8Chunk`
-  * `pushText`
-  * `stackToText`
-
-* Added functions to validate `ByteString`s that represent encoded text:
+  * `decodeUtf8More`
   * `Utf8ValidState`
-  * `partialUtf8CodePoint`
-  * `partUtf8CPLen`
-  * `wordAtPartUft8CP`
-  * `utf8CodePointState`
-  * `validateUtf8Chunk`
-  * `validateNextUtf8Chunk`
   * `startUtf8ValidState`
+  * `StrictBuilder`
+  * `strictBuilderToText`
+  * `textToStrictBuilder`
+  * `validateUtf8Chunk`
+  * `validateUtf8More`
+
+* Fix quadratic slowdown when decoding invalid UTF-8 bytestrings
+  (https://github.com/haskell/text/issues/495)
 
 ### 2.0.1
 

From 2603294e28d5ab6adcc2a4d4e3f74ff392a36c6e Mon Sep 17 00:00:00 2001
From: Li-yao Xia <lysxia@gmail.com>
Date: Sat, 4 Feb 2023 03:37:57 +0000
Subject: [PATCH 71/87] Add laws

---
 src/Data/Text/Internal/Encoding.hs    |  64 ++++++++--
 tests/Tests/Properties/Transcoding.hs | 165 +++++++++++++++++++++++++-
 2 files changed, 217 insertions(+), 12 deletions(-)

diff --git a/src/Data/Text/Internal/Encoding.hs b/src/Data/Text/Internal/Encoding.hs
index 7d89719f..54c84e34 100644
--- a/src/Data/Text/Internal/Encoding.hs
+++ b/src/Data/Text/Internal/Encoding.hs
@@ -183,9 +183,9 @@ foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8
 -- - The prefix is valid UTF-8. In particular, it should be accepted
 --   by this validation:
 --
---   @
---   'validateUtf8Chunk' ('Data.ByteString.take' n chunk) = (n, Just 'startUtf8State')
---   @
+--     @
+--     'validateUtf8Chunk' ('Data.ByteString.take' n chunk) = (n, Just 'startUtf8State')
+--     @
 --
 -- @since 2.0.2
 validateUtf8Chunk :: ByteString -> (Int, Maybe Utf8State)
@@ -267,18 +267,18 @@ slowValidateUtf8ChunkFrom ofs bs = loop ofs ofs utf8StartState
 --
 -- - If the chunk is invalid, it cannot be extended to be valid.
 --
---   @
---   ms = Nothing
---   ==> 'validateUtf8More' s (chunk '<>' more) = (n, Nothing)
---   @
+--     @
+--     ms = Nothing
+--     ==> 'validateUtf8More' s (chunk '<>' more) = (n, Nothing)
+--     @
 --
 -- - Validating two chunks sequentially is the same as validating them
 --   together at once:
 --
---   @
---   ms = Just s'
---   ==> 'validateUtf8More' s (chunk '<>' more) = 'Data.Bifunctor.first' ('Data.ByteString.length' chunk '+') ('validateUtf8More' s' more)
---   @
+--     @
+--     ms = Just s'
+--     ==> 'validateUtf8More' s (chunk '<>' more) = 'Data.Bifunctor.first' ('Data.ByteString.length' chunk '+') ('validateUtf8More' s' more)
+--     @
 --
 -- @since 2.0.2
 validateUtf8More :: Utf8State -> ByteString -> (Int, Maybe Utf8State)
@@ -379,6 +379,48 @@ textToStrictBuilder (Text src srcOfs n) = StrictBuilder n (\dst dstOfs ->
 --    (it will be within the first 4 bytes of the undecoded remainder).
 --
 -- @since 2.0.2
+--
+-- === Properties
+--
+-- Given:
+--
+-- @
+-- (pre, suf, ms) = 'decodeUtf8More' s chunk
+-- @
+--
+-- 1. If the output @pre@ is nonempty (alternatively, if @length chunk > length suf@)
+--
+--     @
+--     s2b pre \`'Data.ByteString.append'\` suf = p2b s \`'Data.ByteString.append'\` chunk
+--     @
+--
+--     where
+--
+--     @
+--     s2b = 'Data.Text.Encoding.encodeUtf8' . 'Data.Text.Encoding.strictBuilderToText'
+--     p2b = 'Data.Text.Internal.Encoding.partUtf8ToByteString'
+--     @
+--
+-- 2. If the output @pre@ is empty (alternatively, if @length chunk = length suf@)
+--
+--     @suf = chunk@
+--
+-- 3. Decoding chunks separately is equivalent to decoding their concatenation.
+--
+--     Given:
+--
+--     @
+--     (pre1, suf1, Just s1) = 'decodeUtf8More' s chunk1
+--     (pre2, suf2,     ms2) = 'decodeUtf8More' s1 chunk2
+--     (pre3, suf3,     ms3) = 'decodeUtf8More' s (chunk1 \`B.append\` chunk2)
+--     @
+--
+--     we have:
+--
+--     @
+--     s2b (pre1 '<>' pre2) = s2b pre3
+--     ms2 = ms3
+--     @
 decodeUtf8More :: Utf8State -> ByteString -> (StrictBuilder, ByteString, Maybe Utf8State)
 decodeUtf8More s bs =
   case validateUtf8More s bs of
diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs
index 57f09139..7bf2067f 100644
--- a/tests/Tests/Properties/Transcoding.hs
+++ b/tests/Tests/Properties/Transcoding.hs
@@ -12,6 +12,8 @@ import Prelude hiding (head, tail)
 import Data.Bits ((.&.), shiftR)
 import Data.Char (chr, ord)
 import Data.Functor (void)
+import Data.Maybe (isNothing)
+import Data.Semigroup ((<>))
 import Test.QuickCheck hiding ((.&.))
 import Test.Tasty (TestTree, testGroup)
 import Test.Tasty.QuickCheck (testProperty)
@@ -104,6 +106,105 @@ t_pn_utf8_3 = do
   assertBool "PartialUtf8 must be partial" $ E.partUtf8Len c < E.partUtf8CompleteLen c
   testValidateUtf8MoreFail s1 [0x80, 0x80] 1
 
+-- Precondition: (i, ms1) = E.validateUtf8More s chunk
+--
+-- The index points to the end of the longest valid prefix
+-- of prechunk `B.append` chunk
+pre_validateUtf8More_validPrefix s chunk i =
+  let prechunk = E.partUtf8ToByteString (E.partialUtf8CodePoint s) in
+  -- Note: i <= 0 implies take i = id
+  let (j, ms2) = E.validateUtf8Chunk (B.take (B.length prechunk + i) (prechunk `B.append` chunk)) in
+  counterexample (show prechunk) $
+    (B.length prechunk + i, ms2) === (j, Just E.startUtf8State)
+
+-- Precondition: (i, Nothing) = E.validateUtf8More s chunk
+--
+-- Appending to an invalid chunk yields another invalid chunk.
+pre_validateUtf8More_maximalPrefix s chunk i more =
+  E.validateUtf8More s (chunk `B.append` more) === (i, Nothing)
+
+-- Precondition: (i, Just s1) = E.validateUtf8More s chunk
+pre_validateUtf8More_suffix s chunk i s1 =
+  if 0 <= i
+  then B.drop i chunk === p2b s1         -- The state s1 contains a suffix of the chunk.
+  else p2b s `B.append` chunk === p2b s1 -- Or the chunk extends the incomplete code point in s1.
+
+-- Precondition: (i, Just s1) = E.validateUtf8More s chunk1
+--
+-- Validating two chunks sequentially is equivalent to validating them at once.
+pre_validateUtf8More_append s chunk1 s1 chunk2 =
+  let (j, ms2) = E.validateUtf8More s1 chunk2 in
+  (B.length chunk1 + j, ms2) === E.validateUtf8More s (chunk1 `B.append` chunk2)
+
+-- These wrappers use custom generators to satisfy the above properties.
+
+t_validateUtf8More_validPrefix = property $ do
+  cex@(s, chunk, i, _ms1) <- randomMoreChunk
+  pure $ counterexample (show cex) $
+    pre_validateUtf8More_validPrefix s chunk i
+
+t_validateUtf8More_maximalPrefix = property $ do
+  -- We want chunks that fail validation: force their size to be big,..
+  cex@(s, chunk, i, ms1) <- scale (* 3) arbitraryMoreChunk
+  pure $ counterexample (show cex) $
+    -- ... and just use rejection sampling
+    isNothing ms1 ==>
+    pre_validateUtf8More_maximalPrefix s chunk i
+
+t_validateUtf8More_valid = property $ do
+  cex@(s, chunk1, i, s1, chunk2) <- validMoreChunks
+  pure $ counterexample (show cex) $
+    pre_validateUtf8More_suffix s chunk1 i s1 .&&.
+    pre_validateUtf8More_append s chunk1 s1 chunk2
+
+randomMoreChunk, arbitraryMoreChunk, validMoreChunk :: Gen (E.Utf8State, B.ByteString, Int, Maybe E.Utf8State)
+randomMoreChunk = oneof [arbitraryMoreChunk, validMoreChunk]
+
+arbitraryMoreChunk = do
+  s <- randomUtf8State
+  chunk <- arbitrary
+  let (i, ms1) = E.validateUtf8More s chunk
+  pure (s, chunk, i, ms1)
+
+-- | Generate a random state by parsing a prefix of a Char
+randomUtf8State :: Gen E.Utf8State
+randomUtf8State = do
+  c <- arbitrary
+  chunk <- elements (B.inits (E.encodeUtf8 (T.singleton c)))
+  case E.validateUtf8Chunk chunk of
+    (_, Just s) -> pure s
+    (_, Nothing) -> error "should not happen"
+
+-- | Make a valid chunk, i.e., (s, chunk) such that
+--
+-- validateUtf8More s chunk = (i, Just s1)
+--
+-- Also returning i and s1 to not repeat work.
+validMoreChunk = do
+  (s, chunk, i, s1, _chunk2) <- validMoreChunks
+  pure (s, chunk, i, Just s1)
+
+-- | Make a valid chunk by slicing a valid UTF8 bs,
+-- and also provide a second chunk which is a valid extension
+-- with 0.5 probability.
+validMoreChunks :: Gen (E.Utf8State, B.ByteString, Int, E.Utf8State, B.ByteString)
+validMoreChunks = do
+  bs <- E.encodeUtf8 <$> scale (* 3) arbitrary
+  -- Take an intermediate state.
+  -- No need to go too far since code points are at most 4 bytes long
+  i <- choose (0, 3)
+  let (bs0, bs1) = B.splitAt i bs
+  case E.validateUtf8Chunk bs0 of
+    (_, Just s) -> do
+      j <- choose (0, B.length bs1)
+      let (chunk1, chunk2') = B.splitAt j bs1
+      case E.validateUtf8More s chunk1 of
+        (n1, Just s1) -> do
+          chunk2 <- oneof [pure chunk2', arbitrary]
+          pure (s, chunk1, n1, s1, chunk2)
+        (_, Nothing) -> error "should not happen"
+    (_, Nothing) -> error "should not happen"
+
 t_utf8_c     = (E.strictBuilderToText . fst3 . E.decodeUtf8Chunk . E.encodeUtf8) `eq` id
 t_utf8       = (E.decodeUtf8 . E.encodeUtf8) `eq` id
 t_utf8'      = (E.decodeUtf8' . E.encodeUtf8) `eq` (id . Right)
@@ -334,6 +435,60 @@ t_infix_concat bs1 text bs2 =
 t_textToStrictBuilder =
   (E.strictBuilderToText . E.textToStrictBuilder) `eq` id
 
+-- decodeUtf8Chunk splits the input bytestring
+t_decodeUtf8Chunk_split chunk =
+  let (pre, suf, _ms) = E.decodeUtf8Chunk chunk
+  in s2b pre `B.append` suf === chunk
+
+-- decodeUtf8More mostly splits the input bytestring,
+-- also inserting bytes from the partial code point in s.
+--
+-- This is wrapped by t_decodeUtf8More_split to have more
+-- likely valid chunks.
+t_decodeUtf8More_split' s chunk =
+  let (pre, suf, _ms) = E.decodeUtf8More s chunk
+  in if B.length chunk > B.length suf
+  then s2b pre `B.append` suf === p2b s `B.append` chunk
+  else suf === chunk
+
+-- The output state of decodeUtf8More contains the suffix.
+--
+-- Precondition (valid chunk): ms = Just s'
+pre_decodeUtf8More_suffix s chunk =
+  let (_pre, suf, ms) = E.decodeUtf8More s chunk
+  in case ms of
+    Nothing -> discard
+    Just s' -> if B.length chunk > B.length suf
+      then p2b s' === suf
+      else p2b s' === p2b s `B.append` suf
+
+-- Decoding chunks separately is equivalent to decoding their concatenation.
+pre_decodeUtf8More_append s chunk1 chunk2 =
+  let (pre1, suf1, ms1) = E.decodeUtf8More s chunk1 in
+  case ms1 of
+    Nothing -> discard
+    Just s1 ->
+      let (pre2, suf2, ms2) = E.decodeUtf8More s1 chunk2 in
+      let (pre3, suf3, ms3) = E.decodeUtf8More s (chunk1 `B.append` chunk2) in
+      (s2b (pre1 <> pre2), ms2) === (s2b pre3, ms3)
+
+-- Properties for any chunk
+-- (but do try to generate valid chunks often enough)
+t_decodeUtf8More1 = property $ do
+  cex@(s, chunk, _, _) <- randomMoreChunk
+  pure $ counterexample (show cex) $
+    t_decodeUtf8More_split' s chunk
+
+-- Properties that require valid chunks
+t_decodeUtf8More2 = property $ do
+  cex@(s, chunk, _, _, chunk2) <- validMoreChunks
+  pure $ counterexample (show cex) $
+    pre_decodeUtf8More_suffix s chunk .&&.
+    pre_decodeUtf8More_append s chunk chunk2
+
+s2b = E.encodeUtf8 . E.strictBuilderToText
+p2b = E.partUtf8ToByteString . E.partialUtf8CodePoint
+
 testTranscoding :: TestTree
 testTranscoding =
   testGroup "transcoding" [
@@ -371,6 +526,11 @@ testTranscoding =
       testProperty "t_decode_with_error4" t_decode_with_error4,
       testProperty "t_infix_concat" t_infix_concat
     ],
+    testGroup "validate" [
+      testProperty "t_validateUtf8More_validPrefix" t_validateUtf8More_validPrefix,
+      testProperty "t_validateUtf8More_maximalPrefix" t_validateUtf8More_maximalPrefix,
+      testProperty "t_validateUtf8More_valid" t_validateUtf8More_valid
+    ],
     testGroup "streaming" [
       testProperty "t_utf8_undecoded" t_utf8_undecoded,
       testProperty "t_utf8_incr" t_utf8_incr,
@@ -394,7 +554,10 @@ testTranscoding =
       testCase "t_decode_with_error2'" t_decode_with_error2',
       testCase "t_decode_with_error3'" t_decode_with_error3',
       testCase "t_decode_with_error4'" t_decode_with_error4',
-      testCase "t_decode_with_error5'" t_decode_with_error5'
+      testCase "t_decode_with_error5'" t_decode_with_error5',
+      testProperty "t_decodeUtf8Chunk_split" t_decodeUtf8Chunk_split,
+      testProperty "t_decodeUtf8More1" t_decodeUtf8More1,
+      testProperty "t_decodeUtf8More2" t_decodeUtf8More2
     ],
     testGroup "strictBuilder" [
       testProperty "textToStrictBuilder" t_textToStrictBuilder

From 01344a27166a0c49350f29ce6fde58d720cdeaa1 Mon Sep 17 00:00:00 2001
From: Li-yao Xia <lysxia@gmail.com>
Date: Sat, 4 Feb 2023 19:48:23 +0000
Subject: [PATCH 72/87] Add some inline pragmas

---
 src/Data/Text/Encoding.hs          |  1 +
 src/Data/Text/Internal/Encoding.hs | 11 +++++++----
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index 6a7a614f..cd3593de 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -175,6 +175,7 @@ decodeASCIIPrefix bs = if B.null bs
           Text (A.ByteArray arr) 0 len
         suffix = B.drop len bs in
     (prefix, suffix)
+{-# INLINE decodeASCIIPrefix #-}
 
 -- | Length of the longest ASCII prefix.
 asciiPrefixLength :: ByteString -> Int
diff --git a/src/Data/Text/Internal/Encoding.hs b/src/Data/Text/Internal/Encoding.hs
index 54c84e34..893f426a 100644
--- a/src/Data/Text/Internal/Encoding.hs
+++ b/src/Data/Text/Internal/Encoding.hs
@@ -144,7 +144,7 @@ partUtf8UnsafeAppend c@(PartialUtf8CodePoint word) bs =
       then w + (fromIntegral (B.index bs i) `shiftL` fromIntegral (16 - 8 * (lenc + i)))
       else w
 
--- | Fold a 'PartialUtf8CodePoint'. This avoids recursion so it unfolds to straightline code.
+-- | Fold a 'PartialUtf8CodePoint'. This avoids recursion so it can unfold to straightline code.
 partUtf8Foldr :: (Word8 -> a -> a) -> a -> PartialUtf8CodePoint -> a
 partUtf8Foldr f x0 c = case partUtf8Len c of
     0 -> x0
@@ -153,6 +153,7 @@ partUtf8Foldr f x0 c = case partUtf8Len c of
     _ -> build 0 (build 1 (build 2 x0))
   where
     build i x = f (partUtf8UnsafeIndex c i) x
+{-# INLINE partUtf8Foldr #-}
 
 -- | Convert 'PartialUtf8CodePoint' to 'ByteString'.
 partUtf8ToByteString :: PartialUtf8CodePoint -> B.ByteString
@@ -340,14 +341,16 @@ byteStringToStrictBuilder bs =
 charToStrictBuilder :: Char -> StrictBuilder
 charToStrictBuilder c =
   StrictBuilder (utf8Length c) (\dst ofs -> void (Char.unsafeWrite dst ofs (safe c)))
+{-# INLINE charToStrictBuilder #-}
 
 word8ToStrictBuilder :: Word8 -> StrictBuilder
-word8ToStrictBuilder w =
+word8ToStrictBuilder !w =
   StrictBuilder 1 (\dst ofs -> A.unsafeWrite dst ofs w)
 
+-- Eta-expanded to inline partUtf8Foldr
 partUtf8ToStrictBuilder :: PartialUtf8CodePoint -> StrictBuilder
-partUtf8ToStrictBuilder =
-  partUtf8Foldr ((<>) . word8ToStrictBuilder) emptyStrictBuilder
+partUtf8ToStrictBuilder c =
+  partUtf8Foldr ((<>) . word8ToStrictBuilder) emptyStrictBuilder c
 
 -- | Use 'StrictBuilder' to build 'Text'.
 --

From c36c95f6f3173d76c55c51e33bfa8cbbebb257bb Mon Sep 17 00:00:00 2001
From: Li-yao Xia <lysxia@gmail.com>
Date: Sun, 5 Feb 2023 22:52:56 +0000
Subject: [PATCH 73/87] Optimize first iteration of decodeUtf8With1

---
 src/Data/Text/Internal/Encoding.hs | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/src/Data/Text/Internal/Encoding.hs b/src/Data/Text/Internal/Encoding.hs
index 893f426a..316cce22 100644
--- a/src/Data/Text/Internal/Encoding.hs
+++ b/src/Data/Text/Internal/Encoding.hs
@@ -32,6 +32,7 @@ import Data.Text.Internal.Encoding.Utf8
   (Utf8CodePointState, utf8StartState, updateUtf8State, isUtf8StateIsComplete, utf8Length)
 import qualified Data.ByteString as B
 import qualified Data.ByteString.Internal as BI
+import qualified Data.ByteString.Short.Internal as SBS
 import qualified Data.Text.Array as A
 import qualified Data.Text.Internal.Unsafe.Char as Char
 #if defined(ASSERTS)
@@ -324,6 +325,8 @@ emptyStrictBuilder :: StrictBuilder
 emptyStrictBuilder = StrictBuilder 0 (\_ _ -> pure ())
 
 appendRStrictBuilder :: StrictBuilder -> StrictBuilder -> StrictBuilder
+appendRStrictBuilder (StrictBuilder 0 _) b2 = b2
+appendRStrictBuilder b1 (StrictBuilder 0 _) = b1
 appendRStrictBuilder (StrictBuilder n1 write1) (StrictBuilder n2 write2) =
   StrictBuilder (n1 + n2) (\dst ofs -> do
     write2 dst (ofs + n1)
@@ -476,15 +479,28 @@ handleUtf8Error onErr w = case onErr w of
   Nothing -> emptyStrictBuilder
 
 -- | Helper for 'decodeUtfWith'.
+--
+-- This could be shorter by calling decodeUtf8With2 directly,
+-- but we make the first call validateUtf8Chunk directly so that
 decodeUtf8With1 ::
 #if defined(ASSERTS)
   HasCallStack =>
 #endif
   (Word8 -> Maybe Char) -> ByteString -> Text
-decodeUtf8With1 onErr bs0 = strictBuilderToText $
-    builder <> skipIncomplete onErr s
-  where
-    (builder, _, s) = decodeUtf8With2 onErr startUtf8State bs0
+decodeUtf8With1 onErr bs = case validateUtf8Chunk bs of
+    (len, Just s)
+      | len == B.length bs ->
+        let !(SBS.SBS arr) = SBS.toShort bs in
+        Text (A.ByteArray arr) 0 len
+      | otherwise -> strictBuilderToText $
+          byteStringToStrictBuilder (B.take len bs) <> skipIncomplete onErr s
+    (len, Nothing) ->
+       let (builder, _, s) = decodeUtf8With2 onErr startUtf8State (B.drop (len + 1) bs) in
+       strictBuilderToText $
+         byteStringToStrictBuilder (B.take len bs) <>
+         handleUtf8Error onErr (B.index bs len) <>
+         builder <>
+         skipIncomplete onErr s
 
 -- | Helper for 'Data.Text.Encoding.decodeUtf8With',
 -- 'Data.Text.Encoding.streamDecodeUtf8With', and lazy

From 89f22071c219397036319cdd1d4af6ece6c071dd Mon Sep 17 00:00:00 2001
From: Li-yao Xia <lysxia@gmail.com>
Date: Sun, 5 Feb 2023 23:48:04 +0000
Subject: [PATCH 74/87] Go CPS

---
 src/Data/Text/Encoding.hs          |   4 +-
 src/Data/Text/Internal/Encoding.hs | 139 ++++++++++++++++-------------
 src/Data/Text/Lazy/Encoding.hs     |   6 +-
 3 files changed, 81 insertions(+), 68 deletions(-)

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index cd3593de..ecd89948 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -325,7 +325,7 @@ streamDecodeUtf8With ::
 streamDecodeUtf8With onErr = loop startUtf8State
   where
     loop s chunk =
-      let (builder, undecoded, s') = decodeUtf8With2 (onErr invalidUtf8Msg . Just) s chunk
+      let (builder, undecoded, s') = decodeUtf8With2 onErr invalidUtf8Msg s chunk
       in Some (strictBuilderToText builder) undecoded (loop s')
 
 -- | Decode a 'ByteString' containing UTF-8 encoded text.
@@ -337,7 +337,7 @@ decodeUtf8With ::
   HasCallStack =>
 #endif
   OnDecodeError -> ByteString -> Text
-decodeUtf8With onErr = decodeUtf8With1 (onErr invalidUtf8Msg . Just)
+decodeUtf8With onErr = decodeUtf8With1 onErr invalidUtf8Msg
 
 invalidUtf8Msg :: String
 invalidUtf8Msg = "Data.Text.Encoding: Invalid UTF-8 stream"
diff --git a/src/Data/Text/Internal/Encoding.hs b/src/Data/Text/Internal/Encoding.hs
index 316cce22..5236bf14 100644
--- a/src/Data/Text/Internal/Encoding.hs
+++ b/src/Data/Text/Internal/Encoding.hs
@@ -1,5 +1,6 @@
 {-# LANGUAGE BangPatterns, CPP, GeneralizedNewtypeDeriving, MagicHash,
     UnliftedFFITypes #-}
+{-# LANGUAGE ScopedTypeVariables #-}
 {-# LANGUAGE Trustworthy #-}
 {-# LANGUAGE RankNTypes #-}
 {-# LANGUAGE ViewPatterns #-}
@@ -20,16 +21,17 @@ import Data.Bits ((.&.), shiftL, shiftR)
 import Data.ByteString (ByteString)
 import Data.Functor (void)
 import Data.Semigroup (Semigroup(..))
-import Data.Text.Internal (Text(..), empty, safe)
-import Data.Text.Internal.Unsafe (unsafeWithForeignPtr)
-import Data.Text.Unsafe (unsafeDupablePerformIO)
 import Data.Word (Word32, Word8)
 import Foreign.C.Types (CSize(..))
 import Foreign.Ptr (Ptr)
 import Foreign.Storable (pokeElemOff)
+import Data.Text.Encoding.Error (OnDecodeError)
+import Data.Text.Internal (Text(..), empty, safe)
 import Data.Text.Internal.ByteStringCompat (withBS)
 import Data.Text.Internal.Encoding.Utf8
   (Utf8CodePointState, utf8StartState, updateUtf8State, isUtf8StateIsComplete, utf8Length)
+import Data.Text.Internal.Unsafe (unsafeWithForeignPtr)
+import Data.Text.Unsafe (unsafeDupablePerformIO)
 import qualified Data.ByteString as B
 import qualified Data.ByteString.Internal as BI
 import qualified Data.ByteString.Short.Internal as SBS
@@ -146,6 +148,7 @@ partUtf8UnsafeAppend c@(PartialUtf8CodePoint word) bs =
       else w
 
 -- | Fold a 'PartialUtf8CodePoint'. This avoids recursion so it can unfold to straightline code.
+{-# INLINE partUtf8Foldr #-}
 partUtf8Foldr :: (Word8 -> a -> a) -> a -> PartialUtf8CodePoint -> a
 partUtf8Foldr f x0 c = case partUtf8Len c of
     0 -> x0
@@ -154,7 +157,6 @@ partUtf8Foldr f x0 c = case partUtf8Len c of
     _ -> build 0 (build 1 (build 2 x0))
   where
     build i x = f (partUtf8UnsafeIndex c i) x
-{-# INLINE partUtf8Foldr #-}
 
 -- | Convert 'PartialUtf8CodePoint' to 'ByteString'.
 partUtf8ToByteString :: PartialUtf8CodePoint -> B.ByteString
@@ -191,15 +193,19 @@ foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8
 --
 -- @since 2.0.2
 validateUtf8Chunk :: ByteString -> (Int, Maybe Utf8State)
-validateUtf8Chunk = validateUtf8ChunkFrom 0
+validateUtf8Chunk bs = validateUtf8ChunkFrom 0 bs (,)
 
--- | Add an offset to the index returned by 'validateUtf8More'.
+-- Add an offset to the index returned by 'validateUtf8More'.
 --
 -- @
--- validateUtf8ChunkFrom n = first (+ n) . 'validateUtf8More' . 'B.drop' n
+-- validateUtf8ChunkFrom n bs (,) = (first (+ n) . 'validateUtf8More' . 'B.drop' n) bs
 -- @
-validateUtf8ChunkFrom :: Int -> ByteString -> (Int, Maybe Utf8State)
-validateUtf8ChunkFrom ofs bs
+--
+-- CPS: inlining the continuation lets us avoid allocating a @Maybe@ in the
+-- @decode...@ functions.
+{-# INLINE validateUtf8ChunkFrom #-}
+validateUtf8ChunkFrom :: forall r. Int -> ByteString -> (Int -> Maybe Utf8State -> r) -> r
+validateUtf8ChunkFrom ofs bs k
 #if defined(SIMDUTF) || MIN_VERSION_bytestring(0,11,2)
   | guessUtf8Boundary > 0 &&
     -- the rest of the bytestring valid utf-8 up to the boundary
@@ -211,36 +217,36 @@ validateUtf8ChunkFrom ofs bs
 #else
       B.isValidUtf8 $ B.take guessUtf8Boundary (B.drop ofs bs)
 #endif
-    ) = slowValidateUtf8ChunkFrom (ofs + guessUtf8Boundary) bs
+    ) = slowValidateUtf8ChunkFrom (ofs + guessUtf8Boundary)
     -- No
-  | otherwise = slowValidateUtf8ChunkFrom ofs bs
-    where
-      len = B.length bs - ofs
-      isBoundary n word8 = len >= n && word8 <= B.index bs (ofs + len - n)
-      guessUtf8Boundary
-        | isBoundary 3 0xf0 = len - 3  -- third to last char starts a four-byte code point
-        | isBoundary 2 0xe0 = len - 2  -- pre-last char starts a three-or-four-byte code point
-        | isBoundary 1 0xc2 = len - 1  -- last char starts a two-(or more-)byte code point
-        | otherwise = len
+  | otherwise = if guessUtf8Boundary == len then k len (Just startUtf8State) else slowValidateUtf8ChunkFrom ofs
+  where
+    len = B.length bs - ofs
+    isBoundary n p = len >= n && p (B.index bs (ofs + len - n))
+    guessUtf8Boundary
+      | isBoundary 1 (<= 0x80) = len      -- last char is ASCII
+      | isBoundary 1 (0xc2 <=) = len - 1  -- last char starts a two-(or more-)byte code point
+      | isBoundary 2 (0xe0 <=) = len - 2  -- pre-last char starts a three-or-four-byte code point
+      | isBoundary 3 (0xf0 <=) = len - 3  -- third to last char starts a four-byte code point
+      | otherwise = len
 #else
-  = slowValidateUtf8ChunkFrom ofs bs
+  = slowValidateUtf8ChunkFrom ofs
+  where
 #endif
+    -- A pure Haskell implementation of validateUtf8More.
+    -- Ideally the primitives 'B.isValidUtf8' or 'c_is_valid_utf8' should give us
+    -- indices to let us avoid this function.
+    slowValidateUtf8ChunkFrom :: Int -> r
+    slowValidateUtf8ChunkFrom ofs1 = slowLoop ofs1 ofs1 utf8StartState
 
--- | A pure Haskell implementation of validateUtf8More.
---
--- Ideally the primitives 'B.isValidUtf8' or 'c_is_valid_utf8' should give us
--- indices to let us avoid this function.
-slowValidateUtf8ChunkFrom :: Int -> ByteString -> (Int, Maybe Utf8State)
-slowValidateUtf8ChunkFrom ofs bs = loop ofs ofs utf8StartState
-  where
-    loop !utf8End i s
+    slowLoop !utf8End i s
       | i < B.length bs =
           case updateUtf8State (B.index bs i) s of
             Just s' ->
               let utf8End' = if isUtf8StateIsComplete s' then i + 1 else utf8End
-              in loop utf8End' (i + 1) s'
-            Nothing -> (utf8End, Nothing)
-      | otherwise = (utf8End, Just (Utf8State s (partUtf8UnsafeAppend partUtf8Empty (B.drop utf8End bs))))
+              in slowLoop utf8End' (i + 1) s'
+            Nothing -> k utf8End Nothing
+      | otherwise = k utf8End (Just (Utf8State s (partUtf8UnsafeAppend partUtf8Empty (B.drop utf8End bs))))
 
 -- | Validate another 'ByteString' chunk in an ongoing stream of UTF-8-encoded text.
 --
@@ -284,20 +290,26 @@ slowValidateUtf8ChunkFrom ofs bs = loop ofs ofs utf8StartState
 --
 -- @since 2.0.2
 validateUtf8More :: Utf8State -> ByteString -> (Int, Maybe Utf8State)
-validateUtf8More st@(Utf8State s0 part) bs
+validateUtf8More st bs = validateUtf8MoreCont st bs (,)
+
+-- CPS: inlining the continuation lets us make more tail calls and avoid
+-- allocating a @Maybe@ in @decodeWith1/2@.
+{-# INLINE validateUtf8MoreCont #-}
+validateUtf8MoreCont :: Utf8State -> ByteString -> (Int -> Maybe Utf8State -> r) -> r
+validateUtf8MoreCont st@(Utf8State s0 part) bs k
   | len > 0 = loop 0 s0
-  | otherwise = (- partUtf8Len part, Just st)
+  | otherwise = k (- partUtf8Len part) (Just st)
   where
     len = B.length bs
     -- Complete an incomplete code point (if there is one)
     -- and then jump to validateUtf8ChunkFrom
     loop !i s
-      | isUtf8StateIsComplete s = validateUtf8ChunkFrom i bs
+      | isUtf8StateIsComplete s = validateUtf8ChunkFrom i bs k
       | i < len =
         case updateUtf8State (B.index bs i) s of
-          Nothing -> (- partUtf8Len part, Nothing)
+          Nothing -> k (- partUtf8Len part) Nothing
           Just s' -> loop (i + 1) s'
-      | otherwise = (- partUtf8Len part, Just (Utf8State s (partUtf8UnsafeAppend part bs)))
+      | otherwise = k (- partUtf8Len part) (Just (Utf8State s (partUtf8UnsafeAppend part bs)))
 
 -- | A delayed representation of strict 'Text'.
 
@@ -341,10 +353,10 @@ byteStringToStrictBuilder :: ByteString -> StrictBuilder
 byteStringToStrictBuilder bs =
   StrictBuilder (B.length bs) (\dst ofs -> copyFromByteString dst ofs bs)
 
+{-# INLINE charToStrictBuilder #-}
 charToStrictBuilder :: Char -> StrictBuilder
 charToStrictBuilder c =
   StrictBuilder (utf8Length c) (\dst ofs -> void (Char.unsafeWrite dst ofs (safe c)))
-{-# INLINE charToStrictBuilder #-}
 
 word8ToStrictBuilder :: Word8 -> StrictBuilder
 word8ToStrictBuilder !w =
@@ -429,12 +441,11 @@ textToStrictBuilder (Text src srcOfs n) = StrictBuilder n (\dst dstOfs ->
 --     @
 decodeUtf8More :: Utf8State -> ByteString -> (StrictBuilder, ByteString, Maybe Utf8State)
 decodeUtf8More s bs =
-  case validateUtf8More s bs of
-    (len, s') ->
-      let builder | len <= 0 = emptyStrictBuilder
-                  | otherwise = partUtf8ToStrictBuilder (partialUtf8CodePoint s)
-                    <> byteStringToStrictBuilder (B.take len bs)
-      in (builder, B.drop len bs, s')
+  validateUtf8MoreCont s bs $ \len ms ->
+    let builder | len <= 0 = emptyStrictBuilder
+                | otherwise = partUtf8ToStrictBuilder (partialUtf8CodePoint s)
+                  <> byteStringToStrictBuilder (B.take len bs)
+    in (builder, B.drop len bs, ms)
 
 -- | Decode a chunk of UTF-8 text. To be continued with 'decodeUtf8More'.
 --
@@ -467,14 +478,16 @@ decodeUtf8Chunk = decodeUtf8More startUtf8State
 -- value. The bytes are the positions from 'errStart' (inclusive) to
 -- 'errEnd' (exclusive). Any substitute characters are pushed onto the
 -- supplied 'TextDataStack' argument.
-skipIncomplete :: (Word8 -> Maybe Char) -> Utf8State -> StrictBuilder
-skipIncomplete onErr s =
+{-# INLINE skipIncomplete #-}
+skipIncomplete :: OnDecodeError -> String -> Utf8State -> StrictBuilder
+skipIncomplete onErr msg s =
   partUtf8Foldr
-    ((<>) . handleUtf8Error onErr)
+    ((<>) . handleUtf8Error onErr msg)
     emptyStrictBuilder (partialUtf8CodePoint s)
 
-handleUtf8Error :: (Word8 -> Maybe Char) -> Word8 -> StrictBuilder
-handleUtf8Error onErr w = case onErr w of
+{-# INLINE handleUtf8Error #-}
+handleUtf8Error :: OnDecodeError -> String -> Word8 -> StrictBuilder
+handleUtf8Error onErr msg w = case onErr msg (Just w) of
   Just c -> charToStrictBuilder c
   Nothing -> emptyStrictBuilder
 
@@ -486,21 +499,21 @@ decodeUtf8With1 ::
 #if defined(ASSERTS)
   HasCallStack =>
 #endif
-  (Word8 -> Maybe Char) -> ByteString -> Text
-decodeUtf8With1 onErr bs = case validateUtf8Chunk bs of
-    (len, Just s)
+  OnDecodeError -> String -> ByteString -> Text
+decodeUtf8With1 onErr msg bs = validateUtf8ChunkFrom 0 bs $ \len ms -> case ms of
+    Just s
       | len == B.length bs ->
         let !(SBS.SBS arr) = SBS.toShort bs in
         Text (A.ByteArray arr) 0 len
       | otherwise -> strictBuilderToText $
-          byteStringToStrictBuilder (B.take len bs) <> skipIncomplete onErr s
-    (len, Nothing) ->
-       let (builder, _, s) = decodeUtf8With2 onErr startUtf8State (B.drop (len + 1) bs) in
+          byteStringToStrictBuilder (B.take len bs) <> skipIncomplete onErr msg s
+    Nothing ->
+       let (builder, _, s) = decodeUtf8With2 onErr msg startUtf8State (B.drop (len + 1) bs) in
        strictBuilderToText $
          byteStringToStrictBuilder (B.take len bs) <>
-         handleUtf8Error onErr (B.index bs len) <>
+         handleUtf8Error onErr msg (B.index bs len) <>
          builder <>
-         skipIncomplete onErr s
+         skipIncomplete onErr msg s
 
 -- | Helper for 'Data.Text.Encoding.decodeUtf8With',
 -- 'Data.Text.Encoding.streamDecodeUtf8With', and lazy
@@ -512,28 +525,28 @@ decodeUtf8With2 ::
 #if defined(ASSERTS)
   HasCallStack =>
 #endif
-  (Word8 -> Maybe Char) -> Utf8State -> ByteString -> (StrictBuilder, ByteString, Utf8State)
-decodeUtf8With2 onErr s0 bs = loop s0 0 emptyStrictBuilder
+  OnDecodeError -> String -> Utf8State -> ByteString -> (StrictBuilder, ByteString, Utf8State)
+decodeUtf8With2 onErr msg s0 bs = loop s0 0 emptyStrictBuilder
   where
     loop s i !builder =
       let nonEmptyPrefix len = builder
             <> partUtf8ToStrictBuilder (partialUtf8CodePoint s)
             <> byteStringToStrictBuilder (B.take len (B.drop i bs))
-      in case validateUtf8More s (B.drop i bs) of
-        (len, Nothing) ->
+      in validateUtf8MoreCont s (B.drop i bs) $ \len ms -> case ms of
+        Nothing ->
           if len < 0
           then
             -- If the first byte cannot complete the partial code point in s,
             -- retry from startUtf8State.
-            let builder' = builder <> skipIncomplete onErr s
+            let builder' = builder <> skipIncomplete onErr msg s
             -- Note: loop is strict on builder, so if onErr raises an error it will
             -- be forced here, short-circuiting the loop as desired.
             in loop startUtf8State i builder'
           else
             let builder' = nonEmptyPrefix len
-                  <> handleUtf8Error onErr (B.index bs (i + len))
+                  <> handleUtf8Error onErr msg (B.index bs (i + len))
             in loop startUtf8State (i + len + 1) builder'
-        (len, Just s') ->
+        Just s' ->
           let builder' = if len <= 0 then builder else nonEmptyPrefix len
               undecoded = if B.length bs >= partUtf8Len (partialUtf8CodePoint s')
                 then B.drop (i + len) bs  -- Reuse bs if possible
diff --git a/src/Data/Text/Lazy/Encoding.hs b/src/Data/Text/Lazy/Encoding.hs
index 1def3f51..5b6607bf 100644
--- a/src/Data/Text/Lazy/Encoding.hs
+++ b/src/Data/Text/Lazy/Encoding.hs
@@ -110,10 +110,10 @@ decodeUtf8With onErr = loop TE.startUtf8State
   where
     chunkb builder t | TE.sbLength builder == 0 = t
                     | otherwise = Chunk (TE.strictBuilderToText builder) t
-    loop s (B.Chunk b bs) = case TE.decodeUtf8With2 onErr' s b of
+    loop s (B.Chunk b bs) = case TE.decodeUtf8With2 onErr msg s b of
       (builder, _, s') -> chunkb builder (loop s' bs)
-    loop s B.Empty = chunkb (TE.skipIncomplete onErr' s) Empty
-    onErr' = onErr "Data.Text.Internal.Encoding: Invalid UTF-8 stream" . Just
+    loop s B.Empty = chunkb (TE.skipIncomplete onErr msg s) Empty
+    msg = "Data.Text.Internal.Encoding: Invalid UTF-8 stream"
 
 -- | Decode a 'ByteString' containing UTF-8 encoded text that is known
 -- to be valid.

From b3ec653b7394ee9f861f30d9a67825b7dd3aa377 Mon Sep 17 00:00:00 2001
From: Li-yao Xia <lysxia@gmail.com>
Date: Mon, 6 Feb 2023 00:19:01 +0000
Subject: [PATCH 75/87] More doc, explicit exports

---
 src/Data/Text/Internal/Encoding.hs | 54 ++++++++++++++++++++++--------
 1 file changed, 40 insertions(+), 14 deletions(-)

diff --git a/src/Data/Text/Internal/Encoding.hs b/src/Data/Text/Internal/Encoding.hs
index 5236bf14..96e1e6c4 100644
--- a/src/Data/Text/Internal/Encoding.hs
+++ b/src/Data/Text/Internal/Encoding.hs
@@ -6,11 +6,27 @@
 {-# LANGUAGE ViewPatterns #-}
 
 -- |
+-- Module      : Data.Text.Internal.Builder
+-- License     : BSD-style (see LICENSE)
+-- Stability   : experimental
+--
+-- /Warning/: this is an internal module, and does not have a stable
+-- API or name. Functions in this module may not check or enforce
+-- preconditions expected by public modules. Use at your own risk!
+--
 -- Internals of "Data.Text.Encoding".
 --
--- If you'd like to depend on something from here that's not in "Data.Text.Encoding",
--- please request to have it exported!
-module Data.Text.Internal.Encoding where
+-- @since 2.0.2
+module Data.Text.Internal.Encoding
+  ( validateUtf8Chunk
+  , validateUtf8More
+  , decodeUtf8Chunk
+  , decodeUtf8More
+  , decodeUtf8With1
+  , decodeUtf8With2
+  , Utf8State
+  , startUtf8State
+  ) where
 
 #if defined(ASSERTS)
 import Control.Exception (assert)
@@ -20,7 +36,9 @@ import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
 import Data.Bits ((.&.), shiftL, shiftR)
 import Data.ByteString (ByteString)
 import Data.Functor (void)
+#if MIN_VERSION_base(4,11,0)
 import Data.Semigroup (Semigroup(..))
+#endif
 import Data.Word (Word32, Word8)
 import Foreign.C.Types (CSize(..))
 import Foreign.Ptr (Ptr)
@@ -43,8 +61,6 @@ import GHC.Stack (HasCallStack)
 
 #ifdef SIMDUTF
 import Foreign.C.Types (CInt(..))
-#elif !MIN_VERSION_bytestring(0,11,2)
-import qualified Data.ByteString.Unsafe as B
 #endif
 
 -- | State of decoding a 'ByteString' in UTF-8.
@@ -312,12 +328,12 @@ validateUtf8MoreCont st@(Utf8State s0 part) bs k
       | otherwise = k (- partUtf8Len part) (Just (Utf8State s (partUtf8UnsafeAppend part bs)))
 
 -- | A delayed representation of strict 'Text'.
-
--- For internal purposes, this is instead used as a delayed 'Array':
--- it may not actually be valid 'Text' (e.g., 'word8ToStrictBuilder',
--- 'byteStringToStrictBuilder', 'partUtf8ToStrictBuilder').
 --
 -- @since 2.0.2
+
+-- For internal purposes, this is instead abused as a delayed 'Array':
+-- it may not actually be valid 'Text' (e.g., 'word8ToStrictBuilder',
+-- 'byteStringToStrictBuilder', 'utf8StateToStrictBuilder').
 data StrictBuilder = StrictBuilder
   { sbLength :: {-# UNPACK #-} !Int
   , sbWrite :: forall s. A.MArray s -> Int -> ST s ()
@@ -367,6 +383,9 @@ partUtf8ToStrictBuilder :: PartialUtf8CodePoint -> StrictBuilder
 partUtf8ToStrictBuilder c =
   partUtf8Foldr ((<>) . word8ToStrictBuilder) emptyStrictBuilder c
 
+utf8StateToStrictBuilder :: Utf8State -> StrictBuilder
+utf8StateToStrictBuilder = partUtf8ToStrictBuilder . partialUtf8CodePoint
+
 -- | Use 'StrictBuilder' to build 'Text'.
 --
 -- @since 2.0.2
@@ -439,11 +458,13 @@ textToStrictBuilder (Text src srcOfs n) = StrictBuilder n (\dst dstOfs ->
 --     s2b (pre1 '<>' pre2) = s2b pre3
 --     ms2 = ms3
 --     @
+--
+-- @since 2.0.2
 decodeUtf8More :: Utf8State -> ByteString -> (StrictBuilder, ByteString, Maybe Utf8State)
 decodeUtf8More s bs =
   validateUtf8MoreCont s bs $ \len ms ->
     let builder | len <= 0 = emptyStrictBuilder
-                | otherwise = partUtf8ToStrictBuilder (partialUtf8CodePoint s)
+                | otherwise = utf8StateToStrictBuilder s
                   <> byteStringToStrictBuilder (B.take len bs)
     in (builder, B.drop len bs, ms)
 
@@ -491,10 +512,13 @@ handleUtf8Error onErr msg w = case onErr msg (Just w) of
   Just c -> charToStrictBuilder c
   Nothing -> emptyStrictBuilder
 
--- | Helper for 'decodeUtfWith'.
+-- | Helper for 'Data.Text.Encoding.decodeUtfWith'.
+--
+-- This could be shorter by calling decodeUtf8With2 directly, but we make the
+-- first call validateUtf8Chunk directly to return even faster in successful
+-- cases.
 --
--- This could be shorter by calling decodeUtf8With2 directly,
--- but we make the first call validateUtf8Chunk directly so that
+-- @since 2.0.2
 decodeUtf8With1 ::
 #if defined(ASSERTS)
   HasCallStack =>
@@ -521,6 +545,8 @@ decodeUtf8With1 onErr msg bs = validateUtf8ChunkFrom 0 bs $ \len ms -> case ms o
 -- which use an 'OnDecodeError' to process bad bytes.
 --
 -- See 'decodeUtf8Chunk' for a more flexible alternative.
+--
+-- @since 2.0.2
 decodeUtf8With2 ::
 #if defined(ASSERTS)
   HasCallStack =>
@@ -530,7 +556,7 @@ decodeUtf8With2 onErr msg s0 bs = loop s0 0 emptyStrictBuilder
   where
     loop s i !builder =
       let nonEmptyPrefix len = builder
-            <> partUtf8ToStrictBuilder (partialUtf8CodePoint s)
+            <> utf8StateToStrictBuilder s
             <> byteStringToStrictBuilder (B.take len (B.drop i bs))
       in validateUtf8MoreCont s (B.drop i bs) $ \len ms -> case ms of
         Nothing ->

From 7656b018bafd90a9e9de3a7a92567bbcce58cf1b Mon Sep 17 00:00:00 2001
From: Li-yao Xia <lysxia@gmail.com>
Date: Mon, 6 Feb 2023 00:51:34 +0000
Subject: [PATCH 76/87] Make StrictBuilder module, explicit exports

---
 src/Data/Text/Internal/Encoding.hs      | 135 ++++++++----------------
 src/Data/Text/Internal/StrictBuilder.hs | 107 +++++++++++++++++++
 src/Data/Text/Lazy/Encoding.hs          |   3 +-
 tests/Tests/Properties/Transcoding.hs   |  29 ++---
 text.cabal                              |   1 +
 5 files changed, 165 insertions(+), 110 deletions(-)

diff --git a/src/Data/Text/Internal/Encoding.hs b/src/Data/Text/Internal/Encoding.hs
index 96e1e6c4..df1fe903 100644
--- a/src/Data/Text/Internal/Encoding.hs
+++ b/src/Data/Text/Internal/Encoding.hs
@@ -26,17 +26,22 @@ module Data.Text.Internal.Encoding
   , decodeUtf8With2
   , Utf8State
   , startUtf8State
+  , StrictBuilder()
+  , strictBuilderToText
+  , textToStrictBuilder
+
+    -- * Internal
+  , skipIncomplete
+  , getCompleteLen
+  , getPartialUtf8
   ) where
 
 #if defined(ASSERTS)
 import Control.Exception (assert)
 #endif
-import Control.Monad.ST (ST, runST)
-import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
 import Data.Bits ((.&.), shiftL, shiftR)
 import Data.ByteString (ByteString)
-import Data.Functor (void)
-#if MIN_VERSION_base(4,11,0)
+#if !MIN_VERSION_base(4,11,0)
 import Data.Semigroup (Semigroup(..))
 #endif
 import Data.Word (Word32, Word8)
@@ -44,17 +49,18 @@ import Foreign.C.Types (CSize(..))
 import Foreign.Ptr (Ptr)
 import Foreign.Storable (pokeElemOff)
 import Data.Text.Encoding.Error (OnDecodeError)
-import Data.Text.Internal (Text(..), empty, safe)
+import Data.Text.Internal (Text(..))
 import Data.Text.Internal.ByteStringCompat (withBS)
 import Data.Text.Internal.Encoding.Utf8
-  (Utf8CodePointState, utf8StartState, updateUtf8State, isUtf8StateIsComplete, utf8Length)
+  (Utf8CodePointState, utf8StartState, updateUtf8State, isUtf8StateIsComplete)
+import Data.Text.Internal.StrictBuilder (StrictBuilder)
 import Data.Text.Internal.Unsafe (unsafeWithForeignPtr)
 import Data.Text.Unsafe (unsafeDupablePerformIO)
 import qualified Data.ByteString as B
 import qualified Data.ByteString.Internal as BI
 import qualified Data.ByteString.Short.Internal as SBS
 import qualified Data.Text.Array as A
-import qualified Data.Text.Internal.Unsafe.Char as Char
+import qualified Data.Text.Internal.StrictBuilder as SB
 #if defined(ASSERTS)
 import GHC.Stack (HasCallStack)
 #endif
@@ -63,6 +69,12 @@ import GHC.Stack (HasCallStack)
 import Foreign.C.Types (CInt(..))
 #endif
 
+strictBuilderToText :: StrictBuilder -> Text
+strictBuilderToText = SB.toText
+
+textToStrictBuilder :: Text -> StrictBuilder
+textToStrictBuilder = SB.fromText
+
 -- | State of decoding a 'ByteString' in UTF-8.
 -- Enables stream decoding ('validateUtf8Chunk', 'validateUtf8More',
 -- 'decodeUtf8Chunk', 'decodeUtf8More').
@@ -179,6 +191,14 @@ partUtf8ToByteString :: PartialUtf8CodePoint -> B.ByteString
 partUtf8ToByteString c = BI.unsafeCreate (partUtf8Len c) $ \ptr ->
   partUtf8Foldr (\w k i -> pokeElemOff ptr i w >> k (i+1)) (\_ -> pure ()) c 0
 
+-- Exported for testing.
+getCompleteLen :: Utf8State -> Int
+getCompleteLen = partUtf8CompleteLen . partialUtf8CodePoint
+
+-- | Exported for testing.
+getPartialUtf8 :: Utf8State -> B.ByteString
+getPartialUtf8 = partUtf8ToByteString . partialUtf8CodePoint
+
 #ifdef SIMDUTF
 foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8
     :: Ptr Word8 -> CSize -> IO CInt
@@ -327,89 +347,20 @@ validateUtf8MoreCont st@(Utf8State s0 part) bs k
           Just s' -> loop (i + 1) s'
       | otherwise = k (- partUtf8Len part) (Just (Utf8State s (partUtf8UnsafeAppend part bs)))
 
--- | A delayed representation of strict 'Text'.
---
--- @since 2.0.2
-
--- For internal purposes, this is instead abused as a delayed 'Array':
--- it may not actually be valid 'Text' (e.g., 'word8ToStrictBuilder',
--- 'byteStringToStrictBuilder', 'utf8StateToStrictBuilder').
-data StrictBuilder = StrictBuilder
-  { sbLength :: {-# UNPACK #-} !Int
-  , sbWrite :: forall s. A.MArray s -> Int -> ST s ()
-  }
-
--- | Concatenation of 'StrictBuilder' is right-biased:
--- the right builder will be run first. This allows a builder to
--- run tail-recursively when it was accumulated left-to-right.
-instance Semigroup StrictBuilder where
-  (<>) = appendRStrictBuilder
-
-instance Monoid StrictBuilder where
-  mempty = emptyStrictBuilder
-  mappend = (<>)
-
-emptyStrictBuilder :: StrictBuilder
-emptyStrictBuilder = StrictBuilder 0 (\_ _ -> pure ())
-
-appendRStrictBuilder :: StrictBuilder -> StrictBuilder -> StrictBuilder
-appendRStrictBuilder (StrictBuilder 0 _) b2 = b2
-appendRStrictBuilder b1 (StrictBuilder 0 _) = b1
-appendRStrictBuilder (StrictBuilder n1 write1) (StrictBuilder n2 write2) =
-  StrictBuilder (n1 + n2) (\dst ofs -> do
-    write2 dst (ofs + n1)
-    write1 dst ofs)
-
-copyFromByteString :: A.MArray s -> Int -> ByteString -> ST s ()
-copyFromByteString dst ofs src = withBS src $ \ srcFPtr len ->
-  unsafeIOToST $ unsafeWithForeignPtr srcFPtr $ \ srcPtr -> do
-    unsafeSTToIO $ A.copyFromPointer dst ofs srcPtr len
-
-byteStringToStrictBuilder :: ByteString -> StrictBuilder
-byteStringToStrictBuilder bs =
-  StrictBuilder (B.length bs) (\dst ofs -> copyFromByteString dst ofs bs)
-
-{-# INLINE charToStrictBuilder #-}
-charToStrictBuilder :: Char -> StrictBuilder
-charToStrictBuilder c =
-  StrictBuilder (utf8Length c) (\dst ofs -> void (Char.unsafeWrite dst ofs (safe c)))
-
-word8ToStrictBuilder :: Word8 -> StrictBuilder
-word8ToStrictBuilder !w =
-  StrictBuilder 1 (\dst ofs -> A.unsafeWrite dst ofs w)
-
 -- Eta-expanded to inline partUtf8Foldr
 partUtf8ToStrictBuilder :: PartialUtf8CodePoint -> StrictBuilder
 partUtf8ToStrictBuilder c =
-  partUtf8Foldr ((<>) . word8ToStrictBuilder) emptyStrictBuilder c
+  partUtf8Foldr ((<>) . SB.unsafeFromWord8) mempty c
 
 utf8StateToStrictBuilder :: Utf8State -> StrictBuilder
 utf8StateToStrictBuilder = partUtf8ToStrictBuilder . partialUtf8CodePoint
 
--- | Use 'StrictBuilder' to build 'Text'.
---
--- @since 2.0.2
-strictBuilderToText :: StrictBuilder -> Text
-strictBuilderToText (StrictBuilder 0 _) = empty
-strictBuilderToText (StrictBuilder n write) = runST (do
-  dst <- A.new n
-  write dst 0
-  arr <- A.unsafeFreeze dst
-  pure (Text arr 0 n))
-
--- | Copy 'Text' in a 'StrictBuilder'
---
--- @since 2.0.2
-textToStrictBuilder :: Text -> StrictBuilder
-textToStrictBuilder (Text src srcOfs n) = StrictBuilder n (\dst dstOfs ->
-  A.copyI n dst dstOfs src srcOfs)
-
 -- | Decode another chunk in an ongoing UTF-8 stream.
 --
 -- Returns a triple:
 --
 -- 1. A 'StrictBuilder' for the decoded chunk of text. You can accumulate
---    chunks with @('<>')@ or output them with 'strictBuilderToText'.
+--    chunks with @('<>')@ or output them with 'SB.toText'.
 -- 2. The undecoded remainder of the given chunk, for diagnosing errors
 --    and resuming (presumably after skipping some bytes).
 -- 3. 'Just' the new state, or 'Nothing' if an invalid byte was encountered
@@ -434,7 +385,7 @@ textToStrictBuilder (Text src srcOfs n) = StrictBuilder n (\dst dstOfs ->
 --     where
 --
 --     @
---     s2b = 'Data.Text.Encoding.encodeUtf8' . 'Data.Text.Encoding.strictBuilderToText'
+--     s2b = 'Data.Text.Encoding.encodeUtf8' . 'Data.Text.Encoding.toText'
 --     p2b = 'Data.Text.Internal.Encoding.partUtf8ToByteString'
 --     @
 --
@@ -463,9 +414,9 @@ textToStrictBuilder (Text src srcOfs n) = StrictBuilder n (\dst dstOfs ->
 decodeUtf8More :: Utf8State -> ByteString -> (StrictBuilder, ByteString, Maybe Utf8State)
 decodeUtf8More s bs =
   validateUtf8MoreCont s bs $ \len ms ->
-    let builder | len <= 0 = emptyStrictBuilder
+    let builder | len <= 0 = mempty
                 | otherwise = utf8StateToStrictBuilder s
-                  <> byteStringToStrictBuilder (B.take len bs)
+                  <> SB.unsafeFromByteString (B.take len bs)
     in (builder, B.drop len bs, ms)
 
 -- | Decode a chunk of UTF-8 text. To be continued with 'decodeUtf8More'.
@@ -487,7 +438,7 @@ decodeUtf8More s bs =
 -- @builder@ is a prefix and @rest@ is a suffix of @chunk@.
 --
 -- @
--- 'Data.Text.Encoding.encodeUtf8' ('strictBuilderToText' builder) '<>' rest = chunk
+-- 'Data.Text.Encoding.encodeUtf8' ('Data.Text.Encoding.strictBuilderToText' builder) '<>' rest = chunk
 -- @
 --
 -- @since 2.0.2
@@ -499,18 +450,20 @@ decodeUtf8Chunk = decodeUtf8More startUtf8State
 -- value. The bytes are the positions from 'errStart' (inclusive) to
 -- 'errEnd' (exclusive). Any substitute characters are pushed onto the
 -- supplied 'TextDataStack' argument.
+--
+-- Exported for lazy 'Data.Text.Lazy.Encoding.decodeUtf8With'.
 {-# INLINE skipIncomplete #-}
 skipIncomplete :: OnDecodeError -> String -> Utf8State -> StrictBuilder
 skipIncomplete onErr msg s =
   partUtf8Foldr
     ((<>) . handleUtf8Error onErr msg)
-    emptyStrictBuilder (partialUtf8CodePoint s)
+    mempty (partialUtf8CodePoint s)
 
 {-# INLINE handleUtf8Error #-}
 handleUtf8Error :: OnDecodeError -> String -> Word8 -> StrictBuilder
 handleUtf8Error onErr msg w = case onErr msg (Just w) of
-  Just c -> charToStrictBuilder c
-  Nothing -> emptyStrictBuilder
+  Just c -> SB.fromChar c
+  Nothing -> mempty
 
 -- | Helper for 'Data.Text.Encoding.decodeUtfWith'.
 --
@@ -529,12 +482,12 @@ decodeUtf8With1 onErr msg bs = validateUtf8ChunkFrom 0 bs $ \len ms -> case ms o
       | len == B.length bs ->
         let !(SBS.SBS arr) = SBS.toShort bs in
         Text (A.ByteArray arr) 0 len
-      | otherwise -> strictBuilderToText $
-          byteStringToStrictBuilder (B.take len bs) <> skipIncomplete onErr msg s
+      | otherwise -> SB.toText $
+          SB.unsafeFromByteString (B.take len bs) <> skipIncomplete onErr msg s
     Nothing ->
        let (builder, _, s) = decodeUtf8With2 onErr msg startUtf8State (B.drop (len + 1) bs) in
-       strictBuilderToText $
-         byteStringToStrictBuilder (B.take len bs) <>
+       SB.toText $
+         SB.unsafeFromByteString (B.take len bs) <>
          handleUtf8Error onErr msg (B.index bs len) <>
          builder <>
          skipIncomplete onErr msg s
@@ -552,12 +505,12 @@ decodeUtf8With2 ::
   HasCallStack =>
 #endif
   OnDecodeError -> String -> Utf8State -> ByteString -> (StrictBuilder, ByteString, Utf8State)
-decodeUtf8With2 onErr msg s0 bs = loop s0 0 emptyStrictBuilder
+decodeUtf8With2 onErr msg s0 bs = loop s0 0 mempty
   where
     loop s i !builder =
       let nonEmptyPrefix len = builder
             <> utf8StateToStrictBuilder s
-            <> byteStringToStrictBuilder (B.take len (B.drop i bs))
+            <> SB.unsafeFromByteString (B.take len (B.drop i bs))
       in validateUtf8MoreCont s (B.drop i bs) $ \len ms -> case ms of
         Nothing ->
           if len < 0
diff --git a/src/Data/Text/Internal/StrictBuilder.hs b/src/Data/Text/Internal/StrictBuilder.hs
index e69de29b..a8cdcad0 100644
--- a/src/Data/Text/Internal/StrictBuilder.hs
+++ b/src/Data/Text/Internal/StrictBuilder.hs
@@ -0,0 +1,107 @@
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE RankNTypes #-}
+
+-- |
+-- Module      : Data.Text.Internal.Builder
+-- License     : BSD-style (see LICENSE)
+-- Stability   : experimental
+--
+-- /Warning/: this is an internal module, and does not have a stable
+-- API or name. Functions in this module may not check or enforce
+-- preconditions expected by public modules. Use at your own risk!
+--
+-- @since 2.0.2
+
+module Data.Text.Internal.StrictBuilder
+  ( StrictBuilder(..)
+  , toText
+  , fromChar
+  , fromText
+
+    -- * Unsafe
+  , unsafeFromByteString
+  , unsafeFromWord8
+  ) where
+
+import Control.Monad.ST (ST, runST)
+import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
+import Data.Functor (void)
+import Data.Word (Word8)
+import Data.ByteString (ByteString)
+import Data.Text.Internal (Text(..), empty, safe)
+import Data.Text.Internal.ByteStringCompat (withBS)
+import Data.Text.Internal.Encoding.Utf8 (utf8Length)
+import Data.Text.Internal.Unsafe (unsafeWithForeignPtr)
+import qualified Data.ByteString as B
+import qualified Data.Text.Array as A
+import qualified Data.Text.Internal.Unsafe.Char as Char
+
+-- | A delayed representation of strict 'Text'.
+--
+-- @since 2.0.2
+
+-- For internal purposes, this is instead abused as a delayed 'Array':
+-- it may not actually be valid 'Text' (e.g., 'unsafeFromWord8',
+-- 'unsafeFromByteString').
+data StrictBuilder = StrictBuilder
+  { sbLength :: {-# UNPACK #-} !Int
+  , sbWrite :: forall s. A.MArray s -> Int -> ST s ()
+  }
+
+-- | Use 'StrictBuilder' to build 'Text'.
+--
+-- @since 2.0.2
+toText :: StrictBuilder -> Text
+toText (StrictBuilder 0 _) = empty
+toText (StrictBuilder n write) = runST (do
+  dst <- A.new n
+  write dst 0
+  arr <- A.unsafeFreeze dst
+  pure (Text arr 0 n))
+
+-- | Concatenation of 'StrictBuilder' is right-biased:
+-- the right builder will be run first. This allows a builder to
+-- run tail-recursively when it was accumulated left-to-right.
+instance Semigroup StrictBuilder where
+  (<>) = appendRStrictBuilder
+
+instance Monoid StrictBuilder where
+  mempty = emptyStrictBuilder
+  mappend = (<>)
+
+emptyStrictBuilder :: StrictBuilder
+emptyStrictBuilder = StrictBuilder 0 (\_ _ -> pure ())
+
+appendRStrictBuilder :: StrictBuilder -> StrictBuilder -> StrictBuilder
+appendRStrictBuilder (StrictBuilder 0 _) b2 = b2
+appendRStrictBuilder b1 (StrictBuilder 0 _) = b1
+appendRStrictBuilder (StrictBuilder n1 write1) (StrictBuilder n2 write2) =
+  StrictBuilder (n1 + n2) (\dst ofs -> do
+    write2 dst (ofs + n1)
+    write1 dst ofs)
+
+copyFromByteString :: A.MArray s -> Int -> ByteString -> ST s ()
+copyFromByteString dst ofs src = withBS src $ \ srcFPtr len ->
+  unsafeIOToST $ unsafeWithForeignPtr srcFPtr $ \ srcPtr -> do
+    unsafeSTToIO $ A.copyFromPointer dst ofs srcPtr len
+
+-- | Copy a 'ByteString'. Note: This may not be valid text.
+unsafeFromByteString :: ByteString -> StrictBuilder
+unsafeFromByteString bs =
+  StrictBuilder (B.length bs) (\dst ofs -> copyFromByteString dst ofs bs)
+
+{-# INLINE fromChar #-}
+fromChar :: Char -> StrictBuilder
+fromChar c =
+  StrictBuilder (utf8Length c) (\dst ofs -> void (Char.unsafeWrite dst ofs (safe c)))
+
+unsafeFromWord8 :: Word8 -> StrictBuilder
+unsafeFromWord8 !w =
+  StrictBuilder 1 (\dst ofs -> A.unsafeWrite dst ofs w)
+
+-- | Copy 'Text' in a 'StrictBuilder'
+--
+-- @since 2.0.2
+fromText :: Text -> StrictBuilder
+fromText (Text src srcOfs n) = StrictBuilder n (\dst dstOfs ->
+  A.copyI n dst dstOfs src srcOfs)
diff --git a/src/Data/Text/Lazy/Encoding.hs b/src/Data/Text/Lazy/Encoding.hs
index 5b6607bf..06a06224 100644
--- a/src/Data/Text/Lazy/Encoding.hs
+++ b/src/Data/Text/Lazy/Encoding.hs
@@ -69,6 +69,7 @@ import qualified Data.Text.Encoding as TE
 import qualified Data.Text.Internal.Encoding as TE
 import qualified Data.Text.Internal.Lazy.Encoding.Fusion as E
 import qualified Data.Text.Internal.Lazy.Fusion as F
+import qualified Data.Text.Internal.StrictBuilder as SB
 import Data.Text.Unsafe (unsafeDupablePerformIO)
 
 -- $strict
@@ -108,7 +109,7 @@ decodeLatin1 = foldr (chunk . TE.decodeLatin1) empty . B.toChunks
 decodeUtf8With :: OnDecodeError -> B.ByteString -> Text
 decodeUtf8With onErr = loop TE.startUtf8State
   where
-    chunkb builder t | TE.sbLength builder == 0 = t
+    chunkb builder t | SB.sbLength builder == 0 = t
                     | otherwise = Chunk (TE.strictBuilderToText builder) t
     loop s (B.Chunk b bs) = case TE.decodeUtf8With2 onErr msg s b of
       (builder, _, s') -> chunkb builder (loop s' bs)
diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs
index 7bf2067f..49ce4f2e 100644
--- a/tests/Tests/Properties/Transcoding.hs
+++ b/tests/Tests/Properties/Transcoding.hs
@@ -1,19 +1,20 @@
 -- | Tests for encoding and decoding
 
-{-# LANGUAGE OverloadedStrings, ScopedTypeVariables #-}
+{-# LANGUAGE CPP, OverloadedStrings, ScopedTypeVariables #-}
 {-# OPTIONS_GHC -fno-warn-missing-signatures #-}
 {-# LANGUAGE BangPatterns #-}
 module Tests.Properties.Transcoding
     ( testTranscoding
     ) where
 
--- import Debug.Trace (trace)
 import Prelude hiding (head, tail)
 import Data.Bits ((.&.), shiftR)
 import Data.Char (chr, ord)
 import Data.Functor (void)
 import Data.Maybe (isNothing)
+#if !MIN_VERSION_base(4,11,0)
 import Data.Semigroup ((<>))
+#endif
 import Test.QuickCheck hiding ((.&.))
 import Test.Tasty (TestTree, testGroup)
 import Test.Tasty.QuickCheck (testProperty)
@@ -102,8 +103,7 @@ t_pn_utf8_2 = do
   pure ()
 t_pn_utf8_3 = do
   s1 <- testValidateUtf8 [0xc2] 0
-  let c = E.partialUtf8CodePoint s1
-  assertBool "PartialUtf8 must be partial" $ E.partUtf8Len c < E.partUtf8CompleteLen c
+  assertBool "PartialUtf8 must be partial" $ B.length (E.getPartialUtf8 s1) < E.getCompleteLen s1
   testValidateUtf8MoreFail s1 [0x80, 0x80] 1
 
 -- Precondition: (i, ms1) = E.validateUtf8More s chunk
@@ -111,7 +111,7 @@ t_pn_utf8_3 = do
 -- The index points to the end of the longest valid prefix
 -- of prechunk `B.append` chunk
 pre_validateUtf8More_validPrefix s chunk i =
-  let prechunk = E.partUtf8ToByteString (E.partialUtf8CodePoint s) in
+  let prechunk = E.getPartialUtf8 s in
   -- Note: i <= 0 implies take i = id
   let (j, ms2) = E.validateUtf8Chunk (B.take (B.length prechunk + i) (prechunk `B.append` chunk)) in
   counterexample (show prechunk) $
@@ -351,11 +351,6 @@ t_decode_utf8_lenient :: Property
 t_decode_utf8_lenient = forAllShrinkShow arbitrary shrink (show . BL.toChunks) $ \bs ->
     decodeLL bs === (TL.fromStrict . decodeL . B.concat . BL.toChunks) bs
 
--- t_decode_utf8_lenient =
---   let !res = trace "HEREHEREHERE" $
---         EL.decodeUtf8With E.lenientDecode (BS.Chunk (B.pack [0xe1]) (BS.Chunk (B.pack [0xa0]) BS.Empty)) === TL.fromStrict (T.pack "\xFFFD\xFFFD") in
---   trace "THERETHERETHERE" res
-
 -- See http://unicode.org/faq/utf_bom.html#gen8
 -- A sequence such as <110xxxxx2 0xxxxxxx2> is illegal ...
 -- When faced with this illegal byte sequence ... a UTF-8 conformant process
@@ -401,8 +396,7 @@ testDecodeUtf8With k s xs expected =
       if T.null txt then
         bs @?= xs'
       else
-        let c = E.partUtf8ToByteString (E.partialUtf8CodePoint s)
-        in E.encodeUtf8 txt `B.append` bs @?= c `B.append` xs'
+        E.encodeUtf8 txt `B.append` bs @?= E.getPartialUtf8 s `B.append` xs'
       k s'
 
 testDecodeUtf8 :: E.Utf8State -> [Word8] -> T.Text -> IO E.Utf8State
@@ -417,8 +411,7 @@ testDecodeUtf8Fail = testDecodeUtf8With (\ms -> case ms of
 
 t_decode_chunk1 = do
   s1 <- testDecodeUtf8 E.startUtf8State [0xc2] ""
-  let c1 = E.partialUtf8CodePoint s1
-  E.partUtf8Len c1 @?= 1
+  B.length (E.getPartialUtf8 s1) @?= 1
   testDecodeUtf8Fail s1 [0x80, 0x80] "\128"
 
 t_decode_chunk2 = do
@@ -464,12 +457,12 @@ pre_decodeUtf8More_suffix s chunk =
 
 -- Decoding chunks separately is equivalent to decoding their concatenation.
 pre_decodeUtf8More_append s chunk1 chunk2 =
-  let (pre1, suf1, ms1) = E.decodeUtf8More s chunk1 in
+  let (pre1, _, ms1) = E.decodeUtf8More s chunk1 in
   case ms1 of
     Nothing -> discard
     Just s1 ->
-      let (pre2, suf2, ms2) = E.decodeUtf8More s1 chunk2 in
-      let (pre3, suf3, ms3) = E.decodeUtf8More s (chunk1 `B.append` chunk2) in
+      let (pre2, _, ms2) = E.decodeUtf8More s1 chunk2 in
+      let (pre3, _, ms3) = E.decodeUtf8More s (chunk1 `B.append` chunk2) in
       (s2b (pre1 <> pre2), ms2) === (s2b pre3, ms3)
 
 -- Properties for any chunk
@@ -487,7 +480,7 @@ t_decodeUtf8More2 = property $ do
     pre_decodeUtf8More_append s chunk chunk2
 
 s2b = E.encodeUtf8 . E.strictBuilderToText
-p2b = E.partUtf8ToByteString . E.partialUtf8CodePoint
+p2b = E.getPartialUtf8
 
 testTranscoding :: TestTree
 testTranscoding =
diff --git a/text.cabal b/text.cabal
index 0d420fa2..05946247 100644
--- a/text.cabal
+++ b/text.cabal
@@ -169,6 +169,7 @@ library
     Data.Text.Internal.Private
     Data.Text.Internal.Read
     Data.Text.Internal.Search
+    Data.Text.Internal.StrictBuilder
     Data.Text.Internal.Unsafe
     Data.Text.Internal.Unsafe.Char
     Data.Text.Lazy

From f663a6e64beb726a23ada780798afa0fb094ac87 Mon Sep 17 00:00:00 2001
From: Li-yao Xia <lysxia@gmail.com>
Date: Mon, 6 Feb 2023 01:20:57 +0000
Subject: [PATCH 77/87] Revert changes to Data.Text.Internal.Encoding.Utf8

---
 changelog.md                            |  6 +++
 src/Data/Text/Internal/Encoding.hs      | 29 ++++++-----
 src/Data/Text/Internal/Encoding/Utf8.hs | 66 ++++++++++++++++++-------
 3 files changed, 69 insertions(+), 32 deletions(-)

diff --git a/changelog.md b/changelog.md
index e63283c1..2b712fc6 100644
--- a/changelog.md
+++ b/changelog.md
@@ -21,6 +21,12 @@
 * Fix quadratic slowdown when decoding invalid UTF-8 bytestrings
   (https://github.com/haskell/text/issues/495)
 
+* Add internal module `Data.Text.Internal.StrictBuilder`
+
+* Add internal module `Data.Text.Internal.Encoding`
+
+* Add `Data.Text.Internal.Encoding.Utf8.updateDecoderState`
+
 ### 2.0.1
 
 * Improve portability of C and C++ code.
diff --git a/src/Data/Text/Internal/Encoding.hs b/src/Data/Text/Internal/Encoding.hs
index df1fe903..95ec8f3c 100644
--- a/src/Data/Text/Internal/Encoding.hs
+++ b/src/Data/Text/Internal/Encoding.hs
@@ -52,7 +52,7 @@ import Data.Text.Encoding.Error (OnDecodeError)
 import Data.Text.Internal (Text(..))
 import Data.Text.Internal.ByteStringCompat (withBS)
 import Data.Text.Internal.Encoding.Utf8
-  (Utf8CodePointState, utf8StartState, updateUtf8State, isUtf8StateIsComplete)
+  (DecoderState, utf8AcceptState, utf8RejectState, updateDecoderState)
 import Data.Text.Internal.StrictBuilder (StrictBuilder)
 import Data.Text.Internal.Unsafe (unsafeWithForeignPtr)
 import Data.Text.Unsafe (unsafeDupablePerformIO)
@@ -92,17 +92,17 @@ textToStrictBuilder = SB.fromText
 -- @since 2.0.2
 data Utf8State = Utf8State
   { -- | State of the UTF-8 state machine.
-    utf8CodePointState :: {-# UNPACK #-} !Utf8CodePointState
+    utf8CodePointState :: {-# UNPACK #-} !DecoderState
     -- | Bytes of the currently incomplete code point (if any).
   , partialUtf8CodePoint :: {-# UNPACK #-} !PartialUtf8CodePoint
   }
-  deriving (Eq, Ord, Show)
+  deriving (Eq, Show)
 
 -- | Initial 'Utf8State'.
 --
 -- @since 2.0.2
 startUtf8State :: Utf8State
-startUtf8State = Utf8State utf8StartState partUtf8Empty
+startUtf8State = Utf8State utf8AcceptState partUtf8Empty
 
 -- | Prefix of a UTF-8 code point encoded in 4 bytes,
 -- possibly empty.
@@ -117,7 +117,7 @@ startUtf8State = Utf8State utf8StartState partUtf8Empty
 --
 -- @since 2.0.2
 newtype PartialUtf8CodePoint = PartialUtf8CodePoint Word32
-  deriving (Eq, Ord, Show)
+  deriving (Eq, Show)
 
 -- | Empty prefix.
 partUtf8Empty :: PartialUtf8CodePoint
@@ -273,15 +273,14 @@ validateUtf8ChunkFrom ofs bs k
     -- Ideally the primitives 'B.isValidUtf8' or 'c_is_valid_utf8' should give us
     -- indices to let us avoid this function.
     slowValidateUtf8ChunkFrom :: Int -> r
-    slowValidateUtf8ChunkFrom ofs1 = slowLoop ofs1 ofs1 utf8StartState
+    slowValidateUtf8ChunkFrom ofs1 = slowLoop ofs1 ofs1 utf8AcceptState
 
     slowLoop !utf8End i s
       | i < B.length bs =
-          case updateUtf8State (B.index bs i) s of
-            Just s' ->
-              let utf8End' = if isUtf8StateIsComplete s' then i + 1 else utf8End
-              in slowLoop utf8End' (i + 1) s'
-            Nothing -> k utf8End Nothing
+          case updateDecoderState (B.index bs i) s of
+            s' | s' == utf8RejectState -> k utf8End Nothing
+               | s' == utf8AcceptState -> slowLoop (i + 1) (i + 1) s'
+               | otherwise -> slowLoop utf8End (i + 1) s'
       | otherwise = k utf8End (Just (Utf8State s (partUtf8UnsafeAppend partUtf8Empty (B.drop utf8End bs))))
 
 -- | Validate another 'ByteString' chunk in an ongoing stream of UTF-8-encoded text.
@@ -340,11 +339,11 @@ validateUtf8MoreCont st@(Utf8State s0 part) bs k
     -- Complete an incomplete code point (if there is one)
     -- and then jump to validateUtf8ChunkFrom
     loop !i s
-      | isUtf8StateIsComplete s = validateUtf8ChunkFrom i bs k
+      | s == utf8AcceptState = validateUtf8ChunkFrom i bs k
       | i < len =
-        case updateUtf8State (B.index bs i) s of
-          Nothing -> k (- partUtf8Len part) Nothing
-          Just s' -> loop (i + 1) s'
+        case updateDecoderState (B.index bs i) s of
+          s' | s' == utf8RejectState -> k (- partUtf8Len part) Nothing
+             | otherwise -> loop (i + 1) s'
       | otherwise = k (- partUtf8Len part) (Just (Utf8State s (partUtf8UnsafeAppend part bs)))
 
 -- Eta-expanded to inline partUtf8Foldr
diff --git a/src/Data/Text/Internal/Encoding/Utf8.hs b/src/Data/Text/Internal/Encoding/Utf8.hs
index 148995a8..c8357677 100644
--- a/src/Data/Text/Internal/Encoding/Utf8.hs
+++ b/src/Data/Text/Internal/Encoding/Utf8.hs
@@ -34,10 +34,14 @@ module Data.Text.Internal.Encoding.Utf8
     , validate3
     , validate4
     -- * Naive decoding
-    , Utf8CodePointState
-    , utf8StartState
-    , updateUtf8State
-    , isUtf8StateIsComplete
+    , DecoderState(..)
+    , utf8AcceptState
+    , utf8RejectState
+    , updateDecoderState
+    , DecoderResult(..)
+    , CodePoint(..)
+    , utf8DecodeStart
+    , utf8DecodeContinue
     ) where
 
 #if defined(ASSERTS)
@@ -45,7 +49,7 @@ import Control.Exception (assert)
 import GHC.Stack (HasCallStack)
 #endif
 import Data.Bits (Bits(..), FiniteBits(..))
-import Data.Char (ord)
+import Data.Char (ord, chr)
 import GHC.Exts
 import GHC.Word (Word8(..))
 
@@ -242,14 +246,17 @@ byteToClass n = ByteClass (W8# el#)
     table# :: Addr#
     table# = "\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\b\b\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\n\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\EOT\ETX\ETX\v\ACK\ACK\ACK\ENQ\b\b\b\b\b\b\b\b\b\b\b"#
 
-newtype Utf8CodePointState = Utf8CodePointState Word8
-  deriving (Eq, Ord, Show)
+newtype DecoderState = DecoderState Word8
+  deriving (Eq, Show)
 
-utf8StartState :: Utf8CodePointState
-utf8StartState = Utf8CodePointState 0
+utf8AcceptState :: DecoderState
+utf8AcceptState = DecoderState 0
 
-transitionUtf8State :: ByteClass -> Utf8CodePointState -> Utf8CodePointState
-transitionUtf8State (ByteClass c) (Utf8CodePointState s) = Utf8CodePointState (W8# el#)
+utf8RejectState :: DecoderState
+utf8RejectState = DecoderState 12
+
+updateState :: ByteClass -> DecoderState -> DecoderState
+updateState (ByteClass c) (DecoderState s) = DecoderState (W8# el#)
   where
     !(I# n#) = word8ToInt (c + s)
     el# = indexWord8OffAddr# table# n#
@@ -257,10 +264,35 @@ transitionUtf8State (ByteClass c) (Utf8CodePointState s) = Utf8CodePointState (W
     table# :: Addr#
     table# = "\NUL\f\CAN$<`T\f\f\f0H\f\f\f\f\f\f\f\f\f\f\f\f\f\NUL\f\f\f\f\f\NUL\f\NUL\f\f\f\CAN\f\f\f\f\f\CAN\f\CAN\f\f\f\f\f\f\f\f\f\CAN\f\f\f\f\f\CAN\f\f\f\f\f\f\f\CAN\f\f\f\f\f\f\f\f\f$\f$\f\f\f$\f\f\f\f\f$\f$\f\f\f$\f\f\f\f\f\f\f\f\f\f"#
 
-updateUtf8State :: Word8 -> Utf8CodePointState -> Maybe Utf8CodePointState
-updateUtf8State w st = case transitionUtf8State (byteToClass w) st of
-  Utf8CodePointState 12 -> Nothing
-  st' -> Just st'
+updateDecoderState :: Word8 -> DecoderState -> DecoderState
+updateDecoderState b s = updateState (byteToClass b) s
+
+newtype CodePoint = CodePoint Int
 
-isUtf8StateIsComplete :: Utf8CodePointState -> Bool
-isUtf8StateIsComplete (Utf8CodePointState s) = s == 0
+-- | @since 2.0
+data DecoderResult
+  = Accept !Char
+  | Incomplete !DecoderState !CodePoint
+  | Reject
+
+-- | @since 2.0
+utf8DecodeStart :: Word8 -> DecoderResult
+utf8DecodeStart !w
+  | st == utf8AcceptState = Accept (chr (word8ToInt w))
+  | st == utf8RejectState = Reject
+  | otherwise             = Incomplete st (CodePoint cp)
+  where
+    cl@(ByteClass cl') = byteToClass w
+    st = updateState cl utf8AcceptState
+    cp = word8ToInt $ (0xff `unsafeShiftR` word8ToInt cl') .&. w
+
+-- | @since 2.0
+utf8DecodeContinue :: Word8 -> DecoderState -> CodePoint -> DecoderResult
+utf8DecodeContinue !w !st (CodePoint !cp)
+  | st' == utf8AcceptState = Accept (chr cp')
+  | st' == utf8RejectState = Reject
+  | otherwise              = Incomplete st' (CodePoint cp')
+  where
+    cl  = byteToClass w
+    st' = updateState cl st
+    cp' = (cp `shiftL` 6) .|. word8ToInt (w .&. 0x3f)

From bc659212ee2e916f97c749f8e5264f09153344be Mon Sep 17 00:00:00 2001
From: Li-yao Xia <lysxia@gmail.com>
Date: Mon, 6 Feb 2023 01:32:24 +0000
Subject: [PATCH 78/87] Docs

---
 src/Data/Text/Encoding.hs          |  4 ++--
 src/Data/Text/Internal/Encoding.hs | 21 ++++++++++-----------
 2 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index ecd89948..498ff889 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -547,12 +547,12 @@ encodeUtf32BE txt = E.unstream (E.restreamUtf32BE (F.stream txt))
 -- The functions 'decodeUtf8Chunk' and 'decodeUtf8More' provide more
 -- control for error-handling and streaming.
 --
--- - You get an UTF-8 prefix of the given 'ByteString' up to the next error.
+-- - Those functions return an UTF-8 prefix of the given 'ByteString' up to the next error.
 --   For example this lets you insert or delete arbitrary text, or do some
 --   stateful operations before resuming, such as keeping track of error locations.
 --   In contrast, the older stream-oriented interface only lets you substitute
 --   a single fixed 'Char' for each invalid byte in 'OnDecodeError'.
--- - The prefix is encoded as a 'StrictBuilder', so you can accumulate chunks
+-- - That prefix is encoded as a 'StrictBuilder', so you can accumulate chunks
 --   before doing the copying work to construct a 'Text', or you can
 --   output decoded fragments immediately as a lazy 'Data.Text.Lazy.Text'.
 --
diff --git a/src/Data/Text/Internal/Encoding.hs b/src/Data/Text/Internal/Encoding.hs
index 95ec8f3c..6bec2215 100644
--- a/src/Data/Text/Internal/Encoding.hs
+++ b/src/Data/Text/Internal/Encoding.hs
@@ -76,7 +76,7 @@ textToStrictBuilder :: Text -> StrictBuilder
 textToStrictBuilder = SB.fromText
 
 -- | State of decoding a 'ByteString' in UTF-8.
--- Enables stream decoding ('validateUtf8Chunk', 'validateUtf8More',
+-- Enables incremental decoding ('validateUtf8Chunk', 'validateUtf8More',
 -- 'decodeUtf8Chunk', 'decodeUtf8More').
 
 -- Internal invariant:
@@ -444,13 +444,12 @@ decodeUtf8More s bs =
 decodeUtf8Chunk :: ByteString -> (StrictBuilder, ByteString, Maybe Utf8State)
 decodeUtf8Chunk = decodeUtf8More startUtf8State
 
--- | Call an error handler with the give 'String' message for each byte
--- in given 'ByteString' and lead data in the given 'Utf8State'
--- value. The bytes are the positions from 'errStart' (inclusive) to
--- 'errEnd' (exclusive). Any substitute characters are pushed onto the
--- supplied 'TextDataStack' argument.
+-- | Call the error handler on each byte of the partial code point stored in
+-- 'Utf8State' and append the results.
 --
--- Exported for lazy 'Data.Text.Lazy.Encoding.decodeUtf8With'.
+-- Exported for use in lazy 'Data.Text.Lazy.Encoding.decodeUtf8With'.
+--
+-- @since 2.0.2
 {-# INLINE skipIncomplete #-}
 skipIncomplete :: OnDecodeError -> String -> Utf8State -> StrictBuilder
 skipIncomplete onErr msg s =
@@ -464,13 +463,13 @@ handleUtf8Error onErr msg w = case onErr msg (Just w) of
   Just c -> SB.fromChar c
   Nothing -> mempty
 
--- | Helper for 'Data.Text.Encoding.decodeUtfWith'.
+-- | Helper for 'Data.Text.Encoding.decodeUtf8With'.
 --
--- This could be shorter by calling decodeUtf8With2 directly, but we make the
+-- @since 2.0.2
+
+-- This could be shorter by calling 'decodeUtf8With2' directly, but we make the
 -- first call validateUtf8Chunk directly to return even faster in successful
 -- cases.
---
--- @since 2.0.2
 decodeUtf8With1 ::
 #if defined(ASSERTS)
   HasCallStack =>

From 28a7460555485ba806f440dd4c59b86b5e181204 Mon Sep 17 00:00:00 2001
From: Li-yao Xia <lysxia@gmail.com>
Date: Mon, 6 Feb 2023 02:24:01 +0000
Subject: [PATCH 79/87] Fix short-circuit

---
 src/Data/Text/Internal/Encoding.hs | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/src/Data/Text/Internal/Encoding.hs b/src/Data/Text/Internal/Encoding.hs
index 6bec2215..1a90e626 100644
--- a/src/Data/Text/Internal/Encoding.hs
+++ b/src/Data/Text/Internal/Encoding.hs
@@ -231,20 +231,16 @@ foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8
 validateUtf8Chunk :: ByteString -> (Int, Maybe Utf8State)
 validateUtf8Chunk bs = validateUtf8ChunkFrom 0 bs (,)
 
--- Add an offset to the index returned by 'validateUtf8More'.
+-- Assume bytes up to offset @ofs@ have been validated already.
 --
--- @
--- validateUtf8ChunkFrom n bs (,) = (first (+ n) . 'validateUtf8More' . 'B.drop' n) bs
--- @
---
--- CPS: inlining the continuation lets us avoid allocating a @Maybe@ in the
--- @decode...@ functions.
+-- Using CPS lets us inline the continuation and avoid allocating a @Maybe@
+-- in the @decode...@ functions.
 {-# INLINE validateUtf8ChunkFrom #-}
 validateUtf8ChunkFrom :: forall r. Int -> ByteString -> (Int -> Maybe Utf8State -> r) -> r
 validateUtf8ChunkFrom ofs bs k
 #if defined(SIMDUTF) || MIN_VERSION_bytestring(0,11,2)
   | guessUtf8Boundary > 0 &&
-    -- the rest of the bytestring valid utf-8 up to the boundary
+    -- the rest of the bytestring is valid utf-8 up to the boundary
     (
 #ifdef SIMDUTF
       withBS (B.drop ofs bs) $ \ fp _ -> unsafeDupablePerformIO $
@@ -253,14 +249,15 @@ validateUtf8ChunkFrom ofs bs k
 #else
       B.isValidUtf8 $ B.take guessUtf8Boundary (B.drop ofs bs)
 #endif
-    ) = slowValidateUtf8ChunkFrom (ofs + guessUtf8Boundary)
+    ) = if guessUtf8Boundary == len then k (B.length bs) (Just startUtf8State)
+        else slowValidateUtf8ChunkFrom (ofs + guessUtf8Boundary)
     -- No
-  | otherwise = if guessUtf8Boundary == len then k len (Just startUtf8State) else slowValidateUtf8ChunkFrom ofs
+  | otherwise = slowValidateUtf8ChunkFrom ofs
   where
     len = B.length bs - ofs
     isBoundary n p = len >= n && p (B.index bs (ofs + len - n))
     guessUtf8Boundary
-      | isBoundary 1 (<= 0x80) = len      -- last char is ASCII
+      | isBoundary 1 (<= 0x80) = len      -- last char is ASCII (common short-circuit)
       | isBoundary 1 (0xc2 <=) = len - 1  -- last char starts a two-(or more-)byte code point
       | isBoundary 2 (0xe0 <=) = len - 2  -- pre-last char starts a three-or-four-byte code point
       | isBoundary 3 (0xf0 <=) = len - 3  -- third to last char starts a four-byte code point

From 9e8e0f959055166b07ae798604fa7e4623698315 Mon Sep 17 00:00:00 2001
From: Li-yao Xia <lysxia@gmail.com>
Date: Mon, 6 Feb 2023 02:31:36 +0000
Subject: [PATCH 80/87] Doc

---
 src/Data/Text/Internal/Encoding.hs      | 26 ++++++++++++++-----------
 src/Data/Text/Internal/StrictBuilder.hs | 20 ++++++++++++++-----
 2 files changed, 30 insertions(+), 16 deletions(-)

diff --git a/src/Data/Text/Internal/Encoding.hs b/src/Data/Text/Internal/Encoding.hs
index 1a90e626..bcb98004 100644
--- a/src/Data/Text/Internal/Encoding.hs
+++ b/src/Data/Text/Internal/Encoding.hs
@@ -69,15 +69,23 @@ import GHC.Stack (HasCallStack)
 import Foreign.C.Types (CInt(..))
 #endif
 
+-- | Use 'StrictBuilder' to build 'Text'.
+--
+-- @since 2.0.2
 strictBuilderToText :: StrictBuilder -> Text
 strictBuilderToText = SB.toText
 
+-- | Copy 'Text' in a 'StrictBuilder'
+--
+-- @since 2.0.2
 textToStrictBuilder :: Text -> StrictBuilder
 textToStrictBuilder = SB.fromText
 
 -- | State of decoding a 'ByteString' in UTF-8.
 -- Enables incremental decoding ('validateUtf8Chunk', 'validateUtf8More',
 -- 'decodeUtf8Chunk', 'decodeUtf8More').
+--
+-- @since 2.0.2
 
 -- Internal invariant:
 -- the first component is the initial state if and only if
@@ -88,8 +96,6 @@ textToStrictBuilder = SB.fromText
 -- <=>
 -- 'partialUtf8CodePoint' s = 'PartialUtf8CodePoint' 0
 -- @
---
--- @since 2.0.2
 data Utf8State = Utf8State
   { -- | State of the UTF-8 state machine.
     utf8CodePointState :: {-# UNPACK #-} !DecoderState
@@ -191,7 +197,7 @@ partUtf8ToByteString :: PartialUtf8CodePoint -> B.ByteString
 partUtf8ToByteString c = BI.unsafeCreate (partUtf8Len c) $ \ptr ->
   partUtf8Foldr (\w k i -> pokeElemOff ptr i w >> k (i+1)) (\_ -> pure ()) c 0
 
--- Exported for testing.
+-- | Exported for testing.
 getCompleteLen :: Utf8State -> Int
 getCompleteLen = partUtf8CompleteLen . partialUtf8CodePoint
 
@@ -212,6 +218,8 @@ foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8
 -- 'validateUtf8Chunk' = 'validateUtf8More' 'startUtf8State'
 -- @
 --
+-- @since 2.0.2
+--
 -- === Properties
 --
 -- Given:
@@ -226,8 +234,6 @@ foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8
 --     @
 --     'validateUtf8Chunk' ('Data.ByteString.take' n chunk) = (n, Just 'startUtf8State')
 --     @
---
--- @since 2.0.2
 validateUtf8Chunk :: ByteString -> (Int, Maybe Utf8State)
 validateUtf8Chunk bs = validateUtf8ChunkFrom 0 bs (,)
 
@@ -297,6 +303,8 @@ validateUtf8ChunkFrom ofs bs k
 --     - if @ms = Just s'@, you can carry on validating another chunk
 --       by calling 'validateUtf8More' with the new state @s'@.
 --
+-- @since 2.0.2
+--
 -- === Properties
 --
 -- Given:
@@ -319,8 +327,6 @@ validateUtf8ChunkFrom ofs bs k
 --     ms = Just s'
 --     ==> 'validateUtf8More' s (chunk '<>' more) = 'Data.Bifunctor.first' ('Data.ByteString.length' chunk '+') ('validateUtf8More' s' more)
 --     @
---
--- @since 2.0.2
 validateUtf8More :: Utf8State -> ByteString -> (Int, Maybe Utf8State)
 validateUtf8More st bs = validateUtf8MoreCont st bs (,)
 
@@ -405,8 +411,6 @@ utf8StateToStrictBuilder = partUtf8ToStrictBuilder . partialUtf8CodePoint
 --     s2b (pre1 '<>' pre2) = s2b pre3
 --     ms2 = ms3
 --     @
---
--- @since 2.0.2
 decodeUtf8More :: Utf8State -> ByteString -> (StrictBuilder, ByteString, Maybe Utf8State)
 decodeUtf8More s bs =
   validateUtf8MoreCont s bs $ \len ms ->
@@ -419,6 +423,8 @@ decodeUtf8More s bs =
 --
 -- See 'decodeUtf8More' for details on the result.
 --
+-- @since 2.0.2
+--
 -- === Properties
 --
 -- @
@@ -436,8 +442,6 @@ decodeUtf8More s bs =
 -- @
 -- 'Data.Text.Encoding.encodeUtf8' ('Data.Text.Encoding.strictBuilderToText' builder) '<>' rest = chunk
 -- @
---
--- @since 2.0.2
 decodeUtf8Chunk :: ByteString -> (StrictBuilder, ByteString, Maybe Utf8State)
 decodeUtf8Chunk = decodeUtf8More startUtf8State
 
diff --git a/src/Data/Text/Internal/StrictBuilder.hs b/src/Data/Text/Internal/StrictBuilder.hs
index a8cdcad0..d1c44248 100644
--- a/src/Data/Text/Internal/StrictBuilder.hs
+++ b/src/Data/Text/Internal/StrictBuilder.hs
@@ -19,6 +19,7 @@ module Data.Text.Internal.StrictBuilder
   , fromText
 
     -- * Unsafe
+    -- $unsafe
   , unsafeFromByteString
   , unsafeFromWord8
   ) where
@@ -39,10 +40,6 @@ import qualified Data.Text.Internal.Unsafe.Char as Char
 -- | A delayed representation of strict 'Text'.
 --
 -- @since 2.0.2
-
--- For internal purposes, this is instead abused as a delayed 'Array':
--- it may not actually be valid 'Text' (e.g., 'unsafeFromWord8',
--- 'unsafeFromByteString').
 data StrictBuilder = StrictBuilder
   { sbLength :: {-# UNPACK #-} !Int
   , sbWrite :: forall s. A.MArray s -> Int -> ST s ()
@@ -85,16 +82,29 @@ copyFromByteString dst ofs src = withBS src $ \ srcFPtr len ->
   unsafeIOToST $ unsafeWithForeignPtr srcFPtr $ \ srcPtr -> do
     unsafeSTToIO $ A.copyFromPointer dst ofs srcPtr len
 
--- | Copy a 'ByteString'. Note: This may not be valid text.
+-- | Copy a 'ByteString'.
+--
+-- Unsafe: This may not be valid UTF-8 text.
+--
+-- @since 2.0.2
 unsafeFromByteString :: ByteString -> StrictBuilder
 unsafeFromByteString bs =
   StrictBuilder (B.length bs) (\dst ofs -> copyFromByteString dst ofs bs)
 
+-- |
+-- @since 2.0.2
 {-# INLINE fromChar #-}
 fromChar :: Char -> StrictBuilder
 fromChar c =
   StrictBuilder (utf8Length c) (\dst ofs -> void (Char.unsafeWrite dst ofs (safe c)))
 
+-- $unsafe
+-- For internal purposes, we abuse 'StrictBuilder' as a delayed 'Array' rather
+-- than 'Text': it may not actually be valid 'Text'.
+
+-- | Unsafe: This may not be valid UTF-8 text.
+--
+-- @since 2.0.2
 unsafeFromWord8 :: Word8 -> StrictBuilder
 unsafeFromWord8 !w =
   StrictBuilder 1 (\dst ofs -> A.unsafeWrite dst ofs w)

From 1f5a873aae8b6a5a3b70ae69cd3aa66bfc3710bb Mon Sep 17 00:00:00 2001
From: Li-yao Xia <lysxia@gmail.com>
Date: Mon, 6 Feb 2023 02:59:13 +0000
Subject: [PATCH 81/87] Sort imports

---
 src/Data/Text/Encoding.hs | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index 498ff889..c0f99f7b 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -88,30 +88,30 @@ import Control.Exception (evaluate, try)
 import Control.Monad.ST (runST)
 import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
 import Data.Bits (shiftR, (.&.))
-import Data.ByteString (ByteString)
-import qualified Data.ByteString.Short.Internal as SBS
-import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode)
-import Data.Text.Internal (Text(..), empty)
-import Data.Text.Internal.Unsafe (unsafeWithForeignPtr)
-import Data.Text.Unsafe (unsafeDupablePerformIO)
 import Data.Word (Word8)
 import Foreign.C.Types (CSize(..))
 import Foreign.Ptr (Ptr, minusPtr, plusPtr)
 import Foreign.Storable (poke, peekByteOff)
 import GHC.Exts (byteArrayContents#, unsafeCoerce#)
 import GHC.ForeignPtr (ForeignPtr(..), ForeignPtrContents(PlainPtr))
+import Data.ByteString (ByteString)
+import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode)
+import Data.Text.Internal (Text(..), empty)
 import Data.Text.Internal.ByteStringCompat (withBS)
+import Data.Text.Internal.Encoding
+import Data.Text.Internal.Unsafe (unsafeWithForeignPtr)
+import Data.Text.Unsafe (unsafeDupablePerformIO)
+import Data.Text.Show ()
 import qualified Data.ByteString as B
 import qualified Data.ByteString.Internal as B
 import qualified Data.ByteString.Builder as B
 import qualified Data.ByteString.Builder.Internal as B hiding (empty, append)
 import qualified Data.ByteString.Builder.Prim as BP
 import qualified Data.ByteString.Builder.Prim.Internal as BP
+import qualified Data.ByteString.Short.Internal as SBS
 import qualified Data.Text.Array as A
-import Data.Text.Internal.Encoding
 import qualified Data.Text.Internal.Encoding.Fusion as E
 import qualified Data.Text.Internal.Fusion as F
-import Data.Text.Show ()
 #if defined(ASSERTS)
 import GHC.Stack (HasCallStack)
 #endif

From 72f91ee96e06fe9632b03874fe3a9749292f3191 Mon Sep 17 00:00:00 2001
From: Li-yao Xia <lysxia@gmail.com>
Date: Mon, 6 Feb 2023 03:09:06 +0000
Subject: [PATCH 82/87] Undo useless optimization

---
 src/Data/Text/Internal/Encoding.hs | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/Data/Text/Internal/Encoding.hs b/src/Data/Text/Internal/Encoding.hs
index bcb98004..31d81103 100644
--- a/src/Data/Text/Internal/Encoding.hs
+++ b/src/Data/Text/Internal/Encoding.hs
@@ -255,8 +255,7 @@ validateUtf8ChunkFrom ofs bs k
 #else
       B.isValidUtf8 $ B.take guessUtf8Boundary (B.drop ofs bs)
 #endif
-    ) = if guessUtf8Boundary == len then k (B.length bs) (Just startUtf8State)
-        else slowValidateUtf8ChunkFrom (ofs + guessUtf8Boundary)
+    ) = slowValidateUtf8ChunkFrom (ofs + guessUtf8Boundary)
     -- No
   | otherwise = slowValidateUtf8ChunkFrom ofs
   where

From d347bba6a0cc34393c0b503689558c1088c1598d Mon Sep 17 00:00:00 2001
From: Li-yao Xia <lysxia@gmail.com>
Date: Mon, 6 Feb 2023 03:10:12 +0000
Subject: [PATCH 83/87] import Semigroup for old base

---
 src/Data/Text/Internal/StrictBuilder.hs | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/Data/Text/Internal/StrictBuilder.hs b/src/Data/Text/Internal/StrictBuilder.hs
index d1c44248..84a57264 100644
--- a/src/Data/Text/Internal/StrictBuilder.hs
+++ b/src/Data/Text/Internal/StrictBuilder.hs
@@ -1,3 +1,4 @@
+{-# LANGUAGE CPP #-}
 {-# LANGUAGE BangPatterns #-}
 {-# LANGUAGE RankNTypes #-}
 
@@ -29,6 +30,9 @@ import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
 import Data.Functor (void)
 import Data.Word (Word8)
 import Data.ByteString (ByteString)
+#if !MIN_VERSION_base(4,11,0)
+import Data.Semigroup (Semigroup(..))
+#endif
 import Data.Text.Internal (Text(..), empty, safe)
 import Data.Text.Internal.ByteStringCompat (withBS)
 import Data.Text.Internal.Encoding.Utf8 (utf8Length)

From 9a0c8a80f17efc3ebf73b922f562942e49e1fcad Mon Sep 17 00:00:00 2001
From: Li-yao Xia <lysxia@gmail.com>
Date: Mon, 6 Feb 2023 07:13:53 +0000
Subject: [PATCH 84/87] Clean up imports

---
 src/Data/Text/Encoding.hs             |  5 -----
 src/Data/Text/Internal/Encoding.hs    | 10 +++++-----
 tests/Tests/Properties/Transcoding.hs |  1 -
 3 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index c0f99f7b..500461a1 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -116,11 +116,6 @@ import qualified Data.Text.Internal.Fusion as F
 import GHC.Stack (HasCallStack)
 #endif
 
-#ifdef SIMDUTF
-#elif !MIN_VERSION_bytestring(0,11,2)
-import qualified Data.ByteString.Unsafe as B
-#endif
-
 -- $validation
 -- These functions are for validating 'ByteString's as encoded text.
 
diff --git a/src/Data/Text/Internal/Encoding.hs b/src/Data/Text/Internal/Encoding.hs
index 31d81103..26a24afb 100644
--- a/src/Data/Text/Internal/Encoding.hs
+++ b/src/Data/Text/Internal/Encoding.hs
@@ -45,17 +45,12 @@ import Data.ByteString (ByteString)
 import Data.Semigroup (Semigroup(..))
 #endif
 import Data.Word (Word32, Word8)
-import Foreign.C.Types (CSize(..))
-import Foreign.Ptr (Ptr)
 import Foreign.Storable (pokeElemOff)
 import Data.Text.Encoding.Error (OnDecodeError)
 import Data.Text.Internal (Text(..))
-import Data.Text.Internal.ByteStringCompat (withBS)
 import Data.Text.Internal.Encoding.Utf8
   (DecoderState, utf8AcceptState, utf8RejectState, updateDecoderState)
 import Data.Text.Internal.StrictBuilder (StrictBuilder)
-import Data.Text.Internal.Unsafe (unsafeWithForeignPtr)
-import Data.Text.Unsafe (unsafeDupablePerformIO)
 import qualified Data.ByteString as B
 import qualified Data.ByteString.Internal as BI
 import qualified Data.ByteString.Short.Internal as SBS
@@ -66,7 +61,12 @@ import GHC.Stack (HasCallStack)
 #endif
 
 #ifdef SIMDUTF
+import Data.Text.Internal.ByteStringCompat (withBS)
+import Data.Text.Internal.Unsafe (unsafeWithForeignPtr)
+import Data.Text.Unsafe (unsafeDupablePerformIO)
+import Foreign.C.Types (CSize(..))
 import Foreign.C.Types (CInt(..))
+import Foreign.Ptr (Ptr)
 #endif
 
 -- | Use 'StrictBuilder' to build 'Text'.
diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs
index 49ce4f2e..7f907d28 100644
--- a/tests/Tests/Properties/Transcoding.hs
+++ b/tests/Tests/Properties/Transcoding.hs
@@ -2,7 +2,6 @@
 
 {-# LANGUAGE CPP, OverloadedStrings, ScopedTypeVariables #-}
 {-# OPTIONS_GHC -fno-warn-missing-signatures #-}
-{-# LANGUAGE BangPatterns #-}
 module Tests.Properties.Transcoding
     ( testTranscoding
     ) where

From 64fd0291c44e8655c7d662478e801447cc4fab5b Mon Sep 17 00:00:00 2001
From: Li-yao Xia <lysxia@gmail.com>
Date: Mon, 6 Feb 2023 07:17:59 +0000
Subject: [PATCH 85/87] Minimize test diff

---
 src/Data/Text/Encoding.hs             |  1 -
 tests/Tests/Properties/Transcoding.hs | 14 +++++++-------
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index 500461a1..6f355ac6 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -3,7 +3,6 @@
 {-# LANGUAGE Trustworthy #-}
 {-# LANGUAGE TypeApplications #-}
 {-# LANGUAGE ScopedTypeVariables #-}
-{-# LANGUAGE ViewPatterns #-}
 -- |
 -- Module      : Data.Text.Encoding
 -- Copyright   : (c) 2009, 2010, 2011 Bryan O'Sullivan,
diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs
index 7f907d28..9c1f108e 100644
--- a/tests/Tests/Properties/Transcoding.hs
+++ b/tests/Tests/Properties/Transcoding.hs
@@ -490,6 +490,8 @@ testTranscoding =
     testProperty "tl_latin1" tl_latin1,
     testProperty "t_utf8" t_utf8,
     testProperty "t_utf8'" t_utf8',
+    testProperty "t_utf8_undecoded" t_utf8_undecoded,
+    testProperty "t_utf8_incr" t_utf8_incr,
     testProperty "tl_utf8" tl_utf8,
     testProperty "tl_utf8'" tl_utf8',
     testProperty "t_utf16LE" t_utf16LE,
@@ -516,6 +518,11 @@ testTranscoding =
       testProperty "t_decode_with_error2" t_decode_with_error2,
       testProperty "t_decode_with_error3" t_decode_with_error3,
       testProperty "t_decode_with_error4" t_decode_with_error4,
+      testCase "t_decode_with_error1'" t_decode_with_error1',
+      testCase "t_decode_with_error2'" t_decode_with_error2',
+      testCase "t_decode_with_error3'" t_decode_with_error3',
+      testCase "t_decode_with_error4'" t_decode_with_error4',
+      testCase "t_decode_with_error5'" t_decode_with_error5',
       testProperty "t_infix_concat" t_infix_concat
     ],
     testGroup "validate" [
@@ -524,8 +531,6 @@ testTranscoding =
       testProperty "t_validateUtf8More_valid" t_validateUtf8More_valid
     ],
     testGroup "streaming" [
-      testProperty "t_utf8_undecoded" t_utf8_undecoded,
-      testProperty "t_utf8_incr" t_utf8_incr,
       testProperty "t_utf8_c" t_utf8_c,
       testCase "t_p_utf8_1" t_p_utf8_1,
       testCase "t_p_utf8_2" t_p_utf8_2,
@@ -542,11 +547,6 @@ testTranscoding =
       testCase "t_pn_utf8_3" t_pn_utf8_3,
       testCase "t_decode_chunk1" t_decode_chunk1,
       testCase "t_decode_chunk2" t_decode_chunk2,
-      testCase "t_decode_with_error1'" t_decode_with_error1',
-      testCase "t_decode_with_error2'" t_decode_with_error2',
-      testCase "t_decode_with_error3'" t_decode_with_error3',
-      testCase "t_decode_with_error4'" t_decode_with_error4',
-      testCase "t_decode_with_error5'" t_decode_with_error5',
       testProperty "t_decodeUtf8Chunk_split" t_decodeUtf8Chunk_split,
       testProperty "t_decodeUtf8More1" t_decodeUtf8More1,
       testProperty "t_decodeUtf8More2" t_decodeUtf8More2

From e136cadb12fa599e68b5d8dbdfc9cd9a43f3cf38 Mon Sep 17 00:00:00 2001
From: Li-yao Xia <lysxia@gmail.com>
Date: Mon, 6 Feb 2023 07:19:14 +0000
Subject: [PATCH 86/87] test: sort imports

---
 tests/Tests/Properties/Transcoding.hs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs
index 9c1f108e..bf8d1ee8 100644
--- a/tests/Tests/Properties/Transcoding.hs
+++ b/tests/Tests/Properties/Transcoding.hs
@@ -14,6 +14,7 @@ import Data.Maybe (isNothing)
 #if !MIN_VERSION_base(4,11,0)
 import Data.Semigroup ((<>))
 #endif
+import Data.Word (Word8)
 import Test.QuickCheck hiding ((.&.))
 import Test.Tasty (TestTree, testGroup)
 import Test.Tasty.QuickCheck (testProperty)
@@ -34,7 +35,6 @@ import qualified Data.Text.Encoding.Error as E
 import qualified Data.Text.Internal.Encoding as E
 import qualified Data.Text.Lazy as TL
 import qualified Data.Text.Lazy.Encoding as EL
-import Data.Word (Word8)
 
 t_ascii t    = E.decodeASCII (E.encodeUtf8 a) === a
     where a  = T.map (\c -> chr (ord c `mod` 128)) t

From 74db3ebede83c903dcf274210d8ae31e59b010bf Mon Sep 17 00:00:00 2001
From: Li-yao Xia <lysxia@gmail.com>
Date: Mon, 6 Feb 2023 23:51:45 +0000
Subject: [PATCH 87/87] Apply suggestions

---
 changelog.md              | 2 +-
 src/Data/Text/Encoding.hs | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/changelog.md b/changelog.md
index 2b712fc6..19deee72 100644
--- a/changelog.md
+++ b/changelog.md
@@ -25,7 +25,7 @@
 
 * Add internal module `Data.Text.Internal.Encoding`
 
-* Add `Data.Text.Internal.Encoding.Utf8.updateDecoderState`
+* Add `Data.Text.Internal.Encoding.Utf8.updateDecoderState` and export `utf8{Accept,Reject}State` from the same module.
 
 ### 2.0.1
 
diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index 6f355ac6..65f32b91 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -50,7 +50,7 @@ module Data.Text.Encoding
     , decodeUtf8More
     , Utf8State
     , startUtf8State
-    , StrictBuilder()
+    , StrictBuilder
     , strictBuilderToText
     , textToStrictBuilder