From b509efd122e094afc6d05532d12c97e7c1110bc8 Mon Sep 17 00:00:00 2001 From: "David M. Sledge" Date: Sun, 26 Jun 2022 23:40:29 -0600 Subject: [PATCH 01/87] Monad support for stream decoding --- src/Data/Text/Encoding.hs | 87 ++++++++++++++++++--------- src/Data/Text/Encoding/Error.hs | 11 ++++ tests/Tests/Properties/Transcoding.hs | 38 ++++++++++++ text.cabal | 5 +- 4 files changed, 111 insertions(+), 30 deletions(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index 54f16722..78d4769a 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -44,6 +44,7 @@ module Data.Text.Encoding -- *** Stream oriented decoding -- $stream , streamDecodeUtf8With + , streamDecodeUtf8WithM , Decoding(..) -- ** Partial Functions @@ -73,13 +74,19 @@ module Data.Text.Encoding import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) import Control.Exception (evaluate, try) -import Control.Monad.ST (runST, ST) +import Control.Monad.Fix (fix) +import Control.Monad.ST (runST) +import Control.Monad.ST.Trans (runSTT) +import Control.Monad.ST.Trans.Internal (liftST) +import Control.Monad.Trans.Class (lift) import Data.Bits (shiftR, (.&.)) import Data.ByteString (ByteString) import qualified Data.ByteString as B import qualified Data.ByteString.Internal as B import qualified Data.ByteString.Short.Internal as SBS -import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode) +import Data.Function ((&)) +import Data.Functor.Identity (Identity(..), runIdentity) +import Data.Text.Encoding.Error (OnDecodeError, OnDecodeErrorM, UnicodeException, strictDecode, lenientDecode) import Data.Text.Internal (Text(..), safe, empty, append) import Data.Text.Internal.Unsafe (unsafeWithForeignPtr) import Data.Text.Internal.Unsafe.Char (unsafeWrite) @@ -142,13 +149,16 @@ import Data.Text.Internal.Encoding.Utf8 (CodePoint(..)) -- anything except ASCII and copies buffer or throws an error otherwise. -- decodeASCII :: ByteString -> Text -decodeASCII bs = withBS bs $ \fp len -> if len == 0 then empty else runST $ do - asciiPrefixLen <- fmap cSizeToInt $ unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> - c_is_ascii src (src `plusPtr` len) - if asciiPrefixLen == len - then let !(SBS.SBS arr) = SBS.toShort bs in - return (Text (A.ByteArray arr) 0 len) - else error $ "decodeASCII: detected non-ASCII codepoint at " ++ show asciiPrefixLen +decodeASCII bs = withBS bs $ \fp len -> + if len == 0 + then empty + else runST $ do + asciiPrefixLen <- fmap cSizeToInt . unsafeIOToST . unsafeWithForeignPtr fp $ \src -> + c_is_ascii src (src `plusPtr` len) + if asciiPrefixLen == len + then let !(SBS.SBS arr) = SBS.toShort bs in + return (Text (A.ByteArray arr) 0 len) + else error $ "decodeASCII: detected non-ASCII codepoint at " ++ show asciiPrefixLen -- | Decode a 'ByteString' containing Latin-1 (aka ISO-8859-1) encoded text. -- @@ -231,7 +241,8 @@ decodeUtf8With onErr bs Nothing -> txt' Just c -> T.singleton c `append` txt') where - (txt, undecoded) = decodeUtf8With2 onErr mempty bs + (txt, undecoded) = runIdentity . decodeUtf8With2 mempty bs $ \ desc' mWord8 -> + Identity $ onErr desc' mWord8 txt' = decodeUtf8With onErr (B.tail undecoded) desc = "Data.Text.Internal.Encoding: Invalid UTF-8 stream" @@ -240,9 +251,10 @@ decodeUtf8With2 :: #if defined(ASSERTS) HasCallStack => #endif - OnDecodeError -> ByteString -> ByteString -> (Text, ByteString) -decodeUtf8With2 onErr bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do - marr <- A.new len' + Monad m => + ByteString -> ByteString -> OnDecodeErrorM m -> m (Text, ByteString) +decodeUtf8With2 bs1@(B.length -> len1) bs2@(B.length -> len2) onErrM = runSTT $ do + marr <- liftST $ A.new len' outer marr len' 0 0 where len = len1 + len2 @@ -274,11 +286,10 @@ decodeUtf8With2 onErr bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do | i < len = step (i + 1) (utf8DecodeContinue (index i) a b) step _ st = st - outer :: forall s. A.MArray s -> Int -> Int -> Int -> ST s (Text, ByteString) outer dst dstLen = inner where inner srcOff dstOff - | srcOff >= len = do + | srcOff >= len = liftST $ do A.shrinkM dst dstOff arr <- A.unsafeFreeze dst return (Text arr 0 dstOff, mempty) @@ -288,25 +299,27 @@ decodeUtf8With2 onErr bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do , dstOff + (len1 + guessUtf8Boundary - srcOff) <= dstLen , bs <- B.drop (srcOff - len1) (B.take guessUtf8Boundary bs2) , isValidBS bs = do - withBS bs $ \fp _ -> unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> + liftST . withBS bs $ \fp _ -> unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> unsafeSTToIO $ A.copyFromPointer dst dstOff src (len1 + guessUtf8Boundary - srcOff) inner (len1 + guessUtf8Boundary) (dstOff + (len1 + guessUtf8Boundary - srcOff)) | dstOff + 4 > dstLen = do let dstLen' = dstLen + 4 - dst' <- A.resizeM dst dstLen' + dst' <- liftST $ A.resizeM dst dstLen' outer dst' dstLen' srcOff dstOff | otherwise = case decodeFrom srcOff of Accept c -> do - d <- unsafeWrite dst dstOff c + d <- liftST $ unsafeWrite dst dstOff c inner (srcOff + d) (dstOff + d) - Reject -> case onErr desc (Just (index srcOff)) of - Nothing -> inner (srcOff + 1) dstOff - Just c -> do - d <- unsafeWrite dst dstOff (safe c) - inner (srcOff + 1) (dstOff + d) - Incomplete{} -> do + Reject -> do + res <- lift $ onErrM desc (Just (index srcOff)) + case res of + Nothing -> inner (srcOff + 1) dstOff + Just c -> do + d <- liftST $ unsafeWrite dst dstOff (safe c) + inner (srcOff + 1) (dstOff + d) + Incomplete{} -> liftST $ do A.shrinkM dst dstOff arr <- A.unsafeFreeze dst let bs = if srcOff >= len1 @@ -369,6 +382,22 @@ decodeUtf8With2 onErr bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do -- If given invalid input, an exception will be thrown by the function -- or continuation where it is encountered. +-- | Decode, in a monadic- and stream-oriented way using CPS, a lazy +-- 'ByteString' containing UTF-8 encoded text. +streamDecodeUtf8WithM :: +#if defined(ASSERTS) + HasCallStack => +#endif + Monad m => + ByteString + -> OnDecodeErrorM m + -> (Text -> ByteString -> (ByteString -> m b) -> m b) + -> m b +streamDecodeUtf8WithM bstr onErrM f = fix (\ go bs1 bs2 -> do + (txt, undecoded) <- decodeUtf8With2 bs1 bs2 onErrM + f txt undecoded (go undecoded) + ) mempty bstr + -- | A stream oriented decoding result. -- -- @since 1.0.0.0 @@ -406,11 +435,11 @@ streamDecodeUtf8With :: HasCallStack => #endif OnDecodeError -> ByteString -> Decoding -streamDecodeUtf8With onErr = go mempty - where - go bs1 bs2 = Some txt undecoded (go undecoded) - where - (txt, undecoded) = decodeUtf8With2 onErr bs1 bs2 +streamDecodeUtf8With onErr = fix (\ go bs1 bs2 -> + runIdentity (decodeUtf8With2 bs1 bs2 (\ desc' mWord8 -> + Identity $ onErr desc' mWord8)) & (\ (txt, undecoded) -> + Some txt undecoded (go undecoded)) + ) mempty -- | Decode a 'ByteString' containing UTF-8 encoded text that is known -- to be valid. diff --git a/src/Data/Text/Encoding/Error.hs b/src/Data/Text/Encoding/Error.hs index ea9e0997..99226cb1 100644 --- a/src/Data/Text/Encoding/Error.hs +++ b/src/Data/Text/Encoding/Error.hs @@ -25,6 +25,7 @@ module Data.Text.Encoding.Error UnicodeException(..) , OnError , OnDecodeError + , OnDecodeErrorM , OnEncodeError -- * Useful error handling functions , lenientDecode @@ -59,9 +60,19 @@ import Numeric (showHex) -- report. type OnError a b = String -> Maybe a -> Maybe b +-- | Similar to 'OnError' but in a monadic context which allows +-- additional capabilities. Depending on the monad(s) used, these +-- capabilities include logging decode errors (IO and/or logger monads) +-- and abort processessing without the need to use 'error' or 'throw' +-- (continuation monad). +type OnErrorM a m b = String -> Maybe a -> m (Maybe b) + -- | A handler for a decoding error. type OnDecodeError = OnError Word8 Char +-- | A monadic handler for a decoding error. +type OnDecodeErrorM m = OnErrorM Word8 m Char + -- | A handler for an encoding error. {-# DEPRECATED OnEncodeError "This exception is never used in practice, and will be removed." #-} type OnEncodeError = OnError Char Word8 diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs index 0fabf62e..2d98af16 100644 --- a/tests/Tests/Properties/Transcoding.hs +++ b/tests/Tests/Properties/Transcoding.hs @@ -6,6 +6,8 @@ module Tests.Properties.Transcoding ( testTranscoding ) where +import Control.Concurrent.MVar (newEmptyMVar, tryPutMVar, tryTakeMVar) +import Control.Monad.Cont (cont, runCont) import Data.Bits ((.&.), shiftR) import Data.Char (chr, ord) import Test.QuickCheck hiding ((.&.)) @@ -21,6 +23,8 @@ import qualified Data.ByteString.Builder.Prim as BP import qualified Data.ByteString.Char8 as BC import qualified Data.ByteString.Lazy as BL import qualified Data.ByteString.Lazy.Char8 as BLC +import Data.Functor.Identity (Identity(..)) +import Data.Maybe (isJust) import qualified Data.Text as T import qualified Data.Text.Encoding as E import qualified Data.Text.Encoding.Error as E @@ -199,11 +203,40 @@ t_decode_with_error3' = t_decode_with_error4' = case E.streamDecodeUtf8With (\_ _ -> Just 'x') (B.pack [0xC2, 97, 97, 97]) of E.Some x _ _ -> x === "xaaa" +t_decode_with_error2'' = + runIdentity $ E.streamDecodeUtf8WithM (B.pack [0xC2, 97]) (\ _ _ -> pure $ Just 'x') + (\ x _ _ -> pure $ x === "xa") +t_decode_with_error3'' = + runIdentity $ E.streamDecodeUtf8WithM (B.pack [0xC2, 97, 97]) (\ _ _ -> pure $ Just 'x') + (\ x _ _ -> pure $ x === "xaa") +t_decode_with_error4'' = + runIdentity $ E.streamDecodeUtf8WithM (B.pack [0xC2, 97, 97, 97]) (\ _ _ -> pure $ Just 'x') + (\ x _ _ -> pure $ x === "xaaa") t_decode_with_error5' = ioProperty $ do ret <- Exception.try $ Exception.evaluate $ E.streamDecodeUtf8 (B.pack [0x81]) pure $ case ret of Left (_ :: E.UnicodeException) -> True Right{} -> False +-- log error test +t_decode_with_error5'' = + ioProperty $ do + mVar <- newEmptyMVar + E.streamDecodeUtf8WithM (B.pack [0xC2, 97, 97, 97]) + (\desc _ -> do + res <- tryPutMVar mVar desc + pure . Just $ if res then 'x' else 'y') + (\ x _ _ -> do + mDesc <- tryTakeMVar mVar + pure $ x == "xaaa" && isJust mDesc) +-- test case demonstrating how to stop processing without 'throw' or 'error' +t_decode_with_error6'' = + runCont (do + E.streamDecodeUtf8WithM (B.pack [0xC2, 97, 97, 97]) + -- early exit + (\ _ _ -> cont $ \ _ -> True) + -- this should not be executed + (\ _ _ _ -> pure False) + ) id t_infix_concat bs1 text bs2 = forAll (Blind <$> genDecodeErr Replace) $ \(Blind onErr) -> @@ -250,7 +283,12 @@ testTranscoding = testProperty "t_decode_with_error2'" t_decode_with_error2', testProperty "t_decode_with_error3'" t_decode_with_error3', testProperty "t_decode_with_error4'" t_decode_with_error4', + testProperty "t_decode_with_error2''" t_decode_with_error2'', + testProperty "t_decode_with_error3''" t_decode_with_error3'', + testProperty "t_decode_with_error4''" t_decode_with_error4'', testProperty "t_decode_with_error5'" t_decode_with_error5', + testProperty "t_decode_with_error5''" t_decode_with_error5'', + testProperty "t_decode_with_error6''" t_decode_with_error6'', testProperty "t_infix_concat" t_infix_concat ] ] diff --git a/text.cabal b/text.cabal index 488f1451..388512d7 100644 --- a/text.cabal +++ b/text.cabal @@ -181,13 +181,15 @@ library Data.Text.Show build-depends: + STMonadTrans >= 0.4.6 && < 0.4.7, array >= 0.3 && < 0.6, base >= 4.9 && < 5, binary >= 0.5 && < 0.9, bytestring >= 0.10.4 && < 0.12, deepseq >= 1.1 && < 1.5, ghc-prim >= 0.2 && < 0.10, - template-haskell >= 2.5 && < 2.20 + template-haskell >= 2.5 && < 2.20, + transformers ghc-options: -Wall -fwarn-tabs -funbox-strict-fields -O2 if flag(developer) @@ -254,6 +256,7 @@ test-suite tests deepseq, directory, ghc-prim, + mtl, tasty, tasty-hunit, tasty-quickcheck, From 54a1ad906885d35f76e8afcfc733c12e4005649d Mon Sep 17 00:00:00 2001 From: "David M. Sledge" Date: Mon, 27 Jun 2022 19:31:00 -0600 Subject: [PATCH 02/87] Refactored remove STMonadTrans dependency --- src/Data/Text/Encoding.hs | 156 +++++++++++++++++++++++++++++--------- text.cabal | 4 +- 2 files changed, 122 insertions(+), 38 deletions(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index 78d4769a..be2fc3af 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -1,5 +1,5 @@ {-# LANGUAGE BangPatterns, CPP, GeneralizedNewtypeDeriving, MagicHash, - UnliftedFFITypes #-} + UnliftedFFITypes, UnboxedTuples #-} {-# LANGUAGE Trustworthy #-} {-# LANGUAGE TypeApplications #-} {-# LANGUAGE ScopedTypeVariables #-} @@ -76,9 +76,6 @@ import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) import Control.Exception (evaluate, try) import Control.Monad.Fix (fix) import Control.Monad.ST (runST) -import Control.Monad.ST.Trans (runSTT) -import Control.Monad.ST.Trans.Internal (liftST) -import Control.Monad.Trans.Class (lift) import Data.Bits (shiftR, (.&.)) import Data.ByteString (ByteString) import qualified Data.ByteString as B @@ -96,7 +93,7 @@ import Data.Word (Word8) import Foreign.C.Types (CSize(..)) import Foreign.Ptr (Ptr, minusPtr, plusPtr) import Foreign.Storable (poke, peekByteOff) -import GHC.Exts (byteArrayContents#, unsafeCoerce#) +import GHC.Exts (byteArrayContents#, runRW#, unsafeCoerce#) import GHC.ForeignPtr (ForeignPtr(..), ForeignPtrContents(PlainPtr)) import qualified Data.ByteString.Builder as B import qualified Data.ByteString.Builder.Internal as B hiding (empty, append) @@ -117,6 +114,7 @@ import Foreign.C.Types (CInt(..)) import qualified Data.ByteString.Unsafe as B import Data.Text.Internal.Encoding.Utf8 (CodePoint(..)) #endif +import GHC.ST (ST(..), STRep) -- $strict -- @@ -246,17 +244,105 @@ decodeUtf8With onErr bs txt' = decodeUtf8With onErr (B.tail undecoded) desc = "Data.Text.Internal.Encoding: Invalid UTF-8 stream" --- | Decode two consecutive bytestrings, returning Text and undecoded remainder. +-- -- | Decode two consecutive bytestrings, returning Text and undecoded remainder. +-- decodeUtf8With2 :: +-- #if defined(ASSERTS) +-- HasCallStack => +-- #endif +-- Monad m => +-- ByteString -> ByteString -> OnDecodeErrorM m -> m (Text, ByteString) +-- decodeUtf8With2 bs1@(B.length -> len1) bs2@(B.length -> len2) onErrM = runSTT $ do +-- marr <- liftST $ A.new len' +-- outer marr len' 0 0 +-- where +-- len = len1 + len2 +-- len' = len + 4 + +-- index i +-- | i < len1 = B.index bs1 i +-- | otherwise = B.index bs2 (i - len1) + +-- -- We need Data.ByteString.findIndexEnd, but it is unavailable before bytestring-0.10.12.0 +-- guessUtf8Boundary :: Int +-- guessUtf8Boundary +-- | len2 >= 1 && w0 < 0x80 = len2 -- last char is ASCII +-- | len2 >= 1 && w0 >= 0xC0 = len2 - 1 -- last char starts a code point +-- | len2 >= 2 && w1 >= 0xC0 = len2 - 2 -- pre-last char starts a code point +-- | len2 >= 3 && w2 >= 0xC0 = len2 - 3 +-- | len2 >= 4 && w3 >= 0xC0 = len2 - 4 +-- | otherwise = 0 +-- where +-- w0 = B.index bs2 (len2 - 1) +-- w1 = B.index bs2 (len2 - 2) +-- w2 = B.index bs2 (len2 - 3) +-- w3 = B.index bs2 (len2 - 4) + +-- decodeFrom :: Int -> DecoderResult +-- decodeFrom off = step (off + 1) (utf8DecodeStart (index off)) +-- where +-- step i (Incomplete a b) +-- | i < len = step (i + 1) (utf8DecodeContinue (index i) a b) +-- step _ st = st + +-- outer dst dstLen = inner +-- where +-- inner srcOff dstOff +-- | srcOff >= len = liftST $ do +-- A.shrinkM dst dstOff +-- arr <- A.unsafeFreeze dst +-- return (Text arr 0 dstOff, mempty) + +-- | srcOff >= len1 +-- , srcOff < len1 + guessUtf8Boundary +-- , dstOff + (len1 + guessUtf8Boundary - srcOff) <= dstLen +-- , bs <- B.drop (srcOff - len1) (B.take guessUtf8Boundary bs2) +-- , isValidBS bs = do +-- liftST . withBS bs $ \fp _ -> unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> +-- unsafeSTToIO $ A.copyFromPointer dst dstOff src (len1 + guessUtf8Boundary - srcOff) +-- inner (len1 + guessUtf8Boundary) (dstOff + (len1 + guessUtf8Boundary - srcOff)) + +-- | dstOff + 4 > dstLen = do +-- let dstLen' = dstLen + 4 +-- dst' <- liftST $ A.resizeM dst dstLen' +-- outer dst' dstLen' srcOff dstOff + +-- | otherwise = case decodeFrom srcOff of +-- Accept c -> do +-- d <- liftST $ unsafeWrite dst dstOff c +-- inner (srcOff + d) (dstOff + d) +-- Reject -> do +-- res <- lift $ onErrM desc (Just (index srcOff)) +-- case res of +-- Nothing -> inner (srcOff + 1) dstOff +-- Just c -> do +-- d <- liftST $ unsafeWrite dst dstOff (safe c) +-- inner (srcOff + 1) (dstOff + d) +-- Incomplete{} -> liftST $ do +-- A.shrinkM dst dstOff +-- arr <- A.unsafeFreeze dst +-- let bs = if srcOff >= len1 +-- then B.drop (srcOff - len1) bs2 +-- else B.drop srcOff (bs1 `B.append` bs2) +-- return (Text arr 0 dstOff, bs) + +-- desc = "Data.Text.Internal.Encoding: Invalid UTF-8 stream" + +unST :: ST s a -> STRep s a +unST (ST st) = st + decodeUtf8With2 :: #if defined(ASSERTS) HasCallStack => #endif Monad m => - ByteString -> ByteString -> OnDecodeErrorM m -> m (Text, ByteString) -decodeUtf8With2 bs1@(B.length -> len1) bs2@(B.length -> len2) onErrM = runSTT $ do - marr <- liftST $ A.new len' - outer marr len' 0 0 + ByteString + -> ByteString + -> OnDecodeErrorM m + -> m (Text, ByteString) +decodeUtf8With2 bs1@(B.length -> len1) bs2@(B.length -> len2) onErrM = do + outer marr len' 0 0 s# where + !(# s#, marr #) = runRW# (unST $ A.new len') len = len1 + len2 len' = len + 4 @@ -288,44 +374,44 @@ decodeUtf8With2 bs1@(B.length -> len1) bs2@(B.length -> len2) onErrM = runSTT $ outer dst dstLen = inner where - inner srcOff dstOff - | srcOff >= len = liftST $ do - A.shrinkM dst dstOff - arr <- A.unsafeFreeze dst - return (Text arr 0 dstOff, mempty) + inner srcOff dstOff s'# + | srcOff >= len = + let !(# s''#, _ #) = (unST $ A.shrinkM dst dstOff) s'# + !(# _, arr #) = (unST $ A.unsafeFreeze dst) s''# + in pure (Text arr 0 dstOff, mempty) | srcOff >= len1 , srcOff < len1 + guessUtf8Boundary , dstOff + (len1 + guessUtf8Boundary - srcOff) <= dstLen , bs <- B.drop (srcOff - len1) (B.take guessUtf8Boundary bs2) - , isValidBS bs = do - liftST . withBS bs $ \fp _ -> unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> - unsafeSTToIO $ A.copyFromPointer dst dstOff src (len1 + guessUtf8Boundary - srcOff) - inner (len1 + guessUtf8Boundary) (dstOff + (len1 + guessUtf8Boundary - srcOff)) + , isValidBS bs = + let !(# s''#, _ #) = unST (withBS bs $ \fp _ -> unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> + unsafeSTToIO $ A.copyFromPointer dst dstOff src (len1 + guessUtf8Boundary - srcOff)) s'# + in inner (len1 + guessUtf8Boundary) (dstOff + (len1 + guessUtf8Boundary - srcOff)) s''# - | dstOff + 4 > dstLen = do + | dstOff + 4 > dstLen = let dstLen' = dstLen + 4 - dst' <- liftST $ A.resizeM dst dstLen' - outer dst' dstLen' srcOff dstOff + !(# s''#, dst' #) = unST (A.resizeM dst dstLen') s'# + in outer dst' dstLen' srcOff dstOff s''# | otherwise = case decodeFrom srcOff of - Accept c -> do - d <- liftST $ unsafeWrite dst dstOff c - inner (srcOff + d) (dstOff + d) + Accept c -> + let !(# s''#, d #) = unST (unsafeWrite dst dstOff c) s'# + in inner (srcOff + d) (dstOff + d) s''# Reject -> do - res <- lift $ onErrM desc (Just (index srcOff)) + res <- onErrM desc (Just (index srcOff)) case res of - Nothing -> inner (srcOff + 1) dstOff - Just c -> do - d <- liftST $ unsafeWrite dst dstOff (safe c) - inner (srcOff + 1) (dstOff + d) - Incomplete{} -> liftST $ do - A.shrinkM dst dstOff - arr <- A.unsafeFreeze dst - let bs = if srcOff >= len1 + Nothing -> inner (srcOff + 1) dstOff s'# + Just c -> + let !(# s''#, d #) = unST (unsafeWrite dst dstOff (safe c)) s'# in + inner (srcOff + 1) (dstOff + d) s''# + Incomplete{} -> + let !(# s''#, _ #) = unST (A.shrinkM dst dstOff) s'# + !(# _, arr #) = unST (A.unsafeFreeze dst) s''# + bs = if srcOff >= len1 then B.drop (srcOff - len1) bs2 else B.drop srcOff (bs1 `B.append` bs2) - return (Text arr 0 dstOff, bs) + in pure (Text arr 0 dstOff, bs) desc = "Data.Text.Internal.Encoding: Invalid UTF-8 stream" diff --git a/text.cabal b/text.cabal index 388512d7..5231dbac 100644 --- a/text.cabal +++ b/text.cabal @@ -181,15 +181,13 @@ library Data.Text.Show build-depends: - STMonadTrans >= 0.4.6 && < 0.4.7, array >= 0.3 && < 0.6, base >= 4.9 && < 5, binary >= 0.5 && < 0.9, bytestring >= 0.10.4 && < 0.12, deepseq >= 1.1 && < 1.5, ghc-prim >= 0.2 && < 0.10, - template-haskell >= 2.5 && < 2.20, - transformers + template-haskell >= 2.5 && < 2.20 ghc-options: -Wall -fwarn-tabs -funbox-strict-fields -O2 if flag(developer) From fc7b6651001d3ba8a4f064a90ccb01fbc277a6e7 Mon Sep 17 00:00:00 2001 From: "David M. Sledge" Date: Mon, 27 Jun 2022 19:33:44 -0600 Subject: [PATCH 03/87] Remove unused commented code --- src/Data/Text/Encoding.hs | 83 --------------------------------------- 1 file changed, 83 deletions(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index be2fc3af..16f50d0b 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -244,89 +244,6 @@ decodeUtf8With onErr bs txt' = decodeUtf8With onErr (B.tail undecoded) desc = "Data.Text.Internal.Encoding: Invalid UTF-8 stream" --- -- | Decode two consecutive bytestrings, returning Text and undecoded remainder. --- decodeUtf8With2 :: --- #if defined(ASSERTS) --- HasCallStack => --- #endif --- Monad m => --- ByteString -> ByteString -> OnDecodeErrorM m -> m (Text, ByteString) --- decodeUtf8With2 bs1@(B.length -> len1) bs2@(B.length -> len2) onErrM = runSTT $ do --- marr <- liftST $ A.new len' --- outer marr len' 0 0 --- where --- len = len1 + len2 --- len' = len + 4 - --- index i --- | i < len1 = B.index bs1 i --- | otherwise = B.index bs2 (i - len1) - --- -- We need Data.ByteString.findIndexEnd, but it is unavailable before bytestring-0.10.12.0 --- guessUtf8Boundary :: Int --- guessUtf8Boundary --- | len2 >= 1 && w0 < 0x80 = len2 -- last char is ASCII --- | len2 >= 1 && w0 >= 0xC0 = len2 - 1 -- last char starts a code point --- | len2 >= 2 && w1 >= 0xC0 = len2 - 2 -- pre-last char starts a code point --- | len2 >= 3 && w2 >= 0xC0 = len2 - 3 --- | len2 >= 4 && w3 >= 0xC0 = len2 - 4 --- | otherwise = 0 --- where --- w0 = B.index bs2 (len2 - 1) --- w1 = B.index bs2 (len2 - 2) --- w2 = B.index bs2 (len2 - 3) --- w3 = B.index bs2 (len2 - 4) - --- decodeFrom :: Int -> DecoderResult --- decodeFrom off = step (off + 1) (utf8DecodeStart (index off)) --- where --- step i (Incomplete a b) --- | i < len = step (i + 1) (utf8DecodeContinue (index i) a b) --- step _ st = st - --- outer dst dstLen = inner --- where --- inner srcOff dstOff --- | srcOff >= len = liftST $ do --- A.shrinkM dst dstOff --- arr <- A.unsafeFreeze dst --- return (Text arr 0 dstOff, mempty) - --- | srcOff >= len1 --- , srcOff < len1 + guessUtf8Boundary --- , dstOff + (len1 + guessUtf8Boundary - srcOff) <= dstLen --- , bs <- B.drop (srcOff - len1) (B.take guessUtf8Boundary bs2) --- , isValidBS bs = do --- liftST . withBS bs $ \fp _ -> unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> --- unsafeSTToIO $ A.copyFromPointer dst dstOff src (len1 + guessUtf8Boundary - srcOff) --- inner (len1 + guessUtf8Boundary) (dstOff + (len1 + guessUtf8Boundary - srcOff)) - --- | dstOff + 4 > dstLen = do --- let dstLen' = dstLen + 4 --- dst' <- liftST $ A.resizeM dst dstLen' --- outer dst' dstLen' srcOff dstOff - --- | otherwise = case decodeFrom srcOff of --- Accept c -> do --- d <- liftST $ unsafeWrite dst dstOff c --- inner (srcOff + d) (dstOff + d) --- Reject -> do --- res <- lift $ onErrM desc (Just (index srcOff)) --- case res of --- Nothing -> inner (srcOff + 1) dstOff --- Just c -> do --- d <- liftST $ unsafeWrite dst dstOff (safe c) --- inner (srcOff + 1) (dstOff + d) --- Incomplete{} -> liftST $ do --- A.shrinkM dst dstOff --- arr <- A.unsafeFreeze dst --- let bs = if srcOff >= len1 --- then B.drop (srcOff - len1) bs2 --- else B.drop srcOff (bs1 `B.append` bs2) --- return (Text arr 0 dstOff, bs) - --- desc = "Data.Text.Internal.Encoding: Invalid UTF-8 stream" - unST :: ST s a -> STRep s a unST (ST st) = st From 4dbf4933880c8a084244cfa808f73862eb8c19e5 Mon Sep 17 00:00:00 2001 From: "David M. Sledge" Date: Mon, 27 Jun 2022 20:59:04 -0600 Subject: [PATCH 04/87] change source module for runRW# --- src/Data/Text/Encoding.hs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index 16f50d0b..1faaaf61 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -93,7 +93,8 @@ import Data.Word (Word8) import Foreign.C.Types (CSize(..)) import Foreign.Ptr (Ptr, minusPtr, plusPtr) import Foreign.Storable (poke, peekByteOff) -import GHC.Exts (byteArrayContents#, runRW#, unsafeCoerce#) +import GHC.Exts (byteArrayContents#, unsafeCoerce#) +import GHC.Magic (runRW#) import GHC.ForeignPtr (ForeignPtr(..), ForeignPtrContents(PlainPtr)) import qualified Data.ByteString.Builder as B import qualified Data.ByteString.Builder.Internal as B hiding (empty, append) @@ -331,6 +332,7 @@ decodeUtf8With2 bs1@(B.length -> len1) bs2@(B.length -> len2) onErrM = do in pure (Text arr 0 dstOff, bs) desc = "Data.Text.Internal.Encoding: Invalid UTF-8 stream" +{-# NOINLINE decodeUtf8With2 #-} -- $stream -- From c052ad64d936161116288a993f1b97c7a0c2c129 Mon Sep 17 00:00:00 2001 From: "David M. Sledge" Date: Thu, 30 Jun 2022 23:11:47 -0600 Subject: [PATCH 05/87] Added byte position (Int) argument to DecodeErrorM type. --- src/Data/Text/Encoding.hs | 215 +++++++++++++------------- src/Data/Text/Encoding/Error.hs | 13 +- src/Data/Text/Lazy/Encoding.hs | 46 ++++-- tests/Tests/Properties/Transcoding.hs | 85 +++++++--- text.cabal | 4 +- 5 files changed, 205 insertions(+), 158 deletions(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index 1faaaf61..1daec591 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -36,6 +36,7 @@ module Data.Text.Encoding -- *** Controllable error handling , decodeUtf8With + , decodeUtf8WithM , decodeUtf16LEWith , decodeUtf16BEWith , decodeUtf32LEWith @@ -43,6 +44,7 @@ module Data.Text.Encoding -- *** Stream oriented decoding -- $stream + , streamDecodeUtf8 , streamDecodeUtf8With , streamDecodeUtf8WithM , Decoding(..) @@ -56,9 +58,6 @@ module Data.Text.Encoding , decodeUtf32LE , decodeUtf32BE - -- *** Stream oriented decoding - , streamDecodeUtf8 - -- * Encoding Text to ByteStrings , encodeUtf8 , encodeUtf16LE @@ -81,7 +80,6 @@ import Data.ByteString (ByteString) import qualified Data.ByteString as B import qualified Data.ByteString.Internal as B import qualified Data.ByteString.Short.Internal as SBS -import Data.Function ((&)) import Data.Functor.Identity (Identity(..), runIdentity) import Data.Text.Encoding.Error (OnDecodeError, OnDecodeErrorM, UnicodeException, strictDecode, lenientDecode) import Data.Text.Internal (Text(..), safe, empty, append) @@ -231,108 +229,31 @@ decodeUtf8With :: HasCallStack => #endif OnDecodeError -> ByteString -> Text -decodeUtf8With onErr bs - | isValidBS bs = - let !(SBS.SBS arr) = SBS.toShort bs in - (Text (A.ByteArray arr) 0 (B.length bs)) - | B.null undecoded = txt - | otherwise = txt `append` (case onErr desc (Just (B.head undecoded)) of - Nothing -> txt' - Just c -> T.singleton c `append` txt') - where - (txt, undecoded) = runIdentity . decodeUtf8With2 mempty bs $ \ desc' mWord8 -> - Identity $ onErr desc' mWord8 - txt' = decodeUtf8With onErr (B.tail undecoded) - desc = "Data.Text.Internal.Encoding: Invalid UTF-8 stream" +decodeUtf8With onErr bs = + runIdentity $ decodeUtf8WithM bs (\ desc' _ mWord8 -> Identity $ onErr desc' mWord8) -unST :: ST s a -> STRep s a -unST (ST st) = st - -decodeUtf8With2 :: +decodeUtf8WithM :: #if defined(ASSERTS) HasCallStack => #endif Monad m => - ByteString - -> ByteString - -> OnDecodeErrorM m - -> m (Text, ByteString) -decodeUtf8With2 bs1@(B.length -> len1) bs2@(B.length -> len2) onErrM = do - outer marr len' 0 0 s# + ByteString -> OnDecodeErrorM m -> m Text +decodeUtf8WithM bs onErrM = + streamDecodeUtf8WithM bs onErrM (\ t bs' bytePos _ -> + fix (\ f bp bs'' t' -> + case B.uncons bs'' of + Just (word8, bs''') -> do + mC <- onErrM desc bp $ Just word8 + f (bp + 1) bs''' $ case mC of + Just c -> t' `append` T.singleton c + _ -> t' + _ -> pure t') bytePos bs' t + ) where - !(# s#, marr #) = runRW# (unST $ A.new len') - len = len1 + len2 - len' = len + 4 - - index i - | i < len1 = B.index bs1 i - | otherwise = B.index bs2 (i - len1) - - -- We need Data.ByteString.findIndexEnd, but it is unavailable before bytestring-0.10.12.0 - guessUtf8Boundary :: Int - guessUtf8Boundary - | len2 >= 1 && w0 < 0x80 = len2 -- last char is ASCII - | len2 >= 1 && w0 >= 0xC0 = len2 - 1 -- last char starts a code point - | len2 >= 2 && w1 >= 0xC0 = len2 - 2 -- pre-last char starts a code point - | len2 >= 3 && w2 >= 0xC0 = len2 - 3 - | len2 >= 4 && w3 >= 0xC0 = len2 - 4 - | otherwise = 0 - where - w0 = B.index bs2 (len2 - 1) - w1 = B.index bs2 (len2 - 2) - w2 = B.index bs2 (len2 - 3) - w3 = B.index bs2 (len2 - 4) - - decodeFrom :: Int -> DecoderResult - decodeFrom off = step (off + 1) (utf8DecodeStart (index off)) - where - step i (Incomplete a b) - | i < len = step (i + 1) (utf8DecodeContinue (index i) a b) - step _ st = st - - outer dst dstLen = inner - where - inner srcOff dstOff s'# - | srcOff >= len = - let !(# s''#, _ #) = (unST $ A.shrinkM dst dstOff) s'# - !(# _, arr #) = (unST $ A.unsafeFreeze dst) s''# - in pure (Text arr 0 dstOff, mempty) - - | srcOff >= len1 - , srcOff < len1 + guessUtf8Boundary - , dstOff + (len1 + guessUtf8Boundary - srcOff) <= dstLen - , bs <- B.drop (srcOff - len1) (B.take guessUtf8Boundary bs2) - , isValidBS bs = - let !(# s''#, _ #) = unST (withBS bs $ \fp _ -> unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> - unsafeSTToIO $ A.copyFromPointer dst dstOff src (len1 + guessUtf8Boundary - srcOff)) s'# - in inner (len1 + guessUtf8Boundary) (dstOff + (len1 + guessUtf8Boundary - srcOff)) s''# - - | dstOff + 4 > dstLen = - let dstLen' = dstLen + 4 - !(# s''#, dst' #) = unST (A.resizeM dst dstLen') s'# - in outer dst' dstLen' srcOff dstOff s''# - - | otherwise = case decodeFrom srcOff of - Accept c -> - let !(# s''#, d #) = unST (unsafeWrite dst dstOff c) s'# - in inner (srcOff + d) (dstOff + d) s''# - Reject -> do - res <- onErrM desc (Just (index srcOff)) - case res of - Nothing -> inner (srcOff + 1) dstOff s'# - Just c -> - let !(# s''#, d #) = unST (unsafeWrite dst dstOff (safe c)) s'# in - inner (srcOff + 1) (dstOff + d) s''# - Incomplete{} -> - let !(# s''#, _ #) = unST (A.shrinkM dst dstOff) s'# - !(# _, arr #) = unST (A.unsafeFreeze dst) s''# - bs = if srcOff >= len1 - then B.drop (srcOff - len1) bs2 - else B.drop srcOff (bs1 `B.append` bs2) - in pure (Text arr 0 dstOff, bs) - desc = "Data.Text.Internal.Encoding: Invalid UTF-8 stream" -{-# NOINLINE decodeUtf8With2 #-} + +unST :: ST s a -> STRep s a +unST (ST st) = st -- $stream -- @@ -396,12 +317,90 @@ streamDecodeUtf8WithM :: Monad m => ByteString -> OnDecodeErrorM m - -> (Text -> ByteString -> (ByteString -> m b) -> m b) + -> (Text -> ByteString -> Int -> (ByteString -> m b) -> m b) -> m b -streamDecodeUtf8WithM bstr onErrM f = fix (\ go bs1 bs2 -> do - (txt, undecoded) <- decodeUtf8With2 bs1 bs2 onErrM - f txt undecoded (go undecoded) - ) mempty bstr +streamDecodeUtf8WithM bstr onErrM f = fix (\ go bp bs1 bs2 -> do + (txt, (undecoded, bytePos)) <- decodeUtf8With2 bs1 bs2 (\ desc bp'' mWord8 -> onErrM desc (bp'' + bp) mWord8) + let bp' = bytePos + bp + f txt undecoded bp' $ go bp' undecoded + ) 0 mempty bstr + where + decodeUtf8With2 bs1@(B.length -> len1) bs2@(B.length -> len2) onErrM' = do + case runRW# $ unST (A.new len') of + (# s#, marr #) -> outer marr len' 0 0 s# + where + len = len1 + len2 + len' = len + 4 + + index i + | i < len1 = B.index bs1 i + | otherwise = B.index bs2 $ i - len1 + + -- We need Data.ByteString.findIndexEnd, but it is unavailable before bytestring-0.10.12.0 + guessUtf8Boundary :: Int + guessUtf8Boundary + | len2 >= 1 && w0 < 0x80 = len2 -- last char is ASCII + | len2 >= 1 && w0 >= 0xC0 = len2 - 1 -- last char starts a code point + | len2 >= 2 && w1 >= 0xC0 = len2 - 2 -- pre-last char starts a code point + | len2 >= 3 && w2 >= 0xC0 = len2 - 3 + | len2 >= 4 && w3 >= 0xC0 = len2 - 4 + | otherwise = 0 + where + w0 = B.index bs2 $ len2 - 1 + w1 = B.index bs2 $ len2 - 2 + w2 = B.index bs2 $ len2 - 3 + w3 = B.index bs2 $ len2 - 4 + + decodeFrom :: Int -> DecoderResult + decodeFrom off = step (off + 1) . utf8DecodeStart $ index off + where + step i (Incomplete a b) + | i < len = step (i + 1) $ utf8DecodeContinue (index i) a b + step _ st = st + + outer dst dstLen = inner + where + inner srcOff dstOff s'# + | srcOff >= len = + case unST (A.shrinkM dst dstOff) s'# of + (# s''#, _ #) -> case unST (A.unsafeFreeze dst) s''# of + (# _, arr #) -> pure (Text arr 0 dstOff, (mempty, srcOff)) + + | srcOff >= len1 + , srcOff < len1 + guessUtf8Boundary + , dstOff + (len1 + guessUtf8Boundary - srcOff) <= dstLen + , bs <- B.drop (srcOff - len1) (B.take guessUtf8Boundary bs2) + , isValidBS bs = + case unST (withBS bs $ \fp _ -> unsafeIOToST . unsafeWithForeignPtr fp $ \src -> + unsafeSTToIO . A.copyFromPointer dst dstOff src $ len1 + guessUtf8Boundary - srcOff) s'# of + (# s''#, _ #) -> inner (len1 + guessUtf8Boundary) (dstOff + (len1 + guessUtf8Boundary - srcOff)) s''# + + | dstOff + 4 > dstLen = + let dstLen' = dstLen + 4 in + case unST (A.resizeM dst dstLen') s'# of + !(# s''#, dst' #) -> outer dst' dstLen' srcOff dstOff s''# + + | otherwise = case decodeFrom srcOff of + Accept c -> + case unST (unsafeWrite dst dstOff c) s'# of + (# s''#, d #) -> inner (srcOff + d) (dstOff + d) s''# + Reject -> do + res <- onErrM' desc srcOff . Just $ index srcOff + case res of + Nothing -> inner (srcOff + 1) dstOff s'# + Just c -> + case unST (unsafeWrite dst dstOff $ safe c) s'# of + (# s''#, d #) -> inner (srcOff + 1) (dstOff + d) s''# + Incomplete{} -> + case unST (A.shrinkM dst dstOff) s'# of + (# s''#, _ #) -> case unST (A.unsafeFreeze dst) s''# of + (# _, arr #) -> + let bs = if srcOff >= len1 + then B.drop (srcOff - len1) bs2 + else B.drop srcOff $ bs1 `B.append` bs2 in + pure (Text arr 0 dstOff, (bs, srcOff)) + + desc = "Data.Text.Internal.Encoding: Invalid UTF-8 stream" -- | A stream oriented decoding result. -- @@ -440,11 +439,9 @@ streamDecodeUtf8With :: HasCallStack => #endif OnDecodeError -> ByteString -> Decoding -streamDecodeUtf8With onErr = fix (\ go bs1 bs2 -> - runIdentity (decodeUtf8With2 bs1 bs2 (\ desc' mWord8 -> - Identity $ onErr desc' mWord8)) & (\ (txt, undecoded) -> - Some txt undecoded (go undecoded)) - ) mempty +streamDecodeUtf8With onErr bs = runIdentity $ streamDecodeUtf8WithM bs + (\ desc _ mWord8 -> Identity $ onErr desc mWord8) + (\ txt undecoded _ f -> Identity . Some txt undecoded $ runIdentity . f) -- | Decode a 'ByteString' containing UTF-8 encoded text that is known -- to be valid. diff --git a/src/Data/Text/Encoding/Error.hs b/src/Data/Text/Encoding/Error.hs index 99226cb1..bec2fd55 100644 --- a/src/Data/Text/Encoding/Error.hs +++ b/src/Data/Text/Encoding/Error.hs @@ -60,18 +60,13 @@ import Numeric (showHex) -- report. type OnError a b = String -> Maybe a -> Maybe b --- | Similar to 'OnError' but in a monadic context which allows --- additional capabilities. Depending on the monad(s) used, these --- capabilities include logging decode errors (IO and/or logger monads) --- and abort processessing without the need to use 'error' or 'throw' --- (continuation monad). -type OnErrorM a m b = String -> Maybe a -> m (Maybe b) - -- | A handler for a decoding error. type OnDecodeError = OnError Word8 Char --- | A monadic handler for a decoding error. -type OnDecodeErrorM m = OnErrorM Word8 m Char +-- | A monadic handler for a decoding error. With certain monads such as +-- 'Maybe', 'Either', and 'Cont', processessing can be abandoned without +-- the need to use 'error' or 'throw'. +type OnDecodeErrorM m = String -> Int -> Maybe Word8 -> m (Maybe Char) -- | A handler for an encoding error. {-# DEPRECATED OnEncodeError "This exception is never used in practice, and will be removed." #-} diff --git a/src/Data/Text/Lazy/Encoding.hs b/src/Data/Text/Lazy/Encoding.hs index ad361af5..7f6baa3b 100644 --- a/src/Data/Text/Lazy/Encoding.hs +++ b/src/Data/Text/Lazy/Encoding.hs @@ -29,6 +29,7 @@ module Data.Text.Lazy.Encoding -- *** Controllable error handling , decodeUtf8With + , decodeUtf8WithM , decodeUtf16LEWith , decodeUtf16BEWith , decodeUtf32LEWith @@ -56,8 +57,12 @@ module Data.Text.Lazy.Encoding ) where import Control.Exception (evaluate, try) +import Control.Monad.Fix (fix) +import Control.Monad.State (evalStateT, get, state) +import Control.Monad.Trans.Class (lift) +import Data.Functor.Identity (Identity(..)) import Data.Monoid (Monoid(..)) -import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode) +import Data.Text.Encoding.Error (OnDecodeError, OnDecodeErrorM, UnicodeException, strictDecode) import Data.Text.Internal.Lazy (Text(..), chunk, empty, foldrChunks) import Data.Word (Word8) import qualified Data.ByteString as S @@ -106,24 +111,31 @@ decodeLatin1 = foldr (chunk . TE.decodeLatin1) empty . B.toChunks -- | Decode a 'ByteString' containing UTF-8 encoded text. decodeUtf8With :: OnDecodeError -> B.ByteString -> Text -decodeUtf8With onErr (B.Chunk b0 bs0) = - case TE.streamDecodeUtf8With onErr b0 of - TE.Some t l f -> chunk t (go f l bs0) +decodeUtf8With onErr = runIdentity . flip decodeUtf8WithM (const . (Identity .) . onErr) + +-- | Decode a 'ByteString' containing UTF-8 encoded text in a monad context. +decodeUtf8WithM :: Monad m => B.ByteString -> OnDecodeErrorM m -> m Text +decodeUtf8WithM (B.Chunk sb lb) onErrM = + evalStateT (TE.streamDecodeUtf8WithM sb (((lift .) .) . onErrM) (\ t cp bp f -> do + (lb', diff) <- get + case lb' of + B.Chunk sb' lb'' -> do + state $ const ((), (lb'', diff . chunk t)) + f sb' + _ -> do + t'' <- lift $ fix (\ f' bp' cp' t' -> + case S.uncons cp' of + Just (word8, cp'') -> do + mC <- onErrM desc bp' $ Just word8 + f' (bp' + 1) cp'' $ case mC of + Just c -> t' `T.snoc` c + _ -> t' + _ -> pure t') bp cp t + pure . diff $ Chunk t'' Empty + )) (lb, id) where - go f0 _ (B.Chunk b bs) = - case f0 b of - TE.Some t l f -> chunk t (go f l bs) - go _ l _ - | S.null l = empty - | otherwise = - let !t = T.pack (skipBytes l) - skipBytes = S.foldr (\x s' -> - case onErr desc (Just x) of - Just c -> c : s' - Nothing -> s') [] in - Chunk t Empty desc = "Data.Text.Lazy.Encoding.decodeUtf8With: Invalid UTF-8 stream" -decodeUtf8With _ _ = empty +decodeUtf8WithM _ _ = pure empty -- | Decode a 'ByteString' containing UTF-8 encoded text that is known -- to be valid. diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs index 2d98af16..5e67f52a 100644 --- a/tests/Tests/Properties/Transcoding.hs +++ b/tests/Tests/Properties/Transcoding.hs @@ -6,7 +6,7 @@ module Tests.Properties.Transcoding ( testTranscoding ) where -import Control.Concurrent.MVar (newEmptyMVar, tryPutMVar, tryTakeMVar) +import Control.Concurrent.MVar (newEmptyMVar, newMVar, tryPutMVar, tryTakeMVar) import Control.Monad.Cont (cont, runCont) import Data.Bits ((.&.), shiftR) import Data.Char (chr, ord) @@ -203,40 +203,75 @@ t_decode_with_error3' = t_decode_with_error4' = case E.streamDecodeUtf8With (\_ _ -> Just 'x') (B.pack [0xC2, 97, 97, 97]) of E.Some x _ _ -> x === "xaaa" -t_decode_with_error2'' = - runIdentity $ E.streamDecodeUtf8WithM (B.pack [0xC2, 97]) (\ _ _ -> pure $ Just 'x') - (\ x _ _ -> pure $ x === "xa") -t_decode_with_error3'' = - runIdentity $ E.streamDecodeUtf8WithM (B.pack [0xC2, 97, 97]) (\ _ _ -> pure $ Just 'x') - (\ x _ _ -> pure $ x === "xaa") -t_decode_with_error4'' = - runIdentity $ E.streamDecodeUtf8WithM (B.pack [0xC2, 97, 97, 97]) (\ _ _ -> pure $ Just 'x') - (\ x _ _ -> pure $ x === "xaaa") t_decode_with_error5' = ioProperty $ do ret <- Exception.try $ Exception.evaluate $ E.streamDecodeUtf8 (B.pack [0x81]) pure $ case ret of Left (_ :: E.UnicodeException) -> True Right{} -> False +t_decode_withM_error1 = + runIdentity $ E.streamDecodeUtf8WithM (B.pack [0xC2, 97]) (\ _ _ _ -> pure $ Just 'x') + (\ x _ _ _ -> pure $ x === "xa") +t_decode_withM_error2 = + runIdentity $ E.streamDecodeUtf8WithM (B.pack [0xC2, 97, 97]) (\ _ _ _ -> pure $ Just 'x') + (\ x _ _ _ -> pure $ x === "xaa") +t_decode_withM_error3 = + runIdentity $ E.streamDecodeUtf8WithM (B.pack [0xC2, 97, 97, 97]) (\ _ _ _ -> pure $ Just 'x') + (\ x _ _ _ -> pure $ x === "xaaa") -- log error test -t_decode_with_error5'' = +t_decode_withM_error4 = ioProperty $ do mVar <- newEmptyMVar E.streamDecodeUtf8WithM (B.pack [0xC2, 97, 97, 97]) - (\desc _ -> do + (\ desc _ _ -> do res <- tryPutMVar mVar desc pure . Just $ if res then 'x' else 'y') - (\ x _ _ -> do + (\ x _ _ _ -> do mDesc <- tryTakeMVar mVar pure $ x == "xaaa" && isJust mDesc) --- test case demonstrating how to stop processing without 'throw' or 'error' -t_decode_with_error6'' = +-- test case demonstrating how to stop processing without 'throw' or 'error' using Either +t_decode_withM_error5 = + case do + E.streamDecodeUtf8WithM (B.pack [0xC2, 97, 97, 97]) + (\ _ errPos _ -> Left errPos) + (\ x _ _ _ -> pure x) of + Left pos -> pos == 0 + Right _ -> False +-- test case demonstrating how to stop processing without 'throw' or 'error' using Maybe +t_decode_withM_error6 = + case do + E.streamDecodeUtf8WithM (B.pack [0xC2, 97, 97, 97]) + (\ _ _ _ -> Nothing) + (\ x _ _ _ -> pure x) of + Just _ -> False + _ -> True +-- test case demonstrating how to stop processing without 'throw' or 'error' using the continuation monad +t_decode_withM_error7 = runCont (do E.streamDecodeUtf8WithM (B.pack [0xC2, 97, 97, 97]) -- early exit - (\ _ _ -> cont $ \ _ -> True) + (\ _ _ _ -> cont $ \ _ -> True) -- this should not be executed - (\ _ _ _ -> pure False) + (\ _ _ _ _ -> pure False) ) id +-- feed more data into the stream using the continuation +t_decode_withM_error8 = + ioProperty $ do + mVar <- newMVar $ B.pack [84, 101, 115, 116] -- 'T' 'e' 's' 't' + E.streamDecodeUtf8WithM (B.pack [0xC2, 97, 97, 97]) + (\ _ _ _ -> pure $ Just 'x') + (\ x _ _ f -> do + mBs <- tryTakeMVar mVar + case mBs of + Just bs -> f bs + _ -> pure $ x == "Test") +tl_decode_withM_error1 = + ioProperty (do + x <- EL.decodeUtf8WithM (BL.pack [0xC2, 97]) (\ _ _ _ -> pure $ Just 'x') + pure $ x == "xa") +tl_decode_withM_error2 = + runIdentity (EL.decodeUtf8WithM (BL.pack [0xE0, 97, 97]) (\ _ _ _ -> pure $ Just 'x')) === "xaa" +tl_decode_withM_error3 = + runIdentity (EL.decodeUtf8WithM (BL.pack [0xF0, 97, 97, 97]) (\ _ _ _ -> pure $ Just 'x')) === "xaaa" t_infix_concat bs1 text bs2 = forAll (Blind <$> genDecodeErr Replace) $ \(Blind onErr) -> @@ -283,12 +318,18 @@ testTranscoding = testProperty "t_decode_with_error2'" t_decode_with_error2', testProperty "t_decode_with_error3'" t_decode_with_error3', testProperty "t_decode_with_error4'" t_decode_with_error4', - testProperty "t_decode_with_error2''" t_decode_with_error2'', - testProperty "t_decode_with_error3''" t_decode_with_error3'', - testProperty "t_decode_with_error4''" t_decode_with_error4'', + testProperty "t_decode_withM_error1" t_decode_withM_error1, + testProperty "t_decode_withM_error2" t_decode_withM_error2, + testProperty "t_decode_withM_error3" t_decode_withM_error3, testProperty "t_decode_with_error5'" t_decode_with_error5', - testProperty "t_decode_with_error5''" t_decode_with_error5'', - testProperty "t_decode_with_error6''" t_decode_with_error6'', + testProperty "t_decode_withM_error4" t_decode_withM_error4, + testProperty "t_decode_withM_error5" t_decode_withM_error5, + testProperty "t_decode_withM_error6" t_decode_withM_error6, + testProperty "t_decode_withM_error7" t_decode_withM_error7, + testProperty "t_decode_withM_error8" t_decode_withM_error8, + testProperty "tl_decode_withM_error1" tl_decode_withM_error1, + testProperty "tl_decode_withM_error2" tl_decode_withM_error2, + testProperty "tl_decode_withM_error3" tl_decode_withM_error3, testProperty "t_infix_concat" t_infix_concat ] ] diff --git a/text.cabal b/text.cabal index 5231dbac..9ff632d1 100644 --- a/text.cabal +++ b/text.cabal @@ -187,7 +187,9 @@ library bytestring >= 0.10.4 && < 0.12, deepseq >= 1.1 && < 1.5, ghc-prim >= 0.2 && < 0.10, - template-haskell >= 2.5 && < 2.20 + mtl, + template-haskell >= 2.5 && < 2.20, + transformers ghc-options: -Wall -fwarn-tabs -funbox-strict-fields -O2 if flag(developer) From 2e2c5419e0ea250809b71870dc3caa894bc79605 Mon Sep 17 00:00:00 2001 From: "David M. Sledge" Date: Fri, 1 Jul 2022 19:04:49 -0600 Subject: [PATCH 06/87] make streamDecodeUtf8WithM safe with Cont and List monads --- src/Data/Text/Encoding.hs | 59 +++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 27 deletions(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index 1daec591..3596bbb4 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -146,16 +146,13 @@ import GHC.ST (ST(..), STRep) -- anything except ASCII and copies buffer or throws an error otherwise. -- decodeASCII :: ByteString -> Text -decodeASCII bs = withBS bs $ \fp len -> - if len == 0 - then empty - else runST $ do - asciiPrefixLen <- fmap cSizeToInt . unsafeIOToST . unsafeWithForeignPtr fp $ \src -> - c_is_ascii src (src `plusPtr` len) - if asciiPrefixLen == len - then let !(SBS.SBS arr) = SBS.toShort bs in - return (Text (A.ByteArray arr) 0 len) - else error $ "decodeASCII: detected non-ASCII codepoint at " ++ show asciiPrefixLen +decodeASCII bs = withBS bs $ \fp len -> if len == 0 then empty else runST $ do + asciiPrefixLen <- fmap cSizeToInt $ unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> + c_is_ascii src (src `plusPtr` len) + if asciiPrefixLen == len + then let !(SBS.SBS arr) = SBS.toShort bs in + return (Text (A.ByteArray arr) 0 len) + else error $ "decodeASCII: detected non-ASCII codepoint at " ++ show asciiPrefixLen -- | Decode a 'ByteString' containing Latin-1 (aka ISO-8859-1) encoded text. -- @@ -360,10 +357,10 @@ streamDecodeUtf8WithM bstr onErrM f = fix (\ go bp bs1 bs2 -> do outer dst dstLen = inner where - inner srcOff dstOff s'# + inner srcOff dstOff s0# | srcOff >= len = - case unST (A.shrinkM dst dstOff) s'# of - (# s''#, _ #) -> case unST (A.unsafeFreeze dst) s''# of + case unST (A.shrinkM dst dstOff) s0# of + (# s1#, _ #) -> case unST (A.unsafeFreeze dst) s1# of (# _, arr #) -> pure (Text arr 0 dstOff, (mempty, srcOff)) | srcOff >= len1 @@ -372,28 +369,36 @@ streamDecodeUtf8WithM bstr onErrM f = fix (\ go bp bs1 bs2 -> do , bs <- B.drop (srcOff - len1) (B.take guessUtf8Boundary bs2) , isValidBS bs = case unST (withBS bs $ \fp _ -> unsafeIOToST . unsafeWithForeignPtr fp $ \src -> - unsafeSTToIO . A.copyFromPointer dst dstOff src $ len1 + guessUtf8Boundary - srcOff) s'# of - (# s''#, _ #) -> inner (len1 + guessUtf8Boundary) (dstOff + (len1 + guessUtf8Boundary - srcOff)) s''# + unsafeSTToIO . A.copyFromPointer dst dstOff src $ len1 + guessUtf8Boundary - srcOff) s0# of + (# s1#, _ #) -> inner (len1 + guessUtf8Boundary) (dstOff + (len1 + guessUtf8Boundary - srcOff)) s1# | dstOff + 4 > dstLen = let dstLen' = dstLen + 4 in - case unST (A.resizeM dst dstLen') s'# of - !(# s''#, dst' #) -> outer dst' dstLen' srcOff dstOff s''# + case unST (A.resizeM dst dstLen') s0# of + !(# s1#, dst' #) -> outer dst' dstLen' srcOff dstOff s1# | otherwise = case decodeFrom srcOff of Accept c -> - case unST (unsafeWrite dst dstOff c) s'# of - (# s''#, d #) -> inner (srcOff + d) (dstOff + d) s''# + case unST (unsafeWrite dst dstOff c) s0# of + (# s1#, d #) -> inner (srcOff + d) (dstOff + d) s1# Reject -> do - res <- onErrM' desc srcOff . Just $ index srcOff - case res of - Nothing -> inner (srcOff + 1) dstOff s'# - Just c -> - case unST (unsafeWrite dst dstOff $ safe c) s'# of - (# s''#, d #) -> inner (srcOff + 1) (dstOff + d) s''# + -- gonna call this text array done + case unST (A.shrinkM dst dstOff) s0# of + -- might not be necessary + (# s1#, _ #) -> case unST (A.unsafeFreeze dst) s1# of + (# _, arr #) -> do + res <- onErrM' desc srcOff . Just $ index srcOff + -- continue on with a copy of the text array + case runRW# (unST $ A.new dstLen) of + (# s2#, dst' #) -> case unST (A.copyI dstOff dst' 0 arr 0) s2# of + (# s3#, _ #) -> case res of + Just c -> + case unST (unsafeWrite dst' dstOff $ safe c) s3# of + (# s4#, d #) -> outer dst' dstLen (srcOff + 1) (dstOff + d) s4# + _ -> outer dst' dstLen (srcOff + 1) dstOff s3# Incomplete{} -> - case unST (A.shrinkM dst dstOff) s'# of - (# s''#, _ #) -> case unST (A.unsafeFreeze dst) s''# of + case unST (A.shrinkM dst dstOff) s0# of + (# s1#, _ #) -> case unST (A.unsafeFreeze dst) s1# of (# _, arr #) -> let bs = if srcOff >= len1 then B.drop (srcOff - len1) bs2 From 41492dc573e2ac5460748adef1fd0ba52e1b7e81 Mon Sep 17 00:00:00 2001 From: "David M. Sledge" Date: Sat, 2 Jul 2022 23:07:32 -0600 Subject: [PATCH 07/87] Added StreamDecode which is like Decoding with more options. Added streamDecodeUtf8With' to take advantage of StremDecode. --- src/Data/Text/Encoding.hs | 276 +++++++++++++------------- src/Data/Text/Encoding/Error.hs | 8 +- src/Data/Text/Lazy/Encoding.hs | 48 ++--- tests/Tests/Properties/Transcoding.hs | 123 ++++-------- text.cabal | 4 +- 5 files changed, 198 insertions(+), 261 deletions(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index 3596bbb4..a4e01247 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -1,5 +1,5 @@ {-# LANGUAGE BangPatterns, CPP, GeneralizedNewtypeDeriving, MagicHash, - UnliftedFFITypes, UnboxedTuples #-} + UnliftedFFITypes #-} {-# LANGUAGE Trustworthy #-} {-# LANGUAGE TypeApplications #-} {-# LANGUAGE ScopedTypeVariables #-} @@ -36,7 +36,6 @@ module Data.Text.Encoding -- *** Controllable error handling , decodeUtf8With - , decodeUtf8WithM , decodeUtf16LEWith , decodeUtf16BEWith , decodeUtf32LEWith @@ -46,8 +45,9 @@ module Data.Text.Encoding -- $stream , streamDecodeUtf8 , streamDecodeUtf8With - , streamDecodeUtf8WithM + , streamDecodeUtf8With' , Decoding(..) + , StreamDecode(..) -- ** Partial Functions -- $partial @@ -73,15 +73,13 @@ module Data.Text.Encoding import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) import Control.Exception (evaluate, try) -import Control.Monad.Fix (fix) -import Control.Monad.ST (runST) +import Control.Monad.ST (runST, ST) import Data.Bits (shiftR, (.&.)) import Data.ByteString (ByteString) import qualified Data.ByteString as B import qualified Data.ByteString.Internal as B import qualified Data.ByteString.Short.Internal as SBS -import Data.Functor.Identity (Identity(..), runIdentity) -import Data.Text.Encoding.Error (OnDecodeError, OnDecodeErrorM, UnicodeException, strictDecode, lenientDecode) +import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode) import Data.Text.Internal (Text(..), safe, empty, append) import Data.Text.Internal.Unsafe (unsafeWithForeignPtr) import Data.Text.Internal.Unsafe.Char (unsafeWrite) @@ -92,7 +90,6 @@ import Foreign.C.Types (CSize(..)) import Foreign.Ptr (Ptr, minusPtr, plusPtr) import Foreign.Storable (poke, peekByteOff) import GHC.Exts (byteArrayContents#, unsafeCoerce#) -import GHC.Magic (runRW#) import GHC.ForeignPtr (ForeignPtr(..), ForeignPtrContents(PlainPtr)) import qualified Data.ByteString.Builder as B import qualified Data.ByteString.Builder.Internal as B hiding (empty, append) @@ -113,7 +110,6 @@ import Foreign.C.Types (CInt(..)) import qualified Data.ByteString.Unsafe as B import Data.Text.Internal.Encoding.Utf8 (CodePoint(..)) #endif -import GHC.ST (ST(..), STRep) -- $strict -- @@ -226,37 +222,24 @@ decodeUtf8With :: HasCallStack => #endif OnDecodeError -> ByteString -> Text -decodeUtf8With onErr bs = - runIdentity $ decodeUtf8WithM bs (\ desc' _ mWord8 -> Identity $ onErr desc' mWord8) - -decodeUtf8WithM :: -#if defined(ASSERTS) - HasCallStack => -#endif - Monad m => - ByteString -> OnDecodeErrorM m -> m Text -decodeUtf8WithM bs onErrM = - streamDecodeUtf8WithM bs onErrM (\ t bs' bytePos _ -> - fix (\ f bp bs'' t' -> - case B.uncons bs'' of - Just (word8, bs''') -> do - mC <- onErrM desc bp $ Just word8 - f (bp + 1) bs''' $ case mC of - Just c -> t' `append` T.singleton c - _ -> t' - _ -> pure t') bytePos bs' t - ) +decodeUtf8With onErr bs = case streamDecodeUtf8With onErr bs of + Some t unencoded _ -> codePointToInvalid unencoded t where - desc = "Data.Text.Internal.Encoding: Invalid UTF-8 stream" + codePointToInvalid bs' txt = + case B.uncons bs' of + Just (x, bs'') -> codePointToInvalid bs'' $ case onErr desc $ Just x of + Just c -> append txt $ T.singleton c + _ -> txt + _ -> txt -unST :: ST s a -> STRep s a -unST (ST st) = st + desc = "Data.Text.Internal.Encoding: Incomplete UTF-8 code point" -- $stream -- --- The 'streamDecodeUtf8' and 'streamDecodeUtf8With' functions accept --- a 'ByteString' that represents a possibly incomplete input (e.g. a --- packet from a network stream) that may not end on a UTF-8 boundary. +-- The 'streamDecodeUtf8', 'streamDecodeUtf8With', and +-- 'streamDecodeUtf8With'' functions accept a 'ByteString' that +-- represents a possibly incomplete input (e.g. a packet from a network +-- stream) that may not end on a UTF-8 boundary. -- -- 1. The maximal prefix of 'Text' that could be decoded from the -- given input. @@ -305,108 +288,6 @@ unST (ST st) = st -- If given invalid input, an exception will be thrown by the function -- or continuation where it is encountered. --- | Decode, in a monadic- and stream-oriented way using CPS, a lazy --- 'ByteString' containing UTF-8 encoded text. -streamDecodeUtf8WithM :: -#if defined(ASSERTS) - HasCallStack => -#endif - Monad m => - ByteString - -> OnDecodeErrorM m - -> (Text -> ByteString -> Int -> (ByteString -> m b) -> m b) - -> m b -streamDecodeUtf8WithM bstr onErrM f = fix (\ go bp bs1 bs2 -> do - (txt, (undecoded, bytePos)) <- decodeUtf8With2 bs1 bs2 (\ desc bp'' mWord8 -> onErrM desc (bp'' + bp) mWord8) - let bp' = bytePos + bp - f txt undecoded bp' $ go bp' undecoded - ) 0 mempty bstr - where - decodeUtf8With2 bs1@(B.length -> len1) bs2@(B.length -> len2) onErrM' = do - case runRW# $ unST (A.new len') of - (# s#, marr #) -> outer marr len' 0 0 s# - where - len = len1 + len2 - len' = len + 4 - - index i - | i < len1 = B.index bs1 i - | otherwise = B.index bs2 $ i - len1 - - -- We need Data.ByteString.findIndexEnd, but it is unavailable before bytestring-0.10.12.0 - guessUtf8Boundary :: Int - guessUtf8Boundary - | len2 >= 1 && w0 < 0x80 = len2 -- last char is ASCII - | len2 >= 1 && w0 >= 0xC0 = len2 - 1 -- last char starts a code point - | len2 >= 2 && w1 >= 0xC0 = len2 - 2 -- pre-last char starts a code point - | len2 >= 3 && w2 >= 0xC0 = len2 - 3 - | len2 >= 4 && w3 >= 0xC0 = len2 - 4 - | otherwise = 0 - where - w0 = B.index bs2 $ len2 - 1 - w1 = B.index bs2 $ len2 - 2 - w2 = B.index bs2 $ len2 - 3 - w3 = B.index bs2 $ len2 - 4 - - decodeFrom :: Int -> DecoderResult - decodeFrom off = step (off + 1) . utf8DecodeStart $ index off - where - step i (Incomplete a b) - | i < len = step (i + 1) $ utf8DecodeContinue (index i) a b - step _ st = st - - outer dst dstLen = inner - where - inner srcOff dstOff s0# - | srcOff >= len = - case unST (A.shrinkM dst dstOff) s0# of - (# s1#, _ #) -> case unST (A.unsafeFreeze dst) s1# of - (# _, arr #) -> pure (Text arr 0 dstOff, (mempty, srcOff)) - - | srcOff >= len1 - , srcOff < len1 + guessUtf8Boundary - , dstOff + (len1 + guessUtf8Boundary - srcOff) <= dstLen - , bs <- B.drop (srcOff - len1) (B.take guessUtf8Boundary bs2) - , isValidBS bs = - case unST (withBS bs $ \fp _ -> unsafeIOToST . unsafeWithForeignPtr fp $ \src -> - unsafeSTToIO . A.copyFromPointer dst dstOff src $ len1 + guessUtf8Boundary - srcOff) s0# of - (# s1#, _ #) -> inner (len1 + guessUtf8Boundary) (dstOff + (len1 + guessUtf8Boundary - srcOff)) s1# - - | dstOff + 4 > dstLen = - let dstLen' = dstLen + 4 in - case unST (A.resizeM dst dstLen') s0# of - !(# s1#, dst' #) -> outer dst' dstLen' srcOff dstOff s1# - - | otherwise = case decodeFrom srcOff of - Accept c -> - case unST (unsafeWrite dst dstOff c) s0# of - (# s1#, d #) -> inner (srcOff + d) (dstOff + d) s1# - Reject -> do - -- gonna call this text array done - case unST (A.shrinkM dst dstOff) s0# of - -- might not be necessary - (# s1#, _ #) -> case unST (A.unsafeFreeze dst) s1# of - (# _, arr #) -> do - res <- onErrM' desc srcOff . Just $ index srcOff - -- continue on with a copy of the text array - case runRW# (unST $ A.new dstLen) of - (# s2#, dst' #) -> case unST (A.copyI dstOff dst' 0 arr 0) s2# of - (# s3#, _ #) -> case res of - Just c -> - case unST (unsafeWrite dst' dstOff $ safe c) s3# of - (# s4#, d #) -> outer dst' dstLen (srcOff + 1) (dstOff + d) s4# - _ -> outer dst' dstLen (srcOff + 1) dstOff s3# - Incomplete{} -> - case unST (A.shrinkM dst dstOff) s0# of - (# s1#, _ #) -> case unST (A.unsafeFreeze dst) s1# of - (# _, arr #) -> - let bs = if srcOff >= len1 - then B.drop (srcOff - len1) bs2 - else B.drop srcOff $ bs1 `B.append` bs2 in - pure (Text arr 0 dstOff, (bs, srcOff)) - - desc = "Data.Text.Internal.Encoding: Invalid UTF-8 stream" - -- | A stream oriented decoding result. -- -- @since 1.0.0.0 @@ -444,9 +325,122 @@ streamDecodeUtf8With :: HasCallStack => #endif OnDecodeError -> ByteString -> Decoding -streamDecodeUtf8With onErr bs = runIdentity $ streamDecodeUtf8WithM bs - (\ desc _ mWord8 -> Identity $ onErr desc mWord8) - (\ txt undecoded _ f -> Identity . Some txt undecoded $ runIdentity . f) +streamDecodeUtf8With onErr = go empty . streamDecodeUtf8With' + where + go t res = case res of + Ok txt -> Some (t `append` txt) mempty $ go empty . streamDecodeUtf8With' + IncompleteCodePoint txt _ bs f -> Some (t `append` txt) bs $ go empty . f + InvalidWord txt _ mWord8 f -> go (t `append` txt) . f $ onErr desc mWord8 + + desc = "Data.Text.Internal.Encoding: Invalid UTF-8 stream" + +-- | A stream-oriented decoding result of one of three possibilities: +-- +-- * 'Ok' - Decoded text without issue. +-- * 'IncompleteCodePoint' - An incomplete code point at the end of +-- the 'ByteString' which includes the position in the 'ByteString' +-- where the code point start, the incomplete code point, and a +-- continuation that accepts another 'ByteString' as a continuation of +-- the initial input. +-- * 'InvalidWord' - An invalid utf-8 'Word8' which includes the +-- position of in the 'ByteString' where the invalid data occurs, the +-- offending 'Word8', and a continuation that accepts as what 'Char' it +-- should be interpreted if any. +data StreamDecode = + Ok !Text + | IncompleteCodePoint !Text !Int !ByteString (ByteString -> StreamDecode) + | InvalidWord !Text !Int !(Maybe Word8) (Maybe Char -> StreamDecode) + +-- | Like 'streamDecodeUtf8With', but instead of accepting an +-- 'OnDecodeError' callback, it returns a 'StreamDecode' identifying +-- whether the 'ByteString' decoded without issue, it encountered an +-- incomplete code point at the end of the 'ByteString', or an invalid +-- 'Word8'. +streamDecodeUtf8With' :: ByteString -> StreamDecode +streamDecodeUtf8With' bs = decodeWithPossibleStartChar Nothing bs mempty + where + decodeWithPossibleStartChar :: Maybe Char -> ByteString -> ByteString -> StreamDecode + decodeWithPossibleStartChar mStartChar bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do + marr <- A.new len' + case mStartChar of + Just c -> do + d <- unsafeWrite marr 0 (safe c) + outer marr len' 0 d + _ -> outer marr len' 0 0 + where + len = len1 + len2 + len' = len + 4 + + index i + | i < len1 = B.index bs1 i + | otherwise = B.index bs2 (i - len1) + + -- We need Data.ByteString.findIndexEnd, but it is unavailable before bytestring-0.10.12.0 + guessUtf8Boundary :: Int + guessUtf8Boundary + | len2 >= 1 && w0 < 0x80 = len2 -- last char is ASCII + | len2 >= 1 && w0 >= 0xC0 = len2 - 1 -- last char starts a code point + | len2 >= 2 && w1 >= 0xC0 = len2 - 2 -- pre-last char starts a code point + | len2 >= 3 && w2 >= 0xC0 = len2 - 3 + | len2 >= 4 && w3 >= 0xC0 = len2 - 4 + | otherwise = 0 + where + w0 = B.index bs2 (len2 - 1) + w1 = B.index bs2 (len2 - 2) + w2 = B.index bs2 (len2 - 3) + w3 = B.index bs2 (len2 - 4) + + decodeFrom :: Int -> DecoderResult + decodeFrom off = step (off + 1) (utf8DecodeStart (index off)) + where + step i (Incomplete a b) + | i < len = step (i + 1) (utf8DecodeContinue (index i) a b) + step _ st = st + + outer :: forall s. A.MArray s -> Int -> Int -> Int -> ST s StreamDecode + outer dst dstLen = inner + where + inner srcOff dstOff + | srcOff >= len = do + A.shrinkM dst dstOff + arr <- A.unsafeFreeze dst + pure . Ok $ Text arr 0 dstOff + + | srcOff >= len1 + , srcOff < len1 + guessUtf8Boundary + , dstOff + (len1 + guessUtf8Boundary - srcOff) <= dstLen + , bs' <- B.drop (srcOff - len1) (B.take guessUtf8Boundary bs2) + , isValidBS bs' = do + withBS bs' $ \fp _ -> unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> + unsafeSTToIO $ A.copyFromPointer dst dstOff src (len1 + guessUtf8Boundary - srcOff) + inner (len1 + guessUtf8Boundary) (dstOff + (len1 + guessUtf8Boundary - srcOff)) + + | dstOff + 4 > dstLen = do + let dstLen' = dstLen + 4 + dst' <- A.resizeM dst dstLen' + outer dst' dstLen' srcOff dstOff + + | otherwise = case decodeFrom srcOff of + Accept c -> do + d <- unsafeWrite dst dstOff c + inner (srcOff + d) (dstOff + d) + Reject -> do + A.shrinkM dst dstOff + arr <- A.unsafeFreeze dst + let srcOff' = srcOff + 1 + bs' = if srcOff' >= len1 + then B.drop (srcOff' - len1) bs2 + else B.drop srcOff' (bs1 `B.append` bs2) + pure . InvalidWord (Text arr 0 dstOff) srcOff (Just $ index srcOff) $ \ mChar -> + decodeWithPossibleStartChar mChar bs' mempty + Incomplete{} -> do + A.shrinkM dst dstOff + arr <- A.unsafeFreeze dst + let bs' = if srcOff >= len1 + then B.drop (srcOff - len1) bs2 + else B.drop srcOff (bs1 `B.append` bs2) + pure . IncompleteCodePoint (Text arr 0 dstOff) srcOff bs' $ + decodeWithPossibleStartChar Nothing bs' -- | Decode a 'ByteString' containing UTF-8 encoded text that is known -- to be valid. @@ -655,4 +649,4 @@ cSizeToInt = fromIntegral #ifdef SIMDUTF foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8 :: Ptr Word8 -> CSize -> IO CInt -#endif +#endif \ No newline at end of file diff --git a/src/Data/Text/Encoding/Error.hs b/src/Data/Text/Encoding/Error.hs index bec2fd55..1dfbf03a 100644 --- a/src/Data/Text/Encoding/Error.hs +++ b/src/Data/Text/Encoding/Error.hs @@ -25,7 +25,6 @@ module Data.Text.Encoding.Error UnicodeException(..) , OnError , OnDecodeError - , OnDecodeErrorM , OnEncodeError -- * Useful error handling functions , lenientDecode @@ -63,11 +62,6 @@ type OnError a b = String -> Maybe a -> Maybe b -- | A handler for a decoding error. type OnDecodeError = OnError Word8 Char --- | A monadic handler for a decoding error. With certain monads such as --- 'Maybe', 'Either', and 'Cont', processessing can be abandoned without --- the need to use 'error' or 'throw'. -type OnDecodeErrorM m = String -> Int -> Maybe Word8 -> m (Maybe Char) - -- | A handler for an encoding error. {-# DEPRECATED OnEncodeError "This exception is never used in practice, and will be removed." #-} type OnEncodeError = OnError Char Word8 @@ -123,4 +117,4 @@ ignore _ _ = Nothing -- | Replace an invalid input with a valid output. replace :: b -> OnError a b -replace c _ _ = Just c +replace c _ _ = Just c \ No newline at end of file diff --git a/src/Data/Text/Lazy/Encoding.hs b/src/Data/Text/Lazy/Encoding.hs index 7f6baa3b..dc098daa 100644 --- a/src/Data/Text/Lazy/Encoding.hs +++ b/src/Data/Text/Lazy/Encoding.hs @@ -29,7 +29,6 @@ module Data.Text.Lazy.Encoding -- *** Controllable error handling , decodeUtf8With - , decodeUtf8WithM , decodeUtf16LEWith , decodeUtf16BEWith , decodeUtf32LEWith @@ -57,12 +56,8 @@ module Data.Text.Lazy.Encoding ) where import Control.Exception (evaluate, try) -import Control.Monad.Fix (fix) -import Control.Monad.State (evalStateT, get, state) -import Control.Monad.Trans.Class (lift) -import Data.Functor.Identity (Identity(..)) import Data.Monoid (Monoid(..)) -import Data.Text.Encoding.Error (OnDecodeError, OnDecodeErrorM, UnicodeException, strictDecode) +import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode) import Data.Text.Internal.Lazy (Text(..), chunk, empty, foldrChunks) import Data.Word (Word8) import qualified Data.ByteString as S @@ -111,31 +106,24 @@ decodeLatin1 = foldr (chunk . TE.decodeLatin1) empty . B.toChunks -- | Decode a 'ByteString' containing UTF-8 encoded text. decodeUtf8With :: OnDecodeError -> B.ByteString -> Text -decodeUtf8With onErr = runIdentity . flip decodeUtf8WithM (const . (Identity .) . onErr) - --- | Decode a 'ByteString' containing UTF-8 encoded text in a monad context. -decodeUtf8WithM :: Monad m => B.ByteString -> OnDecodeErrorM m -> m Text -decodeUtf8WithM (B.Chunk sb lb) onErrM = - evalStateT (TE.streamDecodeUtf8WithM sb (((lift .) .) . onErrM) (\ t cp bp f -> do - (lb', diff) <- get - case lb' of - B.Chunk sb' lb'' -> do - state $ const ((), (lb'', diff . chunk t)) - f sb' - _ -> do - t'' <- lift $ fix (\ f' bp' cp' t' -> - case S.uncons cp' of - Just (word8, cp'') -> do - mC <- onErrM desc bp' $ Just word8 - f' (bp' + 1) cp'' $ case mC of - Just c -> t' `T.snoc` c - _ -> t' - _ -> pure t') bp cp t - pure . diff $ Chunk t'' Empty - )) (lb, id) +decodeUtf8With onErr (B.Chunk b0 bs0) = + case TE.streamDecodeUtf8With onErr b0 of + TE.Some t l f -> chunk t (go f l bs0) where + go f0 _ (B.Chunk b bs) = + case f0 b of + TE.Some t l f -> chunk t (go f l bs) + go _ l _ + | S.null l = empty + | otherwise = + let !t = T.pack (skipBytes l) + skipBytes = S.foldr (\x s' -> + case onErr desc (Just x) of + Just c -> c : s' + Nothing -> s') [] in + Chunk t Empty desc = "Data.Text.Lazy.Encoding.decodeUtf8With: Invalid UTF-8 stream" -decodeUtf8WithM _ _ = pure empty +decodeUtf8With _ _ = empty -- | Decode a 'ByteString' containing UTF-8 encoded text that is known -- to be valid. @@ -262,4 +250,4 @@ encodeUtf32LE txt = B.fromChunks (foldrChunks ((:) . TE.encodeUtf32LE) [] txt) -- | Encode text using big endian UTF-32 encoding. encodeUtf32BE :: Text -> B.ByteString encodeUtf32BE txt = B.fromChunks (foldrChunks ((:) . TE.encodeUtf32BE) [] txt) -{-# INLINE encodeUtf32BE #-} +{-# INLINE encodeUtf32BE #-} \ No newline at end of file diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs index 5e67f52a..fe3d1def 100644 --- a/tests/Tests/Properties/Transcoding.hs +++ b/tests/Tests/Properties/Transcoding.hs @@ -6,8 +6,6 @@ module Tests.Properties.Transcoding ( testTranscoding ) where -import Control.Concurrent.MVar (newEmptyMVar, newMVar, tryPutMVar, tryTakeMVar) -import Control.Monad.Cont (cont, runCont) import Data.Bits ((.&.), shiftR) import Data.Char (chr, ord) import Test.QuickCheck hiding ((.&.)) @@ -23,8 +21,6 @@ import qualified Data.ByteString.Builder.Prim as BP import qualified Data.ByteString.Char8 as BC import qualified Data.ByteString.Lazy as BL import qualified Data.ByteString.Lazy.Char8 as BLC -import Data.Functor.Identity (Identity(..)) -import Data.Maybe (isJust) import qualified Data.Text as T import qualified Data.Text.Encoding as E import qualified Data.Text.Encoding.Error as E @@ -208,70 +204,45 @@ t_decode_with_error5' = ioProperty $ do pure $ case ret of Left (_ :: E.UnicodeException) -> True Right{} -> False -t_decode_withM_error1 = - runIdentity $ E.streamDecodeUtf8WithM (B.pack [0xC2, 97]) (\ _ _ _ -> pure $ Just 'x') - (\ x _ _ _ -> pure $ x === "xa") -t_decode_withM_error2 = - runIdentity $ E.streamDecodeUtf8WithM (B.pack [0xC2, 97, 97]) (\ _ _ _ -> pure $ Just 'x') - (\ x _ _ _ -> pure $ x === "xaa") -t_decode_withM_error3 = - runIdentity $ E.streamDecodeUtf8WithM (B.pack [0xC2, 97, 97, 97]) (\ _ _ _ -> pure $ Just 'x') - (\ x _ _ _ -> pure $ x === "xaaa") --- log error test -t_decode_withM_error4 = - ioProperty $ do - mVar <- newEmptyMVar - E.streamDecodeUtf8WithM (B.pack [0xC2, 97, 97, 97]) - (\ desc _ _ -> do - res <- tryPutMVar mVar desc - pure . Just $ if res then 'x' else 'y') - (\ x _ _ _ -> do - mDesc <- tryTakeMVar mVar - pure $ x == "xaaa" && isJust mDesc) --- test case demonstrating how to stop processing without 'throw' or 'error' using Either -t_decode_withM_error5 = - case do - E.streamDecodeUtf8WithM (B.pack [0xC2, 97, 97, 97]) - (\ _ errPos _ -> Left errPos) - (\ x _ _ _ -> pure x) of - Left pos -> pos == 0 - Right _ -> False --- test case demonstrating how to stop processing without 'throw' or 'error' using Maybe -t_decode_withM_error6 = - case do - E.streamDecodeUtf8WithM (B.pack [0xC2, 97, 97, 97]) - (\ _ _ _ -> Nothing) - (\ x _ _ _ -> pure x) of - Just _ -> False - _ -> True --- test case demonstrating how to stop processing without 'throw' or 'error' using the continuation monad -t_decode_withM_error7 = - runCont (do - E.streamDecodeUtf8WithM (B.pack [0xC2, 97, 97, 97]) - -- early exit - (\ _ _ _ -> cont $ \ _ -> True) - -- this should not be executed - (\ _ _ _ _ -> pure False) - ) id --- feed more data into the stream using the continuation -t_decode_withM_error8 = - ioProperty $ do - mVar <- newMVar $ B.pack [84, 101, 115, 116] -- 'T' 'e' 's' 't' - E.streamDecodeUtf8WithM (B.pack [0xC2, 97, 97, 97]) - (\ _ _ _ -> pure $ Just 'x') - (\ x _ _ f -> do - mBs <- tryTakeMVar mVar - case mBs of - Just bs -> f bs - _ -> pure $ x == "Test") -tl_decode_withM_error1 = - ioProperty (do - x <- EL.decodeUtf8WithM (BL.pack [0xC2, 97]) (\ _ _ _ -> pure $ Just 'x') - pure $ x == "xa") -tl_decode_withM_error2 = - runIdentity (EL.decodeUtf8WithM (BL.pack [0xE0, 97, 97]) (\ _ _ _ -> pure $ Just 'x')) === "xaa" -tl_decode_withM_error3 = - runIdentity (EL.decodeUtf8WithM (BL.pack [0xF0, 97, 97, 97]) (\ _ _ _ -> pure $ Just 'x')) === "xaaa" + +t_decode_with_error2'' = + case E.streamDecodeUtf8With' (B.pack [97, 0xC2, 97]) of + E.InvalidWord t pos mWord8 f -> if pos /= 1 + then pos =/= 1 + else case mWord8 of + Just 0xC2 -> case f $ Just 'x' of + E.Ok x -> t `T.append` x === "axa" + _ -> counterexample "Should have recovered from the invalid word (\\xC2)" False + Just y -> y =/= 0xC2 + _ -> counterexample "Should have been something instead of Nothing" False + _ -> counterexample "The second word (\\xC2) should have been invalid" False +t_decode_with_error3'' = + case E.streamDecodeUtf8With' (B.pack [97, 0xC2, 97]) of + E.InvalidWord t pos mWord8 f -> if pos /= 1 + then pos =/= 1 + else case mWord8 of + Just 0xC2 -> case f Nothing of + E.Ok x -> t `T.append` x === "aa" + _ -> counterexample "Should have recovered from the invalid word (\\xC2)" False + Just y -> y =/= 0xC2 + _ -> counterexample "Should have been something instead of Nothing" False + _ -> counterexample "The second word (\\xC2) should have been invalid" False +t_decode_with_error4'' = + case E.streamDecodeUtf8With' (B.pack [104, 105, 32, 0xe2]) of -- hi \xe2 + E.IncompleteCodePoint t pos bs f -> if pos /= 3 + then pos =/= 3 + else if bs /= B.pack [0xe2] + then bs =/= B.pack [0xe2] + else case f (B.pack [0x98]) of + E.IncompleteCodePoint t' pos' bs' f' -> if pos' /= 0 + then pos' =/= 0 + else if bs' /= B.pack [0xe2, 0x98] + then bs' =/= B.pack [0xe2, 0x98] + else case f' (B.pack [0x83]) of + E.Ok x -> "hi ☃" === t `T.append` t' `T.append` x + _ -> counterexample "Should have been a completely decoded text." False + _ -> counterexample "Should have encountered an incomplete code point." False + _ -> counterexample "Should have encountered an incomplete code point." False t_infix_concat bs1 text bs2 = forAll (Blind <$> genDecodeErr Replace) $ \(Blind onErr) -> @@ -318,18 +289,10 @@ testTranscoding = testProperty "t_decode_with_error2'" t_decode_with_error2', testProperty "t_decode_with_error3'" t_decode_with_error3', testProperty "t_decode_with_error4'" t_decode_with_error4', - testProperty "t_decode_withM_error1" t_decode_withM_error1, - testProperty "t_decode_withM_error2" t_decode_withM_error2, - testProperty "t_decode_withM_error3" t_decode_withM_error3, testProperty "t_decode_with_error5'" t_decode_with_error5', - testProperty "t_decode_withM_error4" t_decode_withM_error4, - testProperty "t_decode_withM_error5" t_decode_withM_error5, - testProperty "t_decode_withM_error6" t_decode_withM_error6, - testProperty "t_decode_withM_error7" t_decode_withM_error7, - testProperty "t_decode_withM_error8" t_decode_withM_error8, - testProperty "tl_decode_withM_error1" tl_decode_withM_error1, - testProperty "tl_decode_withM_error2" tl_decode_withM_error2, - testProperty "tl_decode_withM_error3" tl_decode_withM_error3, + testProperty "t_decode_with_error2''" t_decode_with_error2'', + testProperty "t_decode_with_error3''" t_decode_with_error3'', + testProperty "t_decode_with_error4''" t_decode_with_error4'', testProperty "t_infix_concat" t_infix_concat ] - ] + ] \ No newline at end of file diff --git a/text.cabal b/text.cabal index 9ff632d1..5231dbac 100644 --- a/text.cabal +++ b/text.cabal @@ -187,9 +187,7 @@ library bytestring >= 0.10.4 && < 0.12, deepseq >= 1.1 && < 1.5, ghc-prim >= 0.2 && < 0.10, - mtl, - template-haskell >= 2.5 && < 2.20, - transformers + template-haskell >= 2.5 && < 2.20 ghc-options: -Wall -fwarn-tabs -funbox-strict-fields -O2 if flag(developer) From 1a74c5a45351d83a31397c10a57a3fbee82c288c Mon Sep 17 00:00:00 2001 From: "David M. Sledge" Date: Sun, 3 Jul 2022 13:11:38 -0600 Subject: [PATCH 08/87] Extracted Word8 from Maybe in StreamDecode: Offending Word8 is never Nothing. Cleaned up documentation for StreamDecode and streamDecodeUtf8With'. Track byte position across continuations in streamDecodeUtf8With. Made streamDecodeUtf8With' test case a little more comprehensive. --- src/Data/Text/Encoding.hs | 117 ++++++++++++++------------ tests/Tests/Properties/Transcoding.hs | 52 ++++++------ 2 files changed, 93 insertions(+), 76 deletions(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index a4e01247..830952ad 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -330,41 +330,54 @@ streamDecodeUtf8With onErr = go empty . streamDecodeUtf8With' go t res = case res of Ok txt -> Some (t `append` txt) mempty $ go empty . streamDecodeUtf8With' IncompleteCodePoint txt _ bs f -> Some (t `append` txt) bs $ go empty . f - InvalidWord txt _ mWord8 f -> go (t `append` txt) . f $ onErr desc mWord8 + InvalidWord txt _ word8 f -> go (t `append` txt) . f . onErr desc $ Just word8 desc = "Data.Text.Internal.Encoding: Invalid UTF-8 stream" --- | A stream-oriented decoding result of one of three possibilities: --- --- * 'Ok' - Decoded text without issue. --- * 'IncompleteCodePoint' - An incomplete code point at the end of --- the 'ByteString' which includes the position in the 'ByteString' --- where the code point start, the incomplete code point, and a --- continuation that accepts another 'ByteString' as a continuation of --- the initial input. --- * 'InvalidWord' - An invalid utf-8 'Word8' which includes the --- position of in the 'ByteString' where the invalid data occurs, the --- offending 'Word8', and a continuation that accepts as what 'Char' it --- should be interpreted if any. -data StreamDecode = - Ok !Text - | IncompleteCodePoint !Text !Int !ByteString (ByteString -> StreamDecode) - | InvalidWord !Text !Int !(Maybe Word8) (Maybe Char -> StreamDecode) +-- | A stream-oriented decoding result of one of three possibilities. +data StreamDecode + -- | The 'ByteString' was decoded without issue. + = Ok + -- | The decoded text. + !Text + -- | An incomplete code point at the end of the 'ByteString'. + | IncompleteCodePoint + -- | The decoded text up to but not including the incomplete code + -- point. + !Text + -- | The position in the 'ByteString' where the code point + -- starts. + !Int + -- | The incomplete code point. + !ByteString + -- | A function that accepts another 'ByteString' as a + -- continuation of the previous input. + (ByteString -> StreamDecode) + -- | An invalid utf-8 'Word8'. + | InvalidWord + -- | The decoded text up to but not including the invalid + -- 'Word8'. + !Text + -- | The position in the 'ByteString' of the offending 'Word8'. + !Int + -- | The offender. + !Word8 + -- | A function that accepts a possible 'Char' as to interpret + -- the 'Word8' in this specific occurrence of it. + (Maybe Char -> StreamDecode) -- | Like 'streamDecodeUtf8With', but instead of accepting an --- 'OnDecodeError' callback, it returns a 'StreamDecode' identifying --- whether the 'ByteString' decoded without issue, it encountered an --- incomplete code point at the end of the 'ByteString', or an invalid --- 'Word8'. +-- 'OnDecodeError' callback and returning a 'Decoding', it returns a +-- 'StreamDecode'. streamDecodeUtf8With' :: ByteString -> StreamDecode -streamDecodeUtf8With' bs = decodeWithPossibleStartChar Nothing bs mempty +streamDecodeUtf8With' bs = decodeWithPossibleStartChar Nothing 0 bs mempty where - decodeWithPossibleStartChar :: Maybe Char -> ByteString -> ByteString -> StreamDecode - decodeWithPossibleStartChar mStartChar bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do + decodeWithPossibleStartChar :: Maybe Char -> Int -> ByteString -> ByteString -> StreamDecode + decodeWithPossibleStartChar mStartChar bytePos bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do marr <- A.new len' case mStartChar of Just c -> do - d <- unsafeWrite marr 0 (safe c) + d <- unsafeWrite marr 0 $ safe c outer marr len' 0 d _ -> outer marr len' 0 0 where @@ -373,7 +386,7 @@ streamDecodeUtf8With' bs = decodeWithPossibleStartChar Nothing bs mempty index i | i < len1 = B.index bs1 i - | otherwise = B.index bs2 (i - len1) + | otherwise = B.index bs2 $ i - len1 -- We need Data.ByteString.findIndexEnd, but it is unavailable before bytestring-0.10.12.0 guessUtf8Boundary :: Int @@ -391,10 +404,10 @@ streamDecodeUtf8With' bs = decodeWithPossibleStartChar Nothing bs mempty w3 = B.index bs2 (len2 - 4) decodeFrom :: Int -> DecoderResult - decodeFrom off = step (off + 1) (utf8DecodeStart (index off)) + decodeFrom off = step (off + 1) . utf8DecodeStart $ index off where step i (Incomplete a b) - | i < len = step (i + 1) (utf8DecodeContinue (index i) a b) + | i < len = step (i + 1) $ utf8DecodeContinue (index i) a b step _ st = st outer :: forall s. A.MArray s -> Int -> Int -> Int -> ST s StreamDecode @@ -409,38 +422,38 @@ streamDecodeUtf8With' bs = decodeWithPossibleStartChar Nothing bs mempty | srcOff >= len1 , srcOff < len1 + guessUtf8Boundary , dstOff + (len1 + guessUtf8Boundary - srcOff) <= dstLen - , bs' <- B.drop (srcOff - len1) (B.take guessUtf8Boundary bs2) + , bs' <- B.drop (srcOff - len1) $ B.take guessUtf8Boundary bs2 , isValidBS bs' = do withBS bs' $ \fp _ -> unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> - unsafeSTToIO $ A.copyFromPointer dst dstOff src (len1 + guessUtf8Boundary - srcOff) - inner (len1 + guessUtf8Boundary) (dstOff + (len1 + guessUtf8Boundary - srcOff)) + unsafeSTToIO . A.copyFromPointer dst dstOff src $ len1 + guessUtf8Boundary - srcOff + inner (len1 + guessUtf8Boundary) $ dstOff + (len1 + guessUtf8Boundary - srcOff) | dstOff + 4 > dstLen = do let dstLen' = dstLen + 4 dst' <- A.resizeM dst dstLen' outer dst' dstLen' srcOff dstOff - | otherwise = case decodeFrom srcOff of - Accept c -> do - d <- unsafeWrite dst dstOff c - inner (srcOff + d) (dstOff + d) - Reject -> do - A.shrinkM dst dstOff - arr <- A.unsafeFreeze dst - let srcOff' = srcOff + 1 - bs' = if srcOff' >= len1 - then B.drop (srcOff' - len1) bs2 - else B.drop srcOff' (bs1 `B.append` bs2) - pure . InvalidWord (Text arr 0 dstOff) srcOff (Just $ index srcOff) $ \ mChar -> - decodeWithPossibleStartChar mChar bs' mempty - Incomplete{} -> do - A.shrinkM dst dstOff - arr <- A.unsafeFreeze dst - let bs' = if srcOff >= len1 - then B.drop (srcOff - len1) bs2 - else B.drop srcOff (bs1 `B.append` bs2) - pure . IncompleteCodePoint (Text arr 0 dstOff) srcOff bs' $ - decodeWithPossibleStartChar Nothing bs' + | otherwise = + let contin off res = do + A.shrinkM dst dstOff + arr <- A.unsafeFreeze dst + let bs' = if off >= len1 + then B.drop (off - len1) bs2 + else B.drop off $ bs1 `B.append` bs2 + pure $ res arr bs' + in + case decodeFrom srcOff of + Accept c -> do + d <- unsafeWrite dst dstOff c + inner (srcOff + d) $ dstOff + d + Reject -> let srcOff' = srcOff + 1 in + contin srcOff' $ \ arr bs' -> + InvalidWord (Text arr 0 dstOff) (bytePos + srcOff) (index srcOff) $ \ mChar -> + decodeWithPossibleStartChar mChar (bytePos + srcOff') bs' mempty + Incomplete{} -> let bytePos' = bytePos + srcOff in + contin srcOff $ \ arr bs' -> + IncompleteCodePoint (Text arr 0 dstOff) bytePos' bs' $ + decodeWithPossibleStartChar Nothing bytePos' bs' -- | Decode a 'ByteString' containing UTF-8 encoded text that is known -- to be valid. diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs index fe3d1def..e6ce765a 100644 --- a/tests/Tests/Properties/Transcoding.hs +++ b/tests/Tests/Properties/Transcoding.hs @@ -205,44 +205,47 @@ t_decode_with_error5' = ioProperty $ do Left (_ :: E.UnicodeException) -> True Right{} -> False +whenEqProp a b next = if a == b + then next + else a === b + t_decode_with_error2'' = case E.streamDecodeUtf8With' (B.pack [97, 0xC2, 97]) of - E.InvalidWord t pos mWord8 f -> if pos /= 1 - then pos =/= 1 - else case mWord8 of - Just 0xC2 -> case f $ Just 'x' of + E.InvalidWord t pos word8 f -> whenEqProp pos 1 + . whenEqProp word8 0xC2 + $ case f $ Just 'x' of E.Ok x -> t `T.append` x === "axa" _ -> counterexample "Should have recovered from the invalid word (\\xC2)" False - Just y -> y =/= 0xC2 - _ -> counterexample "Should have been something instead of Nothing" False _ -> counterexample "The second word (\\xC2) should have been invalid" False t_decode_with_error3'' = case E.streamDecodeUtf8With' (B.pack [97, 0xC2, 97]) of - E.InvalidWord t pos mWord8 f -> if pos /= 1 - then pos =/= 1 - else case mWord8 of - Just 0xC2 -> case f Nothing of + E.InvalidWord t pos word8 f -> whenEqProp pos 1 + . whenEqProp word8 0xC2 + $ case f Nothing of E.Ok x -> t `T.append` x === "aa" _ -> counterexample "Should have recovered from the invalid word (\\xC2)" False - Just y -> y =/= 0xC2 - _ -> counterexample "Should have been something instead of Nothing" False _ -> counterexample "The second word (\\xC2) should have been invalid" False t_decode_with_error4'' = case E.streamDecodeUtf8With' (B.pack [104, 105, 32, 0xe2]) of -- hi \xe2 - E.IncompleteCodePoint t pos bs f -> if pos /= 3 - then pos =/= 3 - else if bs /= B.pack [0xe2] - then bs =/= B.pack [0xe2] - else case f (B.pack [0x98]) of - E.IncompleteCodePoint t' pos' bs' f' -> if pos' /= 0 - then pos' =/= 0 - else if bs' /= B.pack [0xe2, 0x98] - then bs' =/= B.pack [0xe2, 0x98] - else case f' (B.pack [0x83]) of - E.Ok x -> "hi ☃" === t `T.append` t' `T.append` x - _ -> counterexample "Should have been a completely decoded text." False + E.IncompleteCodePoint t pos bs f -> whenEqProp pos 3 + . whenEqProp bs (B.pack [0xe2]) + $ case f (B.pack [0x98]) of + E.IncompleteCodePoint t' pos' bs' f' -> whenEqProp pos' 3 + . whenEqProp bs' (B.pack [0xe2, 0x98]) + $ case f' (B.pack [0x83, 32, 0xFF]) of + E.InvalidWord t'' pos'' word8 f'' -> whenEqProp pos'' 7 + . whenEqProp word8 0xFF + $ case f'' $ Just 'x' of + E.Ok x -> "hi ☃ x" === t `T.append` t' `T.append` t'' `T.append` x + _ -> counterexample "Should have been decoded text." False + _ -> counterexample "Should have been an invalid word." False _ -> counterexample "Should have encountered an incomplete code point." False _ -> counterexample "Should have encountered an incomplete code point." False +t_decode_with_error5'' = + case E.streamDecodeUtf8With' (B.pack [104, 105, 32, 0xe2, 0x98, 104]) of -- not quite "hi ☃", the last byte is wrong + E.IncompleteCodePoint _ _ _ _ -> counterexample "Not incomplete, but an invalid word." False + E.Ok x -> counterexample ("What??? " ++ show x) False + E.InvalidWord _ pos mWord8 _ -> counterexample ("Invalid word " ++ show mWord8 ++ " at byte position " ++ show pos ++ ".") True t_infix_concat bs1 text bs2 = forAll (Blind <$> genDecodeErr Replace) $ \(Blind onErr) -> @@ -293,6 +296,7 @@ testTranscoding = testProperty "t_decode_with_error2''" t_decode_with_error2'', testProperty "t_decode_with_error3''" t_decode_with_error3'', testProperty "t_decode_with_error4''" t_decode_with_error4'', + testProperty "t_decode_with_error5''" t_decode_with_error5'', testProperty "t_infix_concat" t_infix_concat ] ] \ No newline at end of file From 6b648eef0be84699add2561be08918c83992f94d Mon Sep 17 00:00:00 2001 From: david-sledge Date: Sun, 3 Jul 2022 14:07:43 -0600 Subject: [PATCH 09/87] Indentation error in StreamDecode data type. --- src/Data/Text/Encoding.hs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index 830952ad..e5b6d42e 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -338,8 +338,8 @@ streamDecodeUtf8With onErr = go empty . streamDecodeUtf8With' data StreamDecode -- | The 'ByteString' was decoded without issue. = Ok - -- | The decoded text. - !Text + -- | The decoded text. + !Text -- | An incomplete code point at the end of the 'ByteString'. | IncompleteCodePoint -- | The decoded text up to but not including the incomplete code From bc72e4eda0af3ac6840920200b776313645c087d Mon Sep 17 00:00:00 2001 From: david-sledge Date: Sun, 3 Jul 2022 14:35:22 -0600 Subject: [PATCH 10/87] I see how it is, Haddock 8.x... --- src/Data/Text/Encoding.hs | 44 +++++++++------------------ src/Data/Text/Encoding/Error.hs | 2 +- src/Data/Text/Lazy/Encoding.hs | 2 +- tests/Tests/Properties/Transcoding.hs | 3 +- 4 files changed, 19 insertions(+), 32 deletions(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index e5b6d42e..ba16959d 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -336,35 +336,21 @@ streamDecodeUtf8With onErr = go empty . streamDecodeUtf8With' -- | A stream-oriented decoding result of one of three possibilities. data StreamDecode - -- | The 'ByteString' was decoded without issue. - = Ok - -- | The decoded text. - !Text - -- | An incomplete code point at the end of the 'ByteString'. - | IncompleteCodePoint - -- | The decoded text up to but not including the incomplete code - -- point. - !Text - -- | The position in the 'ByteString' where the code point - -- starts. - !Int - -- | The incomplete code point. - !ByteString - -- | A function that accepts another 'ByteString' as a - -- continuation of the previous input. - (ByteString -> StreamDecode) - -- | An invalid utf-8 'Word8'. - | InvalidWord - -- | The decoded text up to but not including the invalid - -- 'Word8'. - !Text - -- | The position in the 'ByteString' of the offending 'Word8'. - !Int - -- | The offender. - !Word8 - -- | A function that accepts a possible 'Char' as to interpret - -- the 'Word8' in this specific occurrence of it. - (Maybe Char -> StreamDecode) + -- | The 'ByteString' was decoded without issue. The value contains the + -- resulting 'Text'. + = Ok !Text + -- | An incomplete code point at the end of the 'ByteString'. The value + -- contains the decoded text up to but not including the incomplete + -- code point, the position in the 'ByteString' where the code point + -- starts, the incomplete code point, and a function that accepts + -- another 'ByteString' as a continuation of the previous input. + | IncompleteCodePoint !Text !Int !ByteString (ByteString -> StreamDecode) + -- | An invalid utf-8 'Word8'. The value contains the decoded text up + -- to but not including the invalid 'Word8', the position in the + -- 'ByteString' of the offending 'Word8', the offender, and a + -- function that accepts a possible 'Char' as to interpret the + -- 'Word8' in this specific occurrence of it. + | InvalidWord !Text !Int !Word8 (Maybe Char -> StreamDecode) -- | Like 'streamDecodeUtf8With', but instead of accepting an -- 'OnDecodeError' callback and returning a 'Decoding', it returns a diff --git a/src/Data/Text/Encoding/Error.hs b/src/Data/Text/Encoding/Error.hs index 1dfbf03a..ea9e0997 100644 --- a/src/Data/Text/Encoding/Error.hs +++ b/src/Data/Text/Encoding/Error.hs @@ -117,4 +117,4 @@ ignore _ _ = Nothing -- | Replace an invalid input with a valid output. replace :: b -> OnError a b -replace c _ _ = Just c \ No newline at end of file +replace c _ _ = Just c diff --git a/src/Data/Text/Lazy/Encoding.hs b/src/Data/Text/Lazy/Encoding.hs index dc098daa..ad361af5 100644 --- a/src/Data/Text/Lazy/Encoding.hs +++ b/src/Data/Text/Lazy/Encoding.hs @@ -250,4 +250,4 @@ encodeUtf32LE txt = B.fromChunks (foldrChunks ((:) . TE.encodeUtf32LE) [] txt) -- | Encode text using big endian UTF-32 encoding. encodeUtf32BE :: Text -> B.ByteString encodeUtf32BE txt = B.fromChunks (foldrChunks ((:) . TE.encodeUtf32BE) [] txt) -{-# INLINE encodeUtf32BE #-} \ No newline at end of file +{-# INLINE encodeUtf32BE #-} diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs index e6ce765a..86d9cf02 100644 --- a/tests/Tests/Properties/Transcoding.hs +++ b/tests/Tests/Properties/Transcoding.hs @@ -299,4 +299,5 @@ testTranscoding = testProperty "t_decode_with_error5''" t_decode_with_error5'', testProperty "t_infix_concat" t_infix_concat ] - ] \ No newline at end of file + ] + \ No newline at end of file From 8554b98fe456e235b97d373e0849e61497127fd4 Mon Sep 17 00:00:00 2001 From: david-sledge Date: Tue, 5 Jul 2022 15:28:04 -0600 Subject: [PATCH 11/87] Design change to StreamDecode --- src/Data/Text/Encoding.hs | 97 +++++++++++++++------------ tests/Tests/Properties/Transcoding.hs | 46 +++++++------ 2 files changed, 77 insertions(+), 66 deletions(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index ba16959d..f9fc2a36 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -74,7 +74,7 @@ import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) import Control.Exception (evaluate, try) import Control.Monad.ST (runST, ST) -import Data.Bits (shiftR, (.&.)) +import Data.Bits (shiftL, shiftR, (.&.), (.|.)) import Data.ByteString (ByteString) import qualified Data.ByteString as B import qualified Data.ByteString.Internal as B @@ -328,38 +328,35 @@ streamDecodeUtf8With :: streamDecodeUtf8With onErr = go empty . streamDecodeUtf8With' where go t res = case res of - Ok txt -> Some (t `append` txt) mempty $ go empty . streamDecodeUtf8With' - IncompleteCodePoint txt _ bs f -> Some (t `append` txt) bs $ go empty . f + ThusFar txt _ bs f -> Some (t `append` txt) bs $ go empty . f InvalidWord txt _ word8 f -> go (t `append` txt) . f . onErr desc $ Just word8 desc = "Data.Text.Internal.Encoding: Invalid UTF-8 stream" -- | A stream-oriented decoding result of one of three possibilities. -data StreamDecode - -- | The 'ByteString' was decoded without issue. The value contains the - -- resulting 'Text'. - = Ok !Text - -- | An incomplete code point at the end of the 'ByteString'. The value - -- contains the decoded text up to but not including the incomplete - -- code point, the position in the 'ByteString' where the code point - -- starts, the incomplete code point, and a function that accepts - -- another 'ByteString' as a continuation of the previous input. - | IncompleteCodePoint !Text !Int !ByteString (ByteString -> StreamDecode) - -- | An invalid utf-8 'Word8'. The value contains the decoded text up - -- to but not including the invalid 'Word8', the position in the - -- 'ByteString' of the offending 'Word8', the offender, and a - -- function that accepts a possible 'Char' as to interpret the - -- 'Word8' in this specific occurrence of it. - | InvalidWord !Text !Int !Word8 (Maybe Char -> StreamDecode) +data StreamDecode w + -- | The decoded 'Text' thus far with a possibly incomplete code + -- point at the end of the 'ByteString'. The value contains the + -- decoded text up to but not including the incomplete code point, + -- the position in the 'ByteString' where the code point starts, + -- the incomplete code point, and a function that accepts another + -- 'ByteString' as a continuation of the previous input. + = ThusFar !Text !Int !ByteString (ByteString -> StreamDecode w) + -- | An encounter witt an invalid utf-8 'Word8'. The value contains + -- the decoded text up to but not including the invalid 'Word8', + -- the position in the 'ByteString' of the offending 'Word8', the + -- offender, and a function that accepts a possible 'Char' as to + -- interpret the 'Word8' in this specific occurrence of it. + | InvalidWord !Text !Int !w (Maybe Char -> StreamDecode w) -- | Like 'streamDecodeUtf8With', but instead of accepting an -- 'OnDecodeError' callback and returning a 'Decoding', it returns a -- 'StreamDecode'. -streamDecodeUtf8With' :: ByteString -> StreamDecode -streamDecodeUtf8With' bs = decodeWithPossibleStartChar Nothing 0 bs mempty +streamDecodeUtf8With' :: ByteString -> StreamDecode Word8 +streamDecodeUtf8With' bs = decodeAtOffset Nothing 0 bs mempty where - decodeWithPossibleStartChar :: Maybe Char -> Int -> ByteString -> ByteString -> StreamDecode - decodeWithPossibleStartChar mStartChar bytePos bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do + decodeAtOffset :: Maybe Char -> Int -> ByteString -> ByteString -> StreamDecode Word8 + decodeAtOffset mStartChar bytePos bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do marr <- A.new len' case mStartChar of Just c -> do @@ -367,6 +364,7 @@ streamDecodeUtf8With' bs = decodeWithPossibleStartChar Nothing 0 bs mempty outer marr len' 0 d _ -> outer marr len' 0 0 where + wordByteSize = 1 len = len1 + len2 len' = len + 4 @@ -396,14 +394,11 @@ streamDecodeUtf8With' bs = decodeWithPossibleStartChar Nothing 0 bs mempty | i < len = step (i + 1) $ utf8DecodeContinue (index i) a b step _ st = st - outer :: forall s. A.MArray s -> Int -> Int -> Int -> ST s StreamDecode + outer :: forall s. A.MArray s -> Int -> Int -> Int -> ST s (StreamDecode Word8) outer dst dstLen = inner where inner srcOff dstOff - | srcOff >= len = do - A.shrinkM dst dstOff - arr <- A.unsafeFreeze dst - pure . Ok $ Text arr 0 dstOff + | srcOff >= len = thusFar | srcOff >= len1 , srcOff < len1 + guessUtf8Boundary @@ -420,26 +415,40 @@ streamDecodeUtf8With' bs = decodeWithPossibleStartChar Nothing 0 bs mempty outer dst' dstLen' srcOff dstOff | otherwise = - let contin off res = do - A.shrinkM dst dstOff - arr <- A.unsafeFreeze dst - let bs' = if off >= len1 - then B.drop (off - len1) bs2 - else B.drop off $ bs1 `B.append` bs2 - pure $ res arr bs' - in case decodeFrom srcOff of Accept c -> do d <- unsafeWrite dst dstOff c inner (srcOff + d) $ dstOff + d - Reject -> let srcOff' = srcOff + 1 in - contin srcOff' $ \ arr bs' -> - InvalidWord (Text arr 0 dstOff) (bytePos + srcOff) (index srcOff) $ \ mChar -> - decodeWithPossibleStartChar mChar (bytePos + srcOff') bs' mempty - Incomplete{} -> let bytePos' = bytePos + srcOff in - contin srcOff $ \ arr bs' -> - IncompleteCodePoint (Text arr 0 dstOff) bytePos' bs' $ - decodeWithPossibleStartChar Nothing bytePos' bs' + Reject -> invalid + Incomplete{} -> thusFar + where + wrapUp f = do + A.shrinkM dst dstOff + arr <- A.unsafeFreeze dst + f $ Text arr 0 dstOff + continue off res = wrapUp $ \ t -> + pure . res t $ if off >= len1 + then + if off >= len + then B.empty + else B.drop (off - len1) bs2 + else B.drop off $ bs1 `B.append` bs2 + thusFar = + let bytePos' = bytePos + srcOff in + continue srcOff $ \ t bs' -> + ThusFar t bytePos' bs' $ + decodeAtOffset Nothing bytePos' bs' + invalid = + let srcOff' = srcOff + wordByteSize + bytesToWord n word = + if n > 0 + then bytesToWord (n - 1) $ + (fromIntegral . index $ srcOff + wordByteSize - n) .|. (word `shiftL` 8) + else word + in + continue srcOff' $ \ t bs' -> + InvalidWord t (bytePos + srcOff) (bytesToWord wordByteSize 0) $ \ mChar' -> + decodeAtOffset mChar' (bytePos + srcOff') bs' mempty -- | Decode a 'ByteString' containing UTF-8 encoded text that is known -- to be valid. diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs index 86d9cf02..f18c0694 100644 --- a/tests/Tests/Properties/Transcoding.hs +++ b/tests/Tests/Properties/Transcoding.hs @@ -209,43 +209,45 @@ whenEqProp a b next = if a == b then next else a === b -t_decode_with_error2'' = - case E.streamDecodeUtf8With' (B.pack [97, 0xC2, 97]) of +t_decode_utf8_with_error1 = + case E.streamDecodeUtf8With' $ B.pack [97, 0xC2, 97] of E.InvalidWord t pos word8 f -> whenEqProp pos 1 . whenEqProp word8 0xC2 $ case f $ Just 'x' of - E.Ok x -> t `T.append` x === "axa" + E.ThusFar x pos' bs _ -> whenEqProp (t `T.append` x) "axa" + . whenEqProp bs B.empty $ pos' === 3 _ -> counterexample "Should have recovered from the invalid word (\\xC2)" False _ -> counterexample "The second word (\\xC2) should have been invalid" False -t_decode_with_error3'' = - case E.streamDecodeUtf8With' (B.pack [97, 0xC2, 97]) of +t_decode_utf8_with_error2 = + case E.streamDecodeUtf8With' $ B.pack [97, 0xC2, 97] of E.InvalidWord t pos word8 f -> whenEqProp pos 1 . whenEqProp word8 0xC2 $ case f Nothing of - E.Ok x -> t `T.append` x === "aa" + E.ThusFar x pos' bs _ -> whenEqProp (t `T.append` x) "aa" + . whenEqProp bs B.empty $ pos' === 3 _ -> counterexample "Should have recovered from the invalid word (\\xC2)" False _ -> counterexample "The second word (\\xC2) should have been invalid" False -t_decode_with_error4'' = - case E.streamDecodeUtf8With' (B.pack [104, 105, 32, 0xe2]) of -- hi \xe2 - E.IncompleteCodePoint t pos bs f -> whenEqProp pos 3 +t_decode_utf8_with_error3 = + case E.streamDecodeUtf8With' $ B.pack [104, 105, 32, 0xe2] of -- hi \xe2 + E.ThusFar t pos bs f -> whenEqProp pos 3 . whenEqProp bs (B.pack [0xe2]) - $ case f (B.pack [0x98]) of - E.IncompleteCodePoint t' pos' bs' f' -> whenEqProp pos' 3 + $ case f $ B.pack [0x98] of + E.ThusFar t' pos' bs' f' -> whenEqProp pos' 3 . whenEqProp bs' (B.pack [0xe2, 0x98]) - $ case f' (B.pack [0x83, 32, 0xFF]) of + $ case f' $ B.pack [0x83, 32, 0xFF] of E.InvalidWord t'' pos'' word8 f'' -> whenEqProp pos'' 7 . whenEqProp word8 0xFF $ case f'' $ Just 'x' of - E.Ok x -> "hi ☃ x" === t `T.append` t' `T.append` t'' `T.append` x + E.ThusFar x pos''' bs'' _ -> whenEqProp (t `T.append` t' `T.append` t'' `T.append` x) "hi ☃ x" + . whenEqProp pos''' 8 $ bs'' === B.empty _ -> counterexample "Should have been decoded text." False _ -> counterexample "Should have been an invalid word." False _ -> counterexample "Should have encountered an incomplete code point." False _ -> counterexample "Should have encountered an incomplete code point." False -t_decode_with_error5'' = - case E.streamDecodeUtf8With' (B.pack [104, 105, 32, 0xe2, 0x98, 104]) of -- not quite "hi ☃", the last byte is wrong - E.IncompleteCodePoint _ _ _ _ -> counterexample "Not incomplete, but an invalid word." False - E.Ok x -> counterexample ("What??? " ++ show x) False - E.InvalidWord _ pos mWord8 _ -> counterexample ("Invalid word " ++ show mWord8 ++ " at byte position " ++ show pos ++ ".") True +t_decode_utf8_with_error4 = + case E.streamDecodeUtf8With' $ B.pack [104, 105, 32, 0xe2, 0x98, 104] of -- not quite "hi ☃", the last byte is wrong + E.ThusFar _ _ _ _ -> counterexample "Not incomplete, but an invalid word." False + E.InvalidWord t pos word8 _ -> whenEqProp t "hi " . whenEqProp pos 3 $ word8 === 0xe2 t_infix_concat bs1 text bs2 = forAll (Blind <$> genDecodeErr Replace) $ \(Blind onErr) -> @@ -293,10 +295,10 @@ testTranscoding = testProperty "t_decode_with_error3'" t_decode_with_error3', testProperty "t_decode_with_error4'" t_decode_with_error4', testProperty "t_decode_with_error5'" t_decode_with_error5', - testProperty "t_decode_with_error2''" t_decode_with_error2'', - testProperty "t_decode_with_error3''" t_decode_with_error3'', - testProperty "t_decode_with_error4''" t_decode_with_error4'', - testProperty "t_decode_with_error5''" t_decode_with_error5'', + testProperty "t_decode_utf8_with_error1" t_decode_utf8_with_error1, + testProperty "t_decode_utf8_with_error2" t_decode_utf8_with_error2, + testProperty "t_decode_utf8_with_error3" t_decode_utf8_with_error3, + testProperty "t_decode_utf8_with_error4" t_decode_utf8_with_error4, testProperty "t_infix_concat" t_infix_concat ] ] From 1bd9ef016459e8fed3ed9d820d66d7e6dc450e85 Mon Sep 17 00:00:00 2001 From: david-sledge Date: Wed, 6 Jul 2022 14:19:20 -0600 Subject: [PATCH 12/87] Stream decoding added for utf-16 and utf-32. Either function added for decoding ASCII. --- src/Data/Text/Encoding.hs | 260 +++++++++++++++-------- src/Data/Text/Internal/Encoding/Utf16.hs | 38 +++- src/Data/Text/Internal/Encoding/Utf32.hs | 32 +++ tests/Tests/Properties/Transcoding.hs | 80 +++++++ text.cabal | 1 - 5 files changed, 322 insertions(+), 89 deletions(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index f9fc2a36..fecf8adb 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -35,6 +35,7 @@ module Data.Text.Encoding , decodeUtf8' -- *** Controllable error handling + , decodeAsciiE , decodeUtf8With , decodeUtf16LEWith , decodeUtf16BEWith @@ -46,6 +47,8 @@ module Data.Text.Encoding , streamDecodeUtf8 , streamDecodeUtf8With , streamDecodeUtf8With' + , streamDecodeUtf16With + , streamDecodeUtf32With , Decoding(..) , StreamDecode(..) @@ -74,7 +77,7 @@ import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) import Control.Exception (evaluate, try) import Control.Monad.ST (runST, ST) -import Data.Bits (shiftL, shiftR, (.&.), (.|.)) +import Data.Bits (Bits, shiftL, shiftR, (.&.), (.|.)) import Data.ByteString (ByteString) import qualified Data.ByteString as B import qualified Data.ByteString.Internal as B @@ -85,7 +88,7 @@ import Data.Text.Internal.Unsafe (unsafeWithForeignPtr) import Data.Text.Internal.Unsafe.Char (unsafeWrite) import Data.Text.Show as T (singleton) import Data.Text.Unsafe (unsafeDupablePerformIO) -import Data.Word (Word8) +import Data.Word (Word8, Word16, Word32) import Foreign.C.Types (CSize(..)) import Foreign.Ptr (Ptr, minusPtr, plusPtr) import Foreign.Storable (poke, peekByteOff) @@ -96,6 +99,8 @@ import qualified Data.ByteString.Builder.Internal as B hiding (empty, append) import qualified Data.ByteString.Builder.Prim as BP import qualified Data.ByteString.Builder.Prim.Internal as BP import Data.Text.Internal.Encoding.Utf8 (utf8DecodeStart, utf8DecodeContinue, DecoderResult(..)) +import Data.Text.Internal.Encoding.Utf16 (Utf16Result(..), queryUtf16Bytes) +import Data.Text.Internal.Encoding.Utf32 (queryUtf32Bytes) import qualified Data.Text.Array as A import qualified Data.Text.Internal.Encoding.Fusion as E import qualified Data.Text.Internal.Fusion as F @@ -213,6 +218,20 @@ isValidBS bs = start 0 #endif #endif +-- | Decode a 'ByteString' containing 7-bit ASCII encoded text. +-- +-- This is a total function: On success the decoded 'Text' is within a +-- 'Right' value, and an error ('Left' 'Int') indicates the postion of +-- the offending 'Word8'. +decodeAsciiE :: ByteString -> Either Int Text +decodeAsciiE bs = withBS bs $ \fp len -> if len == 0 then Right empty else runST $ do + asciiPrefixLen <- fmap cSizeToInt $ unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> + c_is_ascii src (src `plusPtr` len) + pure $ if asciiPrefixLen == len + then let !(SBS.SBS arr) = SBS.toShort bs in + Right (Text (A.ByteArray arr) 0 len) + else Left asciiPrefixLen + -- | Decode a 'ByteString' containing UTF-8 encoded text. -- -- Surrogate code points in replacement character returned by 'OnDecodeError' @@ -349,13 +368,20 @@ data StreamDecode w -- interpret the 'Word8' in this specific occurrence of it. | InvalidWord !Text !Int !w (Maybe Char -> StreamDecode w) --- | Like 'streamDecodeUtf8With', but instead of accepting an --- 'OnDecodeError' callback and returning a 'Decoding', it returns a --- 'StreamDecode'. -streamDecodeUtf8With' :: ByteString -> StreamDecode Word8 -streamDecodeUtf8With' bs = decodeAtOffset Nothing 0 bs mempty +data Progression + = WriteAndAdvance Char (Int -> Int) + | NeedMore + | Invalid + +streamDecodeWith :: (Bits w, Num w) + => Int + -> Bool + -> ByteString + -> ((Int -> Word8) -> Int -> Int -> Progression) + -> StreamDecode w +streamDecodeWith wordByteSize isUtf bs decodeF = decodeAtOffset Nothing 0 bs mempty where - decodeAtOffset :: Maybe Char -> Int -> ByteString -> ByteString -> StreamDecode Word8 + decodeAtOffset :: (Bits w, Num w) => Maybe Char -> Int -> ByteString -> ByteString -> StreamDecode w decodeAtOffset mStartChar bytePos bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do marr <- A.new len' case mStartChar of @@ -364,91 +390,153 @@ streamDecodeUtf8With' bs = decodeAtOffset Nothing 0 bs mempty outer marr len' 0 d _ -> outer marr len' 0 0 where - wordByteSize = 1 - len = len1 + len2 - len' = len + 4 - + index :: Int -> Word8 index i | i < len1 = B.index bs1 i | otherwise = B.index bs2 $ i - len1 - -- We need Data.ByteString.findIndexEnd, but it is unavailable before bytestring-0.10.12.0 - guessUtf8Boundary :: Int - guessUtf8Boundary - | len2 >= 1 && w0 < 0x80 = len2 -- last char is ASCII - | len2 >= 1 && w0 >= 0xC0 = len2 - 1 -- last char starts a code point - | len2 >= 2 && w1 >= 0xC0 = len2 - 2 -- pre-last char starts a code point - | len2 >= 3 && w2 >= 0xC0 = len2 - 3 - | len2 >= 4 && w3 >= 0xC0 = len2 - 4 - | otherwise = 0 + len :: Int + len = len1 + len2 + len' :: Int + len' = (len `div` wordByteSize) + 4 + + queryutf8Optim srcOff dstOff dstLen dst f g + -- shortcut for utf-8 + | wordByteSize == 1 + , isUtf + , srcOff >= len1 + -- potential valid utf8 content endpoint + , utf8End <- len1 + guessUtf8Boundary + , srcOff < utf8End + -- potential valid utf8 content length + , utf8Len <- utf8End - srcOff + -- potential endpoint in destination array if copied + , dstOff' <- dstOff + utf8Len + , dstOff' <= dstLen + , bs' <- B.drop (srcOff - len1) $ B.take guessUtf8Boundary bs2 + , isValidBS bs' = do + withBS bs' $ \fp _ -> unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> + unsafeSTToIO $ A.copyFromPointer dst dstOff src utf8Len + f utf8End dstOff' + | otherwise = g where - w0 = B.index bs2 (len2 - 1) - w1 = B.index bs2 (len2 - 2) - w2 = B.index bs2 (len2 - 3) - w3 = B.index bs2 (len2 - 4) + -- We need Data.ByteString.findIndexEnd, but it is unavailable before bytestring-0.10.12.0 + guessUtf8Boundary :: Int + guessUtf8Boundary + | len2 >= 1 && w0 < 0x80 = len2 -- last char is ASCII + | len2 >= 1 && w0 >= 0xC0 = len2 - 1 -- last char starts a code point + | len2 >= 2 && w1 >= 0xC0 = len2 - 2 -- pre-last char starts a code point + | len2 >= 3 && w2 >= 0xC0 = len2 - 3 + | len2 >= 4 && w3 >= 0xC0 = len2 - 4 + | otherwise = 0 + where + w0 = B.index bs2 (len2 - 1) + w1 = B.index bs2 (len2 - 2) + w2 = B.index bs2 (len2 - 3) + w3 = B.index bs2 (len2 - 4) - decodeFrom :: Int -> DecoderResult - decodeFrom off = step (off + 1) . utf8DecodeStart $ index off + outer :: (Bits w, Num w) => A.MArray s -> Int -> Int -> Int -> ST s (StreamDecode w) + outer dst dstLen = inner where - step i (Incomplete a b) - | i < len = step (i + 1) $ utf8DecodeContinue (index i) a b - step _ st = st + inner srcOff dstOff + -- finished + | len - srcOff < 1 = incomplete + -- shortcut for utf-8 + | otherwise = queryutf8Optim srcOff dstOff dstLen dst inner $ + if len - srcOff < wordByteSize + -- incomplete code point + then incomplete + else + if dstOff + 4 > dstLen + -- need more space in destination + then do + let dstLen' = dstLen + 4 + dst' <- A.resizeM dst dstLen' + outer dst' dstLen' srcOff dstOff + else + case decodeF index len srcOff of + WriteAndAdvance c advance -> do + d <- unsafeWrite dst dstOff c + inner (advance d) $ dstOff + d + NeedMore -> incomplete + Invalid -> invalid + where + wrapUp f = do + A.shrinkM dst dstOff + arr <- A.unsafeFreeze dst + f $ Text arr 0 dstOff + contin off res = wrapUp $ \ t -> + pure . res t $ if off >= len1 + then B.drop (off - len1) bs2 + else B.drop off $ bs1 `B.append` bs2 + incomplete = + let bytePos' = bytePos + srcOff in + contin srcOff $ \ t bs' -> + ThusFar t bytePos' bs' $ + decodeAtOffset Nothing bytePos' bs' + invalid = + let srcOff' = srcOff + wordByteSize + bytesToWord n word = + if n > 0 + then bytesToWord (n - 1) $ (fromIntegral . index $ srcOff + wordByteSize - n) .|. (word `shiftL` 8) + else word + in + contin srcOff' $ \ t bs' -> + InvalidWord t (bytePos + srcOff) (bytesToWord wordByteSize 0) $ \ mChar' -> + decodeAtOffset mChar' (bytePos + srcOff') bs' mempty - outer :: forall s. A.MArray s -> Int -> Int -> Int -> ST s (StreamDecode Word8) - outer dst dstLen = inner - where - inner srcOff dstOff - | srcOff >= len = thusFar - - | srcOff >= len1 - , srcOff < len1 + guessUtf8Boundary - , dstOff + (len1 + guessUtf8Boundary - srcOff) <= dstLen - , bs' <- B.drop (srcOff - len1) $ B.take guessUtf8Boundary bs2 - , isValidBS bs' = do - withBS bs' $ \fp _ -> unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> - unsafeSTToIO . A.copyFromPointer dst dstOff src $ len1 + guessUtf8Boundary - srcOff - inner (len1 + guessUtf8Boundary) $ dstOff + (len1 + guessUtf8Boundary - srcOff) - - | dstOff + 4 > dstLen = do - let dstLen' = dstLen + 4 - dst' <- A.resizeM dst dstLen' - outer dst' dstLen' srcOff dstOff - - | otherwise = - case decodeFrom srcOff of - Accept c -> do - d <- unsafeWrite dst dstOff c - inner (srcOff + d) $ dstOff + d - Reject -> invalid - Incomplete{} -> thusFar - where - wrapUp f = do - A.shrinkM dst dstOff - arr <- A.unsafeFreeze dst - f $ Text arr 0 dstOff - continue off res = wrapUp $ \ t -> - pure . res t $ if off >= len1 - then - if off >= len - then B.empty - else B.drop (off - len1) bs2 - else B.drop off $ bs1 `B.append` bs2 - thusFar = - let bytePos' = bytePos + srcOff in - continue srcOff $ \ t bs' -> - ThusFar t bytePos' bs' $ - decodeAtOffset Nothing bytePos' bs' - invalid = - let srcOff' = srcOff + wordByteSize - bytesToWord n word = - if n > 0 - then bytesToWord (n - 1) $ - (fromIntegral . index $ srcOff + wordByteSize - n) .|. (word `shiftL` 8) - else word - in - continue srcOff' $ \ t bs' -> - InvalidWord t (bytePos + srcOff) (bytesToWord wordByteSize 0) $ \ mChar' -> - decodeAtOffset mChar' (bytePos + srcOff') bs' mempty +-- | Like 'streamDecodeUtf8With', but instead of accepting an +-- 'OnDecodeError' callback and returning a 'Decoding', it returns a +-- 'StreamDecode'. +streamDecodeUtf8With' :: ByteString -> StreamDecode Word8 +streamDecodeUtf8With' bs = streamDecodeWith 1 True bs $ \ index len srcOff -> + let decodeFrom off = step (off + 1) . utf8DecodeStart $ index off + where + step i (Incomplete a b) + | i < len = step (i + 1) $ utf8DecodeContinue (index i) a b + step _ st = st + in + case decodeFrom srcOff of + Accept c -> WriteAndAdvance c (srcOff +) + Reject -> Invalid + Incomplete{} -> NeedMore + +-- | Like 'streamDecodeUtf8With'', but for UTF-16 encoding and a +-- 'Bool' argument for whether the encoding is big- or little-endian. +streamDecodeUtf16With :: Bool -> ByteString -> StreamDecode Word16 +streamDecodeUtf16With isBE bs = streamDecodeWith 2 True bs $ \ index len srcOff -> + -- get next Word8 pair + let writeAndAdvance c n = WriteAndAdvance c $ const n + b0 = index $ if isBE then srcOff else srcOff + 1 + b1 = index $ if isBE then srcOff + 1 else srcOff + in + case queryUtf16Bytes b0 of + OneWord16 f -> writeAndAdvance (f b1) $ srcOff + 2 + TwoWord16 g -> + if len - srcOff < 4 + -- not enough Word8s to finish the code point + then NeedMore + else + let b2 = index $ srcOff + (if isBE then 2 else 3) + b3 = index $ srcOff + (if isBE then 3 else 2) + in + case g b2 of + Just f' -> writeAndAdvance (f' b1 b3) $ srcOff + 4 + _ -> Invalid + _ -> Invalid + +-- | Like 'streamDecodeUtf16With', but for UTF-32. +streamDecodeUtf32With :: Bool -> ByteString -> StreamDecode Word32 +streamDecodeUtf32With isBE bs = streamDecodeWith 4 True bs $ \ index _ srcOff -> + -- get next Word8 quartet + let writeAndAdvance c n = WriteAndAdvance c $ const n in + case queryUtf32Bytes . index $ if isBE then srcOff else srcOff + 3 of + Just f -> case f . index $ srcOff + (if isBE then 1 else 2) of + Just f' -> case f' . index $ srcOff + (if isBE then 2 else 1) of + Just f'' -> writeAndAdvance (f'' . index $ if isBE then srcOff + 3 else srcOff) $ srcOff + 4 + _ -> Invalid + _ -> Invalid + _ -> Invalid -- | Decode a 'ByteString' containing UTF-8 encoded text that is known -- to be valid. @@ -657,4 +745,4 @@ cSizeToInt = fromIntegral #ifdef SIMDUTF foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8 :: Ptr Word8 -> CSize -> IO CInt -#endif \ No newline at end of file +#endif diff --git a/src/Data/Text/Internal/Encoding/Utf16.hs b/src/Data/Text/Internal/Encoding/Utf16.hs index 4fe11a62..34bc78dd 100644 --- a/src/Data/Text/Internal/Encoding/Utf16.hs +++ b/src/Data/Text/Internal/Encoding/Utf16.hs @@ -23,14 +23,17 @@ module Data.Text.Internal.Encoding.Utf16 chr2 , validate1 , validate2 + , Utf16Result(..) + , queryUtf16Bytes ) where +import Data.Bits ((.&.)) import GHC.Exts -import GHC.Word (Word16(..)) +import GHC.Word (Word16(..), Word8(..)) #if !MIN_VERSION_base(4,16,0) -- harmless to import, except for warnings that it is unused. -import Data.Text.Internal.PrimCompat ( word16ToWord# ) +import Data.Text.Internal.PrimCompat ( word16ToWord#, word8ToWord# ) #endif chr2 :: Word16 -> Word16 -> Char @@ -50,3 +53,34 @@ validate2 :: Word16 -> Word16 -> Bool validate2 x1 x2 = x1 >= 0xD800 && x1 <= 0xDBFF && x2 >= 0xDC00 && x2 <= 0xDFFF {-# INLINE validate2 #-} + +data Utf16Result + = OneWord16 (Word8 -> Char) + | TwoWord16 (Word8 -> Maybe (Word8 -> Word8 -> Char)) + | Invalid16 + +-- queryUtf16Bytes :: Word8 -> (Word8 -> Word8 -> a) +-- -> Utf16Result a +queryUtf16Bytes b0@(W8# w0#) = + if b0 < 0xD8 || b0 >= 0xE0 + then OneWord16 $ \ (W8# w1#) -> C# (chr# (orI# (word2Int# (shiftL# (word8ToWord# w0#) 8#)) (word2Int# (word8ToWord# w1#)))) + else + -- 110110xx: start of surrogate pair + if b0 .&. 0xFC == 0xD8 + then TwoWord16 $ \ b2@(W8# w2#) -> + if b2 .&. 0xFC == 0xDC + -- valid surrogate + then Just $ \ (W8# w1#) (W8# w3#) -> + C# (chr# ( + (orI# + (orI# + (orI# + (word2Int# (shiftL# (int2Word# (andI# 0x3# (word2Int# (word8ToWord# w0#)))) 18#)) + (word2Int# (shiftL# (word8ToWord# w1#) 10#)) + ) + (word2Int# (shiftL# (int2Word# (andI# 0x3# (word2Int# (word8ToWord# w2#)))) 8#))) + (word2Int# (word8ToWord# w3#))) +# 0x10000# + )) + else Nothing + else Invalid16 +{-# INLINE queryUtf16Bytes #-} diff --git a/src/Data/Text/Internal/Encoding/Utf32.hs b/src/Data/Text/Internal/Encoding/Utf32.hs index 4e8e9b46..517ff05a 100644 --- a/src/Data/Text/Internal/Encoding/Utf32.hs +++ b/src/Data/Text/Internal/Encoding/Utf32.hs @@ -1,3 +1,5 @@ +{-# LANGUAGE CPP, MagicHash #-} + -- | -- Module : Data.Text.Internal.Encoding.Utf32 -- Copyright : (c) 2008, 2009 Tom Harper, @@ -17,10 +19,40 @@ module Data.Text.Internal.Encoding.Utf32 ( validate + , queryUtf32Bytes ) where import Data.Word (Word32) +import GHC.Exts +import GHC.Word (Word8(..)) + +#if !MIN_VERSION_base(4,16,0) +-- harmless to import, except for warnings that it is unused. +import Data.Text.Internal.PrimCompat (word8ToWord#) +#endif validate :: Word32 -> Bool validate x1 = x1 < 0xD800 || (x1 > 0xDFFF && x1 <= 0x10FFFF) {-# INLINE validate #-} + +queryUtf32Bytes :: (Eq a, Num a) => a -> Maybe (Word8 -> Maybe (Word8 -> Maybe (Word8 -> Char))) +queryUtf32Bytes b0 = + if b0 == 0 + then Just $ \ b1@(W8# w1#) -> + if b1 < 0x11 + then Just $ \ b2@(W8# w2#) -> + if b1 > 0 || b2 < 0xD8 || b2 >= 0xE0 + then Just $ \ (W8# w3#) -> + C# (chr# + (orI# + (orI# + (word2Int# (shiftL# (word8ToWord# w1#) 16#)) + (word2Int# (shiftL# (word8ToWord# w2#) 8#)) + ) + (word2Int# (word8ToWord# w3#)) + ) + ) + else Nothing + else Nothing + else Nothing +{-# INLINE queryUtf32Bytes #-} diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs index f18c0694..e6172b47 100644 --- a/tests/Tests/Properties/Transcoding.hs +++ b/tests/Tests/Properties/Transcoding.hs @@ -27,6 +27,9 @@ import qualified Data.Text.Encoding.Error as E import qualified Data.Text.Lazy as TL import qualified Data.Text.Lazy.Encoding as EL +t_asciiE t = E.decodeAsciiE (E.encodeUtf8 a) === Right a + where a = T.map (\c -> chr (ord c `mod` 128)) t + t_ascii t = E.decodeASCII (E.encodeUtf8 a) === a where a = T.map (\c -> chr (ord c `mod` 128)) t tl_ascii t = EL.decodeASCII (EL.encodeUtf8 a) === a @@ -249,6 +252,78 @@ t_decode_utf8_with_error4 = E.ThusFar _ _ _ _ -> counterexample "Not incomplete, but an invalid word." False E.InvalidWord t pos word8 _ -> whenEqProp t "hi " . whenEqProp pos 3 $ word8 === 0xe2 +t_decode_utf16BE_with_error = + case E.streamDecodeUtf16With True $ B.pack [0] of + E.ThusFar t pos bs f -> whenEqProp pos 0 + . whenEqProp bs (B.pack [0]) + $ case f $ B.pack [104, 0, 105, 0, 32, 0xD8, 0x01] of + E.ThusFar t' pos' bs' f' -> whenEqProp pos' 6 + . whenEqProp bs' (B.pack [0xD8, 0x01]) + $ case f' $ B.pack [0xDC, 0x37, 0, 32, 0xDC, 0] of + E.InvalidWord t'' pos'' word16 f'' -> whenEqProp pos'' 12 + . whenEqProp word16 (0xDC `Bits.shiftL` 8) + $ case f'' $ Just 'x' of + E.ThusFar x pos''' bs'' _ -> whenEqProp (t `T.append` t' `T.append` t'' `T.append` x) "hi \x10437 x" + . whenEqProp pos''' 14 $ bs'' === B.empty + _ -> counterexample "Should have been decoded text." False + _ -> counterexample "Should have been an invalid word." False + _ -> counterexample "Should have encountered an incomplete code point." False + _ -> counterexample "Should have encountered an incomplete code point." False + +t_decode_utf16LE_with_error = + case E.streamDecodeUtf16With False $ B.pack [104] of + E.ThusFar t pos bs f -> whenEqProp pos 0 + . whenEqProp bs (B.pack [104]) + $ case f $ B.pack [0, 105, 0, 32, 0, 0x01, 0xD8] of + E.ThusFar t' pos' bs' f' -> whenEqProp pos' 6 + . whenEqProp bs' (B.pack [0x01, 0xD8]) + $ case f' $ B.pack [0x37, 0xDC, 32, 0, 0, 0xDC] of + E.InvalidWord t'' pos'' word16 f'' -> whenEqProp pos'' 12 + . whenEqProp word16 0xDC + $ case f'' $ Just 'x' of + E.ThusFar x pos''' bs'' _ -> whenEqProp (t `T.append` t' `T.append` t'' `T.append` x) "hi \x10437 x" + . whenEqProp pos''' 14 $ bs'' === B.empty + _ -> counterexample "Should have been decoded text." False + _ -> counterexample "Should have been an invalid word." False + _ -> counterexample "Should have encountered an incomplete code point." False + _ -> counterexample "Should have encountered an incomplete code point." False + +t_decode_utf32BE_with_error = + case E.streamDecodeUtf32With True $ B.pack [0, 0, 0, 104, 0, 0, 0, 105, 0, 0] of -- hi \xe2 + E.ThusFar t pos bs f -> whenEqProp pos 8 + . whenEqProp bs (B.pack [0, 0]) + $ case f $ B.pack [0, 32, 0, 0, 0x26] of + E.ThusFar t' pos' bs' f' -> whenEqProp pos' 12 + . whenEqProp bs' (B.pack [0, 0, 0x26]) + $ case f' $ B.pack [0x03, 0, 0, 0, 32, 0, 0, 0xD8, 0] of + E.InvalidWord t'' pos'' word32 f'' -> whenEqProp pos'' 20 + . whenEqProp word32 0x0000D800 + $ case f'' $ Just 'x' of + E.ThusFar x pos''' bs'' _ -> whenEqProp (t `T.append` t' `T.append` t'' `T.append` x) "hi ☃ x" + . whenEqProp pos''' 24 $ bs'' === B.empty + _ -> counterexample "Should have been decoded text." False + _ -> counterexample "Should have been an invalid word." False + _ -> counterexample "Should have encountered an incomplete code point." False + _ -> counterexample "Should have encountered an incomplete code point." False + +t_decode_utf32LE_with_error = + case E.streamDecodeUtf32With False $ B.pack [104, 0, 0, 0, 105, 0, 0, 0, 32, 0] of -- hi \xe2 + E.ThusFar t pos bs f -> whenEqProp pos 8 + . whenEqProp bs (B.pack [32, 0]) + $ case f $ B.pack [0, 0, 0x03, 0x26, 0] of + E.ThusFar t' pos' bs' f' -> whenEqProp pos' 12 + . whenEqProp bs' (B.pack [0x03, 0x26, 0]) + $ case f' $ B.pack [0, 32, 0, 0, 0, 0, 0xD8, 0, 0] of + E.InvalidWord t'' pos'' word32 f'' -> whenEqProp pos'' 20 + . whenEqProp word32 0x00D80000 + $ case f'' $ Just 'x' of + E.ThusFar x pos''' bs'' _ -> whenEqProp (t `T.append` t' `T.append` t'' `T.append` x) "hi ☃ x" + . whenEqProp pos''' 24 $ bs'' === B.empty + _ -> counterexample "Should have been decoded text." False + _ -> counterexample "Should have been an invalid word." False + _ -> counterexample "Should have encountered an incomplete code point." False + _ -> counterexample "Should have encountered an incomplete code point." False + t_infix_concat bs1 text bs2 = forAll (Blind <$> genDecodeErr Replace) $ \(Blind onErr) -> text `T.isInfixOf` @@ -257,6 +332,7 @@ t_infix_concat bs1 text bs2 = testTranscoding :: TestTree testTranscoding = testGroup "transcoding" [ + testProperty "t_asciiE" t_asciiE, testProperty "t_ascii" t_ascii, testProperty "tl_ascii" tl_ascii, testProperty "t_latin1" t_latin1, @@ -299,6 +375,10 @@ testTranscoding = testProperty "t_decode_utf8_with_error2" t_decode_utf8_with_error2, testProperty "t_decode_utf8_with_error3" t_decode_utf8_with_error3, testProperty "t_decode_utf8_with_error4" t_decode_utf8_with_error4, + testProperty "t_decode_utf16BE_with_error" t_decode_utf16BE_with_error, + testProperty "t_decode_utf16LE_with_error" t_decode_utf16LE_with_error, + testProperty "t_decode_utf32BE_with_error" t_decode_utf32BE_with_error, + testProperty "t_decode_utf32LE_with_error" t_decode_utf32LE_with_error, testProperty "t_infix_concat" t_infix_concat ] ] diff --git a/text.cabal b/text.cabal index 5231dbac..488f1451 100644 --- a/text.cabal +++ b/text.cabal @@ -254,7 +254,6 @@ test-suite tests deepseq, directory, ghc-prim, - mtl, tasty, tasty-hunit, tasty-quickcheck, From 22b866656dcc7167a96b12cdf5bbd90a0bbf7398 Mon Sep 17 00:00:00 2001 From: Sledge Date: Thu, 7 Jul 2022 11:17:38 -0600 Subject: [PATCH 13/87] Typo in documentation Co-authored-by: Xia Li-yao --- src/Data/Text/Encoding.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index fecf8adb..f44a9999 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -361,7 +361,7 @@ data StreamDecode w -- the incomplete code point, and a function that accepts another -- 'ByteString' as a continuation of the previous input. = ThusFar !Text !Int !ByteString (ByteString -> StreamDecode w) - -- | An encounter witt an invalid utf-8 'Word8'. The value contains + -- | An encounter with an invalid UTF-8 'Word8'. The value contains -- the decoded text up to but not including the invalid 'Word8', -- the position in the 'ByteString' of the offending 'Word8', the -- offender, and a function that accepts a possible 'Char' as to From 1dd6f34d9688e45b45e2d703c087136c4e62c602 Mon Sep 17 00:00:00 2001 From: david-sledge Date: Sun, 10 Jul 2022 20:16:38 -0600 Subject: [PATCH 14/87] Redesigned with different approach --- src/Data/Text/Encoding.hs | 429 ++++++++++++----------- src/Data/Text/Encoding/Types.hs | 35 ++ src/Data/Text/Internal/Encoding/Utf16.hs | 3 +- src/Data/Text/Lazy/Encoding.hs | 35 +- tests/Tests/Properties/Transcoding.hs | 270 +++++++------- text.cabal | 1 + 6 files changed, 441 insertions(+), 332 deletions(-) create mode 100644 src/Data/Text/Encoding/Types.hs diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index f44a9999..4ed1edc8 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -30,6 +30,13 @@ module Data.Text.Encoding -- $total decodeLatin1 , decodeUtf8Lenient + , decodeUtf8Chunk + , decodeUtf8Chunks + , decodeUtf16Chunk + , decodeUtf16Chunks + , decodeUtf32Chunk + , decodeUtf32Chunks + , DecodeResult(..) -- *** Catchable failure , decodeUtf8' @@ -46,11 +53,7 @@ module Data.Text.Encoding -- $stream , streamDecodeUtf8 , streamDecodeUtf8With - , streamDecodeUtf8With' - , streamDecodeUtf16With - , streamDecodeUtf32With , Decoding(..) - , StreamDecode(..) -- ** Partial Functions -- $partial @@ -76,14 +79,15 @@ module Data.Text.Encoding import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) import Control.Exception (evaluate, try) -import Control.Monad.ST (runST, ST) +import Control.Monad.ST (runST) import Data.Bits (Bits, shiftL, shiftR, (.&.), (.|.)) import Data.ByteString (ByteString) import qualified Data.ByteString as B import qualified Data.ByteString.Internal as B import qualified Data.ByteString.Short.Internal as SBS import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode) -import Data.Text.Internal (Text(..), safe, empty, append) +import Data.Text.Encoding.Types (DecodeResult(..)) +import Data.Text.Internal (Text(..), empty, append) import Data.Text.Internal.Unsafe (unsafeWithForeignPtr) import Data.Text.Internal.Unsafe.Char (unsafeWrite) import Data.Text.Show as T (singleton) @@ -91,7 +95,7 @@ import Data.Text.Unsafe (unsafeDupablePerformIO) import Data.Word (Word8, Word16, Word32) import Foreign.C.Types (CSize(..)) import Foreign.Ptr (Ptr, minusPtr, plusPtr) -import Foreign.Storable (poke, peekByteOff) +import Foreign.Storable (Storable(..), poke, peekByteOff) import GHC.Exts (byteArrayContents#, unsafeCoerce#) import GHC.ForeignPtr (ForeignPtr(..), ForeignPtrContents(PlainPtr)) import qualified Data.ByteString.Builder as B @@ -218,6 +222,205 @@ isValidBS bs = start 0 #endif #endif +data Progression + = WriteAndAdvance Char (Int -> Int) + | NeedMore + | Invalid + +decodeChunksProxy :: (Bits w, Num w, Storable w) => + w -- only used for Storable.sizeOf argument which the function discards + -> (ByteString -> ByteString -> Int -> Maybe (A.Array, Int)) + -> ((Int -> Word8) -> Int -> Int -> Progression) + -> ByteString + -> ByteString + -> DecodeResult Text ByteString w +decodeChunksProxy w queryOptimization decodeF bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do + marr <- A.new len' + outer marr len' 0 0 + where + wordByteSize = sizeOf w + + index :: Int -> Word8 + index i + | i < len1 = B.index bs1 i + | otherwise = B.index bs2 $ i - len1 + + len :: Int + len = len1 + len2 + len' :: Int + len' = (len `div` wordByteSize) + 4 + + -- outer :: (Bits w, Num w) => A.MArray s -> Int -> Int -> Int -> ST s (DecodeResult Text ByteString w) + outer dst dstLen = inner + where + inner srcOff dstOff + -- finished + | len - srcOff < 1 = goodSoFar + -- shortcut for utf-8 + | otherwise = + case queryOptimization bs1 bs2 srcOff of + Just (arr, tLen) -> + let minLen = tLen + dstOff in + if minLen > dstLen + then do + let dstLen' = minLen + 4 + dst' <- A.resizeM dst dstLen' + A.copyI tLen dst' dstOff arr 0 + outer dst' dstLen' (srcOff + tLen) minLen + else do + A.copyI tLen dst dstOff arr 0 + inner (srcOff + tLen) minLen + _ -> if len - srcOff < wordByteSize + -- incomplete code point + then goodSoFar + else + if dstOff + 4 > dstLen + -- need more space in destination + then do + let dstLen' = dstLen + 4 + dst' <- A.resizeM dst dstLen' + outer dst' dstLen' srcOff dstOff + else + case decodeF index len srcOff of + WriteAndAdvance c advance -> do + d <- unsafeWrite dst dstOff c + inner (advance d) $ dstOff + d + NeedMore -> goodSoFar + Invalid -> invalid + where + contin off res = do + A.shrinkM dst dstOff + arr <- A.unsafeFreeze dst + pure . res (Text arr 0 dstOff) $ if off >= len1 + then B.drop (off - len1) bs2 + else B.drop off $ bs1 `B.append` bs2 + goodSoFar = + contin srcOff $ \ t bs' -> + DecodeResult t Nothing bs' srcOff + invalid = + let srcOff' = srcOff + wordByteSize + bytesToWord n word = + if n > 0 + then bytesToWord (n - 1) $ (fromIntegral . index $ srcOff + wordByteSize - n) .|. (word `shiftL` 8) + else word + in + contin srcOff' $ \ t bs' -> + DecodeResult t (Just $ bytesToWord wordByteSize 0) bs' srcOff' + +decodeChunks :: (Bits w, Num w, Storable w) => + (ByteString -> ByteString -> Int -> Maybe (A.Array, Int)) + -> ((Int -> Word8) -> Int -> Int -> Progression) + -> ByteString + -> ByteString + -> DecodeResult Text ByteString w +decodeChunks = decodeChunksProxy undefined -- This allows Haskell can +-- determine the size in bytes of a data type using Storable.sizeOf +-- so that it doesn't have to be passed as an arugment. Storable.sizeOf +-- discards the actual value without resolving it. + +queryUtf8DecodeOptimization :: ByteString -> ByteString -> Int -> Maybe (A.Array, Int) +queryUtf8DecodeOptimization (B.length -> len1) bs2@(B.length -> len2) srcOff + | srcOff >= len1 + -- potential valid utf8 content endpoint + , utf8End <- len1 + guessUtf8Boundary + , srcOff < utf8End + -- potential valid utf8 content length + , utf8Len <- utf8End - srcOff + , bs' <- B.drop (srcOff - len1) $ B.take guessUtf8Boundary bs2 + , isValidBS bs' = Just (runST $ do + marr <- A.new utf8Len + withBS bs' $ \fp _ -> unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> + unsafeSTToIO $ A.copyFromPointer marr 0 src utf8Len + arr <- A.unsafeFreeze marr + pure $ (arr, utf8Len)) + | otherwise = Nothing + where + -- We need Data.ByteString.findIndexEnd, but it is unavailable before bytestring-0.10.12.0 + guessUtf8Boundary :: Int + guessUtf8Boundary + | len2 >= 1 && w0 < 0x80 = len2 -- last char is ASCII + | len2 >= 1 && w0 >= 0xC0 = len2 - 1 -- last char starts a code point + | len2 >= 2 && w1 >= 0xC0 = len2 - 2 -- pre-last char starts a code point + | len2 >= 3 && w2 >= 0xC0 = len2 - 3 + | len2 >= 4 && w3 >= 0xC0 = len2 - 4 + | otherwise = 0 + where + w0 = B.index bs2 (len2 - 1) + w1 = B.index bs2 (len2 - 2) + w2 = B.index bs2 (len2 - 3) + w3 = B.index bs2 (len2 - 4) + +decodeUtf8Base :: (Int -> Word8) -> Int -> Int -> Progression +decodeUtf8Base index len srcOff = + let decodeFrom off = step (off + 1) . utf8DecodeStart $ index off + where + step i (Incomplete a b) + | i < len = step (i + 1) $ utf8DecodeContinue (index i) a b + step _ st = st + in + case decodeFrom srcOff of + Accept c -> WriteAndAdvance c (srcOff +) + Reject -> Invalid + Incomplete{} -> NeedMore + +-- | Decode two 'ByteString's containing UTF-8-encoded text as though +-- they were one continuous 'ByteString' returning a 'DecodeResult'. +decodeUtf8Chunks :: ByteString -> ByteString -> DecodeResult Text ByteString Word8 +decodeUtf8Chunks = decodeChunks queryUtf8DecodeOptimization decodeUtf8Base + +-- | Decode a 'ByteString' containing UTF-8-encoded text returning a +-- 'DecodeResult'. +decodeUtf8Chunk :: ByteString -> DecodeResult Text ByteString Word8 +decodeUtf8Chunk = decodeUtf8Chunks mempty + +noOptimization :: p1 -> p2 -> p3 -> Maybe a +noOptimization _ _ _ = Nothing + +-- | Decode two 'ByteString's containing UTF-16-encoded text as though +-- they were one continuous 'ByteString' returning a 'DecodeResult'. +decodeUtf16Chunks :: Bool -> ByteString -> ByteString -> DecodeResult Text ByteString Word16 +decodeUtf16Chunks isBE = decodeChunks noOptimization $ \ index len srcOff -> + -- get next Word8 pair + let writeAndAdvance c n = WriteAndAdvance c $ const n + b0 = index $ if isBE then srcOff else srcOff + 1 + b1 = index $ if isBE then srcOff + 1 else srcOff + in + case queryUtf16Bytes b0 of + OneWord16 f -> writeAndAdvance (f b1) $ srcOff + 2 + TwoWord16 g -> + if len - srcOff < 4 + -- not enough Word8s to finish the code point + then NeedMore + else + let b2 = index $ srcOff + (if isBE then 2 else 3) + b3 = index $ srcOff + (if isBE then 3 else 2) + in + case g b2 of + Just f' -> writeAndAdvance (f' b1 b3) $ srcOff + 4 + _ -> Invalid + _ -> Invalid + +-- | Decode a 'ByteString' containing UTF-16-encoded text returning a +-- 'DecodeResult'. +decodeUtf16Chunk :: Bool -> ByteString -> DecodeResult Text ByteString Word16 +decodeUtf16Chunk isBE = decodeUtf16Chunks isBE mempty + +-- | Decode two 'ByteString's containing UTF-16-encoded text as though +-- they were one continuous 'ByteString' returning a 'DecodeResult'. +decodeUtf32Chunks :: Bool -> ByteString -> ByteString -> DecodeResult Text ByteString Word32 +decodeUtf32Chunks isBE = decodeChunks noOptimization $ \ index _ srcOff -> + -- get next Word8 quartet + case (queryUtf32Bytes . index $ if isBE then srcOff else srcOff + 3) + >>= ($ (index $ srcOff + (if isBE then 1 else 2))) + >>= ($ (index $ srcOff + (if isBE then 2 else 1))) of + Just f -> WriteAndAdvance (f . index $ if isBE then srcOff + 3 else srcOff) . const $ srcOff + 4 + _ -> Invalid + +-- | Decode a 'ByteString' containing UTF-32-encoded text returning a +-- 'DecodeResult'. +decodeUtf32Chunk :: Bool -> ByteString -> DecodeResult Text ByteString Word32 +decodeUtf32Chunk isBE = decodeUtf32Chunks isBE mempty + -- | Decode a 'ByteString' containing 7-bit ASCII encoded text. -- -- This is a total function: On success the decoded 'Text' is within a @@ -255,10 +458,9 @@ decodeUtf8With onErr bs = case streamDecodeUtf8With onErr bs of -- $stream -- --- The 'streamDecodeUtf8', 'streamDecodeUtf8With', and --- 'streamDecodeUtf8With'' functions accept a 'ByteString' that --- represents a possibly incomplete input (e.g. a packet from a network --- stream) that may not end on a UTF-8 boundary. +-- The 'streamDecodeUtf8' and 'streamDecodeUtf8With' functions accept +-- a 'ByteString' that represents a possibly incomplete input (e.g. a +-- packet from a network stream) that may not end on a UTF-8 boundary. -- -- 1. The maximal prefix of 'Text' that could be decoded from the -- given input. @@ -335,7 +537,7 @@ streamDecodeUtf8 :: ByteString -> Decoding streamDecodeUtf8 = streamDecodeUtf8With strictDecode --- | Decode, in a stream oriented way, a lazy 'ByteString' containing UTF-8 +-- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8 -- encoded text. -- -- @since 1.0.0.0 @@ -344,199 +546,18 @@ streamDecodeUtf8With :: HasCallStack => #endif OnDecodeError -> ByteString -> Decoding -streamDecodeUtf8With onErr = go empty . streamDecodeUtf8With' - where - go t res = case res of - ThusFar txt _ bs f -> Some (t `append` txt) bs $ go empty . f - InvalidWord txt _ word8 f -> go (t `append` txt) . f . onErr desc $ Just word8 - - desc = "Data.Text.Internal.Encoding: Invalid UTF-8 stream" - --- | A stream-oriented decoding result of one of three possibilities. -data StreamDecode w - -- | The decoded 'Text' thus far with a possibly incomplete code - -- point at the end of the 'ByteString'. The value contains the - -- decoded text up to but not including the incomplete code point, - -- the position in the 'ByteString' where the code point starts, - -- the incomplete code point, and a function that accepts another - -- 'ByteString' as a continuation of the previous input. - = ThusFar !Text !Int !ByteString (ByteString -> StreamDecode w) - -- | An encounter with an invalid UTF-8 'Word8'. The value contains - -- the decoded text up to but not including the invalid 'Word8', - -- the position in the 'ByteString' of the offending 'Word8', the - -- offender, and a function that accepts a possible 'Char' as to - -- interpret the 'Word8' in this specific occurrence of it. - | InvalidWord !Text !Int !w (Maybe Char -> StreamDecode w) - -data Progression - = WriteAndAdvance Char (Int -> Int) - | NeedMore - | Invalid - -streamDecodeWith :: (Bits w, Num w) - => Int - -> Bool - -> ByteString - -> ((Int -> Word8) -> Int -> Int -> Progression) - -> StreamDecode w -streamDecodeWith wordByteSize isUtf bs decodeF = decodeAtOffset Nothing 0 bs mempty +streamDecodeUtf8With onErr = g empty mempty where - decodeAtOffset :: (Bits w, Num w) => Maybe Char -> Int -> ByteString -> ByteString -> StreamDecode w - decodeAtOffset mStartChar bytePos bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do - marr <- A.new len' - case mStartChar of - Just c -> do - d <- unsafeWrite marr 0 $ safe c - outer marr len' 0 d - _ -> outer marr len' 0 0 - where - index :: Int -> Word8 - index i - | i < len1 = B.index bs1 i - | otherwise = B.index bs2 $ i - len1 - - len :: Int - len = len1 + len2 - len' :: Int - len' = (len `div` wordByteSize) + 4 - - queryutf8Optim srcOff dstOff dstLen dst f g - -- shortcut for utf-8 - | wordByteSize == 1 - , isUtf - , srcOff >= len1 - -- potential valid utf8 content endpoint - , utf8End <- len1 + guessUtf8Boundary - , srcOff < utf8End - -- potential valid utf8 content length - , utf8Len <- utf8End - srcOff - -- potential endpoint in destination array if copied - , dstOff' <- dstOff + utf8Len - , dstOff' <= dstLen - , bs' <- B.drop (srcOff - len1) $ B.take guessUtf8Boundary bs2 - , isValidBS bs' = do - withBS bs' $ \fp _ -> unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> - unsafeSTToIO $ A.copyFromPointer dst dstOff src utf8Len - f utf8End dstOff' - | otherwise = g - where - -- We need Data.ByteString.findIndexEnd, but it is unavailable before bytestring-0.10.12.0 - guessUtf8Boundary :: Int - guessUtf8Boundary - | len2 >= 1 && w0 < 0x80 = len2 -- last char is ASCII - | len2 >= 1 && w0 >= 0xC0 = len2 - 1 -- last char starts a code point - | len2 >= 2 && w1 >= 0xC0 = len2 - 2 -- pre-last char starts a code point - | len2 >= 3 && w2 >= 0xC0 = len2 - 3 - | len2 >= 4 && w3 >= 0xC0 = len2 - 4 - | otherwise = 0 - where - w0 = B.index bs2 (len2 - 1) - w1 = B.index bs2 (len2 - 2) - w2 = B.index bs2 (len2 - 3) - w3 = B.index bs2 (len2 - 4) - - outer :: (Bits w, Num w) => A.MArray s -> Int -> Int -> Int -> ST s (StreamDecode w) - outer dst dstLen = inner - where - inner srcOff dstOff - -- finished - | len - srcOff < 1 = incomplete - -- shortcut for utf-8 - | otherwise = queryutf8Optim srcOff dstOff dstLen dst inner $ - if len - srcOff < wordByteSize - -- incomplete code point - then incomplete - else - if dstOff + 4 > dstLen - -- need more space in destination - then do - let dstLen' = dstLen + 4 - dst' <- A.resizeM dst dstLen' - outer dst' dstLen' srcOff dstOff - else - case decodeF index len srcOff of - WriteAndAdvance c advance -> do - d <- unsafeWrite dst dstOff c - inner (advance d) $ dstOff + d - NeedMore -> incomplete - Invalid -> invalid - where - wrapUp f = do - A.shrinkM dst dstOff - arr <- A.unsafeFreeze dst - f $ Text arr 0 dstOff - contin off res = wrapUp $ \ t -> - pure . res t $ if off >= len1 - then B.drop (off - len1) bs2 - else B.drop off $ bs1 `B.append` bs2 - incomplete = - let bytePos' = bytePos + srcOff in - contin srcOff $ \ t bs' -> - ThusFar t bytePos' bs' $ - decodeAtOffset Nothing bytePos' bs' - invalid = - let srcOff' = srcOff + wordByteSize - bytesToWord n word = - if n > 0 - then bytesToWord (n - 1) $ (fromIntegral . index $ srcOff + wordByteSize - n) .|. (word `shiftL` 8) - else word - in - contin srcOff' $ \ t bs' -> - InvalidWord t (bytePos + srcOff) (bytesToWord wordByteSize 0) $ \ mChar' -> - decodeAtOffset mChar' (bytePos + srcOff') bs' mempty - --- | Like 'streamDecodeUtf8With', but instead of accepting an --- 'OnDecodeError' callback and returning a 'Decoding', it returns a --- 'StreamDecode'. -streamDecodeUtf8With' :: ByteString -> StreamDecode Word8 -streamDecodeUtf8With' bs = streamDecodeWith 1 True bs $ \ index len srcOff -> - let decodeFrom off = step (off + 1) . utf8DecodeStart $ index off - where - step i (Incomplete a b) - | i < len = step (i + 1) $ utf8DecodeContinue (index i) a b - step _ st = st - in - case decodeFrom srcOff of - Accept c -> WriteAndAdvance c (srcOff +) - Reject -> Invalid - Incomplete{} -> NeedMore - --- | Like 'streamDecodeUtf8With'', but for UTF-16 encoding and a --- 'Bool' argument for whether the encoding is big- or little-endian. -streamDecodeUtf16With :: Bool -> ByteString -> StreamDecode Word16 -streamDecodeUtf16With isBE bs = streamDecodeWith 2 True bs $ \ index len srcOff -> - -- get next Word8 pair - let writeAndAdvance c n = WriteAndAdvance c $ const n - b0 = index $ if isBE then srcOff else srcOff + 1 - b1 = index $ if isBE then srcOff + 1 else srcOff - in - case queryUtf16Bytes b0 of - OneWord16 f -> writeAndAdvance (f b1) $ srcOff + 2 - TwoWord16 g -> - if len - srcOff < 4 - -- not enough Word8s to finish the code point - then NeedMore - else - let b2 = index $ srcOff + (if isBE then 2 else 3) - b3 = index $ srcOff + (if isBE then 3 else 2) - in - case g b2 of - Just f' -> writeAndAdvance (f' b1 b3) $ srcOff + 4 - _ -> Invalid - _ -> Invalid - --- | Like 'streamDecodeUtf16With', but for UTF-32. -streamDecodeUtf32With :: Bool -> ByteString -> StreamDecode Word32 -streamDecodeUtf32With isBE bs = streamDecodeWith 4 True bs $ \ index _ srcOff -> - -- get next Word8 quartet - let writeAndAdvance c n = WriteAndAdvance c $ const n in - case queryUtf32Bytes . index $ if isBE then srcOff else srcOff + 3 of - Just f -> case f . index $ srcOff + (if isBE then 1 else 2) of - Just f' -> case f' . index $ srcOff + (if isBE then 2 else 1) of - Just f'' -> writeAndAdvance (f'' . index $ if isBE then srcOff + 3 else srcOff) $ srcOff + 4 - _ -> Invalid - _ -> Invalid - _ -> Invalid + g t bs0 bs1 = + let DecodeResult t' mW bs1' _ = decodeUtf8Chunks bs0 bs1 + txt = t `append` t' + in + (case (mW :: Maybe Word8) of + Just _ -> + g (case onErr "" mW of + Just c -> txt `append` T.singleton c + _ -> txt) mempty + _ -> Some txt bs1' . g empty) bs1' -- | Decode a 'ByteString' containing UTF-8 encoded text that is known -- to be valid. diff --git a/src/Data/Text/Encoding/Types.hs b/src/Data/Text/Encoding/Types.hs new file mode 100644 index 00000000..e791a55e --- /dev/null +++ b/src/Data/Text/Encoding/Types.hs @@ -0,0 +1,35 @@ +{-# LANGUAGE CPP, DeriveDataTypeable #-} +{-# LANGUAGE Safe #-} +-- | +-- Module : Data.Text.Encoding.Error +-- Copyright : (c) Bryan O'Sullivan 2009 +-- +-- License : BSD-style +-- Maintainer : bos@serpentine.com +-- Portability : GHC +-- +-- Types and functions for dealing with encoding and decoding errors +-- in Unicode text. +-- +-- The standard functions for encoding and decoding text are strict, +-- which is to say that they throw exceptions on invalid input. This +-- is often unhelpful on real world input, so alternative functions +-- exist that accept custom handlers for dealing with invalid inputs. +-- These 'OnError' handlers are normal Haskell functions. You can use +-- one of the presupplied functions in this module, or you can write a +-- custom handler of your own. + +module Data.Text.Encoding.Types + ( DecodeResult(..) ) + where + +-- | A decoding result on encoded data. +data DecodeResult t b w = DecodeResult + !t -- ^ The decoded data up to an incomplete code point at + -- the end of the input data, an invalid word, or to the + -- end of the input. + !(Maybe w) -- ^ If an invalid code point was encountered. + !b -- ^ The remaining undecoded data. If an invald code + -- point was encountered, this is after that code point. + !Int -- ^ Byte position of remaining undecoded data. + deriving (Eq, Ord, Show, Read) diff --git a/src/Data/Text/Internal/Encoding/Utf16.hs b/src/Data/Text/Internal/Encoding/Utf16.hs index 34bc78dd..2749fa34 100644 --- a/src/Data/Text/Internal/Encoding/Utf16.hs +++ b/src/Data/Text/Internal/Encoding/Utf16.hs @@ -59,8 +59,7 @@ data Utf16Result | TwoWord16 (Word8 -> Maybe (Word8 -> Word8 -> Char)) | Invalid16 --- queryUtf16Bytes :: Word8 -> (Word8 -> Word8 -> a) --- -> Utf16Result a +queryUtf16Bytes :: Word8 -> Utf16Result queryUtf16Bytes b0@(W8# w0#) = if b0 < 0xD8 || b0 >= 0xE0 then OneWord16 $ \ (W8# w1#) -> C# (chr# (orI# (word2Int# (shiftL# (word8ToWord# w0#) 8#)) (word2Int# (word8ToWord# w1#)))) diff --git a/src/Data/Text/Lazy/Encoding.hs b/src/Data/Text/Lazy/Encoding.hs index ad361af5..62964203 100644 --- a/src/Data/Text/Lazy/Encoding.hs +++ b/src/Data/Text/Lazy/Encoding.hs @@ -23,6 +23,9 @@ module Data.Text.Lazy.Encoding -- ** Total Functions #total# -- $total decodeLatin1 + , decodeUtf8Chunk + , decodeUtf16Chunk + , decodeUtf32Chunk -- *** Catchable failure , decodeUtf8' @@ -58,8 +61,9 @@ module Data.Text.Lazy.Encoding import Control.Exception (evaluate, try) import Data.Monoid (Monoid(..)) import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode) +import Data.Text.Encoding.Types (DecodeResult(..)) import Data.Text.Internal.Lazy (Text(..), chunk, empty, foldrChunks) -import Data.Word (Word8) +import Data.Word (Word16, Word32, Word8) import qualified Data.ByteString as S import qualified Data.ByteString.Builder as B import qualified Data.ByteString.Builder.Prim as BP @@ -104,6 +108,35 @@ decodeASCII = foldr (chunk . TE.decodeASCII) empty . B.toChunks decodeLatin1 :: B.ByteString -> Text decodeLatin1 = foldr (chunk . TE.decodeLatin1) empty . B.toChunks +decodeChunk :: (S.ByteString -> S.ByteString -> DecodeResult T.Text S.ByteString w) + -> B.ByteString + -> DecodeResult Text B.ByteString w +decodeChunk decoder = g id 0 mempty + where + g tDiff pos sb0 (B.Chunk sb1 lb1) = + let DecodeResult t mW sb1' pos' = decoder sb0 sb1 + pos1 = pos + pos' + in + case mW of + Just _ -> DecodeResult (tDiff (chunk t Empty)) mW (B.chunk sb1' lb1) pos1 + _ -> (g (tDiff . chunk t) pos1 sb1' lb1) + g tDiff pos sb0 _ = DecodeResult (tDiff Empty) Nothing (B.chunk sb0 B.Empty) pos + +-- | Decode a 'ByteString' containing UTF-8-encoded text returning a +-- 'DecodeResult'. +decodeUtf8Chunk :: B.ByteString -> DecodeResult Text B.ByteString Word8 +decodeUtf8Chunk = decodeChunk TE.decodeUtf8Chunks + +-- | Decode a 'ByteString' containing UTF-16-encoded text returning a +-- 'DecodeResult'. +decodeUtf16Chunk :: Bool -> B.ByteString -> DecodeResult Text B.ByteString Word16 +decodeUtf16Chunk isBE = decodeChunk $ TE.decodeUtf16Chunks isBE + +-- | Decode a 'ByteString' containing UTF-32-encoded text returning a +-- 'DecodeResult'. +decodeUtf32Chunk :: Bool -> B.ByteString -> DecodeResult Text B.ByteString Word32 +decodeUtf32Chunk isBE = decodeChunk $ TE.decodeUtf32Chunks isBE + -- | Decode a 'ByteString' containing UTF-8 encoded text. decodeUtf8With :: OnDecodeError -> B.ByteString -> Text decodeUtf8With onErr (B.Chunk b0 bs0) = diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs index e6172b47..54a282fe 100644 --- a/tests/Tests/Properties/Transcoding.hs +++ b/tests/Tests/Properties/Transcoding.hs @@ -38,17 +38,51 @@ tl_ascii t = EL.decodeASCII (EL.encodeUtf8 a) === a t_latin1 = E.decodeLatin1 `eq` (T.pack . BC.unpack) tl_latin1 = EL.decodeLatin1 `eq` (TL.pack . BLC.unpack) +whenEqProp a b next = if a == b + then next + else a === b + t_utf8 = (E.decodeUtf8 . E.encodeUtf8) `eq` id t_utf8' = (E.decodeUtf8' . E.encodeUtf8) `eq` (id . Right) tl_utf8 = (EL.decodeUtf8 . EL.encodeUtf8) `eq` id tl_utf8' = (EL.decodeUtf8' . EL.encodeUtf8) `eq` (id . Right) +t_utf8_c = (\ t -> + let E.DecodeResult t' mC bs _ = E.decodeUtf8Chunk $ E.encodeUtf8 t in + (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) +tl_utf8_c = (\ t -> + let E.DecodeResult t' mC bs _ = EL.decodeUtf8Chunk $ EL.encodeUtf8 t in + (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) t_utf16LE = (E.decodeUtf16LE . E.encodeUtf16LE) `eq` id +t_utf16LE_c = (\ t -> + let E.DecodeResult t' mC bs _ = E.decodeUtf16Chunk False $ E.encodeUtf16LE t in + (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) +tl_utf16LE_c = (\ t -> + let E.DecodeResult t' mC bs _ = EL.decodeUtf16Chunk False $ EL.encodeUtf16LE t in + (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) tl_utf16LE = (EL.decodeUtf16LE . EL.encodeUtf16LE) `eq` id t_utf16BE = (E.decodeUtf16BE . E.encodeUtf16BE) `eq` id +t_utf16BE_c = (\ t -> + let E.DecodeResult t' mC bs _ = E.decodeUtf16Chunk True $ E.encodeUtf16BE t in + (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) +tl_utf16BE_c = (\ t -> + let E.DecodeResult t' mC bs _ = EL.decodeUtf16Chunk True $ EL.encodeUtf16BE t in + (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) tl_utf16BE = (EL.decodeUtf16BE . EL.encodeUtf16BE) `eq` id t_utf32LE = (E.decodeUtf32LE . E.encodeUtf32LE) `eq` id +t_utf32LE_c = (\ t -> + let E.DecodeResult t' mC bs _ = E.decodeUtf32Chunk False $ E.encodeUtf32LE t in + (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) +tl_utf32LE_c = (\ t -> + let E.DecodeResult t' mC bs _ = EL.decodeUtf32Chunk False $ EL.encodeUtf32LE t in + (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) tl_utf32LE = (EL.decodeUtf32LE . EL.encodeUtf32LE) `eq` id t_utf32BE = (E.decodeUtf32BE . E.encodeUtf32BE) `eq` id +t_utf32BE_c = (\ t -> + let E.DecodeResult t' mC bs _ = E.decodeUtf32Chunk True $ E.encodeUtf32BE t in + (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) +tl_utf32BE_c = (\ t -> + let E.DecodeResult t' mC bs _ = EL.decodeUtf32Chunk True $ EL.encodeUtf32BE t in + (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) tl_utf32BE = (EL.decodeUtf32BE . EL.encodeUtf32BE) `eq` id runBuilder :: B.Builder -> B.ByteString @@ -127,6 +161,13 @@ t_utf8_err bad de = forAll (Blind <$> genDecodeErr de) $ \(Blind onErr) -> ioPro length (show err) >= 0 Right _ -> counterexample (show (decoded, l)) $ de /= Strict +t_utf8_c_err :: InvalidUtf8 -> Property +t_utf8_c_err bad = + let E.DecodeResult t mW bs _ = E.decodeUtf8Chunk (toByteString bad) in + case mW of + Just w -> counterexample (show w) True + _ -> counterexample (show t) $ B.length bs > 0 + t_utf8_err' :: B.ByteString -> Bool t_utf8_err' bs = case E.decodeUtf8' bs of Left err -> length (show err) >= 0 @@ -168,6 +209,85 @@ genInvalidUTF8 = B.pack <$> oneof [ ord3_ n = map fromIntegral [(n `shiftR` 12) + 0xE0, ((n `shiftR` 6) .&. 0x3F) + 0x80, (n .&. 0x3F) + 0x80] ord4_ n = map fromIntegral [(n `shiftR` 18) + 0xF0, ((n `shiftR` 12) .&. 0x3F) + 0x80, ((n `shiftR` 6) .&. 0x3F) + 0x80, (n .&. 0x3F) + 0x80] +t_chunk_decode_utf8_1 = + let decodeResult0 = E.decodeUtf8Chunk $ B.pack [0x68, 0x69, 0x2C, 0x20, 0xe2, 0x98, 0x83, 0x21] in + decodeResult0 === (E.DecodeResult "hi, ☃!" Nothing mempty 8) +t_chunk_decode_utf8_2 = + let decodeResult0 = E.decodeUtf8Chunk $ B.pack [97, 0xC2, 97] + expectedBs0 = B.singleton 97 + in + whenEqProp decodeResult0 (E.DecodeResult (T.singleton 'a') (Just 0xC2) expectedBs0 2) $ + let decodeResult1 = E.decodeUtf8Chunk expectedBs0 in + decodeResult1 === (E.DecodeResult (T.singleton 'a') Nothing mempty 1) +t_chunk_decode_utf8_3 = + let decodeResult0 = E.decodeUtf8Chunk $ B.pack [104, 105, 32, 0xe2] + expectedBs0 = B.singleton 0xe2 + in -- hi \xe2 + whenEqProp decodeResult0 (E.DecodeResult "hi " Nothing expectedBs0 3) $ + let decodeResult1 = E.decodeUtf8Chunk $ expectedBs0 `B.snoc` 0x98 + expectedBs1 = B.pack [0xe2, 0x98] + in + whenEqProp decodeResult1 (E.DecodeResult "" Nothing expectedBs1 0) $ + let decodeResult2 = E.decodeUtf8Chunk $ expectedBs1 `mappend` B.pack [0x83, 32, 0xFF] in + decodeResult2 === (E.DecodeResult "☃ " (Just 0xFF) mempty 5) +t_chunk_decode_utf8_4 = + let decodeResult0 = E.decodeUtf8Chunk $ B.pack [104, 105, 32, 0xe2, 0x98, 104] + expectedBs0 = B.pack [0x98, 104] + in + decodeResult0 === (E.DecodeResult "hi " (Just 0xe2) expectedBs0 4) + +t_chunk_decode_utf16BE = + let decode = E.decodeUtf16Chunk True + expectedBs0 = B.pack [0] + decodeResult0 = decode expectedBs0 in + whenEqProp decodeResult0 (E.DecodeResult T.empty Nothing expectedBs0 0) $ + let decodeResult1 = decode $ expectedBs0 `B.append` B.pack [104, 0, 105, 0, 32, 0xD8, 0x01] + expectedBs1 = B.pack [0xD8, 0x01] + in + whenEqProp decodeResult1 (E.DecodeResult "hi " Nothing expectedBs1 6) $ + let decodeResult2 = decode $ expectedBs1 `B.append` B.pack [0xDC, 0x37, 0, 32, 0xDC, 0] in + decodeResult2 === (E.DecodeResult "\x10437 " (Just 0xDC00) mempty 8) +t_chunk_decode_utf16LE = + let decode = E.decodeUtf16Chunk False + expectedBs0 = B.pack [104] + decodeResult0 = decode expectedBs0 in + whenEqProp decodeResult0 (E.DecodeResult T.empty Nothing expectedBs0 0) $ + let decodeResult1 = decode $ expectedBs0 `B.append` B.pack [0, 105, 0, 32, 0, 0x01, 0xD8] + expectedBs1 = B.pack [0x01, 0xD8] + in + whenEqProp decodeResult1 (E.DecodeResult "hi " Nothing expectedBs1 6) $ + let decodeResult2 = decode $ expectedBs1 `B.append` B.pack [0x37, 0xDC, 32, 0, 0, 0xDC] in + decodeResult2 === (E.DecodeResult "\x10437 " (Just 0xDC) mempty 8) + +t_chunk_decode_utf32BE = + let decode = E.decodeUtf32Chunk True + expBs0 = B.pack [0, 0] + decodeResult0 = decode $ B.pack [0, 0, 0, 104, 0, 0, 0, 105, 0, 0] + in + whenEqProp decodeResult0 (E.DecodeResult "hi" Nothing expBs0 8) $ + let expBs1 = B.pack [0, 0, 0x26] + decodeResult1 = decode $ expBs0 `mappend` (B.pack [0, 32, 0, 0, 0x26]) + in + whenEqProp decodeResult1 (E.DecodeResult " " Nothing expBs1 4) $ + let expBs2 = mempty + decodeResult2 = decode $ expBs1 `mappend` (B.pack [0x03, 0, 0, 0, 32, 0, 0, 0xD8, 0]) + in + decodeResult2 === (E.DecodeResult "☃ " (Just 0xD800) expBs2 12) +t_chunk_decode_utf32LE = + let decode = E.decodeUtf32Chunk False + expBs0 = B.pack [0x20, 0] + decodeResult0 = decode $ B.pack [104, 0, 0, 0, 105, 0, 0, 0, 0x20, 0] + in + whenEqProp decodeResult0 (E.DecodeResult "hi" Nothing expBs0 8) $ + let expBs1 = B.pack [0x03, 0x26, 0] + decodeResult1 = decode $ expBs0 `mappend` (B.pack [0, 0, 0x03, 0x26, 0]) + in + whenEqProp decodeResult1 (E.DecodeResult " " Nothing expBs1 4) $ + let expBs2 = mempty + decodeResult2 = decode $ expBs1 `mappend` (B.pack [0, 32, 0, 0, 0, 0, 0xD8, 0, 0]) + in + decodeResult2 === (E.DecodeResult "☃ " (Just 0xD80000) expBs2 12) + decodeLL :: BL.ByteString -> TL.Text decodeLL = EL.decodeUtf8With E.lenientDecode @@ -180,6 +300,12 @@ t_decode_utf8_lenient :: Property t_decode_utf8_lenient = forAllShrinkShow arbitrary shrink (show . BL.toChunks) $ \bs -> decodeLL bs === (TL.fromStrict . decodeL . B.concat . BL.toChunks) bs +-- The decoding of lazy bytestrings should not depend on how they are chunked, +-- and it should behave the same as decoding of strict bytestrings. +-- t_decode_utf8_chunk :: Property +-- t_decode_utf8_chunk = forAllShrinkShow arbitrary shrink (show . BL.toChunks) $ \bs -> +-- EL.decodeUtf8Chunk bs === (TL.fromStrict . E.decodeUtf8Chunk . B.concat . BL.toChunks) bs + -- See http://unicode.org/faq/utf_bom.html#gen8 -- A sequence such as <110xxxxx2 0xxxxxxx2> is illegal ... -- When faced with this illegal byte sequence ... a UTF-8 conformant process @@ -208,122 +334,6 @@ t_decode_with_error5' = ioProperty $ do Left (_ :: E.UnicodeException) -> True Right{} -> False -whenEqProp a b next = if a == b - then next - else a === b - -t_decode_utf8_with_error1 = - case E.streamDecodeUtf8With' $ B.pack [97, 0xC2, 97] of - E.InvalidWord t pos word8 f -> whenEqProp pos 1 - . whenEqProp word8 0xC2 - $ case f $ Just 'x' of - E.ThusFar x pos' bs _ -> whenEqProp (t `T.append` x) "axa" - . whenEqProp bs B.empty $ pos' === 3 - _ -> counterexample "Should have recovered from the invalid word (\\xC2)" False - _ -> counterexample "The second word (\\xC2) should have been invalid" False -t_decode_utf8_with_error2 = - case E.streamDecodeUtf8With' $ B.pack [97, 0xC2, 97] of - E.InvalidWord t pos word8 f -> whenEqProp pos 1 - . whenEqProp word8 0xC2 - $ case f Nothing of - E.ThusFar x pos' bs _ -> whenEqProp (t `T.append` x) "aa" - . whenEqProp bs B.empty $ pos' === 3 - _ -> counterexample "Should have recovered from the invalid word (\\xC2)" False - _ -> counterexample "The second word (\\xC2) should have been invalid" False -t_decode_utf8_with_error3 = - case E.streamDecodeUtf8With' $ B.pack [104, 105, 32, 0xe2] of -- hi \xe2 - E.ThusFar t pos bs f -> whenEqProp pos 3 - . whenEqProp bs (B.pack [0xe2]) - $ case f $ B.pack [0x98] of - E.ThusFar t' pos' bs' f' -> whenEqProp pos' 3 - . whenEqProp bs' (B.pack [0xe2, 0x98]) - $ case f' $ B.pack [0x83, 32, 0xFF] of - E.InvalidWord t'' pos'' word8 f'' -> whenEqProp pos'' 7 - . whenEqProp word8 0xFF - $ case f'' $ Just 'x' of - E.ThusFar x pos''' bs'' _ -> whenEqProp (t `T.append` t' `T.append` t'' `T.append` x) "hi ☃ x" - . whenEqProp pos''' 8 $ bs'' === B.empty - _ -> counterexample "Should have been decoded text." False - _ -> counterexample "Should have been an invalid word." False - _ -> counterexample "Should have encountered an incomplete code point." False - _ -> counterexample "Should have encountered an incomplete code point." False -t_decode_utf8_with_error4 = - case E.streamDecodeUtf8With' $ B.pack [104, 105, 32, 0xe2, 0x98, 104] of -- not quite "hi ☃", the last byte is wrong - E.ThusFar _ _ _ _ -> counterexample "Not incomplete, but an invalid word." False - E.InvalidWord t pos word8 _ -> whenEqProp t "hi " . whenEqProp pos 3 $ word8 === 0xe2 - -t_decode_utf16BE_with_error = - case E.streamDecodeUtf16With True $ B.pack [0] of - E.ThusFar t pos bs f -> whenEqProp pos 0 - . whenEqProp bs (B.pack [0]) - $ case f $ B.pack [104, 0, 105, 0, 32, 0xD8, 0x01] of - E.ThusFar t' pos' bs' f' -> whenEqProp pos' 6 - . whenEqProp bs' (B.pack [0xD8, 0x01]) - $ case f' $ B.pack [0xDC, 0x37, 0, 32, 0xDC, 0] of - E.InvalidWord t'' pos'' word16 f'' -> whenEqProp pos'' 12 - . whenEqProp word16 (0xDC `Bits.shiftL` 8) - $ case f'' $ Just 'x' of - E.ThusFar x pos''' bs'' _ -> whenEqProp (t `T.append` t' `T.append` t'' `T.append` x) "hi \x10437 x" - . whenEqProp pos''' 14 $ bs'' === B.empty - _ -> counterexample "Should have been decoded text." False - _ -> counterexample "Should have been an invalid word." False - _ -> counterexample "Should have encountered an incomplete code point." False - _ -> counterexample "Should have encountered an incomplete code point." False - -t_decode_utf16LE_with_error = - case E.streamDecodeUtf16With False $ B.pack [104] of - E.ThusFar t pos bs f -> whenEqProp pos 0 - . whenEqProp bs (B.pack [104]) - $ case f $ B.pack [0, 105, 0, 32, 0, 0x01, 0xD8] of - E.ThusFar t' pos' bs' f' -> whenEqProp pos' 6 - . whenEqProp bs' (B.pack [0x01, 0xD8]) - $ case f' $ B.pack [0x37, 0xDC, 32, 0, 0, 0xDC] of - E.InvalidWord t'' pos'' word16 f'' -> whenEqProp pos'' 12 - . whenEqProp word16 0xDC - $ case f'' $ Just 'x' of - E.ThusFar x pos''' bs'' _ -> whenEqProp (t `T.append` t' `T.append` t'' `T.append` x) "hi \x10437 x" - . whenEqProp pos''' 14 $ bs'' === B.empty - _ -> counterexample "Should have been decoded text." False - _ -> counterexample "Should have been an invalid word." False - _ -> counterexample "Should have encountered an incomplete code point." False - _ -> counterexample "Should have encountered an incomplete code point." False - -t_decode_utf32BE_with_error = - case E.streamDecodeUtf32With True $ B.pack [0, 0, 0, 104, 0, 0, 0, 105, 0, 0] of -- hi \xe2 - E.ThusFar t pos bs f -> whenEqProp pos 8 - . whenEqProp bs (B.pack [0, 0]) - $ case f $ B.pack [0, 32, 0, 0, 0x26] of - E.ThusFar t' pos' bs' f' -> whenEqProp pos' 12 - . whenEqProp bs' (B.pack [0, 0, 0x26]) - $ case f' $ B.pack [0x03, 0, 0, 0, 32, 0, 0, 0xD8, 0] of - E.InvalidWord t'' pos'' word32 f'' -> whenEqProp pos'' 20 - . whenEqProp word32 0x0000D800 - $ case f'' $ Just 'x' of - E.ThusFar x pos''' bs'' _ -> whenEqProp (t `T.append` t' `T.append` t'' `T.append` x) "hi ☃ x" - . whenEqProp pos''' 24 $ bs'' === B.empty - _ -> counterexample "Should have been decoded text." False - _ -> counterexample "Should have been an invalid word." False - _ -> counterexample "Should have encountered an incomplete code point." False - _ -> counterexample "Should have encountered an incomplete code point." False - -t_decode_utf32LE_with_error = - case E.streamDecodeUtf32With False $ B.pack [104, 0, 0, 0, 105, 0, 0, 0, 32, 0] of -- hi \xe2 - E.ThusFar t pos bs f -> whenEqProp pos 8 - . whenEqProp bs (B.pack [32, 0]) - $ case f $ B.pack [0, 0, 0x03, 0x26, 0] of - E.ThusFar t' pos' bs' f' -> whenEqProp pos' 12 - . whenEqProp bs' (B.pack [0x03, 0x26, 0]) - $ case f' $ B.pack [0, 32, 0, 0, 0, 0, 0xD8, 0, 0] of - E.InvalidWord t'' pos'' word32 f'' -> whenEqProp pos'' 20 - . whenEqProp word32 0x00D80000 - $ case f'' $ Just 'x' of - E.ThusFar x pos''' bs'' _ -> whenEqProp (t `T.append` t' `T.append` t'' `T.append` x) "hi ☃ x" - . whenEqProp pos''' 24 $ bs'' === B.empty - _ -> counterexample "Should have been decoded text." False - _ -> counterexample "Should have been an invalid word." False - _ -> counterexample "Should have encountered an incomplete code point." False - _ -> counterexample "Should have encountered an incomplete code point." False - t_infix_concat bs1 text bs2 = forAll (Blind <$> genDecodeErr Replace) $ \(Blind onErr) -> text `T.isInfixOf` @@ -339,18 +349,28 @@ testTranscoding = testProperty "tl_latin1" tl_latin1, testProperty "t_utf8" t_utf8, testProperty "t_utf8'" t_utf8', + testProperty "t_utf8_c" t_utf8_c, testProperty "t_utf8_incr" t_utf8_incr, testProperty "t_utf8_undecoded" t_utf8_undecoded, testProperty "tl_utf8" tl_utf8, testProperty "tl_utf8'" tl_utf8', + testProperty "tl_utf8_c" tl_utf8_c, testProperty "t_utf16LE" t_utf16LE, + testProperty "t_utf16LE_c" t_utf16LE_c, testProperty "tl_utf16LE" tl_utf16LE, + testProperty "tl_utf16LE_c" tl_utf16LE_c, testProperty "t_utf16BE" t_utf16BE, + testProperty "t_utf16BE_c" t_utf16BE_c, testProperty "tl_utf16BE" tl_utf16BE, + testProperty "tl_utf16BE_c" tl_utf16BE_c, testProperty "t_utf32LE" t_utf32LE, + testProperty "t_utf32LE_c" t_utf32LE_c, testProperty "tl_utf32LE" tl_utf32LE, + testProperty "tl_utf32LE_c" tl_utf32LE_c, testProperty "t_utf32BE" t_utf32BE, + testProperty "t_utf32BE_c" t_utf32BE_c, testProperty "tl_utf32BE" tl_utf32BE, + testProperty "tl_utf32BE_c" tl_utf32BE_c, testGroup "builder" [ testProperty "t_encodeUtf8Builder" t_encodeUtf8Builder, testProperty "t_encodeUtf8Builder_nonZeroOffset" t_encodeUtf8Builder_nonZeroOffset, @@ -360,9 +380,18 @@ testTranscoding = ], testGroup "errors" [ testProperty "t_utf8_err" t_utf8_err, + testProperty "t_utf8_c_err" t_utf8_c_err, testProperty "t_utf8_err'" t_utf8_err' ], testGroup "error recovery" [ + testProperty "t_chunk_decode_utf8_1" t_chunk_decode_utf8_1, + testProperty "t_chunk_decode_utf8_2" t_chunk_decode_utf8_2, + testProperty "t_chunk_decode_utf8_3" t_chunk_decode_utf8_3, + testProperty "t_chunk_decode_utf8_4" t_chunk_decode_utf8_4, + testProperty "t_chunk_decode_utf16BE" t_chunk_decode_utf16BE, + testProperty "t_chunk_decode_utf16LE" t_chunk_decode_utf16LE, + testProperty "t_chunk_decode_utf32BE" t_chunk_decode_utf32BE, + testProperty "t_chunk_decode_utf32LE" t_chunk_decode_utf32LE, testProperty "t_decode_utf8_lenient" t_decode_utf8_lenient, testProperty "t_decode_with_error2" t_decode_with_error2, testProperty "t_decode_with_error3" t_decode_with_error3, @@ -371,15 +400,6 @@ testTranscoding = testProperty "t_decode_with_error3'" t_decode_with_error3', testProperty "t_decode_with_error4'" t_decode_with_error4', testProperty "t_decode_with_error5'" t_decode_with_error5', - testProperty "t_decode_utf8_with_error1" t_decode_utf8_with_error1, - testProperty "t_decode_utf8_with_error2" t_decode_utf8_with_error2, - testProperty "t_decode_utf8_with_error3" t_decode_utf8_with_error3, - testProperty "t_decode_utf8_with_error4" t_decode_utf8_with_error4, - testProperty "t_decode_utf16BE_with_error" t_decode_utf16BE_with_error, - testProperty "t_decode_utf16LE_with_error" t_decode_utf16LE_with_error, - testProperty "t_decode_utf32BE_with_error" t_decode_utf32BE_with_error, - testProperty "t_decode_utf32LE_with_error" t_decode_utf32LE_with_error, testProperty "t_infix_concat" t_infix_concat ] ] - \ No newline at end of file diff --git a/text.cabal b/text.cabal index 488f1451..956d8ce3 100644 --- a/text.cabal +++ b/text.cabal @@ -137,6 +137,7 @@ library Data.Text.Array Data.Text.Encoding Data.Text.Encoding.Error + Data.Text.Encoding.Types Data.Text.Foreign Data.Text.IO Data.Text.Internal From c1544101fdaf16c3e433345380a1ad5407853214 Mon Sep 17 00:00:00 2001 From: david-sledge Date: Sun, 10 Jul 2022 20:23:51 -0600 Subject: [PATCH 15/87] fix documentation --- src/Data/Text/Encoding/Types.hs | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/src/Data/Text/Encoding/Types.hs b/src/Data/Text/Encoding/Types.hs index e791a55e..f50e4920 100644 --- a/src/Data/Text/Encoding/Types.hs +++ b/src/Data/Text/Encoding/Types.hs @@ -1,23 +1,13 @@ -{-# LANGUAGE CPP, DeriveDataTypeable #-} {-# LANGUAGE Safe #-} -- | --- Module : Data.Text.Encoding.Error +-- Module : Data.Text.Encoding.Types -- Copyright : (c) Bryan O'Sullivan 2009 -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Portability : GHC -- --- Types and functions for dealing with encoding and decoding errors --- in Unicode text. --- --- The standard functions for encoding and decoding text are strict, --- which is to say that they throw exceptions on invalid input. This --- is often unhelpful on real world input, so alternative functions --- exist that accept custom handlers for dealing with invalid inputs. --- These 'OnError' handlers are normal Haskell functions. You can use --- one of the presupplied functions in this module, or you can write a --- custom handler of your own. +-- Types to indicate the result of an attempt to decode data. module Data.Text.Encoding.Types ( DecodeResult(..) ) From 8414af3cda4d55d1f2a5efdeb78e7e45a299d101 Mon Sep 17 00:00:00 2001 From: david-sledge Date: Mon, 11 Jul 2022 19:14:38 -0600 Subject: [PATCH 16/87] Lazy and strict stream decoders built from chunk decoders --- src/Data/Text/Encoding.hs | 61 ++++++++--- src/Data/Text/Encoding/Types.hs | 19 +++- src/Data/Text/Lazy/Encoding.hs | 38 ++++++- tests/Tests/Properties/Transcoding.hs | 144 +++++++++++++++++++++++--- 4 files changed, 227 insertions(+), 35 deletions(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index 4ed1edc8..9d21d55b 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -30,13 +30,13 @@ module Data.Text.Encoding -- $total decodeLatin1 , decodeUtf8Lenient + , DecodeResult(..) , decodeUtf8Chunk , decodeUtf8Chunks , decodeUtf16Chunk , decodeUtf16Chunks , decodeUtf32Chunk , decodeUtf32Chunks - , DecodeResult(..) -- *** Catchable failure , decodeUtf8' @@ -55,6 +55,11 @@ module Data.Text.Encoding , streamDecodeUtf8With , Decoding(..) + , StreamDecodeResult(..) + , decodeUtf8Stream + , decodeUtf16Stream + , decodeUtf32Stream + -- ** Partial Functions -- $partial , decodeASCII @@ -86,7 +91,7 @@ import qualified Data.ByteString as B import qualified Data.ByteString.Internal as B import qualified Data.ByteString.Short.Internal as SBS import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode) -import Data.Text.Encoding.Types (DecodeResult(..)) +import Data.Text.Encoding.Types (DecodeResult(..), StreamDecodeResult(..)) import Data.Text.Internal (Text(..), empty, append) import Data.Text.Internal.Unsafe (unsafeWithForeignPtr) import Data.Text.Internal.Unsafe.Char (unsafeWrite) @@ -227,14 +232,14 @@ data Progression | NeedMore | Invalid -decodeChunksProxy :: (Bits w, Num w, Storable w) => +decodeChunks :: (Bits w, Num w, Storable w) => w -- only used for Storable.sizeOf argument which the function discards -> (ByteString -> ByteString -> Int -> Maybe (A.Array, Int)) -> ((Int -> Word8) -> Int -> Int -> Progression) -> ByteString -> ByteString -> DecodeResult Text ByteString w -decodeChunksProxy w queryOptimization decodeF bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do +decodeChunks w queryOptimization decodeF bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do marr <- A.new len' outer marr len' 0 0 where @@ -307,16 +312,16 @@ decodeChunksProxy w queryOptimization decodeF bs1@(B.length -> len1) bs2@(B.leng contin srcOff' $ \ t bs' -> DecodeResult t (Just $ bytesToWord wordByteSize 0) bs' srcOff' -decodeChunks :: (Bits w, Num w, Storable w) => +decodeChunksProxy :: (Bits w, Num w, Storable w) => (ByteString -> ByteString -> Int -> Maybe (A.Array, Int)) -> ((Int -> Word8) -> Int -> Int -> Progression) -> ByteString -> ByteString -> DecodeResult Text ByteString w -decodeChunks = decodeChunksProxy undefined -- This allows Haskell can +decodeChunksProxy = decodeChunks undefined -- This allows Haskell can -- determine the size in bytes of a data type using Storable.sizeOf -- so that it doesn't have to be passed as an arugment. Storable.sizeOf --- discards the actual value without resolving it. +-- discards the actual value without evaluating it. queryUtf8DecodeOptimization :: ByteString -> ByteString -> Int -> Maybe (A.Array, Int) queryUtf8DecodeOptimization (B.length -> len1) bs2@(B.length -> len2) srcOff @@ -352,21 +357,21 @@ queryUtf8DecodeOptimization (B.length -> len1) bs2@(B.length -> len2) srcOff decodeUtf8Base :: (Int -> Word8) -> Int -> Int -> Progression decodeUtf8Base index len srcOff = - let decodeFrom off = step (off + 1) . utf8DecodeStart $ index off - where - step i (Incomplete a b) - | i < len = step (i + 1) $ utf8DecodeContinue (index i) a b - step _ st = st - in case decodeFrom srcOff of Accept c -> WriteAndAdvance c (srcOff +) Reject -> Invalid Incomplete{} -> NeedMore + where + decodeFrom off = step (off + 1) . utf8DecodeStart $ index off + + step i (Incomplete a b) + | i < len = step (i + 1) $ utf8DecodeContinue (index i) a b + step _ st = st -- | Decode two 'ByteString's containing UTF-8-encoded text as though -- they were one continuous 'ByteString' returning a 'DecodeResult'. decodeUtf8Chunks :: ByteString -> ByteString -> DecodeResult Text ByteString Word8 -decodeUtf8Chunks = decodeChunks queryUtf8DecodeOptimization decodeUtf8Base +decodeUtf8Chunks = decodeChunksProxy queryUtf8DecodeOptimization decodeUtf8Base -- | Decode a 'ByteString' containing UTF-8-encoded text returning a -- 'DecodeResult'. @@ -379,7 +384,7 @@ noOptimization _ _ _ = Nothing -- | Decode two 'ByteString's containing UTF-16-encoded text as though -- they were one continuous 'ByteString' returning a 'DecodeResult'. decodeUtf16Chunks :: Bool -> ByteString -> ByteString -> DecodeResult Text ByteString Word16 -decodeUtf16Chunks isBE = decodeChunks noOptimization $ \ index len srcOff -> +decodeUtf16Chunks isBE = decodeChunksProxy noOptimization $ \ index len srcOff -> -- get next Word8 pair let writeAndAdvance c n = WriteAndAdvance c $ const n b0 = index $ if isBE then srcOff else srcOff + 1 @@ -408,7 +413,7 @@ decodeUtf16Chunk isBE = decodeUtf16Chunks isBE mempty -- | Decode two 'ByteString's containing UTF-16-encoded text as though -- they were one continuous 'ByteString' returning a 'DecodeResult'. decodeUtf32Chunks :: Bool -> ByteString -> ByteString -> DecodeResult Text ByteString Word32 -decodeUtf32Chunks isBE = decodeChunks noOptimization $ \ index _ srcOff -> +decodeUtf32Chunks isBE = decodeChunksProxy noOptimization $ \ index _ srcOff -> -- get next Word8 quartet case (queryUtf32Bytes . index $ if isBE then srcOff else srcOff + 3) >>= ($ (index $ srcOff + (if isBE then 1 else 2))) @@ -559,6 +564,30 @@ streamDecodeUtf8With onErr = g empty mempty _ -> txt) mempty _ -> Some txt bs1' . g empty) bs1' +decodeStream :: Monoid b => (b -> b -> DecodeResult t b w) -> b -> StreamDecodeResult t b w +decodeStream chunksDecoder = g 0 mempty + where + g pos bs0 bs1 = + let DecodeResult t mW bs1' pos1 = chunksDecoder bs0 bs1 + pos' = pos + pos1 + in + StreamDecodeResult t mW bs1' pos' $ \ bs2 -> g pos' bs1' bs2 + +-- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8- +-- encoded text. +decodeUtf8Stream :: ByteString -> StreamDecodeResult Text ByteString Word8 +decodeUtf8Stream = decodeStream decodeUtf8Chunks + +-- | Decode, in a stream oriented way, a 'ByteString' containing UTF-16- +-- encoded text. +decodeUtf16Stream :: Bool -> ByteString -> StreamDecodeResult Text ByteString Word16 +decodeUtf16Stream = decodeStream . decodeUtf16Chunks + +-- | Decode, in a stream oriented way, a 'ByteString' containing UTF-32- +-- encoded text. +decodeUtf32Stream :: Bool -> ByteString -> StreamDecodeResult Text ByteString Word32 +decodeUtf32Stream = decodeStream . decodeUtf32Chunks + -- | Decode a 'ByteString' containing UTF-8 encoded text that is known -- to be valid. -- diff --git a/src/Data/Text/Encoding/Types.hs b/src/Data/Text/Encoding/Types.hs index f50e4920..06b124a9 100644 --- a/src/Data/Text/Encoding/Types.hs +++ b/src/Data/Text/Encoding/Types.hs @@ -10,7 +10,9 @@ -- Types to indicate the result of an attempt to decode data. module Data.Text.Encoding.Types - ( DecodeResult(..) ) + ( DecodeResult(..) + , StreamDecodeResult(..) + ) where -- | A decoding result on encoded data. @@ -23,3 +25,18 @@ data DecodeResult t b w = DecodeResult -- point was encountered, this is after that code point. !Int -- ^ Byte position of remaining undecoded data. deriving (Eq, Ord, Show, Read) + +-- | A decoding result on encoded data. +data StreamDecodeResult t b w = StreamDecodeResult + !t -- ^ The decoded data up to an incomplete code point at + -- the end of the input data, an invalid word, or to the + -- end of the input. + !(Maybe w) -- ^ If an invalid code point was encountered. + !b -- ^ The remaining undecoded data. If an invald code + -- point was encountered, this is after that code point. + !Int -- ^ Byte position of remaining undecoded data. This is + -- treated as if all the data fed to previous invocations + -- of the continations where one continuous feed. + (b -> StreamDecodeResult t b w) -- ^ Continuation to accept the next + -- span of data to be decoded with + -- the remaining unencoded data. diff --git a/src/Data/Text/Lazy/Encoding.hs b/src/Data/Text/Lazy/Encoding.hs index 62964203..99626efa 100644 --- a/src/Data/Text/Lazy/Encoding.hs +++ b/src/Data/Text/Lazy/Encoding.hs @@ -23,9 +23,14 @@ module Data.Text.Lazy.Encoding -- ** Total Functions #total# -- $total decodeLatin1 + , DecodeResult(..) , decodeUtf8Chunk , decodeUtf16Chunk , decodeUtf32Chunk + , StreamDecodeResult(..) + , decodeUtf8Stream + , decodeUtf16Stream + , decodeUtf32Stream -- *** Catchable failure , decodeUtf8' @@ -61,7 +66,7 @@ module Data.Text.Lazy.Encoding import Control.Exception (evaluate, try) import Data.Monoid (Monoid(..)) import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode) -import Data.Text.Encoding.Types (DecodeResult(..)) +import Data.Text.Encoding.Types (DecodeResult(..), StreamDecodeResult(..)) import Data.Text.Internal.Lazy (Text(..), chunk, empty, foldrChunks) import Data.Word (Word16, Word32, Word8) import qualified Data.ByteString as S @@ -130,12 +135,39 @@ decodeUtf8Chunk = decodeChunk TE.decodeUtf8Chunks -- | Decode a 'ByteString' containing UTF-16-encoded text returning a -- 'DecodeResult'. decodeUtf16Chunk :: Bool -> B.ByteString -> DecodeResult Text B.ByteString Word16 -decodeUtf16Chunk isBE = decodeChunk $ TE.decodeUtf16Chunks isBE +decodeUtf16Chunk = decodeChunk . TE.decodeUtf16Chunks -- | Decode a 'ByteString' containing UTF-32-encoded text returning a -- 'DecodeResult'. decodeUtf32Chunk :: Bool -> B.ByteString -> DecodeResult Text B.ByteString Word32 -decodeUtf32Chunk isBE = decodeChunk $ TE.decodeUtf32Chunks isBE +decodeUtf32Chunk = decodeChunk . TE.decodeUtf32Chunks + +decodeStream + :: (S.ByteString -> S.ByteString -> DecodeResult T.Text S.ByteString w) + -> B.ByteString + -> StreamDecodeResult Text B.ByteString w +decodeStream decoder = g 0 mempty + where + g pos bs0 bs1 = + let DecodeResult t mW bs1' pos1 = decodeChunk decoder $ B.append bs0 bs1 + pos' = pos + pos1 + in + StreamDecodeResult t mW bs1' pos' $ \ bs2 -> g pos' bs1' bs2 + +-- | Decode a 'ByteString' containing UTF-8-encoded text returning a +-- 'StreamDecodeResult'. +decodeUtf8Stream :: B.ByteString -> StreamDecodeResult Text B.ByteString Word8 +decodeUtf8Stream = decodeStream TE.decodeUtf8Chunks + +-- | Decode a 'ByteString' containing UTF-16-encoded text returning a +-- 'StreamDecodeResult'. +decodeUtf16Stream :: Bool -> B.ByteString -> StreamDecodeResult Text B.ByteString Word16 +decodeUtf16Stream = decodeStream . TE.decodeUtf16Chunks + +-- | Decode a 'ByteString' containing UTF-32-encoded text returning a +-- 'StreamDecodeResult'. +decodeUtf32Stream :: Bool -> B.ByteString -> StreamDecodeResult Text B.ByteString Word32 +decodeUtf32Stream = decodeStream . TE.decodeUtf32Chunks -- | Decode a 'ByteString' containing UTF-8 encoded text. decodeUtf8With :: OnDecodeError -> B.ByteString -> Text diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs index 54a282fe..81767e75 100644 --- a/tests/Tests/Properties/Transcoding.hs +++ b/tests/Tests/Properties/Transcoding.hs @@ -44,46 +44,76 @@ whenEqProp a b next = if a == b t_utf8 = (E.decodeUtf8 . E.encodeUtf8) `eq` id t_utf8' = (E.decodeUtf8' . E.encodeUtf8) `eq` (id . Right) -tl_utf8 = (EL.decodeUtf8 . EL.encodeUtf8) `eq` id -tl_utf8' = (EL.decodeUtf8' . EL.encodeUtf8) `eq` (id . Right) t_utf8_c = (\ t -> let E.DecodeResult t' mC bs _ = E.decodeUtf8Chunk $ E.encodeUtf8 t in (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) +t_utf8_s = (\ t -> + let E.StreamDecodeResult t' mC bs _ _ = E.decodeUtf8Stream $ E.encodeUtf8 t in + (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) +tl_utf8 = (EL.decodeUtf8 . EL.encodeUtf8) `eq` id +tl_utf8' = (EL.decodeUtf8' . EL.encodeUtf8) `eq` (id . Right) tl_utf8_c = (\ t -> let E.DecodeResult t' mC bs _ = EL.decodeUtf8Chunk $ EL.encodeUtf8 t in (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) +tl_utf8_s = (\ t -> + let E.StreamDecodeResult t' mC bs _ _ = EL.decodeUtf8Stream $ EL.encodeUtf8 t in + (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) t_utf16LE = (E.decodeUtf16LE . E.encodeUtf16LE) `eq` id t_utf16LE_c = (\ t -> let E.DecodeResult t' mC bs _ = E.decodeUtf16Chunk False $ E.encodeUtf16LE t in (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) +t_utf16LE_s = (\ t -> + let E.StreamDecodeResult t' mC bs _ _ = E.decodeUtf16Stream False $ E.encodeUtf16LE t in + (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) +tl_utf16LE = (EL.decodeUtf16LE . EL.encodeUtf16LE) `eq` id tl_utf16LE_c = (\ t -> let E.DecodeResult t' mC bs _ = EL.decodeUtf16Chunk False $ EL.encodeUtf16LE t in (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) -tl_utf16LE = (EL.decodeUtf16LE . EL.encodeUtf16LE) `eq` id +tl_utf16LE_s = (\ t -> + let E.StreamDecodeResult t' mC bs _ _ = EL.decodeUtf16Stream False $ EL.encodeUtf16LE t in + (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) t_utf16BE = (E.decodeUtf16BE . E.encodeUtf16BE) `eq` id t_utf16BE_c = (\ t -> let E.DecodeResult t' mC bs _ = E.decodeUtf16Chunk True $ E.encodeUtf16BE t in (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) +t_utf16BE_s = (\ t -> + let E.StreamDecodeResult t' mC bs _ _ = E.decodeUtf16Stream True $ E.encodeUtf16BE t in + (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) +tl_utf16BE = (EL.decodeUtf16BE . EL.encodeUtf16BE) `eq` id tl_utf16BE_c = (\ t -> let E.DecodeResult t' mC bs _ = EL.decodeUtf16Chunk True $ EL.encodeUtf16BE t in (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) -tl_utf16BE = (EL.decodeUtf16BE . EL.encodeUtf16BE) `eq` id +tl_utf16BE_s = (\ t -> + let E.StreamDecodeResult t' mC bs _ _ = EL.decodeUtf16Stream True $ EL.encodeUtf16BE t in + (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) t_utf32LE = (E.decodeUtf32LE . E.encodeUtf32LE) `eq` id t_utf32LE_c = (\ t -> let E.DecodeResult t' mC bs _ = E.decodeUtf32Chunk False $ E.encodeUtf32LE t in (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) +t_utf32LE_s = (\ t -> + let E.StreamDecodeResult t' mC bs _ _ = E.decodeUtf32Stream False $ E.encodeUtf32LE t in + (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) +tl_utf32LE = (EL.decodeUtf32LE . EL.encodeUtf32LE) `eq` id tl_utf32LE_c = (\ t -> let E.DecodeResult t' mC bs _ = EL.decodeUtf32Chunk False $ EL.encodeUtf32LE t in (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) -tl_utf32LE = (EL.decodeUtf32LE . EL.encodeUtf32LE) `eq` id +tl_utf32LE_s = (\ t -> + let E.StreamDecodeResult t' mC bs _ _ = EL.decodeUtf32Stream False $ EL.encodeUtf32LE t in + (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) t_utf32BE = (E.decodeUtf32BE . E.encodeUtf32BE) `eq` id t_utf32BE_c = (\ t -> let E.DecodeResult t' mC bs _ = E.decodeUtf32Chunk True $ E.encodeUtf32BE t in (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) +t_utf32BE_s = (\ t -> + let E.StreamDecodeResult t' mC bs _ _ = E.decodeUtf32Stream True $ E.encodeUtf32BE t in + (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) +tl_utf32BE = (EL.decodeUtf32BE . EL.encodeUtf32BE) `eq` id tl_utf32BE_c = (\ t -> let E.DecodeResult t' mC bs _ = EL.decodeUtf32Chunk True $ EL.encodeUtf32BE t in (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) -tl_utf32BE = (EL.decodeUtf32BE . EL.encodeUtf32BE) `eq` id +tl_utf32BE_s = (\ t -> + let E.StreamDecodeResult t' mC bs _ _ = EL.decodeUtf32Stream True $ EL.encodeUtf32BE t in + (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) runBuilder :: B.Builder -> B.ByteString runBuilder = @@ -168,6 +198,13 @@ t_utf8_c_err bad = Just w -> counterexample (show w) True _ -> counterexample (show t) $ B.length bs > 0 +t_utf8_s_err :: InvalidUtf8 -> Property +t_utf8_s_err bad = + let E.StreamDecodeResult t mW bs _ _ = E.decodeUtf8Stream (toByteString bad) in + case mW of + Just w -> counterexample (show w) True + _ -> counterexample (show t) $ B.length bs > 0 + t_utf8_err' :: B.ByteString -> Bool t_utf8_err' bs = case E.decodeUtf8' bs of Left err -> length (show err) >= 0 @@ -230,11 +267,6 @@ t_chunk_decode_utf8_3 = whenEqProp decodeResult1 (E.DecodeResult "" Nothing expectedBs1 0) $ let decodeResult2 = E.decodeUtf8Chunk $ expectedBs1 `mappend` B.pack [0x83, 32, 0xFF] in decodeResult2 === (E.DecodeResult "☃ " (Just 0xFF) mempty 5) -t_chunk_decode_utf8_4 = - let decodeResult0 = E.decodeUtf8Chunk $ B.pack [104, 105, 32, 0xe2, 0x98, 104] - expectedBs0 = B.pack [0x98, 104] - in - decodeResult0 === (E.DecodeResult "hi " (Just 0xe2) expectedBs0 4) t_chunk_decode_utf16BE = let decode = E.decodeUtf16Chunk True @@ -288,6 +320,54 @@ t_chunk_decode_utf32LE = in decodeResult2 === (E.DecodeResult "☃ " (Just 0xD80000) expBs2 12) +t_stream_decode_utf8_1 = + let (E.StreamDecodeResult t0 mW0 bs0 pos0 _) = E.decodeUtf8Stream $ B.pack [0x68, 0x69, 0x2C, 0x20, 0xe2, 0x98, 0x83, 0x21] in + (t0, mW0, bs0, pos0) === ("hi, ☃!", Nothing, mempty, 8) +t_stream_decode_utf8_2 = + let (E.StreamDecodeResult t0 mW0 bs0 pos0 f0) = E.decodeUtf8Stream $ B.pack [97, 0xC2, 97] in + whenEqProp (t0, mW0, bs0, pos0) (T.singleton 'a', Just 0xC2, B.singleton 97, 2) $ + let (E.StreamDecodeResult t1 mW1 bs1 pos1 _) = f0 mempty in + (t1, mW1, bs1, pos1) === (T.singleton 'a', Nothing, mempty, 3) +t_stream_decode_utf8_3 = + let (E.StreamDecodeResult t0 mW0 bs0 pos0 f0) = E.decodeUtf8Stream $ B.pack [104, 105, 32, 0xe2] in + whenEqProp (t0, mW0, bs0, pos0) ("hi ", Nothing, B.singleton 0xe2, 3) $ + let (E.StreamDecodeResult t1 mW1 bs1 pos1 f1) = f0 $ B.singleton 0x98 in + whenEqProp (t1, mW1, bs1, pos1) ("", Nothing, B.pack [0xe2, 0x98], 3) $ + let (E.StreamDecodeResult t2 mW2 bs2 pos2 _) = f1 $ B.pack [0x83, 32, 0xFF] in + (t2, mW2, bs2, pos2) === ("☃ ", Just 0xFF, mempty, 8) + +t_stream_decode_utf16BE = + let expectedBs0 = B.pack [0] + (E.StreamDecodeResult t0 mW0 bs0 pos0 f0) = E.decodeUtf16Stream True expectedBs0 in + whenEqProp (t0, mW0, bs0, pos0) (T.empty, Nothing, expectedBs0, 0) $ + let (E.StreamDecodeResult t1 mW1 bs1 pos1 f1) = f0 $ B.pack [104, 0, 105, 0, 32, 0xD8, 0x01] in + whenEqProp (t1, mW1, bs1, pos1) ("hi ", Nothing, B.pack [0xD8, 0x01], 6) $ + let (E.StreamDecodeResult t2 mW2 bs2 pos2 _) = f1 $ B.pack [0xDC, 0x37, 0, 32, 0xDC, 0] in + (t2, mW2, bs2, pos2) === ("\x10437 ", Just 0xDC00, mempty, 14) +t_stream_decode_utf16LE = + let expectedBs0 = B.pack [104] + (E.StreamDecodeResult t0 mW0 bs0 pos0 f0) = E.decodeUtf16Stream False expectedBs0 in + whenEqProp (t0, mW0, bs0, pos0) (T.empty, Nothing, expectedBs0, 0) $ + let (E.StreamDecodeResult t1 mW1 bs1 pos1 f1) = f0 $ B.pack [0, 105, 0, 32, 0, 0x01, 0xD8] in + whenEqProp (t1, mW1, bs1, pos1) ("hi ", Nothing, B.pack [0x01, 0xD8], 6) $ + let (E.StreamDecodeResult t2 mW2 bs2 pos2 _) = f1 $ B.pack [0x37, 0xDC, 32, 0, 0, 0xDC] in + (t2, mW2, bs2, pos2) === ("\x10437 ", Just 0xDC, mempty, 14) + +t_stream_decode_utf32BE = + let (E.StreamDecodeResult t0 mW0 bs0 pos0 f0) = E.decodeUtf32Stream True $ B.pack [0, 0, 0, 104, 0, 0, 0, 105, 0, 0] in + whenEqProp (t0, mW0, bs0, pos0) ("hi", Nothing, B.pack [0, 0], 8) $ + let (E.StreamDecodeResult t1 mW1 bs1 pos1 f1) = f0 $ B.pack [0, 32, 0, 0, 0x26] in + whenEqProp (t1, mW1, bs1, pos1) (" ", Nothing, B.pack [0, 0, 0x26], 12) $ + let (E.StreamDecodeResult t2 mW2 bs2 pos2 _) = f1 $ B.pack [0x03, 0, 0, 0, 32, 0, 0, 0xD8, 0] in + (t2, mW2, bs2, pos2) === ("☃ ", Just 0xD800, mempty, 24) +t_stream_decode_utf32LE = + let (E.StreamDecodeResult t0 mW0 bs0 pos0 f0) = E.decodeUtf32Stream False $ B.pack [104, 0, 0, 0, 105, 0, 0, 0, 0x20, 0] in + whenEqProp (t0, mW0, bs0, pos0) ("hi", Nothing, B.pack [0x20, 0], 8) $ + let (E.StreamDecodeResult t1 mW1 bs1 pos1 f1) = f0 $ B.pack [0, 0, 0x03, 0x26, 0] in + whenEqProp (t1, mW1, bs1, pos1) (" ", Nothing, B.pack [0x03, 0x26, 0], 12) $ + let (E.StreamDecodeResult t2 mW2 bs2 pos2 _) = f1 $ B.pack [0, 32, 0, 0, 0, 0, 0xD8, 0, 0] in + (t2, mW2, bs2, pos2) === ("☃ ", Just 0xD80000, mempty, 24) + decodeLL :: BL.ByteString -> TL.Text decodeLL = EL.decodeUtf8With E.lenientDecode @@ -300,11 +380,27 @@ t_decode_utf8_lenient :: Property t_decode_utf8_lenient = forAllShrinkShow arbitrary shrink (show . BL.toChunks) $ \bs -> decodeLL bs === (TL.fromStrict . decodeL . B.concat . BL.toChunks) bs +decodeStream decoder snoc bs = + g mempty $ decoder bs + where + g t0 (E.StreamDecodeResult t mW bs' pos f) = + let t0' = t0 `mappend` t in + case mW of + Just _ -> let mC = E.lenientDecode "" mW in + let t' = case mC of + Just c -> t0' `snoc` c + _ -> t0' + in g t' $ f mempty + _ -> (t0', bs', pos) + -- The decoding of lazy bytestrings should not depend on how they are chunked, -- and it should behave the same as decoding of strict bytestrings. --- t_decode_utf8_chunk :: Property --- t_decode_utf8_chunk = forAllShrinkShow arbitrary shrink (show . BL.toChunks) $ \bs -> --- EL.decodeUtf8Chunk bs === (TL.fromStrict . E.decodeUtf8Chunk . B.concat . BL.toChunks) bs +t_decode_utf8_stream :: Property +t_decode_utf8_stream = forAllShrinkShow arbitrary shrink (show . BL.toChunks) $ \bs -> + decodeStream EL.decodeUtf8Stream TL.snoc bs === + ( let (st, sb, pos) = decodeStream E.decodeUtf8Stream T.snoc . B.concat $ BL.toChunks bs in + (TL.fromStrict st, BL.fromStrict sb, pos) + ) -- See http://unicode.org/faq/utf_bom.html#gen8 -- A sequence such as <110xxxxx2 0xxxxxxx2> is illegal ... @@ -350,27 +446,37 @@ testTranscoding = testProperty "t_utf8" t_utf8, testProperty "t_utf8'" t_utf8', testProperty "t_utf8_c" t_utf8_c, + testProperty "t_utf8_s" t_utf8_s, testProperty "t_utf8_incr" t_utf8_incr, testProperty "t_utf8_undecoded" t_utf8_undecoded, testProperty "tl_utf8" tl_utf8, testProperty "tl_utf8'" tl_utf8', testProperty "tl_utf8_c" tl_utf8_c, + testProperty "tl_utf8_s" tl_utf8_s, testProperty "t_utf16LE" t_utf16LE, testProperty "t_utf16LE_c" t_utf16LE_c, + testProperty "t_utf16LE_s" t_utf16LE_s, testProperty "tl_utf16LE" tl_utf16LE, testProperty "tl_utf16LE_c" tl_utf16LE_c, + testProperty "tl_utf16LE_s" tl_utf16LE_s, testProperty "t_utf16BE" t_utf16BE, testProperty "t_utf16BE_c" t_utf16BE_c, + testProperty "t_utf16BE_s" t_utf16BE_s, testProperty "tl_utf16BE" tl_utf16BE, testProperty "tl_utf16BE_c" tl_utf16BE_c, + testProperty "tl_utf16BE_s" tl_utf16BE_s, testProperty "t_utf32LE" t_utf32LE, testProperty "t_utf32LE_c" t_utf32LE_c, + testProperty "t_utf32LE_s" t_utf32LE_s, testProperty "tl_utf32LE" tl_utf32LE, testProperty "tl_utf32LE_c" tl_utf32LE_c, + testProperty "tl_utf32LE_s" tl_utf32LE_s, testProperty "t_utf32BE" t_utf32BE, testProperty "t_utf32BE_c" t_utf32BE_c, + testProperty "t_utf32BE_s" t_utf32BE_s, testProperty "tl_utf32BE" tl_utf32BE, testProperty "tl_utf32BE_c" tl_utf32BE_c, + testProperty "tl_utf32BE_s" tl_utf32BE_s, testGroup "builder" [ testProperty "t_encodeUtf8Builder" t_encodeUtf8Builder, testProperty "t_encodeUtf8Builder_nonZeroOffset" t_encodeUtf8Builder_nonZeroOffset, @@ -381,18 +487,26 @@ testTranscoding = testGroup "errors" [ testProperty "t_utf8_err" t_utf8_err, testProperty "t_utf8_c_err" t_utf8_c_err, + testProperty "t_utf8_s_err" t_utf8_s_err, testProperty "t_utf8_err'" t_utf8_err' ], testGroup "error recovery" [ testProperty "t_chunk_decode_utf8_1" t_chunk_decode_utf8_1, testProperty "t_chunk_decode_utf8_2" t_chunk_decode_utf8_2, testProperty "t_chunk_decode_utf8_3" t_chunk_decode_utf8_3, - testProperty "t_chunk_decode_utf8_4" t_chunk_decode_utf8_4, testProperty "t_chunk_decode_utf16BE" t_chunk_decode_utf16BE, testProperty "t_chunk_decode_utf16LE" t_chunk_decode_utf16LE, testProperty "t_chunk_decode_utf32BE" t_chunk_decode_utf32BE, testProperty "t_chunk_decode_utf32LE" t_chunk_decode_utf32LE, + testProperty "t_stream_decode_utf8_1" t_stream_decode_utf8_1, + testProperty "t_stream_decode_utf8_2" t_stream_decode_utf8_2, + testProperty "t_stream_decode_utf8_3" t_stream_decode_utf8_3, + testProperty "t_stream_decode_utf16BE" t_stream_decode_utf16BE, + testProperty "t_stream_decode_utf16LE" t_stream_decode_utf16LE, + testProperty "t_stream_decode_utf32BE" t_stream_decode_utf32BE, + testProperty "t_stream_decode_utf32LE" t_stream_decode_utf32LE, testProperty "t_decode_utf8_lenient" t_decode_utf8_lenient, + testProperty "t_decode_utf8_stream" t_decode_utf8_stream, testProperty "t_decode_with_error2" t_decode_with_error2, testProperty "t_decode_with_error3" t_decode_with_error3, testProperty "t_decode_with_error4" t_decode_with_error4, From e1eded8e829be5d6aa0a4ab5ba5b4540fdd9b6c9 Mon Sep 17 00:00:00 2001 From: david-sledge Date: Tue, 12 Jul 2022 20:21:10 -0600 Subject: [PATCH 17/87] removed decodeUtf[X]Stream functions but left the function chunksDecoderToStream --- src/Data/Text/Encoding.hs | 221 ++++++++++---------------- src/Data/Text/Encoding/Types.hs | 14 ++ src/Data/Text/Lazy/Encoding.hs | 83 ++++------ tests/Tests/Properties/Transcoding.hs | 181 +++++++++------------ 4 files changed, 200 insertions(+), 299 deletions(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index 9d21d55b..41752922 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -31,11 +31,8 @@ module Data.Text.Encoding decodeLatin1 , decodeUtf8Lenient , DecodeResult(..) - , decodeUtf8Chunk , decodeUtf8Chunks - , decodeUtf16Chunk , decodeUtf16Chunks - , decodeUtf32Chunk , decodeUtf32Chunks -- *** Catchable failure @@ -55,11 +52,6 @@ module Data.Text.Encoding , streamDecodeUtf8With , Decoding(..) - , StreamDecodeResult(..) - , decodeUtf8Stream - , decodeUtf16Stream - , decodeUtf32Stream - -- ** Partial Functions -- $partial , decodeASCII @@ -91,7 +83,7 @@ import qualified Data.ByteString as B import qualified Data.ByteString.Internal as B import qualified Data.ByteString.Short.Internal as SBS import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode) -import Data.Text.Encoding.Types (DecodeResult(..), StreamDecodeResult(..)) +import Data.Text.Encoding.Types (DecodeResult(..)) import Data.Text.Internal (Text(..), empty, append) import Data.Text.Internal.Unsafe (unsafeWithForeignPtr) import Data.Text.Internal.Unsafe.Char (unsafeWrite) @@ -239,78 +231,81 @@ decodeChunks :: (Bits w, Num w, Storable w) => -> ByteString -> ByteString -> DecodeResult Text ByteString w -decodeChunks w queryOptimization decodeF bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do - marr <- A.new len' - outer marr len' 0 0 - where - wordByteSize = sizeOf w - - index :: Int -> Word8 - index i - | i < len1 = B.index bs1 i - | otherwise = B.index bs2 $ i - len1 - - len :: Int - len = len1 + len2 - len' :: Int - len' = (len `div` wordByteSize) + 4 - - -- outer :: (Bits w, Num w) => A.MArray s -> Int -> Int -> Int -> ST s (DecodeResult Text ByteString w) - outer dst dstLen = inner - where - inner srcOff dstOff - -- finished - | len - srcOff < 1 = goodSoFar - -- shortcut for utf-8 - | otherwise = - case queryOptimization bs1 bs2 srcOff of - Just (arr, tLen) -> - let minLen = tLen + dstOff in - if minLen > dstLen - then do - let dstLen' = minLen + 4 - dst' <- A.resizeM dst dstLen' - A.copyI tLen dst' dstOff arr 0 - outer dst' dstLen' (srcOff + tLen) minLen - else do - A.copyI tLen dst dstOff arr 0 - inner (srcOff + tLen) minLen - _ -> if len - srcOff < wordByteSize - -- incomplete code point - then goodSoFar - else - if dstOff + 4 > dstLen - -- need more space in destination - then do - let dstLen' = dstLen + 4 - dst' <- A.resizeM dst dstLen' - outer dst' dstLen' srcOff dstOff - else - case decodeF index len srcOff of - WriteAndAdvance c advance -> do - d <- unsafeWrite dst dstOff c - inner (advance d) $ dstOff + d - NeedMore -> goodSoFar - Invalid -> invalid - where - contin off res = do - A.shrinkM dst dstOff - arr <- A.unsafeFreeze dst - pure . res (Text arr 0 dstOff) $ if off >= len1 - then B.drop (off - len1) bs2 - else B.drop off $ bs1 `B.append` bs2 - goodSoFar = - contin srcOff $ \ t bs' -> - DecodeResult t Nothing bs' srcOff - invalid = - let srcOff' = srcOff + wordByteSize - bytesToWord n word = - if n > 0 - then bytesToWord (n - 1) $ (fromIntegral . index $ srcOff + wordByteSize - n) .|. (word `shiftL` 8) - else word - in - contin srcOff' $ \ t bs' -> - DecodeResult t (Just $ bytesToWord wordByteSize 0) bs' srcOff' +decodeChunks w queryOptimization decodeF bs1@(B.length -> len1) bs2@(B.length -> len2) + | len2 == 0 + , len1 > 0 = decodeChunks w queryOptimization decodeF bs2 bs1 + | otherwise = runST $ do + marr <- A.new len' + outer marr len' 0 0 + where + wordByteSize = sizeOf w + + index :: Int -> Word8 + index i + | i < len1 = B.index bs1 i + | otherwise = B.index bs2 $ i - len1 + + len :: Int + len = len1 + len2 + len' :: Int + len' = (len `div` wordByteSize) + 4 + + -- outer :: (Bits w, Num w) => A.MArray s -> Int -> Int -> Int -> ST s (DecodeResult Text ByteString w) + outer dst dstLen = inner + where + inner srcOff dstOff + -- finished + | len - srcOff < 1 = goodSoFar + -- shortcut for utf-8 + | otherwise = + case queryOptimization bs1 bs2 srcOff of + Just (arr, tLen) -> + let minLen = tLen + dstOff in + if minLen > dstLen + then do + let dstLen' = minLen + 4 + dst' <- A.resizeM dst dstLen' + A.copyI tLen dst' dstOff arr 0 + outer dst' dstLen' (srcOff + tLen) minLen + else do + A.copyI tLen dst dstOff arr 0 + inner (srcOff + tLen) minLen + _ -> if len - srcOff < wordByteSize + -- incomplete code point + then goodSoFar + else + if dstOff + 4 > dstLen + -- need more space in destination + then do + let dstLen' = dstLen + 4 + dst' <- A.resizeM dst dstLen' + outer dst' dstLen' srcOff dstOff + else + case decodeF index len srcOff of + WriteAndAdvance c advance -> do + d <- unsafeWrite dst dstOff c + inner (advance d) $ dstOff + d + NeedMore -> goodSoFar + Invalid -> invalid + where + contin off res = do + A.shrinkM dst dstOff + arr <- A.unsafeFreeze dst + pure . res (Text arr 0 dstOff) $ if off >= len1 + then B.drop (off - len1) bs2 + else B.drop off $ bs1 `B.append` bs2 + goodSoFar = + contin srcOff $ \ t bs' -> + DecodeResult t Nothing bs' srcOff + invalid = + let srcOff' = srcOff + wordByteSize + bytesToWord n word = + if n > 0 + then bytesToWord (n - 1) $ (fromIntegral . index $ srcOff + wordByteSize - n) .|. (word `shiftL` 8) + else word + in + contin srcOff' $ \ t bs' -> + DecodeResult t (Just $ bytesToWord wordByteSize 0) bs' srcOff' decodeChunksProxy :: (Bits w, Num w, Storable w) => (ByteString -> ByteString -> Int -> Maybe (A.Array, Int)) @@ -318,7 +313,7 @@ decodeChunksProxy :: (Bits w, Num w, Storable w) => -> ByteString -> ByteString -> DecodeResult Text ByteString w -decodeChunksProxy = decodeChunks undefined -- This allows Haskell can +decodeChunksProxy = decodeChunks undefined -- This allows Haskell to -- determine the size in bytes of a data type using Storable.sizeOf -- so that it doesn't have to be passed as an arugment. Storable.sizeOf -- discards the actual value without evaluating it. @@ -355,28 +350,20 @@ queryUtf8DecodeOptimization (B.length -> len1) bs2@(B.length -> len2) srcOff w2 = B.index bs2 (len2 - 3) w3 = B.index bs2 (len2 - 4) -decodeUtf8Base :: (Int -> Word8) -> Int -> Int -> Progression -decodeUtf8Base index len srcOff = - case decodeFrom srcOff of - Accept c -> WriteAndAdvance c (srcOff +) - Reject -> Invalid - Incomplete{} -> NeedMore - where - decodeFrom off = step (off + 1) . utf8DecodeStart $ index off - - step i (Incomplete a b) - | i < len = step (i + 1) $ utf8DecodeContinue (index i) a b - step _ st = st - -- | Decode two 'ByteString's containing UTF-8-encoded text as though -- they were one continuous 'ByteString' returning a 'DecodeResult'. decodeUtf8Chunks :: ByteString -> ByteString -> DecodeResult Text ByteString Word8 -decodeUtf8Chunks = decodeChunksProxy queryUtf8DecodeOptimization decodeUtf8Base +decodeUtf8Chunks = decodeChunksProxy queryUtf8DecodeOptimization $ \ index len srcOff -> + let decodeFrom off = step (off + 1) . utf8DecodeStart $ index off --- | Decode a 'ByteString' containing UTF-8-encoded text returning a --- 'DecodeResult'. -decodeUtf8Chunk :: ByteString -> DecodeResult Text ByteString Word8 -decodeUtf8Chunk = decodeUtf8Chunks mempty + step i (Incomplete a b) + | i < len = step (i + 1) $ utf8DecodeContinue (index i) a b + step _ st = st + in + case decodeFrom srcOff of + Accept c -> WriteAndAdvance c (srcOff +) + Reject -> Invalid + Incomplete{} -> NeedMore noOptimization :: p1 -> p2 -> p3 -> Maybe a noOptimization _ _ _ = Nothing @@ -405,11 +392,6 @@ decodeUtf16Chunks isBE = decodeChunksProxy noOptimization $ \ index len srcOff - _ -> Invalid _ -> Invalid --- | Decode a 'ByteString' containing UTF-16-encoded text returning a --- 'DecodeResult'. -decodeUtf16Chunk :: Bool -> ByteString -> DecodeResult Text ByteString Word16 -decodeUtf16Chunk isBE = decodeUtf16Chunks isBE mempty - -- | Decode two 'ByteString's containing UTF-16-encoded text as though -- they were one continuous 'ByteString' returning a 'DecodeResult'. decodeUtf32Chunks :: Bool -> ByteString -> ByteString -> DecodeResult Text ByteString Word32 @@ -421,11 +403,6 @@ decodeUtf32Chunks isBE = decodeChunksProxy noOptimization $ \ index _ srcOff -> Just f -> WriteAndAdvance (f . index $ if isBE then srcOff + 3 else srcOff) . const $ srcOff + 4 _ -> Invalid --- | Decode a 'ByteString' containing UTF-32-encoded text returning a --- 'DecodeResult'. -decodeUtf32Chunk :: Bool -> ByteString -> DecodeResult Text ByteString Word32 -decodeUtf32Chunk isBE = decodeUtf32Chunks isBE mempty - -- | Decode a 'ByteString' containing 7-bit ASCII encoded text. -- -- This is a total function: On success the decoded 'Text' is within a @@ -564,30 +541,6 @@ streamDecodeUtf8With onErr = g empty mempty _ -> txt) mempty _ -> Some txt bs1' . g empty) bs1' -decodeStream :: Monoid b => (b -> b -> DecodeResult t b w) -> b -> StreamDecodeResult t b w -decodeStream chunksDecoder = g 0 mempty - where - g pos bs0 bs1 = - let DecodeResult t mW bs1' pos1 = chunksDecoder bs0 bs1 - pos' = pos + pos1 - in - StreamDecodeResult t mW bs1' pos' $ \ bs2 -> g pos' bs1' bs2 - --- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8- --- encoded text. -decodeUtf8Stream :: ByteString -> StreamDecodeResult Text ByteString Word8 -decodeUtf8Stream = decodeStream decodeUtf8Chunks - --- | Decode, in a stream oriented way, a 'ByteString' containing UTF-16- --- encoded text. -decodeUtf16Stream :: Bool -> ByteString -> StreamDecodeResult Text ByteString Word16 -decodeUtf16Stream = decodeStream . decodeUtf16Chunks - --- | Decode, in a stream oriented way, a 'ByteString' containing UTF-32- --- encoded text. -decodeUtf32Stream :: Bool -> ByteString -> StreamDecodeResult Text ByteString Word32 -decodeUtf32Stream = decodeStream . decodeUtf32Chunks - -- | Decode a 'ByteString' containing UTF-8 encoded text that is known -- to be valid. -- diff --git a/src/Data/Text/Encoding/Types.hs b/src/Data/Text/Encoding/Types.hs index 06b124a9..1f806be9 100644 --- a/src/Data/Text/Encoding/Types.hs +++ b/src/Data/Text/Encoding/Types.hs @@ -12,6 +12,7 @@ module Data.Text.Encoding.Types ( DecodeResult(..) , StreamDecodeResult(..) + , chunksDecoderToStream ) where @@ -40,3 +41,16 @@ data StreamDecodeResult t b w = StreamDecodeResult (b -> StreamDecodeResult t b w) -- ^ Continuation to accept the next -- span of data to be decoded with -- the remaining unencoded data. + +-- | Create a stream decoder from a chunks decoder. The resulting +-- stream decoder will return a 'StreamDecodeResult' which contains a +-- continuation function that accepts another section of unencoded +-- data as a continuation of any remaining unencoded data. +chunksDecoderToStream :: Monoid b => (b -> b -> DecodeResult t b w) -> b -> StreamDecodeResult t b w +chunksDecoderToStream chunksDecoder = g 0 mempty + where + g pos bs0 bs1 = + let DecodeResult t mW bs1' pos1 = chunksDecoder bs0 bs1 + pos' = pos + pos1 + in + StreamDecodeResult t mW bs1' pos' $ \ bs2 -> g pos' bs1' bs2 diff --git a/src/Data/Text/Lazy/Encoding.hs b/src/Data/Text/Lazy/Encoding.hs index 99626efa..1e1360fa 100644 --- a/src/Data/Text/Lazy/Encoding.hs +++ b/src/Data/Text/Lazy/Encoding.hs @@ -24,13 +24,9 @@ module Data.Text.Lazy.Encoding -- $total decodeLatin1 , DecodeResult(..) - , decodeUtf8Chunk - , decodeUtf16Chunk - , decodeUtf32Chunk - , StreamDecodeResult(..) - , decodeUtf8Stream - , decodeUtf16Stream - , decodeUtf32Stream + , decodeUtf8Chunks + , decodeUtf16Chunks + , decodeUtf32Chunks -- *** Catchable failure , decodeUtf8' @@ -66,7 +62,7 @@ module Data.Text.Lazy.Encoding import Control.Exception (evaluate, try) import Data.Monoid (Monoid(..)) import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode) -import Data.Text.Encoding.Types (DecodeResult(..), StreamDecodeResult(..)) +import Data.Text.Encoding.Types (DecodeResult(..)) import Data.Text.Internal.Lazy (Text(..), chunk, empty, foldrChunks) import Data.Word (Word16, Word32, Word8) import qualified Data.ByteString as S @@ -113,61 +109,36 @@ decodeASCII = foldr (chunk . TE.decodeASCII) empty . B.toChunks decodeLatin1 :: B.ByteString -> Text decodeLatin1 = foldr (chunk . TE.decodeLatin1) empty . B.toChunks -decodeChunk :: (S.ByteString -> S.ByteString -> DecodeResult T.Text S.ByteString w) +decodeChunks :: (S.ByteString -> S.ByteString -> DecodeResult T.Text S.ByteString w) + -> B.ByteString -> B.ByteString -> DecodeResult Text B.ByteString w -decodeChunk decoder = g id 0 mempty +decodeChunks decoder = g id 0 mempty where - g tDiff pos sb0 (B.Chunk sb1 lb1) = + g tDiff pos sb0 (B.Chunk sb1 lb1) lb2 = let DecodeResult t mW sb1' pos' = decoder sb0 sb1 pos1 = pos + pos' in case mW of - Just _ -> DecodeResult (tDiff (chunk t Empty)) mW (B.chunk sb1' lb1) pos1 - _ -> (g (tDiff . chunk t) pos1 sb1' lb1) - g tDiff pos sb0 _ = DecodeResult (tDiff Empty) Nothing (B.chunk sb0 B.Empty) pos - --- | Decode a 'ByteString' containing UTF-8-encoded text returning a --- 'DecodeResult'. -decodeUtf8Chunk :: B.ByteString -> DecodeResult Text B.ByteString Word8 -decodeUtf8Chunk = decodeChunk TE.decodeUtf8Chunks - --- | Decode a 'ByteString' containing UTF-16-encoded text returning a --- 'DecodeResult'. -decodeUtf16Chunk :: Bool -> B.ByteString -> DecodeResult Text B.ByteString Word16 -decodeUtf16Chunk = decodeChunk . TE.decodeUtf16Chunks - --- | Decode a 'ByteString' containing UTF-32-encoded text returning a --- 'DecodeResult'. -decodeUtf32Chunk :: Bool -> B.ByteString -> DecodeResult Text B.ByteString Word32 -decodeUtf32Chunk = decodeChunk . TE.decodeUtf32Chunks - -decodeStream - :: (S.ByteString -> S.ByteString -> DecodeResult T.Text S.ByteString w) - -> B.ByteString - -> StreamDecodeResult Text B.ByteString w -decodeStream decoder = g 0 mempty - where - g pos bs0 bs1 = - let DecodeResult t mW bs1' pos1 = decodeChunk decoder $ B.append bs0 bs1 - pos' = pos + pos1 - in - StreamDecodeResult t mW bs1' pos' $ \ bs2 -> g pos' bs1' bs2 - --- | Decode a 'ByteString' containing UTF-8-encoded text returning a --- 'StreamDecodeResult'. -decodeUtf8Stream :: B.ByteString -> StreamDecodeResult Text B.ByteString Word8 -decodeUtf8Stream = decodeStream TE.decodeUtf8Chunks - --- | Decode a 'ByteString' containing UTF-16-encoded text returning a --- 'StreamDecodeResult'. -decodeUtf16Stream :: Bool -> B.ByteString -> StreamDecodeResult Text B.ByteString Word16 -decodeUtf16Stream = decodeStream . TE.decodeUtf16Chunks - --- | Decode a 'ByteString' containing UTF-32-encoded text returning a --- 'StreamDecodeResult'. -decodeUtf32Stream :: Bool -> B.ByteString -> StreamDecodeResult Text B.ByteString Word32 -decodeUtf32Stream = decodeStream . TE.decodeUtf32Chunks + Just _ -> DecodeResult (tDiff $ chunk t Empty) mW (B.chunk sb1' $ lb1 `B.append` lb2) pos1 + _ -> g (tDiff . chunk t) pos1 sb1' lb1 lb2 + g tDiff pos sb0 _ (B.Chunk sb1 lb1) = g tDiff pos sb0 (B.Chunk sb1 lb1) mempty + g tDiff pos sb0 _ _ = DecodeResult (tDiff Empty) Nothing (B.chunk sb0 mempty) pos + +-- | Decode two 'ByteString's containing UTF-8-encoded text as though +-- they were one continuous 'ByteString' returning a 'DecodeResult'. +decodeUtf8Chunks :: B.ByteString -> B.ByteString -> DecodeResult Text B.ByteString Word8 +decodeUtf8Chunks = decodeChunks TE.decodeUtf8Chunks + +-- | Decode two 'ByteString's containing UTF-16-encoded text as though +-- they were one continuous 'ByteString' returning a 'DecodeResult'. +decodeUtf16Chunks :: Bool -> B.ByteString -> B.ByteString -> DecodeResult Text B.ByteString Word16 +decodeUtf16Chunks = decodeChunks . TE.decodeUtf16Chunks + +-- | Decode two 'ByteString's containing UTF-32-encoded text as though +-- they were one continuous 'ByteString' returning a 'DecodeResult'. +decodeUtf32Chunks :: Bool -> B.ByteString -> B.ByteString -> DecodeResult Text B.ByteString Word32 +decodeUtf32Chunks = decodeChunks . TE.decodeUtf32Chunks -- | Decode a 'ByteString' containing UTF-8 encoded text. decodeUtf8With :: OnDecodeError -> B.ByteString -> Text diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs index 81767e75..bc1c36e4 100644 --- a/tests/Tests/Properties/Transcoding.hs +++ b/tests/Tests/Properties/Transcoding.hs @@ -24,6 +24,7 @@ import qualified Data.ByteString.Lazy.Char8 as BLC import qualified Data.Text as T import qualified Data.Text.Encoding as E import qualified Data.Text.Encoding.Error as E +import qualified Data.Text.Encoding.Types as Ty import qualified Data.Text.Lazy as TL import qualified Data.Text.Lazy.Encoding as EL @@ -45,74 +46,44 @@ whenEqProp a b next = if a == b t_utf8 = (E.decodeUtf8 . E.encodeUtf8) `eq` id t_utf8' = (E.decodeUtf8' . E.encodeUtf8) `eq` (id . Right) t_utf8_c = (\ t -> - let E.DecodeResult t' mC bs _ = E.decodeUtf8Chunk $ E.encodeUtf8 t in - (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) -t_utf8_s = (\ t -> - let E.StreamDecodeResult t' mC bs _ _ = E.decodeUtf8Stream $ E.encodeUtf8 t in + let E.DecodeResult t' mC bs _ = E.decodeUtf8Chunks mempty $ E.encodeUtf8 t in (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) tl_utf8 = (EL.decodeUtf8 . EL.encodeUtf8) `eq` id tl_utf8' = (EL.decodeUtf8' . EL.encodeUtf8) `eq` (id . Right) tl_utf8_c = (\ t -> - let E.DecodeResult t' mC bs _ = EL.decodeUtf8Chunk $ EL.encodeUtf8 t in - (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) -tl_utf8_s = (\ t -> - let E.StreamDecodeResult t' mC bs _ _ = EL.decodeUtf8Stream $ EL.encodeUtf8 t in + let E.DecodeResult t' mC bs _ = EL.decodeUtf8Chunks mempty $ EL.encodeUtf8 t in (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) t_utf16LE = (E.decodeUtf16LE . E.encodeUtf16LE) `eq` id t_utf16LE_c = (\ t -> - let E.DecodeResult t' mC bs _ = E.decodeUtf16Chunk False $ E.encodeUtf16LE t in - (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) -t_utf16LE_s = (\ t -> - let E.StreamDecodeResult t' mC bs _ _ = E.decodeUtf16Stream False $ E.encodeUtf16LE t in + let E.DecodeResult t' mC bs _ = E.decodeUtf16Chunks False mempty $ E.encodeUtf16LE t in (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) tl_utf16LE = (EL.decodeUtf16LE . EL.encodeUtf16LE) `eq` id tl_utf16LE_c = (\ t -> - let E.DecodeResult t' mC bs _ = EL.decodeUtf16Chunk False $ EL.encodeUtf16LE t in - (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) -tl_utf16LE_s = (\ t -> - let E.StreamDecodeResult t' mC bs _ _ = EL.decodeUtf16Stream False $ EL.encodeUtf16LE t in + let E.DecodeResult t' mC bs _ = EL.decodeUtf16Chunks False mempty $ EL.encodeUtf16LE t in (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) t_utf16BE = (E.decodeUtf16BE . E.encodeUtf16BE) `eq` id t_utf16BE_c = (\ t -> - let E.DecodeResult t' mC bs _ = E.decodeUtf16Chunk True $ E.encodeUtf16BE t in - (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) -t_utf16BE_s = (\ t -> - let E.StreamDecodeResult t' mC bs _ _ = E.decodeUtf16Stream True $ E.encodeUtf16BE t in + let E.DecodeResult t' mC bs _ = E.decodeUtf16Chunks True mempty $ E.encodeUtf16BE t in (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) tl_utf16BE = (EL.decodeUtf16BE . EL.encodeUtf16BE) `eq` id tl_utf16BE_c = (\ t -> - let E.DecodeResult t' mC bs _ = EL.decodeUtf16Chunk True $ EL.encodeUtf16BE t in - (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) -tl_utf16BE_s = (\ t -> - let E.StreamDecodeResult t' mC bs _ _ = EL.decodeUtf16Stream True $ EL.encodeUtf16BE t in + let E.DecodeResult t' mC bs _ = EL.decodeUtf16Chunks True mempty $ EL.encodeUtf16BE t in (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) t_utf32LE = (E.decodeUtf32LE . E.encodeUtf32LE) `eq` id t_utf32LE_c = (\ t -> - let E.DecodeResult t' mC bs _ = E.decodeUtf32Chunk False $ E.encodeUtf32LE t in - (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) -t_utf32LE_s = (\ t -> - let E.StreamDecodeResult t' mC bs _ _ = E.decodeUtf32Stream False $ E.encodeUtf32LE t in + let E.DecodeResult t' mC bs _ = E.decodeUtf32Chunks False mempty $ E.encodeUtf32LE t in (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) tl_utf32LE = (EL.decodeUtf32LE . EL.encodeUtf32LE) `eq` id tl_utf32LE_c = (\ t -> - let E.DecodeResult t' mC bs _ = EL.decodeUtf32Chunk False $ EL.encodeUtf32LE t in - (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) -tl_utf32LE_s = (\ t -> - let E.StreamDecodeResult t' mC bs _ _ = EL.decodeUtf32Stream False $ EL.encodeUtf32LE t in + let E.DecodeResult t' mC bs _ = EL.decodeUtf32Chunks False mempty $ EL.encodeUtf32LE t in (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) t_utf32BE = (E.decodeUtf32BE . E.encodeUtf32BE) `eq` id t_utf32BE_c = (\ t -> - let E.DecodeResult t' mC bs _ = E.decodeUtf32Chunk True $ E.encodeUtf32BE t in - (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) -t_utf32BE_s = (\ t -> - let E.StreamDecodeResult t' mC bs _ _ = E.decodeUtf32Stream True $ E.encodeUtf32BE t in + let E.DecodeResult t' mC bs _ = E.decodeUtf32Chunks True mempty $ E.encodeUtf32BE t in (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) tl_utf32BE = (EL.decodeUtf32BE . EL.encodeUtf32BE) `eq` id tl_utf32BE_c = (\ t -> - let E.DecodeResult t' mC bs _ = EL.decodeUtf32Chunk True $ EL.encodeUtf32BE t in - (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) -tl_utf32BE_s = (\ t -> - let E.StreamDecodeResult t' mC bs _ _ = EL.decodeUtf32Stream True $ EL.encodeUtf32BE t in + let E.DecodeResult t' mC bs _ = EL.decodeUtf32Chunks True mempty $ EL.encodeUtf32BE t in (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) runBuilder :: B.Builder -> B.ByteString @@ -193,14 +164,7 @@ t_utf8_err bad de = forAll (Blind <$> genDecodeErr de) $ \(Blind onErr) -> ioPro t_utf8_c_err :: InvalidUtf8 -> Property t_utf8_c_err bad = - let E.DecodeResult t mW bs _ = E.decodeUtf8Chunk (toByteString bad) in - case mW of - Just w -> counterexample (show w) True - _ -> counterexample (show t) $ B.length bs > 0 - -t_utf8_s_err :: InvalidUtf8 -> Property -t_utf8_s_err bad = - let E.StreamDecodeResult t mW bs _ _ = E.decodeUtf8Stream (toByteString bad) in + let E.DecodeResult t mW bs _ = E.decodeUtf8Chunks mempty $ toByteString bad in case mW of Just w -> counterexample (show w) True _ -> counterexample (show t) $ B.length bs > 0 @@ -247,125 +211,133 @@ genInvalidUTF8 = B.pack <$> oneof [ ord4_ n = map fromIntegral [(n `shiftR` 18) + 0xF0, ((n `shiftR` 12) .&. 0x3F) + 0x80, ((n `shiftR` 6) .&. 0x3F) + 0x80, (n .&. 0x3F) + 0x80] t_chunk_decode_utf8_1 = - let decodeResult0 = E.decodeUtf8Chunk $ B.pack [0x68, 0x69, 0x2C, 0x20, 0xe2, 0x98, 0x83, 0x21] in - decodeResult0 === (E.DecodeResult "hi, ☃!" Nothing mempty 8) + let decodeResult0 = E.decodeUtf8Chunks mempty $ B.pack [0x68, 0x69, 0x2C, 0x20, 0xe2, 0x98, 0x83, 0x21] in + decodeResult0 === E.DecodeResult "hi, ☃!" Nothing mempty 8 t_chunk_decode_utf8_2 = - let decodeResult0 = E.decodeUtf8Chunk $ B.pack [97, 0xC2, 97] + let decode = E.decodeUtf8Chunks mempty + decodeResult0 = decode $ B.pack [97, 0xC2, 97] expectedBs0 = B.singleton 97 in whenEqProp decodeResult0 (E.DecodeResult (T.singleton 'a') (Just 0xC2) expectedBs0 2) $ - let decodeResult1 = E.decodeUtf8Chunk expectedBs0 in - decodeResult1 === (E.DecodeResult (T.singleton 'a') Nothing mempty 1) + let decodeResult1 = decode expectedBs0 in + decodeResult1 === E.DecodeResult (T.singleton 'a') Nothing mempty 1 t_chunk_decode_utf8_3 = - let decodeResult0 = E.decodeUtf8Chunk $ B.pack [104, 105, 32, 0xe2] + let decode = E.decodeUtf8Chunks + decodeResult0 = decode mempty $ B.pack [104, 105, 32, 0xe2] expectedBs0 = B.singleton 0xe2 in -- hi \xe2 whenEqProp decodeResult0 (E.DecodeResult "hi " Nothing expectedBs0 3) $ - let decodeResult1 = E.decodeUtf8Chunk $ expectedBs0 `B.snoc` 0x98 + let decodeResult1 = decode expectedBs0 $ B.singleton 0x98 expectedBs1 = B.pack [0xe2, 0x98] in whenEqProp decodeResult1 (E.DecodeResult "" Nothing expectedBs1 0) $ - let decodeResult2 = E.decodeUtf8Chunk $ expectedBs1 `mappend` B.pack [0x83, 32, 0xFF] in - decodeResult2 === (E.DecodeResult "☃ " (Just 0xFF) mempty 5) + let decodeResult2 = decode expectedBs1 $ B.pack [0x83, 32, 0xFF] in + decodeResult2 === E.DecodeResult "☃ " (Just 0xFF) mempty 5 t_chunk_decode_utf16BE = - let decode = E.decodeUtf16Chunk True + let decode = E.decodeUtf16Chunks True expectedBs0 = B.pack [0] - decodeResult0 = decode expectedBs0 in + decodeResult0 = decode mempty expectedBs0 in whenEqProp decodeResult0 (E.DecodeResult T.empty Nothing expectedBs0 0) $ - let decodeResult1 = decode $ expectedBs0 `B.append` B.pack [104, 0, 105, 0, 32, 0xD8, 0x01] + let decodeResult1 = decode expectedBs0 $ B.pack [104, 0, 105, 0, 32, 0xD8, 0x01] expectedBs1 = B.pack [0xD8, 0x01] in whenEqProp decodeResult1 (E.DecodeResult "hi " Nothing expectedBs1 6) $ - let decodeResult2 = decode $ expectedBs1 `B.append` B.pack [0xDC, 0x37, 0, 32, 0xDC, 0] in - decodeResult2 === (E.DecodeResult "\x10437 " (Just 0xDC00) mempty 8) + let decodeResult2 = decode expectedBs1 $ B.pack [0xDC, 0x37, 0, 32, 0xDC, 0] in + decodeResult2 === E.DecodeResult "\x10437 " (Just 0xDC00) mempty 8 t_chunk_decode_utf16LE = - let decode = E.decodeUtf16Chunk False + let decode = E.decodeUtf16Chunks False expectedBs0 = B.pack [104] - decodeResult0 = decode expectedBs0 in + decodeResult0 = decode mempty expectedBs0 in whenEqProp decodeResult0 (E.DecodeResult T.empty Nothing expectedBs0 0) $ - let decodeResult1 = decode $ expectedBs0 `B.append` B.pack [0, 105, 0, 32, 0, 0x01, 0xD8] + let decodeResult1 = decode expectedBs0 $ B.pack [0, 105, 0, 32, 0, 0x01, 0xD8] expectedBs1 = B.pack [0x01, 0xD8] in whenEqProp decodeResult1 (E.DecodeResult "hi " Nothing expectedBs1 6) $ - let decodeResult2 = decode $ expectedBs1 `B.append` B.pack [0x37, 0xDC, 32, 0, 0, 0xDC] in - decodeResult2 === (E.DecodeResult "\x10437 " (Just 0xDC) mempty 8) + let decodeResult2 = decode expectedBs1 $ B.pack [0x37, 0xDC, 32, 0, 0, 0xDC] in + decodeResult2 === E.DecodeResult "\x10437 " (Just 0xDC) mempty 8 t_chunk_decode_utf32BE = - let decode = E.decodeUtf32Chunk True + let decode = E.decodeUtf32Chunks True expBs0 = B.pack [0, 0] - decodeResult0 = decode $ B.pack [0, 0, 0, 104, 0, 0, 0, 105, 0, 0] + decodeResult0 = decode mempty $ B.pack [0, 0, 0, 104, 0, 0, 0, 105, 0, 0] in whenEqProp decodeResult0 (E.DecodeResult "hi" Nothing expBs0 8) $ let expBs1 = B.pack [0, 0, 0x26] - decodeResult1 = decode $ expBs0 `mappend` (B.pack [0, 32, 0, 0, 0x26]) + decodeResult1 = decode expBs0 $ B.pack [0, 32, 0, 0, 0x26] in whenEqProp decodeResult1 (E.DecodeResult " " Nothing expBs1 4) $ let expBs2 = mempty - decodeResult2 = decode $ expBs1 `mappend` (B.pack [0x03, 0, 0, 0, 32, 0, 0, 0xD8, 0]) + decodeResult2 = decode expBs1 $ B.pack [0x03, 0, 0, 0, 32, 0, 0, 0xD8, 0] in - decodeResult2 === (E.DecodeResult "☃ " (Just 0xD800) expBs2 12) + decodeResult2 === E.DecodeResult "☃ " (Just 0xD800) expBs2 12 t_chunk_decode_utf32LE = - let decode = E.decodeUtf32Chunk False + let decode = E.decodeUtf32Chunks False expBs0 = B.pack [0x20, 0] - decodeResult0 = decode $ B.pack [104, 0, 0, 0, 105, 0, 0, 0, 0x20, 0] + decodeResult0 = decode mempty $ B.pack [104, 0, 0, 0, 105, 0, 0, 0, 0x20, 0] in whenEqProp decodeResult0 (E.DecodeResult "hi" Nothing expBs0 8) $ let expBs1 = B.pack [0x03, 0x26, 0] - decodeResult1 = decode $ expBs0 `mappend` (B.pack [0, 0, 0x03, 0x26, 0]) + decodeResult1 = decode expBs0 $ B.pack [0, 0, 0x03, 0x26, 0] in whenEqProp decodeResult1 (E.DecodeResult " " Nothing expBs1 4) $ let expBs2 = mempty - decodeResult2 = decode $ expBs1 `mappend` (B.pack [0, 32, 0, 0, 0, 0, 0xD8, 0, 0]) + decodeResult2 = decode expBs1 $ B.pack [0, 32, 0, 0, 0, 0, 0xD8, 0, 0] in - decodeResult2 === (E.DecodeResult "☃ " (Just 0xD80000) expBs2 12) + decodeResult2 === E.DecodeResult "☃ " (Just 0xD80000) expBs2 12 + +decodeUtf8StreamS = Ty.chunksDecoderToStream E.decodeUtf8Chunks t_stream_decode_utf8_1 = - let (E.StreamDecodeResult t0 mW0 bs0 pos0 _) = E.decodeUtf8Stream $ B.pack [0x68, 0x69, 0x2C, 0x20, 0xe2, 0x98, 0x83, 0x21] in + let (Ty.StreamDecodeResult t0 mW0 bs0 pos0 _) = decodeUtf8StreamS $ B.pack [0x68, 0x69, 0x2C, 0x20, 0xe2, 0x98, 0x83, 0x21] in (t0, mW0, bs0, pos0) === ("hi, ☃!", Nothing, mempty, 8) t_stream_decode_utf8_2 = - let (E.StreamDecodeResult t0 mW0 bs0 pos0 f0) = E.decodeUtf8Stream $ B.pack [97, 0xC2, 97] in + let (Ty.StreamDecodeResult t0 mW0 bs0 pos0 f0) = decodeUtf8StreamS $ B.pack [97, 0xC2, 97] in whenEqProp (t0, mW0, bs0, pos0) (T.singleton 'a', Just 0xC2, B.singleton 97, 2) $ - let (E.StreamDecodeResult t1 mW1 bs1 pos1 _) = f0 mempty in + let (Ty.StreamDecodeResult t1 mW1 bs1 pos1 _) = f0 mempty in (t1, mW1, bs1, pos1) === (T.singleton 'a', Nothing, mempty, 3) t_stream_decode_utf8_3 = - let (E.StreamDecodeResult t0 mW0 bs0 pos0 f0) = E.decodeUtf8Stream $ B.pack [104, 105, 32, 0xe2] in + let (Ty.StreamDecodeResult t0 mW0 bs0 pos0 f0) = decodeUtf8StreamS $ B.pack [104, 105, 32, 0xe2] in whenEqProp (t0, mW0, bs0, pos0) ("hi ", Nothing, B.singleton 0xe2, 3) $ - let (E.StreamDecodeResult t1 mW1 bs1 pos1 f1) = f0 $ B.singleton 0x98 in + let (Ty.StreamDecodeResult t1 mW1 bs1 pos1 f1) = f0 $ B.singleton 0x98 in whenEqProp (t1, mW1, bs1, pos1) ("", Nothing, B.pack [0xe2, 0x98], 3) $ - let (E.StreamDecodeResult t2 mW2 bs2 pos2 _) = f1 $ B.pack [0x83, 32, 0xFF] in + let (Ty.StreamDecodeResult t2 mW2 bs2 pos2 _) = f1 $ B.pack [0x83, 32, 0xFF] in (t2, mW2, bs2, pos2) === ("☃ ", Just 0xFF, mempty, 8) +decodeUtf16StreamS = Ty.chunksDecoderToStream . E.decodeUtf16Chunks + t_stream_decode_utf16BE = let expectedBs0 = B.pack [0] - (E.StreamDecodeResult t0 mW0 bs0 pos0 f0) = E.decodeUtf16Stream True expectedBs0 in + (Ty.StreamDecodeResult t0 mW0 bs0 pos0 f0) = decodeUtf16StreamS True expectedBs0 in whenEqProp (t0, mW0, bs0, pos0) (T.empty, Nothing, expectedBs0, 0) $ - let (E.StreamDecodeResult t1 mW1 bs1 pos1 f1) = f0 $ B.pack [104, 0, 105, 0, 32, 0xD8, 0x01] in + let (Ty.StreamDecodeResult t1 mW1 bs1 pos1 f1) = f0 $ B.pack [104, 0, 105, 0, 32, 0xD8, 0x01] in whenEqProp (t1, mW1, bs1, pos1) ("hi ", Nothing, B.pack [0xD8, 0x01], 6) $ - let (E.StreamDecodeResult t2 mW2 bs2 pos2 _) = f1 $ B.pack [0xDC, 0x37, 0, 32, 0xDC, 0] in + let (Ty.StreamDecodeResult t2 mW2 bs2 pos2 _) = f1 $ B.pack [0xDC, 0x37, 0, 32, 0xDC, 0] in (t2, mW2, bs2, pos2) === ("\x10437 ", Just 0xDC00, mempty, 14) t_stream_decode_utf16LE = let expectedBs0 = B.pack [104] - (E.StreamDecodeResult t0 mW0 bs0 pos0 f0) = E.decodeUtf16Stream False expectedBs0 in + (Ty.StreamDecodeResult t0 mW0 bs0 pos0 f0) = decodeUtf16StreamS False expectedBs0 in whenEqProp (t0, mW0, bs0, pos0) (T.empty, Nothing, expectedBs0, 0) $ - let (E.StreamDecodeResult t1 mW1 bs1 pos1 f1) = f0 $ B.pack [0, 105, 0, 32, 0, 0x01, 0xD8] in + let (Ty.StreamDecodeResult t1 mW1 bs1 pos1 f1) = f0 $ B.pack [0, 105, 0, 32, 0, 0x01, 0xD8] in whenEqProp (t1, mW1, bs1, pos1) ("hi ", Nothing, B.pack [0x01, 0xD8], 6) $ - let (E.StreamDecodeResult t2 mW2 bs2 pos2 _) = f1 $ B.pack [0x37, 0xDC, 32, 0, 0, 0xDC] in + let (Ty.StreamDecodeResult t2 mW2 bs2 pos2 _) = f1 $ B.pack [0x37, 0xDC, 32, 0, 0, 0xDC] in (t2, mW2, bs2, pos2) === ("\x10437 ", Just 0xDC, mempty, 14) +decodeUtf32StreamS = Ty.chunksDecoderToStream . E.decodeUtf32Chunks + t_stream_decode_utf32BE = - let (E.StreamDecodeResult t0 mW0 bs0 pos0 f0) = E.decodeUtf32Stream True $ B.pack [0, 0, 0, 104, 0, 0, 0, 105, 0, 0] in + let (Ty.StreamDecodeResult t0 mW0 bs0 pos0 f0) = decodeUtf32StreamS True $ B.pack [0, 0, 0, 104, 0, 0, 0, 105, 0, 0] in whenEqProp (t0, mW0, bs0, pos0) ("hi", Nothing, B.pack [0, 0], 8) $ - let (E.StreamDecodeResult t1 mW1 bs1 pos1 f1) = f0 $ B.pack [0, 32, 0, 0, 0x26] in + let (Ty.StreamDecodeResult t1 mW1 bs1 pos1 f1) = f0 $ B.pack [0, 32, 0, 0, 0x26] in whenEqProp (t1, mW1, bs1, pos1) (" ", Nothing, B.pack [0, 0, 0x26], 12) $ - let (E.StreamDecodeResult t2 mW2 bs2 pos2 _) = f1 $ B.pack [0x03, 0, 0, 0, 32, 0, 0, 0xD8, 0] in + let (Ty.StreamDecodeResult t2 mW2 bs2 pos2 _) = f1 $ B.pack [0x03, 0, 0, 0, 32, 0, 0, 0xD8, 0] in (t2, mW2, bs2, pos2) === ("☃ ", Just 0xD800, mempty, 24) t_stream_decode_utf32LE = - let (E.StreamDecodeResult t0 mW0 bs0 pos0 f0) = E.decodeUtf32Stream False $ B.pack [104, 0, 0, 0, 105, 0, 0, 0, 0x20, 0] in + let (Ty.StreamDecodeResult t0 mW0 bs0 pos0 f0) = decodeUtf32StreamS False $ B.pack [104, 0, 0, 0, 105, 0, 0, 0, 0x20, 0] in whenEqProp (t0, mW0, bs0, pos0) ("hi", Nothing, B.pack [0x20, 0], 8) $ - let (E.StreamDecodeResult t1 mW1 bs1 pos1 f1) = f0 $ B.pack [0, 0, 0x03, 0x26, 0] in + let (Ty.StreamDecodeResult t1 mW1 bs1 pos1 f1) = f0 $ B.pack [0, 0, 0x03, 0x26, 0] in whenEqProp (t1, mW1, bs1, pos1) (" ", Nothing, B.pack [0x03, 0x26, 0], 12) $ - let (E.StreamDecodeResult t2 mW2 bs2 pos2 _) = f1 $ B.pack [0, 32, 0, 0, 0, 0, 0xD8, 0, 0] in + let (Ty.StreamDecodeResult t2 mW2 bs2 pos2 _) = f1 $ B.pack [0, 32, 0, 0, 0, 0, 0xD8, 0, 0] in (t2, mW2, bs2, pos2) === ("☃ ", Just 0xD80000, mempty, 24) decodeLL :: BL.ByteString -> TL.Text @@ -383,7 +355,7 @@ t_decode_utf8_lenient = forAllShrinkShow arbitrary shrink (show . BL.toChunks) $ decodeStream decoder snoc bs = g mempty $ decoder bs where - g t0 (E.StreamDecodeResult t mW bs' pos f) = + g t0 (Ty.StreamDecodeResult t mW bs' pos f) = let t0' = t0 `mappend` t in case mW of Just _ -> let mC = E.lenientDecode "" mW in @@ -393,12 +365,14 @@ decodeStream decoder snoc bs = in g t' $ f mempty _ -> (t0', bs', pos) +decodeUtf8StreamL = Ty.chunksDecoderToStream EL.decodeUtf8Chunks + -- The decoding of lazy bytestrings should not depend on how they are chunked, -- and it should behave the same as decoding of strict bytestrings. t_decode_utf8_stream :: Property t_decode_utf8_stream = forAllShrinkShow arbitrary shrink (show . BL.toChunks) $ \bs -> - decodeStream EL.decodeUtf8Stream TL.snoc bs === - ( let (st, sb, pos) = decodeStream E.decodeUtf8Stream T.snoc . B.concat $ BL.toChunks bs in + decodeStream decodeUtf8StreamL TL.snoc bs === + ( let (st, sb, pos) = decodeStream decodeUtf8StreamS T.snoc . B.concat $ BL.toChunks bs in (TL.fromStrict st, BL.fromStrict sb, pos) ) @@ -446,37 +420,27 @@ testTranscoding = testProperty "t_utf8" t_utf8, testProperty "t_utf8'" t_utf8', testProperty "t_utf8_c" t_utf8_c, - testProperty "t_utf8_s" t_utf8_s, testProperty "t_utf8_incr" t_utf8_incr, testProperty "t_utf8_undecoded" t_utf8_undecoded, testProperty "tl_utf8" tl_utf8, testProperty "tl_utf8'" tl_utf8', testProperty "tl_utf8_c" tl_utf8_c, - testProperty "tl_utf8_s" tl_utf8_s, testProperty "t_utf16LE" t_utf16LE, testProperty "t_utf16LE_c" t_utf16LE_c, - testProperty "t_utf16LE_s" t_utf16LE_s, testProperty "tl_utf16LE" tl_utf16LE, testProperty "tl_utf16LE_c" tl_utf16LE_c, - testProperty "tl_utf16LE_s" tl_utf16LE_s, testProperty "t_utf16BE" t_utf16BE, testProperty "t_utf16BE_c" t_utf16BE_c, - testProperty "t_utf16BE_s" t_utf16BE_s, testProperty "tl_utf16BE" tl_utf16BE, testProperty "tl_utf16BE_c" tl_utf16BE_c, - testProperty "tl_utf16BE_s" tl_utf16BE_s, testProperty "t_utf32LE" t_utf32LE, testProperty "t_utf32LE_c" t_utf32LE_c, - testProperty "t_utf32LE_s" t_utf32LE_s, testProperty "tl_utf32LE" tl_utf32LE, testProperty "tl_utf32LE_c" tl_utf32LE_c, - testProperty "tl_utf32LE_s" tl_utf32LE_s, testProperty "t_utf32BE" t_utf32BE, testProperty "t_utf32BE_c" t_utf32BE_c, - testProperty "t_utf32BE_s" t_utf32BE_s, testProperty "tl_utf32BE" tl_utf32BE, testProperty "tl_utf32BE_c" tl_utf32BE_c, - testProperty "tl_utf32BE_s" tl_utf32BE_s, testGroup "builder" [ testProperty "t_encodeUtf8Builder" t_encodeUtf8Builder, testProperty "t_encodeUtf8Builder_nonZeroOffset" t_encodeUtf8Builder_nonZeroOffset, @@ -487,7 +451,6 @@ testTranscoding = testGroup "errors" [ testProperty "t_utf8_err" t_utf8_err, testProperty "t_utf8_c_err" t_utf8_c_err, - testProperty "t_utf8_s_err" t_utf8_s_err, testProperty "t_utf8_err'" t_utf8_err' ], testGroup "error recovery" [ From ff571c092badbbf13249fdbee7c1cbd703f2b8fe Mon Sep 17 00:00:00 2001 From: david-sledge Date: Tue, 12 Jul 2022 21:58:34 -0600 Subject: [PATCH 18/87] Added missing lazy decodeAsciiE function. Updated documentation including the change log. --- src/Data/Text/Encoding.hs | 25 +++++++++++++-- src/Data/Text/Encoding/Types.hs | 10 +++++- src/Data/Text/Lazy/Encoding.hs | 46 +++++++++++++++++++++------ tests/Tests/Properties/Transcoding.hs | 3 ++ 4 files changed, 71 insertions(+), 13 deletions(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index 41752922..ccc5e935 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -352,7 +352,12 @@ queryUtf8DecodeOptimization (B.length -> len1) bs2@(B.length -> len2) srcOff -- | Decode two 'ByteString's containing UTF-8-encoded text as though -- they were one continuous 'ByteString' returning a 'DecodeResult'. -decodeUtf8Chunks :: ByteString -> ByteString -> DecodeResult Text ByteString Word8 +-- +-- @since 2.0.0.1 +decodeUtf8Chunks + :: ByteString -- ^ The first `ByteString` chunk to decode. Typically this is the unencoded data from the previous call of this function. + -> ByteString -- ^ The second `ByteString` chunk to decode. + -> DecodeResult Text ByteString Word8 decodeUtf8Chunks = decodeChunksProxy queryUtf8DecodeOptimization $ \ index len srcOff -> let decodeFrom off = step (off + 1) . utf8DecodeStart $ index off @@ -370,7 +375,13 @@ noOptimization _ _ _ = Nothing -- | Decode two 'ByteString's containing UTF-16-encoded text as though -- they were one continuous 'ByteString' returning a 'DecodeResult'. -decodeUtf16Chunks :: Bool -> ByteString -> ByteString -> DecodeResult Text ByteString Word16 +-- +-- @since 2.0.0.1 +decodeUtf16Chunks + :: Bool -- ^ Indicates whether the encoding is big-endian (`True`) or little-endian (`False`) + -> ByteString -- ^ The first `ByteString` chunk to decode. Typically this is the unencoded data from the previous call of this function. + -> ByteString -- ^ The second `ByteString` chunk to decode. + -> DecodeResult Text ByteString Word16 decodeUtf16Chunks isBE = decodeChunksProxy noOptimization $ \ index len srcOff -> -- get next Word8 pair let writeAndAdvance c n = WriteAndAdvance c $ const n @@ -394,7 +405,13 @@ decodeUtf16Chunks isBE = decodeChunksProxy noOptimization $ \ index len srcOff - -- | Decode two 'ByteString's containing UTF-16-encoded text as though -- they were one continuous 'ByteString' returning a 'DecodeResult'. -decodeUtf32Chunks :: Bool -> ByteString -> ByteString -> DecodeResult Text ByteString Word32 +-- +-- @since 2.0.0.1 +decodeUtf32Chunks + :: Bool -- ^ Indicates whether the encoding is big-endian (`True`) or little-endian (`False`) + -> ByteString -- ^ The first `ByteString` chunk to decode. Typically this is the unencoded data from the previous call of this function. + -> ByteString -- ^ The second `ByteString` chunk to decode. + -> DecodeResult Text ByteString Word32 decodeUtf32Chunks isBE = decodeChunksProxy noOptimization $ \ index _ srcOff -> -- get next Word8 quartet case (queryUtf32Bytes . index $ if isBE then srcOff else srcOff + 3) @@ -408,6 +425,8 @@ decodeUtf32Chunks isBE = decodeChunksProxy noOptimization $ \ index _ srcOff -> -- This is a total function: On success the decoded 'Text' is within a -- 'Right' value, and an error ('Left' 'Int') indicates the postion of -- the offending 'Word8'. +-- +-- @since 2.0.0.1 decodeAsciiE :: ByteString -> Either Int Text decodeAsciiE bs = withBS bs $ \fp len -> if len == 0 then Right empty else runST $ do asciiPrefixLen <- fmap cSizeToInt $ unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> diff --git a/src/Data/Text/Encoding/Types.hs b/src/Data/Text/Encoding/Types.hs index 1f806be9..20113b3d 100644 --- a/src/Data/Text/Encoding/Types.hs +++ b/src/Data/Text/Encoding/Types.hs @@ -17,6 +17,8 @@ module Data.Text.Encoding.Types where -- | A decoding result on encoded data. +-- +-- @since 2.0.0.1 data DecodeResult t b w = DecodeResult !t -- ^ The decoded data up to an incomplete code point at -- the end of the input data, an invalid word, or to the @@ -28,6 +30,8 @@ data DecodeResult t b w = DecodeResult deriving (Eq, Ord, Show, Read) -- | A decoding result on encoded data. +-- +-- @since 2.0.0.1 data StreamDecodeResult t b w = StreamDecodeResult !t -- ^ The decoded data up to an incomplete code point at -- the end of the input data, an invalid word, or to the @@ -46,7 +50,11 @@ data StreamDecodeResult t b w = StreamDecodeResult -- stream decoder will return a 'StreamDecodeResult' which contains a -- continuation function that accepts another section of unencoded -- data as a continuation of any remaining unencoded data. -chunksDecoderToStream :: Monoid b => (b -> b -> DecodeResult t b w) -> b -> StreamDecodeResult t b w +-- +-- @since 2.0.0.1 +chunksDecoderToStream :: Monoid b + => (b -> b -> DecodeResult t b w) -- ^ Chunk decoder function + -> b -> StreamDecodeResult t b w chunksDecoderToStream chunksDecoder = g 0 mempty where g pos bs0 bs1 = diff --git a/src/Data/Text/Lazy/Encoding.hs b/src/Data/Text/Lazy/Encoding.hs index 1e1360fa..c598280f 100644 --- a/src/Data/Text/Lazy/Encoding.hs +++ b/src/Data/Text/Lazy/Encoding.hs @@ -23,6 +23,7 @@ module Data.Text.Lazy.Encoding -- ** Total Functions #total# -- $total decodeLatin1 + , decodeAsciiE , DecodeResult(..) , decodeUtf8Chunks , decodeUtf16Chunks @@ -109,6 +110,16 @@ decodeASCII = foldr (chunk . TE.decodeASCII) empty . B.toChunks decodeLatin1 :: B.ByteString -> Text decodeLatin1 = foldr (chunk . TE.decodeLatin1) empty . B.toChunks +-- | Decode a 'ByteString' containing 7-bit ASCII encoded text. +-- +-- This is a total function: On success the decoded 'Text' is within a +-- 'Right' value, and an error ('Left' 'Int') indicates the postion of +-- the offending 'Word8'. +-- +-- @since 2.0.0.1 +decodeAsciiE :: B.ByteString -> Either Int Text +decodeAsciiE = foldr (\ lb -> (chunk <$> TE.decodeAsciiE lb <*>)) (pure empty) . B.toChunks + decodeChunks :: (S.ByteString -> S.ByteString -> DecodeResult T.Text S.ByteString w) -> B.ByteString -> B.ByteString @@ -125,19 +136,36 @@ decodeChunks decoder = g id 0 mempty g tDiff pos sb0 _ (B.Chunk sb1 lb1) = g tDiff pos sb0 (B.Chunk sb1 lb1) mempty g tDiff pos sb0 _ _ = DecodeResult (tDiff Empty) Nothing (B.chunk sb0 mempty) pos --- | Decode two 'ByteString's containing UTF-8-encoded text as though --- they were one continuous 'ByteString' returning a 'DecodeResult'. -decodeUtf8Chunks :: B.ByteString -> B.ByteString -> DecodeResult Text B.ByteString Word8 +-- | Decode two 'B.ByteString's containing UTF-8-encoded text as though +-- they were one continuous 'B.ByteString' returning a 'DecodeResult'. +-- +-- @since 2.0.0.1 +decodeUtf8Chunks + :: B.ByteString -- ^ The first `B.ByteString` chunk to decode. Typically this is the unencoded data from the previous call of this function. + -> B.ByteString -- ^ The second `B.ByteString` chunk to decode. + -> DecodeResult Text B.ByteString Word8 decodeUtf8Chunks = decodeChunks TE.decodeUtf8Chunks --- | Decode two 'ByteString's containing UTF-16-encoded text as though --- they were one continuous 'ByteString' returning a 'DecodeResult'. -decodeUtf16Chunks :: Bool -> B.ByteString -> B.ByteString -> DecodeResult Text B.ByteString Word16 +-- | Decode two 'B.ByteString's containing UTF-16-encoded text as though +-- they were one continuous 'B.ByteString' returning a 'DecodeResult'. +-- +-- @since 2.0.0.1 +decodeUtf16Chunks + :: Bool -- ^ Indicates whether the encoding is big-endian (`True`) or little-endian (`False`) + -> B.ByteString -- ^ The first `B.ByteString` chunk to decode. Typically this is the unencoded data from the previous call of this function. + -> B.ByteString -- ^ The second `B.ByteString` chunk to decode. + -> DecodeResult Text B.ByteString Word16 decodeUtf16Chunks = decodeChunks . TE.decodeUtf16Chunks --- | Decode two 'ByteString's containing UTF-32-encoded text as though --- they were one continuous 'ByteString' returning a 'DecodeResult'. -decodeUtf32Chunks :: Bool -> B.ByteString -> B.ByteString -> DecodeResult Text B.ByteString Word32 +-- | Decode two 'B.ByteString's containing UTF-32-encoded text as though +-- they were one continuous 'B.ByteString' returning a 'DecodeResult'. +-- +-- @since 2.0.0.1 +decodeUtf32Chunks + :: Bool -- ^ Indicates whether the encoding is big-endian (`True`) or little-endian (`False`) + -> B.ByteString -- ^ The first `B.ByteString` chunk to decode. Typically this is the unencoded data from the previous call of this function. + -> B.ByteString -- ^ The second `B.ByteString` chunk to decode. + -> DecodeResult Text B.ByteString Word32 decodeUtf32Chunks = decodeChunks . TE.decodeUtf32Chunks -- | Decode a 'ByteString' containing UTF-8 encoded text. diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs index bc1c36e4..c71fef67 100644 --- a/tests/Tests/Properties/Transcoding.hs +++ b/tests/Tests/Properties/Transcoding.hs @@ -30,6 +30,8 @@ import qualified Data.Text.Lazy.Encoding as EL t_asciiE t = E.decodeAsciiE (E.encodeUtf8 a) === Right a where a = T.map (\c -> chr (ord c `mod` 128)) t +tl_asciiE t = EL.decodeAsciiE (EL.encodeUtf8 a) === Right a + where a = TL.map (\c -> chr (ord c `mod` 128)) t t_ascii t = E.decodeASCII (E.encodeUtf8 a) === a where a = T.map (\c -> chr (ord c `mod` 128)) t @@ -413,6 +415,7 @@ testTranscoding :: TestTree testTranscoding = testGroup "transcoding" [ testProperty "t_asciiE" t_asciiE, + testProperty "tl_asciiE" tl_asciiE, testProperty "t_ascii" t_ascii, testProperty "tl_ascii" tl_ascii, testProperty "t_latin1" t_latin1, From 7d0757afaec4ec694e877687b48a6e95b6c1e86b Mon Sep 17 00:00:00 2001 From: david-sledge Date: Sat, 16 Jul 2022 17:55:27 -0600 Subject: [PATCH 19/87] created module Data.Text.Encoding.Common and moved the contents Types and Error to it. Redesigned chunksDecoderToStream --- benchmarks/haskell/Benchmarks/Stream.hs | 2 +- src/Data/Text.hs | 18 +- src/Data/Text/Encoding.hs | 3 +- src/Data/Text/Encoding/Common.hs | 177 ++++++++++++++++++ src/Data/Text/Encoding/Error.hs | 97 ++-------- src/Data/Text/Encoding/Types.hs | 64 ------- src/Data/Text/Internal/Encoding/Fusion.hs | 2 +- .../Text/Internal/Lazy/Encoding/Fusion.hs | 2 +- src/Data/Text/Lazy.hs | 18 +- src/Data/Text/Lazy/Encoding.hs | 3 +- tests/Tests/Properties/Transcoding.hs | 128 +++++-------- tests/Tests/QuickCheckUtils.hs | 2 +- tests/Tests/Regressions.hs | 2 +- text.cabal | 3 +- 14 files changed, 249 insertions(+), 272 deletions(-) create mode 100644 src/Data/Text/Encoding/Common.hs delete mode 100644 src/Data/Text/Encoding/Types.hs diff --git a/benchmarks/haskell/Benchmarks/Stream.hs b/benchmarks/haskell/Benchmarks/Stream.hs index 1d53c498..539b709c 100644 --- a/benchmarks/haskell/Benchmarks/Stream.hs +++ b/benchmarks/haskell/Benchmarks/Stream.hs @@ -21,7 +21,7 @@ import qualified Data.Text.Lazy as TL import qualified Data.ByteString.Lazy as BL import Data.Text.Internal.Fusion.Types (Step (..), Stream (..)) import qualified Data.Text.Encoding as T -import qualified Data.Text.Encoding.Error as E +import qualified Data.Text.Encoding.Common as E import qualified Data.Text.Internal.Encoding.Fusion as T import qualified Data.Text.Internal.Encoding.Fusion.Common as F import qualified Data.Text.Internal.Fusion as T diff --git a/src/Data/Text.hs b/src/Data/Text.hs index 5176a0ab..c1eda582 100644 --- a/src/Data/Text.hs +++ b/src/Data/Text.hs @@ -355,21 +355,10 @@ instance Monoid Text where mappend = (<>) mconcat = concat --- | Performs replacement on invalid scalar values: --- --- >>> :set -XOverloadedStrings --- >>> "\55555" :: Text --- "\65533" instance IsString Text where fromString = pack --- | Performs replacement on invalid scalar values: --- --- >>> :set -XOverloadedLists --- >>> ['\55555'] :: Text --- "\65533" --- --- @since 1.2.0.0 +-- | @since 1.2.0.0 instance Exts.IsList Text where type Item Text = Char fromList = pack @@ -456,10 +445,7 @@ compareText (Text arrA offA lenA) (Text arrB offB lenB) = -- * Conversion to/from 'Text' -- | /O(n)/ Convert a 'String' into a 'Text'. --- Performs replacement on invalid scalar values, so @'unpack' . 'pack'@ is not 'id': --- --- >>> unpack (pack "\55555") --- "\65533" +-- Performs replacement on invalid scalar values. pack :: String -> Text pack = unstream . S.map safe . S.streamList {-# INLINE [1] pack #-} diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index ccc5e935..b8e9ffb1 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -82,8 +82,7 @@ import Data.ByteString (ByteString) import qualified Data.ByteString as B import qualified Data.ByteString.Internal as B import qualified Data.ByteString.Short.Internal as SBS -import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode) -import Data.Text.Encoding.Types (DecodeResult(..)) +import Data.Text.Encoding.Common (DecodeResult(..), OnDecodeError, UnicodeException, strictDecode, lenientDecode) import Data.Text.Internal (Text(..), empty, append) import Data.Text.Internal.Unsafe (unsafeWithForeignPtr) import Data.Text.Internal.Unsafe.Char (unsafeWrite) diff --git a/src/Data/Text/Encoding/Common.hs b/src/Data/Text/Encoding/Common.hs new file mode 100644 index 00000000..3212e315 --- /dev/null +++ b/src/Data/Text/Encoding/Common.hs @@ -0,0 +1,177 @@ +{-# LANGUAGE CPP, DeriveDataTypeable #-} +{-# LANGUAGE Safe #-} +-- | +-- Module : Data.Text.Encoding.Common +-- Copyright : (c) Bryan O'Sullivan 2009 +-- +-- License : BSD-style +-- Maintainer : bos@serpentine.com +-- Portability : GHC +-- +-- Common functions and types for both lazy and strict encoding and +-- decoding including error handling. +-- +-- The standard functions for encoding and decoding text are strict, +-- which is to say that they throw exceptions on invalid input. This +-- is often unhelpful on real world input, so alternative functions +-- exist that accept custom handlers for dealing with invalid inputs. +-- These 'OnError' handlers are normal Haskell functions. You can use +-- one of the presupplied functions in this module, or you can write a +-- custom handler of your own. + +module Data.Text.Encoding.Common + ( + -- * Full-service result types and functions + DecodeResult(..) + , DecodeResultHandler + , chunksDecoderToStream + -- * Error handling types + , UnicodeException(..) + , OnError + , OnDecodeError + , OnEncodeError + -- * Useful error handling functions + , lenientDecode + , strictDecode + , strictEncode + , ignore + , replace + ) + where + +import Control.DeepSeq (NFData (..)) +import Control.Exception (Exception, throw) +import Data.Typeable (Typeable) +import Data.Word (Word8) +import Numeric (showHex) + +-- | A decoding result on encoded data. +-- +-- @since 2.0.0.1 +data DecodeResult t b w = DecodeResult + !t -- ^ The decoded data up to an incomplete code point at + -- the end of the input data, an invalid word, or to the + -- end of the input. + !(Maybe w) -- ^ If an invalid code point was encountered. + !b -- ^ The remaining undecoded data. If an invald code + -- point was encountered, this is after that code point. + !Int -- ^ Byte position of remaining undecoded data. + deriving (Eq, Ord, Show, Read) + +-- | The funtion type for handling successful decodings. +-- +-- @since 2.0.0.1 +type DecodeResultHandler t w b b' r + = t -- ^ The decoded data up to an incomplete code point at + -- the end of the input data, an invalid word, or to + -- the end of the input. + -> Maybe w -- ^ If an invalid code point was encountered. + -> b -- ^ The remaining undecoded data. If an invald code + -- point was encountered, this is after that code + -- point. + -> Int -- ^ Byte position of remaining undecoded data. This + -- is treated as if all the data fed to previous + -- invocations of the continations were one continuous + -- feed. + -> (b' -> r) -- ^ Continuation to accept the next span of data to be + -- decoded with the remaining unencoded data. + -> r -- ^ Result of the continuation. + +-- | Create a stream decoder from a chunks decoder. The resulting +-- stream decoder accepts a 'DecodeResultHandler' to process the decode +-- result. The continuations accept another section of unencoded data +-- as a continuation of any remaining unencoded data. +-- +-- @since 2.0.0.1 +chunksDecoderToStream :: Monoid b + => (b -> b' -> DecodeResult t b w) -- ^ Chunks decoder + -> b' -- ^ Encoded data + -> DecodeResultHandler t w b b' r -- ^ Result continuation + -> r -- ^ Continuation result +chunksDecoderToStream chunksDecoder bs f = + g mempty bs 0 + where + g bs0 bs1 pos0 = + let DecodeResult t mW bs1' pos1 = chunksDecoder bs0 bs1 + pos = pos0 + pos1 + in + f t mW bs1' pos $ \ bs2 -> g bs1' bs2 pos + +-- | Function type for handling a coding error. It is supplied with +-- two inputs: +-- +-- * A 'String' that describes the error. +-- +-- * The input value that caused the error. If the error arose +-- because the end of input was reached or could not be identified +-- precisely, this value will be 'Nothing'. +-- +-- If the handler returns a value wrapped with 'Just', that value will +-- be used in the output as the replacement for the invalid input. If +-- it returns 'Nothing', no value will be used in the output. +-- +-- Should the handler need to abort processing, it should use 'error' +-- or 'throw' an exception (preferably a 'UnicodeException'). It may +-- use the description provided to construct a more helpful error +-- report. +type OnError a b = String -> Maybe a -> Maybe b + +-- | A handler for a decoding error. +type OnDecodeError = OnError Word8 Char + +-- | A handler for an encoding error. +{-# DEPRECATED OnEncodeError "This exception is never used in practice, and will be removed." #-} +type OnEncodeError = OnError Char Word8 + +-- | An exception type for representing Unicode encoding errors. +data UnicodeException = + DecodeError String (Maybe Word8) + -- ^ Could not decode a byte sequence because it was invalid under + -- the given encoding, or ran out of input in mid-decode. + | EncodeError String (Maybe Char) + -- ^ Tried to encode a character that could not be represented + -- under the given encoding, or ran out of input in mid-encode. + deriving (Eq, Typeable) + +{-# DEPRECATED EncodeError "This constructor is never used, and will be removed." #-} + +showUnicodeException :: UnicodeException -> String +showUnicodeException (DecodeError desc (Just w)) + = "Cannot decode byte '\\x" ++ showHex w ("': " ++ desc) +showUnicodeException (DecodeError desc Nothing) + = "Cannot decode input: " ++ desc +showUnicodeException (EncodeError desc (Just c)) + = "Cannot encode character '\\x" ++ showHex (fromEnum c) ("': " ++ desc) +showUnicodeException (EncodeError desc Nothing) + = "Cannot encode input: " ++ desc + +instance Show UnicodeException where + show = showUnicodeException + +instance Exception UnicodeException + +instance NFData UnicodeException where + rnf (DecodeError desc w) = rnf desc `seq` rnf w `seq` () + rnf (EncodeError desc c) = rnf desc `seq` rnf c `seq` () + +-- | Throw a 'UnicodeException' if decoding fails. +strictDecode :: OnDecodeError +strictDecode desc c = throw (DecodeError desc c) + +-- | Replace an invalid input byte with the Unicode replacement +-- character U+FFFD. +lenientDecode :: OnDecodeError +lenientDecode _ _ = Just '\xfffd' + +-- | Throw a 'UnicodeException' if encoding fails. +{-# DEPRECATED strictEncode "This function always throws an exception, and will be removed." #-} +strictEncode :: OnEncodeError +strictEncode desc c = throw (EncodeError desc c) + +-- | Ignore an invalid input, substituting nothing in the output. +ignore :: OnError a b +ignore _ _ = Nothing + +-- | Replace an invalid input with a valid output. +replace :: b -> OnError a b +replace c _ _ = Just c diff --git a/src/Data/Text/Encoding/Error.hs b/src/Data/Text/Encoding/Error.hs index ea9e0997..569fafcf 100644 --- a/src/Data/Text/Encoding/Error.hs +++ b/src/Data/Text/Encoding/Error.hs @@ -1,4 +1,3 @@ -{-# LANGUAGE CPP, DeriveDataTypeable #-} {-# LANGUAGE Safe #-} -- | -- Module : Data.Text.Encoding.Error @@ -34,87 +33,15 @@ module Data.Text.Encoding.Error , replace ) where -import Control.DeepSeq (NFData (..)) -import Control.Exception (Exception, throw) -import Data.Typeable (Typeable) -import Data.Word (Word8) -import Numeric (showHex) - --- | Function type for handling a coding error. It is supplied with --- two inputs: --- --- * A 'String' that describes the error. --- --- * The input value that caused the error. If the error arose --- because the end of input was reached or could not be identified --- precisely, this value will be 'Nothing'. --- --- If the handler returns a value wrapped with 'Just', that value will --- be used in the output as the replacement for the invalid input. If --- it returns 'Nothing', no value will be used in the output. --- --- Should the handler need to abort processing, it should use 'error' --- or 'throw' an exception (preferably a 'UnicodeException'). It may --- use the description provided to construct a more helpful error --- report. -type OnError a b = String -> Maybe a -> Maybe b - --- | A handler for a decoding error. -type OnDecodeError = OnError Word8 Char - --- | A handler for an encoding error. -{-# DEPRECATED OnEncodeError "This exception is never used in practice, and will be removed." #-} -type OnEncodeError = OnError Char Word8 - --- | An exception type for representing Unicode encoding errors. -data UnicodeException = - DecodeError String (Maybe Word8) - -- ^ Could not decode a byte sequence because it was invalid under - -- the given encoding, or ran out of input in mid-decode. - | EncodeError String (Maybe Char) - -- ^ Tried to encode a character that could not be represented - -- under the given encoding, or ran out of input in mid-encode. - deriving (Eq, Typeable) - -{-# DEPRECATED EncodeError "This constructor is never used, and will be removed." #-} - -showUnicodeException :: UnicodeException -> String -showUnicodeException (DecodeError desc (Just w)) - = "Cannot decode byte '\\x" ++ showHex w ("': " ++ desc) -showUnicodeException (DecodeError desc Nothing) - = "Cannot decode input: " ++ desc -showUnicodeException (EncodeError desc (Just c)) - = "Cannot encode character '\\x" ++ showHex (fromEnum c) ("': " ++ desc) -showUnicodeException (EncodeError desc Nothing) - = "Cannot encode input: " ++ desc - -instance Show UnicodeException where - show = showUnicodeException - -instance Exception UnicodeException - -instance NFData UnicodeException where - rnf (DecodeError desc w) = rnf desc `seq` rnf w `seq` () - rnf (EncodeError desc c) = rnf desc `seq` rnf c `seq` () - --- | Throw a 'UnicodeException' if decoding fails. -strictDecode :: OnDecodeError -strictDecode desc c = throw (DecodeError desc c) - --- | Replace an invalid input byte with the Unicode replacement --- character U+FFFD. -lenientDecode :: OnDecodeError -lenientDecode _ _ = Just '\xfffd' - --- | Throw a 'UnicodeException' if encoding fails. -{-# DEPRECATED strictEncode "This function always throws an exception, and will be removed." #-} -strictEncode :: OnEncodeError -strictEncode desc c = throw (EncodeError desc c) - --- | Ignore an invalid input, substituting nothing in the output. -ignore :: OnError a b -ignore _ _ = Nothing - --- | Replace an invalid input with a valid output. -replace :: b -> OnError a b -replace c _ _ = Just c +import Data.Text.Encoding.Common + ( UnicodeException(..) + , OnError + , OnDecodeError + , OnEncodeError + -- * Useful error handling functions + , lenientDecode + , strictDecode + , strictEncode + , ignore + , replace + ) diff --git a/src/Data/Text/Encoding/Types.hs b/src/Data/Text/Encoding/Types.hs deleted file mode 100644 index 20113b3d..00000000 --- a/src/Data/Text/Encoding/Types.hs +++ /dev/null @@ -1,64 +0,0 @@ -{-# LANGUAGE Safe #-} --- | --- Module : Data.Text.Encoding.Types --- Copyright : (c) Bryan O'Sullivan 2009 --- --- License : BSD-style --- Maintainer : bos@serpentine.com --- Portability : GHC --- --- Types to indicate the result of an attempt to decode data. - -module Data.Text.Encoding.Types - ( DecodeResult(..) - , StreamDecodeResult(..) - , chunksDecoderToStream - ) - where - --- | A decoding result on encoded data. --- --- @since 2.0.0.1 -data DecodeResult t b w = DecodeResult - !t -- ^ The decoded data up to an incomplete code point at - -- the end of the input data, an invalid word, or to the - -- end of the input. - !(Maybe w) -- ^ If an invalid code point was encountered. - !b -- ^ The remaining undecoded data. If an invald code - -- point was encountered, this is after that code point. - !Int -- ^ Byte position of remaining undecoded data. - deriving (Eq, Ord, Show, Read) - --- | A decoding result on encoded data. --- --- @since 2.0.0.1 -data StreamDecodeResult t b w = StreamDecodeResult - !t -- ^ The decoded data up to an incomplete code point at - -- the end of the input data, an invalid word, or to the - -- end of the input. - !(Maybe w) -- ^ If an invalid code point was encountered. - !b -- ^ The remaining undecoded data. If an invald code - -- point was encountered, this is after that code point. - !Int -- ^ Byte position of remaining undecoded data. This is - -- treated as if all the data fed to previous invocations - -- of the continations where one continuous feed. - (b -> StreamDecodeResult t b w) -- ^ Continuation to accept the next - -- span of data to be decoded with - -- the remaining unencoded data. - --- | Create a stream decoder from a chunks decoder. The resulting --- stream decoder will return a 'StreamDecodeResult' which contains a --- continuation function that accepts another section of unencoded --- data as a continuation of any remaining unencoded data. --- --- @since 2.0.0.1 -chunksDecoderToStream :: Monoid b - => (b -> b -> DecodeResult t b w) -- ^ Chunk decoder function - -> b -> StreamDecodeResult t b w -chunksDecoderToStream chunksDecoder = g 0 mempty - where - g pos bs0 bs1 = - let DecodeResult t mW bs1' pos1 = chunksDecoder bs0 bs1 - pos' = pos + pos1 - in - StreamDecodeResult t mW bs1' pos' $ \ bs2 -> g pos' bs1' bs2 diff --git a/src/Data/Text/Internal/Encoding/Fusion.hs b/src/Data/Text/Internal/Encoding/Fusion.hs index aa8f0d02..aa3019ab 100644 --- a/src/Data/Text/Internal/Encoding/Fusion.hs +++ b/src/Data/Text/Internal/Encoding/Fusion.hs @@ -41,7 +41,7 @@ import Data.Bits (shiftL, shiftR) import Data.ByteString.Internal (ByteString(..), mallocByteString, memcpy) import Data.Text.Internal.Fusion (Step(..), Stream(..)) import Data.Text.Internal.Fusion.Size -import Data.Text.Encoding.Error +import Data.Text.Encoding.Common import Data.Text.Internal.Encoding.Fusion.Common import Data.Text.Internal.Unsafe.Char (unsafeChr8, unsafeChr16, unsafeChr32) import Data.Text.Internal.Unsafe (unsafeWithForeignPtr) diff --git a/src/Data/Text/Internal/Lazy/Encoding/Fusion.hs b/src/Data/Text/Internal/Lazy/Encoding/Fusion.hs index 69149779..5ebad0ad 100644 --- a/src/Data/Text/Internal/Lazy/Encoding/Fusion.hs +++ b/src/Data/Text/Internal/Lazy/Encoding/Fusion.hs @@ -38,7 +38,7 @@ import qualified Data.ByteString as B import qualified Data.ByteString.Unsafe as B import Data.Text.Internal.ByteStringCompat import Data.Text.Internal.Encoding.Fusion.Common -import Data.Text.Encoding.Error +import Data.Text.Encoding.Common import Data.Text.Internal.Fusion (Step(..), Stream(..)) import Data.Text.Internal.Fusion.Size import Data.Text.Internal.Unsafe.Char (unsafeChr8, unsafeChr16, unsafeChr32) diff --git a/src/Data/Text/Lazy.hs b/src/Data/Text/Lazy.hs index 39834a8b..9e2638eb 100644 --- a/src/Data/Text/Lazy.hs +++ b/src/Data/Text/Lazy.hs @@ -316,21 +316,10 @@ instance Monoid Text where mappend = (<>) mconcat = concat --- | Performs replacement on invalid scalar values: --- --- >>> :set -XOverloadedStrings --- >>> "\55555" :: Data.Text.Lazy.Text --- "\65533" instance IsString Text where fromString = pack --- | Performs replacement on invalid scalar values: --- --- >>> :set -XOverloadedLists --- >>> ['\55555'] :: Data.Text.Lazy.Text --- "\65533" --- --- @since 1.2.0.0 +-- | @since 1.2.0.0 instance Exts.IsList Text where type Item Text = Char fromList = pack @@ -383,10 +372,7 @@ textDataType = mkDataType "Data.Text.Lazy.Text" [packConstr] -- | /O(n)/ Convert a 'String' into a 'Text'. -- --- Performs replacement on invalid scalar values, so @'unpack' . 'pack'@ is not 'id': --- --- >>> Data.Text.Lazy.unpack (Data.Text.Lazy.pack "\55555") --- "\65533" +-- Performs replacement on invalid scalar values. pack :: #if defined(ASSERTS) HasCallStack => diff --git a/src/Data/Text/Lazy/Encoding.hs b/src/Data/Text/Lazy/Encoding.hs index c598280f..25f655c2 100644 --- a/src/Data/Text/Lazy/Encoding.hs +++ b/src/Data/Text/Lazy/Encoding.hs @@ -62,8 +62,7 @@ module Data.Text.Lazy.Encoding import Control.Exception (evaluate, try) import Data.Monoid (Monoid(..)) -import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode) -import Data.Text.Encoding.Types (DecodeResult(..)) +import Data.Text.Encoding.Common (DecodeResult(..), OnDecodeError, UnicodeException, strictDecode) import Data.Text.Internal.Lazy (Text(..), chunk, empty, foldrChunks) import Data.Word (Word16, Word32, Word8) import qualified Data.ByteString as S diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs index c71fef67..9b36bfde 100644 --- a/tests/Tests/Properties/Transcoding.hs +++ b/tests/Tests/Properties/Transcoding.hs @@ -1,13 +1,17 @@ -- | Tests for encoding and decoding -{-# LANGUAGE OverloadedStrings, ScopedTypeVariables #-} +{-# LANGUAGE OverloadedStrings, ScopedTypeVariables, FlexibleContexts #-} {-# OPTIONS_GHC -fno-warn-missing-signatures #-} module Tests.Properties.Transcoding ( testTranscoding ) where +import Control.Monad.Writer (runWriterT, tell) +import Data.Functor.Identity (runIdentity) +-- import Data.Monoid (Endo(appEndo)) import Data.Bits ((.&.), shiftR) import Data.Char (chr, ord) +import Data.Maybe (fromMaybe) import Test.QuickCheck hiding ((.&.)) import Test.Tasty (TestTree, testGroup) import Test.Tasty.QuickCheck (testProperty) @@ -23,8 +27,7 @@ import qualified Data.ByteString.Lazy as BL import qualified Data.ByteString.Lazy.Char8 as BLC import qualified Data.Text as T import qualified Data.Text.Encoding as E -import qualified Data.Text.Encoding.Error as E -import qualified Data.Text.Encoding.Types as Ty +import qualified Data.Text.Encoding.Common as C import qualified Data.Text.Lazy as TL import qualified Data.Text.Lazy.Encoding as EL @@ -288,65 +291,11 @@ t_chunk_decode_utf32LE = in decodeResult2 === E.DecodeResult "☃ " (Just 0xD80000) expBs2 12 -decodeUtf8StreamS = Ty.chunksDecoderToStream E.decodeUtf8Chunks - -t_stream_decode_utf8_1 = - let (Ty.StreamDecodeResult t0 mW0 bs0 pos0 _) = decodeUtf8StreamS $ B.pack [0x68, 0x69, 0x2C, 0x20, 0xe2, 0x98, 0x83, 0x21] in - (t0, mW0, bs0, pos0) === ("hi, ☃!", Nothing, mempty, 8) -t_stream_decode_utf8_2 = - let (Ty.StreamDecodeResult t0 mW0 bs0 pos0 f0) = decodeUtf8StreamS $ B.pack [97, 0xC2, 97] in - whenEqProp (t0, mW0, bs0, pos0) (T.singleton 'a', Just 0xC2, B.singleton 97, 2) $ - let (Ty.StreamDecodeResult t1 mW1 bs1 pos1 _) = f0 mempty in - (t1, mW1, bs1, pos1) === (T.singleton 'a', Nothing, mempty, 3) -t_stream_decode_utf8_3 = - let (Ty.StreamDecodeResult t0 mW0 bs0 pos0 f0) = decodeUtf8StreamS $ B.pack [104, 105, 32, 0xe2] in - whenEqProp (t0, mW0, bs0, pos0) ("hi ", Nothing, B.singleton 0xe2, 3) $ - let (Ty.StreamDecodeResult t1 mW1 bs1 pos1 f1) = f0 $ B.singleton 0x98 in - whenEqProp (t1, mW1, bs1, pos1) ("", Nothing, B.pack [0xe2, 0x98], 3) $ - let (Ty.StreamDecodeResult t2 mW2 bs2 pos2 _) = f1 $ B.pack [0x83, 32, 0xFF] in - (t2, mW2, bs2, pos2) === ("☃ ", Just 0xFF, mempty, 8) - -decodeUtf16StreamS = Ty.chunksDecoderToStream . E.decodeUtf16Chunks - -t_stream_decode_utf16BE = - let expectedBs0 = B.pack [0] - (Ty.StreamDecodeResult t0 mW0 bs0 pos0 f0) = decodeUtf16StreamS True expectedBs0 in - whenEqProp (t0, mW0, bs0, pos0) (T.empty, Nothing, expectedBs0, 0) $ - let (Ty.StreamDecodeResult t1 mW1 bs1 pos1 f1) = f0 $ B.pack [104, 0, 105, 0, 32, 0xD8, 0x01] in - whenEqProp (t1, mW1, bs1, pos1) ("hi ", Nothing, B.pack [0xD8, 0x01], 6) $ - let (Ty.StreamDecodeResult t2 mW2 bs2 pos2 _) = f1 $ B.pack [0xDC, 0x37, 0, 32, 0xDC, 0] in - (t2, mW2, bs2, pos2) === ("\x10437 ", Just 0xDC00, mempty, 14) -t_stream_decode_utf16LE = - let expectedBs0 = B.pack [104] - (Ty.StreamDecodeResult t0 mW0 bs0 pos0 f0) = decodeUtf16StreamS False expectedBs0 in - whenEqProp (t0, mW0, bs0, pos0) (T.empty, Nothing, expectedBs0, 0) $ - let (Ty.StreamDecodeResult t1 mW1 bs1 pos1 f1) = f0 $ B.pack [0, 105, 0, 32, 0, 0x01, 0xD8] in - whenEqProp (t1, mW1, bs1, pos1) ("hi ", Nothing, B.pack [0x01, 0xD8], 6) $ - let (Ty.StreamDecodeResult t2 mW2 bs2 pos2 _) = f1 $ B.pack [0x37, 0xDC, 32, 0, 0, 0xDC] in - (t2, mW2, bs2, pos2) === ("\x10437 ", Just 0xDC, mempty, 14) - -decodeUtf32StreamS = Ty.chunksDecoderToStream . E.decodeUtf32Chunks - -t_stream_decode_utf32BE = - let (Ty.StreamDecodeResult t0 mW0 bs0 pos0 f0) = decodeUtf32StreamS True $ B.pack [0, 0, 0, 104, 0, 0, 0, 105, 0, 0] in - whenEqProp (t0, mW0, bs0, pos0) ("hi", Nothing, B.pack [0, 0], 8) $ - let (Ty.StreamDecodeResult t1 mW1 bs1 pos1 f1) = f0 $ B.pack [0, 32, 0, 0, 0x26] in - whenEqProp (t1, mW1, bs1, pos1) (" ", Nothing, B.pack [0, 0, 0x26], 12) $ - let (Ty.StreamDecodeResult t2 mW2 bs2 pos2 _) = f1 $ B.pack [0x03, 0, 0, 0, 32, 0, 0, 0xD8, 0] in - (t2, mW2, bs2, pos2) === ("☃ ", Just 0xD800, mempty, 24) -t_stream_decode_utf32LE = - let (Ty.StreamDecodeResult t0 mW0 bs0 pos0 f0) = decodeUtf32StreamS False $ B.pack [104, 0, 0, 0, 105, 0, 0, 0, 0x20, 0] in - whenEqProp (t0, mW0, bs0, pos0) ("hi", Nothing, B.pack [0x20, 0], 8) $ - let (Ty.StreamDecodeResult t1 mW1 bs1 pos1 f1) = f0 $ B.pack [0, 0, 0x03, 0x26, 0] in - whenEqProp (t1, mW1, bs1, pos1) (" ", Nothing, B.pack [0x03, 0x26, 0], 12) $ - let (Ty.StreamDecodeResult t2 mW2 bs2 pos2 _) = f1 $ B.pack [0, 32, 0, 0, 0, 0, 0xD8, 0, 0] in - (t2, mW2, bs2, pos2) === ("☃ ", Just 0xD80000, mempty, 24) - decodeLL :: BL.ByteString -> TL.Text -decodeLL = EL.decodeUtf8With E.lenientDecode +decodeLL = EL.decodeUtf8With C.lenientDecode decodeL :: B.ByteString -> T.Text -decodeL = E.decodeUtf8With E.lenientDecode +decodeL = E.decodeUtf8With C.lenientDecode -- The lenient decoding of lazy bytestrings should not depend on how they are chunked, -- and it should behave the same as decoding of strict bytestrings. @@ -354,20 +303,42 @@ t_decode_utf8_lenient :: Property t_decode_utf8_lenient = forAllShrinkShow arbitrary shrink (show . BL.toChunks) $ \bs -> decodeLL bs === (TL.fromStrict . decodeL . B.concat . BL.toChunks) bs -decodeStream decoder snoc bs = - g mempty $ decoder bs - where - g t0 (Ty.StreamDecodeResult t mW bs' pos f) = - let t0' = t0 `mappend` t in - case mW of - Just _ -> let mC = E.lenientDecode "" mW in - let t' = case mC of - Just c -> t0' `snoc` c - _ -> t0' - in g t' $ f mempty - _ -> (t0', bs', pos) - -decodeUtf8StreamL = Ty.chunksDecoderToStream EL.decodeUtf8Chunks +decodeStream decoder snoc bs = runIdentity $ do + ((b, p), tDiff) <- runWriterT $ decoder bs $ \ t mW b p f -> + case mW of + Just w -> + let t' = (case C.lenientDecode "" $ Just w of + Just c -> t `snoc` c + _ -> t) + in do + tell (t' <>) + f mempty + _ -> do + tell (t <>) + pure (b, p) + pure (tDiff mempty, b, p) + +decodeUtf8StreamL = C.chunksDecoderToStream EL.decodeUtf8Chunks + +decodeUtf8StreamS = C.chunksDecoderToStream E.decodeUtf8Chunks + +-- The results of the stream decoder on a strict bytestring should +-- match the results of a non-stream decoder on a lazy bytestring. +t_decode_utf8_lenient_stream :: Property +t_decode_utf8_lenient_stream = forAllShrinkShow arbitrary shrink (show . BL.toChunks) $ \bs -> + let (st, sb, _) = decodeStream decodeUtf8StreamS T.snoc . B.concat $ BL.toChunks bs + st' = B.foldr (\ w st'' -> st'' `T.snoc` (fromMaybe 'a' . C.lenientDecode "" $ Just w)) st sb + in + decodeLL bs === TL.fromStrict st' + +-- The results of the non-stream decoder on a strict bytestring should +-- match the results of a stream decoder on a lazy bytestring. +t_decode_utf8_stream_lenient :: Property +t_decode_utf8_stream_lenient = forAllShrinkShow arbitrary shrink (show . BL.toChunks) $ \bs -> + let (lt, lb, _) = decodeStream decodeUtf8StreamL TL.snoc bs + lt' = BL.foldr (\ w lt'' -> lt'' `TL.snoc` (fromMaybe 'a' . C.lenientDecode "" $ Just w)) lt lb + in + lt' === (TL.fromStrict . decodeL . B.concat . BL.toChunks) bs -- The decoding of lazy bytestrings should not depend on how they are chunked, -- and it should behave the same as decoding of strict bytestrings. @@ -403,7 +374,7 @@ t_decode_with_error4' = t_decode_with_error5' = ioProperty $ do ret <- Exception.try $ Exception.evaluate $ E.streamDecodeUtf8 (B.pack [0x81]) pure $ case ret of - Left (_ :: E.UnicodeException) -> True + Left (_ :: C.UnicodeException) -> True Right{} -> False t_infix_concat bs1 text bs2 = @@ -464,14 +435,9 @@ testTranscoding = testProperty "t_chunk_decode_utf16LE" t_chunk_decode_utf16LE, testProperty "t_chunk_decode_utf32BE" t_chunk_decode_utf32BE, testProperty "t_chunk_decode_utf32LE" t_chunk_decode_utf32LE, - testProperty "t_stream_decode_utf8_1" t_stream_decode_utf8_1, - testProperty "t_stream_decode_utf8_2" t_stream_decode_utf8_2, - testProperty "t_stream_decode_utf8_3" t_stream_decode_utf8_3, - testProperty "t_stream_decode_utf16BE" t_stream_decode_utf16BE, - testProperty "t_stream_decode_utf16LE" t_stream_decode_utf16LE, - testProperty "t_stream_decode_utf32BE" t_stream_decode_utf32BE, - testProperty "t_stream_decode_utf32LE" t_stream_decode_utf32LE, testProperty "t_decode_utf8_lenient" t_decode_utf8_lenient, + testProperty "t_decode_utf8_stream_lenient" t_decode_utf8_stream_lenient, + testProperty "t_decode_utf8_lenient_stream" t_decode_utf8_lenient_stream, testProperty "t_decode_utf8_stream" t_decode_utf8_stream, testProperty "t_decode_with_error2" t_decode_with_error2, testProperty "t_decode_with_error3" t_decode_with_error3, diff --git a/tests/Tests/QuickCheckUtils.hs b/tests/Tests/QuickCheckUtils.hs index 8f36f7ec..a6d72529 100644 --- a/tests/Tests/QuickCheckUtils.hs +++ b/tests/Tests/QuickCheckUtils.hs @@ -42,7 +42,7 @@ import Tests.Utils import qualified Data.ByteString as B import qualified Data.ByteString.Lazy as BL import qualified Data.Text as T -import qualified Data.Text.Encoding.Error as T +import qualified Data.Text.Encoding.Common as T import qualified Data.Text.Internal.Fusion as TF import qualified Data.Text.Internal.Fusion.Common as TF import qualified Data.Text.Internal.Lazy as TL diff --git a/tests/Tests/Regressions.hs b/tests/Tests/Regressions.hs index cf011680..7828bc75 100644 --- a/tests/Tests/Regressions.hs +++ b/tests/Tests/Regressions.hs @@ -21,7 +21,7 @@ import qualified Data.ByteString.Lazy as LB import qualified Data.Text as T import qualified Data.Text.Array as TA import qualified Data.Text.Encoding as TE -import qualified Data.Text.Encoding.Error as E +import qualified Data.Text.Encoding.Common as E import qualified Data.Text.Internal as T import qualified Data.Text.IO as T import qualified Data.Text.Lazy as LT diff --git a/text.cabal b/text.cabal index 956d8ce3..7def5192 100644 --- a/text.cabal +++ b/text.cabal @@ -137,7 +137,7 @@ library Data.Text.Array Data.Text.Encoding Data.Text.Encoding.Error - Data.Text.Encoding.Types + Data.Text.Encoding.Common Data.Text.Foreign Data.Text.IO Data.Text.Internal @@ -255,6 +255,7 @@ test-suite tests deepseq, directory, ghc-prim, + mtl, tasty, tasty-hunit, tasty-quickcheck, From 270a6055f2430cba38d9ff6312550461e05d288e Mon Sep 17 00:00:00 2001 From: david-sledge Date: Sat, 16 Jul 2022 19:46:22 -0600 Subject: [PATCH 20/87] replace <> with mappend --- tests/Tests/Properties/Transcoding.hs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs index 9b36bfde..4c68f90a 100644 --- a/tests/Tests/Properties/Transcoding.hs +++ b/tests/Tests/Properties/Transcoding.hs @@ -311,10 +311,10 @@ decodeStream decoder snoc bs = runIdentity $ do Just c -> t `snoc` c _ -> t) in do - tell (t' <>) + tell (mappend t') f mempty _ -> do - tell (t <>) + tell (mappend t) pure (b, p) pure (tDiff mempty, b, p) From 135e7f5f3d56e15e00dc9bb7be0cbc09d23f13c6 Mon Sep 17 00:00:00 2001 From: david-sledge Date: Sun, 17 Jul 2022 14:31:43 -0600 Subject: [PATCH 21/87] fix haddocks, remove commented code, remove mtl dependecy for tests --- changelog.md | 2 -- src/Data/Text/Encoding.hs | 8 ++++---- src/Data/Text/Encoding/Common.hs | 8 ++++---- src/Data/Text/Encoding/Error.hs | 1 - src/Data/Text/Internal/Builder.hs | 11 ----------- src/Data/Text/Lazy/Encoding.hs | 8 ++++---- tests/Tests/Properties/Transcoding.hs | 15 +++++---------- text.cabal | 1 - 8 files changed, 17 insertions(+), 37 deletions(-) diff --git a/changelog.md b/changelog.md index 4dc76498..b0347e05 100644 --- a/changelog.md +++ b/changelog.md @@ -1,5 +1,3 @@ -### 2.0.1 - * Improve portability of C and C++ code. * [Make `Lift` instance more efficient](https://github.com/haskell/text/pull/413) * [Make `toCaseFold` idempotent](https://github.com/haskell/text/pull/402) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index b8e9ffb1..f67ca72c 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -352,7 +352,7 @@ queryUtf8DecodeOptimization (B.length -> len1) bs2@(B.length -> len2) srcOff -- | Decode two 'ByteString's containing UTF-8-encoded text as though -- they were one continuous 'ByteString' returning a 'DecodeResult'. -- --- @since 2.0.0.1 +-- @since 2.0.1 decodeUtf8Chunks :: ByteString -- ^ The first `ByteString` chunk to decode. Typically this is the unencoded data from the previous call of this function. -> ByteString -- ^ The second `ByteString` chunk to decode. @@ -375,7 +375,7 @@ noOptimization _ _ _ = Nothing -- | Decode two 'ByteString's containing UTF-16-encoded text as though -- they were one continuous 'ByteString' returning a 'DecodeResult'. -- --- @since 2.0.0.1 +-- @since 2.0.1 decodeUtf16Chunks :: Bool -- ^ Indicates whether the encoding is big-endian (`True`) or little-endian (`False`) -> ByteString -- ^ The first `ByteString` chunk to decode. Typically this is the unencoded data from the previous call of this function. @@ -405,7 +405,7 @@ decodeUtf16Chunks isBE = decodeChunksProxy noOptimization $ \ index len srcOff - -- | Decode two 'ByteString's containing UTF-16-encoded text as though -- they were one continuous 'ByteString' returning a 'DecodeResult'. -- --- @since 2.0.0.1 +-- @since 2.0.1 decodeUtf32Chunks :: Bool -- ^ Indicates whether the encoding is big-endian (`True`) or little-endian (`False`) -> ByteString -- ^ The first `ByteString` chunk to decode. Typically this is the unencoded data from the previous call of this function. @@ -425,7 +425,7 @@ decodeUtf32Chunks isBE = decodeChunksProxy noOptimization $ \ index _ srcOff -> -- 'Right' value, and an error ('Left' 'Int') indicates the postion of -- the offending 'Word8'. -- --- @since 2.0.0.1 +-- @since 2.0.1 decodeAsciiE :: ByteString -> Either Int Text decodeAsciiE bs = withBS bs $ \fp len -> if len == 0 then Right empty else runST $ do asciiPrefixLen <- fmap cSizeToInt $ unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> diff --git a/src/Data/Text/Encoding/Common.hs b/src/Data/Text/Encoding/Common.hs index 3212e315..d46c40ee 100644 --- a/src/Data/Text/Encoding/Common.hs +++ b/src/Data/Text/Encoding/Common.hs @@ -47,12 +47,12 @@ import Numeric (showHex) -- | A decoding result on encoded data. -- --- @since 2.0.0.1 +-- @since 2.0.1 data DecodeResult t b w = DecodeResult !t -- ^ The decoded data up to an incomplete code point at -- the end of the input data, an invalid word, or to the -- end of the input. - !(Maybe w) -- ^ If an invalid code point was encountered. + (Maybe w) -- ^ If an invalid code point was encountered. !b -- ^ The remaining undecoded data. If an invald code -- point was encountered, this is after that code point. !Int -- ^ Byte position of remaining undecoded data. @@ -60,7 +60,7 @@ data DecodeResult t b w = DecodeResult -- | The funtion type for handling successful decodings. -- --- @since 2.0.0.1 +-- @since 2.0.1 type DecodeResultHandler t w b b' r = t -- ^ The decoded data up to an incomplete code point at -- the end of the input data, an invalid word, or to @@ -82,7 +82,7 @@ type DecodeResultHandler t w b b' r -- result. The continuations accept another section of unencoded data -- as a continuation of any remaining unencoded data. -- --- @since 2.0.0.1 +-- @since 2.0.1 chunksDecoderToStream :: Monoid b => (b -> b' -> DecodeResult t b w) -- ^ Chunks decoder -> b' -- ^ Encoded data diff --git a/src/Data/Text/Encoding/Error.hs b/src/Data/Text/Encoding/Error.hs index 569fafcf..83fa3a76 100644 --- a/src/Data/Text/Encoding/Error.hs +++ b/src/Data/Text/Encoding/Error.hs @@ -38,7 +38,6 @@ import Data.Text.Encoding.Common , OnError , OnDecodeError , OnEncodeError - -- * Useful error handling functions , lenientDecode , strictDecode , strictEncode diff --git a/src/Data/Text/Internal/Builder.hs b/src/Data/Text/Internal/Builder.hs index 6269c597..590c6528 100644 --- a/src/Data/Text/Internal/Builder.hs +++ b/src/Data/Text/Internal/Builder.hs @@ -108,11 +108,6 @@ instance Monoid Builder where mconcat = foldr mappend Data.Monoid.mempty {-# INLINE mconcat #-} --- | Performs replacement on invalid scalar values: --- --- >>> :set -XOverloadedStrings --- >>> "\55555" :: Builder --- "\65533" instance String.IsString Builder where fromString = fromString {-# INLINE fromString #-} @@ -186,12 +181,6 @@ fromText t@(Text arr off l) -- -- * @'toLazyText' ('fromString' s) = 'L.fromChunks' [S.pack s]@ -- --- Performs replacement on invalid scalar values: --- --- >>> fromString "\55555" --- "\65533" --- --- @since 1.2.0.0 fromString :: String -> Builder fromString str = Builder $ \k (Buffer p0 o0 u0 l0) -> let loop !marr !o !u !l [] = k (Buffer marr o u l) diff --git a/src/Data/Text/Lazy/Encoding.hs b/src/Data/Text/Lazy/Encoding.hs index 25f655c2..14fa6ef8 100644 --- a/src/Data/Text/Lazy/Encoding.hs +++ b/src/Data/Text/Lazy/Encoding.hs @@ -115,7 +115,7 @@ decodeLatin1 = foldr (chunk . TE.decodeLatin1) empty . B.toChunks -- 'Right' value, and an error ('Left' 'Int') indicates the postion of -- the offending 'Word8'. -- --- @since 2.0.0.1 +-- @since 2.0.1 decodeAsciiE :: B.ByteString -> Either Int Text decodeAsciiE = foldr (\ lb -> (chunk <$> TE.decodeAsciiE lb <*>)) (pure empty) . B.toChunks @@ -138,7 +138,7 @@ decodeChunks decoder = g id 0 mempty -- | Decode two 'B.ByteString's containing UTF-8-encoded text as though -- they were one continuous 'B.ByteString' returning a 'DecodeResult'. -- --- @since 2.0.0.1 +-- @since 2.0.1 decodeUtf8Chunks :: B.ByteString -- ^ The first `B.ByteString` chunk to decode. Typically this is the unencoded data from the previous call of this function. -> B.ByteString -- ^ The second `B.ByteString` chunk to decode. @@ -148,7 +148,7 @@ decodeUtf8Chunks = decodeChunks TE.decodeUtf8Chunks -- | Decode two 'B.ByteString's containing UTF-16-encoded text as though -- they were one continuous 'B.ByteString' returning a 'DecodeResult'. -- --- @since 2.0.0.1 +-- @since 2.0.1 decodeUtf16Chunks :: Bool -- ^ Indicates whether the encoding is big-endian (`True`) or little-endian (`False`) -> B.ByteString -- ^ The first `B.ByteString` chunk to decode. Typically this is the unencoded data from the previous call of this function. @@ -159,7 +159,7 @@ decodeUtf16Chunks = decodeChunks . TE.decodeUtf16Chunks -- | Decode two 'B.ByteString's containing UTF-32-encoded text as though -- they were one continuous 'B.ByteString' returning a 'DecodeResult'. -- --- @since 2.0.0.1 +-- @since 2.0.1 decodeUtf32Chunks :: Bool -- ^ Indicates whether the encoding is big-endian (`True`) or little-endian (`False`) -> B.ByteString -- ^ The first `B.ByteString` chunk to decode. Typically this is the unencoded data from the previous call of this function. diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs index 4c68f90a..0cbc0141 100644 --- a/tests/Tests/Properties/Transcoding.hs +++ b/tests/Tests/Properties/Transcoding.hs @@ -6,9 +6,6 @@ module Tests.Properties.Transcoding ( testTranscoding ) where -import Control.Monad.Writer (runWriterT, tell) -import Data.Functor.Identity (runIdentity) --- import Data.Monoid (Endo(appEndo)) import Data.Bits ((.&.), shiftR) import Data.Char (chr, ord) import Data.Maybe (fromMaybe) @@ -303,20 +300,18 @@ t_decode_utf8_lenient :: Property t_decode_utf8_lenient = forAllShrinkShow arbitrary shrink (show . BL.toChunks) $ \bs -> decodeLL bs === (TL.fromStrict . decodeL . B.concat . BL.toChunks) bs -decodeStream decoder snoc bs = runIdentity $ do - ((b, p), tDiff) <- runWriterT $ decoder bs $ \ t mW b p f -> +decodeStream decoder snoc bs = + case decoder bs $ \ t mW b p f -> case mW of Just w -> let t' = (case C.lenientDecode "" $ Just w of Just c -> t `snoc` c _ -> t) in do - tell (mappend t') + (mappend t', ()) f mempty - _ -> do - tell (mappend t) - pure (b, p) - pure (tDiff mempty, b, p) + _ -> (mappend t, (b, p)) of + (tDiff, (b, p)) -> (tDiff mempty, b, p) decodeUtf8StreamL = C.chunksDecoderToStream EL.decodeUtf8Chunks diff --git a/text.cabal b/text.cabal index 7def5192..cabb1582 100644 --- a/text.cabal +++ b/text.cabal @@ -255,7 +255,6 @@ test-suite tests deepseq, directory, ghc-prim, - mtl, tasty, tasty-hunit, tasty-quickcheck, From 74520741a084c28f285676ea055624cd0aacddbe Mon Sep 17 00:00:00 2001 From: david-sledge Date: Sun, 17 Jul 2022 14:51:16 -0600 Subject: [PATCH 22/87] remove unused extension from Tests.Properties.Transcoding --- tests/Tests/Properties/Transcoding.hs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs index 0cbc0141..0fcdb7ef 100644 --- a/tests/Tests/Properties/Transcoding.hs +++ b/tests/Tests/Properties/Transcoding.hs @@ -1,6 +1,6 @@ -- | Tests for encoding and decoding -{-# LANGUAGE OverloadedStrings, ScopedTypeVariables, FlexibleContexts #-} +{-# LANGUAGE OverloadedStrings, ScopedTypeVariables #-} {-# OPTIONS_GHC -fno-warn-missing-signatures #-} module Tests.Properties.Transcoding ( testTranscoding @@ -307,9 +307,9 @@ decodeStream decoder snoc bs = let t' = (case C.lenientDecode "" $ Just w of Just c -> t `snoc` c _ -> t) - in do - (mappend t', ()) - f mempty + in + case f mempty of + (tDiff, bp) -> (mappend t' `mappend` tDiff, bp) _ -> (mappend t, (b, p)) of (tDiff, (b, p)) -> (tDiff mempty, b, p) From 438ab9b728841f39737114f75550591b80232d7d Mon Sep 17 00:00:00 2001 From: david-sledge Date: Sun, 17 Jul 2022 15:27:24 -0600 Subject: [PATCH 23/87] Accommodating older versions of Haddock --- src/Data/Text/Encoding/Common.hs | 50 +++++++++++++++----------------- 1 file changed, 23 insertions(+), 27 deletions(-) diff --git a/src/Data/Text/Encoding/Common.hs b/src/Data/Text/Encoding/Common.hs index d46c40ee..dfd9f05d 100644 --- a/src/Data/Text/Encoding/Common.hs +++ b/src/Data/Text/Encoding/Common.hs @@ -45,42 +45,38 @@ import Data.Typeable (Typeable) import Data.Word (Word8) import Numeric (showHex) --- | A decoding result on encoded data. +-- | A decoding result on encoded data. It contains: +-- +-- 1. the decoded data up to an incomplete code point at the end of +-- the input data, an invalid word, or to the end of the input, +-- 2. the invalid code point if one was encountered, +-- 3. the remaining undecoded data, which is either an incomplete +-- code point, the data after the invalid code point, or empty, +-- and +-- 4. the byte position of remaining undecoded data. -- -- @since 2.0.1 -data DecodeResult t b w = DecodeResult - !t -- ^ The decoded data up to an incomplete code point at - -- the end of the input data, an invalid word, or to the - -- end of the input. - (Maybe w) -- ^ If an invalid code point was encountered. - !b -- ^ The remaining undecoded data. If an invald code - -- point was encountered, this is after that code point. - !Int -- ^ Byte position of remaining undecoded data. +data DecodeResult t b w = DecodeResult !t !(Maybe w) !b !Int deriving (Eq, Ord, Show, Read) --- | The funtion type for handling successful decodings. +-- | The funtion type for handling successful decodings. The arguments +-- passed to the handler are: +-- +-- 1. the decoded data up to an incomplete code point at the end of +-- the input data, an invalid word, or to the end of the input, +-- 2. the invalid code point if one was encountered, +-- 3. the remaining undecoded data, which is either an incomplete +-- code point, the data after the invalid code point, or empty, +-- 4. the byte position of remaining undecoded data, and +-- 5. a continuation function that accepts data that as treated as +-- a continuation of any remaining unencoded data. -- -- @since 2.0.1 -type DecodeResultHandler t w b b' r - = t -- ^ The decoded data up to an incomplete code point at - -- the end of the input data, an invalid word, or to - -- the end of the input. - -> Maybe w -- ^ If an invalid code point was encountered. - -> b -- ^ The remaining undecoded data. If an invald code - -- point was encountered, this is after that code - -- point. - -> Int -- ^ Byte position of remaining undecoded data. This - -- is treated as if all the data fed to previous - -- invocations of the continations were one continuous - -- feed. - -> (b' -> r) -- ^ Continuation to accept the next span of data to be - -- decoded with the remaining unencoded data. - -> r -- ^ Result of the continuation. +type DecodeResultHandler t w b b' r = t -> Maybe w -> b -> Int -> (b' -> r) -> r -- | Create a stream decoder from a chunks decoder. The resulting -- stream decoder accepts a 'DecodeResultHandler' to process the decode --- result. The continuations accept another section of unencoded data --- as a continuation of any remaining unencoded data. +-- result. -- -- @since 2.0.1 chunksDecoderToStream :: Monoid b From 623eff363708f8f60663f66a702b5d4b9b6fe387 Mon Sep 17 00:00:00 2001 From: david-sledge Date: Wed, 20 Jul 2022 07:45:29 -0600 Subject: [PATCH 24/87] Removed double copy of UTF-8 optimization. Refactored UTF-16 and UTF-32 code point inspectors --- src/Data/Text/Encoding.hs | 167 +++++++++++++---------- src/Data/Text/Internal/Encoding/Utf16.hs | 47 +++---- src/Data/Text/Internal/Encoding/Utf32.hs | 36 +++-- 3 files changed, 135 insertions(+), 115 deletions(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index f67ca72c..496f66de 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -4,6 +4,7 @@ {-# LANGUAGE TypeApplications #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE ViewPatterns #-} +{-# LANGUAGE TupleSections #-} -- | -- Module : Data.Text.Encoding -- Copyright : (c) 2009, 2010, 2011 Bryan O'Sullivan, @@ -76,7 +77,7 @@ module Data.Text.Encoding import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) import Control.Exception (evaluate, try) -import Control.Monad.ST (runST) +import Control.Monad.ST (ST, runST) import Data.Bits (Bits, shiftL, shiftR, (.&.), (.|.)) import Data.ByteString (ByteString) import qualified Data.ByteString as B @@ -218,22 +219,29 @@ isValidBS bs = start 0 #endif #endif -data Progression - = WriteAndAdvance Char (Int -> Int) - | NeedMore - | Invalid - -decodeChunks :: (Bits w, Num w, Storable w) => - w -- only used for Storable.sizeOf argument which the function discards - -> (ByteString -> ByteString -> Int -> Maybe (A.Array, Int)) +decodeChunks :: (Bits w, Num w, Storable w) + => w + -> ( Int + -> ByteString + -> Int + -> A.MArray s + -> Int + -> Int + -> ST + s + (Maybe + ((A.MArray s + -> Int -> Int -> Int -> ST s (DecodeResult Text ByteString w)) + -> ST s (DecodeResult Text ByteString w))) + ) -> ((Int -> Word8) -> Int -> Int -> Progression) -> ByteString -> ByteString - -> DecodeResult Text ByteString w + -> ST s (DecodeResult Text ByteString w) decodeChunks w queryOptimization decodeF bs1@(B.length -> len1) bs2@(B.length -> len2) | len2 == 0 , len1 > 0 = decodeChunks w queryOptimization decodeF bs2 bs1 - | otherwise = runST $ do + | otherwise = do marr <- A.new len' outer marr len' 0 0 where @@ -249,43 +257,33 @@ decodeChunks w queryOptimization decodeF bs1@(B.length -> len1) bs2@(B.length -> len' :: Int len' = (len `div` wordByteSize) + 4 - -- outer :: (Bits w, Num w) => A.MArray s -> Int -> Int -> Int -> ST s (DecodeResult Text ByteString w) outer dst dstLen = inner where inner srcOff dstOff -- finished | len - srcOff < 1 = goodSoFar -- shortcut for utf-8 - | otherwise = - case queryOptimization bs1 bs2 srcOff of - Just (arr, tLen) -> - let minLen = tLen + dstOff in - if minLen > dstLen - then do - let dstLen' = minLen + 4 - dst' <- A.resizeM dst dstLen' - A.copyI tLen dst' dstOff arr 0 - outer dst' dstLen' (srcOff + tLen) minLen - else do - A.copyI tLen dst dstOff arr 0 - inner (srcOff + tLen) minLen + | otherwise = do + mOuterArgs <- queryOptimization len1 bs2 srcOff dst dstLen dstOff + case mOuterArgs of + Just outerArgs -> outerArgs outer _ -> if len - srcOff < wordByteSize - -- incomplete code point - then goodSoFar - else - if dstOff + 4 > dstLen - -- need more space in destination - then do - let dstLen' = dstLen + 4 - dst' <- A.resizeM dst dstLen' - outer dst' dstLen' srcOff dstOff - else - case decodeF index len srcOff of - WriteAndAdvance c advance -> do - d <- unsafeWrite dst dstOff c - inner (advance d) $ dstOff + d - NeedMore -> goodSoFar - Invalid -> invalid + -- incomplete code point + then goodSoFar + else + if dstOff + 4 > dstLen + -- need more space in destination + then do + let dstLen' = dstLen + 4 + dst' <- A.resizeM dst dstLen' + outer dst' dstLen' srcOff dstOff + else + case decodeF index len srcOff of + WriteAndAdvance c advance -> do + d <- unsafeWrite dst dstOff c + inner (advance d) $ dstOff + d + NeedMore -> goodSoFar + Invalid -> invalid where contin off res = do A.shrinkM dst dstOff @@ -306,19 +304,37 @@ decodeChunks w queryOptimization decodeF bs1@(B.length -> len1) bs2@(B.length -> contin srcOff' $ \ t bs' -> DecodeResult t (Just $ bytesToWord wordByteSize 0) bs' srcOff' -decodeChunksProxy :: (Bits w, Num w, Storable w) => - (ByteString -> ByteString -> Int -> Maybe (A.Array, Int)) +decodeChunksProxy :: (Bits w, Num w, Storable w) + => ( Int + -> ByteString + -> Int + -> A.MArray s + -> Int + -> Int + -> ST s (Maybe ( + (A.MArray s -> Int -> Int -> Int -> ST s (DecodeResult Text ByteString w)) + -> ST s (DecodeResult Text ByteString w) + ) + ) + ) -> ((Int -> Word8) -> Int -> Int -> Progression) -> ByteString -> ByteString - -> DecodeResult Text ByteString w + -> ST s (DecodeResult Text ByteString w) decodeChunksProxy = decodeChunks undefined -- This allows Haskell to -- determine the size in bytes of a data type using Storable.sizeOf -- so that it doesn't have to be passed as an arugment. Storable.sizeOf -- discards the actual value without evaluating it. -queryUtf8DecodeOptimization :: ByteString -> ByteString -> Int -> Maybe (A.Array, Int) -queryUtf8DecodeOptimization (B.length -> len1) bs2@(B.length -> len2) srcOff +queryUtf8DecodeOptimization + :: Int + -> ByteString + -> Int + -> A.MArray s + -> Int + -> Int + -> ST s (Maybe ((A.MArray s -> Int -> Int -> Int -> t) -> t)) +queryUtf8DecodeOptimization len1 bs2@(B.length -> len2) srcOff dst dstLen dstOff | srcOff >= len1 -- potential valid utf8 content endpoint , utf8End <- len1 + guessUtf8Boundary @@ -326,13 +342,18 @@ queryUtf8DecodeOptimization (B.length -> len1) bs2@(B.length -> len2) srcOff -- potential valid utf8 content length , utf8Len <- utf8End - srcOff , bs' <- B.drop (srcOff - len1) $ B.take guessUtf8Boundary bs2 - , isValidBS bs' = Just (runST $ do - marr <- A.new utf8Len + , isValidBS bs' + , minLen <- utf8Len + dstOff = do + (dst', dstLen') <- + if minLen > dstLen + then + let newLen = minLen + 4 in + (, newLen) <$> A.resizeM dst newLen + else pure (dst, dstLen) withBS bs' $ \fp _ -> unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> - unsafeSTToIO $ A.copyFromPointer marr 0 src utf8Len - arr <- A.unsafeFreeze marr - pure $ (arr, utf8Len)) - | otherwise = Nothing + unsafeSTToIO $ A.copyFromPointer dst' dstOff src utf8Len + pure . Just $ \ f -> f dst' dstLen' (srcOff + utf8Len) minLen + | otherwise = pure Nothing where -- We need Data.ByteString.findIndexEnd, but it is unavailable before bytestring-0.10.12.0 guessUtf8Boundary :: Int @@ -357,7 +378,7 @@ decodeUtf8Chunks :: ByteString -- ^ The first `ByteString` chunk to decode. Typically this is the unencoded data from the previous call of this function. -> ByteString -- ^ The second `ByteString` chunk to decode. -> DecodeResult Text ByteString Word8 -decodeUtf8Chunks = decodeChunksProxy queryUtf8DecodeOptimization $ \ index len srcOff -> +decodeUtf8Chunks bs1 bs2 = runST $ decodeChunksProxy queryUtf8DecodeOptimization (\ index len srcOff -> let decodeFrom off = step (off + 1) . utf8DecodeStart $ index off step i (Incomplete a b) @@ -367,10 +388,10 @@ decodeUtf8Chunks = decodeChunksProxy queryUtf8DecodeOptimization $ \ index len s case decodeFrom srcOff of Accept c -> WriteAndAdvance c (srcOff +) Reject -> Invalid - Incomplete{} -> NeedMore + Incomplete{} -> NeedMore) bs1 bs2 -noOptimization :: p1 -> p2 -> p3 -> Maybe a -noOptimization _ _ _ = Nothing +noOptimization :: Applicative f => p0 -> p1 -> p2 -> p3 -> p4 -> p5 -> f (Maybe a) +noOptimization _ _ _ _ _ _ = pure Nothing -- | Decode two 'ByteString's containing UTF-16-encoded text as though -- they were one continuous 'ByteString' returning a 'DecodeResult'. @@ -381,14 +402,14 @@ decodeUtf16Chunks -> ByteString -- ^ The first `ByteString` chunk to decode. Typically this is the unencoded data from the previous call of this function. -> ByteString -- ^ The second `ByteString` chunk to decode. -> DecodeResult Text ByteString Word16 -decodeUtf16Chunks isBE = decodeChunksProxy noOptimization $ \ index len srcOff -> +decodeUtf16Chunks isBE bs1 bs2 = runST $ decodeChunksProxy noOptimization (\ index len srcOff -> -- get next Word8 pair let writeAndAdvance c n = WriteAndAdvance c $ const n b0 = index $ if isBE then srcOff else srcOff + 1 b1 = index $ if isBE then srcOff + 1 else srcOff in - case queryUtf16Bytes b0 of - OneWord16 f -> writeAndAdvance (f b1) $ srcOff + 2 + case queryUtf16Bytes b0 b1 of + OneWord16 c -> writeAndAdvance c $ srcOff + 2 TwoWord16 g -> if len - srcOff < 4 -- not enough Word8s to finish the code point @@ -397,10 +418,10 @@ decodeUtf16Chunks isBE = decodeChunksProxy noOptimization $ \ index len srcOff - let b2 = index $ srcOff + (if isBE then 2 else 3) b3 = index $ srcOff + (if isBE then 3 else 2) in - case g b2 of - Just f' -> writeAndAdvance (f' b1 b3) $ srcOff + 4 + case g b2 b3 of + Just c -> writeAndAdvance c $ srcOff + 4 _ -> Invalid - _ -> Invalid + _ -> Invalid) bs1 bs2 -- | Decode two 'ByteString's containing UTF-16-encoded text as though -- they were one continuous 'ByteString' returning a 'DecodeResult'. @@ -411,19 +432,25 @@ decodeUtf32Chunks -> ByteString -- ^ The first `ByteString` chunk to decode. Typically this is the unencoded data from the previous call of this function. -> ByteString -- ^ The second `ByteString` chunk to decode. -> DecodeResult Text ByteString Word32 -decodeUtf32Chunks isBE = decodeChunksProxy noOptimization $ \ index _ srcOff -> +decodeUtf32Chunks isBE bs1 bs2 = runST $ decodeChunksProxy noOptimization (\ index _ srcOff -> -- get next Word8 quartet - case (queryUtf32Bytes . index $ if isBE then srcOff else srcOff + 3) - >>= ($ (index $ srcOff + (if isBE then 1 else 2))) - >>= ($ (index $ srcOff + (if isBE then 2 else 1))) of - Just f -> WriteAndAdvance (f . index $ if isBE then srcOff + 3 else srcOff) . const $ srcOff + 4 - _ -> Invalid + case queryUtf32Bytes (index $ if isBE then srcOff else srcOff + 3) + (index $ srcOff + (if isBE then 1 else 2)) + (index $ srcOff + (if isBE then 2 else 1)) + (index $ if isBE then srcOff + 3 else srcOff) of + Just c -> WriteAndAdvance c . const $ srcOff + 4 + _ -> Invalid) bs1 bs2 + +data Progression + = WriteAndAdvance Char (Int -> Int) + | NeedMore + | Invalid -- | Decode a 'ByteString' containing 7-bit ASCII encoded text. -- -- This is a total function: On success the decoded 'Text' is within a --- 'Right' value, and an error ('Left' 'Int') indicates the postion of --- the offending 'Word8'. +-- 'Right' value, and an error @('Left' 'Int')@ indicates the position +-- of the offending 'Word8'. -- -- @since 2.0.1 decodeAsciiE :: ByteString -> Either Int Text diff --git a/src/Data/Text/Internal/Encoding/Utf16.hs b/src/Data/Text/Internal/Encoding/Utf16.hs index 2749fa34..45b0eeb3 100644 --- a/src/Data/Text/Internal/Encoding/Utf16.hs +++ b/src/Data/Text/Internal/Encoding/Utf16.hs @@ -55,31 +55,28 @@ validate2 x1 x2 = x1 >= 0xD800 && x1 <= 0xDBFF && {-# INLINE validate2 #-} data Utf16Result - = OneWord16 (Word8 -> Char) - | TwoWord16 (Word8 -> Maybe (Word8 -> Word8 -> Char)) + = OneWord16 Char + | TwoWord16 (Word8 -> Word8 -> Maybe Char) | Invalid16 -queryUtf16Bytes :: Word8 -> Utf16Result -queryUtf16Bytes b0@(W8# w0#) = - if b0 < 0xD8 || b0 >= 0xE0 - then OneWord16 $ \ (W8# w1#) -> C# (chr# (orI# (word2Int# (shiftL# (word8ToWord# w0#) 8#)) (word2Int# (word8ToWord# w1#)))) - else - -- 110110xx: start of surrogate pair - if b0 .&. 0xFC == 0xD8 - then TwoWord16 $ \ b2@(W8# w2#) -> - if b2 .&. 0xFC == 0xDC - -- valid surrogate - then Just $ \ (W8# w1#) (W8# w3#) -> - C# (chr# ( - (orI# - (orI# - (orI# - (word2Int# (shiftL# (int2Word# (andI# 0x3# (word2Int# (word8ToWord# w0#)))) 18#)) - (word2Int# (shiftL# (word8ToWord# w1#) 10#)) - ) - (word2Int# (shiftL# (int2Word# (andI# 0x3# (word2Int# (word8ToWord# w2#)))) 8#))) - (word2Int# (word8ToWord# w3#))) +# 0x10000# - )) - else Nothing - else Invalid16 +queryUtf16Bytes :: Word8 -> Word8 -> Utf16Result +queryUtf16Bytes b0@(W8# w0#) (W8# w1#) + | b0 < 0xD8 || b0 >= 0xE0 = OneWord16 $ C# (chr# (orI# (word2Int# (shiftL# (word8ToWord# w0#) 8#)) (word2Int# (word8ToWord# w1#)))) + -- 110110xx: start of surrogate pair + | b0 .&. 0xFC == 0xD8 = TwoWord16 $ \ b2@(W8# w2#) (W8# w3#) -> + if b2 .&. 0xFC == 0xDC + -- valid surrogate + then Just $ + C# (chr# ( + (orI# + (orI# + (orI# + (word2Int# (shiftL# (int2Word# (andI# 0x3# (word2Int# (word8ToWord# w0#)))) 18#)) + (word2Int# (shiftL# (word8ToWord# w1#) 10#)) + ) + (word2Int# (shiftL# (int2Word# (andI# 0x3# (word2Int# (word8ToWord# w2#)))) 8#))) + (word2Int# (word8ToWord# w3#))) +# 0x10000# + )) + else Nothing + | otherwise = Invalid16 {-# INLINE queryUtf16Bytes #-} diff --git a/src/Data/Text/Internal/Encoding/Utf32.hs b/src/Data/Text/Internal/Encoding/Utf32.hs index 517ff05a..40a0e25d 100644 --- a/src/Data/Text/Internal/Encoding/Utf32.hs +++ b/src/Data/Text/Internal/Encoding/Utf32.hs @@ -35,24 +35,20 @@ validate :: Word32 -> Bool validate x1 = x1 < 0xD800 || (x1 > 0xDFFF && x1 <= 0x10FFFF) {-# INLINE validate #-} -queryUtf32Bytes :: (Eq a, Num a) => a -> Maybe (Word8 -> Maybe (Word8 -> Maybe (Word8 -> Char))) -queryUtf32Bytes b0 = - if b0 == 0 - then Just $ \ b1@(W8# w1#) -> - if b1 < 0x11 - then Just $ \ b2@(W8# w2#) -> - if b1 > 0 || b2 < 0xD8 || b2 >= 0xE0 - then Just $ \ (W8# w3#) -> - C# (chr# - (orI# - (orI# - (word2Int# (shiftL# (word8ToWord# w1#) 16#)) - (word2Int# (shiftL# (word8ToWord# w2#) 8#)) - ) - (word2Int# (word8ToWord# w3#)) - ) - ) - else Nothing - else Nothing - else Nothing +queryUtf32Bytes :: (Eq a, Num a) => a -> Word8 -> Word8 -> Word8 -> Maybe Char +queryUtf32Bytes b0 b1@(W8# w1#) b2@(W8# w2#) (W8# w3#) + | b0 == 0 + , b1 < 0x11 + , b1 > 0 || b2 < 0xD8 || b2 >= 0xE0 = + Just $ + C# (chr# + (orI# + (orI# + (word2Int# (shiftL# (word8ToWord# w1#) 16#)) + (word2Int# (shiftL# (word8ToWord# w2#) 8#)) + ) + (word2Int# (word8ToWord# w3#)) + ) + ) + | otherwise = Nothing {-# INLINE queryUtf32Bytes #-} From 30a3ae1079331ca76fcbd492bbaa24252794143d Mon Sep 17 00:00:00 2001 From: david-sledge Date: Wed, 20 Jul 2022 22:22:52 -0600 Subject: [PATCH 25/87] Clarify DecodeResult documentation. Simplify WriteAndProgress data constructor. --- src/Data/Text.hs | 18 ++++++++- src/Data/Text/Encoding.hs | 56 +++++++++++++-------------- src/Data/Text/Encoding/Common.hs | 40 +------------------ src/Data/Text/Internal/Builder.hs | 11 ++++++ src/Data/Text/Lazy.hs | 18 ++++++++- tests/Tests/Properties/Transcoding.hs | 48 ----------------------- 6 files changed, 71 insertions(+), 120 deletions(-) diff --git a/src/Data/Text.hs b/src/Data/Text.hs index c1eda582..5176a0ab 100644 --- a/src/Data/Text.hs +++ b/src/Data/Text.hs @@ -355,10 +355,21 @@ instance Monoid Text where mappend = (<>) mconcat = concat +-- | Performs replacement on invalid scalar values: +-- +-- >>> :set -XOverloadedStrings +-- >>> "\55555" :: Text +-- "\65533" instance IsString Text where fromString = pack --- | @since 1.2.0.0 +-- | Performs replacement on invalid scalar values: +-- +-- >>> :set -XOverloadedLists +-- >>> ['\55555'] :: Text +-- "\65533" +-- +-- @since 1.2.0.0 instance Exts.IsList Text where type Item Text = Char fromList = pack @@ -445,7 +456,10 @@ compareText (Text arrA offA lenA) (Text arrB offB lenB) = -- * Conversion to/from 'Text' -- | /O(n)/ Convert a 'String' into a 'Text'. --- Performs replacement on invalid scalar values. +-- Performs replacement on invalid scalar values, so @'unpack' . 'pack'@ is not 'id': +-- +-- >>> unpack (pack "\55555") +-- "\65533" pack :: String -> Text pack = unstream . S.map safe . S.streamList {-# INLINE [1] pack #-} diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index 496f66de..af27cc50 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -74,10 +74,9 @@ module Data.Text.Encoding , encodeUtf8BuilderEscaped ) where -import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) - import Control.Exception (evaluate, try) import Control.Monad.ST (ST, runST) +import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) import Data.Bits (Bits, shiftL, shiftR, (.&.), (.|.)) import Data.ByteString (ByteString) import qualified Data.ByteString as B @@ -219,20 +218,23 @@ isValidBS bs = start 0 #endif #endif +data Progression + = WriteAndAdvance Char Int + | NeedMore + | Invalid + decodeChunks :: (Bits w, Num w, Storable w) => w - -> ( Int + -> ( ByteString -> ByteString -> Int -> A.MArray s -> Int -> Int - -> ST - s - (Maybe - ((A.MArray s - -> Int -> Int -> Int -> ST s (DecodeResult Text ByteString w)) - -> ST s (DecodeResult Text ByteString w))) + -> ST s (Maybe ( + (A.MArray s -> Int -> Int -> Int -> ST s (DecodeResult Text ByteString w)) + -> ST s (DecodeResult Text ByteString w)) + ) ) -> ((Int -> Word8) -> Int -> Int -> Progression) -> ByteString @@ -264,7 +266,7 @@ decodeChunks w queryOptimization decodeF bs1@(B.length -> len1) bs2@(B.length -> | len - srcOff < 1 = goodSoFar -- shortcut for utf-8 | otherwise = do - mOuterArgs <- queryOptimization len1 bs2 srcOff dst dstLen dstOff + mOuterArgs <- queryOptimization bs1 bs2 srcOff dst dstLen dstOff case mOuterArgs of Just outerArgs -> outerArgs outer _ -> if len - srcOff < wordByteSize @@ -279,9 +281,9 @@ decodeChunks w queryOptimization decodeF bs1@(B.length -> len1) bs2@(B.length -> outer dst' dstLen' srcOff dstOff else case decodeF index len srcOff of - WriteAndAdvance c advance -> do + WriteAndAdvance c srcOff' -> do d <- unsafeWrite dst dstOff c - inner (advance d) $ dstOff + d + inner srcOff' $ dstOff + d NeedMore -> goodSoFar Invalid -> invalid where @@ -305,7 +307,7 @@ decodeChunks w queryOptimization decodeF bs1@(B.length -> len1) bs2@(B.length -> DecodeResult t (Just $ bytesToWord wordByteSize 0) bs' srcOff' decodeChunksProxy :: (Bits w, Num w, Storable w) - => ( Int + => ( ByteString -> ByteString -> Int -> A.MArray s @@ -327,14 +329,14 @@ decodeChunksProxy = decodeChunks undefined -- This allows Haskell to -- discards the actual value without evaluating it. queryUtf8DecodeOptimization - :: Int + :: ByteString -> ByteString -> Int -> A.MArray s -> Int -> Int -> ST s (Maybe ((A.MArray s -> Int -> Int -> Int -> t) -> t)) -queryUtf8DecodeOptimization len1 bs2@(B.length -> len2) srcOff dst dstLen dstOff +queryUtf8DecodeOptimization (B.length -> len1) bs2@(B.length -> len2) srcOff dst dstLen dstOff | srcOff >= len1 -- potential valid utf8 content endpoint , utf8End <- len1 + guessUtf8Boundary @@ -379,14 +381,13 @@ decodeUtf8Chunks -> ByteString -- ^ The second `ByteString` chunk to decode. -> DecodeResult Text ByteString Word8 decodeUtf8Chunks bs1 bs2 = runST $ decodeChunksProxy queryUtf8DecodeOptimization (\ index len srcOff -> - let decodeFrom off = step (off + 1) . utf8DecodeStart $ index off - - step i (Incomplete a b) + let step i (Incomplete a b) | i < len = step (i + 1) $ utf8DecodeContinue (index i) a b - step _ st = st + step i st = (st, i) + (dr, srcOff') = step (srcOff + 1) . utf8DecodeStart $ index srcOff in - case decodeFrom srcOff of - Accept c -> WriteAndAdvance c (srcOff +) + case dr of + Accept c -> WriteAndAdvance c srcOff' Reject -> Invalid Incomplete{} -> NeedMore) bs1 bs2 @@ -404,12 +405,12 @@ decodeUtf16Chunks -> DecodeResult Text ByteString Word16 decodeUtf16Chunks isBE bs1 bs2 = runST $ decodeChunksProxy noOptimization (\ index len srcOff -> -- get next Word8 pair - let writeAndAdvance c n = WriteAndAdvance c $ const n + let writeAndAdvance c n = WriteAndAdvance c $ srcOff + n b0 = index $ if isBE then srcOff else srcOff + 1 b1 = index $ if isBE then srcOff + 1 else srcOff in case queryUtf16Bytes b0 b1 of - OneWord16 c -> writeAndAdvance c $ srcOff + 2 + OneWord16 c -> writeAndAdvance c 2 TwoWord16 g -> if len - srcOff < 4 -- not enough Word8s to finish the code point @@ -419,7 +420,7 @@ decodeUtf16Chunks isBE bs1 bs2 = runST $ decodeChunksProxy noOptimization (\ ind b3 = index $ srcOff + (if isBE then 3 else 2) in case g b2 b3 of - Just c -> writeAndAdvance c $ srcOff + 4 + Just c -> writeAndAdvance c 4 _ -> Invalid _ -> Invalid) bs1 bs2 @@ -438,14 +439,9 @@ decodeUtf32Chunks isBE bs1 bs2 = runST $ decodeChunksProxy noOptimization (\ ind (index $ srcOff + (if isBE then 1 else 2)) (index $ srcOff + (if isBE then 2 else 1)) (index $ if isBE then srcOff + 3 else srcOff) of - Just c -> WriteAndAdvance c . const $ srcOff + 4 + Just c -> WriteAndAdvance c $ srcOff + 4 _ -> Invalid) bs1 bs2 -data Progression - = WriteAndAdvance Char (Int -> Int) - | NeedMore - | Invalid - -- | Decode a 'ByteString' containing 7-bit ASCII encoded text. -- -- This is a total function: On success the decoded 'Text' is within a diff --git a/src/Data/Text/Encoding/Common.hs b/src/Data/Text/Encoding/Common.hs index dfd9f05d..385c6025 100644 --- a/src/Data/Text/Encoding/Common.hs +++ b/src/Data/Text/Encoding/Common.hs @@ -21,10 +21,8 @@ module Data.Text.Encoding.Common ( - -- * Full-service result types and functions + -- * Full-service result type DecodeResult(..) - , DecodeResultHandler - , chunksDecoderToStream -- * Error handling types , UnicodeException(..) , OnError @@ -49,7 +47,7 @@ import Numeric (showHex) -- -- 1. the decoded data up to an incomplete code point at the end of -- the input data, an invalid word, or to the end of the input, --- 2. the invalid code point if one was encountered, +-- 2. the first word of an invalid code point if one was encountered, -- 3. the remaining undecoded data, which is either an incomplete -- code point, the data after the invalid code point, or empty, -- and @@ -59,40 +57,6 @@ import Numeric (showHex) data DecodeResult t b w = DecodeResult !t !(Maybe w) !b !Int deriving (Eq, Ord, Show, Read) --- | The funtion type for handling successful decodings. The arguments --- passed to the handler are: --- --- 1. the decoded data up to an incomplete code point at the end of --- the input data, an invalid word, or to the end of the input, --- 2. the invalid code point if one was encountered, --- 3. the remaining undecoded data, which is either an incomplete --- code point, the data after the invalid code point, or empty, --- 4. the byte position of remaining undecoded data, and --- 5. a continuation function that accepts data that as treated as --- a continuation of any remaining unencoded data. --- --- @since 2.0.1 -type DecodeResultHandler t w b b' r = t -> Maybe w -> b -> Int -> (b' -> r) -> r - --- | Create a stream decoder from a chunks decoder. The resulting --- stream decoder accepts a 'DecodeResultHandler' to process the decode --- result. --- --- @since 2.0.1 -chunksDecoderToStream :: Monoid b - => (b -> b' -> DecodeResult t b w) -- ^ Chunks decoder - -> b' -- ^ Encoded data - -> DecodeResultHandler t w b b' r -- ^ Result continuation - -> r -- ^ Continuation result -chunksDecoderToStream chunksDecoder bs f = - g mempty bs 0 - where - g bs0 bs1 pos0 = - let DecodeResult t mW bs1' pos1 = chunksDecoder bs0 bs1 - pos = pos0 + pos1 - in - f t mW bs1' pos $ \ bs2 -> g bs1' bs2 pos - -- | Function type for handling a coding error. It is supplied with -- two inputs: -- diff --git a/src/Data/Text/Internal/Builder.hs b/src/Data/Text/Internal/Builder.hs index 590c6528..6269c597 100644 --- a/src/Data/Text/Internal/Builder.hs +++ b/src/Data/Text/Internal/Builder.hs @@ -108,6 +108,11 @@ instance Monoid Builder where mconcat = foldr mappend Data.Monoid.mempty {-# INLINE mconcat #-} +-- | Performs replacement on invalid scalar values: +-- +-- >>> :set -XOverloadedStrings +-- >>> "\55555" :: Builder +-- "\65533" instance String.IsString Builder where fromString = fromString {-# INLINE fromString #-} @@ -181,6 +186,12 @@ fromText t@(Text arr off l) -- -- * @'toLazyText' ('fromString' s) = 'L.fromChunks' [S.pack s]@ -- +-- Performs replacement on invalid scalar values: +-- +-- >>> fromString "\55555" +-- "\65533" +-- +-- @since 1.2.0.0 fromString :: String -> Builder fromString str = Builder $ \k (Buffer p0 o0 u0 l0) -> let loop !marr !o !u !l [] = k (Buffer marr o u l) diff --git a/src/Data/Text/Lazy.hs b/src/Data/Text/Lazy.hs index 9e2638eb..39834a8b 100644 --- a/src/Data/Text/Lazy.hs +++ b/src/Data/Text/Lazy.hs @@ -316,10 +316,21 @@ instance Monoid Text where mappend = (<>) mconcat = concat +-- | Performs replacement on invalid scalar values: +-- +-- >>> :set -XOverloadedStrings +-- >>> "\55555" :: Data.Text.Lazy.Text +-- "\65533" instance IsString Text where fromString = pack --- | @since 1.2.0.0 +-- | Performs replacement on invalid scalar values: +-- +-- >>> :set -XOverloadedLists +-- >>> ['\55555'] :: Data.Text.Lazy.Text +-- "\65533" +-- +-- @since 1.2.0.0 instance Exts.IsList Text where type Item Text = Char fromList = pack @@ -372,7 +383,10 @@ textDataType = mkDataType "Data.Text.Lazy.Text" [packConstr] -- | /O(n)/ Convert a 'String' into a 'Text'. -- --- Performs replacement on invalid scalar values. +-- Performs replacement on invalid scalar values, so @'unpack' . 'pack'@ is not 'id': +-- +-- >>> Data.Text.Lazy.unpack (Data.Text.Lazy.pack "\55555") +-- "\65533" pack :: #if defined(ASSERTS) HasCallStack => diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs index 0fcdb7ef..580d15cc 100644 --- a/tests/Tests/Properties/Transcoding.hs +++ b/tests/Tests/Properties/Transcoding.hs @@ -8,7 +8,6 @@ module Tests.Properties.Transcoding import Data.Bits ((.&.), shiftR) import Data.Char (chr, ord) -import Data.Maybe (fromMaybe) import Test.QuickCheck hiding ((.&.)) import Test.Tasty (TestTree, testGroup) import Test.Tasty.QuickCheck (testProperty) @@ -300,50 +299,6 @@ t_decode_utf8_lenient :: Property t_decode_utf8_lenient = forAllShrinkShow arbitrary shrink (show . BL.toChunks) $ \bs -> decodeLL bs === (TL.fromStrict . decodeL . B.concat . BL.toChunks) bs -decodeStream decoder snoc bs = - case decoder bs $ \ t mW b p f -> - case mW of - Just w -> - let t' = (case C.lenientDecode "" $ Just w of - Just c -> t `snoc` c - _ -> t) - in - case f mempty of - (tDiff, bp) -> (mappend t' `mappend` tDiff, bp) - _ -> (mappend t, (b, p)) of - (tDiff, (b, p)) -> (tDiff mempty, b, p) - -decodeUtf8StreamL = C.chunksDecoderToStream EL.decodeUtf8Chunks - -decodeUtf8StreamS = C.chunksDecoderToStream E.decodeUtf8Chunks - --- The results of the stream decoder on a strict bytestring should --- match the results of a non-stream decoder on a lazy bytestring. -t_decode_utf8_lenient_stream :: Property -t_decode_utf8_lenient_stream = forAllShrinkShow arbitrary shrink (show . BL.toChunks) $ \bs -> - let (st, sb, _) = decodeStream decodeUtf8StreamS T.snoc . B.concat $ BL.toChunks bs - st' = B.foldr (\ w st'' -> st'' `T.snoc` (fromMaybe 'a' . C.lenientDecode "" $ Just w)) st sb - in - decodeLL bs === TL.fromStrict st' - --- The results of the non-stream decoder on a strict bytestring should --- match the results of a stream decoder on a lazy bytestring. -t_decode_utf8_stream_lenient :: Property -t_decode_utf8_stream_lenient = forAllShrinkShow arbitrary shrink (show . BL.toChunks) $ \bs -> - let (lt, lb, _) = decodeStream decodeUtf8StreamL TL.snoc bs - lt' = BL.foldr (\ w lt'' -> lt'' `TL.snoc` (fromMaybe 'a' . C.lenientDecode "" $ Just w)) lt lb - in - lt' === (TL.fromStrict . decodeL . B.concat . BL.toChunks) bs - --- The decoding of lazy bytestrings should not depend on how they are chunked, --- and it should behave the same as decoding of strict bytestrings. -t_decode_utf8_stream :: Property -t_decode_utf8_stream = forAllShrinkShow arbitrary shrink (show . BL.toChunks) $ \bs -> - decodeStream decodeUtf8StreamL TL.snoc bs === - ( let (st, sb, pos) = decodeStream decodeUtf8StreamS T.snoc . B.concat $ BL.toChunks bs in - (TL.fromStrict st, BL.fromStrict sb, pos) - ) - -- See http://unicode.org/faq/utf_bom.html#gen8 -- A sequence such as <110xxxxx2 0xxxxxxx2> is illegal ... -- When faced with this illegal byte sequence ... a UTF-8 conformant process @@ -431,9 +386,6 @@ testTranscoding = testProperty "t_chunk_decode_utf32BE" t_chunk_decode_utf32BE, testProperty "t_chunk_decode_utf32LE" t_chunk_decode_utf32LE, testProperty "t_decode_utf8_lenient" t_decode_utf8_lenient, - testProperty "t_decode_utf8_stream_lenient" t_decode_utf8_stream_lenient, - testProperty "t_decode_utf8_lenient_stream" t_decode_utf8_lenient_stream, - testProperty "t_decode_utf8_stream" t_decode_utf8_stream, testProperty "t_decode_with_error2" t_decode_with_error2, testProperty "t_decode_with_error3" t_decode_with_error3, testProperty "t_decode_with_error4" t_decode_with_error4, From 0b4bceaf04da164a547f42301766c38be7d4d8bb Mon Sep 17 00:00:00 2001 From: david-sledge Date: Sat, 23 Jul 2022 13:19:13 -0600 Subject: [PATCH 26/87] Replaced decodeAsciiE with decodeAsciiChunks (lazy and strict) --- changelog.md | 2 +- src/Data/Text/Encoding.hs | 327 +++++++++++++------------- src/Data/Text/Encoding/Common.hs | 2 +- src/Data/Text/Lazy/Encoding.hs | 44 ++-- tests/Tests/Properties/Transcoding.hs | 56 ++++- 5 files changed, 240 insertions(+), 191 deletions(-) diff --git a/changelog.md b/changelog.md index b0347e05..f198612a 100644 --- a/changelog.md +++ b/changelog.md @@ -207,7 +207,7 @@ * Bumped lower bound on deepseq to 1.4 for compatibility with the upcoming GHC 7.10 -#### 1.2.0.1 +#### 1.2.0.2 * Fixed a buffer overflow in rendering of large Integers (https://github.com/bos/text/issues/99) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index af27cc50..799fedcb 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -32,6 +32,7 @@ module Data.Text.Encoding decodeLatin1 , decodeUtf8Lenient , DecodeResult(..) + , decodeAsciiChunks , decodeUtf8Chunks , decodeUtf16Chunks , decodeUtf32Chunks @@ -40,7 +41,6 @@ module Data.Text.Encoding , decodeUtf8' -- *** Controllable error handling - , decodeAsciiE , decodeUtf8With , decodeUtf16LEWith , decodeUtf16BEWith @@ -81,7 +81,6 @@ import Data.Bits (Bits, shiftL, shiftR, (.&.), (.|.)) import Data.ByteString (ByteString) import qualified Data.ByteString as B import qualified Data.ByteString.Internal as B -import qualified Data.ByteString.Short.Internal as SBS import Data.Text.Encoding.Common (DecodeResult(..), OnDecodeError, UnicodeException, strictDecode, lenientDecode) import Data.Text.Internal (Text(..), empty, append) import Data.Text.Internal.Unsafe (unsafeWithForeignPtr) @@ -147,13 +146,11 @@ import Data.Text.Internal.Encoding.Utf8 (CodePoint(..)) -- anything except ASCII and copies buffer or throws an error otherwise. -- decodeASCII :: ByteString -> Text -decodeASCII bs = withBS bs $ \fp len -> if len == 0 then empty else runST $ do - asciiPrefixLen <- fmap cSizeToInt $ unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> - c_is_ascii src (src `plusPtr` len) - if asciiPrefixLen == len - then let !(SBS.SBS arr) = SBS.toShort bs in - return (Text (A.ByteArray arr) 0 len) - else error $ "decodeASCII: detected non-ASCII codepoint at " ++ show asciiPrefixLen +decodeASCII bs = case decodeAsciiChunks bs mempty of + DecodeResult t mW _ pos -> + case mW of + Just _ -> error $ "decodeASCII: detected non-ASCII codepoint at " ++ show (pos - 1) + _ -> t -- | Decode a 'ByteString' containing Latin-1 (aka ISO-8859-1) encoded text. -- @@ -171,7 +168,7 @@ decodeLatin1 :: decodeLatin1 bs = withBS bs $ \fp len -> runST $ do dst <- A.new (2 * len) let inner srcOff dstOff = if srcOff >= len then return dstOff else do - asciiPrefixLen <- fmap cSizeToInt $ unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> + asciiPrefixLen <- fmap fromIntegral . unsafeIOToST . unsafeWithForeignPtr fp $ \src -> c_is_ascii (src `plusPtr` srcOff) (src `plusPtr` len) if asciiPrefixLen == 0 then do @@ -192,45 +189,23 @@ decodeLatin1 bs = withBS bs $ \fp len -> runST $ do foreign import ccall unsafe "_hs_text_is_ascii" c_is_ascii :: Ptr Word8 -> Ptr Word8 -> IO CSize -isValidBS :: ByteString -> Bool -#ifdef SIMDUTF -isValidBS bs = withBS bs $ \fp len -> unsafeDupablePerformIO $ - unsafeWithForeignPtr fp $ \ptr -> (/= 0) <$> c_is_valid_utf8 ptr (fromIntegral len) -#else -#if MIN_VERSION_bytestring(0,11,2) -isValidBS = B.isValidUtf8 -#else -isValidBS bs = start 0 - where - start ix - | ix >= B.length bs = True - | otherwise = case utf8DecodeStart (B.unsafeIndex bs ix) of - Accept{} -> start (ix + 1) - Reject{} -> False - Incomplete st _ -> step (ix + 1) st - step ix st - | ix >= B.length bs = False - -- We do not use decoded code point, so passing a dummy value to save an argument. - | otherwise = case utf8DecodeContinue (B.unsafeIndex bs ix) st (CodePoint 0) of - Accept{} -> start (ix + 1) - Reject{} -> False - Incomplete st' _ -> step (ix + 1) st' -#endif -#endif - data Progression = WriteAndAdvance Char Int | NeedMore | Invalid -decodeChunks :: (Bits w, Num w, Storable w) +decodeChunks :: +#if defined(ASSERTS) + HasCallStack => +#endif + (Bits w, Num w, Storable w) => w - -> ( ByteString + -> ( Int -> ByteString - -> Int -> A.MArray s -> Int -> Int + -> Int -> ST s (Maybe ( (A.MArray s -> Int -> Int -> Int -> ST s (DecodeResult Text ByteString w)) -> ST s (DecodeResult Text ByteString w)) @@ -240,33 +215,32 @@ decodeChunks :: (Bits w, Num w, Storable w) -> ByteString -> ByteString -> ST s (DecodeResult Text ByteString w) -decodeChunks w queryOptimization decodeF bs1@(B.length -> len1) bs2@(B.length -> len2) - | len2 == 0 - , len1 > 0 = decodeChunks w queryOptimization decodeF bs2 bs1 - | otherwise = do - marr <- A.new len' - outer marr len' 0 0 - where - wordByteSize = sizeOf w - - index :: Int -> Word8 - index i - | i < len1 = B.index bs1 i - | otherwise = B.index bs2 $ i - len1 - - len :: Int - len = len1 + len2 - len' :: Int - len' = (len `div` wordByteSize) + 4 - - outer dst dstLen = inner - where - inner srcOff dstOff - -- finished - | len - srcOff < 1 = goodSoFar - -- shortcut for utf-8 - | otherwise = do - mOuterArgs <- queryOptimization bs1 bs2 srcOff dst dstLen dstOff +decodeChunks w bulkDecodeF incrementalDecodeF bs1@(B.length -> len1) bs2@(B.length -> len2) = do + marr <- A.new len' + outer marr len' 0 0 + where + wordByteSize = sizeOf w + + index :: Int -> Word8 + index i + | i < len1 = B.index bs1 i + | otherwise = B.index bs2 $ i - len1 + + len :: Int + len = len1 + len2 + len' :: Int + len' = len `div` wordByteSize + 4 + + outer dst dstLen = inner + where + inner srcOff dstOff + | srcOff < len1 = decode 0 bs1 + | srcOff < len = decode len1 bs2 + -- finished (for now) + | otherwise = goodSoFar + where + decode bsOff bs = do + mOuterArgs <- bulkDecodeF bsOff bs dst dstLen srcOff dstOff case mOuterArgs of Just outerArgs -> outerArgs outer _ -> if len - srcOff < wordByteSize @@ -280,39 +254,37 @@ decodeChunks w queryOptimization decodeF bs1@(B.length -> len1) bs2@(B.length -> dst' <- A.resizeM dst dstLen' outer dst' dstLen' srcOff dstOff else - case decodeF index len srcOff of + case incrementalDecodeF index len srcOff of WriteAndAdvance c srcOff' -> do d <- unsafeWrite dst dstOff c inner srcOff' $ dstOff + d NeedMore -> goodSoFar Invalid -> invalid - where - contin off res = do - A.shrinkM dst dstOff - arr <- A.unsafeFreeze dst - pure . res (Text arr 0 dstOff) $ if off >= len1 - then B.drop (off - len1) bs2 - else B.drop off $ bs1 `B.append` bs2 - goodSoFar = - contin srcOff $ \ t bs' -> - DecodeResult t Nothing bs' srcOff - invalid = - let srcOff' = srcOff + wordByteSize - bytesToWord n word = - if n > 0 - then bytesToWord (n - 1) $ (fromIntegral . index $ srcOff + wordByteSize - n) .|. (word `shiftL` 8) - else word - in - contin srcOff' $ \ t bs' -> - DecodeResult t (Just $ bytesToWord wordByteSize 0) bs' srcOff' + contin off res = do + A.shrinkM dst dstOff + arr <- A.unsafeFreeze dst + pure . res (Text arr 0 dstOff) $ if off >= len1 + then B.drop (off - len1) bs2 + else B.drop off $ bs1 `B.append` bs2 + goodSoFar = + contin srcOff $ \ t bs' -> + DecodeResult t Nothing bs' srcOff + invalid = + let srcOff' = srcOff + wordByteSize + bytesToWord n word + | n > 0 = bytesToWord (n - 1) $ (fromIntegral . index $ srcOff + wordByteSize - n) .|. (word `shiftL` 8) + | otherwise = word + in + contin srcOff' $ \ t bs' -> + DecodeResult t (Just $ bytesToWord wordByteSize 0) bs' srcOff' decodeChunksProxy :: (Bits w, Num w, Storable w) - => ( ByteString + => ( Int -> ByteString - -> Int -> A.MArray s -> Int -> Int + -> Int -> ST s (Maybe ( (A.MArray s -> Int -> Int -> Int -> ST s (DecodeResult Text ByteString w)) -> ST s (DecodeResult Text ByteString w) @@ -328,82 +300,140 @@ decodeChunksProxy = decodeChunks undefined -- This allows Haskell to -- so that it doesn't have to be passed as an arugment. Storable.sizeOf -- discards the actual value without evaluating it. -queryUtf8DecodeOptimization - :: ByteString - -> ByteString +bulkCharCopy + :: Ptr Word8 -> Int -> A.MArray s -> Int -> Int + -> IO (A.MArray s, Int, Int) +bulkCharCopy src srcLen dst dstLen dstOff = do + let minLen = dstOff + srcLen + (dst', dstLen') <- + if minLen > dstLen + then + let newLen = minLen + 4 in + (, newLen) <$> (unsafeSTToIO $ A.resizeM dst newLen) + else pure (dst, dstLen) + unsafeSTToIO $ A.copyFromPointer dst' dstOff src srcLen + pure (dst', dstLen', minLen) + +-- | Decode two 'ByteString's containing ASCII as though they were one +-- continuous 'ByteString' returning a 'DecodeResult'. +-- +-- @since 2.0.2 +decodeAsciiChunks + :: ByteString -- ^ The first 'ByteString' chunk to decode. Typically this is the unencoded data from the previous call of this function. + -> ByteString -- ^ The second 'ByteString' chunk to decode. + -> DecodeResult Text ByteString Word8 +decodeAsciiChunks bs1 bs2 = runST $ decodeChunksProxy ( + \ bsOff bs dst dstLen srcOff dstOff -> withBS bs $ \ fp len -> + unsafeIOToST . unsafeWithForeignPtr fp $ \src -> do + let srcOff' = srcOff - bsOff + asciiPrefixLen <- fmap fromIntegral . c_is_ascii (src `plusPtr` srcOff') $ src `plusPtr` len + if asciiPrefixLen == 0 + then pure Nothing + else do + (dst', dstLen', dstOff') <- bulkCharCopy src asciiPrefixLen dst dstLen dstOff + pure . Just $ \ f -> f dst' dstLen' (srcOff' + asciiPrefixLen + bsOff) dstOff' + ) (\ _ _ _ -> Invalid) bs1 bs2 + +decodeUtf8Bulk + :: Int + -> ByteString + -> A.MArray s + -> Int + -> Int + -> Int -> ST s (Maybe ((A.MArray s -> Int -> Int -> Int -> t) -> t)) -queryUtf8DecodeOptimization (B.length -> len1) bs2@(B.length -> len2) srcOff dst dstLen dstOff - | srcOff >= len1 +decodeUtf8Bulk bsOff bs dst dstLen srcOff dstOff -- potential valid utf8 content endpoint - , utf8End <- len1 + guessUtf8Boundary + | utf8End <- bsOff + guessUtf8Boundary bs + -- not yet reached the UTF-8 boundary , srcOff < utf8End -- potential valid utf8 content length , utf8Len <- utf8End - srcOff - , bs' <- B.drop (srcOff - len1) $ B.take guessUtf8Boundary bs2 - , isValidBS bs' - , minLen <- utf8Len + dstOff = do - (dst', dstLen') <- - if minLen > dstLen - then - let newLen = minLen + 4 in - (, newLen) <$> A.resizeM dst newLen - else pure (dst, dstLen) - withBS bs' $ \fp _ -> unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> - unsafeSTToIO $ A.copyFromPointer dst' dstOff src utf8Len - pure . Just $ \ f -> f dst' dstLen' (srcOff + utf8Len) minLen + -- this ByteString chunk's offset + , srcOff2 <- srcOff - bsOff + -- check for UTF-8 compliance + , isValidBS bs srcOff2 utf8Len = do + (dst', dstLen', dstOff') <- withBS bs $ \ fp _ -> + unsafeIOToST $ unsafeWithForeignPtr fp $ \ src -> + bulkCharCopy (src `plusPtr` srcOff2) utf8Len dst dstLen dstOff + pure . Just $ \ f -> f dst' dstLen' (srcOff + utf8Len) dstOff' | otherwise = pure Nothing where -- We need Data.ByteString.findIndexEnd, but it is unavailable before bytestring-0.10.12.0 - guessUtf8Boundary :: Int - guessUtf8Boundary - | len2 >= 1 && w0 < 0x80 = len2 -- last char is ASCII - | len2 >= 1 && w0 >= 0xC0 = len2 - 1 -- last char starts a code point - | len2 >= 2 && w1 >= 0xC0 = len2 - 2 -- pre-last char starts a code point - | len2 >= 3 && w2 >= 0xC0 = len2 - 3 - | len2 >= 4 && w3 >= 0xC0 = len2 - 4 - | otherwise = 0 + guessUtf8Boundary :: ByteString -> Int + guessUtf8Boundary bs'@(B.length -> len) + | len >= 1 && w 1 < 0x80 = len -- last char is ASCII + | len >= 1 && w 1 >= 0xc0 = len - 1 -- last char starts a two-(or more-)word code point + | len >= 2 && w 2 >= 0xe0 = len - 2 -- pre-last char starts a three- or four-word code point + | len >= 3 && w 3 >= 0xf0 = len - 3 -- third to last char starts a four-word code point + | otherwise = 0 -- gonna have to resolve this with an incremental approach where - w0 = B.index bs2 (len2 - 1) - w1 = B.index bs2 (len2 - 2) - w2 = B.index bs2 (len2 - 3) - w3 = B.index bs2 (len2 - 4) + w n = B.index bs' (len - n) + + isValidBS :: ByteString -> Int -> Int -> Bool + isValidBS bs' off count = if off + count > B.length bs' + then False + else +#ifdef SIMDUTF + withBS bs' $ \ fp _ -> unsafeDupablePerformIO $ + unsafeWithForeignPtr fp $ \ ptr -> (/= 0) <$> c_is_valid_utf8 (ptr `plusPtr` off) (fromIntegral count) +#else +#if MIN_VERSION_bytestring(0,11,2) + B.isValidUtf8 (B.take count $ B.drop off bs') +#else + start off + where + start ix + | ix >= count = True + | otherwise = case utf8DecodeStart (B.unsafeIndex bs' ix) of + Accept{} -> start (ix + 1) + Reject{} -> False + Incomplete st _ -> step (ix + 1) st + step ix st + | ix >= count = False + -- We do not use decoded code point, so passing a dummy value to save an argument. + | otherwise = case utf8DecodeContinue (B.unsafeIndex bs' ix) st (CodePoint 0) of + Accept{} -> start (ix + 1) + Reject{} -> False + Incomplete st' _ -> step (ix + 1) st' +#endif +#endif -- | Decode two 'ByteString's containing UTF-8-encoded text as though -- they were one continuous 'ByteString' returning a 'DecodeResult'. -- --- @since 2.0.1 +-- @since 2.0.2 decodeUtf8Chunks - :: ByteString -- ^ The first `ByteString` chunk to decode. Typically this is the unencoded data from the previous call of this function. - -> ByteString -- ^ The second `ByteString` chunk to decode. + :: ByteString -- ^ The first 'ByteString' chunk to decode. Typically this is the unencoded data from the previous call of this function. + -> ByteString -- ^ The second 'ByteString' chunk to decode. -> DecodeResult Text ByteString Word8 -decodeUtf8Chunks bs1 bs2 = runST $ decodeChunksProxy queryUtf8DecodeOptimization (\ index len srcOff -> +decodeUtf8Chunks bs1 bs2 = runST $ decodeChunksProxy decodeUtf8Bulk (\ index len srcOff -> let step i (Incomplete a b) | i < len = step (i + 1) $ utf8DecodeContinue (index i) a b step i st = (st, i) - (dr, srcOff') = step (srcOff + 1) . utf8DecodeStart $ index srcOff in - case dr of - Accept c -> WriteAndAdvance c srcOff' - Reject -> Invalid - Incomplete{} -> NeedMore) bs1 bs2 + case step (srcOff + 1) . utf8DecodeStart $ index srcOff of + (Accept c, srcOff') -> WriteAndAdvance c srcOff' + (Reject, _) -> Invalid + (Incomplete{}, _) -> NeedMore) bs1 bs2 -noOptimization :: Applicative f => p0 -> p1 -> p2 -> p3 -> p4 -> p5 -> f (Maybe a) -noOptimization _ _ _ _ _ _ = pure Nothing +noBulkDecodeOp :: Applicative f => p0 -> p1 -> p2 -> p3 -> p4 -> p5 -> f (Maybe a) +noBulkDecodeOp _ _ _ _ _ _ = pure Nothing -- | Decode two 'ByteString's containing UTF-16-encoded text as though -- they were one continuous 'ByteString' returning a 'DecodeResult'. -- --- @since 2.0.1 +-- @since 2.0.2 decodeUtf16Chunks - :: Bool -- ^ Indicates whether the encoding is big-endian (`True`) or little-endian (`False`) - -> ByteString -- ^ The first `ByteString` chunk to decode. Typically this is the unencoded data from the previous call of this function. - -> ByteString -- ^ The second `ByteString` chunk to decode. + :: Bool -- ^ Indicates whether the encoding is big-endian ('True') or little-endian ('False') + -> ByteString -- ^ The first 'ByteString' chunk to decode. Typically this is the unencoded data from the previous call of this function. + -> ByteString -- ^ The second 'ByteString' chunk to decode. -> DecodeResult Text ByteString Word16 -decodeUtf16Chunks isBE bs1 bs2 = runST $ decodeChunksProxy noOptimization (\ index len srcOff -> +decodeUtf16Chunks isBE bs1 bs2 = runST $ decodeChunksProxy noBulkDecodeOp (\ index len srcOff -> -- get next Word8 pair let writeAndAdvance c n = WriteAndAdvance c $ srcOff + n b0 = index $ if isBE then srcOff else srcOff + 1 @@ -427,13 +457,13 @@ decodeUtf16Chunks isBE bs1 bs2 = runST $ decodeChunksProxy noOptimization (\ ind -- | Decode two 'ByteString's containing UTF-16-encoded text as though -- they were one continuous 'ByteString' returning a 'DecodeResult'. -- --- @since 2.0.1 +-- @since 2.0.2 decodeUtf32Chunks - :: Bool -- ^ Indicates whether the encoding is big-endian (`True`) or little-endian (`False`) - -> ByteString -- ^ The first `ByteString` chunk to decode. Typically this is the unencoded data from the previous call of this function. - -> ByteString -- ^ The second `ByteString` chunk to decode. + :: Bool -- ^ Indicates whether the encoding is big-endian ('True') or little-endian ('False') + -> ByteString -- ^ The first 'ByteString' chunk to decode. Typically this is the unencoded data from the previous call of this function. + -> ByteString -- ^ The second 'ByteString' chunk to decode. -> DecodeResult Text ByteString Word32 -decodeUtf32Chunks isBE bs1 bs2 = runST $ decodeChunksProxy noOptimization (\ index _ srcOff -> +decodeUtf32Chunks isBE bs1 bs2 = runST $ decodeChunksProxy noBulkDecodeOp (\ index _ srcOff -> -- get next Word8 quartet case queryUtf32Bytes (index $ if isBE then srcOff else srcOff + 3) (index $ srcOff + (if isBE then 1 else 2)) @@ -442,22 +472,6 @@ decodeUtf32Chunks isBE bs1 bs2 = runST $ decodeChunksProxy noOptimization (\ ind Just c -> WriteAndAdvance c $ srcOff + 4 _ -> Invalid) bs1 bs2 --- | Decode a 'ByteString' containing 7-bit ASCII encoded text. --- --- This is a total function: On success the decoded 'Text' is within a --- 'Right' value, and an error @('Left' 'Int')@ indicates the position --- of the offending 'Word8'. --- --- @since 2.0.1 -decodeAsciiE :: ByteString -> Either Int Text -decodeAsciiE bs = withBS bs $ \fp len -> if len == 0 then Right empty else runST $ do - asciiPrefixLen <- fmap cSizeToInt $ unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> - c_is_ascii src (src `plusPtr` len) - pure $ if asciiPrefixLen == len - then let !(SBS.SBS arr) = SBS.toShort bs in - Right (Text (A.ByteArray arr) 0 len) - else Left asciiPrefixLen - -- | Decode a 'ByteString' containing UTF-8 encoded text. -- -- Surrogate code points in replacement character returned by 'OnDecodeError' @@ -783,9 +797,6 @@ encodeUtf32BE :: Text -> ByteString encodeUtf32BE txt = E.unstream (E.restreamUtf32BE (F.stream txt)) {-# INLINE encodeUtf32BE #-} -cSizeToInt :: CSize -> Int -cSizeToInt = fromIntegral - #ifdef SIMDUTF foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8 :: Ptr Word8 -> CSize -> IO CInt diff --git a/src/Data/Text/Encoding/Common.hs b/src/Data/Text/Encoding/Common.hs index 385c6025..0f3bd2db 100644 --- a/src/Data/Text/Encoding/Common.hs +++ b/src/Data/Text/Encoding/Common.hs @@ -53,7 +53,7 @@ import Numeric (showHex) -- and -- 4. the byte position of remaining undecoded data. -- --- @since 2.0.1 +-- @since 2.0.2 data DecodeResult t b w = DecodeResult !t !(Maybe w) !b !Int deriving (Eq, Ord, Show, Read) diff --git a/src/Data/Text/Lazy/Encoding.hs b/src/Data/Text/Lazy/Encoding.hs index 14fa6ef8..992654fa 100644 --- a/src/Data/Text/Lazy/Encoding.hs +++ b/src/Data/Text/Lazy/Encoding.hs @@ -23,8 +23,8 @@ module Data.Text.Lazy.Encoding -- ** Total Functions #total# -- $total decodeLatin1 - , decodeAsciiE , DecodeResult(..) + , decodeAsciiChunks , decodeUtf8Chunks , decodeUtf16Chunks , decodeUtf32Chunks @@ -109,16 +109,6 @@ decodeASCII = foldr (chunk . TE.decodeASCII) empty . B.toChunks decodeLatin1 :: B.ByteString -> Text decodeLatin1 = foldr (chunk . TE.decodeLatin1) empty . B.toChunks --- | Decode a 'ByteString' containing 7-bit ASCII encoded text. --- --- This is a total function: On success the decoded 'Text' is within a --- 'Right' value, and an error ('Left' 'Int') indicates the postion of --- the offending 'Word8'. --- --- @since 2.0.1 -decodeAsciiE :: B.ByteString -> Either Int Text -decodeAsciiE = foldr (\ lb -> (chunk <$> TE.decodeAsciiE lb <*>)) (pure empty) . B.toChunks - decodeChunks :: (S.ByteString -> S.ByteString -> DecodeResult T.Text S.ByteString w) -> B.ByteString -> B.ByteString @@ -135,35 +125,45 @@ decodeChunks decoder = g id 0 mempty g tDiff pos sb0 _ (B.Chunk sb1 lb1) = g tDiff pos sb0 (B.Chunk sb1 lb1) mempty g tDiff pos sb0 _ _ = DecodeResult (tDiff Empty) Nothing (B.chunk sb0 mempty) pos +-- | Decode two 'ByteString's containing ASCII as though they were one +-- continuous 'ByteString' returning a 'DecodeResult'. +-- +-- @since 2.0.2 +decodeAsciiChunks + :: B.ByteString -- ^ The first 'B.ByteString' chunk to decode. Typically this is the unencoded data from the previous call of this function. + -> B.ByteString -- ^ The second 'B.ByteString' chunk to decode. + -> DecodeResult Text B.ByteString Word8 +decodeAsciiChunks = decodeChunks TE.decodeAsciiChunks + -- | Decode two 'B.ByteString's containing UTF-8-encoded text as though -- they were one continuous 'B.ByteString' returning a 'DecodeResult'. -- --- @since 2.0.1 +-- @since 2.0.2 decodeUtf8Chunks - :: B.ByteString -- ^ The first `B.ByteString` chunk to decode. Typically this is the unencoded data from the previous call of this function. - -> B.ByteString -- ^ The second `B.ByteString` chunk to decode. + :: B.ByteString -- ^ The first 'B.ByteString' chunk to decode. Typically this is the unencoded data from the previous call of this function. + -> B.ByteString -- ^ The second 'B.ByteString' chunk to decode. -> DecodeResult Text B.ByteString Word8 decodeUtf8Chunks = decodeChunks TE.decodeUtf8Chunks -- | Decode two 'B.ByteString's containing UTF-16-encoded text as though -- they were one continuous 'B.ByteString' returning a 'DecodeResult'. -- --- @since 2.0.1 +-- @since 2.0.2 decodeUtf16Chunks - :: Bool -- ^ Indicates whether the encoding is big-endian (`True`) or little-endian (`False`) - -> B.ByteString -- ^ The first `B.ByteString` chunk to decode. Typically this is the unencoded data from the previous call of this function. - -> B.ByteString -- ^ The second `B.ByteString` chunk to decode. + :: Bool -- ^ Indicates whether the encoding is big-endian ('True') or little-endian ('False') + -> B.ByteString -- ^ The first 'B.ByteString' chunk to decode. Typically this is the unencoded data from the previous call of this function. + -> B.ByteString -- ^ The second 'B.ByteString' chunk to decode. -> DecodeResult Text B.ByteString Word16 decodeUtf16Chunks = decodeChunks . TE.decodeUtf16Chunks -- | Decode two 'B.ByteString's containing UTF-32-encoded text as though -- they were one continuous 'B.ByteString' returning a 'DecodeResult'. -- --- @since 2.0.1 +-- @since 2.0.2 decodeUtf32Chunks - :: Bool -- ^ Indicates whether the encoding is big-endian (`True`) or little-endian (`False`) - -> B.ByteString -- ^ The first `B.ByteString` chunk to decode. Typically this is the unencoded data from the previous call of this function. - -> B.ByteString -- ^ The second `B.ByteString` chunk to decode. + :: Bool -- ^ Indicates whether the encoding is big-endian ('True') or little-endian ('False') + -> B.ByteString -- ^ The first 'B.ByteString' chunk to decode. Typically this is the unencoded data from the previous call of this function. + -> B.ByteString -- ^ The second 'B.ByteString' chunk to decode. -> DecodeResult Text B.ByteString Word32 decodeUtf32Chunks = decodeChunks . TE.decodeUtf32Chunks diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs index 580d15cc..b61e603c 100644 --- a/tests/Tests/Properties/Transcoding.hs +++ b/tests/Tests/Properties/Transcoding.hs @@ -13,6 +13,7 @@ import Test.Tasty (TestTree, testGroup) import Test.Tasty.QuickCheck (testProperty) import Tests.QuickCheckUtils import qualified Control.Exception as Exception +import qualified Control.Monad.Fix as F import qualified Data.Bits as Bits (shiftL, shiftR) import qualified Data.ByteString as B import qualified Data.ByteString.Builder as B @@ -27,11 +28,6 @@ import qualified Data.Text.Encoding.Common as C import qualified Data.Text.Lazy as TL import qualified Data.Text.Lazy.Encoding as EL -t_asciiE t = E.decodeAsciiE (E.encodeUtf8 a) === Right a - where a = T.map (\c -> chr (ord c `mod` 128)) t -tl_asciiE t = EL.decodeAsciiE (EL.encodeUtf8 a) === Right a - where a = TL.map (\c -> chr (ord c `mod` 128)) t - t_ascii t = E.decodeASCII (E.encodeUtf8 a) === a where a = T.map (\c -> chr (ord c `mod` 128)) t tl_ascii t = EL.decodeASCII (EL.encodeUtf8 a) === a @@ -211,9 +207,33 @@ genInvalidUTF8 = B.pack <$> oneof [ ord3_ n = map fromIntegral [(n `shiftR` 12) + 0xE0, ((n `shiftR` 6) .&. 0x3F) + 0x80, (n .&. 0x3F) + 0x80] ord4_ n = map fromIntegral [(n `shiftR` 18) + 0xF0, ((n `shiftR` 12) .&. 0x3F) + 0x80, ((n `shiftR` 6) .&. 0x3F) + 0x80, (n .&. 0x3F) + 0x80] +t_chunk_decode_ascii_1 = + let bs1 = B.pack [0x68, 0x69, 0x2c, 0x20, 0x83, + 0x68, 0x65, 0x6c, 0x6c] + bs2 = B.pack [0x6f, 0x2c, 0x20, 0x94, + 0x68, 0x6f, 0x77, 0x20, 0xcc, + 0x61, 0x72, 0x65, 0x20, 0x79, 0x61, 0x3f] + decode = E.decodeAsciiChunks + decodeResult0 = decode bs1 mempty + expectedBs0 = B.pack [0x68, 0x65, 0x6c, 0x6c] + in + whenEqProp decodeResult0 (E.DecodeResult "hi, " (Just 0x83) expectedBs0 5) $ + let decodeResult1 = decode expectedBs0 bs2 + expectedBs1 = B.pack [0x68, 0x6f, 0x77, 0x20, 0xcc, + 0x61, 0x72, 0x65, 0x20, 0x79, 0x61, 0x3f] + in + whenEqProp decodeResult1 (E.DecodeResult "hello, " (Just 0x94) expectedBs1 8) $ + let decodeResult2 = decode expectedBs1 mempty + expectedBs2 = B.pack [0x61, 0x72, 0x65, 0x20, 0x79, 0x61, 0x3f] + in + whenEqProp decodeResult2 (E.DecodeResult "how " (Just 0xcc) expectedBs2 5) $ + decode expectedBs2 mempty === E.DecodeResult "are ya?" Nothing mempty 7 + t_chunk_decode_utf8_1 = - let decodeResult0 = E.decodeUtf8Chunks mempty $ B.pack [0x68, 0x69, 0x2C, 0x20, 0xe2, 0x98, 0x83, 0x21] in - decodeResult0 === E.DecodeResult "hi, ☃!" Nothing mempty 8 + let decodeResult0 = E.decodeUtf8Chunks mempty $ B.pack [0x68, 0x69, 0x2C, 0x20, 0xC4, 0x89, 0x2C, 0x20, + 0xe2, 0x98, 0x83, 0x2C, 0x20, 0x61, 0x6e, 0x64, + 0x20, 0xF0, 0x90, 0x90, 0xB7, 0x21] in + decodeResult0 === E.DecodeResult "hi, ĉ, ☃, and \x10437!" Nothing mempty 22 t_chunk_decode_utf8_2 = let decode = E.decodeUtf8Chunks mempty decodeResult0 = decode $ B.pack [97, 0xC2, 97] @@ -235,6 +255,24 @@ t_chunk_decode_utf8_3 = let decodeResult2 = decode expectedBs1 $ B.pack [0x83, 32, 0xFF] in decodeResult2 === E.DecodeResult "☃ " (Just 0xFF) mempty 5 +-- test multi-word code points split across bytestring chunks +t_chunk_decode_utf8_4 = + F.fix (\ f bs s -> + case s of + (input, expected) : s' -> whenEqProp (E.decodeUtf8Chunks bs input) expected $ + case expected of + E.DecodeResult _ _ bs' _ -> f bs' s' + _ -> counterexample "" True + ) mempty + [ (B.pack [0x68, 0x69, 0x2C, 0x20, 0xC4], E.DecodeResult "hi, " Nothing (B.singleton 0xc4) 4) + , (B.pack [0x89, 0x2C, 0x20, 0xe2], E.DecodeResult "ĉ, " Nothing (B.singleton 0xe2) 4) + , (B.singleton 0x98, E.DecodeResult "" Nothing (B.pack [0xe2, 0x98]) 0) + , (B.pack [0x83, 0x2C, 0x20, 0x61, 0x6e, 0x64, 0x20, 0xF0], E.DecodeResult "☃, and " Nothing (B.singleton 0xF0) 9) + , (B.singleton 0x90, E.DecodeResult "" Nothing (B.pack [0xF0, 0x90]) 0) + , (B.singleton 0x90, E.DecodeResult "" Nothing (B.pack [0xF0, 0x90, 0x90]) 0) + , (B.pack [0xB7, 0x21], E.DecodeResult "\x10437!" Nothing mempty 5) + ] + t_chunk_decode_utf16BE = let decode = E.decodeUtf16Chunks True expectedBs0 = B.pack [0] @@ -335,8 +373,6 @@ t_infix_concat bs1 text bs2 = testTranscoding :: TestTree testTranscoding = testGroup "transcoding" [ - testProperty "t_asciiE" t_asciiE, - testProperty "tl_asciiE" tl_asciiE, testProperty "t_ascii" t_ascii, testProperty "tl_ascii" tl_ascii, testProperty "t_latin1" t_latin1, @@ -381,6 +417,8 @@ testTranscoding = testProperty "t_chunk_decode_utf8_1" t_chunk_decode_utf8_1, testProperty "t_chunk_decode_utf8_2" t_chunk_decode_utf8_2, testProperty "t_chunk_decode_utf8_3" t_chunk_decode_utf8_3, + testProperty "t_chunk_decode_utf8_4" t_chunk_decode_utf8_4, + testProperty "t_chunk_decode_ascii_1" t_chunk_decode_ascii_1, testProperty "t_chunk_decode_utf16BE" t_chunk_decode_utf16BE, testProperty "t_chunk_decode_utf16LE" t_chunk_decode_utf16LE, testProperty "t_chunk_decode_utf32BE" t_chunk_decode_utf32BE, From e1b25698894db4c0f2565b5aee71102860949b4a Mon Sep 17 00:00:00 2001 From: david-sledge Date: Sat, 23 Jul 2022 13:26:32 -0600 Subject: [PATCH 27/87] minor mistake in calculating inital array length for text array in decodeChunks --- src/Data/Text/Encoding.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index 799fedcb..85c192dc 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -229,7 +229,7 @@ decodeChunks w bulkDecodeF incrementalDecodeF bs1@(B.length -> len1) bs2@(B.leng len :: Int len = len1 + len2 len' :: Int - len' = len `div` wordByteSize + 4 + len' = (len `div` wordByteSize) + 4 outer dst dstLen = inner where From c5594dc4f5b02c40c62ab98f98a972e2274b8ebf Mon Sep 17 00:00:00 2001 From: david-sledge Date: Sat, 23 Jul 2022 16:30:51 -0600 Subject: [PATCH 28/87] Fixed default isValidBS function --- src/Data/Text/Encoding.hs | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index 85c192dc..433b3348 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -366,13 +366,19 @@ decodeUtf8Bulk bsOff bs dst dstLen srcOff dstOff -- We need Data.ByteString.findIndexEnd, but it is unavailable before bytestring-0.10.12.0 guessUtf8Boundary :: ByteString -> Int guessUtf8Boundary bs'@(B.length -> len) - | len >= 1 && w 1 < 0x80 = len -- last char is ASCII - | len >= 1 && w 1 >= 0xc0 = len - 1 -- last char starts a two-(or more-)word code point - | len >= 2 && w 2 >= 0xe0 = len - 2 -- pre-last char starts a three- or four-word code point - | len >= 3 && w 3 >= 0xf0 = len - 3 -- third to last char starts a four-word code point - | otherwise = 0 -- gonna have to resolve this with an incremental approach + | len >= 1 && wr 1 < 0x80 -- last char is ASCII + , wb 2 0xe0 0xc0 -- last two chars are a two-word code point + , wb 3 0xf0 0xe0 -- last three chars are a three-word code point + , wb 2 0xf8 0xf0 -- last four chars are a four-word code point + = len + | w 1 0xc0 = len - 1 -- last char starts a two-(or more-)word code point + | w 2 0xe0 = len - 2 -- pre-last char starts a three- or four-word code point + | w 3 0xf0 = len - 3 -- third to last char starts a four-word code point + | otherwise = 0 -- gonna have to resolve this with an incremental approach where - w n = B.index bs' (len - n) + wr n = B.index bs' (len - n) + w n bt = len >= n && wr n >= bt + wb n mask bt = len >= n && wr n .&. mask == bt isValidBS :: ByteString -> Int -> Int -> Bool isValidBS bs' off count = if off + count > B.length bs' @@ -388,13 +394,13 @@ decodeUtf8Bulk bsOff bs dst dstLen srcOff dstOff start off where start ix - | ix >= count = True + | ix >= off + count = True | otherwise = case utf8DecodeStart (B.unsafeIndex bs' ix) of Accept{} -> start (ix + 1) Reject{} -> False Incomplete st _ -> step (ix + 1) st step ix st - | ix >= count = False + | ix >= off + count = False -- We do not use decoded code point, so passing a dummy value to save an argument. | otherwise = case utf8DecodeContinue (B.unsafeIndex bs' ix) st (CodePoint 0) of Accept{} -> start (ix + 1) From 0fefb3a4470ca1c234783f980462c124ad57f3b9 Mon Sep 17 00:00:00 2001 From: david-sledge Date: Sat, 23 Jul 2022 19:56:05 -0600 Subject: [PATCH 29/87] Corrected logic for UTF-8 boundary checking --- src/Data/Text/Encoding.hs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index 433b3348..91f77509 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -367,9 +367,9 @@ decodeUtf8Bulk bsOff bs dst dstLen srcOff dstOff guessUtf8Boundary :: ByteString -> Int guessUtf8Boundary bs'@(B.length -> len) | len >= 1 && wr 1 < 0x80 -- last char is ASCII - , wb 2 0xe0 0xc0 -- last two chars are a two-word code point - , wb 3 0xf0 0xe0 -- last three chars are a three-word code point - , wb 2 0xf8 0xf0 -- last four chars are a four-word code point + || wb 2 0xe0 0xc0 -- last two chars are a two-word code point + || wb 3 0xf0 0xe0 -- last three chars are a three-word code point + || wb 2 0xf8 0xf0 -- last four chars are a four-word code point = len | w 1 0xc0 = len - 1 -- last char starts a two-(or more-)word code point | w 2 0xe0 = len - 2 -- pre-last char starts a three- or four-word code point From d09864938ecbac981d6d5e2d59039bc6dd759f69 Mon Sep 17 00:00:00 2001 From: david-sledge Date: Sat, 23 Jul 2022 20:49:35 -0600 Subject: [PATCH 30/87] Cleaned up decodeChunks tests --- tests/Tests/Properties/Transcoding.hs | 219 +++++++++++++------------- 1 file changed, 106 insertions(+), 113 deletions(-) diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs index b61e603c..8aab9ae3 100644 --- a/tests/Tests/Properties/Transcoding.hs +++ b/tests/Tests/Properties/Transcoding.hs @@ -13,7 +13,6 @@ import Test.Tasty (TestTree, testGroup) import Test.Tasty.QuickCheck (testProperty) import Tests.QuickCheckUtils import qualified Control.Exception as Exception -import qualified Control.Monad.Fix as F import qualified Data.Bits as Bits (shiftL, shiftR) import qualified Data.ByteString as B import qualified Data.ByteString.Builder as B @@ -207,123 +206,117 @@ genInvalidUTF8 = B.pack <$> oneof [ ord3_ n = map fromIntegral [(n `shiftR` 12) + 0xE0, ((n `shiftR` 6) .&. 0x3F) + 0x80, (n .&. 0x3F) + 0x80] ord4_ n = map fromIntegral [(n `shiftR` 18) + 0xF0, ((n `shiftR` 12) .&. 0x3F) + 0x80, ((n `shiftR` 6) .&. 0x3F) + 0x80, (n .&. 0x3F) + 0x80] -t_chunk_decode_ascii_1 = - let bs1 = B.pack [0x68, 0x69, 0x2c, 0x20, 0x83, - 0x68, 0x65, 0x6c, 0x6c] - bs2 = B.pack [0x6f, 0x2c, 0x20, 0x94, +-- test multi-word code points split across bytestring chunks +chunksTests decodeF insExpectedOuts = + f mempty insExpectedOuts + where + f bs s = + case s of + (input, expected) : s' -> whenEqProp (decodeF bs input) expected $ + case expected of + E.DecodeResult _ _ bs' _ -> f bs' s' + _ -> counterexample "" True + +t_chunk_decode_ascii_1 = chunksTests E.decodeAsciiChunks + [ ( B.pack [0x68, 0x69, 0x2c, 0x20, 0x83, 0x68, 0x65, 0x6c, 0x6c] + , E.DecodeResult "hi, " (Just 0x83) (B.pack [0x68, 0x65, 0x6c, 0x6c]) 5 + ) + , ( B.pack [0x6f, 0x2c, 0x20, 0x94, 0x68, 0x6f, 0x77, 0x20, 0xcc, 0x61, 0x72, 0x65, 0x20, 0x79, 0x61, 0x3f] - decode = E.decodeAsciiChunks - decodeResult0 = decode bs1 mempty - expectedBs0 = B.pack [0x68, 0x65, 0x6c, 0x6c] - in - whenEqProp decodeResult0 (E.DecodeResult "hi, " (Just 0x83) expectedBs0 5) $ - let decodeResult1 = decode expectedBs0 bs2 - expectedBs1 = B.pack [0x68, 0x6f, 0x77, 0x20, 0xcc, - 0x61, 0x72, 0x65, 0x20, 0x79, 0x61, 0x3f] - in - whenEqProp decodeResult1 (E.DecodeResult "hello, " (Just 0x94) expectedBs1 8) $ - let decodeResult2 = decode expectedBs1 mempty - expectedBs2 = B.pack [0x61, 0x72, 0x65, 0x20, 0x79, 0x61, 0x3f] - in - whenEqProp decodeResult2 (E.DecodeResult "how " (Just 0xcc) expectedBs2 5) $ - decode expectedBs2 mempty === E.DecodeResult "are ya?" Nothing mempty 7 - -t_chunk_decode_utf8_1 = - let decodeResult0 = E.decodeUtf8Chunks mempty $ B.pack [0x68, 0x69, 0x2C, 0x20, 0xC4, 0x89, 0x2C, 0x20, - 0xe2, 0x98, 0x83, 0x2C, 0x20, 0x61, 0x6e, 0x64, - 0x20, 0xF0, 0x90, 0x90, 0xB7, 0x21] in - decodeResult0 === E.DecodeResult "hi, ĉ, ☃, and \x10437!" Nothing mempty 22 -t_chunk_decode_utf8_2 = - let decode = E.decodeUtf8Chunks mempty - decodeResult0 = decode $ B.pack [97, 0xC2, 97] - expectedBs0 = B.singleton 97 - in - whenEqProp decodeResult0 (E.DecodeResult (T.singleton 'a') (Just 0xC2) expectedBs0 2) $ - let decodeResult1 = decode expectedBs0 in - decodeResult1 === E.DecodeResult (T.singleton 'a') Nothing mempty 1 -t_chunk_decode_utf8_3 = - let decode = E.decodeUtf8Chunks - decodeResult0 = decode mempty $ B.pack [104, 105, 32, 0xe2] - expectedBs0 = B.singleton 0xe2 - in -- hi \xe2 - whenEqProp decodeResult0 (E.DecodeResult "hi " Nothing expectedBs0 3) $ - let decodeResult1 = decode expectedBs0 $ B.singleton 0x98 - expectedBs1 = B.pack [0xe2, 0x98] - in - whenEqProp decodeResult1 (E.DecodeResult "" Nothing expectedBs1 0) $ - let decodeResult2 = decode expectedBs1 $ B.pack [0x83, 32, 0xFF] in - decodeResult2 === E.DecodeResult "☃ " (Just 0xFF) mempty 5 + , E.DecodeResult "hello, " (Just 0x94) (B.pack [0x68, 0x6f, 0x77, 0x20, 0xcc, + 0x61, 0x72, 0x65, 0x20, 0x79, 0x61, 0x3f]) 8 + ) + , ( mempty + , E.DecodeResult "how " (Just 0xcc) (B.pack [0x61, 0x72, 0x65, 0x20, 0x79, 0x61, 0x3f]) 5 + ) + , ( mempty + , E.DecodeResult "are ya?" Nothing mempty 7 + ) + ] +t_chunk_decode_utf8_1 = chunksTests E.decodeUtf8Chunks + [ ( B.pack [0x68, 0x69, 0x2C, 0x20, 0xC4, 0x89, 0x2C, 0x20, + 0xe2, 0x98, 0x83, 0x2C, 0x20, 0x61, 0x6e, 0x64, + 0x20, 0xF0, 0x90, 0x90, 0xB7, 0x21] + , E.DecodeResult "hi, ĉ, ☃, and \x10437!" Nothing mempty 22 + ) + ] +t_chunk_decode_utf8_2 = chunksTests E.decodeUtf8Chunks + [ ( B.pack [97, 0xC2, 97] + , E.DecodeResult (T.singleton 'a') (Just 0xC2) (B.singleton 97) 2 + ) + , ( mempty + , E.DecodeResult (T.singleton 'a') Nothing mempty 1 + ) + ] +t_chunk_decode_utf8_3 = chunksTests E.decodeUtf8Chunks + [ ( B.pack [104, 105, 32, 0xe2] + , E.DecodeResult "hi " Nothing (B.singleton 0xe2) 3 + ) + , ( B.singleton 0x98 + , E.DecodeResult "" Nothing (B.pack [0xe2, 0x98]) 0 + ) + , ( B.pack [0x83, 32, 0xFF] + , E.DecodeResult "☃ " (Just 0xFF) mempty 5 + ) + ] -- test multi-word code points split across bytestring chunks -t_chunk_decode_utf8_4 = - F.fix (\ f bs s -> - case s of - (input, expected) : s' -> whenEqProp (E.decodeUtf8Chunks bs input) expected $ - case expected of - E.DecodeResult _ _ bs' _ -> f bs' s' - _ -> counterexample "" True - ) mempty - [ (B.pack [0x68, 0x69, 0x2C, 0x20, 0xC4], E.DecodeResult "hi, " Nothing (B.singleton 0xc4) 4) - , (B.pack [0x89, 0x2C, 0x20, 0xe2], E.DecodeResult "ĉ, " Nothing (B.singleton 0xe2) 4) - , (B.singleton 0x98, E.DecodeResult "" Nothing (B.pack [0xe2, 0x98]) 0) - , (B.pack [0x83, 0x2C, 0x20, 0x61, 0x6e, 0x64, 0x20, 0xF0], E.DecodeResult "☃, and " Nothing (B.singleton 0xF0) 9) - , (B.singleton 0x90, E.DecodeResult "" Nothing (B.pack [0xF0, 0x90]) 0) - , (B.singleton 0x90, E.DecodeResult "" Nothing (B.pack [0xF0, 0x90, 0x90]) 0) - , (B.pack [0xB7, 0x21], E.DecodeResult "\x10437!" Nothing mempty 5) - ] +t_chunk_decode_utf8_4 = chunksTests E.decodeUtf8Chunks + [ (B.pack [0x68, 0x69, 0x2C, 0x20, 0xC4], E.DecodeResult "hi, " Nothing (B.singleton 0xc4) 4) + , (B.pack [0x89, 0x2C, 0x20, 0xe2], E.DecodeResult "ĉ, " Nothing (B.singleton 0xe2) 4) + , (B.singleton 0x98, E.DecodeResult "" Nothing (B.pack [0xe2, 0x98]) 0) + , (B.pack [0x83, 0x2C, 0x20, 0x61, 0x6e, 0x64, 0x20, 0xF0], E.DecodeResult "☃, and " Nothing (B.singleton 0xF0) 9) + , (B.singleton 0x90, E.DecodeResult "" Nothing (B.pack [0xF0, 0x90]) 0) + , (B.singleton 0x90, E.DecodeResult "" Nothing (B.pack [0xF0, 0x90, 0x90]) 0) + , (B.pack [0xB7, 0x21], E.DecodeResult "\x10437!" Nothing mempty 5) + ] -t_chunk_decode_utf16BE = - let decode = E.decodeUtf16Chunks True - expectedBs0 = B.pack [0] - decodeResult0 = decode mempty expectedBs0 in - whenEqProp decodeResult0 (E.DecodeResult T.empty Nothing expectedBs0 0) $ - let decodeResult1 = decode expectedBs0 $ B.pack [104, 0, 105, 0, 32, 0xD8, 0x01] - expectedBs1 = B.pack [0xD8, 0x01] - in - whenEqProp decodeResult1 (E.DecodeResult "hi " Nothing expectedBs1 6) $ - let decodeResult2 = decode expectedBs1 $ B.pack [0xDC, 0x37, 0, 32, 0xDC, 0] in - decodeResult2 === E.DecodeResult "\x10437 " (Just 0xDC00) mempty 8 -t_chunk_decode_utf16LE = - let decode = E.decodeUtf16Chunks False - expectedBs0 = B.pack [104] - decodeResult0 = decode mempty expectedBs0 in - whenEqProp decodeResult0 (E.DecodeResult T.empty Nothing expectedBs0 0) $ - let decodeResult1 = decode expectedBs0 $ B.pack [0, 105, 0, 32, 0, 0x01, 0xD8] - expectedBs1 = B.pack [0x01, 0xD8] - in - whenEqProp decodeResult1 (E.DecodeResult "hi " Nothing expectedBs1 6) $ - let decodeResult2 = decode expectedBs1 $ B.pack [0x37, 0xDC, 32, 0, 0, 0xDC] in - decodeResult2 === E.DecodeResult "\x10437 " (Just 0xDC) mempty 8 - -t_chunk_decode_utf32BE = - let decode = E.decodeUtf32Chunks True - expBs0 = B.pack [0, 0] - decodeResult0 = decode mempty $ B.pack [0, 0, 0, 104, 0, 0, 0, 105, 0, 0] - in - whenEqProp decodeResult0 (E.DecodeResult "hi" Nothing expBs0 8) $ - let expBs1 = B.pack [0, 0, 0x26] - decodeResult1 = decode expBs0 $ B.pack [0, 32, 0, 0, 0x26] - in - whenEqProp decodeResult1 (E.DecodeResult " " Nothing expBs1 4) $ - let expBs2 = mempty - decodeResult2 = decode expBs1 $ B.pack [0x03, 0, 0, 0, 32, 0, 0, 0xD8, 0] - in - decodeResult2 === E.DecodeResult "☃ " (Just 0xD800) expBs2 12 -t_chunk_decode_utf32LE = - let decode = E.decodeUtf32Chunks False - expBs0 = B.pack [0x20, 0] - decodeResult0 = decode mempty $ B.pack [104, 0, 0, 0, 105, 0, 0, 0, 0x20, 0] - in - whenEqProp decodeResult0 (E.DecodeResult "hi" Nothing expBs0 8) $ - let expBs1 = B.pack [0x03, 0x26, 0] - decodeResult1 = decode expBs0 $ B.pack [0, 0, 0x03, 0x26, 0] - in - whenEqProp decodeResult1 (E.DecodeResult " " Nothing expBs1 4) $ - let expBs2 = mempty - decodeResult2 = decode expBs1 $ B.pack [0, 32, 0, 0, 0, 0, 0xD8, 0, 0] - in - decodeResult2 === E.DecodeResult "☃ " (Just 0xD80000) expBs2 12 +t_chunk_decode_utf16BE = chunksTests (E.decodeUtf16Chunks True) + [ ( B.pack [0] + , E.DecodeResult T.empty Nothing (B.pack [0]) 0 + ) + , ( B.pack [104, 0, 105, 0, 32, 0xD8, 0x01] + , E.DecodeResult "hi " Nothing (B.pack [0xD8, 0x01]) 6 + ) + , ( B.pack [0xDC, 0x37, 0, 32, 0xDC, 0] + , E.DecodeResult "\x10437 " (Just 0xDC00) mempty 8 + ) + ] +t_chunk_decode_utf16LE = chunksTests (E.decodeUtf16Chunks False) + [ ( B.pack [104] + , E.DecodeResult T.empty Nothing (B.pack [104]) 0 + ) + , ( B.pack [0, 105, 0, 32, 0, 0x01, 0xD8] + , E.DecodeResult "hi " Nothing (B.pack [0x01, 0xD8]) 6 + ) + , ( B.pack [0x37, 0xDC, 32, 0, 0, 0xDC] + , E.DecodeResult "\x10437 " (Just 0xDC) mempty 8 + ) + ] + +t_chunk_decode_utf32BE = chunksTests (E.decodeUtf32Chunks True) + [ ( B.pack [0, 0, 0, 104, 0, 0, 0, 105, 0, 0] + , E.DecodeResult "hi" Nothing (B.pack [0, 0]) 8 + ) + , ( B.pack [0, 32, 0, 0, 0x26] + , E.DecodeResult " " Nothing (B.pack [0, 0, 0x26]) 4 + ) + , ( B.pack [0x03, 0, 0, 0, 32, 0, 0, 0xD8, 0] + , E.DecodeResult "☃ " (Just 0xD800) mempty 12 + ) + ] +t_chunk_decode_utf32LE = chunksTests (E.decodeUtf32Chunks False) + [ ( B.pack [104, 0, 0, 0, 105, 0, 0, 0, 0x20, 0] + , E.DecodeResult "hi" Nothing (B.pack [0x20, 0]) 8 + ) + , ( B.pack [0, 0, 0x03, 0x26, 0] + , E.DecodeResult " " Nothing (B.pack [0x03, 0x26, 0]) 4 + ) + , ( B.pack [0, 32, 0, 0, 0, 0, 0xD8, 0, 0] + , E.DecodeResult "☃ " (Just 0xD80000) mempty 12 + ) + ] decodeLL :: BL.ByteString -> TL.Text decodeLL = EL.decodeUtf8With C.lenientDecode From 048327910d8f196a45c609bdbc377b1be4f3252e Mon Sep 17 00:00:00 2001 From: david-sledge Date: Tue, 26 Jul 2022 16:32:07 -0600 Subject: [PATCH 31/87] Refactored chunk and ASCII decoders --- src/Data/Text/Encoding.hs | 338 +++++++++--------------- src/Data/Text/Internal/Encoding/Utf8.hs | 28 +- src/Data/Text/Lazy/Encoding.hs | 48 ++-- tests/Tests/Properties/Transcoding.hs | 47 ++-- 4 files changed, 190 insertions(+), 271 deletions(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index 91f77509..cbe5fea6 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -31,8 +31,8 @@ module Data.Text.Encoding -- $total decodeLatin1 , decodeUtf8Lenient + , decodeAsciiPrefix , DecodeResult(..) - , decodeAsciiChunks , decodeUtf8Chunks , decodeUtf16Chunks , decodeUtf32Chunks @@ -75,7 +75,8 @@ module Data.Text.Encoding ) where import Control.Exception (evaluate, try) -import Control.Monad.ST (ST, runST) +import Control.Monad (when) +import Control.Monad.ST (runST) import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) import Data.Bits (Bits, shiftL, shiftR, (.&.), (.|.)) import Data.ByteString (ByteString) @@ -97,24 +98,18 @@ import qualified Data.ByteString.Builder as B import qualified Data.ByteString.Builder.Internal as B hiding (empty, append) import qualified Data.ByteString.Builder.Prim as BP import qualified Data.ByteString.Builder.Prim.Internal as BP -import Data.Text.Internal.Encoding.Utf8 (utf8DecodeStart, utf8DecodeContinue, DecoderResult(..)) +import Data.Text.Internal.Encoding.Utf8 (utf8DecodeStart, utf8DecodeContinue, DetectUtf8Result(..)) import Data.Text.Internal.Encoding.Utf16 (Utf16Result(..), queryUtf16Bytes) import Data.Text.Internal.Encoding.Utf32 (queryUtf32Bytes) import qualified Data.Text.Array as A import qualified Data.Text.Internal.Encoding.Fusion as E import qualified Data.Text.Internal.Fusion as F import Data.Text.Internal.ByteStringCompat +import Numeric (showHex) #if defined(ASSERTS) import GHC.Stack (HasCallStack) #endif -#ifdef SIMDUTF -import Foreign.C.Types (CInt(..)) -#elif !MIN_VERSION_bytestring(0,11,2) -import qualified Data.ByteString.Unsafe as B -import Data.Text.Internal.Encoding.Utf8 (CodePoint(..)) -#endif - -- $strict -- -- All of the single-parameter functions for decoding bytestrings @@ -146,11 +141,10 @@ import Data.Text.Internal.Encoding.Utf8 (CodePoint(..)) -- anything except ASCII and copies buffer or throws an error otherwise. -- decodeASCII :: ByteString -> Text -decodeASCII bs = case decodeAsciiChunks bs mempty of - DecodeResult t mW _ pos -> - case mW of - Just _ -> error $ "decodeASCII: detected non-ASCII codepoint at " ++ show (pos - 1) - _ -> t +decodeASCII bs@(B.length -> len) = case decodeAsciiPrefix bs of + (t, Nothing) -> t + (_, Just (w, (B.length -> len'))) -> + error $ "decodeASCII: detected non-ASCII codepoint (\\x" ++ showHex w (") at " ++ show (len - len' - 1)) -- | Decode a 'ByteString' containing Latin-1 (aka ISO-8859-1) encoded text. -- @@ -189,6 +183,83 @@ decodeLatin1 bs = withBS bs $ \fp len -> runST $ do foreign import ccall unsafe "_hs_text_is_ascii" c_is_ascii :: Ptr Word8 -> Ptr Word8 -> IO CSize +-- | Decode a 'ByteString' containing ASCII. +-- +-- This is a total function. The 'ByteString' is decoded until either +-- the end is reached or it errors with the first non-ASCII 'Word8' is +-- encountered. In either case the function will return what 'Text' was +-- decoded. On error, the non-ASCII 'Word8' is also returned followed +-- by the rest of the undecoded 'ByteString'. +-- +-- @since 2.0.2 +decodeAsciiPrefix + :: ByteString + -> (Text, Maybe (Word8, ByteString)) +decodeAsciiPrefix bs = if B.null bs + then (empty, Nothing) + else runST $ withBS bs $ \ fp len -> + unsafeIOToST . unsafeWithForeignPtr fp $ \src -> do + asciiPrefixLen <- fmap fromIntegral . c_is_ascii src $ src `plusPtr` len + (, if asciiPrefixLen < len + then Just (B.index bs asciiPrefixLen, B.drop (asciiPrefixLen + 1) bs) + else Nothing) <$> if asciiPrefixLen == 0 + then pure empty + else unsafeSTToIO $ do + dst <- A.new asciiPrefixLen + A.copyFromPointer dst 0 src asciiPrefixLen + arr <- A.unsafeFreeze dst + pure $ Text arr 0 asciiPrefixLen + +-- | Decode two 'ByteString's containing UTF-8-encoded text as though +-- they were one continuous 'ByteString' returning a 'DecodeResult'. +-- +-- @since 2.0.2 +decodeUtf8Chunks + :: ByteString -- ^ The first 'ByteString' chunk to decode. Typically this is the undecoded data from the previous call of this function. + -> ByteString -- ^ The second 'ByteString' chunk to decode. + -> DecodeResult Text ByteString Word8 +decodeUtf8Chunks bs1@(B.length -> len1) bs2@(B.length -> len2) = + let len = len1 + len2 in + if len == 0 + then DecodeResult empty Nothing bs1 0 + else + let index i + | i < len1 = B.index bs1 i + | otherwise = B.index bs2 $ i - len1 + step i _ Reject = (Reject, i) + step i i' (Incomplete a) = + if i' < len + then step i (i' + 1) $ utf8DecodeContinue (index i') a + else (Incomplete a, i) + step _ i' st@_ = + if i' < len + then step i' (i' + 1) . utf8DecodeStart $ index i' + else (st, i') + (st', t) = case step 0 1 . utf8DecodeStart $ index 0 of + (st, count) -> + (st, ) $ if count > 0 + then runST $ do + dst <- A.new count + let crossesBSBoundary = count > len1 + copyBSRange bs count' dst' dstOff' = withBS bs $ \ fp _ -> + unsafeIOToST . unsafeWithForeignPtr fp $ \ src -> + unsafeSTToIO $ A.copyFromPointer dst' dstOff' src count' + copyBSRange bs1 (if crossesBSBoundary then len1 else count) dst 0 + when crossesBSBoundary $ copyBSRange bs2 (count - len1) dst len1 + arr <- A.unsafeFreeze dst + pure $ Text arr 0 count + else empty + decodeResult off mErr = DecodeResult t mErr (if off >= len1 + then B.drop (off - len1) bs2 + else B.drop off bs1 `B.append` bs2) off + tLen = case t of Text _ _ len' -> len' + in + case st' of + Reject -> + let tLen' = tLen + 1 in + decodeResult tLen' . Just $ index tLen + _ -> decodeResult tLen Nothing + data Progression = WriteAndAdvance Char Int | NeedMore @@ -200,22 +271,11 @@ decodeChunks :: #endif (Bits w, Num w, Storable w) => w - -> ( Int - -> ByteString - -> A.MArray s - -> Int - -> Int - -> Int - -> ST s (Maybe ( - (A.MArray s -> Int -> Int -> Int -> ST s (DecodeResult Text ByteString w)) - -> ST s (DecodeResult Text ByteString w)) - ) - ) -> ((Int -> Word8) -> Int -> Int -> Progression) -> ByteString -> ByteString - -> ST s (DecodeResult Text ByteString w) -decodeChunks w bulkDecodeF incrementalDecodeF bs1@(B.length -> len1) bs2@(B.length -> len2) = do + -> DecodeResult Text ByteString w +decodeChunks w transcodeF bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do marr <- A.new len' outer marr len' 0 0 where @@ -234,212 +294,59 @@ decodeChunks w bulkDecodeF incrementalDecodeF bs1@(B.length -> len1) bs2@(B.leng outer dst dstLen = inner where inner srcOff dstOff - | srcOff < len1 = decode 0 bs1 - | srcOff < len = decode len1 bs2 + | srcOff < len + , len >= srcOff + wordByteSize = + if dstOff + 4 > dstLen + -- need more space in destination + then do + let dstLen' = dstLen + 4 + dst' <- A.resizeM dst dstLen' + outer dst' dstLen' srcOff dstOff + else + case transcodeF index len srcOff of + WriteAndAdvance c srcOff' -> do + d <- unsafeWrite dst dstOff c + inner srcOff' $ dstOff + d + NeedMore -> goodSoFar + Invalid -> + let srcOff' = srcOff + wordByteSize + bytesToWord n word + | n > 0 = bytesToWord (n - 1) $ (fromIntegral . index $ srcOff + wordByteSize - n) .|. (word `shiftL` 8) + | otherwise = word + in + wrapUp srcOff' . Just $ bytesToWord wordByteSize 0 -- finished (for now) | otherwise = goodSoFar where - decode bsOff bs = do - mOuterArgs <- bulkDecodeF bsOff bs dst dstLen srcOff dstOff - case mOuterArgs of - Just outerArgs -> outerArgs outer - _ -> if len - srcOff < wordByteSize - -- incomplete code point - then goodSoFar - else - if dstOff + 4 > dstLen - -- need more space in destination - then do - let dstLen' = dstLen + 4 - dst' <- A.resizeM dst dstLen' - outer dst' dstLen' srcOff dstOff - else - case incrementalDecodeF index len srcOff of - WriteAndAdvance c srcOff' -> do - d <- unsafeWrite dst dstOff c - inner srcOff' $ dstOff + d - NeedMore -> goodSoFar - Invalid -> invalid - contin off res = do + wrapUp off mW = do A.shrinkM dst dstOff arr <- A.unsafeFreeze dst - pure . res (Text arr 0 dstOff) $ if off >= len1 + pure $ DecodeResult (Text arr 0 dstOff) mW (if off >= len1 then B.drop (off - len1) bs2 - else B.drop off $ bs1 `B.append` bs2 + else B.drop off $ bs1 `B.append` bs2) off goodSoFar = - contin srcOff $ \ t bs' -> - DecodeResult t Nothing bs' srcOff - invalid = - let srcOff' = srcOff + wordByteSize - bytesToWord n word - | n > 0 = bytesToWord (n - 1) $ (fromIntegral . index $ srcOff + wordByteSize - n) .|. (word `shiftL` 8) - | otherwise = word - in - contin srcOff' $ \ t bs' -> - DecodeResult t (Just $ bytesToWord wordByteSize 0) bs' srcOff' + wrapUp srcOff Nothing decodeChunksProxy :: (Bits w, Num w, Storable w) - => ( Int - -> ByteString - -> A.MArray s - -> Int - -> Int - -> Int - -> ST s (Maybe ( - (A.MArray s -> Int -> Int -> Int -> ST s (DecodeResult Text ByteString w)) - -> ST s (DecodeResult Text ByteString w) - ) - ) - ) - -> ((Int -> Word8) -> Int -> Int -> Progression) + => ((Int -> Word8) -> Int -> Int -> Progression) -> ByteString -> ByteString - -> ST s (DecodeResult Text ByteString w) + -> DecodeResult Text ByteString w decodeChunksProxy = decodeChunks undefined -- This allows Haskell to -- determine the size in bytes of a data type using Storable.sizeOf -- so that it doesn't have to be passed as an arugment. Storable.sizeOf -- discards the actual value without evaluating it. -bulkCharCopy - :: Ptr Word8 - -> Int - -> A.MArray s - -> Int - -> Int - -> IO (A.MArray s, Int, Int) -bulkCharCopy src srcLen dst dstLen dstOff = do - let minLen = dstOff + srcLen - (dst', dstLen') <- - if minLen > dstLen - then - let newLen = minLen + 4 in - (, newLen) <$> (unsafeSTToIO $ A.resizeM dst newLen) - else pure (dst, dstLen) - unsafeSTToIO $ A.copyFromPointer dst' dstOff src srcLen - pure (dst', dstLen', minLen) - --- | Decode two 'ByteString's containing ASCII as though they were one --- continuous 'ByteString' returning a 'DecodeResult'. --- --- @since 2.0.2 -decodeAsciiChunks - :: ByteString -- ^ The first 'ByteString' chunk to decode. Typically this is the unencoded data from the previous call of this function. - -> ByteString -- ^ The second 'ByteString' chunk to decode. - -> DecodeResult Text ByteString Word8 -decodeAsciiChunks bs1 bs2 = runST $ decodeChunksProxy ( - \ bsOff bs dst dstLen srcOff dstOff -> withBS bs $ \ fp len -> - unsafeIOToST . unsafeWithForeignPtr fp $ \src -> do - let srcOff' = srcOff - bsOff - asciiPrefixLen <- fmap fromIntegral . c_is_ascii (src `plusPtr` srcOff') $ src `plusPtr` len - if asciiPrefixLen == 0 - then pure Nothing - else do - (dst', dstLen', dstOff') <- bulkCharCopy src asciiPrefixLen dst dstLen dstOff - pure . Just $ \ f -> f dst' dstLen' (srcOff' + asciiPrefixLen + bsOff) dstOff' - ) (\ _ _ _ -> Invalid) bs1 bs2 - -decodeUtf8Bulk - :: Int - -> ByteString - -> A.MArray s - -> Int - -> Int - -> Int - -> ST s (Maybe ((A.MArray s -> Int -> Int -> Int -> t) -> t)) -decodeUtf8Bulk bsOff bs dst dstLen srcOff dstOff - -- potential valid utf8 content endpoint - | utf8End <- bsOff + guessUtf8Boundary bs - -- not yet reached the UTF-8 boundary - , srcOff < utf8End - -- potential valid utf8 content length - , utf8Len <- utf8End - srcOff - -- this ByteString chunk's offset - , srcOff2 <- srcOff - bsOff - -- check for UTF-8 compliance - , isValidBS bs srcOff2 utf8Len = do - (dst', dstLen', dstOff') <- withBS bs $ \ fp _ -> - unsafeIOToST $ unsafeWithForeignPtr fp $ \ src -> - bulkCharCopy (src `plusPtr` srcOff2) utf8Len dst dstLen dstOff - pure . Just $ \ f -> f dst' dstLen' (srcOff + utf8Len) dstOff' - | otherwise = pure Nothing - where - -- We need Data.ByteString.findIndexEnd, but it is unavailable before bytestring-0.10.12.0 - guessUtf8Boundary :: ByteString -> Int - guessUtf8Boundary bs'@(B.length -> len) - | len >= 1 && wr 1 < 0x80 -- last char is ASCII - || wb 2 0xe0 0xc0 -- last two chars are a two-word code point - || wb 3 0xf0 0xe0 -- last three chars are a three-word code point - || wb 2 0xf8 0xf0 -- last four chars are a four-word code point - = len - | w 1 0xc0 = len - 1 -- last char starts a two-(or more-)word code point - | w 2 0xe0 = len - 2 -- pre-last char starts a three- or four-word code point - | w 3 0xf0 = len - 3 -- third to last char starts a four-word code point - | otherwise = 0 -- gonna have to resolve this with an incremental approach - where - wr n = B.index bs' (len - n) - w n bt = len >= n && wr n >= bt - wb n mask bt = len >= n && wr n .&. mask == bt - - isValidBS :: ByteString -> Int -> Int -> Bool - isValidBS bs' off count = if off + count > B.length bs' - then False - else -#ifdef SIMDUTF - withBS bs' $ \ fp _ -> unsafeDupablePerformIO $ - unsafeWithForeignPtr fp $ \ ptr -> (/= 0) <$> c_is_valid_utf8 (ptr `plusPtr` off) (fromIntegral count) -#else -#if MIN_VERSION_bytestring(0,11,2) - B.isValidUtf8 (B.take count $ B.drop off bs') -#else - start off - where - start ix - | ix >= off + count = True - | otherwise = case utf8DecodeStart (B.unsafeIndex bs' ix) of - Accept{} -> start (ix + 1) - Reject{} -> False - Incomplete st _ -> step (ix + 1) st - step ix st - | ix >= off + count = False - -- We do not use decoded code point, so passing a dummy value to save an argument. - | otherwise = case utf8DecodeContinue (B.unsafeIndex bs' ix) st (CodePoint 0) of - Accept{} -> start (ix + 1) - Reject{} -> False - Incomplete st' _ -> step (ix + 1) st' -#endif -#endif - --- | Decode two 'ByteString's containing UTF-8-encoded text as though --- they were one continuous 'ByteString' returning a 'DecodeResult'. --- --- @since 2.0.2 -decodeUtf8Chunks - :: ByteString -- ^ The first 'ByteString' chunk to decode. Typically this is the unencoded data from the previous call of this function. - -> ByteString -- ^ The second 'ByteString' chunk to decode. - -> DecodeResult Text ByteString Word8 -decodeUtf8Chunks bs1 bs2 = runST $ decodeChunksProxy decodeUtf8Bulk (\ index len srcOff -> - let step i (Incomplete a b) - | i < len = step (i + 1) $ utf8DecodeContinue (index i) a b - step i st = (st, i) - in - case step (srcOff + 1) . utf8DecodeStart $ index srcOff of - (Accept c, srcOff') -> WriteAndAdvance c srcOff' - (Reject, _) -> Invalid - (Incomplete{}, _) -> NeedMore) bs1 bs2 - -noBulkDecodeOp :: Applicative f => p0 -> p1 -> p2 -> p3 -> p4 -> p5 -> f (Maybe a) -noBulkDecodeOp _ _ _ _ _ _ = pure Nothing - -- | Decode two 'ByteString's containing UTF-16-encoded text as though -- they were one continuous 'ByteString' returning a 'DecodeResult'. -- -- @since 2.0.2 decodeUtf16Chunks :: Bool -- ^ Indicates whether the encoding is big-endian ('True') or little-endian ('False') - -> ByteString -- ^ The first 'ByteString' chunk to decode. Typically this is the unencoded data from the previous call of this function. + -> ByteString -- ^ The first 'ByteString' chunk to decode. Typically this is the undecoded data from the previous call of this function. -> ByteString -- ^ The second 'ByteString' chunk to decode. -> DecodeResult Text ByteString Word16 -decodeUtf16Chunks isBE bs1 bs2 = runST $ decodeChunksProxy noBulkDecodeOp (\ index len srcOff -> +decodeUtf16Chunks isBE bs1 bs2 = decodeChunksProxy (\ index len srcOff -> -- get next Word8 pair let writeAndAdvance c n = WriteAndAdvance c $ srcOff + n b0 = index $ if isBE then srcOff else srcOff + 1 @@ -466,10 +373,10 @@ decodeUtf16Chunks isBE bs1 bs2 = runST $ decodeChunksProxy noBulkDecodeOp (\ ind -- @since 2.0.2 decodeUtf32Chunks :: Bool -- ^ Indicates whether the encoding is big-endian ('True') or little-endian ('False') - -> ByteString -- ^ The first 'ByteString' chunk to decode. Typically this is the unencoded data from the previous call of this function. + -> ByteString -- ^ The first 'ByteString' chunk to decode. Typically this is the undecoded data from the previous call of this function. -> ByteString -- ^ The second 'ByteString' chunk to decode. -> DecodeResult Text ByteString Word32 -decodeUtf32Chunks isBE bs1 bs2 = runST $ decodeChunksProxy noBulkDecodeOp (\ index _ srcOff -> +decodeUtf32Chunks isBE bs1 bs2 = decodeChunksProxy (\ index _ srcOff -> -- get next Word8 quartet case queryUtf32Bytes (index $ if isBE then srcOff else srcOff + 3) (index $ srcOff + (if isBE then 1 else 2)) @@ -488,7 +395,7 @@ decodeUtf8With :: #endif OnDecodeError -> ByteString -> Text decodeUtf8With onErr bs = case streamDecodeUtf8With onErr bs of - Some t unencoded _ -> codePointToInvalid unencoded t + Some t undecoded _ -> codePointToInvalid undecoded t where codePointToInvalid bs' txt = case B.uncons bs' of @@ -802,8 +709,3 @@ encodeUtf32LE txt = E.unstream (E.restreamUtf32LE (F.stream txt)) encodeUtf32BE :: Text -> ByteString encodeUtf32BE txt = E.unstream (E.restreamUtf32BE (F.stream txt)) {-# INLINE encodeUtf32BE #-} - -#ifdef SIMDUTF -foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8 - :: Ptr Word8 -> CSize -> IO CInt -#endif diff --git a/src/Data/Text/Internal/Encoding/Utf8.hs b/src/Data/Text/Internal/Encoding/Utf8.hs index bcf6a778..e9cc9e68 100644 --- a/src/Data/Text/Internal/Encoding/Utf8.hs +++ b/src/Data/Text/Internal/Encoding/Utf8.hs @@ -34,7 +34,7 @@ module Data.Text.Internal.Encoding.Utf8 , validate3 , validate4 -- * Naive decoding - , DecoderResult(..) + , DetectUtf8Result(..) , DecoderState(..) , CodePoint(..) , utf8DecodeStart @@ -46,7 +46,7 @@ import Control.Exception (assert) import GHC.Stack (HasCallStack) #endif import Data.Bits (Bits(..), FiniteBits(..)) -import Data.Char (ord, chr) +import Data.Char (ord) import GHC.Exts import GHC.Word (Word8(..)) @@ -264,29 +264,27 @@ updateState (ByteClass c) (DecoderState s) = DecoderState (W8# el#) newtype CodePoint = CodePoint Int -- | @since 2.0 -data DecoderResult - = Accept !Char - | Incomplete !DecoderState !CodePoint +data DetectUtf8Result + = Accept + | Incomplete !DecoderState | Reject -- | @since 2.0 -utf8DecodeStart :: Word8 -> DecoderResult +utf8DecodeStart :: Word8 -> DetectUtf8Result utf8DecodeStart !w - | st == utf8AcceptState = Accept (chr (word8ToInt w)) + | st == utf8AcceptState = Accept | st == utf8RejectState = Reject - | otherwise = Incomplete st (CodePoint cp) + | otherwise = Incomplete st where - cl@(ByteClass cl') = byteToClass w + cl = byteToClass w st = updateState cl utf8AcceptState - cp = word8ToInt $ (0xff `unsafeShiftR` word8ToInt cl') .&. w -- | @since 2.0 -utf8DecodeContinue :: Word8 -> DecoderState -> CodePoint -> DecoderResult -utf8DecodeContinue !w !st (CodePoint !cp) - | st' == utf8AcceptState = Accept (chr cp') +utf8DecodeContinue :: Word8 -> DecoderState -> DetectUtf8Result +utf8DecodeContinue !w !st + | st' == utf8AcceptState = Accept | st' == utf8RejectState = Reject - | otherwise = Incomplete st' (CodePoint cp') + | otherwise = Incomplete st' where cl = byteToClass w st' = updateState cl st - cp' = (cp `shiftL` 6) .|. word8ToInt (w .&. 0x3f) diff --git a/src/Data/Text/Lazy/Encoding.hs b/src/Data/Text/Lazy/Encoding.hs index 992654fa..91f36dc8 100644 --- a/src/Data/Text/Lazy/Encoding.hs +++ b/src/Data/Text/Lazy/Encoding.hs @@ -23,8 +23,8 @@ module Data.Text.Lazy.Encoding -- ** Total Functions #total# -- $total decodeLatin1 + , decodeAsciiPrefix , DecodeResult(..) - , decodeAsciiChunks , decodeUtf8Chunks , decodeUtf16Chunks , decodeUtf32Chunks @@ -100,15 +100,35 @@ import Data.Text.Unsafe (unsafeDupablePerformIO) -- (preferably not at all). See "Data.Text.Lazy.Encoding#g:total" for better -- solutions. --- | Decode a 'ByteString' containing 7-bit ASCII +-- | Decode a 'B.ByteString' containing 7-bit ASCII -- encoded text. decodeASCII :: B.ByteString -> Text decodeASCII = foldr (chunk . TE.decodeASCII) empty . B.toChunks --- | Decode a 'ByteString' containing Latin-1 (aka ISO-8859-1) encoded text. +-- | Decode a 'B.ByteString' containing Latin-1 (aka ISO-8859-1) encoded text. decodeLatin1 :: B.ByteString -> Text decodeLatin1 = foldr (chunk . TE.decodeLatin1) empty . B.toChunks +-- | Decode a 'B.ByteString' containing ASCII. +-- +-- This is a total function. The 'B.ByteString' is decoded until either +-- the end is reached or it errors with the first non-ASCII 'Word8' is +-- encountered. In either case the function will return what 'Text' was +-- decoded. On error, the non-ASCII 'Word8' is also returned followed +-- by the rest of the undecoded 'B.ByteString'. +-- +-- @since 2.0.2 +decodeAsciiPrefix + :: B.ByteString + -> (Text, Maybe (Word8, B.ByteString)) +decodeAsciiPrefix = g id + where + g tDiff (B.Chunk sb lb) = + case TE.decodeAsciiPrefix sb of + (t, Nothing) -> g (tDiff . chunk t) lb + (t, Just (w, sb')) -> (tDiff $ chunk t Empty, Just (w, B.chunk sb' lb)) + g tDiff _ = (tDiff Empty, Nothing) + decodeChunks :: (S.ByteString -> S.ByteString -> DecodeResult T.Text S.ByteString w) -> B.ByteString -> B.ByteString @@ -125,22 +145,12 @@ decodeChunks decoder = g id 0 mempty g tDiff pos sb0 _ (B.Chunk sb1 lb1) = g tDiff pos sb0 (B.Chunk sb1 lb1) mempty g tDiff pos sb0 _ _ = DecodeResult (tDiff Empty) Nothing (B.chunk sb0 mempty) pos --- | Decode two 'ByteString's containing ASCII as though they were one --- continuous 'ByteString' returning a 'DecodeResult'. --- --- @since 2.0.2 -decodeAsciiChunks - :: B.ByteString -- ^ The first 'B.ByteString' chunk to decode. Typically this is the unencoded data from the previous call of this function. - -> B.ByteString -- ^ The second 'B.ByteString' chunk to decode. - -> DecodeResult Text B.ByteString Word8 -decodeAsciiChunks = decodeChunks TE.decodeAsciiChunks - -- | Decode two 'B.ByteString's containing UTF-8-encoded text as though -- they were one continuous 'B.ByteString' returning a 'DecodeResult'. -- -- @since 2.0.2 decodeUtf8Chunks - :: B.ByteString -- ^ The first 'B.ByteString' chunk to decode. Typically this is the unencoded data from the previous call of this function. + :: B.ByteString -- ^ The first 'B.ByteString' chunk to decode. Typically this is the undecoded data from the previous call of this function. -> B.ByteString -- ^ The second 'B.ByteString' chunk to decode. -> DecodeResult Text B.ByteString Word8 decodeUtf8Chunks = decodeChunks TE.decodeUtf8Chunks @@ -151,7 +161,7 @@ decodeUtf8Chunks = decodeChunks TE.decodeUtf8Chunks -- @since 2.0.2 decodeUtf16Chunks :: Bool -- ^ Indicates whether the encoding is big-endian ('True') or little-endian ('False') - -> B.ByteString -- ^ The first 'B.ByteString' chunk to decode. Typically this is the unencoded data from the previous call of this function. + -> B.ByteString -- ^ The first 'B.ByteString' chunk to decode. Typically this is the undecoded data from the previous call of this function. -> B.ByteString -- ^ The second 'B.ByteString' chunk to decode. -> DecodeResult Text B.ByteString Word16 decodeUtf16Chunks = decodeChunks . TE.decodeUtf16Chunks @@ -162,12 +172,12 @@ decodeUtf16Chunks = decodeChunks . TE.decodeUtf16Chunks -- @since 2.0.2 decodeUtf32Chunks :: Bool -- ^ Indicates whether the encoding is big-endian ('True') or little-endian ('False') - -> B.ByteString -- ^ The first 'B.ByteString' chunk to decode. Typically this is the unencoded data from the previous call of this function. + -> B.ByteString -- ^ The first 'B.ByteString' chunk to decode. Typically this is the undecoded data from the previous call of this function. -> B.ByteString -- ^ The second 'B.ByteString' chunk to decode. -> DecodeResult Text B.ByteString Word32 decodeUtf32Chunks = decodeChunks . TE.decodeUtf32Chunks --- | Decode a 'ByteString' containing UTF-8 encoded text. +-- | Decode a 'B.ByteString' containing UTF-8 encoded text. decodeUtf8With :: OnDecodeError -> B.ByteString -> Text decodeUtf8With onErr (B.Chunk b0 bs0) = case TE.streamDecodeUtf8With onErr b0 of @@ -188,7 +198,7 @@ decodeUtf8With onErr (B.Chunk b0 bs0) = desc = "Data.Text.Lazy.Encoding.decodeUtf8With: Invalid UTF-8 stream" decodeUtf8With _ _ = empty --- | Decode a 'ByteString' containing UTF-8 encoded text that is known +-- | Decode a 'B.ByteString' containing UTF-8 encoded text that is known -- to be valid. -- -- If the input contains any invalid UTF-8 data, an exception will be @@ -199,7 +209,7 @@ decodeUtf8 :: B.ByteString -> Text decodeUtf8 = decodeUtf8With strictDecode {-# INLINE[0] decodeUtf8 #-} --- | Decode a 'ByteString' containing UTF-8 encoded text.. +-- | Decode a 'B.ByteString' containing UTF-8 encoded text.. -- -- If the input contains any invalid UTF-8 data, the relevant -- exception will be returned, otherwise the decoded text. diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs index 8aab9ae3..d8d0ce58 100644 --- a/tests/Tests/Properties/Transcoding.hs +++ b/tests/Tests/Properties/Transcoding.hs @@ -206,6 +206,33 @@ genInvalidUTF8 = B.pack <$> oneof [ ord3_ n = map fromIntegral [(n `shiftR` 12) + 0xE0, ((n `shiftR` 6) .&. 0x3F) + 0x80, (n .&. 0x3F) + 0x80] ord4_ n = map fromIntegral [(n `shiftR` 18) + 0xF0, ((n `shiftR` 12) .&. 0x3F) + 0x80, ((n `shiftR` 6) .&. 0x3F) + 0x80, (n .&. 0x3F) + 0x80] +t_prefix_decode_ascii_1 = + let g bs es is = + case es of + expected : es' -> whenEqProp (E.decodeAsciiPrefix bs) expected $ + case expected of + (_, Just (_, bs')) -> g bs' es' is + _ -> f (is, es') + _ -> counterexample ("More input than expected: " ++ show is) False + f s = case s of + (input : is, es) -> g input es is + (_, es@(_ : _)) -> counterexample ("More expected output than input: " ++ show es) False + _ -> counterexample "" True + in + f ( [ B.pack [0x68, 0x69, 0x2c, 0x20, 0x83, 0x68, 0x65, 0x6c, 0x6c] + , B.pack [0x6f, 0x2c, 0x20, 0x94, + 0x68, 0x6f, 0x77, 0x20, 0xcc, + 0x61, 0x72, 0x65, 0x20, 0x79, 0x61, 0x3f] + ] + , [ ("hi, ", Just (0x83, B.pack [0x68, 0x65, 0x6c, 0x6c])) + , ("hell", Nothing) + , ("o, ", Just (0x94, B.pack [0x68, 0x6f, 0x77, 0x20, 0xcc, + 0x61, 0x72, 0x65, 0x20, 0x79, 0x61, 0x3f])) + , ("how ", Just (0xcc, B.pack [0x61, 0x72, 0x65, 0x20, 0x79, 0x61, 0x3f])) + , ("are ya?", Nothing) + ] + ) + -- test multi-word code points split across bytestring chunks chunksTests decodeF insExpectedOuts = f mempty insExpectedOuts @@ -217,24 +244,6 @@ chunksTests decodeF insExpectedOuts = E.DecodeResult _ _ bs' _ -> f bs' s' _ -> counterexample "" True -t_chunk_decode_ascii_1 = chunksTests E.decodeAsciiChunks - [ ( B.pack [0x68, 0x69, 0x2c, 0x20, 0x83, 0x68, 0x65, 0x6c, 0x6c] - , E.DecodeResult "hi, " (Just 0x83) (B.pack [0x68, 0x65, 0x6c, 0x6c]) 5 - ) - , ( B.pack [0x6f, 0x2c, 0x20, 0x94, - 0x68, 0x6f, 0x77, 0x20, 0xcc, - 0x61, 0x72, 0x65, 0x20, 0x79, 0x61, 0x3f] - , E.DecodeResult "hello, " (Just 0x94) (B.pack [0x68, 0x6f, 0x77, 0x20, 0xcc, - 0x61, 0x72, 0x65, 0x20, 0x79, 0x61, 0x3f]) 8 - ) - , ( mempty - , E.DecodeResult "how " (Just 0xcc) (B.pack [0x61, 0x72, 0x65, 0x20, 0x79, 0x61, 0x3f]) 5 - ) - , ( mempty - , E.DecodeResult "are ya?" Nothing mempty 7 - ) - ] - t_chunk_decode_utf8_1 = chunksTests E.decodeUtf8Chunks [ ( B.pack [0x68, 0x69, 0x2C, 0x20, 0xC4, 0x89, 0x2C, 0x20, 0xe2, 0x98, 0x83, 0x2C, 0x20, 0x61, 0x6e, 0x64, @@ -411,7 +420,7 @@ testTranscoding = testProperty "t_chunk_decode_utf8_2" t_chunk_decode_utf8_2, testProperty "t_chunk_decode_utf8_3" t_chunk_decode_utf8_3, testProperty "t_chunk_decode_utf8_4" t_chunk_decode_utf8_4, - testProperty "t_chunk_decode_ascii_1" t_chunk_decode_ascii_1, + testProperty "t_prefix_decode_ascii_1" t_prefix_decode_ascii_1, testProperty "t_chunk_decode_utf16BE" t_chunk_decode_utf16BE, testProperty "t_chunk_decode_utf16LE" t_chunk_decode_utf16LE, testProperty "t_chunk_decode_utf32BE" t_chunk_decode_utf32BE, From 82ab16a4c05dff0feb93d3338a3398842af938df Mon Sep 17 00:00:00 2001 From: david-sledge Date: Tue, 26 Jul 2022 16:56:09 -0600 Subject: [PATCH 32/87] version typo --- changelog.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changelog.md b/changelog.md index f198612a..b0347e05 100644 --- a/changelog.md +++ b/changelog.md @@ -207,7 +207,7 @@ * Bumped lower bound on deepseq to 1.4 for compatibility with the upcoming GHC 7.10 -#### 1.2.0.2 +#### 1.2.0.1 * Fixed a buffer overflow in rendering of large Integers (https://github.com/bos/text/issues/99) From 0b6e5c583124bce1816a7ac188fd0f80ceedcf84 Mon Sep 17 00:00:00 2001 From: david-sledge Date: Wed, 27 Jul 2022 16:42:54 -0600 Subject: [PATCH 33/87] rename from 'Decode' to 'Detect' --- changelog.md | 12 +++++++ src/Data/Text/Encoding.hs | 42 ++++++++++++------------- src/Data/Text/Internal/Encoding/Utf8.hs | 30 +++++++++--------- 3 files changed, 47 insertions(+), 37 deletions(-) diff --git a/changelog.md b/changelog.md index b0347e05..2f529fc6 100644 --- a/changelog.md +++ b/changelog.md @@ -1,3 +1,15 @@ +### 2.0.2 + +* A new suite of total decoders have been added in `Data.Text.Encoding` + and `Data.Text.Lazy.Encoding` that allow decoding to be aborted on + errors without the need to raise an `error` and `catch` it elsewhere: + * `decodeAsciiPrefix` + * `decodeUtf8Chunks` + * `decodeUtf16Chunks` + * `decodeUtf32Chunks` + +### 2.0.1 + * Improve portability of C and C++ code. * [Make `Lift` instance more efficient](https://github.com/haskell/text/pull/413) * [Make `toCaseFold` idempotent](https://github.com/haskell/text/pull/402) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index cbe5fea6..600ca7b6 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -98,7 +98,7 @@ import qualified Data.ByteString.Builder as B import qualified Data.ByteString.Builder.Internal as B hiding (empty, append) import qualified Data.ByteString.Builder.Prim as BP import qualified Data.ByteString.Builder.Prim.Internal as BP -import Data.Text.Internal.Encoding.Utf8 (utf8DecodeStart, utf8DecodeContinue, DetectUtf8Result(..)) +import Data.Text.Internal.Encoding.Utf8 (utf8DetectStart, utf8DetectContinue, DetectUtf8Result(..)) import Data.Text.Internal.Encoding.Utf16 (Utf16Result(..), queryUtf16Bytes) import Data.Text.Internal.Encoding.Utf32 (queryUtf32Bytes) import qualified Data.Text.Array as A @@ -161,24 +161,22 @@ decodeLatin1 :: ByteString -> Text decodeLatin1 bs = withBS bs $ \fp len -> runST $ do dst <- A.new (2 * len) - let inner srcOff dstOff = if srcOff >= len then return dstOff else do - asciiPrefixLen <- fmap fromIntegral . unsafeIOToST . unsafeWithForeignPtr fp $ \src -> - c_is_ascii (src `plusPtr` srcOff) (src `plusPtr` len) - if asciiPrefixLen == 0 - then do - byte <- unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> peekByteOff src srcOff - A.unsafeWrite dst dstOff (0xC0 + (byte `shiftR` 6)) - A.unsafeWrite dst (dstOff + 1) (0x80 + (byte .&. 0x3F)) - inner (srcOff + 1) (dstOff + 2) - else do - unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> - unsafeSTToIO $ A.copyFromPointer dst dstOff (src `plusPtr` srcOff) asciiPrefixLen - inner (srcOff + asciiPrefixLen) (dstOff + asciiPrefixLen) - - actualLen <- inner 0 0 - dst' <- A.resizeM dst actualLen - arr <- A.unsafeFreeze dst' - return $ Text arr 0 actualLen + unsafeIOToST . unsafeWithForeignPtr fp $ \ src -> do + let inner srcOff dstOff = if srcOff >= len then pure dstOff else do + let src' = src `plusPtr` srcOff + asciiPrefixLen <- fmap fromIntegral . c_is_ascii src' $ src `plusPtr` len + if asciiPrefixLen == 0 + then do + byte <- peekByteOff src srcOff + unsafeSTToIO $ A.unsafeWrite dst dstOff (0xC0 + (byte `shiftR` 6)) *> + A.unsafeWrite dst (dstOff + 1) (0x80 + (byte .&. 0x3F)) + inner (srcOff + 1) (dstOff + 2) + else do + unsafeSTToIO $ A.copyFromPointer dst dstOff src' asciiPrefixLen + inner (srcOff + asciiPrefixLen) (dstOff + asciiPrefixLen) + actualLen <- inner 0 0 + arr <- unsafeSTToIO $ A.resizeM dst actualLen >>= A.unsafeFreeze + pure $ Text arr 0 actualLen foreign import ccall unsafe "_hs_text_is_ascii" c_is_ascii :: Ptr Word8 -> Ptr Word8 -> IO CSize @@ -229,13 +227,13 @@ decodeUtf8Chunks bs1@(B.length -> len1) bs2@(B.length -> len2) = step i _ Reject = (Reject, i) step i i' (Incomplete a) = if i' < len - then step i (i' + 1) $ utf8DecodeContinue (index i') a + then step i (i' + 1) $ utf8DetectContinue (index i') a else (Incomplete a, i) step _ i' st@_ = if i' < len - then step i' (i' + 1) . utf8DecodeStart $ index i' + then step i' (i' + 1) . utf8DetectStart $ index i' else (st, i') - (st', t) = case step 0 1 . utf8DecodeStart $ index 0 of + (st', t) = case step 0 1 . utf8DetectStart $ index 0 of (st, count) -> (st, ) $ if count > 0 then runST $ do diff --git a/src/Data/Text/Internal/Encoding/Utf8.hs b/src/Data/Text/Internal/Encoding/Utf8.hs index e9cc9e68..f5b96fb6 100644 --- a/src/Data/Text/Internal/Encoding/Utf8.hs +++ b/src/Data/Text/Internal/Encoding/Utf8.hs @@ -35,10 +35,10 @@ module Data.Text.Internal.Encoding.Utf8 , validate4 -- * Naive decoding , DetectUtf8Result(..) - , DecoderState(..) + , DetectState(..) , CodePoint(..) - , utf8DecodeStart - , utf8DecodeContinue + , utf8DetectStart + , utf8DetectContinue ) where #if defined(ASSERTS) @@ -243,17 +243,17 @@ byteToClass n = ByteClass (W8# el#) table# :: Addr# table# = "\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\b\b\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\n\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\EOT\ETX\ETX\v\ACK\ACK\ACK\ENQ\b\b\b\b\b\b\b\b\b\b\b"# -newtype DecoderState = DecoderState Word8 +newtype DetectState = DetectState Word8 deriving (Eq) -utf8AcceptState :: DecoderState -utf8AcceptState = DecoderState 0 +utf8AcceptState :: DetectState +utf8AcceptState = DetectState 0 -utf8RejectState :: DecoderState -utf8RejectState = DecoderState 12 +utf8RejectState :: DetectState +utf8RejectState = DetectState 12 -updateState :: ByteClass -> DecoderState -> DecoderState -updateState (ByteClass c) (DecoderState s) = DecoderState (W8# el#) +updateState :: ByteClass -> DetectState -> DetectState +updateState (ByteClass c) (DetectState s) = DetectState (W8# el#) where !(I# n#) = word8ToInt (c + s) el# = indexWord8OffAddr# table# n# @@ -266,12 +266,12 @@ newtype CodePoint = CodePoint Int -- | @since 2.0 data DetectUtf8Result = Accept - | Incomplete !DecoderState + | Incomplete !DetectState | Reject -- | @since 2.0 -utf8DecodeStart :: Word8 -> DetectUtf8Result -utf8DecodeStart !w +utf8DetectStart :: Word8 -> DetectUtf8Result +utf8DetectStart !w | st == utf8AcceptState = Accept | st == utf8RejectState = Reject | otherwise = Incomplete st @@ -280,8 +280,8 @@ utf8DecodeStart !w st = updateState cl utf8AcceptState -- | @since 2.0 -utf8DecodeContinue :: Word8 -> DecoderState -> DetectUtf8Result -utf8DecodeContinue !w !st +utf8DetectContinue :: Word8 -> DetectState -> DetectUtf8Result +utf8DetectContinue !w !st | st' == utf8AcceptState = Accept | st' == utf8RejectState = Reject | otherwise = Incomplete st' From 63790fc26e9a9dbdc645af95f827c7bf361e92bd Mon Sep 17 00:00:00 2001 From: david-sledge Date: Wed, 27 Jul 2022 16:50:11 -0600 Subject: [PATCH 34/87] Remove unused newtype CodePoint --- src/Data/Text/Internal/Encoding/Utf8.hs | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/Data/Text/Internal/Encoding/Utf8.hs b/src/Data/Text/Internal/Encoding/Utf8.hs index f5b96fb6..15796870 100644 --- a/src/Data/Text/Internal/Encoding/Utf8.hs +++ b/src/Data/Text/Internal/Encoding/Utf8.hs @@ -36,7 +36,6 @@ module Data.Text.Internal.Encoding.Utf8 -- * Naive decoding , DetectUtf8Result(..) , DetectState(..) - , CodePoint(..) , utf8DetectStart , utf8DetectContinue ) where @@ -261,8 +260,6 @@ updateState (ByteClass c) (DetectState s) = DetectState (W8# el#) table# :: Addr# table# = "\NUL\f\CAN$<`T\f\f\f0H\f\f\f\f\f\f\f\f\f\f\f\f\f\NUL\f\f\f\f\f\NUL\f\NUL\f\f\f\CAN\f\f\f\f\f\CAN\f\CAN\f\f\f\f\f\f\f\f\f\CAN\f\f\f\f\f\CAN\f\f\f\f\f\f\f\CAN\f\f\f\f\f\f\f\f\f$\f$\f\f\f$\f\f\f\f\f$\f$\f\f\f$\f\f\f\f\f\f\f\f\f\f"# -newtype CodePoint = CodePoint Int - -- | @since 2.0 data DetectUtf8Result = Accept From f4bf3a7edb1075484d490935aba3576c1cb4b020 Mon Sep 17 00:00:00 2001 From: david-sledge Date: Wed, 27 Jul 2022 17:19:05 -0600 Subject: [PATCH 35/87] Added missing error message in streamDecodeUtf8With --- src/Data/Text/Encoding.hs | 146 +++++++++++++++++++------------------- 1 file changed, 72 insertions(+), 74 deletions(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index 600ca7b6..3a76664f 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -219,44 +219,42 @@ decodeUtf8Chunks decodeUtf8Chunks bs1@(B.length -> len1) bs2@(B.length -> len2) = let len = len1 + len2 in if len == 0 - then DecodeResult empty Nothing bs1 0 - else - let index i - | i < len1 = B.index bs1 i - | otherwise = B.index bs2 $ i - len1 - step i _ Reject = (Reject, i) - step i i' (Incomplete a) = - if i' < len - then step i (i' + 1) $ utf8DetectContinue (index i') a - else (Incomplete a, i) - step _ i' st@_ = - if i' < len - then step i' (i' + 1) . utf8DetectStart $ index i' - else (st, i') - (st', t) = case step 0 1 . utf8DetectStart $ index 0 of - (st, count) -> - (st, ) $ if count > 0 - then runST $ do - dst <- A.new count - let crossesBSBoundary = count > len1 - copyBSRange bs count' dst' dstOff' = withBS bs $ \ fp _ -> - unsafeIOToST . unsafeWithForeignPtr fp $ \ src -> - unsafeSTToIO $ A.copyFromPointer dst' dstOff' src count' - copyBSRange bs1 (if crossesBSBoundary then len1 else count) dst 0 - when crossesBSBoundary $ copyBSRange bs2 (count - len1) dst len1 - arr <- A.unsafeFreeze dst - pure $ Text arr 0 count - else empty - decodeResult off mErr = DecodeResult t mErr (if off >= len1 - then B.drop (off - len1) bs2 - else B.drop off bs1 `B.append` bs2) off - tLen = case t of Text _ _ len' -> len' - in - case st' of - Reject -> - let tLen' = tLen + 1 in - decodeResult tLen' . Just $ index tLen - _ -> decodeResult tLen Nothing + then DecodeResult empty Nothing bs1 0 + else + let index i + | i < len1 = B.index bs1 i + | otherwise = B.index bs2 $ i - len1 + step i _ Reject = (Reject, i) + step i i' (Incomplete a) + | i' < len = step i (i' + 1) $ utf8DetectContinue (index i') a + | otherwise = (Incomplete a, i) + step _ i' _ + | i' < len = step i' (i' + 1) . utf8DetectStart $ index i' + | otherwise = (Accept, i') + (st', t) = case step 0 1 . utf8DetectStart $ index 0 of + (st, count) -> + (st, ) $ if count > 0 + then runST $ do + dst <- A.new count + let crossesBSBoundary = count > len1 + copyBSRange bs count' dst' dstOff' = withBS bs $ \ fp _ -> + unsafeIOToST . unsafeWithForeignPtr fp $ \ src -> + unsafeSTToIO $ A.copyFromPointer dst' dstOff' src count' + copyBSRange bs1 (if crossesBSBoundary then len1 else count) dst 0 + when crossesBSBoundary $ copyBSRange bs2 (count - len1) dst len1 + arr <- A.unsafeFreeze dst + pure $ Text arr 0 count + else empty + decodeResult off mErr = DecodeResult t mErr (if off >= len1 + then B.drop (off - len1) bs2 + else B.drop off bs1 `B.append` bs2) off + tLen = case t of Text _ _ len' -> len' + in + case st' of + Reject -> + let tLen' = tLen + 1 in + decodeResult tLen' . Just $ index tLen + _ -> decodeResult tLen Nothing data Progression = WriteAndAdvance Char Int @@ -295,24 +293,24 @@ decodeChunks w transcodeF bs1@(B.length -> len1) bs2@(B.length -> len2) = runST | srcOff < len , len >= srcOff + wordByteSize = if dstOff + 4 > dstLen - -- need more space in destination - then do - let dstLen' = dstLen + 4 - dst' <- A.resizeM dst dstLen' - outer dst' dstLen' srcOff dstOff - else - case transcodeF index len srcOff of - WriteAndAdvance c srcOff' -> do - d <- unsafeWrite dst dstOff c - inner srcOff' $ dstOff + d - NeedMore -> goodSoFar - Invalid -> - let srcOff' = srcOff + wordByteSize - bytesToWord n word - | n > 0 = bytesToWord (n - 1) $ (fromIntegral . index $ srcOff + wordByteSize - n) .|. (word `shiftL` 8) - | otherwise = word - in - wrapUp srcOff' . Just $ bytesToWord wordByteSize 0 + -- need more space in destination + then do + let dstLen' = dstLen + 4 + dst' <- A.resizeM dst dstLen' + outer dst' dstLen' srcOff dstOff + else + case transcodeF index len srcOff of + WriteAndAdvance c srcOff' -> do + d <- unsafeWrite dst dstOff c + inner srcOff' $ dstOff + d + NeedMore -> goodSoFar + Invalid -> + let srcOff' = srcOff + wordByteSize + bytesToWord n word + | n > 0 = bytesToWord (n - 1) $ (fromIntegral . index $ srcOff + wordByteSize - n) .|. (word `shiftL` 8) + | otherwise = word + in + wrapUp srcOff' . Just $ bytesToWord wordByteSize 0 -- finished (for now) | otherwise = goodSoFar where @@ -354,15 +352,15 @@ decodeUtf16Chunks isBE bs1 bs2 = decodeChunksProxy (\ index len srcOff -> OneWord16 c -> writeAndAdvance c 2 TwoWord16 g -> if len - srcOff < 4 - -- not enough Word8s to finish the code point - then NeedMore - else - let b2 = index $ srcOff + (if isBE then 2 else 3) - b3 = index $ srcOff + (if isBE then 3 else 2) - in - case g b2 b3 of - Just c -> writeAndAdvance c 4 - _ -> Invalid + -- not enough Word8s to finish the code point + then NeedMore + else + let b2 = index $ srcOff + (if isBE then 2 else 3) + b3 = index $ srcOff + (if isBE then 3 else 2) + in + case g b2 b3 of + Just c -> writeAndAdvance c 4 + _ -> Invalid _ -> Invalid) bs1 bs2 -- | Decode two 'ByteString's containing UTF-16-encoded text as though @@ -497,15 +495,15 @@ streamDecodeUtf8With :: streamDecodeUtf8With onErr = g empty mempty where g t bs0 bs1 = - let DecodeResult t' mW bs1' _ = decodeUtf8Chunks bs0 bs1 - txt = t `append` t' - in - (case (mW :: Maybe Word8) of - Just _ -> - g (case onErr "" mW of - Just c -> txt `append` T.singleton c - _ -> txt) mempty - _ -> Some txt bs1' . g empty) bs1' + case decodeUtf8Chunks bs0 bs1 of + DecodeResult t' mW bs1' _ -> + case t `append` t' of + txt -> (case mW of + Just _ -> + g (case onErr "Data.Text.Internal.Encoding: Invalid UTF-8 stream" mW of + Just c -> txt `append` T.singleton c + _ -> txt) mempty + _ -> Some txt bs1' . g empty) bs1' -- | Decode a 'ByteString' containing UTF-8 encoded text that is known -- to be valid. From 18edcb2c2301a02bec6bf7c8c2d9745f7e4c64d6 Mon Sep 17 00:00:00 2001 From: david-sledge Date: Thu, 4 Aug 2022 17:51:28 -0600 Subject: [PATCH 36/87] Reintroduce simdutf --- src/Data/Text/Encoding.hs | 151 ++++++++++++++++++------ src/Data/Text/Internal/Encoding/Utf8.hs | 3 +- 2 files changed, 114 insertions(+), 40 deletions(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index 3a76664f..5cd5906f 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -110,6 +110,12 @@ import Numeric (showHex) import GHC.Stack (HasCallStack) #endif +#ifdef SIMDUTF +import Foreign.C.Types (CInt(..)) +#elif !MIN_VERSION_bytestring(0,11,2) +import qualified Data.ByteString.Unsafe as B +#endif + -- $strict -- -- All of the single-parameter functions for decoding bytestrings @@ -208,53 +214,120 @@ decodeAsciiPrefix bs = if B.null bs arr <- A.unsafeFreeze dst pure $ Text arr 0 asciiPrefixLen +#ifdef SIMDUTF +foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8 + :: Ptr Word8 -> CSize -> IO CInt +#endif + -- | Decode two 'ByteString's containing UTF-8-encoded text as though -- they were one continuous 'ByteString' returning a 'DecodeResult'. -- -- @since 2.0.2 -decodeUtf8Chunks - :: ByteString -- ^ The first 'ByteString' chunk to decode. Typically this is the undecoded data from the previous call of this function. +decodeUtf8Chunks :: +#if defined(ASSERTS) + HasCallStack => +#endif + ByteString -- ^ The first 'ByteString' chunk to decode. Typically this is the undecoded data from the previous call of this function. -> ByteString -- ^ The second 'ByteString' chunk to decode. -> DecodeResult Text ByteString Word8 decodeUtf8Chunks bs1@(B.length -> len1) bs2@(B.length -> len2) = - let len = len1 + len2 in - if len == 0 - then DecodeResult empty Nothing bs1 0 + let len = len1 + len2 + index i + | i < len1 = B.index bs1 i + | otherwise = B.index bs2 $ i - len1 + guessUtf8Boundary len' + | wi 3 0xf0 = len' - 3 -- third to last char starts a four-byte code point + | wi 2 0xe0 = len' - 2 -- pre-last char starts a three-or-four-byte code point + | wi 1 0xc0 = len' - 1 -- last char starts a two-(or more-)byte code point + | wc 4 0xf8 0xf0 || -- last four bytes are a four-byte code point + wc 3 0xf0 0xe0 || -- last three bytes are a three-byte code point + wc 2 0xe0 0xc0 || -- last two bytes are a two-byte code point + w 1 (< 0x80) = len' -- last char is ASCII + | otherwise = 0 + where + w n test = len' >= n && test (index $ len' - n) + wc n mask word8 = w n $ (word8 ==) . (mask .&.) + wi n word8 = w n (>= word8) + bs1Utf8Boundary = guessUtf8Boundary len1 + bs2Utf8Boundary = guessUtf8Boundary len + isValidBS :: Int -> Int -> ByteString -> Bool +#ifdef SIMDUTF + isValidBS off bLen bs = withBS bs $ \ fp _ -> unsafeDupablePerformIO $ + unsafeWithForeignPtr fp $ \ptr -> (/= 0) <$> c_is_valid_utf8 (ptr `plusPtr` off) (fromIntegral bLen) +#else +#if MIN_VERSION_bytestring(0,11,2) + isValidBS off bLen = B.isValidUtf8 . B.take bLen . B.drop off +#else + isValidBS off bLen bs = start off + where + start ix + | ix >= off + bLen = True + | otherwise = case utf8DetectStart (B.unsafeIndex bs ix) of + Accept -> start (ix + 1) + Reject -> False + Incomplete st -> step (ix + 1) st + step ix st + | ix >= off + bLen = False + | otherwise = case utf8DetectContinue (B.unsafeIndex bs ix) st of + Accept -> start (ix + 1) + Reject -> False + Incomplete st' -> step (ix + 1) st' +#endif +#endif + bsToText bs count dst dstOff = + when (count > 0) . withBS bs $ \ fp _ -> + unsafeIOToST . unsafeWithForeignPtr fp $ \ src -> + unsafeSTToIO $ A.copyFromPointer dst dstOff src count + chunksToText utf8Len + | utf8Len > 0 = runST $ do + dst <- A.new utf8Len + let crossesBsBoundary = utf8Len > len1 + bsToText bs1 (if crossesBsBoundary then len1 else utf8Len) dst 0 + when crossesBsBoundary $ bsToText bs2 (utf8Len - len1) dst len1 + arr <- A.unsafeFreeze dst + pure $ Text arr 0 utf8Len + | otherwise = empty + decodeResult isErr off = + let off' = if isErr then off + 1 else off in + DecodeResult + (chunksToText off) + (if isErr then Just $ index off else Nothing) + (if off' < len1 + then B.drop off' bs1 `B.append` bs2 + else B.drop (off' - len1) bs2) + off' + countValidUtf8 i _ Reject = decodeResult True i + countValidUtf8 i i' (Incomplete a) + | i' < len = countValidUtf8 i (i' + 1) $ utf8DetectContinue (index i') a + | otherwise = decodeResult False i + countValidUtf8 _ i' _ + | i' < len = countValidUtf8 i' (i' + 1) . utf8DetectStart $ index i' + | otherwise = decodeResult False i' + wrapUpBs1 off = countValidUtf8 off off Accept + wrapUpBs2 off = + wrapUpBs1 $ if bs2Utf8Boundary > off && isValidBS (off - len1) (bs2Utf8Boundary - off) bs2 + then bs2Utf8Boundary + else off + in + if bs1Utf8Boundary > 0 + then + if isValidBS 0 bs1Utf8Boundary bs1 + then + if bs1Utf8Boundary < len1 + then + let checkCodePointAccrossBoundary Reject _ = decodeResult True bs1Utf8Boundary + checkCodePointAccrossBoundary (Incomplete a) off = if off < len + then checkCodePointAccrossBoundary (utf8DetectContinue (index off) a) $ off + 1 + else decodeResult False bs1Utf8Boundary + checkCodePointAccrossBoundary Accept off = wrapUpBs2 off + in + checkCodePointAccrossBoundary (utf8DetectStart $ index bs1Utf8Boundary) $ bs1Utf8Boundary + 1 + else wrapUpBs2 len1 + else wrapUpBs1 0 else - let index i - | i < len1 = B.index bs1 i - | otherwise = B.index bs2 $ i - len1 - step i _ Reject = (Reject, i) - step i i' (Incomplete a) - | i' < len = step i (i' + 1) $ utf8DetectContinue (index i') a - | otherwise = (Incomplete a, i) - step _ i' _ - | i' < len = step i' (i' + 1) . utf8DetectStart $ index i' - | otherwise = (Accept, i') - (st', t) = case step 0 1 . utf8DetectStart $ index 0 of - (st, count) -> - (st, ) $ if count > 0 - then runST $ do - dst <- A.new count - let crossesBSBoundary = count > len1 - copyBSRange bs count' dst' dstOff' = withBS bs $ \ fp _ -> - unsafeIOToST . unsafeWithForeignPtr fp $ \ src -> - unsafeSTToIO $ A.copyFromPointer dst' dstOff' src count' - copyBSRange bs1 (if crossesBSBoundary then len1 else count) dst 0 - when crossesBSBoundary $ copyBSRange bs2 (count - len1) dst len1 - arr <- A.unsafeFreeze dst - pure $ Text arr 0 count - else empty - decodeResult off mErr = DecodeResult t mErr (if off >= len1 - then B.drop (off - len1) bs2 - else B.drop off bs1 `B.append` bs2) off - tLen = case t of Text _ _ len' -> len' - in - case st' of - Reject -> - let tLen' = tLen + 1 in - decodeResult tLen' . Just $ index tLen - _ -> decodeResult tLen Nothing + (if len1 > 0 + then wrapUpBs1 + else wrapUpBs2) 0 data Progression = WriteAndAdvance Char Int diff --git a/src/Data/Text/Internal/Encoding/Utf8.hs b/src/Data/Text/Internal/Encoding/Utf8.hs index 15796870..17ee7cfb 100644 --- a/src/Data/Text/Internal/Encoding/Utf8.hs +++ b/src/Data/Text/Internal/Encoding/Utf8.hs @@ -243,7 +243,7 @@ byteToClass n = ByteClass (W8# el#) table# = "\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\b\b\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\n\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\EOT\ETX\ETX\v\ACK\ACK\ACK\ENQ\b\b\b\b\b\b\b\b\b\b\b"# newtype DetectState = DetectState Word8 - deriving (Eq) + deriving (Eq, Ord, Show, Read) utf8AcceptState :: DetectState utf8AcceptState = DetectState 0 @@ -265,6 +265,7 @@ data DetectUtf8Result = Accept | Incomplete !DetectState | Reject + deriving (Eq, Ord, Show, Read) -- | @since 2.0 utf8DetectStart :: Word8 -> DetectUtf8Result From d69964fddc2f21957d9b5b5571c2e103b9f25065 Mon Sep 17 00:00:00 2001 From: david-sledge Date: Mon, 8 Aug 2022 20:59:52 -0600 Subject: [PATCH 37/87] Realized the otherwise scenario in guessUtf8Boundary was acting like a magic number and obfuscating genuine 0 values --- src/Data/Text/Encoding.hs | 62 +++++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 29 deletions(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index 5cd5906f..c4878b5c 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -236,14 +236,14 @@ decodeUtf8Chunks bs1@(B.length -> len1) bs2@(B.length -> len2) = | i < len1 = B.index bs1 i | otherwise = B.index bs2 $ i - len1 guessUtf8Boundary len' - | wi 3 0xf0 = len' - 3 -- third to last char starts a four-byte code point - | wi 2 0xe0 = len' - 2 -- pre-last char starts a three-or-four-byte code point - | wi 1 0xc0 = len' - 1 -- last char starts a two-(or more-)byte code point - | wc 4 0xf8 0xf0 || -- last four bytes are a four-byte code point - wc 3 0xf0 0xe0 || -- last three bytes are a three-byte code point - wc 2 0xe0 0xc0 || -- last two bytes are a two-byte code point - w 1 (< 0x80) = len' -- last char is ASCII - | otherwise = 0 + | wi 3 0xf0 = Just $ len' - 3 -- third to last char starts a four-byte code point + | wi 2 0xe0 = Just $ len' - 2 -- pre-last char starts a three-or-four-byte code point + | wi 1 0xc0 = Just $ len' - 1 -- last char starts a two-(or more-)byte code point + | wc 4 0xf8 0xf0 || -- last four bytes are a four-byte code point + wc 3 0xf0 0xe0 || -- last three bytes are a three-byte code point + wc 2 0xe0 0xc0 || -- last two bytes are a two-byte code point + w 1 (< 0x80) = Just $ len' -- last char is ASCII + | otherwise = Nothing -- no clue where w n test = len' >= n && test (index $ len' - n) wc n mask word8 = w n $ (word8 ==) . (mask .&.) @@ -254,8 +254,7 @@ decodeUtf8Chunks bs1@(B.length -> len1) bs2@(B.length -> len2) = #ifdef SIMDUTF isValidBS off bLen bs = withBS bs $ \ fp _ -> unsafeDupablePerformIO $ unsafeWithForeignPtr fp $ \ptr -> (/= 0) <$> c_is_valid_utf8 (ptr `plusPtr` off) (fromIntegral bLen) -#else -#if MIN_VERSION_bytestring(0,11,2) +#elif MIN_VERSION_bytestring(0,11,2) isValidBS off bLen = B.isValidUtf8 . B.take bLen . B.drop off #else isValidBS off bLen bs = start off @@ -272,7 +271,6 @@ decodeUtf8Chunks bs1@(B.length -> len1) bs2@(B.length -> len2) = Accept -> start (ix + 1) Reject -> False Incomplete st' -> step (ix + 1) st' -#endif #endif bsToText bs count dst dstOff = when (count > 0) . withBS bs $ \ fp _ -> @@ -304,27 +302,33 @@ decodeUtf8Chunks bs1@(B.length -> len1) bs2@(B.length -> len2) = | i' < len = countValidUtf8 i' (i' + 1) . utf8DetectStart $ index i' | otherwise = decodeResult False i' wrapUpBs1 off = countValidUtf8 off off Accept - wrapUpBs2 off = - wrapUpBs1 $ if bs2Utf8Boundary > off && isValidBS (off - len1) (bs2Utf8Boundary - off) bs2 - then bs2Utf8Boundary + wrapUpBs2 off = wrapUpBs1 $ + case bs2Utf8Boundary of + Just n -> if n > off && isValidBS (off - len1) (n - off) bs2 + then n else off + _ -> off in - if bs1Utf8Boundary > 0 - then - if isValidBS 0 bs1Utf8Boundary bs1 + case bs1Utf8Boundary of + Just n -> + let checkCodePointAccrossBoundary Reject _ = decodeResult True n + checkCodePointAccrossBoundary (Incomplete a) off + | off < len = + checkCodePointAccrossBoundary (utf8DetectContinue (index off) a) $ off + 1 + | otherwise = decodeResult False n + checkCodePointAccrossBoundary Accept off = wrapUpBs2 off + spanChunks + | n < len1 = + checkCodePointAccrossBoundary (utf8DetectStart $ index n) $ n + 1 + | otherwise = wrapUpBs2 len1 + in + if n > 0 then - if bs1Utf8Boundary < len1 - then - let checkCodePointAccrossBoundary Reject _ = decodeResult True bs1Utf8Boundary - checkCodePointAccrossBoundary (Incomplete a) off = if off < len - then checkCodePointAccrossBoundary (utf8DetectContinue (index off) a) $ off + 1 - else decodeResult False bs1Utf8Boundary - checkCodePointAccrossBoundary Accept off = wrapUpBs2 off - in - checkCodePointAccrossBoundary (utf8DetectStart $ index bs1Utf8Boundary) $ bs1Utf8Boundary + 1 - else wrapUpBs2 len1 - else wrapUpBs1 0 - else + if isValidBS 0 n bs1 + then spanChunks + else wrapUpBs1 0 + else spanChunks + _ -> (if len1 > 0 then wrapUpBs1 else wrapUpBs2) 0 From 0db3c8abc5400b7999dbaedaade31b501d2a01c3 Mon Sep 17 00:00:00 2001 From: david-sledge Date: Mon, 8 Aug 2022 22:09:02 -0600 Subject: [PATCH 38/87] A little clean up. --- src/Data/Text/Encoding.hs | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index c4878b5c..b2e46478 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -242,14 +242,12 @@ decodeUtf8Chunks bs1@(B.length -> len1) bs2@(B.length -> len2) = | wc 4 0xf8 0xf0 || -- last four bytes are a four-byte code point wc 3 0xf0 0xe0 || -- last three bytes are a three-byte code point wc 2 0xe0 0xc0 || -- last two bytes are a two-byte code point - w 1 (< 0x80) = Just $ len' -- last char is ASCII + w 1 (< 0x80) = Just len' -- last char is ASCII | otherwise = Nothing -- no clue where w n test = len' >= n && test (index $ len' - n) wc n mask word8 = w n $ (word8 ==) . (mask .&.) wi n word8 = w n (>= word8) - bs1Utf8Boundary = guessUtf8Boundary len1 - bs2Utf8Boundary = guessUtf8Boundary len isValidBS :: Int -> Int -> ByteString -> Bool #ifdef SIMDUTF isValidBS off bLen bs = withBS bs $ \ fp _ -> unsafeDupablePerformIO $ @@ -303,13 +301,13 @@ decodeUtf8Chunks bs1@(B.length -> len1) bs2@(B.length -> len2) = | otherwise = decodeResult False i' wrapUpBs1 off = countValidUtf8 off off Accept wrapUpBs2 off = wrapUpBs1 $ - case bs2Utf8Boundary of + case guessUtf8Boundary len of Just n -> if n > off && isValidBS (off - len1) (n - off) bs2 then n else off _ -> off in - case bs1Utf8Boundary of + case guessUtf8Boundary len1 of Just n -> let checkCodePointAccrossBoundary Reject _ = decodeResult True n checkCodePointAccrossBoundary (Incomplete a) off @@ -338,11 +336,7 @@ data Progression | NeedMore | Invalid -decodeChunks :: -#if defined(ASSERTS) - HasCallStack => -#endif - (Bits w, Num w, Storable w) +decodeChunks :: (Bits w, Num w, Storable w) => w -> ((Int -> Word8) -> Int -> Int -> Progression) -> ByteString @@ -414,8 +408,11 @@ decodeChunksProxy = decodeChunks undefined -- This allows Haskell to -- they were one continuous 'ByteString' returning a 'DecodeResult'. -- -- @since 2.0.2 -decodeUtf16Chunks - :: Bool -- ^ Indicates whether the encoding is big-endian ('True') or little-endian ('False') +decodeUtf16Chunks :: +#if defined(ASSERTS) + HasCallStack => +#endif + Bool -- ^ Indicates whether the encoding is big-endian ('True') or little-endian ('False') -> ByteString -- ^ The first 'ByteString' chunk to decode. Typically this is the undecoded data from the previous call of this function. -> ByteString -- ^ The second 'ByteString' chunk to decode. -> DecodeResult Text ByteString Word16 @@ -444,8 +441,11 @@ decodeUtf16Chunks isBE bs1 bs2 = decodeChunksProxy (\ index len srcOff -> -- they were one continuous 'ByteString' returning a 'DecodeResult'. -- -- @since 2.0.2 -decodeUtf32Chunks - :: Bool -- ^ Indicates whether the encoding is big-endian ('True') or little-endian ('False') +decodeUtf32Chunks :: +#if defined(ASSERTS) + HasCallStack => +#endif + Bool -- ^ Indicates whether the encoding is big-endian ('True') or little-endian ('False') -> ByteString -- ^ The first 'ByteString' chunk to decode. Typically this is the undecoded data from the previous call of this function. -> ByteString -- ^ The second 'ByteString' chunk to decode. -> DecodeResult Text ByteString Word32 From c48ea341cfd03ee562a0668d4a28f91bed16f89a Mon Sep 17 00:00:00 2001 From: david-sledge Date: Wed, 10 Aug 2022 12:44:51 -0600 Subject: [PATCH 39/87] move test utility function whenEqProp --- tests/Tests/Properties/Transcoding.hs | 4 ---- tests/Tests/QuickCheckUtils.hs | 8 ++++++++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs index d8d0ce58..bf4d6589 100644 --- a/tests/Tests/Properties/Transcoding.hs +++ b/tests/Tests/Properties/Transcoding.hs @@ -35,10 +35,6 @@ tl_ascii t = EL.decodeASCII (EL.encodeUtf8 a) === a t_latin1 = E.decodeLatin1 `eq` (T.pack . BC.unpack) tl_latin1 = EL.decodeLatin1 `eq` (TL.pack . BLC.unpack) -whenEqProp a b next = if a == b - then next - else a === b - t_utf8 = (E.decodeUtf8 . E.encodeUtf8) `eq` id t_utf8' = (E.decodeUtf8' . E.encodeUtf8) `eq` (id . Right) t_utf8_c = (\ t -> diff --git a/tests/Tests/QuickCheckUtils.hs b/tests/Tests/QuickCheckUtils.hs index a6d72529..a09b65b1 100644 --- a/tests/Tests/QuickCheckUtils.hs +++ b/tests/Tests/QuickCheckUtils.hs @@ -27,6 +27,8 @@ module Tests.QuickCheckUtils , eqPSqrt , write_read + + , whenEqProp ) where import Control.Arrow ((***)) @@ -284,3 +286,9 @@ newtype SkewedBool = Skewed { getSkewed :: Bool } instance Arbitrary SkewedBool where arbitrary = Skewed <$> frequency [(1, pure False), (5, pure True)] + +-- like Control.Monad.when, but with properties instead of monad values +whenEqProp :: (Eq a, Show a) => a -> a -> Property -> Property +whenEqProp a b next = if a == b + then next + else a === b From 1c5f61adadc5fb4e438995b9bdf23be36e146a6f Mon Sep 17 00:00:00 2001 From: Li-yao Xia Date: Sun, 28 Aug 2022 17:56:34 +0100 Subject: [PATCH 40/87] Refactor decodeAsciiPrefix --- src/Data/Text/Encoding.hs | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index b2e46478..b67daf3f 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -201,18 +201,20 @@ decodeAsciiPrefix -> (Text, Maybe (Word8, ByteString)) decodeAsciiPrefix bs = if B.null bs then (empty, Nothing) - else runST $ withBS bs $ \ fp len -> - unsafeIOToST . unsafeWithForeignPtr fp $ \src -> do + else unsafeDupablePerformIO $ withBS bs $ \ fp len -> + unsafeWithForeignPtr fp $ \src -> do asciiPrefixLen <- fmap fromIntegral . c_is_ascii src $ src `plusPtr` len - (, if asciiPrefixLen < len - then Just (B.index bs asciiPrefixLen, B.drop (asciiPrefixLen + 1) bs) - else Nothing) <$> if asciiPrefixLen == 0 - then pure empty - else unsafeSTToIO $ do - dst <- A.new asciiPrefixLen - A.copyFromPointer dst 0 src asciiPrefixLen - arr <- A.unsafeFreeze dst - pure $ Text arr 0 asciiPrefixLen + let !prefix = if asciiPrefixLen == 0 + then empty + else runST $ do + dst <- A.new asciiPrefixLen + A.copyFromPointer dst 0 src asciiPrefixLen + arr <- A.unsafeFreeze dst + pure $ Text arr 0 asciiPrefixLen + let suffix = if asciiPrefixLen < len + then Just (B.index bs asciiPrefixLen, B.drop (asciiPrefixLen + 1) bs) + else Nothing + pure (prefix, suffix) #ifdef SIMDUTF foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8 From 1ae9ede3f42781f8e5cc69b707c2a65e77cbdcdd Mon Sep 17 00:00:00 2001 From: Li-yao Xia Date: Sun, 28 Aug 2022 19:05:05 +0100 Subject: [PATCH 41/87] Stylistic nits in decodeUtf8Chunks --- src/Data/Text/Encoding.hs | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index b67daf3f..43bab2d3 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -279,9 +279,11 @@ decodeUtf8Chunks bs1@(B.length -> len1) bs2@(B.length -> len2) = chunksToText utf8Len | utf8Len > 0 = runST $ do dst <- A.new utf8Len - let crossesBsBoundary = utf8Len > len1 - bsToText bs1 (if crossesBsBoundary then len1 else utf8Len) dst 0 - when crossesBsBoundary $ bsToText bs2 (utf8Len - len1) dst len1 + if utf8Len > len1 then do + bsToText bs1 len1 dst 0 + bsToText bs2 (utf8Len - len1) dst len1 + else + bsToText bs1 utf8Len dst 0 arr <- A.unsafeFreeze dst pure $ Text arr 0 utf8Len | otherwise = empty @@ -298,7 +300,7 @@ decodeUtf8Chunks bs1@(B.length -> len1) bs2@(B.length -> len2) = countValidUtf8 i i' (Incomplete a) | i' < len = countValidUtf8 i (i' + 1) $ utf8DetectContinue (index i') a | otherwise = decodeResult False i - countValidUtf8 _ i' _ + countValidUtf8 _ i' Accept | i' < len = countValidUtf8 i' (i' + 1) . utf8DetectStart $ index i' | otherwise = decodeResult False i' wrapUpBs1 off = countValidUtf8 off off Accept @@ -322,12 +324,9 @@ decodeUtf8Chunks bs1@(B.length -> len1) bs2@(B.length -> len2) = checkCodePointAccrossBoundary (utf8DetectStart $ index n) $ n + 1 | otherwise = wrapUpBs2 len1 in - if n > 0 - then - if isValidBS 0 n bs1 - then spanChunks - else wrapUpBs1 0 - else spanChunks + if n == 0 || isValidBS 0 n bs1 + then spanChunks + else wrapUpBs1 0 _ -> (if len1 > 0 then wrapUpBs1 From 0a1110128b5554411fd232d51044ec8422203c17 Mon Sep 17 00:00:00 2001 From: david-sledge Date: Sun, 2 Oct 2022 09:53:37 -0600 Subject: [PATCH 42/87] Back-to-the-drawing-board redesign of a UTF-8 decoder with more caller control. --- benchmarks/haskell/Benchmarks/Stream.hs | 2 +- src/Data/Text/Encoding.hs | 669 ++++++++++-------- src/Data/Text/Encoding/Common.hs | 137 ---- src/Data/Text/Encoding/Error.hs | 96 ++- src/Data/Text/Internal/Encoding/Fusion.hs | 2 +- src/Data/Text/Internal/Encoding/Utf16.hs | 34 +- src/Data/Text/Internal/Encoding/Utf32.hs | 28 - src/Data/Text/Internal/Encoding/Utf8.hs | 54 +- .../Text/Internal/Lazy/Encoding/Fusion.hs | 2 +- src/Data/Text/Lazy/Encoding.hs | 87 +-- tests/Tests/Properties/Transcoding.hs | 186 +---- tests/Tests/QuickCheckUtils.hs | 10 +- tests/Tests/Regressions.hs | 2 +- text.cabal | 1 - 14 files changed, 506 insertions(+), 804 deletions(-) delete mode 100644 src/Data/Text/Encoding/Common.hs diff --git a/benchmarks/haskell/Benchmarks/Stream.hs b/benchmarks/haskell/Benchmarks/Stream.hs index 539b709c..1d53c498 100644 --- a/benchmarks/haskell/Benchmarks/Stream.hs +++ b/benchmarks/haskell/Benchmarks/Stream.hs @@ -21,7 +21,7 @@ import qualified Data.Text.Lazy as TL import qualified Data.ByteString.Lazy as BL import Data.Text.Internal.Fusion.Types (Step (..), Stream (..)) import qualified Data.Text.Encoding as T -import qualified Data.Text.Encoding.Common as E +import qualified Data.Text.Encoding.Error as E import qualified Data.Text.Internal.Encoding.Fusion as T import qualified Data.Text.Internal.Encoding.Fusion.Common as F import qualified Data.Text.Internal.Fusion as T diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index 43bab2d3..db0ef035 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -4,7 +4,6 @@ {-# LANGUAGE TypeApplications #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE ViewPatterns #-} -{-# LANGUAGE TupleSections #-} -- | -- Module : Data.Text.Encoding -- Copyright : (c) 2009, 2010, 2011 Bryan O'Sullivan, @@ -32,10 +31,12 @@ module Data.Text.Encoding decodeLatin1 , decodeUtf8Lenient , decodeAsciiPrefix - , DecodeResult(..) - , decodeUtf8Chunks - , decodeUtf16Chunks - , decodeUtf32Chunks + , Utf8DecodeState + , startUtf8State + , outAvailableUtf8Text + , decodeNextUtf8Chunk + , decodeUtf8Chunk + , recoverFromUtf8Error -- *** Catchable failure , decodeUtf8' @@ -49,7 +50,6 @@ module Data.Text.Encoding -- *** Stream oriented decoding -- $stream - , streamDecodeUtf8 , streamDecodeUtf8With , Decoding(..) @@ -62,6 +62,9 @@ module Data.Text.Encoding , decodeUtf32LE , decodeUtf32BE + -- *** Stream oriented decoding + , streamDecodeUtf8 + -- * Encoding Text to ByteStrings , encodeUtf8 , encodeUtf16LE @@ -74,38 +77,34 @@ module Data.Text.Encoding , encodeUtf8BuilderEscaped ) where +import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) + import Control.Exception (evaluate, try) -import Control.Monad (when) import Control.Monad.ST (runST) -import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) -import Data.Bits (Bits, shiftL, shiftR, (.&.), (.|.)) +import Data.Bits (shiftR, (.&.)) import Data.ByteString (ByteString) import qualified Data.ByteString as B import qualified Data.ByteString.Internal as B -import Data.Text.Encoding.Common (DecodeResult(..), OnDecodeError, UnicodeException, strictDecode, lenientDecode) +import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode) import Data.Text.Internal (Text(..), empty, append) import Data.Text.Internal.Unsafe (unsafeWithForeignPtr) -import Data.Text.Internal.Unsafe.Char (unsafeWrite) import Data.Text.Show as T (singleton) import Data.Text.Unsafe (unsafeDupablePerformIO) -import Data.Word (Word8, Word16, Word32) +import Data.Word (Word8) import Foreign.C.Types (CSize(..)) import Foreign.Ptr (Ptr, minusPtr, plusPtr) -import Foreign.Storable (Storable(..), poke, peekByteOff) +import Foreign.Storable (poke, peekByteOff) import GHC.Exts (byteArrayContents#, unsafeCoerce#) import GHC.ForeignPtr (ForeignPtr(..), ForeignPtrContents(PlainPtr)) import qualified Data.ByteString.Builder as B import qualified Data.ByteString.Builder.Internal as B hiding (empty, append) import qualified Data.ByteString.Builder.Prim as BP import qualified Data.ByteString.Builder.Prim.Internal as BP -import Data.Text.Internal.Encoding.Utf8 (utf8DetectStart, utf8DetectContinue, DetectUtf8Result(..)) -import Data.Text.Internal.Encoding.Utf16 (Utf16Result(..), queryUtf16Bytes) -import Data.Text.Internal.Encoding.Utf32 (queryUtf32Bytes) +import Data.Text.Internal.Encoding.Utf8 (Utf8CodePointState, utf8StartState, updateUtf8State, isUtf8StateIsComplete) import qualified Data.Text.Array as A import qualified Data.Text.Internal.Encoding.Fusion as E import qualified Data.Text.Internal.Fusion as F import Data.Text.Internal.ByteStringCompat -import Numeric (showHex) #if defined(ASSERTS) import GHC.Stack (HasCallStack) #endif @@ -147,10 +146,10 @@ import qualified Data.ByteString.Unsafe as B -- anything except ASCII and copies buffer or throws an error otherwise. -- decodeASCII :: ByteString -> Text -decodeASCII bs@(B.length -> len) = case decodeAsciiPrefix bs of - (t, Nothing) -> t - (_, Just (w, (B.length -> len'))) -> - error $ "decodeASCII: detected non-ASCII codepoint (\\x" ++ showHex w (") at " ++ show (len - len' - 1)) +decodeASCII bs = + case decodeAsciiPrefix bs of + (_, Just errPos) -> error $ "decodeASCII: detected non-ASCII codepoint at " ++ show errPos + (t, Nothing) -> t -- | Decode a 'ByteString' containing Latin-1 (aka ISO-8859-1) encoded text. -- @@ -167,22 +166,24 @@ decodeLatin1 :: ByteString -> Text decodeLatin1 bs = withBS bs $ \fp len -> runST $ do dst <- A.new (2 * len) - unsafeIOToST . unsafeWithForeignPtr fp $ \ src -> do - let inner srcOff dstOff = if srcOff >= len then pure dstOff else do - let src' = src `plusPtr` srcOff - asciiPrefixLen <- fmap fromIntegral . c_is_ascii src' $ src `plusPtr` len - if asciiPrefixLen == 0 - then do - byte <- peekByteOff src srcOff - unsafeSTToIO $ A.unsafeWrite dst dstOff (0xC0 + (byte `shiftR` 6)) *> - A.unsafeWrite dst (dstOff + 1) (0x80 + (byte .&. 0x3F)) - inner (srcOff + 1) (dstOff + 2) - else do - unsafeSTToIO $ A.copyFromPointer dst dstOff src' asciiPrefixLen - inner (srcOff + asciiPrefixLen) (dstOff + asciiPrefixLen) - actualLen <- inner 0 0 - arr <- unsafeSTToIO $ A.resizeM dst actualLen >>= A.unsafeFreeze - pure $ Text arr 0 actualLen + let inner srcOff dstOff = if srcOff >= len then return dstOff else do + asciiPrefixLen <- fmap cSizeToInt $ unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> + c_is_ascii (src `plusPtr` srcOff) (src `plusPtr` len) + if asciiPrefixLen == 0 + then do + byte <- unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> peekByteOff src srcOff + A.unsafeWrite dst dstOff (0xC0 + (byte `shiftR` 6)) + A.unsafeWrite dst (dstOff + 1) (0x80 + (byte .&. 0x3F)) + inner (srcOff + 1) (dstOff + 2) + else do + unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> + unsafeSTToIO $ A.copyFromPointer dst dstOff (src `plusPtr` srcOff) asciiPrefixLen + inner (srcOff + asciiPrefixLen) (dstOff + asciiPrefixLen) + + actualLen <- inner 0 0 + dst' <- A.resizeM dst actualLen + arr <- A.unsafeFreeze dst' + return $ Text arr 0 actualLen foreign import ccall unsafe "_hs_text_is_ascii" c_is_ascii :: Ptr Word8 -> Ptr Word8 -> IO CSize @@ -192,13 +193,12 @@ foreign import ccall unsafe "_hs_text_is_ascii" c_is_ascii -- This is a total function. The 'ByteString' is decoded until either -- the end is reached or it errors with the first non-ASCII 'Word8' is -- encountered. In either case the function will return what 'Text' was --- decoded. On error, the non-ASCII 'Word8' is also returned followed --- by the rest of the undecoded 'ByteString'. +-- decoded. On error, the index of the non-ASCII 'Word8' is also returned. -- -- @since 2.0.2 decodeAsciiPrefix :: ByteString - -> (Text, Maybe (Word8, ByteString)) + -> (Text, Maybe (Word8, Int)) decodeAsciiPrefix bs = if B.null bs then (empty, Nothing) else unsafeDupablePerformIO $ withBS bs $ \ fp len -> @@ -212,7 +212,7 @@ decodeAsciiPrefix bs = if B.null bs arr <- A.unsafeFreeze dst pure $ Text arr 0 asciiPrefixLen let suffix = if asciiPrefixLen < len - then Just (B.index bs asciiPrefixLen, B.drop (asciiPrefixLen + 1) bs) + then Just (B.index bs asciiPrefixLen, asciiPrefixLen) else Nothing pure (prefix, suffix) @@ -221,243 +221,321 @@ foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8 :: Ptr Word8 -> CSize -> IO CInt #endif --- | Decode two 'ByteString's containing UTF-8-encoded text as though --- they were one continuous 'ByteString' returning a 'DecodeResult'. +-- | A value that represents the state of a UTF-8 decoding process potentionally +-- across multiple 'ByteString's. -- -- @since 2.0.2 -decodeUtf8Chunks :: -#if defined(ASSERTS) - HasCallStack => -#endif - ByteString -- ^ The first 'ByteString' chunk to decode. Typically this is the undecoded data from the previous call of this function. - -> ByteString -- ^ The second 'ByteString' chunk to decode. - -> DecodeResult Text ByteString Word8 -decodeUtf8Chunks bs1@(B.length -> len1) bs2@(B.length -> len2) = - let len = len1 + len2 - index i - | i < len1 = B.index bs1 i - | otherwise = B.index bs2 $ i - len1 - guessUtf8Boundary len' - | wi 3 0xf0 = Just $ len' - 3 -- third to last char starts a four-byte code point - | wi 2 0xe0 = Just $ len' - 2 -- pre-last char starts a three-or-four-byte code point - | wi 1 0xc0 = Just $ len' - 1 -- last char starts a two-(or more-)byte code point - | wc 4 0xf8 0xf0 || -- last four bytes are a four-byte code point - wc 3 0xf0 0xe0 || -- last three bytes are a three-byte code point - wc 2 0xe0 0xc0 || -- last two bytes are a two-byte code point - w 1 (< 0x80) = Just len' -- last char is ASCII - | otherwise = Nothing -- no clue - where - w n test = len' >= n && test (index $ len' - n) - wc n mask word8 = w n $ (word8 ==) . (mask .&.) - wi n word8 = w n (>= word8) - isValidBS :: Int -> Int -> ByteString -> Bool +data Utf8DecodeState = Utf8DecodeState + (Maybe (Utf8CodePointState, Int)) + [ByteString] + Int + [Either (Text, Int) (ByteString, Int, Int, Int)] + Int + deriving (Show) + +-- | This represents the begining state of a UTF-8 decoding process. +-- +-- @since 2.0.2 +startUtf8State :: Utf8DecodeState +startUtf8State = Utf8DecodeState (Just (utf8StartState, 0)) [] 0 [] 0 + +-- | Takes whatever data has been decoded thus far and spits it out as a `Text` +-- value and a `Utf8DecodeState` value that no longer references the decoded +-- data. This function operates on error states, but does not clear the error. +-- (See 'recoverFromUtf8Error'.) +-- +-- @since 2.0.2 +outAvailableUtf8Text :: Utf8DecodeState -> (Text, Utf8DecodeState) +outAvailableUtf8Text (Utf8DecodeState mCpSt bss bs1Off dataStack tLen) = + if tLen > 0 + then runST $ do + dst <- A.new tLen + mapM_ (\ dat -> + case dat of + Left ((Text arr0 off utf8Len), dstOff) -> A.copyI utf8Len dst dstOff arr0 off + Right (bs, bsOff, utf8Len, dstOff) -> + withBS bs $ \ fp _ -> + unsafeIOToST . unsafeWithForeignPtr fp $ \ src -> + unsafeSTToIO $ A.copyFromPointer dst dstOff (src `plusPtr` bsOff) utf8Len + ) dataStack + arr <- A.unsafeFreeze dst + pure (Text arr 0 tLen, Utf8DecodeState mCpSt bss bs1Off [] 0) + else (empty, Utf8DecodeState mCpSt bss bs1Off dataStack tLen) + +getCodePointStateOrError :: Utf8DecodeState -> Either Int Int +getCodePointStateOrError (Utf8DecodeState mCpSt (bs1@(B.length -> len1) : bss') bs1Off _ _) = + case mCpSt of + Nothing -> + let (lenInit, _) = foldr + (\ bs@(B.length -> len') (lenInit', (lenN', _)) -> + (lenInit' + lenN', (len', bs))) (0, (len1 - bs1Off, bs1)) bss' + in + Left (bs1Off - lenInit) + Just (_, cpLen) -> Right cpLen +getCodePointStateOrError _ = Right 0 + +decodeUtf8Chunks :: Utf8DecodeState -> (Either Int Int, Utf8DecodeState) +decodeUtf8Chunks st@(Utf8DecodeState _ [] _ _ _) = (getCodePointStateOrError st, st) +decodeUtf8Chunks st@(Utf8DecodeState Nothing _ _ _ _) = (getCodePointStateOrError st, st) +decodeUtf8Chunks + st@(Utf8DecodeState + (Just (cpSt, cpPos)) + bss@(bs1@(B.length -> len1) : bss') + bs1Off + dataStack + tLen + ) + = + {- +bs1Off = 0 cpPos = Δbs1Off + | bs1Off | boundary = Δbs1Off + v v bsX v v + |. . bs1 . .|. .|. . .bsN. . .| + ^-----------^ ^-------------^ + len1 lenN + ^---^ ^-----------^ + len1_ isValidBS span + ^-------^ + lenInit + ^---------------------^ + len + --} + let len1_ = len1 - bs1Off -- the length of the trailing portion of the first bytesting that's to be evaluated. + (lenInit, (lenN, bsN)) = foldr + (\ bs@(B.length -> len') (lenInit', (lenN', _)) -> + (lenInit' + lenN', (len', bs))) (0, (len1_, bs1)) bss' + len = lenInit + lenN + in + if len == cpPos + then (getCodePointStateOrError st, st) + else + let index i = + if i < len1_ + then B.index bs1 (i + bs1Off) + else + let index' i' (bs@(B.length -> len0) : bss'') = + if i' < len0 + then B.index bs i' + else index' (i' - len0) bss'' + index' i' _ = B.index bsN i' + in + index' (i - len1_) bss' + guessUtf8Boundary + | wi 3 0xf0 = Just $ len - 3 -- third to last char starts a four-byte code point + | wi 2 0xe0 = Just $ len - 2 -- pre-last char starts a three-or-four-byte code point + | wi 1 0xc2 = Just $ len - 1 -- last char starts a two-(or more-)byte code point + | wc 4 0xf8 0xf0 || -- last four bytes are a four-byte code point + wc 3 0xf0 0xe0 || -- last three bytes are a three-byte code point + wc 2 0xe0 0xc0 || -- last two bytes are a two-byte code point + w 1 (< 0x80) = Just len -- last char is ASCII + | otherwise = Nothing -- no clue + where + w n test = len >= n && test (index $ len - n) + wc n mask word8 = w n $ (word8 ==) . (mask .&.) + wi n word8 = w n (>= word8) + -- queue the available valid data, trim the fat, and leave a spot for the code point state/error. + stackValidUtf8 wordCount mCps = + let bs1Off' = bs1Off + wordCount + (bs1Off''', bss'''', dataStack'') = + if wordCount < len1_ + then (bs1Off', bss, Right (bs1, bs1Off, wordCount, tLen) : dataStack) + else + let stackValidUtf8' wordCount' bs1Off'' tLen' bss''@(bs'@(B.length -> len') : bss''') dataStack' = + if wordCount' < len' + then (bs1Off'', bss'', Right (bs', 0, wordCount', tLen') : dataStack') + else stackValidUtf8' (wordCount' - len') (bs1Off'' - len') (tLen' + len') bss''' (Right (bs', 0, len', tLen') : dataStack') + stackValidUtf8' _ _ _ _ dataStack' = (0, [], dataStack') + in + stackValidUtf8' (wordCount - len1_) (bs1Off' - len1) (tLen + len1_) bss' (Right (bs1, bs1Off, len1_, tLen) : dataStack) + st' = Utf8DecodeState mCps bss'''' bs1Off''' dataStack'' $ tLen + wordCount + in + (getCodePointStateOrError st', st') + huntDownError off ndx cps = + if ndx < len + then + case updateUtf8State (index ndx) cps of + Just cps' -> + let ndx' = ndx + 1 in + huntDownError ( + if isUtf8StateIsComplete cps' + then ndx' + else off + ) ndx' cps' + Nothing -> stackValidUtf8 off Nothing + else stackValidUtf8 off $ Just (cps, ndx - off) + in + -- did we find the boundary? + case guessUtf8Boundary of + -- yes + Just boundary -> + -- are we before it? + if cpPos < boundary + -- yes: let's check this incomplete code point before checking the rest up to the boundary + then + let checkIncompleteCodePoint cpSt' cpPos' + -- a complete code point + | isUtf8StateIsComplete cpSt' = + let getEndState ndx cpSt'' + | ndx < len = + case updateUtf8State (index ndx) cpSt'' of + Nothing -> Nothing + Just cpSt''' -> getEndState (ndx + 1) cpSt''' + | otherwise = Just (cpSt'', ndx - boundary) + soFarSoGood = + stackValidUtf8 boundary $ getEndState boundary cpSt' + in + -- are we at the boundary? + if boundary == cpPos' + -- yes: get the state of the last code point + then soFarSoGood + -- no: + else + -- are we before bsN? + if cpPos' < lenInit + -- yes + then + -- keep walking the data until we get to bsN or an error + case updateUtf8State (index cpPos') cpSt' of + Just cpSt'' -> checkIncompleteCodePoint cpSt'' (cpPos' + 1) + Nothing -> (Left (-lenInit), Utf8DecodeState Nothing bss bs1Off dataStack tLen) + -- no: we're in bsN + else let + off = (if lenInit > 0 + then cpPos' - lenInit + else cpPos' + bs1Off) + in + -- is the rest of the bytestring valid utf-8 up to the boundary? + if ( #ifdef SIMDUTF - isValidBS off bLen bs = withBS bs $ \ fp _ -> unsafeDupablePerformIO $ - unsafeWithForeignPtr fp $ \ptr -> (/= 0) <$> c_is_valid_utf8 (ptr `plusPtr` off) (fromIntegral bLen) + withBS bsN $ \ fp _ -> unsafeDupablePerformIO $ + unsafeWithForeignPtr fp $ \ptr -> (/= 0) <$> + c_is_valid_utf8 (plusPtr ptr off) (fromIntegral $ boundary - cpPos') #elif MIN_VERSION_bytestring(0,11,2) - isValidBS off bLen = B.isValidUtf8 . B.take bLen . B.drop off + B.isValidUtf8 . B.take (boundary - cpPos') $ B.drop off bsN #else - isValidBS off bLen bs = start off - where - start ix - | ix >= off + bLen = True - | otherwise = case utf8DetectStart (B.unsafeIndex bs ix) of - Accept -> start (ix + 1) - Reject -> False - Incomplete st -> step (ix + 1) st - step ix st - | ix >= off + bLen = False - | otherwise = case utf8DetectContinue (B.unsafeIndex bs ix) st of - Accept -> start (ix + 1) - Reject -> False - Incomplete st' -> step (ix + 1) st' + let bLen = boundary - cpPos' + step ndx cps + | ndx < off + bLen = + case updateUtf8State (B.unsafeIndex bsN ndx) cps of + Just cps' -> step (ndx + 1) cps' + Nothing -> False + | otherwise = isUtf8StateIsComplete cps + in + step off utf8StartState #endif - bsToText bs count dst dstOff = - when (count > 0) . withBS bs $ \ fp _ -> - unsafeIOToST . unsafeWithForeignPtr fp $ \ src -> - unsafeSTToIO $ A.copyFromPointer dst dstOff src count - chunksToText utf8Len - | utf8Len > 0 = runST $ do - dst <- A.new utf8Len - if utf8Len > len1 then do - bsToText bs1 len1 dst 0 - bsToText bs2 (utf8Len - len1) dst len1 - else - bsToText bs1 utf8Len dst 0 - arr <- A.unsafeFreeze dst - pure $ Text arr 0 utf8Len - | otherwise = empty - decodeResult isErr off = - let off' = if isErr then off + 1 else off in - DecodeResult - (chunksToText off) - (if isErr then Just $ index off else Nothing) - (if off' < len1 - then B.drop off' bs1 `B.append` bs2 - else B.drop (off' - len1) bs2) - off' - countValidUtf8 i _ Reject = decodeResult True i - countValidUtf8 i i' (Incomplete a) - | i' < len = countValidUtf8 i (i' + 1) $ utf8DetectContinue (index i') a - | otherwise = decodeResult False i - countValidUtf8 _ i' Accept - | i' < len = countValidUtf8 i' (i' + 1) . utf8DetectStart $ index i' - | otherwise = decodeResult False i' - wrapUpBs1 off = countValidUtf8 off off Accept - wrapUpBs2 off = wrapUpBs1 $ - case guessUtf8Boundary len of - Just n -> if n > off && isValidBS (off - len1) (n - off) bs2 - then n - else off - _ -> off - in - case guessUtf8Boundary len1 of - Just n -> - let checkCodePointAccrossBoundary Reject _ = decodeResult True n - checkCodePointAccrossBoundary (Incomplete a) off - | off < len = - checkCodePointAccrossBoundary (utf8DetectContinue (index off) a) $ off + 1 - | otherwise = decodeResult False n - checkCodePointAccrossBoundary Accept off = wrapUpBs2 off - spanChunks - | n < len1 = - checkCodePointAccrossBoundary (utf8DetectStart $ index n) $ n + 1 - | otherwise = wrapUpBs2 len1 - in - if n == 0 || isValidBS 0 n bs1 - then spanChunks - else wrapUpBs1 0 - _ -> - (if len1 > 0 - then wrapUpBs1 - else wrapUpBs2) 0 - -data Progression - = WriteAndAdvance Char Int - | NeedMore - | Invalid - -decodeChunks :: (Bits w, Num w, Storable w) - => w - -> ((Int -> Word8) -> Int -> Int -> Progression) - -> ByteString - -> ByteString - -> DecodeResult Text ByteString w -decodeChunks w transcodeF bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do - marr <- A.new len' - outer marr len' 0 0 - where - wordByteSize = sizeOf w - - index :: Int -> Word8 - index i - | i < len1 = B.index bs1 i - | otherwise = B.index bs2 $ i - len1 - - len :: Int - len = len1 + len2 - len' :: Int - len' = (len `div` wordByteSize) + 4 - - outer dst dstLen = inner - where - inner srcOff dstOff - | srcOff < len - , len >= srcOff + wordByteSize = - if dstOff + 4 > dstLen - -- need more space in destination - then do - let dstLen' = dstLen + 4 - dst' <- A.resizeM dst dstLen' - outer dst' dstLen' srcOff dstOff - else - case transcodeF index len srcOff of - WriteAndAdvance c srcOff' -> do - d <- unsafeWrite dst dstOff c - inner srcOff' $ dstOff + d - NeedMore -> goodSoFar - Invalid -> - let srcOff' = srcOff + wordByteSize - bytesToWord n word - | n > 0 = bytesToWord (n - 1) $ (fromIntegral . index $ srcOff + wordByteSize - n) .|. (word `shiftL` 8) - | otherwise = word - in - wrapUp srcOff' . Just $ bytesToWord wordByteSize 0 - -- finished (for now) - | otherwise = goodSoFar - where - wrapUp off mW = do - A.shrinkM dst dstOff - arr <- A.unsafeFreeze dst - pure $ DecodeResult (Text arr 0 dstOff) mW (if off >= len1 - then B.drop (off - len1) bs2 - else B.drop off $ bs1 `B.append` bs2) off - goodSoFar = - wrapUp srcOff Nothing - -decodeChunksProxy :: (Bits w, Num w, Storable w) - => ((Int -> Word8) -> Int -> Int -> Progression) - -> ByteString - -> ByteString - -> DecodeResult Text ByteString w -decodeChunksProxy = decodeChunks undefined -- This allows Haskell to --- determine the size in bytes of a data type using Storable.sizeOf --- so that it doesn't have to be passed as an arugment. Storable.sizeOf --- discards the actual value without evaluating it. - --- | Decode two 'ByteString's containing UTF-16-encoded text as though --- they were one continuous 'ByteString' returning a 'DecodeResult'. + ) + -- Yes + then soFarSoGood + -- No + else huntDownError cpPos' cpPos' cpSt' + -- We're mid code point + | otherwise = + if cpPos' < len + then + -- try to complete the code point + case updateUtf8State (index cpPos') cpSt' of + Just cpSt'' -> checkIncompleteCodePoint cpSt'' (cpPos' + 1) + -- just enough additional data to find an error with the code point + Nothing -> (Left (-lenInit), Utf8DecodeState Nothing bss bs1Off dataStack tLen) + else + -- didn't get enough additional data to complete the code point + (Right cpPos', Utf8DecodeState (Just (cpSt', cpPos')) bss bs1Off dataStack tLen) + in + checkIncompleteCodePoint cpSt cpPos + -- no, we're past the boundary + else + -- the code point is the only thing that (potentially) changes + let getEndCodePointState cpPos' cpSt' + | cpPos' < len = + case updateUtf8State (index cpPos') cpSt' of + Nothing -> Nothing + Just cpSt'' -> getEndCodePointState (cpPos' + 1) cpSt'' + | otherwise = Just (cpSt', cpPos' - boundary) + mCpStLen = getEndCodePointState cpPos cpSt + in + ( case mCpStLen of + Nothing -> Left (-lenInit) + Just _ -> Right len + , Utf8DecodeState mCpStLen bss bs1Off dataStack tLen + ) + -- no: there's an error + Nothing -> huntDownError 0 cpPos cpSt + +-- | Decodes a UTF-8 'ByteString' in the context of what has already been +-- decoded which is represented by the 'Utf8DecodeState' value. Returned is the +-- new decode state and either ('Right') the number of 'Word8's that make up the +-- incomplete code point at the end of the input, or ('Left') the start position +-- of an invalid code point that was encountered. The position is relative to +-- the start of the input 'ByteString'. +-- +-- If the previous 'ByteString' ended with an incomplete code point, the +-- beginning of the input data will be treated as a continuation of the code +-- point. NOTE: That in this case if the input causes the previous incomplete +-- code point to be invalid, the returned error ('Left') position value will be +-- negative. +-- +-- If decoding the last 'ByteString' resulted in a error. The input is ignored, +-- and the state value is returned unchanged. Error states can be handled with +-- 'recoverFromUtf8Error'. -- -- @since 2.0.2 -decodeUtf16Chunks :: +decodeNextUtf8Chunk :: #if defined(ASSERTS) HasCallStack => #endif - Bool -- ^ Indicates whether the encoding is big-endian ('True') or little-endian ('False') - -> ByteString -- ^ The first 'ByteString' chunk to decode. Typically this is the undecoded data from the previous call of this function. - -> ByteString -- ^ The second 'ByteString' chunk to decode. - -> DecodeResult Text ByteString Word16 -decodeUtf16Chunks isBE bs1 bs2 = decodeChunksProxy (\ index len srcOff -> - -- get next Word8 pair - let writeAndAdvance c n = WriteAndAdvance c $ srcOff + n - b0 = index $ if isBE then srcOff else srcOff + 1 - b1 = index $ if isBE then srcOff + 1 else srcOff - in - case queryUtf16Bytes b0 b1 of - OneWord16 c -> writeAndAdvance c 2 - TwoWord16 g -> - if len - srcOff < 4 - -- not enough Word8s to finish the code point - then NeedMore - else - let b2 = index $ srcOff + (if isBE then 2 else 3) - b3 = index $ srcOff + (if isBE then 3 else 2) - in - case g b2 b3 of - Just c -> writeAndAdvance c 4 - _ -> Invalid - _ -> Invalid) bs1 bs2 - --- | Decode two 'ByteString's containing UTF-16-encoded text as though --- they were one continuous 'ByteString' returning a 'DecodeResult'. + ByteString -> Utf8DecodeState -> (Either Int Int, Utf8DecodeState) +decodeNextUtf8Chunk _ st@(Utf8DecodeState Nothing _ _ _ _) = (getCodePointStateOrError st, st) +decodeNextUtf8Chunk bs@(B.length -> len) st@(Utf8DecodeState mCpSt bss bs1Off dataStack tLen) + | len == 0 = (getCodePointStateOrError st, st) + | otherwise = decodeUtf8Chunks $ Utf8DecodeState mCpSt (bss ++ [bs]) bs1Off dataStack tLen + +-- | Decodes a 'ByteString' from a clean state. +-- +-- @decodeUtf8Chunk bs = 'decodeNextUtf8Chunk' bs 'startUtf8State'@ -- -- @since 2.0.2 -decodeUtf32Chunks :: +decodeUtf8Chunk :: ByteString -> (Either Int Int, Utf8DecodeState) +decodeUtf8Chunk = flip decodeNextUtf8Chunk startUtf8State + +-- | If the 'Utf8DecodeState' value indicates an error state, the 'Word8' that +-- the state value point to is replaced with the input 'Text' value which may +-- be empty. Decoding resumes after the text is inserted and produces the result +-- described by 'decodeNextUtf8Chunk'. +-- +-- If not in an error state, the 'Text' is inserted at the end of the data, but +-- before an incomplete code point at the end of the last input 'ByteString'. +-- +-- @since 2.0.2 +recoverFromUtf8Error :: #if defined(ASSERTS) HasCallStack => #endif - Bool -- ^ Indicates whether the encoding is big-endian ('True') or little-endian ('False') - -> ByteString -- ^ The first 'ByteString' chunk to decode. Typically this is the undecoded data from the previous call of this function. - -> ByteString -- ^ The second 'ByteString' chunk to decode. - -> DecodeResult Text ByteString Word32 -decodeUtf32Chunks isBE bs1 bs2 = decodeChunksProxy (\ index _ srcOff -> - -- get next Word8 quartet - case queryUtf32Bytes (index $ if isBE then srcOff else srcOff + 3) - (index $ srcOff + (if isBE then 1 else 2)) - (index $ srcOff + (if isBE then 2 else 1)) - (index $ if isBE then srcOff + 3 else srcOff) of - Just c -> WriteAndAdvance c $ srcOff + 4 - _ -> Invalid) bs1 bs2 + Text -> Utf8DecodeState -> (Either Int Int, Utf8DecodeState) +recoverFromUtf8Error t@(Text _ _ utf8Len) (Utf8DecodeState mCpSt bss@((B.length -> len) : bss') bs1Off dataStack tLen) = + let dammit mCpSt' bss'' bs1Off' = decodeUtf8Chunks . Utf8DecodeState mCpSt' bss'' bs1Off' + ( if utf8Len > 0 + then Left (t, tLen) : dataStack + else dataStack + ) $ tLen + utf8Len + in + case mCpSt of + Nothing -> + let g = dammit (Just (utf8StartState, 0)) + bs1Off' = bs1Off + 1 + in + if bs1Off' == len + then g bss' 0 + else g bss bs1Off' + Just _ -> dammit mCpSt bss bs1Off +recoverFromUtf8Error t@(Text _ _ utf8Len) (Utf8DecodeState _ _ _ dataStack tLen) = + decodeUtf8Chunks $ Utf8DecodeState + (Just (utf8StartState, 0)) + [] + 0 + (if utf8Len > 0 + then Left (t, tLen) : dataStack + else dataStack + ) $ tLen + utf8Len -- | Decode a 'ByteString' containing UTF-8 encoded text. -- @@ -468,17 +546,46 @@ decodeUtf8With :: HasCallStack => #endif OnDecodeError -> ByteString -> Text -decodeUtf8With onErr bs = case streamDecodeUtf8With onErr bs of - Some t undecoded _ -> codePointToInvalid undecoded t +decodeUtf8With onErr bs + | B.null undecoded = txt + | otherwise = txt `append` (case onErr desc (Just (B.head undecoded)) of + Nothing -> txt' + Just c -> T.singleton c `append` txt') where - codePointToInvalid bs' txt = - case B.uncons bs' of - Just (x, bs'') -> codePointToInvalid bs'' $ case onErr desc $ Just x of - Just c -> append txt $ T.singleton c - _ -> txt - _ -> txt + (txt, undecoded) = decodeUtf8With2 onErr mempty bs + txt' = decodeUtf8With onErr (B.tail undecoded) + desc = "Data.Text.Internal.Encoding: Invalid UTF-8 stream" - desc = "Data.Text.Internal.Encoding: Incomplete UTF-8 code point" +-- | Decode two consecutive bytestrings, returning Text and undecoded remainder. +decodeUtf8With2 :: +#if defined(ASSERTS) + HasCallStack => +#endif + OnDecodeError -> ByteString -> ByteString -> (Text, ByteString) +decodeUtf8With2 onErr bs1@(B.length -> len1) bs2@(B.length -> len2) = + let g res isSecondBs = + case res of + ((Left pos), st) -> + g ( recoverFromUtf8Error + ( case onErr "Data.Text.Internal.Encoding: Invalid UTF-8 code point" . Just $ + if pos >= 0 + then B.index (if isSecondBs then bs2 else bs1) pos + else B.index bs1 (len1 + pos) of + Just c -> T.singleton c + Nothing -> empty + ) st + ) isSecondBs + ((Right cpLen), st) -> + if isSecondBs + then + ( fst $ outAvailableUtf8Text st + , if cpLen > len2 + then B.drop (len1 + len2 - cpLen) bs1 `B.append` bs2 + else B.drop (len2 - cpLen) bs2 + ) + else g (decodeNextUtf8Chunk bs2 st) True + in + g (decodeUtf8Chunk bs1) False -- $stream -- @@ -561,7 +668,7 @@ streamDecodeUtf8 :: ByteString -> Decoding streamDecodeUtf8 = streamDecodeUtf8With strictDecode --- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8 +-- | Decode, in a stream oriented way, a lazy 'ByteString' containing UTF-8 -- encoded text. -- -- @since 1.0.0.0 @@ -570,18 +677,11 @@ streamDecodeUtf8With :: HasCallStack => #endif OnDecodeError -> ByteString -> Decoding -streamDecodeUtf8With onErr = g empty mempty +streamDecodeUtf8With onErr = go mempty where - g t bs0 bs1 = - case decodeUtf8Chunks bs0 bs1 of - DecodeResult t' mW bs1' _ -> - case t `append` t' of - txt -> (case mW of - Just _ -> - g (case onErr "Data.Text.Internal.Encoding: Invalid UTF-8 stream" mW of - Just c -> txt `append` T.singleton c - _ -> txt) mempty - _ -> Some txt bs1' . g empty) bs1' + go bs1 bs2 = Some txt undecoded (go undecoded) + where + (txt, undecoded) = decodeUtf8With2 onErr bs1 bs2 -- | Decode a 'ByteString' containing UTF-8 encoded text that is known -- to be valid. @@ -783,3 +883,6 @@ encodeUtf32LE txt = E.unstream (E.restreamUtf32LE (F.stream txt)) encodeUtf32BE :: Text -> ByteString encodeUtf32BE txt = E.unstream (E.restreamUtf32BE (F.stream txt)) {-# INLINE encodeUtf32BE #-} + +cSizeToInt :: CSize -> Int +cSizeToInt = fromIntegral diff --git a/src/Data/Text/Encoding/Common.hs b/src/Data/Text/Encoding/Common.hs deleted file mode 100644 index 0f3bd2db..00000000 --- a/src/Data/Text/Encoding/Common.hs +++ /dev/null @@ -1,137 +0,0 @@ -{-# LANGUAGE CPP, DeriveDataTypeable #-} -{-# LANGUAGE Safe #-} --- | --- Module : Data.Text.Encoding.Common --- Copyright : (c) Bryan O'Sullivan 2009 --- --- License : BSD-style --- Maintainer : bos@serpentine.com --- Portability : GHC --- --- Common functions and types for both lazy and strict encoding and --- decoding including error handling. --- --- The standard functions for encoding and decoding text are strict, --- which is to say that they throw exceptions on invalid input. This --- is often unhelpful on real world input, so alternative functions --- exist that accept custom handlers for dealing with invalid inputs. --- These 'OnError' handlers are normal Haskell functions. You can use --- one of the presupplied functions in this module, or you can write a --- custom handler of your own. - -module Data.Text.Encoding.Common - ( - -- * Full-service result type - DecodeResult(..) - -- * Error handling types - , UnicodeException(..) - , OnError - , OnDecodeError - , OnEncodeError - -- * Useful error handling functions - , lenientDecode - , strictDecode - , strictEncode - , ignore - , replace - ) - where - -import Control.DeepSeq (NFData (..)) -import Control.Exception (Exception, throw) -import Data.Typeable (Typeable) -import Data.Word (Word8) -import Numeric (showHex) - --- | A decoding result on encoded data. It contains: --- --- 1. the decoded data up to an incomplete code point at the end of --- the input data, an invalid word, or to the end of the input, --- 2. the first word of an invalid code point if one was encountered, --- 3. the remaining undecoded data, which is either an incomplete --- code point, the data after the invalid code point, or empty, --- and --- 4. the byte position of remaining undecoded data. --- --- @since 2.0.2 -data DecodeResult t b w = DecodeResult !t !(Maybe w) !b !Int - deriving (Eq, Ord, Show, Read) - --- | Function type for handling a coding error. It is supplied with --- two inputs: --- --- * A 'String' that describes the error. --- --- * The input value that caused the error. If the error arose --- because the end of input was reached or could not be identified --- precisely, this value will be 'Nothing'. --- --- If the handler returns a value wrapped with 'Just', that value will --- be used in the output as the replacement for the invalid input. If --- it returns 'Nothing', no value will be used in the output. --- --- Should the handler need to abort processing, it should use 'error' --- or 'throw' an exception (preferably a 'UnicodeException'). It may --- use the description provided to construct a more helpful error --- report. -type OnError a b = String -> Maybe a -> Maybe b - --- | A handler for a decoding error. -type OnDecodeError = OnError Word8 Char - --- | A handler for an encoding error. -{-# DEPRECATED OnEncodeError "This exception is never used in practice, and will be removed." #-} -type OnEncodeError = OnError Char Word8 - --- | An exception type for representing Unicode encoding errors. -data UnicodeException = - DecodeError String (Maybe Word8) - -- ^ Could not decode a byte sequence because it was invalid under - -- the given encoding, or ran out of input in mid-decode. - | EncodeError String (Maybe Char) - -- ^ Tried to encode a character that could not be represented - -- under the given encoding, or ran out of input in mid-encode. - deriving (Eq, Typeable) - -{-# DEPRECATED EncodeError "This constructor is never used, and will be removed." #-} - -showUnicodeException :: UnicodeException -> String -showUnicodeException (DecodeError desc (Just w)) - = "Cannot decode byte '\\x" ++ showHex w ("': " ++ desc) -showUnicodeException (DecodeError desc Nothing) - = "Cannot decode input: " ++ desc -showUnicodeException (EncodeError desc (Just c)) - = "Cannot encode character '\\x" ++ showHex (fromEnum c) ("': " ++ desc) -showUnicodeException (EncodeError desc Nothing) - = "Cannot encode input: " ++ desc - -instance Show UnicodeException where - show = showUnicodeException - -instance Exception UnicodeException - -instance NFData UnicodeException where - rnf (DecodeError desc w) = rnf desc `seq` rnf w `seq` () - rnf (EncodeError desc c) = rnf desc `seq` rnf c `seq` () - --- | Throw a 'UnicodeException' if decoding fails. -strictDecode :: OnDecodeError -strictDecode desc c = throw (DecodeError desc c) - --- | Replace an invalid input byte with the Unicode replacement --- character U+FFFD. -lenientDecode :: OnDecodeError -lenientDecode _ _ = Just '\xfffd' - --- | Throw a 'UnicodeException' if encoding fails. -{-# DEPRECATED strictEncode "This function always throws an exception, and will be removed." #-} -strictEncode :: OnEncodeError -strictEncode desc c = throw (EncodeError desc c) - --- | Ignore an invalid input, substituting nothing in the output. -ignore :: OnError a b -ignore _ _ = Nothing - --- | Replace an invalid input with a valid output. -replace :: b -> OnError a b -replace c _ _ = Just c diff --git a/src/Data/Text/Encoding/Error.hs b/src/Data/Text/Encoding/Error.hs index 83fa3a76..ea9e0997 100644 --- a/src/Data/Text/Encoding/Error.hs +++ b/src/Data/Text/Encoding/Error.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE CPP, DeriveDataTypeable #-} {-# LANGUAGE Safe #-} -- | -- Module : Data.Text.Encoding.Error @@ -33,14 +34,87 @@ module Data.Text.Encoding.Error , replace ) where -import Data.Text.Encoding.Common - ( UnicodeException(..) - , OnError - , OnDecodeError - , OnEncodeError - , lenientDecode - , strictDecode - , strictEncode - , ignore - , replace - ) +import Control.DeepSeq (NFData (..)) +import Control.Exception (Exception, throw) +import Data.Typeable (Typeable) +import Data.Word (Word8) +import Numeric (showHex) + +-- | Function type for handling a coding error. It is supplied with +-- two inputs: +-- +-- * A 'String' that describes the error. +-- +-- * The input value that caused the error. If the error arose +-- because the end of input was reached or could not be identified +-- precisely, this value will be 'Nothing'. +-- +-- If the handler returns a value wrapped with 'Just', that value will +-- be used in the output as the replacement for the invalid input. If +-- it returns 'Nothing', no value will be used in the output. +-- +-- Should the handler need to abort processing, it should use 'error' +-- or 'throw' an exception (preferably a 'UnicodeException'). It may +-- use the description provided to construct a more helpful error +-- report. +type OnError a b = String -> Maybe a -> Maybe b + +-- | A handler for a decoding error. +type OnDecodeError = OnError Word8 Char + +-- | A handler for an encoding error. +{-# DEPRECATED OnEncodeError "This exception is never used in practice, and will be removed." #-} +type OnEncodeError = OnError Char Word8 + +-- | An exception type for representing Unicode encoding errors. +data UnicodeException = + DecodeError String (Maybe Word8) + -- ^ Could not decode a byte sequence because it was invalid under + -- the given encoding, or ran out of input in mid-decode. + | EncodeError String (Maybe Char) + -- ^ Tried to encode a character that could not be represented + -- under the given encoding, or ran out of input in mid-encode. + deriving (Eq, Typeable) + +{-# DEPRECATED EncodeError "This constructor is never used, and will be removed." #-} + +showUnicodeException :: UnicodeException -> String +showUnicodeException (DecodeError desc (Just w)) + = "Cannot decode byte '\\x" ++ showHex w ("': " ++ desc) +showUnicodeException (DecodeError desc Nothing) + = "Cannot decode input: " ++ desc +showUnicodeException (EncodeError desc (Just c)) + = "Cannot encode character '\\x" ++ showHex (fromEnum c) ("': " ++ desc) +showUnicodeException (EncodeError desc Nothing) + = "Cannot encode input: " ++ desc + +instance Show UnicodeException where + show = showUnicodeException + +instance Exception UnicodeException + +instance NFData UnicodeException where + rnf (DecodeError desc w) = rnf desc `seq` rnf w `seq` () + rnf (EncodeError desc c) = rnf desc `seq` rnf c `seq` () + +-- | Throw a 'UnicodeException' if decoding fails. +strictDecode :: OnDecodeError +strictDecode desc c = throw (DecodeError desc c) + +-- | Replace an invalid input byte with the Unicode replacement +-- character U+FFFD. +lenientDecode :: OnDecodeError +lenientDecode _ _ = Just '\xfffd' + +-- | Throw a 'UnicodeException' if encoding fails. +{-# DEPRECATED strictEncode "This function always throws an exception, and will be removed." #-} +strictEncode :: OnEncodeError +strictEncode desc c = throw (EncodeError desc c) + +-- | Ignore an invalid input, substituting nothing in the output. +ignore :: OnError a b +ignore _ _ = Nothing + +-- | Replace an invalid input with a valid output. +replace :: b -> OnError a b +replace c _ _ = Just c diff --git a/src/Data/Text/Internal/Encoding/Fusion.hs b/src/Data/Text/Internal/Encoding/Fusion.hs index aa3019ab..aa8f0d02 100644 --- a/src/Data/Text/Internal/Encoding/Fusion.hs +++ b/src/Data/Text/Internal/Encoding/Fusion.hs @@ -41,7 +41,7 @@ import Data.Bits (shiftL, shiftR) import Data.ByteString.Internal (ByteString(..), mallocByteString, memcpy) import Data.Text.Internal.Fusion (Step(..), Stream(..)) import Data.Text.Internal.Fusion.Size -import Data.Text.Encoding.Common +import Data.Text.Encoding.Error import Data.Text.Internal.Encoding.Fusion.Common import Data.Text.Internal.Unsafe.Char (unsafeChr8, unsafeChr16, unsafeChr32) import Data.Text.Internal.Unsafe (unsafeWithForeignPtr) diff --git a/src/Data/Text/Internal/Encoding/Utf16.hs b/src/Data/Text/Internal/Encoding/Utf16.hs index 45b0eeb3..4fe11a62 100644 --- a/src/Data/Text/Internal/Encoding/Utf16.hs +++ b/src/Data/Text/Internal/Encoding/Utf16.hs @@ -23,17 +23,14 @@ module Data.Text.Internal.Encoding.Utf16 chr2 , validate1 , validate2 - , Utf16Result(..) - , queryUtf16Bytes ) where -import Data.Bits ((.&.)) import GHC.Exts -import GHC.Word (Word16(..), Word8(..)) +import GHC.Word (Word16(..)) #if !MIN_VERSION_base(4,16,0) -- harmless to import, except for warnings that it is unused. -import Data.Text.Internal.PrimCompat ( word16ToWord#, word8ToWord# ) +import Data.Text.Internal.PrimCompat ( word16ToWord# ) #endif chr2 :: Word16 -> Word16 -> Char @@ -53,30 +50,3 @@ validate2 :: Word16 -> Word16 -> Bool validate2 x1 x2 = x1 >= 0xD800 && x1 <= 0xDBFF && x2 >= 0xDC00 && x2 <= 0xDFFF {-# INLINE validate2 #-} - -data Utf16Result - = OneWord16 Char - | TwoWord16 (Word8 -> Word8 -> Maybe Char) - | Invalid16 - -queryUtf16Bytes :: Word8 -> Word8 -> Utf16Result -queryUtf16Bytes b0@(W8# w0#) (W8# w1#) - | b0 < 0xD8 || b0 >= 0xE0 = OneWord16 $ C# (chr# (orI# (word2Int# (shiftL# (word8ToWord# w0#) 8#)) (word2Int# (word8ToWord# w1#)))) - -- 110110xx: start of surrogate pair - | b0 .&. 0xFC == 0xD8 = TwoWord16 $ \ b2@(W8# w2#) (W8# w3#) -> - if b2 .&. 0xFC == 0xDC - -- valid surrogate - then Just $ - C# (chr# ( - (orI# - (orI# - (orI# - (word2Int# (shiftL# (int2Word# (andI# 0x3# (word2Int# (word8ToWord# w0#)))) 18#)) - (word2Int# (shiftL# (word8ToWord# w1#) 10#)) - ) - (word2Int# (shiftL# (int2Word# (andI# 0x3# (word2Int# (word8ToWord# w2#)))) 8#))) - (word2Int# (word8ToWord# w3#))) +# 0x10000# - )) - else Nothing - | otherwise = Invalid16 -{-# INLINE queryUtf16Bytes #-} diff --git a/src/Data/Text/Internal/Encoding/Utf32.hs b/src/Data/Text/Internal/Encoding/Utf32.hs index 40a0e25d..4e8e9b46 100644 --- a/src/Data/Text/Internal/Encoding/Utf32.hs +++ b/src/Data/Text/Internal/Encoding/Utf32.hs @@ -1,5 +1,3 @@ -{-# LANGUAGE CPP, MagicHash #-} - -- | -- Module : Data.Text.Internal.Encoding.Utf32 -- Copyright : (c) 2008, 2009 Tom Harper, @@ -19,36 +17,10 @@ module Data.Text.Internal.Encoding.Utf32 ( validate - , queryUtf32Bytes ) where import Data.Word (Word32) -import GHC.Exts -import GHC.Word (Word8(..)) - -#if !MIN_VERSION_base(4,16,0) --- harmless to import, except for warnings that it is unused. -import Data.Text.Internal.PrimCompat (word8ToWord#) -#endif validate :: Word32 -> Bool validate x1 = x1 < 0xD800 || (x1 > 0xDFFF && x1 <= 0x10FFFF) {-# INLINE validate #-} - -queryUtf32Bytes :: (Eq a, Num a) => a -> Word8 -> Word8 -> Word8 -> Maybe Char -queryUtf32Bytes b0 b1@(W8# w1#) b2@(W8# w2#) (W8# w3#) - | b0 == 0 - , b1 < 0x11 - , b1 > 0 || b2 < 0xD8 || b2 >= 0xE0 = - Just $ - C# (chr# - (orI# - (orI# - (word2Int# (shiftL# (word8ToWord# w1#) 16#)) - (word2Int# (shiftL# (word8ToWord# w2#) 8#)) - ) - (word2Int# (word8ToWord# w3#)) - ) - ) - | otherwise = Nothing -{-# INLINE queryUtf32Bytes #-} diff --git a/src/Data/Text/Internal/Encoding/Utf8.hs b/src/Data/Text/Internal/Encoding/Utf8.hs index 17ee7cfb..70f518df 100644 --- a/src/Data/Text/Internal/Encoding/Utf8.hs +++ b/src/Data/Text/Internal/Encoding/Utf8.hs @@ -34,10 +34,10 @@ module Data.Text.Internal.Encoding.Utf8 , validate3 , validate4 -- * Naive decoding - , DetectUtf8Result(..) - , DetectState(..) - , utf8DetectStart - , utf8DetectContinue + , Utf8CodePointState + , utf8StartState + , updateUtf8State + , isUtf8StateIsComplete ) where #if defined(ASSERTS) @@ -242,17 +242,14 @@ byteToClass n = ByteClass (W8# el#) table# :: Addr# table# = "\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\b\b\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\n\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\EOT\ETX\ETX\v\ACK\ACK\ACK\ENQ\b\b\b\b\b\b\b\b\b\b\b"# -newtype DetectState = DetectState Word8 +newtype Utf8CodePointState = Utf8CodePointState Word8 deriving (Eq, Ord, Show, Read) -utf8AcceptState :: DetectState -utf8AcceptState = DetectState 0 +utf8StartState :: Utf8CodePointState +utf8StartState = Utf8CodePointState 0 -utf8RejectState :: DetectState -utf8RejectState = DetectState 12 - -updateState :: ByteClass -> DetectState -> DetectState -updateState (ByteClass c) (DetectState s) = DetectState (W8# el#) +transitionUtf8State :: ByteClass -> Utf8CodePointState -> Utf8CodePointState +transitionUtf8State (ByteClass c) (Utf8CodePointState s) = Utf8CodePointState (W8# el#) where !(I# n#) = word8ToInt (c + s) el# = indexWord8OffAddr# table# n# @@ -260,29 +257,12 @@ updateState (ByteClass c) (DetectState s) = DetectState (W8# el#) table# :: Addr# table# = "\NUL\f\CAN$<`T\f\f\f0H\f\f\f\f\f\f\f\f\f\f\f\f\f\NUL\f\f\f\f\f\NUL\f\NUL\f\f\f\CAN\f\f\f\f\f\CAN\f\CAN\f\f\f\f\f\f\f\f\f\CAN\f\f\f\f\f\CAN\f\f\f\f\f\f\f\CAN\f\f\f\f\f\f\f\f\f$\f$\f\f\f$\f\f\f\f\f$\f$\f\f\f$\f\f\f\f\f\f\f\f\f\f"# --- | @since 2.0 -data DetectUtf8Result - = Accept - | Incomplete !DetectState - | Reject - deriving (Eq, Ord, Show, Read) - --- | @since 2.0 -utf8DetectStart :: Word8 -> DetectUtf8Result -utf8DetectStart !w - | st == utf8AcceptState = Accept - | st == utf8RejectState = Reject - | otherwise = Incomplete st - where - cl = byteToClass w - st = updateState cl utf8AcceptState +updateUtf8State :: Word8 -> Utf8CodePointState -> Maybe Utf8CodePointState +updateUtf8State w st = case transitionUtf8State (byteToClass w) st of + Utf8CodePointState 12 -> Nothing + st' -> Just st' --- | @since 2.0 -utf8DetectContinue :: Word8 -> DetectState -> DetectUtf8Result -utf8DetectContinue !w !st - | st' == utf8AcceptState = Accept - | st' == utf8RejectState = Reject - | otherwise = Incomplete st' - where - cl = byteToClass w - st' = updateState cl st +isUtf8StateIsComplete :: Utf8CodePointState -> Bool +isUtf8StateIsComplete (Utf8CodePointState s) + | s == 0 = True + | otherwise = False diff --git a/src/Data/Text/Internal/Lazy/Encoding/Fusion.hs b/src/Data/Text/Internal/Lazy/Encoding/Fusion.hs index 5ebad0ad..69149779 100644 --- a/src/Data/Text/Internal/Lazy/Encoding/Fusion.hs +++ b/src/Data/Text/Internal/Lazy/Encoding/Fusion.hs @@ -38,7 +38,7 @@ import qualified Data.ByteString as B import qualified Data.ByteString.Unsafe as B import Data.Text.Internal.ByteStringCompat import Data.Text.Internal.Encoding.Fusion.Common -import Data.Text.Encoding.Common +import Data.Text.Encoding.Error import Data.Text.Internal.Fusion (Step(..), Stream(..)) import Data.Text.Internal.Fusion.Size import Data.Text.Internal.Unsafe.Char (unsafeChr8, unsafeChr16, unsafeChr32) diff --git a/src/Data/Text/Lazy/Encoding.hs b/src/Data/Text/Lazy/Encoding.hs index 91f36dc8..ad361af5 100644 --- a/src/Data/Text/Lazy/Encoding.hs +++ b/src/Data/Text/Lazy/Encoding.hs @@ -23,11 +23,6 @@ module Data.Text.Lazy.Encoding -- ** Total Functions #total# -- $total decodeLatin1 - , decodeAsciiPrefix - , DecodeResult(..) - , decodeUtf8Chunks - , decodeUtf16Chunks - , decodeUtf32Chunks -- *** Catchable failure , decodeUtf8' @@ -62,9 +57,9 @@ module Data.Text.Lazy.Encoding import Control.Exception (evaluate, try) import Data.Monoid (Monoid(..)) -import Data.Text.Encoding.Common (DecodeResult(..), OnDecodeError, UnicodeException, strictDecode) +import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode) import Data.Text.Internal.Lazy (Text(..), chunk, empty, foldrChunks) -import Data.Word (Word16, Word32, Word8) +import Data.Word (Word8) import qualified Data.ByteString as S import qualified Data.ByteString.Builder as B import qualified Data.ByteString.Builder.Prim as BP @@ -100,84 +95,16 @@ import Data.Text.Unsafe (unsafeDupablePerformIO) -- (preferably not at all). See "Data.Text.Lazy.Encoding#g:total" for better -- solutions. --- | Decode a 'B.ByteString' containing 7-bit ASCII +-- | Decode a 'ByteString' containing 7-bit ASCII -- encoded text. decodeASCII :: B.ByteString -> Text decodeASCII = foldr (chunk . TE.decodeASCII) empty . B.toChunks --- | Decode a 'B.ByteString' containing Latin-1 (aka ISO-8859-1) encoded text. +-- | Decode a 'ByteString' containing Latin-1 (aka ISO-8859-1) encoded text. decodeLatin1 :: B.ByteString -> Text decodeLatin1 = foldr (chunk . TE.decodeLatin1) empty . B.toChunks --- | Decode a 'B.ByteString' containing ASCII. --- --- This is a total function. The 'B.ByteString' is decoded until either --- the end is reached or it errors with the first non-ASCII 'Word8' is --- encountered. In either case the function will return what 'Text' was --- decoded. On error, the non-ASCII 'Word8' is also returned followed --- by the rest of the undecoded 'B.ByteString'. --- --- @since 2.0.2 -decodeAsciiPrefix - :: B.ByteString - -> (Text, Maybe (Word8, B.ByteString)) -decodeAsciiPrefix = g id - where - g tDiff (B.Chunk sb lb) = - case TE.decodeAsciiPrefix sb of - (t, Nothing) -> g (tDiff . chunk t) lb - (t, Just (w, sb')) -> (tDiff $ chunk t Empty, Just (w, B.chunk sb' lb)) - g tDiff _ = (tDiff Empty, Nothing) - -decodeChunks :: (S.ByteString -> S.ByteString -> DecodeResult T.Text S.ByteString w) - -> B.ByteString - -> B.ByteString - -> DecodeResult Text B.ByteString w -decodeChunks decoder = g id 0 mempty - where - g tDiff pos sb0 (B.Chunk sb1 lb1) lb2 = - let DecodeResult t mW sb1' pos' = decoder sb0 sb1 - pos1 = pos + pos' - in - case mW of - Just _ -> DecodeResult (tDiff $ chunk t Empty) mW (B.chunk sb1' $ lb1 `B.append` lb2) pos1 - _ -> g (tDiff . chunk t) pos1 sb1' lb1 lb2 - g tDiff pos sb0 _ (B.Chunk sb1 lb1) = g tDiff pos sb0 (B.Chunk sb1 lb1) mempty - g tDiff pos sb0 _ _ = DecodeResult (tDiff Empty) Nothing (B.chunk sb0 mempty) pos - --- | Decode two 'B.ByteString's containing UTF-8-encoded text as though --- they were one continuous 'B.ByteString' returning a 'DecodeResult'. --- --- @since 2.0.2 -decodeUtf8Chunks - :: B.ByteString -- ^ The first 'B.ByteString' chunk to decode. Typically this is the undecoded data from the previous call of this function. - -> B.ByteString -- ^ The second 'B.ByteString' chunk to decode. - -> DecodeResult Text B.ByteString Word8 -decodeUtf8Chunks = decodeChunks TE.decodeUtf8Chunks - --- | Decode two 'B.ByteString's containing UTF-16-encoded text as though --- they were one continuous 'B.ByteString' returning a 'DecodeResult'. --- --- @since 2.0.2 -decodeUtf16Chunks - :: Bool -- ^ Indicates whether the encoding is big-endian ('True') or little-endian ('False') - -> B.ByteString -- ^ The first 'B.ByteString' chunk to decode. Typically this is the undecoded data from the previous call of this function. - -> B.ByteString -- ^ The second 'B.ByteString' chunk to decode. - -> DecodeResult Text B.ByteString Word16 -decodeUtf16Chunks = decodeChunks . TE.decodeUtf16Chunks - --- | Decode two 'B.ByteString's containing UTF-32-encoded text as though --- they were one continuous 'B.ByteString' returning a 'DecodeResult'. --- --- @since 2.0.2 -decodeUtf32Chunks - :: Bool -- ^ Indicates whether the encoding is big-endian ('True') or little-endian ('False') - -> B.ByteString -- ^ The first 'B.ByteString' chunk to decode. Typically this is the undecoded data from the previous call of this function. - -> B.ByteString -- ^ The second 'B.ByteString' chunk to decode. - -> DecodeResult Text B.ByteString Word32 -decodeUtf32Chunks = decodeChunks . TE.decodeUtf32Chunks - --- | Decode a 'B.ByteString' containing UTF-8 encoded text. +-- | Decode a 'ByteString' containing UTF-8 encoded text. decodeUtf8With :: OnDecodeError -> B.ByteString -> Text decodeUtf8With onErr (B.Chunk b0 bs0) = case TE.streamDecodeUtf8With onErr b0 of @@ -198,7 +125,7 @@ decodeUtf8With onErr (B.Chunk b0 bs0) = desc = "Data.Text.Lazy.Encoding.decodeUtf8With: Invalid UTF-8 stream" decodeUtf8With _ _ = empty --- | Decode a 'B.ByteString' containing UTF-8 encoded text that is known +-- | Decode a 'ByteString' containing UTF-8 encoded text that is known -- to be valid. -- -- If the input contains any invalid UTF-8 data, an exception will be @@ -209,7 +136,7 @@ decodeUtf8 :: B.ByteString -> Text decodeUtf8 = decodeUtf8With strictDecode {-# INLINE[0] decodeUtf8 #-} --- | Decode a 'B.ByteString' containing UTF-8 encoded text.. +-- | Decode a 'ByteString' containing UTF-8 encoded text.. -- -- If the input contains any invalid UTF-8 data, the relevant -- exception will be returned, otherwise the decoded text. diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs index bf4d6589..0fabf62e 100644 --- a/tests/Tests/Properties/Transcoding.hs +++ b/tests/Tests/Properties/Transcoding.hs @@ -23,7 +23,7 @@ import qualified Data.ByteString.Lazy as BL import qualified Data.ByteString.Lazy.Char8 as BLC import qualified Data.Text as T import qualified Data.Text.Encoding as E -import qualified Data.Text.Encoding.Common as C +import qualified Data.Text.Encoding.Error as E import qualified Data.Text.Lazy as TL import qualified Data.Text.Lazy.Encoding as EL @@ -37,46 +37,16 @@ tl_latin1 = EL.decodeLatin1 `eq` (TL.pack . BLC.unpack) t_utf8 = (E.decodeUtf8 . E.encodeUtf8) `eq` id t_utf8' = (E.decodeUtf8' . E.encodeUtf8) `eq` (id . Right) -t_utf8_c = (\ t -> - let E.DecodeResult t' mC bs _ = E.decodeUtf8Chunks mempty $ E.encodeUtf8 t in - (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) tl_utf8 = (EL.decodeUtf8 . EL.encodeUtf8) `eq` id tl_utf8' = (EL.decodeUtf8' . EL.encodeUtf8) `eq` (id . Right) -tl_utf8_c = (\ t -> - let E.DecodeResult t' mC bs _ = EL.decodeUtf8Chunks mempty $ EL.encodeUtf8 t in - (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) t_utf16LE = (E.decodeUtf16LE . E.encodeUtf16LE) `eq` id -t_utf16LE_c = (\ t -> - let E.DecodeResult t' mC bs _ = E.decodeUtf16Chunks False mempty $ E.encodeUtf16LE t in - (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) tl_utf16LE = (EL.decodeUtf16LE . EL.encodeUtf16LE) `eq` id -tl_utf16LE_c = (\ t -> - let E.DecodeResult t' mC bs _ = EL.decodeUtf16Chunks False mempty $ EL.encodeUtf16LE t in - (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) t_utf16BE = (E.decodeUtf16BE . E.encodeUtf16BE) `eq` id -t_utf16BE_c = (\ t -> - let E.DecodeResult t' mC bs _ = E.decodeUtf16Chunks True mempty $ E.encodeUtf16BE t in - (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) tl_utf16BE = (EL.decodeUtf16BE . EL.encodeUtf16BE) `eq` id -tl_utf16BE_c = (\ t -> - let E.DecodeResult t' mC bs _ = EL.decodeUtf16Chunks True mempty $ EL.encodeUtf16BE t in - (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) t_utf32LE = (E.decodeUtf32LE . E.encodeUtf32LE) `eq` id -t_utf32LE_c = (\ t -> - let E.DecodeResult t' mC bs _ = E.decodeUtf32Chunks False mempty $ E.encodeUtf32LE t in - (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) tl_utf32LE = (EL.decodeUtf32LE . EL.encodeUtf32LE) `eq` id -tl_utf32LE_c = (\ t -> - let E.DecodeResult t' mC bs _ = EL.decodeUtf32Chunks False mempty $ EL.encodeUtf32LE t in - (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) t_utf32BE = (E.decodeUtf32BE . E.encodeUtf32BE) `eq` id -t_utf32BE_c = (\ t -> - let E.DecodeResult t' mC bs _ = E.decodeUtf32Chunks True mempty $ E.encodeUtf32BE t in - (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) tl_utf32BE = (EL.decodeUtf32BE . EL.encodeUtf32BE) `eq` id -tl_utf32BE_c = (\ t -> - let E.DecodeResult t' mC bs _ = EL.decodeUtf32Chunks True mempty $ EL.encodeUtf32BE t in - (t', mC, bs)) `eq` (\ t -> (t, Nothing, mempty)) runBuilder :: B.Builder -> B.ByteString runBuilder = @@ -154,13 +124,6 @@ t_utf8_err bad de = forAll (Blind <$> genDecodeErr de) $ \(Blind onErr) -> ioPro length (show err) >= 0 Right _ -> counterexample (show (decoded, l)) $ de /= Strict -t_utf8_c_err :: InvalidUtf8 -> Property -t_utf8_c_err bad = - let E.DecodeResult t mW bs _ = E.decodeUtf8Chunks mempty $ toByteString bad in - case mW of - Just w -> counterexample (show w) True - _ -> counterexample (show t) $ B.length bs > 0 - t_utf8_err' :: B.ByteString -> Bool t_utf8_err' bs = case E.decodeUtf8' bs of Left err -> length (show err) >= 0 @@ -202,132 +165,11 @@ genInvalidUTF8 = B.pack <$> oneof [ ord3_ n = map fromIntegral [(n `shiftR` 12) + 0xE0, ((n `shiftR` 6) .&. 0x3F) + 0x80, (n .&. 0x3F) + 0x80] ord4_ n = map fromIntegral [(n `shiftR` 18) + 0xF0, ((n `shiftR` 12) .&. 0x3F) + 0x80, ((n `shiftR` 6) .&. 0x3F) + 0x80, (n .&. 0x3F) + 0x80] -t_prefix_decode_ascii_1 = - let g bs es is = - case es of - expected : es' -> whenEqProp (E.decodeAsciiPrefix bs) expected $ - case expected of - (_, Just (_, bs')) -> g bs' es' is - _ -> f (is, es') - _ -> counterexample ("More input than expected: " ++ show is) False - f s = case s of - (input : is, es) -> g input es is - (_, es@(_ : _)) -> counterexample ("More expected output than input: " ++ show es) False - _ -> counterexample "" True - in - f ( [ B.pack [0x68, 0x69, 0x2c, 0x20, 0x83, 0x68, 0x65, 0x6c, 0x6c] - , B.pack [0x6f, 0x2c, 0x20, 0x94, - 0x68, 0x6f, 0x77, 0x20, 0xcc, - 0x61, 0x72, 0x65, 0x20, 0x79, 0x61, 0x3f] - ] - , [ ("hi, ", Just (0x83, B.pack [0x68, 0x65, 0x6c, 0x6c])) - , ("hell", Nothing) - , ("o, ", Just (0x94, B.pack [0x68, 0x6f, 0x77, 0x20, 0xcc, - 0x61, 0x72, 0x65, 0x20, 0x79, 0x61, 0x3f])) - , ("how ", Just (0xcc, B.pack [0x61, 0x72, 0x65, 0x20, 0x79, 0x61, 0x3f])) - , ("are ya?", Nothing) - ] - ) - --- test multi-word code points split across bytestring chunks -chunksTests decodeF insExpectedOuts = - f mempty insExpectedOuts - where - f bs s = - case s of - (input, expected) : s' -> whenEqProp (decodeF bs input) expected $ - case expected of - E.DecodeResult _ _ bs' _ -> f bs' s' - _ -> counterexample "" True - -t_chunk_decode_utf8_1 = chunksTests E.decodeUtf8Chunks - [ ( B.pack [0x68, 0x69, 0x2C, 0x20, 0xC4, 0x89, 0x2C, 0x20, - 0xe2, 0x98, 0x83, 0x2C, 0x20, 0x61, 0x6e, 0x64, - 0x20, 0xF0, 0x90, 0x90, 0xB7, 0x21] - , E.DecodeResult "hi, ĉ, ☃, and \x10437!" Nothing mempty 22 - ) - ] -t_chunk_decode_utf8_2 = chunksTests E.decodeUtf8Chunks - [ ( B.pack [97, 0xC2, 97] - , E.DecodeResult (T.singleton 'a') (Just 0xC2) (B.singleton 97) 2 - ) - , ( mempty - , E.DecodeResult (T.singleton 'a') Nothing mempty 1 - ) - ] -t_chunk_decode_utf8_3 = chunksTests E.decodeUtf8Chunks - [ ( B.pack [104, 105, 32, 0xe2] - , E.DecodeResult "hi " Nothing (B.singleton 0xe2) 3 - ) - , ( B.singleton 0x98 - , E.DecodeResult "" Nothing (B.pack [0xe2, 0x98]) 0 - ) - , ( B.pack [0x83, 32, 0xFF] - , E.DecodeResult "☃ " (Just 0xFF) mempty 5 - ) - ] --- test multi-word code points split across bytestring chunks -t_chunk_decode_utf8_4 = chunksTests E.decodeUtf8Chunks - [ (B.pack [0x68, 0x69, 0x2C, 0x20, 0xC4], E.DecodeResult "hi, " Nothing (B.singleton 0xc4) 4) - , (B.pack [0x89, 0x2C, 0x20, 0xe2], E.DecodeResult "ĉ, " Nothing (B.singleton 0xe2) 4) - , (B.singleton 0x98, E.DecodeResult "" Nothing (B.pack [0xe2, 0x98]) 0) - , (B.pack [0x83, 0x2C, 0x20, 0x61, 0x6e, 0x64, 0x20, 0xF0], E.DecodeResult "☃, and " Nothing (B.singleton 0xF0) 9) - , (B.singleton 0x90, E.DecodeResult "" Nothing (B.pack [0xF0, 0x90]) 0) - , (B.singleton 0x90, E.DecodeResult "" Nothing (B.pack [0xF0, 0x90, 0x90]) 0) - , (B.pack [0xB7, 0x21], E.DecodeResult "\x10437!" Nothing mempty 5) - ] - -t_chunk_decode_utf16BE = chunksTests (E.decodeUtf16Chunks True) - [ ( B.pack [0] - , E.DecodeResult T.empty Nothing (B.pack [0]) 0 - ) - , ( B.pack [104, 0, 105, 0, 32, 0xD8, 0x01] - , E.DecodeResult "hi " Nothing (B.pack [0xD8, 0x01]) 6 - ) - , ( B.pack [0xDC, 0x37, 0, 32, 0xDC, 0] - , E.DecodeResult "\x10437 " (Just 0xDC00) mempty 8 - ) - ] -t_chunk_decode_utf16LE = chunksTests (E.decodeUtf16Chunks False) - [ ( B.pack [104] - , E.DecodeResult T.empty Nothing (B.pack [104]) 0 - ) - , ( B.pack [0, 105, 0, 32, 0, 0x01, 0xD8] - , E.DecodeResult "hi " Nothing (B.pack [0x01, 0xD8]) 6 - ) - , ( B.pack [0x37, 0xDC, 32, 0, 0, 0xDC] - , E.DecodeResult "\x10437 " (Just 0xDC) mempty 8 - ) - ] - -t_chunk_decode_utf32BE = chunksTests (E.decodeUtf32Chunks True) - [ ( B.pack [0, 0, 0, 104, 0, 0, 0, 105, 0, 0] - , E.DecodeResult "hi" Nothing (B.pack [0, 0]) 8 - ) - , ( B.pack [0, 32, 0, 0, 0x26] - , E.DecodeResult " " Nothing (B.pack [0, 0, 0x26]) 4 - ) - , ( B.pack [0x03, 0, 0, 0, 32, 0, 0, 0xD8, 0] - , E.DecodeResult "☃ " (Just 0xD800) mempty 12 - ) - ] -t_chunk_decode_utf32LE = chunksTests (E.decodeUtf32Chunks False) - [ ( B.pack [104, 0, 0, 0, 105, 0, 0, 0, 0x20, 0] - , E.DecodeResult "hi" Nothing (B.pack [0x20, 0]) 8 - ) - , ( B.pack [0, 0, 0x03, 0x26, 0] - , E.DecodeResult " " Nothing (B.pack [0x03, 0x26, 0]) 4 - ) - , ( B.pack [0, 32, 0, 0, 0, 0, 0xD8, 0, 0] - , E.DecodeResult "☃ " (Just 0xD80000) mempty 12 - ) - ] - decodeLL :: BL.ByteString -> TL.Text -decodeLL = EL.decodeUtf8With C.lenientDecode +decodeLL = EL.decodeUtf8With E.lenientDecode decodeL :: B.ByteString -> T.Text -decodeL = E.decodeUtf8With C.lenientDecode +decodeL = E.decodeUtf8With E.lenientDecode -- The lenient decoding of lazy bytestrings should not depend on how they are chunked, -- and it should behave the same as decoding of strict bytestrings. @@ -360,7 +202,7 @@ t_decode_with_error4' = t_decode_with_error5' = ioProperty $ do ret <- Exception.try $ Exception.evaluate $ E.streamDecodeUtf8 (B.pack [0x81]) pure $ case ret of - Left (_ :: C.UnicodeException) -> True + Left (_ :: E.UnicodeException) -> True Right{} -> False t_infix_concat bs1 text bs2 = @@ -377,28 +219,18 @@ testTranscoding = testProperty "tl_latin1" tl_latin1, testProperty "t_utf8" t_utf8, testProperty "t_utf8'" t_utf8', - testProperty "t_utf8_c" t_utf8_c, testProperty "t_utf8_incr" t_utf8_incr, testProperty "t_utf8_undecoded" t_utf8_undecoded, testProperty "tl_utf8" tl_utf8, testProperty "tl_utf8'" tl_utf8', - testProperty "tl_utf8_c" tl_utf8_c, testProperty "t_utf16LE" t_utf16LE, - testProperty "t_utf16LE_c" t_utf16LE_c, testProperty "tl_utf16LE" tl_utf16LE, - testProperty "tl_utf16LE_c" tl_utf16LE_c, testProperty "t_utf16BE" t_utf16BE, - testProperty "t_utf16BE_c" t_utf16BE_c, testProperty "tl_utf16BE" tl_utf16BE, - testProperty "tl_utf16BE_c" tl_utf16BE_c, testProperty "t_utf32LE" t_utf32LE, - testProperty "t_utf32LE_c" t_utf32LE_c, testProperty "tl_utf32LE" tl_utf32LE, - testProperty "tl_utf32LE_c" tl_utf32LE_c, testProperty "t_utf32BE" t_utf32BE, - testProperty "t_utf32BE_c" t_utf32BE_c, testProperty "tl_utf32BE" tl_utf32BE, - testProperty "tl_utf32BE_c" tl_utf32BE_c, testGroup "builder" [ testProperty "t_encodeUtf8Builder" t_encodeUtf8Builder, testProperty "t_encodeUtf8Builder_nonZeroOffset" t_encodeUtf8Builder_nonZeroOffset, @@ -408,19 +240,9 @@ testTranscoding = ], testGroup "errors" [ testProperty "t_utf8_err" t_utf8_err, - testProperty "t_utf8_c_err" t_utf8_c_err, testProperty "t_utf8_err'" t_utf8_err' ], testGroup "error recovery" [ - testProperty "t_chunk_decode_utf8_1" t_chunk_decode_utf8_1, - testProperty "t_chunk_decode_utf8_2" t_chunk_decode_utf8_2, - testProperty "t_chunk_decode_utf8_3" t_chunk_decode_utf8_3, - testProperty "t_chunk_decode_utf8_4" t_chunk_decode_utf8_4, - testProperty "t_prefix_decode_ascii_1" t_prefix_decode_ascii_1, - testProperty "t_chunk_decode_utf16BE" t_chunk_decode_utf16BE, - testProperty "t_chunk_decode_utf16LE" t_chunk_decode_utf16LE, - testProperty "t_chunk_decode_utf32BE" t_chunk_decode_utf32BE, - testProperty "t_chunk_decode_utf32LE" t_chunk_decode_utf32LE, testProperty "t_decode_utf8_lenient" t_decode_utf8_lenient, testProperty "t_decode_with_error2" t_decode_with_error2, testProperty "t_decode_with_error3" t_decode_with_error3, diff --git a/tests/Tests/QuickCheckUtils.hs b/tests/Tests/QuickCheckUtils.hs index a09b65b1..8f36f7ec 100644 --- a/tests/Tests/QuickCheckUtils.hs +++ b/tests/Tests/QuickCheckUtils.hs @@ -27,8 +27,6 @@ module Tests.QuickCheckUtils , eqPSqrt , write_read - - , whenEqProp ) where import Control.Arrow ((***)) @@ -44,7 +42,7 @@ import Tests.Utils import qualified Data.ByteString as B import qualified Data.ByteString.Lazy as BL import qualified Data.Text as T -import qualified Data.Text.Encoding.Common as T +import qualified Data.Text.Encoding.Error as T import qualified Data.Text.Internal.Fusion as TF import qualified Data.Text.Internal.Fusion.Common as TF import qualified Data.Text.Internal.Lazy as TL @@ -286,9 +284,3 @@ newtype SkewedBool = Skewed { getSkewed :: Bool } instance Arbitrary SkewedBool where arbitrary = Skewed <$> frequency [(1, pure False), (5, pure True)] - --- like Control.Monad.when, but with properties instead of monad values -whenEqProp :: (Eq a, Show a) => a -> a -> Property -> Property -whenEqProp a b next = if a == b - then next - else a === b diff --git a/tests/Tests/Regressions.hs b/tests/Tests/Regressions.hs index 7828bc75..cf011680 100644 --- a/tests/Tests/Regressions.hs +++ b/tests/Tests/Regressions.hs @@ -21,7 +21,7 @@ import qualified Data.ByteString.Lazy as LB import qualified Data.Text as T import qualified Data.Text.Array as TA import qualified Data.Text.Encoding as TE -import qualified Data.Text.Encoding.Common as E +import qualified Data.Text.Encoding.Error as E import qualified Data.Text.Internal as T import qualified Data.Text.IO as T import qualified Data.Text.Lazy as LT diff --git a/text.cabal b/text.cabal index cabb1582..488f1451 100644 --- a/text.cabal +++ b/text.cabal @@ -137,7 +137,6 @@ library Data.Text.Array Data.Text.Encoding Data.Text.Encoding.Error - Data.Text.Encoding.Common Data.Text.Foreign Data.Text.IO Data.Text.Internal From 423ef2baac797bb737b99c320c0c7b70018f8bba Mon Sep 17 00:00:00 2001 From: david-sledge Date: Sun, 2 Oct 2022 10:10:54 -0600 Subject: [PATCH 43/87] Update changelog --- changelog.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/changelog.md b/changelog.md index 2f529fc6..1097f8af 100644 --- a/changelog.md +++ b/changelog.md @@ -1,12 +1,13 @@ ### 2.0.2 * A new suite of total decoders have been added in `Data.Text.Encoding` - and `Data.Text.Lazy.Encoding` that allow decoding to be aborted on - errors without the need to raise an `error` and `catch` it elsewhere: + that allow decoding to be aborted on errors without the need to raise + an `error` and `catch` it elsewhere: * `decodeAsciiPrefix` - * `decodeUtf8Chunks` - * `decodeUtf16Chunks` - * `decodeUtf32Chunks` + * `decodeUtf8Chunk` + * `decodeNextUtf8Chunk` + * `recoverFromUtf8Error` + * `outAvailableUtf8Text` ### 2.0.1 From d2aa0312739ec1284621730b417ffd3a522c4e80 Mon Sep 17 00:00:00 2001 From: david-sledge Date: Wed, 5 Oct 2022 19:46:02 -0600 Subject: [PATCH 44/87] When possible copy whole bytestring at once regardless of code point boundaries --- changelog.md | 6 +- src/Data/Text/Encoding.hs | 150 +++++++++++++++--------- src/Data/Text/Internal/Encoding/Utf8.hs | 4 +- tests/Tests/Properties/Transcoding.hs | 4 + 4 files changed, 101 insertions(+), 63 deletions(-) diff --git a/changelog.md b/changelog.md index 1097f8af..c2c72420 100644 --- a/changelog.md +++ b/changelog.md @@ -1,8 +1,8 @@ ### 2.0.2 -* A new suite of total decoders have been added in `Data.Text.Encoding` - that allow decoding to be aborted on errors without the need to raise - an `error` and `catch` it elsewhere: +* A suite of functions have been added in `Data.Text.Encoding` that + allow decoding to be aborted on errors without the need to raise an + `error` and `catch` it elsewhere: * `decodeAsciiPrefix` * `decodeUtf8Chunk` * `decodeNextUtf8Chunk` diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index db0ef035..5fe6b98a 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -80,6 +80,7 @@ module Data.Text.Encoding import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) import Control.Exception (evaluate, try) +import Control.Monad (when) import Control.Monad.ST (runST) import Data.Bits (shiftR, (.&.)) import Data.ByteString (ByteString) @@ -226,10 +227,24 @@ foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8 -- -- @since 2.0.2 data Utf8DecodeState = Utf8DecodeState - (Maybe (Utf8CodePointState, Int)) + -- Code point decode state or error + (Maybe + -- self-explanatory (I hope) + ( Utf8CodePointState + -- Count of Word8s that have been evaluated so far for this code point. + -- The first word is specified by Position indicator below + , Int + )) + -- ByteStrings containing data whose evaluations are unfinished [ByteString] + -- Postion within the lead ByteString of either unfinished evaluated data or + -- the first word of an invalid code point. Int - [Either (Text, Int) (ByteString, Int, Int, Int)] + -- the first non-listed Word8 in the lead ByteString. + Int + -- Queued text data. + [Either Text (ByteString, Int, Int)] + -- Word8 length of listed data. Int deriving (Show) @@ -237,7 +252,7 @@ data Utf8DecodeState = Utf8DecodeState -- -- @since 2.0.2 startUtf8State :: Utf8DecodeState -startUtf8State = Utf8DecodeState (Just (utf8StartState, 0)) [] 0 [] 0 +startUtf8State = Utf8DecodeState (Just (utf8StartState, 0)) [] 0 0 [] 0 -- | Takes whatever data has been decoded thus far and spits it out as a `Text` -- value and a `Utf8DecodeState` value that no longer references the decoded @@ -246,24 +261,36 @@ startUtf8State = Utf8DecodeState (Just (utf8StartState, 0)) [] 0 [] 0 -- -- @since 2.0.2 outAvailableUtf8Text :: Utf8DecodeState -> (Text, Utf8DecodeState) -outAvailableUtf8Text (Utf8DecodeState mCpSt bss bs1Off dataStack tLen) = - if tLen > 0 +outAvailableUtf8Text st@(Utf8DecodeState mCpSt bss bs1Off tbpPos dataStack tLen) = + let tbpLen = bs1Off - tbpPos + totalLen = tLen + tbpLen + in + if totalLen > 0 then runST $ do - dst <- A.new tLen - mapM_ (\ dat -> - case dat of - Left ((Text arr0 off utf8Len), dstOff) -> A.copyI utf8Len dst dstOff arr0 off - Right (bs, bsOff, utf8Len, dstOff) -> - withBS bs $ \ fp _ -> - unsafeIOToST . unsafeWithForeignPtr fp $ \ src -> - unsafeSTToIO $ A.copyFromPointer dst dstOff (src `plusPtr` bsOff) utf8Len - ) dataStack + dst <- A.new totalLen + when (tbpLen > 0) . withBS (head bss) $ \ fp _ -> + unsafeIOToST . unsafeWithForeignPtr fp $ \ src -> + unsafeSTToIO $ A.copyFromPointer dst tLen (src `plusPtr` tbpPos) tbpLen + let g (dat : dataStack') tLen' = + (case dat of + Left (Text arr0 off utf8Len) -> do + let dstOff = tLen' - utf8Len + A.copyI utf8Len dst dstOff arr0 off + pure dstOff + Right (bs, bsOff, utf8Len) -> do + let dstOff = tLen' - utf8Len + withBS bs $ \ fp _ -> + unsafeIOToST . unsafeWithForeignPtr fp $ \ src -> + unsafeSTToIO $ A.copyFromPointer dst dstOff (src `plusPtr` bsOff) utf8Len + pure dstOff) >>= g dataStack' + g _ _ = pure () + g dataStack tLen arr <- A.unsafeFreeze dst - pure (Text arr 0 tLen, Utf8DecodeState mCpSt bss bs1Off [] 0) - else (empty, Utf8DecodeState mCpSt bss bs1Off dataStack tLen) + pure (Text arr 0 totalLen, Utf8DecodeState mCpSt bss bs1Off bs1Off [] 0) + else (empty, st) getCodePointStateOrError :: Utf8DecodeState -> Either Int Int -getCodePointStateOrError (Utf8DecodeState mCpSt (bs1@(B.length -> len1) : bss') bs1Off _ _) = +getCodePointStateOrError (Utf8DecodeState mCpSt (bs1@(B.length -> len1) : bss') bs1Off _ _ _) = case mCpSt of Nothing -> let (lenInit, _) = foldr @@ -275,13 +302,14 @@ getCodePointStateOrError (Utf8DecodeState mCpSt (bs1@(B.length -> len1) : bss') getCodePointStateOrError _ = Right 0 decodeUtf8Chunks :: Utf8DecodeState -> (Either Int Int, Utf8DecodeState) -decodeUtf8Chunks st@(Utf8DecodeState _ [] _ _ _) = (getCodePointStateOrError st, st) -decodeUtf8Chunks st@(Utf8DecodeState Nothing _ _ _ _) = (getCodePointStateOrError st, st) +decodeUtf8Chunks st@(Utf8DecodeState _ [] _ _ _ _) = (getCodePointStateOrError st, st) +decodeUtf8Chunks st@(Utf8DecodeState Nothing _ _ _ _ _) = (getCodePointStateOrError st, st) decodeUtf8Chunks st@(Utf8DecodeState (Just (cpSt, cpPos)) bss@(bs1@(B.length -> len1) : bss') bs1Off + tbpPos dataStack tLen ) @@ -300,7 +328,7 @@ bs1Off = 0 cpPos = Δbs1Off ^---------------------^ len --} - let len1_ = len1 - bs1Off -- the length of the trailing portion of the first bytesting that's to be evaluated. + let len1_ = len1 - bs1Off -- the length of the trailing portion of the first bytestring that's to be evaluated. (lenInit, (lenN, bsN)) = foldr (\ bs@(B.length -> len') (lenInit', (lenN', _)) -> (lenInit' + lenN', (len', bs))) (0, (len1_, bs1)) bss' @@ -333,21 +361,22 @@ bs1Off = 0 cpPos = Δbs1Off w n test = len >= n && test (index $ len - n) wc n mask word8 = w n $ (word8 ==) . (mask .&.) wi n word8 = w n (>= word8) - -- queue the available valid data, trim the fat, and leave a spot for the code point state/error. - stackValidUtf8 wordCount mCps = + -- push the available valid data on to the list, and remove completely evaulated bytestrings. + pushValidUtf8 wordCount mCps = let bs1Off' = bs1Off + wordCount - (bs1Off''', bss'''', dataStack'') = + (bss'''', bs1Off''', tbpPos'', dataStack'', tLen'') = if wordCount < len1_ - then (bs1Off', bss, Right (bs1, bs1Off, wordCount, tLen) : dataStack) + then (bss, bs1Off', tbpPos, dataStack, tLen) else - let stackValidUtf8' wordCount' bs1Off'' tLen' bss''@(bs'@(B.length -> len') : bss''') dataStack' = + let pushValidUtf8' wordCount' bss''@(bs'@(B.length -> len') : bss''') bs1Off'' tbpPos' dataStack' tLen' = if wordCount' < len' - then (bs1Off'', bss'', Right (bs', 0, wordCount', tLen') : dataStack') - else stackValidUtf8' (wordCount' - len') (bs1Off'' - len') (tLen' + len') bss''' (Right (bs', 0, len', tLen') : dataStack') - stackValidUtf8' _ _ _ _ dataStack' = (0, [], dataStack') + then (bss'', bs1Off'', tbpPos', dataStack', tLen') + else pushValidUtf8' (wordCount' - len') bss''' (bs1Off'' - len') 0 (Right (bs', 0, len') : dataStack') (tLen' + len') + pushValidUtf8' _ _ bs1Off'' tbpPos' dataStack' tLen' = ([], bs1Off'', tbpPos', dataStack', tLen') + bs1WordCount = bs1Off - tbpPos + len1_ in - stackValidUtf8' (wordCount - len1_) (bs1Off' - len1) (tLen + len1_) bss' (Right (bs1, bs1Off, len1_, tLen) : dataStack) - st' = Utf8DecodeState mCps bss'''' bs1Off''' dataStack'' $ tLen + wordCount + pushValidUtf8' (wordCount - len1_) bss' (bs1Off' - len1) 0 (Right (bs1, tbpPos, bs1WordCount) : dataStack) (tLen + bs1WordCount) + st' = Utf8DecodeState mCps bss'''' bs1Off''' tbpPos'' dataStack'' tLen'' in (getCodePointStateOrError st', st') huntDownError off ndx cps = @@ -361,8 +390,8 @@ bs1Off = 0 cpPos = Δbs1Off then ndx' else off ) ndx' cps' - Nothing -> stackValidUtf8 off Nothing - else stackValidUtf8 off $ Just (cps, ndx - off) + Nothing -> pushValidUtf8 off Nothing + else pushValidUtf8 off $ Just (cps, ndx - off) in -- did we find the boundary? case guessUtf8Boundary of @@ -382,7 +411,7 @@ bs1Off = 0 cpPos = Δbs1Off Just cpSt''' -> getEndState (ndx + 1) cpSt''' | otherwise = Just (cpSt'', ndx - boundary) soFarSoGood = - stackValidUtf8 boundary $ getEndState boundary cpSt' + pushValidUtf8 boundary $ getEndState boundary cpSt' in -- are we at the boundary? if boundary == cpPos' @@ -397,7 +426,7 @@ bs1Off = 0 cpPos = Δbs1Off -- keep walking the data until we get to bsN or an error case updateUtf8State (index cpPos') cpSt' of Just cpSt'' -> checkIncompleteCodePoint cpSt'' (cpPos' + 1) - Nothing -> (Left (-lenInit), Utf8DecodeState Nothing bss bs1Off dataStack tLen) + Nothing -> (Left (-lenInit), Utf8DecodeState Nothing bss bs1Off tbpPos dataStack tLen) -- no: we're in bsN else let off = (if lenInit > 0 @@ -436,10 +465,10 @@ bs1Off = 0 cpPos = Δbs1Off case updateUtf8State (index cpPos') cpSt' of Just cpSt'' -> checkIncompleteCodePoint cpSt'' (cpPos' + 1) -- just enough additional data to find an error with the code point - Nothing -> (Left (-lenInit), Utf8DecodeState Nothing bss bs1Off dataStack tLen) + Nothing -> (Left (-lenInit), Utf8DecodeState Nothing bss bs1Off tbpPos dataStack tLen) else -- didn't get enough additional data to complete the code point - (Right cpPos', Utf8DecodeState (Just (cpSt', cpPos')) bss bs1Off dataStack tLen) + (Right cpPos', Utf8DecodeState (Just (cpSt', cpPos')) bss bs1Off tbpPos dataStack tLen) in checkIncompleteCodePoint cpSt cpPos -- no, we're past the boundary @@ -456,7 +485,7 @@ bs1Off = 0 cpPos = Δbs1Off ( case mCpStLen of Nothing -> Left (-lenInit) Just _ -> Right len - , Utf8DecodeState mCpStLen bss bs1Off dataStack tLen + , Utf8DecodeState mCpStLen bss bs1Off tbpPos dataStack tLen ) -- no: there's an error Nothing -> huntDownError 0 cpPos cpSt @@ -484,10 +513,10 @@ decodeNextUtf8Chunk :: HasCallStack => #endif ByteString -> Utf8DecodeState -> (Either Int Int, Utf8DecodeState) -decodeNextUtf8Chunk _ st@(Utf8DecodeState Nothing _ _ _ _) = (getCodePointStateOrError st, st) -decodeNextUtf8Chunk bs@(B.length -> len) st@(Utf8DecodeState mCpSt bss bs1Off dataStack tLen) +decodeNextUtf8Chunk _ st@(Utf8DecodeState Nothing _ _ _ _ _) = (getCodePointStateOrError st, st) +decodeNextUtf8Chunk bs@(B.length -> len) st@(Utf8DecodeState mCpSt bss bs1Off tbpPos dataStack tLen) | len == 0 = (getCodePointStateOrError st, st) - | otherwise = decodeUtf8Chunks $ Utf8DecodeState mCpSt (bss ++ [bs]) bs1Off dataStack tLen + | otherwise = decodeUtf8Chunks $ Utf8DecodeState mCpSt (bss ++ [bs]) bs1Off tbpPos dataStack tLen -- | Decodes a 'ByteString' from a clean state. -- @@ -511,29 +540,36 @@ recoverFromUtf8Error :: HasCallStack => #endif Text -> Utf8DecodeState -> (Either Int Int, Utf8DecodeState) -recoverFromUtf8Error t@(Text _ _ utf8Len) (Utf8DecodeState mCpSt bss@((B.length -> len) : bss') bs1Off dataStack tLen) = - let dammit mCpSt' bss'' bs1Off' = decodeUtf8Chunks . Utf8DecodeState mCpSt' bss'' bs1Off' - ( if utf8Len > 0 - then Left (t, tLen) : dataStack - else dataStack - ) $ tLen + utf8Len +recoverFromUtf8Error t@(Text _ _ utf8Len) (Utf8DecodeState mCpSt bss@(bs1@(B.length -> len) : bss') bs1Off tbpPos dataStack tLen) = + let tbpLen = bs1Off - tbpPos + (dataStack', tLen') = + if tbpLen > 0 + then (Right (bs1, tbpPos, tbpLen) : dataStack, tLen + tbpLen) + else (dataStack, tLen) + (dataStack'', tLen'') = + if utf8Len > 0 + then (Left t : dataStack', tLen' + utf8Len) + else (dataStack', tLen') + (mCpSt'', bss'', bs1Off'') = + case mCpSt of + Just _ -> (mCpSt, bss, bs1Off) + Nothing -> + let bs1Off' = bs1Off + 1 + mCpSt' = Just (utf8StartState, 0) + in + if bs1Off' == len + then (mCpSt', bss', 0) + else (mCpSt', bss, bs1Off') in - case mCpSt of - Nothing -> - let g = dammit (Just (utf8StartState, 0)) - bs1Off' = bs1Off + 1 - in - if bs1Off' == len - then g bss' 0 - else g bss bs1Off' - Just _ -> dammit mCpSt bss bs1Off -recoverFromUtf8Error t@(Text _ _ utf8Len) (Utf8DecodeState _ _ _ dataStack tLen) = + decodeUtf8Chunks $ Utf8DecodeState mCpSt'' bss'' bs1Off'' bs1Off'' dataStack'' tLen'' +recoverFromUtf8Error t@(Text _ _ utf8Len) (Utf8DecodeState _ _ _ _ dataStack tLen) = decodeUtf8Chunks $ Utf8DecodeState (Just (utf8StartState, 0)) [] 0 + 0 (if utf8Len > 0 - then Left (t, tLen) : dataStack + then Left t : dataStack else dataStack ) $ tLen + utf8Len diff --git a/src/Data/Text/Internal/Encoding/Utf8.hs b/src/Data/Text/Internal/Encoding/Utf8.hs index 70f518df..5d4aaebd 100644 --- a/src/Data/Text/Internal/Encoding/Utf8.hs +++ b/src/Data/Text/Internal/Encoding/Utf8.hs @@ -263,6 +263,4 @@ updateUtf8State w st = case transitionUtf8State (byteToClass w) st of st' -> Just st' isUtf8StateIsComplete :: Utf8CodePointState -> Bool -isUtf8StateIsComplete (Utf8CodePointState s) - | s == 0 = True - | otherwise = False +isUtf8StateIsComplete (Utf8CodePointState s) = s == 0 diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs index 0fabf62e..81047d8a 100644 --- a/tests/Tests/Properties/Transcoding.hs +++ b/tests/Tests/Properties/Transcoding.hs @@ -205,6 +205,10 @@ t_decode_with_error5' = ioProperty $ do Left (_ :: E.UnicodeException) -> True Right{} -> False +whenEqProp a b next = if a == b + then next + else a === b + t_infix_concat bs1 text bs2 = forAll (Blind <$> genDecodeErr Replace) $ \(Blind onErr) -> text `T.isInfixOf` From ac6a31cfc0afa047e9ba61d23da42b5ee4073a59 Mon Sep 17 00:00:00 2001 From: david-sledge Date: Wed, 5 Oct 2022 22:56:43 -0600 Subject: [PATCH 45/87] More test cases --- tests/Tests/Properties/Transcoding.hs | 21 ++++++++++++++++++--- tests/Tests/QuickCheckUtils.hs | 7 +++++++ 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs index 81047d8a..99e6618b 100644 --- a/tests/Tests/Properties/Transcoding.hs +++ b/tests/Tests/Properties/Transcoding.hs @@ -35,6 +35,7 @@ tl_ascii t = EL.decodeASCII (EL.encodeUtf8 a) === a t_latin1 = E.decodeLatin1 `eq` (T.pack . BC.unpack) tl_latin1 = EL.decodeLatin1 `eq` (TL.pack . BLC.unpack) +t_utf8_c = (fst . E.outAvailableUtf8Text . snd . E.decodeUtf8Chunk . E.encodeUtf8) `eq` id t_utf8 = (E.decodeUtf8 . E.encodeUtf8) `eq` id t_utf8' = (E.decodeUtf8' . E.encodeUtf8) `eq` (id . Right) tl_utf8 = (EL.decodeUtf8 . EL.encodeUtf8) `eq` id @@ -205,9 +206,21 @@ t_decode_with_error5' = ioProperty $ do Left (_ :: E.UnicodeException) -> True Right{} -> False -whenEqProp a b next = if a == b - then next - else a === b +t_decode_chunk = + case E.decodeUtf8Chunk $ B.pack [0xC2, 97] of + (result, st) -> whenEqProp result (Left 0) $ + case E.decodeNextUtf8Chunk (B.pack [0x63, 0x63, 0x63, 0x63, 0x63, 0x63]) st of + (result', st') -> whenEqProp result' result $ + case E.recoverFromUtf8Error "bbbb" st' of + (result'', st'') -> whenEqProp result'' (Right 0) $ + case E.decodeNextUtf8Chunk (B.pack [0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0xc2]) st'' of + (result''', st''') -> whenEqProp result''' (Right 1) $ + case E.outAvailableUtf8Text st''' of + (t, st'''') -> whenEqProp t "bbbbadddddd" $ + case E.decodeNextUtf8Chunk (B.singleton 0x80) st'''' of + (result'''', st''''') -> whenEqProp result'''' (Right 0) $ + case E.outAvailableUtf8Text st''''' of + (t', _) -> t' === "\x80" t_infix_concat bs1 text bs2 = forAll (Blind <$> genDecodeErr Replace) $ \(Blind onErr) -> @@ -221,6 +234,7 @@ testTranscoding = testProperty "tl_ascii" tl_ascii, testProperty "t_latin1" t_latin1, testProperty "tl_latin1" tl_latin1, + testProperty "t_utf8_c" t_utf8_c, testProperty "t_utf8" t_utf8, testProperty "t_utf8'" t_utf8', testProperty "t_utf8_incr" t_utf8_incr, @@ -255,6 +269,7 @@ testTranscoding = testProperty "t_decode_with_error3'" t_decode_with_error3', testProperty "t_decode_with_error4'" t_decode_with_error4', testProperty "t_decode_with_error5'" t_decode_with_error5', + testProperty "t_decode_chunk" t_decode_chunk, testProperty "t_infix_concat" t_infix_concat ] ] diff --git a/tests/Tests/QuickCheckUtils.hs b/tests/Tests/QuickCheckUtils.hs index 8f36f7ec..9834e92c 100644 --- a/tests/Tests/QuickCheckUtils.hs +++ b/tests/Tests/QuickCheckUtils.hs @@ -27,6 +27,7 @@ module Tests.QuickCheckUtils , eqPSqrt , write_read + , whenEqProp ) where import Control.Arrow ((***)) @@ -284,3 +285,9 @@ newtype SkewedBool = Skewed { getSkewed :: Bool } instance Arbitrary SkewedBool where arbitrary = Skewed <$> frequency [(1, pure False), (5, pure True)] + +-- like 'when' but with 'Property' instead of a monad. +whenEqProp :: (Eq a, Show a) => a -> a -> Property -> Property +whenEqProp a b next = if a == b + then next + else a === b From 340934e7056393b86f2391360f6571ed818d6478 Mon Sep 17 00:00:00 2001 From: david-sledge Date: Mon, 10 Oct 2022 22:01:19 -0600 Subject: [PATCH 46/87] start of decomposition of decodeUtf8Chunks --- src/Data/Text/Encoding.hs | 130 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 125 insertions(+), 5 deletions(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index 5fe6b98a..d09f47cc 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -217,11 +217,6 @@ decodeAsciiPrefix bs = if B.null bs else Nothing pure (prefix, suffix) -#ifdef SIMDUTF -foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8 - :: Ptr Word8 -> CSize -> IO CInt -#endif - -- | A value that represents the state of a UTF-8 decoding process potentionally -- across multiple 'ByteString's. -- @@ -301,6 +296,131 @@ getCodePointStateOrError (Utf8DecodeState mCpSt (bs1@(B.length -> len1) : bss') Just (_, cpLen) -> Right cpLen getCodePointStateOrError _ = Right 0 +#ifdef SIMDUTF +foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8 + :: Ptr Word8 -> CSize -> IO CInt +#endif + +{- +parseUtf8ChunkFrom chunk from = (n, ms) + +n is the end index of the longest prefix of chunk that is valid UTF-8 starting +from index 'from' where the length of the prefix = n - from + +ms +* When ms = Nothing, there is an error: the bytes from index n and beyond are + not part of a valid UTF-8 code point. +* When ms = Just s, all of the remaining bytes from index n and beyond are the + beginning of an incomplete UTF-8 code point, and s is the corresponding + intermediate decoding state, which can be used to parse the next chunk with + `parseUtf8Chunk` +-} +parseUtf8ChunkFrom :: ByteString -> Int -> (Int, Maybe Utf8CodePointState) +parseUtf8ChunkFrom bs@(B.length -> len) pos = + let w n test = (len - pos) >= n && test (B.index bs $ len - n) + wi n word8 = w n (>= word8) + wc n mask word8 = w n $ (word8 ==) . (mask .&.) + guessUtf8Boundary + | wi 3 0xf0 = Just $ len - 3 -- third to last char starts a four-byte code point + | wi 2 0xe0 = Just $ len - 2 -- pre-last char starts a three-or-four-byte code point + | wi 1 0xc2 = Just $ len - 1 -- last char starts a two-(or more-)byte code point + | wc 4 0xf8 0xf0 || -- last four bytes are a four-byte code point + wc 3 0xf0 0xe0 || -- last three bytes are a three-byte code point + wc 2 0xe0 0xc0 || -- last two bytes are a two-byte code point + w 1 (< 0x80) = Just len -- last char is ASCII + | otherwise = Nothing -- no clue + huntDownError ndx0 ndx s = + -- if ndx < len + -- then + case updateUtf8State (B.index bs ndx) s of + Just s' -> + let ndx' = ndx + 1 in + huntDownError ( + if isUtf8StateIsComplete s' + then ndx' + else ndx0 + ) ndx' s' + Nothing -> (ndx0, Nothing) + -- else (ndx0, Just s) + in + case guessUtf8Boundary of + Just boundary -> + let getEndState ndx s + | ndx < len = + case updateUtf8State (B.index bs ndx) s of + Nothing -> Nothing + Just s' -> getEndState (ndx + 1) s' + | otherwise = Just s + in + if pos < boundary + then + -- is the rest of the bytestring valid utf-8 up to the boundary? + if ( +#ifdef SIMDUTF + withBS bs $ \ fp _ -> unsafeDupablePerformIO $ + unsafeWithForeignPtr fp $ \ptr -> (/= 0) <$> + c_is_valid_utf8 (plusPtr ptr pos) (fromIntegral $ boundary - pos) +#elif MIN_VERSION_bytestring(0,11,2) + B.isValidUtf8 . B.take (boundary - pos) $ B.drop pos bs +#else + let bLen = boundary - pos + step ndx s + | ndx < pos + bLen = + case updateUtf8State (B.unsafeIndex bs ndx) s of + Just s' -> step (ndx + 1) s' + Nothing -> False + | otherwise = isUtf8StateIsComplete s + in + step pos utf8StartState +#endif + ) + -- Yes + then (pos, getEndState pos utf8StartState) + -- No + else huntDownError pos pos utf8StartState + else (pos, getEndState pos utf8StartState) + Nothing -> huntDownError pos pos utf8StartState + +{- +parseUtf8Chunk chunk s = (n, ms) + +n +* When n >= 0, n is the length of the longest prefix of chunk that is valid + UTF-8 starting from state s (i.e., n points after the end of a full codepoint, + to the beginning of an incomplete codepoint). +* When n > 0, the starting code point from the previous input is either still + incomplete with the additonal chunk or in error. + +ms +* When ms = Nothing, there is an error: the bytes from index n and beyond are + not part of a valid UTF-8 code point. +* When ms = Just s', all of the remaining bytes from index n and beyond are the + beginning of an incomplete UTF-8 code point, and s' is the corresponding + intermediate decoding state, which can be used to parse the next chunk with + `parseUtf8Chunk` +-} +parseUtf8Chunk :: ByteString -> Utf8CodePointState -> (Int, Maybe Utf8CodePointState) +parseUtf8Chunk bs@(B.length -> len) s = + let g pos s' = + -- first things first. let's try to get to the start of the next code point + if isUtf8StateIsComplete s' + -- found the beginning of the next code point, hand this off to someone else + then parseUtf8ChunkFrom bs pos + -- no, code point is not complete yet + else + -- walk the rest of the code point until error, complete, or no more data + if pos < len + then + case updateUtf8State (B.index bs pos) s' of + -- error + Nothing -> (-1, Nothing) + -- keep going + Just s'' -> g (pos + 1) s'' + -- no more data + else (-1, Just s') + in + g 0 s + decodeUtf8Chunks :: Utf8DecodeState -> (Either Int Int, Utf8DecodeState) decodeUtf8Chunks st@(Utf8DecodeState _ [] _ _ _ _) = (getCodePointStateOrError st, st) decodeUtf8Chunks st@(Utf8DecodeState Nothing _ _ _ _ _) = (getCodePointStateOrError st, st) From 05d898eee7f3dc24833c101281bd7e73e2a88f5f Mon Sep 17 00:00:00 2001 From: david-sledge Date: Tue, 11 Oct 2022 21:44:21 -0600 Subject: [PATCH 47/87] little refactorin' on the prototype. --- src/Data/Text/Encoding.hs | 108 +++++++++++++++----------------------- 1 file changed, 42 insertions(+), 66 deletions(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index d09f47cc..a4db6d29 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -302,10 +302,9 @@ foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8 #endif {- -parseUtf8ChunkFrom chunk from = (n, ms) +parseUtf8Chunk chunk = (n, ms) -n is the end index of the longest prefix of chunk that is valid UTF-8 starting -from index 'from' where the length of the prefix = n - from +n is the end index of the longest prefix of chunk that is valid UTF-8 ms * When ms = Nothing, there is an error: the bytes from index n and beyond are @@ -313,76 +312,52 @@ ms * When ms = Just s, all of the remaining bytes from index n and beyond are the beginning of an incomplete UTF-8 code point, and s is the corresponding intermediate decoding state, which can be used to parse the next chunk with - `parseUtf8Chunk` + `parseUtf8NextChunk` -} -parseUtf8ChunkFrom :: ByteString -> Int -> (Int, Maybe Utf8CodePointState) -parseUtf8ChunkFrom bs@(B.length -> len) pos = - let w n test = (len - pos) >= n && test (B.index bs $ len - n) - wi n word8 = w n (>= word8) - wc n mask word8 = w n $ (word8 ==) . (mask .&.) +parseUtf8Chunk :: ByteString -> (Int, Maybe Utf8CodePointState) +parseUtf8Chunk bs@(B.length -> len) +#if defined(SIMDUTF) || MIN_VERSION_bytestring(0,11,2) + | guessUtf8Boundary > 0 && + -- the rest of the bytestring valid utf-8 up to the boundary + ( +#ifdef SIMDUTF + withBS bs $ \ fp _ -> unsafeDupablePerformIO $ + unsafeWithForeignPtr fp $ \ptr -> (/= 0) <$> + c_is_valid_utf8 ptr (fromIntegral guessUtf8Boundary) +#else + B.isValidUtf8 $ B.take guessUtf8Boundary bs +#endif + ) = getEndState guessUtf8Boundary + -- No + | otherwise = getEndState 0 + where + getEndState ndx = parseUtf8 ndx ndx utf8StartState + w n word8 = len >= n && word8 <= (B.index bs $ len - n) guessUtf8Boundary - | wi 3 0xf0 = Just $ len - 3 -- third to last char starts a four-byte code point - | wi 2 0xe0 = Just $ len - 2 -- pre-last char starts a three-or-four-byte code point - | wi 1 0xc2 = Just $ len - 1 -- last char starts a two-(or more-)byte code point - | wc 4 0xf8 0xf0 || -- last four bytes are a four-byte code point - wc 3 0xf0 0xe0 || -- last three bytes are a three-byte code point - wc 2 0xe0 0xc0 || -- last two bytes are a two-byte code point - w 1 (< 0x80) = Just len -- last char is ASCII - | otherwise = Nothing -- no clue - huntDownError ndx0 ndx s = - -- if ndx < len - -- then + | w 3 0xf0 = len - 3 -- third to last char starts a four-byte code point + | w 2 0xe0 = len - 2 -- pre-last char starts a three-or-four-byte code point + | w 1 0xc2 = len - 1 -- last char starts a two-(or more-)byte code point + | otherwise = len +#else + = parseUtf8 0 0 utf8StartState + where +#endif + parseUtf8 ndx0 ndx s = + if ndx < len + then case updateUtf8State (B.index bs ndx) s of Just s' -> let ndx' = ndx + 1 in - huntDownError ( + parseUtf8 ( if isUtf8StateIsComplete s' then ndx' else ndx0 ) ndx' s' Nothing -> (ndx0, Nothing) - -- else (ndx0, Just s) - in - case guessUtf8Boundary of - Just boundary -> - let getEndState ndx s - | ndx < len = - case updateUtf8State (B.index bs ndx) s of - Nothing -> Nothing - Just s' -> getEndState (ndx + 1) s' - | otherwise = Just s - in - if pos < boundary - then - -- is the rest of the bytestring valid utf-8 up to the boundary? - if ( -#ifdef SIMDUTF - withBS bs $ \ fp _ -> unsafeDupablePerformIO $ - unsafeWithForeignPtr fp $ \ptr -> (/= 0) <$> - c_is_valid_utf8 (plusPtr ptr pos) (fromIntegral $ boundary - pos) -#elif MIN_VERSION_bytestring(0,11,2) - B.isValidUtf8 . B.take (boundary - pos) $ B.drop pos bs -#else - let bLen = boundary - pos - step ndx s - | ndx < pos + bLen = - case updateUtf8State (B.unsafeIndex bs ndx) s of - Just s' -> step (ndx + 1) s' - Nothing -> False - | otherwise = isUtf8StateIsComplete s - in - step pos utf8StartState -#endif - ) - -- Yes - then (pos, getEndState pos utf8StartState) - -- No - else huntDownError pos pos utf8StartState - else (pos, getEndState pos utf8StartState) - Nothing -> huntDownError pos pos utf8StartState - + else (ndx0, Just s) + {- -parseUtf8Chunk chunk s = (n, ms) +parseUtf8NextChunk chunk s = (n, ms) n * When n >= 0, n is the length of the longest prefix of chunk that is valid @@ -397,15 +372,16 @@ ms * When ms = Just s', all of the remaining bytes from index n and beyond are the beginning of an incomplete UTF-8 code point, and s' is the corresponding intermediate decoding state, which can be used to parse the next chunk with - `parseUtf8Chunk` + `parseUtf8NextChunk` -} -parseUtf8Chunk :: ByteString -> Utf8CodePointState -> (Int, Maybe Utf8CodePointState) -parseUtf8Chunk bs@(B.length -> len) s = +parseUtf8NextChunk :: ByteString -> Utf8CodePointState -> (Int, Maybe Utf8CodePointState) +parseUtf8NextChunk bs@(B.length -> len) s = let g pos s' = -- first things first. let's try to get to the start of the next code point if isUtf8StateIsComplete s' -- found the beginning of the next code point, hand this off to someone else - then parseUtf8ChunkFrom bs pos + then case parseUtf8Chunk $ B.drop pos bs of + (len', s'') -> (pos + len', s'') -- no, code point is not complete yet else -- walk the rest of the code point until error, complete, or no more data From 344c5c1db1b063537c9ea50171fe0e57855ce618 Mon Sep 17 00:00:00 2001 From: david-sledge Date: Sun, 16 Oct 2022 19:10:21 -0600 Subject: [PATCH 48/87] parseUtf8Chunk adjustments --- src/Data/Text/Encoding.hs | 219 ++++++++++++++------------ tests/Tests/Properties/Transcoding.hs | 70 ++++++++ 2 files changed, 184 insertions(+), 105 deletions(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index a4db6d29..cf2b9f50 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -22,13 +22,15 @@ -- . module Data.Text.Encoding - ( + ( Utf8ParseState + , parseUtf8Chunk + , parseUtf8NextChunk -- * Decoding ByteStrings to Text -- $strict -- ** Total Functions #total# -- $total - decodeLatin1 + , decodeLatin1 , decodeUtf8Lenient , decodeAsciiPrefix , Utf8DecodeState @@ -217,6 +219,114 @@ decodeAsciiPrefix bs = if B.null bs else Nothing pure (prefix, suffix) +#ifdef SIMDUTF +foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8 + :: Ptr Word8 -> CSize -> IO CInt +#endif + +data Utf8ParseState = Utf8ParseState [ByteString] Utf8CodePointState + deriving (Eq, Ord, Show, Read) + +{- +`parseUtf8Chunk chunk = (n, es)` + +n is the end index of the longest prefix of chunk that is valid UTF-8 + +es +* When `es = Left p`, there is an error: the bytes from index n and beyond are + not part of a valid UTF-8 code point, and is the index of the start of the + next (possibly valid) code point. `p - n` is the number of invalid bytes. +* When `es = Right s`, all of the remaining bytes from index n and beyond are the + beginning of an incomplete UTF-8 code point, and s is the corresponding + intermediate decoding state, which can be used to parse the next chunk with + `parseUtf8NextChunk` +-} +parseUtf8Chunk :: ByteString -> (Int, Either Int Utf8ParseState) +parseUtf8Chunk bs@(B.length -> len) +#if defined(SIMDUTF) || MIN_VERSION_bytestring(0,11,2) + | guessUtf8Boundary > 0 && + -- the rest of the bytestring valid utf-8 up to the boundary + ( +#ifdef SIMDUTF + withBS bs $ \ fp _ -> unsafeDupablePerformIO $ + unsafeWithForeignPtr fp $ \ptr -> (/= 0) <$> + c_is_valid_utf8 ptr (fromIntegral guessUtf8Boundary) +#else + B.isValidUtf8 $ B.take guessUtf8Boundary bs +#endif + ) = getEndState guessUtf8Boundary + -- No + | otherwise = getEndState 0 + where + getEndState ndx = parseUtf8 ndx ndx utf8StartState + w n word8 = len >= n && word8 <= (B.index bs $ len - n) + guessUtf8Boundary + | w 3 0xf0 = len - 3 -- third to last char starts a four-byte code point + | w 2 0xe0 = len - 2 -- pre-last char starts a three-or-four-byte code point + | w 1 0xc2 = len - 1 -- last char starts a two-(or more-)byte code point + | otherwise = len +#else + = parseUtf8 0 0 utf8StartState + where +#endif + parseUtf8 ndx0 ndx s = + if ndx < len + then + let ndx' = ndx + 1 in + case updateUtf8State (B.index bs ndx) s of + Just s' -> + parseUtf8 ( + if isUtf8StateIsComplete s' + then ndx' + else ndx0 + ) ndx' s' + Nothing -> (ndx0, Left $ if ndx == ndx0 then ndx' else ndx) + else (ndx0, Right $ Utf8ParseState [B.drop ndx0 bs] s) + +{- +parseUtf8NextChunk chunk s = (n, es) + +n +* When n >= 0, n is the length of the longest prefix of chunk that is valid + UTF-8 starting from state s (i.e., n points after the end of a full codepoint, + to the beginning of an incomplete codepoint). +* When n > 0, the starting code point from the previous input is either still + incomplete with the additonal chunk or in error. + +es +* When es = Left p, there is an error: the bytes from index n and beyond are + not part of a valid UTF-8 code point, and is the index of the start of the + next (possibly valid) code point. `p - n` is the number of invalid bytes. +* When es = Right s', all of the remaining bytes from index n and beyond are the + beginning of an incomplete UTF-8 code point, and s' is the corresponding + intermediate decoding state, which can be used to parse the next chunk with + `parseUtf8NextChunk` +-} +parseUtf8NextChunk :: ByteString -> Utf8ParseState -> (Int, Either Int Utf8ParseState) +parseUtf8NextChunk bs@(B.length -> len) st@(Utf8ParseState lead s) + | len > 0 = + let g pos s' = + -- first things first. let's try to get to the start of the next code point + if isUtf8StateIsComplete s' + -- found the beginning of the next code point, hand this off to someone else + then case parseUtf8Chunk $ B.drop pos bs of + (len', mS) -> (pos + len', mS) + -- no, code point is not complete yet + else + -- walk the rest of the code point until error, complete, or no more data + if pos < len + then + case updateUtf8State (B.index bs pos) s' of + -- error + Nothing -> (leadPos, Left pos) + -- keep going + Just s'' -> g (pos + 1) s'' + -- no more data + else (leadPos, Right $ Utf8ParseState (lead ++ [bs]) s') + in g 0 s + | otherwise = (leadPos, Right st) + where leadPos = -(foldr (\ bs' len' -> len' + B.length bs') 0 lead) + -- | A value that represents the state of a UTF-8 decoding process potentionally -- across multiple 'ByteString's. -- @@ -296,107 +406,6 @@ getCodePointStateOrError (Utf8DecodeState mCpSt (bs1@(B.length -> len1) : bss') Just (_, cpLen) -> Right cpLen getCodePointStateOrError _ = Right 0 -#ifdef SIMDUTF -foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8 - :: Ptr Word8 -> CSize -> IO CInt -#endif - -{- -parseUtf8Chunk chunk = (n, ms) - -n is the end index of the longest prefix of chunk that is valid UTF-8 - -ms -* When ms = Nothing, there is an error: the bytes from index n and beyond are - not part of a valid UTF-8 code point. -* When ms = Just s, all of the remaining bytes from index n and beyond are the - beginning of an incomplete UTF-8 code point, and s is the corresponding - intermediate decoding state, which can be used to parse the next chunk with - `parseUtf8NextChunk` --} -parseUtf8Chunk :: ByteString -> (Int, Maybe Utf8CodePointState) -parseUtf8Chunk bs@(B.length -> len) -#if defined(SIMDUTF) || MIN_VERSION_bytestring(0,11,2) - | guessUtf8Boundary > 0 && - -- the rest of the bytestring valid utf-8 up to the boundary - ( -#ifdef SIMDUTF - withBS bs $ \ fp _ -> unsafeDupablePerformIO $ - unsafeWithForeignPtr fp $ \ptr -> (/= 0) <$> - c_is_valid_utf8 ptr (fromIntegral guessUtf8Boundary) -#else - B.isValidUtf8 $ B.take guessUtf8Boundary bs -#endif - ) = getEndState guessUtf8Boundary - -- No - | otherwise = getEndState 0 - where - getEndState ndx = parseUtf8 ndx ndx utf8StartState - w n word8 = len >= n && word8 <= (B.index bs $ len - n) - guessUtf8Boundary - | w 3 0xf0 = len - 3 -- third to last char starts a four-byte code point - | w 2 0xe0 = len - 2 -- pre-last char starts a three-or-four-byte code point - | w 1 0xc2 = len - 1 -- last char starts a two-(or more-)byte code point - | otherwise = len -#else - = parseUtf8 0 0 utf8StartState - where -#endif - parseUtf8 ndx0 ndx s = - if ndx < len - then - case updateUtf8State (B.index bs ndx) s of - Just s' -> - let ndx' = ndx + 1 in - parseUtf8 ( - if isUtf8StateIsComplete s' - then ndx' - else ndx0 - ) ndx' s' - Nothing -> (ndx0, Nothing) - else (ndx0, Just s) - -{- -parseUtf8NextChunk chunk s = (n, ms) - -n -* When n >= 0, n is the length of the longest prefix of chunk that is valid - UTF-8 starting from state s (i.e., n points after the end of a full codepoint, - to the beginning of an incomplete codepoint). -* When n > 0, the starting code point from the previous input is either still - incomplete with the additonal chunk or in error. - -ms -* When ms = Nothing, there is an error: the bytes from index n and beyond are - not part of a valid UTF-8 code point. -* When ms = Just s', all of the remaining bytes from index n and beyond are the - beginning of an incomplete UTF-8 code point, and s' is the corresponding - intermediate decoding state, which can be used to parse the next chunk with - `parseUtf8NextChunk` --} -parseUtf8NextChunk :: ByteString -> Utf8CodePointState -> (Int, Maybe Utf8CodePointState) -parseUtf8NextChunk bs@(B.length -> len) s = - let g pos s' = - -- first things first. let's try to get to the start of the next code point - if isUtf8StateIsComplete s' - -- found the beginning of the next code point, hand this off to someone else - then case parseUtf8Chunk $ B.drop pos bs of - (len', s'') -> (pos + len', s'') - -- no, code point is not complete yet - else - -- walk the rest of the code point until error, complete, or no more data - if pos < len - then - case updateUtf8State (B.index bs pos) s' of - -- error - Nothing -> (-1, Nothing) - -- keep going - Just s'' -> g (pos + 1) s'' - -- no more data - else (-1, Just s') - in - g 0 s - decodeUtf8Chunks :: Utf8DecodeState -> (Either Int Int, Utf8DecodeState) decodeUtf8Chunks st@(Utf8DecodeState _ [] _ _ _ _) = (getCodePointStateOrError st, st) decodeUtf8Chunks st@(Utf8DecodeState Nothing _ _ _ _ _) = (getCodePointStateOrError st, st) @@ -587,7 +596,7 @@ bs1Off = 0 cpPos = Δbs1Off Nothing -> huntDownError 0 cpPos cpSt -- | Decodes a UTF-8 'ByteString' in the context of what has already been --- decoded which is represented by the 'Utf8DecodeState' value. Returned is the +-- decoded which is represented by the 'Utf8ParseState value. Returned is the -- new decode state and either ('Right') the number of 'Word8's that make up the -- incomplete code point at the end of the input, or ('Left') the start position -- of an invalid code point that was encountered. The position is relative to @@ -622,7 +631,7 @@ decodeNextUtf8Chunk bs@(B.length -> len) st@(Utf8DecodeState mCpSt bss bs1Off tb decodeUtf8Chunk :: ByteString -> (Either Int Int, Utf8DecodeState) decodeUtf8Chunk = flip decodeNextUtf8Chunk startUtf8State --- | If the 'Utf8DecodeState' value indicates an error state, the 'Word8' that +-- | If the 'Utf8ParseState value indicates an error state, the 'Word8' that -- the state value point to is replaced with the input 'Text' value which may -- be empty. Decoding resumes after the text is inserted and produces the result -- described by 'decodeNextUtf8Chunk'. diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs index 99e6618b..aec54ec0 100644 --- a/tests/Tests/Properties/Transcoding.hs +++ b/tests/Tests/Properties/Transcoding.hs @@ -8,6 +8,7 @@ module Tests.Properties.Transcoding import Data.Bits ((.&.), shiftR) import Data.Char (chr, ord) +import Data.Either (isLeft, isRight) import Test.QuickCheck hiding ((.&.)) import Test.Tasty (TestTree, testGroup) import Test.Tasty.QuickCheck (testProperty) @@ -35,6 +36,62 @@ tl_ascii t = EL.decodeASCII (EL.encodeUtf8 a) === a t_latin1 = E.decodeLatin1 `eq` (T.pack . BC.unpack) tl_latin1 = EL.decodeLatin1 `eq` (TL.pack . BLC.unpack) +t_p_utf8_1 = case E.parseUtf8Chunk (B.pack [0x63]) of + (result, st) -> whenEqProp result 1 . property $ isRight st +t_p_utf8_2 = case E.parseUtf8Chunk (B.pack [0x63, 0x63, 0x63]) of + (result, st) -> whenEqProp result 3 . property $ isRight st +t_p_utf8_3 = case E.parseUtf8Chunk (B.pack [0x63, 0x63, 0xc2, 0x80, 0x63]) of + (result, st) -> whenEqProp result 5 . property $ isRight st +t_p_utf8_4 = case E.parseUtf8Chunk (B.pack [0x63, 0xe1, 0x80, 0x80, 0x63]) of + (result, st) -> whenEqProp result 5 . property $ isRight st +t_p_utf8_5 = case E.parseUtf8Chunk (B.pack [0xF0, 0x90, 0x80, 0x80, 0x63]) of + (result, st) -> whenEqProp result 5 . property $ isRight st +t_p_utf8_6 = case E.parseUtf8Chunk (B.pack [0x63, 0x63, 0xF0, 0x90, 0x80]) of + (result, st) -> whenEqProp result 2 . property $ isRight st +t_p_utf8_7 = case E.parseUtf8Chunk (B.pack [0x63, 0x63, 0x63, 0xF0, 0x90]) of + (result, st) -> whenEqProp result 3 . property $ isRight st +t_p_utf8_8 = case E.parseUtf8Chunk (B.pack [0xF0, 0x90, 0x80, 0x63, 0x63]) of + (result, st) -> whenEqProp result 0 $ st === Left 3 +t_p_utf8_9 = case E.parseUtf8Chunk (B.pack [0x63, 0x63, 0x80, 0x63, 0x63]) of + (result, st) -> whenEqProp result 2 $ st === Left 3 +t_p_utf8_0 = case E.parseUtf8Chunk (B.pack [0x63, 0x63, 0xe1, 0x63, 0x63]) of + (result, st) -> whenEqProp result 2 $ st === Left 3 + +t_pn_utf8_1 = case E.parseUtf8Chunk (B.pack [0xF0, 0x90, 0x80]) of + (result0, mS) -> whenEqProp result0 0 $ + case mS of + Left _ -> counterexample (show mS) False + Right s -> case E.parseUtf8NextChunk (B.pack [0x80]) s of + (result1, mS1) -> whenEqProp result1 1 $ + if isLeft mS1 + then counterexample (show mS1) False + else case E.parseUtf8NextChunk (B.pack [0x7f]) s of + (result2, mS2) -> whenEqProp result2 (-3) $ mS2 === Left 0 +t_pn_utf8_2 = case E.parseUtf8Chunk (B.pack [0xF0]) of + (result0, mS0) -> whenEqProp result0 0 $ + case mS0 of + Left _ -> counterexample (show mS0) False + Right s0 -> case E.parseUtf8NextChunk (B.pack [0x7f]) s0 of + (result1, mS1) -> whenEqProp result1 (-1) . + whenEqProp mS1 (Left 0) $ + case E.parseUtf8NextChunk (B.pack [0x90]) s0 of + (result2, mS2) -> whenEqProp result2 (-1) $ + case mS2 of + Left _ -> counterexample (show mS2) False + Right s1 -> case E.parseUtf8NextChunk (B.pack [0x7f]) s1 of + (result3, mS3) -> whenEqProp result3 (-2) . + whenEqProp mS3 (Left 0) $ + case E.parseUtf8NextChunk (B.pack [0x80]) s1 of + (result4, mS4) -> whenEqProp result4 (-2) $ + case mS4 of + Left _ -> counterexample (show mS3) False + Right s2 -> case E.parseUtf8NextChunk (B.pack [0x7f]) s2 of + (result5, mS5) -> whenEqProp result5 (-3) . + whenEqProp mS5 (Left 0) $ + case E.parseUtf8NextChunk (B.pack [0x80]) s2 of + (result6, mS6) -> whenEqProp result6 1 $ + property $ isRight mS6 + t_utf8_c = (fst . E.outAvailableUtf8Text . snd . E.decodeUtf8Chunk . E.encodeUtf8) `eq` id t_utf8 = (E.decodeUtf8 . E.encodeUtf8) `eq` id t_utf8' = (E.decodeUtf8' . E.encodeUtf8) `eq` (id . Right) @@ -234,6 +291,19 @@ testTranscoding = testProperty "tl_ascii" tl_ascii, testProperty "t_latin1" t_latin1, testProperty "tl_latin1" tl_latin1, + testProperty "t_p_utf8_1" t_p_utf8_1, + testProperty "t_p_utf8_2" t_p_utf8_2, + testProperty "t_p_utf8_3" t_p_utf8_3, + testProperty "t_p_utf8_4" t_p_utf8_4, + testProperty "t_p_utf8_5" t_p_utf8_5, + testProperty "t_p_utf8_6" t_p_utf8_6, + testProperty "t_p_utf8_7" t_p_utf8_7, + testProperty "t_p_utf8_8" t_p_utf8_8, + testProperty "t_p_utf8_9" t_p_utf8_9, + testProperty "t_p_utf8_0" t_p_utf8_0, + testProperty "t_pn_utf8_1" t_pn_utf8_1, + testProperty "t_pn_utf8_2" t_pn_utf8_2, + -- testProperty "t_pn_utf8_3" t_pn_utf8_3, testProperty "t_utf8_c" t_utf8_c, testProperty "t_utf8" t_utf8, testProperty "t_utf8'" t_utf8', From dd1f57d8262165a31f0cadeac6b85d21c6ba8bc0 Mon Sep 17 00:00:00 2001 From: david-sledge Date: Sun, 16 Oct 2022 22:51:06 -0600 Subject: [PATCH 49/87] a little more prototypin' UTF-8 parser/decoder... --- src/Data/Text/Encoding.hs | 117 +++++++++++++++++++++++++++++++------- 1 file changed, 95 insertions(+), 22 deletions(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index cf2b9f50..e7e07459 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -23,8 +23,17 @@ module Data.Text.Encoding ( Utf8ParseState + , partialCodePoint + , codePointState , parseUtf8Chunk , parseUtf8NextChunk + , TextDataStack + , dataStack + , stackLen + , startUtf8ParseState + , injectText + , stackToText + -- * Decoding ByteStrings to Text -- $strict @@ -224,9 +233,15 @@ foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8 :: Ptr Word8 -> CSize -> IO CInt #endif -data Utf8ParseState = Utf8ParseState [ByteString] Utf8CodePointState +data Utf8ParseState = Utf8ParseState + { partialCodePoint :: [ByteString] + , codePointState :: Utf8CodePointState + } deriving (Eq, Ord, Show, Read) +startUtf8ParseState ::Utf8ParseState +startUtf8ParseState = Utf8ParseState [] utf8StartState + {- `parseUtf8Chunk chunk = (n, es)` @@ -269,9 +284,8 @@ parseUtf8Chunk bs@(B.length -> len) = parseUtf8 0 0 utf8StartState where #endif - parseUtf8 ndx0 ndx s = - if ndx < len - then + parseUtf8 ndx0 ndx s + | ndx < len = let ndx' = ndx + 1 in case updateUtf8State (B.index bs ndx) s of Just s' -> @@ -281,7 +295,7 @@ parseUtf8Chunk bs@(B.length -> len) else ndx0 ) ndx' s' Nothing -> (ndx0, Left $ if ndx == ndx0 then ndx' else ndx) - else (ndx0, Right $ Utf8ParseState [B.drop ndx0 bs] s) + | otherwise = (ndx0, Right $ Utf8ParseState [B.drop ndx0 bs] s) {- parseUtf8NextChunk chunk s = (n, es) @@ -305,28 +319,87 @@ es parseUtf8NextChunk :: ByteString -> Utf8ParseState -> (Int, Either Int Utf8ParseState) parseUtf8NextChunk bs@(B.length -> len) st@(Utf8ParseState lead s) | len > 0 = - let g pos s' = + let g pos s' -- first things first. let's try to get to the start of the next code point - if isUtf8StateIsComplete s' - -- found the beginning of the next code point, hand this off to someone else - then case parseUtf8Chunk $ B.drop pos bs of - (len', mS) -> (pos + len', mS) - -- no, code point is not complete yet - else - -- walk the rest of the code point until error, complete, or no more data - if pos < len - then - case updateUtf8State (B.index bs pos) s' of - -- error - Nothing -> (leadPos, Left pos) - -- keep going - Just s'' -> g (pos + 1) s'' - -- no more data - else (leadPos, Right $ Utf8ParseState (lead ++ [bs]) s') + | isUtf8StateIsComplete s' = + -- found the beginning of the next code point, hand this off to someone else + case parseUtf8Chunk $ B.drop pos bs of + (len', mS) -> (pos + len', mS) + -- code point is not complete yet + -- walk the rest of the code point until error, complete, or no more data + | pos < len = + case updateUtf8State (B.index bs pos) s' of + -- error + Nothing -> (leadPos, Left pos) + -- keep going + Just s'' -> g (pos + 1) s'' + -- no more data + | otherwise = (leadPos, Right $ Utf8ParseState (lead ++ [bs]) s') in g 0 s | otherwise = (leadPos, Right st) where leadPos = -(foldr (\ bs' len' -> len' + B.length bs') 0 lead) +data TextDataStack = TextDataStack + { dataStack :: [Either Text ByteString] + , stackLen :: Int + } + deriving Show + +injectText :: Text -> TextDataStack -> TextDataStack +injectText t@(Text _ _ tLen) tds@(TextDataStack stack sLen) = + if tLen > 0 + then TextDataStack (Left t : stack) $ sLen + tLen + else tds + +stackToText :: TextDataStack -> Text +stackToText (TextDataStack stack sLen) + | sLen > 0 = runST $ + do + dst <- A.new sLen + let g (dat : dataStack') tLen' = + (case dat of + Left (Text arr0 off utf8Len) -> do + let dstOff = tLen' - utf8Len + A.copyI utf8Len dst dstOff arr0 off + pure dstOff + Right bs@(B.length -> utf8Len) -> do + let dstOff = tLen' - utf8Len + withBS bs $ \ fp _ -> + unsafeIOToST . unsafeWithForeignPtr fp $ \ src -> + unsafeSTToIO $ A.copyFromPointer dst dstOff src utf8Len + pure dstOff) >>= g dataStack' + g _ _ = pure () + g stack sLen + arr <- A.unsafeFreeze dst + pure $ Text arr 0 sLen + | otherwise = empty + +decodeNextUtf8Chunk' + :: ByteString + -> Utf8ParseState + -> TextDataStack + -> (Either (Int, ByteString) Utf8ParseState, TextDataStack) +decodeNextUtf8Chunk' bs s tds = + case parseUtf8NextChunk bs s of + (len, res) -> + let stackedData' + | len >= 0 = + let stackedData@(TextDataStack stack' sLen') = + foldr (\ bs'@(B.length -> bLen) (TextDataStack stack sLen) -> + TextDataStack (Right bs' : stack) $ sLen + bLen) tds $ partialCodePoint s + in + if len > 0 + then TextDataStack (Right (B.take len bs) : stack') $ sLen' + len + else stackedData + | otherwise = tds + in + case res of + Left pos -> (Left (pos, B.drop pos bs), stackedData') + Right s' -> (Right s', stackedData') + +decodeUtf8Chunk' :: ByteString -> (Either (Int, ByteString) Utf8ParseState, TextDataStack) +decodeUtf8Chunk' bs = decodeNextUtf8Chunk' bs startUtf8ParseState $ TextDataStack [] 0 + -- | A value that represents the state of a UTF-8 decoding process potentionally -- across multiple 'ByteString's. -- From ee53c15fb3e6762eacee011b0cd37ceff06c0f42 Mon Sep 17 00:00:00 2001 From: david-sledge Date: Sat, 22 Oct 2022 23:07:01 -0600 Subject: [PATCH 50/87] getting there... --- src/Data/Text/Encoding.hs | 503 ++++---------------------- tests/Tests/Properties/Transcoding.hs | 62 ++-- 2 files changed, 117 insertions(+), 448 deletions(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index e7e07459..1c4298c3 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -26,12 +26,13 @@ module Data.Text.Encoding , partialCodePoint , codePointState , parseUtf8Chunk - , parseUtf8NextChunk + , parseNextUtf8Chunk + , startUtf8ParseState , TextDataStack , dataStack , stackLen - , startUtf8ParseState - , injectText + , emptyStack + , pushText , stackToText -- * Decoding ByteStrings to Text @@ -42,12 +43,8 @@ module Data.Text.Encoding , decodeLatin1 , decodeUtf8Lenient , decodeAsciiPrefix - , Utf8DecodeState - , startUtf8State - , outAvailableUtf8Text , decodeNextUtf8Chunk , decodeUtf8Chunk - , recoverFromUtf8Error -- *** Catchable failure , decodeUtf8' @@ -91,14 +88,13 @@ module Data.Text.Encoding import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) import Control.Exception (evaluate, try) -import Control.Monad (when) import Control.Monad.ST (runST) import Data.Bits (shiftR, (.&.)) import Data.ByteString (ByteString) import qualified Data.ByteString as B import qualified Data.ByteString.Internal as B import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode) -import Data.Text.Internal (Text(..), empty, append) +import Data.Text.Internal (Text(..), empty) import Data.Text.Internal.Unsafe (unsafeWithForeignPtr) import Data.Text.Show as T (singleton) import Data.Text.Unsafe (unsafeDupablePerformIO) @@ -254,7 +250,7 @@ es * When `es = Right s`, all of the remaining bytes from index n and beyond are the beginning of an incomplete UTF-8 code point, and s is the corresponding intermediate decoding state, which can be used to parse the next chunk with - `parseUtf8NextChunk` + `parseNextUtf8Chunk` -} parseUtf8Chunk :: ByteString -> (Int, Either Int Utf8ParseState) parseUtf8Chunk bs@(B.length -> len) @@ -295,10 +291,10 @@ parseUtf8Chunk bs@(B.length -> len) else ndx0 ) ndx' s' Nothing -> (ndx0, Left $ if ndx == ndx0 then ndx' else ndx) - | otherwise = (ndx0, Right $ Utf8ParseState [B.drop ndx0 bs] s) + | otherwise = (ndx0, Right $ Utf8ParseState (if ndx0 < len then [B.drop ndx0 bs] else []) s) {- -parseUtf8NextChunk chunk s = (n, es) +parseNextUtf8Chunk chunk s = (n, es) n * When n >= 0, n is the length of the longest prefix of chunk that is valid @@ -314,17 +310,17 @@ es * When es = Right s', all of the remaining bytes from index n and beyond are the beginning of an incomplete UTF-8 code point, and s' is the corresponding intermediate decoding state, which can be used to parse the next chunk with - `parseUtf8NextChunk` + `parseNextUtf8Chunk` -} -parseUtf8NextChunk :: ByteString -> Utf8ParseState -> (Int, Either Int Utf8ParseState) -parseUtf8NextChunk bs@(B.length -> len) st@(Utf8ParseState lead s) +parseNextUtf8Chunk :: ByteString -> Utf8ParseState -> (Int, Either Int Utf8ParseState) +parseNextUtf8Chunk bs@(B.length -> len) st@(Utf8ParseState lead s) | len > 0 = let g pos s' -- first things first. let's try to get to the start of the next code point | isUtf8StateIsComplete s' = -- found the beginning of the next code point, hand this off to someone else case parseUtf8Chunk $ B.drop pos bs of - (len', mS) -> (pos + len', mS) + (len', mS) -> (pos + len', case mS of Left p -> Left (p + pos); _ -> mS) -- code point is not complete yet -- walk the rest of the code point until error, complete, or no more data | pos < len = @@ -345,8 +341,11 @@ data TextDataStack = TextDataStack } deriving Show -injectText :: Text -> TextDataStack -> TextDataStack -injectText t@(Text _ _ tLen) tds@(TextDataStack stack sLen) = +emptyStack :: TextDataStack +emptyStack = TextDataStack [] 0 + +pushText :: Text -> TextDataStack -> TextDataStack +pushText t@(Text _ _ tLen) tds@(TextDataStack stack sLen) = if tLen > 0 then TextDataStack (Left t : stack) $ sLen + tLen else tds @@ -374,382 +373,37 @@ stackToText (TextDataStack stack sLen) pure $ Text arr 0 sLen | otherwise = empty -decodeNextUtf8Chunk' +decodeNextUtf8Chunk :: ByteString -> Utf8ParseState -> TextDataStack - -> (Either (Int, ByteString) Utf8ParseState, TextDataStack) -decodeNextUtf8Chunk' bs s tds = - case parseUtf8NextChunk bs s of + -> ((Int, Either (Int, ByteString) Utf8ParseState), TextDataStack) +decodeNextUtf8Chunk bs s tds = + case parseNextUtf8Chunk bs s of (len, res) -> let stackedData' | len >= 0 = let stackedData@(TextDataStack stack' sLen') = - foldr (\ bs'@(B.length -> bLen) (TextDataStack stack sLen) -> - TextDataStack (Right bs' : stack) $ sLen + bLen) tds $ partialCodePoint s + foldl (\ tds'@(TextDataStack stack sLen) bs'@(B.length -> bLen) -> + if bLen > 0 + then TextDataStack (Right bs' : stack) $ sLen + bLen + else tds' + ) tds $ partialCodePoint s in if len > 0 then TextDataStack (Right (B.take len bs) : stack') $ sLen' + len else stackedData | otherwise = tds in - case res of - Left pos -> (Left (pos, B.drop pos bs), stackedData') - Right s' -> (Right s', stackedData') + ( ( len + , case res of + Left pos -> Left (pos, B.drop pos bs) + Right s' -> Right s' + ) + , stackedData') -decodeUtf8Chunk' :: ByteString -> (Either (Int, ByteString) Utf8ParseState, TextDataStack) -decodeUtf8Chunk' bs = decodeNextUtf8Chunk' bs startUtf8ParseState $ TextDataStack [] 0 - --- | A value that represents the state of a UTF-8 decoding process potentionally --- across multiple 'ByteString's. --- --- @since 2.0.2 -data Utf8DecodeState = Utf8DecodeState - -- Code point decode state or error - (Maybe - -- self-explanatory (I hope) - ( Utf8CodePointState - -- Count of Word8s that have been evaluated so far for this code point. - -- The first word is specified by Position indicator below - , Int - )) - -- ByteStrings containing data whose evaluations are unfinished - [ByteString] - -- Postion within the lead ByteString of either unfinished evaluated data or - -- the first word of an invalid code point. - Int - -- the first non-listed Word8 in the lead ByteString. - Int - -- Queued text data. - [Either Text (ByteString, Int, Int)] - -- Word8 length of listed data. - Int - deriving (Show) - --- | This represents the begining state of a UTF-8 decoding process. --- --- @since 2.0.2 -startUtf8State :: Utf8DecodeState -startUtf8State = Utf8DecodeState (Just (utf8StartState, 0)) [] 0 0 [] 0 - --- | Takes whatever data has been decoded thus far and spits it out as a `Text` --- value and a `Utf8DecodeState` value that no longer references the decoded --- data. This function operates on error states, but does not clear the error. --- (See 'recoverFromUtf8Error'.) --- --- @since 2.0.2 -outAvailableUtf8Text :: Utf8DecodeState -> (Text, Utf8DecodeState) -outAvailableUtf8Text st@(Utf8DecodeState mCpSt bss bs1Off tbpPos dataStack tLen) = - let tbpLen = bs1Off - tbpPos - totalLen = tLen + tbpLen - in - if totalLen > 0 - then runST $ do - dst <- A.new totalLen - when (tbpLen > 0) . withBS (head bss) $ \ fp _ -> - unsafeIOToST . unsafeWithForeignPtr fp $ \ src -> - unsafeSTToIO $ A.copyFromPointer dst tLen (src `plusPtr` tbpPos) tbpLen - let g (dat : dataStack') tLen' = - (case dat of - Left (Text arr0 off utf8Len) -> do - let dstOff = tLen' - utf8Len - A.copyI utf8Len dst dstOff arr0 off - pure dstOff - Right (bs, bsOff, utf8Len) -> do - let dstOff = tLen' - utf8Len - withBS bs $ \ fp _ -> - unsafeIOToST . unsafeWithForeignPtr fp $ \ src -> - unsafeSTToIO $ A.copyFromPointer dst dstOff (src `plusPtr` bsOff) utf8Len - pure dstOff) >>= g dataStack' - g _ _ = pure () - g dataStack tLen - arr <- A.unsafeFreeze dst - pure (Text arr 0 totalLen, Utf8DecodeState mCpSt bss bs1Off bs1Off [] 0) - else (empty, st) - -getCodePointStateOrError :: Utf8DecodeState -> Either Int Int -getCodePointStateOrError (Utf8DecodeState mCpSt (bs1@(B.length -> len1) : bss') bs1Off _ _ _) = - case mCpSt of - Nothing -> - let (lenInit, _) = foldr - (\ bs@(B.length -> len') (lenInit', (lenN', _)) -> - (lenInit' + lenN', (len', bs))) (0, (len1 - bs1Off, bs1)) bss' - in - Left (bs1Off - lenInit) - Just (_, cpLen) -> Right cpLen -getCodePointStateOrError _ = Right 0 - -decodeUtf8Chunks :: Utf8DecodeState -> (Either Int Int, Utf8DecodeState) -decodeUtf8Chunks st@(Utf8DecodeState _ [] _ _ _ _) = (getCodePointStateOrError st, st) -decodeUtf8Chunks st@(Utf8DecodeState Nothing _ _ _ _ _) = (getCodePointStateOrError st, st) -decodeUtf8Chunks - st@(Utf8DecodeState - (Just (cpSt, cpPos)) - bss@(bs1@(B.length -> len1) : bss') - bs1Off - tbpPos - dataStack - tLen - ) - = - {- -bs1Off = 0 cpPos = Δbs1Off - | bs1Off | boundary = Δbs1Off - v v bsX v v - |. . bs1 . .|. .|. . .bsN. . .| - ^-----------^ ^-------------^ - len1 lenN - ^---^ ^-----------^ - len1_ isValidBS span - ^-------^ - lenInit - ^---------------------^ - len - --} - let len1_ = len1 - bs1Off -- the length of the trailing portion of the first bytestring that's to be evaluated. - (lenInit, (lenN, bsN)) = foldr - (\ bs@(B.length -> len') (lenInit', (lenN', _)) -> - (lenInit' + lenN', (len', bs))) (0, (len1_, bs1)) bss' - len = lenInit + lenN - in - if len == cpPos - then (getCodePointStateOrError st, st) - else - let index i = - if i < len1_ - then B.index bs1 (i + bs1Off) - else - let index' i' (bs@(B.length -> len0) : bss'') = - if i' < len0 - then B.index bs i' - else index' (i' - len0) bss'' - index' i' _ = B.index bsN i' - in - index' (i - len1_) bss' - guessUtf8Boundary - | wi 3 0xf0 = Just $ len - 3 -- third to last char starts a four-byte code point - | wi 2 0xe0 = Just $ len - 2 -- pre-last char starts a three-or-four-byte code point - | wi 1 0xc2 = Just $ len - 1 -- last char starts a two-(or more-)byte code point - | wc 4 0xf8 0xf0 || -- last four bytes are a four-byte code point - wc 3 0xf0 0xe0 || -- last three bytes are a three-byte code point - wc 2 0xe0 0xc0 || -- last two bytes are a two-byte code point - w 1 (< 0x80) = Just len -- last char is ASCII - | otherwise = Nothing -- no clue - where - w n test = len >= n && test (index $ len - n) - wc n mask word8 = w n $ (word8 ==) . (mask .&.) - wi n word8 = w n (>= word8) - -- push the available valid data on to the list, and remove completely evaulated bytestrings. - pushValidUtf8 wordCount mCps = - let bs1Off' = bs1Off + wordCount - (bss'''', bs1Off''', tbpPos'', dataStack'', tLen'') = - if wordCount < len1_ - then (bss, bs1Off', tbpPos, dataStack, tLen) - else - let pushValidUtf8' wordCount' bss''@(bs'@(B.length -> len') : bss''') bs1Off'' tbpPos' dataStack' tLen' = - if wordCount' < len' - then (bss'', bs1Off'', tbpPos', dataStack', tLen') - else pushValidUtf8' (wordCount' - len') bss''' (bs1Off'' - len') 0 (Right (bs', 0, len') : dataStack') (tLen' + len') - pushValidUtf8' _ _ bs1Off'' tbpPos' dataStack' tLen' = ([], bs1Off'', tbpPos', dataStack', tLen') - bs1WordCount = bs1Off - tbpPos + len1_ - in - pushValidUtf8' (wordCount - len1_) bss' (bs1Off' - len1) 0 (Right (bs1, tbpPos, bs1WordCount) : dataStack) (tLen + bs1WordCount) - st' = Utf8DecodeState mCps bss'''' bs1Off''' tbpPos'' dataStack'' tLen'' - in - (getCodePointStateOrError st', st') - huntDownError off ndx cps = - if ndx < len - then - case updateUtf8State (index ndx) cps of - Just cps' -> - let ndx' = ndx + 1 in - huntDownError ( - if isUtf8StateIsComplete cps' - then ndx' - else off - ) ndx' cps' - Nothing -> pushValidUtf8 off Nothing - else pushValidUtf8 off $ Just (cps, ndx - off) - in - -- did we find the boundary? - case guessUtf8Boundary of - -- yes - Just boundary -> - -- are we before it? - if cpPos < boundary - -- yes: let's check this incomplete code point before checking the rest up to the boundary - then - let checkIncompleteCodePoint cpSt' cpPos' - -- a complete code point - | isUtf8StateIsComplete cpSt' = - let getEndState ndx cpSt'' - | ndx < len = - case updateUtf8State (index ndx) cpSt'' of - Nothing -> Nothing - Just cpSt''' -> getEndState (ndx + 1) cpSt''' - | otherwise = Just (cpSt'', ndx - boundary) - soFarSoGood = - pushValidUtf8 boundary $ getEndState boundary cpSt' - in - -- are we at the boundary? - if boundary == cpPos' - -- yes: get the state of the last code point - then soFarSoGood - -- no: - else - -- are we before bsN? - if cpPos' < lenInit - -- yes - then - -- keep walking the data until we get to bsN or an error - case updateUtf8State (index cpPos') cpSt' of - Just cpSt'' -> checkIncompleteCodePoint cpSt'' (cpPos' + 1) - Nothing -> (Left (-lenInit), Utf8DecodeState Nothing bss bs1Off tbpPos dataStack tLen) - -- no: we're in bsN - else let - off = (if lenInit > 0 - then cpPos' - lenInit - else cpPos' + bs1Off) - in - -- is the rest of the bytestring valid utf-8 up to the boundary? - if ( -#ifdef SIMDUTF - withBS bsN $ \ fp _ -> unsafeDupablePerformIO $ - unsafeWithForeignPtr fp $ \ptr -> (/= 0) <$> - c_is_valid_utf8 (plusPtr ptr off) (fromIntegral $ boundary - cpPos') -#elif MIN_VERSION_bytestring(0,11,2) - B.isValidUtf8 . B.take (boundary - cpPos') $ B.drop off bsN -#else - let bLen = boundary - cpPos' - step ndx cps - | ndx < off + bLen = - case updateUtf8State (B.unsafeIndex bsN ndx) cps of - Just cps' -> step (ndx + 1) cps' - Nothing -> False - | otherwise = isUtf8StateIsComplete cps - in - step off utf8StartState -#endif - ) - -- Yes - then soFarSoGood - -- No - else huntDownError cpPos' cpPos' cpSt' - -- We're mid code point - | otherwise = - if cpPos' < len - then - -- try to complete the code point - case updateUtf8State (index cpPos') cpSt' of - Just cpSt'' -> checkIncompleteCodePoint cpSt'' (cpPos' + 1) - -- just enough additional data to find an error with the code point - Nothing -> (Left (-lenInit), Utf8DecodeState Nothing bss bs1Off tbpPos dataStack tLen) - else - -- didn't get enough additional data to complete the code point - (Right cpPos', Utf8DecodeState (Just (cpSt', cpPos')) bss bs1Off tbpPos dataStack tLen) - in - checkIncompleteCodePoint cpSt cpPos - -- no, we're past the boundary - else - -- the code point is the only thing that (potentially) changes - let getEndCodePointState cpPos' cpSt' - | cpPos' < len = - case updateUtf8State (index cpPos') cpSt' of - Nothing -> Nothing - Just cpSt'' -> getEndCodePointState (cpPos' + 1) cpSt'' - | otherwise = Just (cpSt', cpPos' - boundary) - mCpStLen = getEndCodePointState cpPos cpSt - in - ( case mCpStLen of - Nothing -> Left (-lenInit) - Just _ -> Right len - , Utf8DecodeState mCpStLen bss bs1Off tbpPos dataStack tLen - ) - -- no: there's an error - Nothing -> huntDownError 0 cpPos cpSt - --- | Decodes a UTF-8 'ByteString' in the context of what has already been --- decoded which is represented by the 'Utf8ParseState value. Returned is the --- new decode state and either ('Right') the number of 'Word8's that make up the --- incomplete code point at the end of the input, or ('Left') the start position --- of an invalid code point that was encountered. The position is relative to --- the start of the input 'ByteString'. --- --- If the previous 'ByteString' ended with an incomplete code point, the --- beginning of the input data will be treated as a continuation of the code --- point. NOTE: That in this case if the input causes the previous incomplete --- code point to be invalid, the returned error ('Left') position value will be --- negative. --- --- If decoding the last 'ByteString' resulted in a error. The input is ignored, --- and the state value is returned unchanged. Error states can be handled with --- 'recoverFromUtf8Error'. --- --- @since 2.0.2 -decodeNextUtf8Chunk :: -#if defined(ASSERTS) - HasCallStack => -#endif - ByteString -> Utf8DecodeState -> (Either Int Int, Utf8DecodeState) -decodeNextUtf8Chunk _ st@(Utf8DecodeState Nothing _ _ _ _ _) = (getCodePointStateOrError st, st) -decodeNextUtf8Chunk bs@(B.length -> len) st@(Utf8DecodeState mCpSt bss bs1Off tbpPos dataStack tLen) - | len == 0 = (getCodePointStateOrError st, st) - | otherwise = decodeUtf8Chunks $ Utf8DecodeState mCpSt (bss ++ [bs]) bs1Off tbpPos dataStack tLen - --- | Decodes a 'ByteString' from a clean state. --- --- @decodeUtf8Chunk bs = 'decodeNextUtf8Chunk' bs 'startUtf8State'@ --- --- @since 2.0.2 -decodeUtf8Chunk :: ByteString -> (Either Int Int, Utf8DecodeState) -decodeUtf8Chunk = flip decodeNextUtf8Chunk startUtf8State - --- | If the 'Utf8ParseState value indicates an error state, the 'Word8' that --- the state value point to is replaced with the input 'Text' value which may --- be empty. Decoding resumes after the text is inserted and produces the result --- described by 'decodeNextUtf8Chunk'. --- --- If not in an error state, the 'Text' is inserted at the end of the data, but --- before an incomplete code point at the end of the last input 'ByteString'. --- --- @since 2.0.2 -recoverFromUtf8Error :: -#if defined(ASSERTS) - HasCallStack => -#endif - Text -> Utf8DecodeState -> (Either Int Int, Utf8DecodeState) -recoverFromUtf8Error t@(Text _ _ utf8Len) (Utf8DecodeState mCpSt bss@(bs1@(B.length -> len) : bss') bs1Off tbpPos dataStack tLen) = - let tbpLen = bs1Off - tbpPos - (dataStack', tLen') = - if tbpLen > 0 - then (Right (bs1, tbpPos, tbpLen) : dataStack, tLen + tbpLen) - else (dataStack, tLen) - (dataStack'', tLen'') = - if utf8Len > 0 - then (Left t : dataStack', tLen' + utf8Len) - else (dataStack', tLen') - (mCpSt'', bss'', bs1Off'') = - case mCpSt of - Just _ -> (mCpSt, bss, bs1Off) - Nothing -> - let bs1Off' = bs1Off + 1 - mCpSt' = Just (utf8StartState, 0) - in - if bs1Off' == len - then (mCpSt', bss', 0) - else (mCpSt', bss, bs1Off') - in - decodeUtf8Chunks $ Utf8DecodeState mCpSt'' bss'' bs1Off'' bs1Off'' dataStack'' tLen'' -recoverFromUtf8Error t@(Text _ _ utf8Len) (Utf8DecodeState _ _ _ _ dataStack tLen) = - decodeUtf8Chunks $ Utf8DecodeState - (Just (utf8StartState, 0)) - [] - 0 - 0 - (if utf8Len > 0 - then Left t : dataStack - else dataStack - ) $ tLen + utf8Len +decodeUtf8Chunk :: ByteString -> ((Int, Either (Int, ByteString) Utf8ParseState), TextDataStack) +decodeUtf8Chunk bs = decodeNextUtf8Chunk bs startUtf8ParseState emptyStack -- | Decode a 'ByteString' containing UTF-8 encoded text. -- @@ -760,46 +414,21 @@ decodeUtf8With :: HasCallStack => #endif OnDecodeError -> ByteString -> Text -decodeUtf8With onErr bs - | B.null undecoded = txt - | otherwise = txt `append` (case onErr desc (Just (B.head undecoded)) of - Nothing -> txt' - Just c -> T.singleton c `append` txt') - where - (txt, undecoded) = decodeUtf8With2 onErr mempty bs - txt' = decodeUtf8With onErr (B.tail undecoded) - desc = "Data.Text.Internal.Encoding: Invalid UTF-8 stream" - --- | Decode two consecutive bytestrings, returning Text and undecoded remainder. -decodeUtf8With2 :: -#if defined(ASSERTS) - HasCallStack => -#endif - OnDecodeError -> ByteString -> ByteString -> (Text, ByteString) -decodeUtf8With2 onErr bs1@(B.length -> len1) bs2@(B.length -> len2) = - let g res isSecondBs = +decodeUtf8With onErr bs = + let handleErr errPos errEndPos bs' tds + | errPos < errEndPos = handleErr (errPos + 1) errEndPos bs' $ + case onErr "Data.Text.Internal.Encoding: Invalid UTF-8 stream" . Just $ B.index bs' errPos of + Just c -> pushText (T.singleton c) tds + Nothing -> tds + | otherwise = tds + g bs'@(B.length -> bLen) res = case res of - ((Left pos), st) -> - g ( recoverFromUtf8Error - ( case onErr "Data.Text.Internal.Encoding: Invalid UTF-8 code point" . Just $ - if pos >= 0 - then B.index (if isSecondBs then bs2 else bs1) pos - else B.index bs1 (len1 + pos) of - Just c -> T.singleton c - Nothing -> empty - ) st - ) isSecondBs - ((Right cpLen), st) -> - if isSecondBs - then - ( fst $ outAvailableUtf8Text st - , if cpLen > len2 - then B.drop (len1 + len2 - cpLen) bs1 `B.append` bs2 - else B.drop (len2 - cpLen) bs2 - ) - else g (decodeNextUtf8Chunk bs2 st) True + ((len, eS), tds) -> + case eS of + Left (pos, bs'') -> g bs'' . decodeNextUtf8Chunk bs'' startUtf8ParseState $ handleErr len pos bs' tds + Right _ -> stackToText $ handleErr len bLen bs' tds in - g (decodeUtf8Chunk bs1) False + g bs $ decodeUtf8Chunk bs -- $stream -- @@ -891,11 +520,39 @@ streamDecodeUtf8With :: HasCallStack => #endif OnDecodeError -> ByteString -> Decoding -streamDecodeUtf8With onErr = go mempty - where - go bs1 bs2 = Some txt undecoded (go undecoded) - where - (txt, undecoded) = decodeUtf8With2 onErr bs1 bs2 +-- streamDecodeUtf8With onErr = go mempty +-- where +-- go bs1 bs2 = Some txt undecoded (go undecoded) +-- where +-- (txt, undecoded) = decodeUtf8With2 onErr bs1 bs2 +streamDecodeUtf8With onErr bs = + let handleErr errPos errEndPos bss tds + | errPos < errEndPos = + let errPos' = errPos + 1 + (bs'@(B.length -> len) : bss') = bss + h = if errPos' < len + then handleErr errPos' errEndPos bss + else handleErr 0 (errEndPos - len) bss' + in + h $ + case onErr "Data.Text.Internal.Encoding: Invalid UTF-8 stream" . Just $ B.index bs' errPos of + Just c -> pushText (T.singleton c) tds + Nothing -> tds + | otherwise = tds + g bs' s tds = + case decodeNextUtf8Chunk bs' s tds of + ((len, eS), tds') -> + case eS of + Left (pos, bs'') -> g bs'' startUtf8ParseState $ + ( if len < 0 + then handleErr 0 (pos - len) $ partialCodePoint s ++ [B.take pos bs'] + else handleErr len pos [bs'] + ) tds' + Right s' -> let bss' = partialCodePoint s' in + Some (stackToText tds') (B.concat bss') $ \ bs'' -> + g bs'' s' emptyStack + in + g bs startUtf8ParseState emptyStack -- | Decode a 'ByteString' containing UTF-8 encoded text that is known -- to be valid. diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs index aec54ec0..2d5395a0 100644 --- a/tests/Tests/Properties/Transcoding.hs +++ b/tests/Tests/Properties/Transcoding.hs @@ -25,6 +25,7 @@ import qualified Data.ByteString.Lazy.Char8 as BLC import qualified Data.Text as T import qualified Data.Text.Encoding as E import qualified Data.Text.Encoding.Error as E +import Data.Text.Internal.Encoding.Utf8 (isUtf8StateIsComplete) import qualified Data.Text.Lazy as TL import qualified Data.Text.Lazy.Encoding as EL @@ -61,38 +62,46 @@ t_pn_utf8_1 = case E.parseUtf8Chunk (B.pack [0xF0, 0x90, 0x80]) of (result0, mS) -> whenEqProp result0 0 $ case mS of Left _ -> counterexample (show mS) False - Right s -> case E.parseUtf8NextChunk (B.pack [0x80]) s of + Right s -> case E.parseNextUtf8Chunk (B.pack [0x80]) s of (result1, mS1) -> whenEqProp result1 1 $ if isLeft mS1 then counterexample (show mS1) False - else case E.parseUtf8NextChunk (B.pack [0x7f]) s of + else case E.parseNextUtf8Chunk (B.pack [0x7f]) s of (result2, mS2) -> whenEqProp result2 (-3) $ mS2 === Left 0 t_pn_utf8_2 = case E.parseUtf8Chunk (B.pack [0xF0]) of (result0, mS0) -> whenEqProp result0 0 $ case mS0 of Left _ -> counterexample (show mS0) False - Right s0 -> case E.parseUtf8NextChunk (B.pack [0x7f]) s0 of + Right s0 -> case E.parseNextUtf8Chunk (B.pack [0x7f]) s0 of (result1, mS1) -> whenEqProp result1 (-1) . whenEqProp mS1 (Left 0) $ - case E.parseUtf8NextChunk (B.pack [0x90]) s0 of + case E.parseNextUtf8Chunk (B.pack [0x90]) s0 of (result2, mS2) -> whenEqProp result2 (-1) $ case mS2 of Left _ -> counterexample (show mS2) False - Right s1 -> case E.parseUtf8NextChunk (B.pack [0x7f]) s1 of + Right s1 -> case E.parseNextUtf8Chunk (B.pack [0x7f]) s1 of (result3, mS3) -> whenEqProp result3 (-2) . whenEqProp mS3 (Left 0) $ - case E.parseUtf8NextChunk (B.pack [0x80]) s1 of + case E.parseNextUtf8Chunk (B.pack [0x80]) s1 of (result4, mS4) -> whenEqProp result4 (-2) $ case mS4 of Left _ -> counterexample (show mS3) False - Right s2 -> case E.parseUtf8NextChunk (B.pack [0x7f]) s2 of + Right s2 -> case E.parseNextUtf8Chunk (B.pack [0x7f]) s2 of (result5, mS5) -> whenEqProp result5 (-3) . whenEqProp mS5 (Left 0) $ - case E.parseUtf8NextChunk (B.pack [0x80]) s2 of + case E.parseNextUtf8Chunk (B.pack [0x80]) s2 of (result6, mS6) -> whenEqProp result6 1 $ property $ isRight mS6 +t_pn_utf8_3 = case E.parseUtf8Chunk $ B.pack [0xc2] of + (len1, eS1) -> whenEqProp len1 0 $ case eS1 of + Left _ -> counterexample (show eS1) False + Right s1 -> whenEqProp (E.partialCodePoint s1) [B.pack [0xc2]] $ + if isUtf8StateIsComplete $ E.codePointState s1 + then counterexample (show $ E.codePointState s1) False + else case E.parseNextUtf8Chunk (B.pack [0x80, 0x80]) s1 of + (len2, eS2) -> whenEqProp len2 1 $ eS2 === Left 2 -t_utf8_c = (fst . E.outAvailableUtf8Text . snd . E.decodeUtf8Chunk . E.encodeUtf8) `eq` id +t_utf8_c = (E.stackToText . snd . E.decodeUtf8Chunk . E.encodeUtf8) `eq` id t_utf8 = (E.decodeUtf8 . E.encodeUtf8) `eq` id t_utf8' = (E.decodeUtf8' . E.encodeUtf8) `eq` (id . Right) tl_utf8 = (EL.decodeUtf8 . EL.encodeUtf8) `eq` id @@ -248,6 +257,11 @@ t_decode_with_error3 = t_decode_with_error4 = E.decodeUtf8With (\_ _ -> Just 'x') (B.pack [0xF0, 97, 97, 97]) === "xaaa" +t_decode_with_error1' = + case E.streamDecodeUtf8With (\_ _ -> Just 'x') (B.pack [0xc2]) of + E.Some x1 bs1 f1 -> whenEqProp x1 "" . whenEqProp bs1 (B.pack [0xc2]) $ + case f1 $ B.pack [0x80, 0x80] of + E.Some x2 bs2 _ -> whenEqProp x2 "\x80x" $ bs2 === mempty t_decode_with_error2' = case E.streamDecodeUtf8With (\_ _ -> Just 'x') (B.pack [0xC2, 97]) of E.Some x _ _ -> x === "xa" @@ -263,21 +277,18 @@ t_decode_with_error5' = ioProperty $ do Left (_ :: E.UnicodeException) -> True Right{} -> False -t_decode_chunk = - case E.decodeUtf8Chunk $ B.pack [0xC2, 97] of - (result, st) -> whenEqProp result (Left 0) $ - case E.decodeNextUtf8Chunk (B.pack [0x63, 0x63, 0x63, 0x63, 0x63, 0x63]) st of - (result', st') -> whenEqProp result' result $ - case E.recoverFromUtf8Error "bbbb" st' of - (result'', st'') -> whenEqProp result'' (Right 0) $ - case E.decodeNextUtf8Chunk (B.pack [0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0xc2]) st'' of - (result''', st''') -> whenEqProp result''' (Right 1) $ - case E.outAvailableUtf8Text st''' of - (t, st'''') -> whenEqProp t "bbbbadddddd" $ - case E.decodeNextUtf8Chunk (B.singleton 0x80) st'''' of - (result'''', st''''') -> whenEqProp result'''' (Right 0) $ - case E.outAvailableUtf8Text st''''' of - (t', _) -> t' === "\x80" +t_decode_chunk = case E.decodeUtf8Chunk $ B.pack [0xc2] of + ((len1, eS1), tds1) -> whenEqProp (len1, E.dataStack tds1, E.stackLen tds1) (0, [], 0) $ + case eS1 of + Left _ -> counterexample (show eS1) False + Right s1 -> whenEqProp (E.partialCodePoint s1) [B.pack [0xc2]] $ + if isUtf8StateIsComplete $ E.codePointState s1 + then counterexample (show $ E.codePointState s1) False + else case E.decodeNextUtf8Chunk (B.pack [0x80, 0x80]) s1 tds1 of + ((len2, eS2), tds2) -> whenEqProp (len2, E.dataStack tds2, E.stackLen tds2) (1, [Right $ B.pack [0x80], Right $ B.pack [0xc2]], 2) $ + case eS2 of + Right _ -> counterexample (show eS2) False + Left res -> res === (2, mempty) t_infix_concat bs1 text bs2 = forAll (Blind <$> genDecodeErr Replace) $ \(Blind onErr) -> @@ -303,7 +314,7 @@ testTranscoding = testProperty "t_p_utf8_0" t_p_utf8_0, testProperty "t_pn_utf8_1" t_pn_utf8_1, testProperty "t_pn_utf8_2" t_pn_utf8_2, - -- testProperty "t_pn_utf8_3" t_pn_utf8_3, + testProperty "t_pn_utf8_3" t_pn_utf8_3, testProperty "t_utf8_c" t_utf8_c, testProperty "t_utf8" t_utf8, testProperty "t_utf8'" t_utf8', @@ -335,6 +346,7 @@ testTranscoding = testProperty "t_decode_with_error2" t_decode_with_error2, testProperty "t_decode_with_error3" t_decode_with_error3, testProperty "t_decode_with_error4" t_decode_with_error4, + testProperty "t_decode_with_error1'" t_decode_with_error1', testProperty "t_decode_with_error2'" t_decode_with_error2', testProperty "t_decode_with_error3'" t_decode_with_error3', testProperty "t_decode_with_error4'" t_decode_with_error4', From c971a88653c85a409d8c856438b4f33f50f618fe Mon Sep 17 00:00:00 2001 From: david-sledge Date: Sun, 23 Oct 2022 23:18:55 -0600 Subject: [PATCH 51/87] A little refactorin' --- src/Data/Text/Encoding.hs | 188 ++++++++++++++++++--------------- src/Data/Text/Lazy/Encoding.hs | 32 +++--- 2 files changed, 119 insertions(+), 101 deletions(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index 1c4298c3..f7bf8997 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -28,12 +28,6 @@ module Data.Text.Encoding , parseUtf8Chunk , parseNextUtf8Chunk , startUtf8ParseState - , TextDataStack - , dataStack - , stackLen - , emptyStack - , pushText - , stackToText -- * Decoding ByteStrings to Text -- $strict @@ -41,15 +35,22 @@ module Data.Text.Encoding -- ** Total Functions #total# -- $total , decodeLatin1 - , decodeUtf8Lenient , decodeAsciiPrefix + , TextDataStack + , dataStack + , stackLen + , emptyStack + , pushText + , stackToText , decodeNextUtf8Chunk , decodeUtf8Chunk + , decodeUtf8Lenient -- *** Catchable failure , decodeUtf8' -- *** Controllable error handling + , handleUtf8Err , decodeUtf8With , decodeUtf16LEWith , decodeUtf16BEWith @@ -147,8 +148,38 @@ import qualified Data.ByteString.Unsafe as B -- (preferably not at all). See "Data.Text.Encoding#g:total" for better -- solutions. --- | Decode a 'ByteString' containing 7-bit ASCII --- encoded text. +-- | Decode a 'ByteString' containing 7-bit ASCII encoded text. +-- +-- This is a total function. The 'ByteString' is decoded until either +-- the end is reached or it errors with the first non-ASCII 'Word8' is +-- encountered. In either case the function will return the 'Text' +-- value of the longest prefix that is valid ASCII. On error, the index +-- of the non-ASCII 'Word8' is also returned. +-- +-- @since 2.0.2 +decodeAsciiPrefix :: +#if defined(ASSERTS) + HasCallStack => +#endif + ByteString -> (Text, Maybe (Word8, Int)) +decodeAsciiPrefix bs = if B.null bs + then (empty, Nothing) + else unsafeDupablePerformIO $ withBS bs $ \ fp len -> + unsafeWithForeignPtr fp $ \src -> do + asciiPrefixLen <- fmap fromIntegral . c_is_ascii src $ src `plusPtr` len + let !prefix = if asciiPrefixLen == 0 + then empty + else runST $ do + dst <- A.new asciiPrefixLen + A.copyFromPointer dst 0 src asciiPrefixLen + arr <- A.unsafeFreeze dst + pure $ Text arr 0 asciiPrefixLen + let suffix = if asciiPrefixLen < len + then Just (B.index bs asciiPrefixLen, asciiPrefixLen) + else Nothing + pure (prefix, suffix) + +-- | Decode a 'ByteString' containing 7-bit ASCII encoded text. -- -- This is a partial function: it checks that input does not contain -- anything except ASCII and copies buffer or throws an error otherwise. @@ -175,7 +206,7 @@ decodeLatin1 :: decodeLatin1 bs = withBS bs $ \fp len -> runST $ do dst <- A.new (2 * len) let inner srcOff dstOff = if srcOff >= len then return dstOff else do - asciiPrefixLen <- fmap cSizeToInt $ unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> + asciiPrefixLen <- fmap fromIntegral $ unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> c_is_ascii (src `plusPtr` srcOff) (src `plusPtr` len) if asciiPrefixLen == 0 then do @@ -196,39 +227,6 @@ decodeLatin1 bs = withBS bs $ \fp len -> runST $ do foreign import ccall unsafe "_hs_text_is_ascii" c_is_ascii :: Ptr Word8 -> Ptr Word8 -> IO CSize --- | Decode a 'ByteString' containing ASCII. --- --- This is a total function. The 'ByteString' is decoded until either --- the end is reached or it errors with the first non-ASCII 'Word8' is --- encountered. In either case the function will return what 'Text' was --- decoded. On error, the index of the non-ASCII 'Word8' is also returned. --- --- @since 2.0.2 -decodeAsciiPrefix - :: ByteString - -> (Text, Maybe (Word8, Int)) -decodeAsciiPrefix bs = if B.null bs - then (empty, Nothing) - else unsafeDupablePerformIO $ withBS bs $ \ fp len -> - unsafeWithForeignPtr fp $ \src -> do - asciiPrefixLen <- fmap fromIntegral . c_is_ascii src $ src `plusPtr` len - let !prefix = if asciiPrefixLen == 0 - then empty - else runST $ do - dst <- A.new asciiPrefixLen - A.copyFromPointer dst 0 src asciiPrefixLen - arr <- A.unsafeFreeze dst - pure $ Text arr 0 asciiPrefixLen - let suffix = if asciiPrefixLen < len - then Just (B.index bs asciiPrefixLen, asciiPrefixLen) - else Nothing - pure (prefix, suffix) - -#ifdef SIMDUTF -foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8 - :: Ptr Word8 -> CSize -> IO CInt -#endif - data Utf8ParseState = Utf8ParseState { partialCodePoint :: [ByteString] , codePointState :: Utf8CodePointState @@ -238,6 +236,11 @@ data Utf8ParseState = Utf8ParseState startUtf8ParseState ::Utf8ParseState startUtf8ParseState = Utf8ParseState [] utf8StartState +#ifdef SIMDUTF +foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8 + :: Ptr Word8 -> CSize -> IO CInt +#endif + {- `parseUtf8Chunk chunk = (n, es)` @@ -252,7 +255,11 @@ es intermediate decoding state, which can be used to parse the next chunk with `parseNextUtf8Chunk` -} -parseUtf8Chunk :: ByteString -> (Int, Either Int Utf8ParseState) +parseUtf8Chunk :: +#if defined(ASSERTS) + HasCallStack => +#endif + ByteString -> (Int, Either Int Utf8ParseState) parseUtf8Chunk bs@(B.length -> len) #if defined(SIMDUTF) || MIN_VERSION_bytestring(0,11,2) | guessUtf8Boundary > 0 && @@ -312,7 +319,11 @@ es intermediate decoding state, which can be used to parse the next chunk with `parseNextUtf8Chunk` -} -parseNextUtf8Chunk :: ByteString -> Utf8ParseState -> (Int, Either Int Utf8ParseState) +parseNextUtf8Chunk :: +#if defined(ASSERTS) + HasCallStack => +#endif + ByteString -> Utf8ParseState -> (Int, Either Int Utf8ParseState) parseNextUtf8Chunk bs@(B.length -> len) st@(Utf8ParseState lead s) | len > 0 = let g pos s' @@ -350,7 +361,11 @@ pushText t@(Text _ _ tLen) tds@(TextDataStack stack sLen) = then TextDataStack (Left t : stack) $ sLen + tLen else tds -stackToText :: TextDataStack -> Text +stackToText :: +#if defined(ASSERTS) + HasCallStack => +#endif + TextDataStack -> Text stackToText (TextDataStack stack sLen) | sLen > 0 = runST $ do @@ -373,8 +388,11 @@ stackToText (TextDataStack stack sLen) pure $ Text arr 0 sLen | otherwise = empty -decodeNextUtf8Chunk - :: ByteString +decodeNextUtf8Chunk :: +#if defined(ASSERTS) + HasCallStack => +#endif + ByteString -> Utf8ParseState -> TextDataStack -> ((Int, Either (Int, ByteString) Utf8ParseState), TextDataStack) @@ -405,6 +423,38 @@ decodeNextUtf8Chunk bs s tds = decodeUtf8Chunk :: ByteString -> ((Int, Either (Int, ByteString) Utf8ParseState), TextDataStack) decodeUtf8Chunk bs = decodeNextUtf8Chunk bs startUtf8ParseState emptyStack +handleUtf8Err + :: OnDecodeError + -> String + -> Int + -> Int + -> Utf8ParseState + -> ByteString + -> TextDataStack + -> TextDataStack +handleUtf8Err onErr errMsg len pos s bs tds = + let h errPos errEndPos bss tds' + | errPos < errEndPos = + let errPos' = errPos + 1 in + case bss of + bs'@(B.length -> len') : bss' -> + ( if errPos' < len' + then h errPos' errEndPos bss + else h 0 (errEndPos - len') bss' + ) $ case onErr errMsg . Just $ B.index bs' errPos of + Just c -> pushText (T.singleton c) tds' + Nothing -> tds' + [] -> tds' + | otherwise = tds' + in + ( if len < 0 + then h 0 (pos - len) $ partialCodePoint s ++ [B.take pos bs] + else h len pos [bs] + ) tds + +invalidUtf8Msg :: String +invalidUtf8Msg = "Data.Text.Internal.Encoding: Invalid UTF-8 stream" + -- | Decode a 'ByteString' containing UTF-8 encoded text. -- -- Surrogate code points in replacement character returned by 'OnDecodeError' @@ -415,18 +465,13 @@ decodeUtf8With :: #endif OnDecodeError -> ByteString -> Text decodeUtf8With onErr bs = - let handleErr errPos errEndPos bs' tds - | errPos < errEndPos = handleErr (errPos + 1) errEndPos bs' $ - case onErr "Data.Text.Internal.Encoding: Invalid UTF-8 stream" . Just $ B.index bs' errPos of - Just c -> pushText (T.singleton c) tds - Nothing -> tds - | otherwise = tds - g bs'@(B.length -> bLen) res = + let g bs'@(B.length -> bLen) res = case res of ((len, eS), tds) -> + let h msg pos s = handleUtf8Err onErr msg len pos s bs' tds in case eS of - Left (pos, bs'') -> g bs'' . decodeNextUtf8Chunk bs'' startUtf8ParseState $ handleErr len pos bs' tds - Right _ -> stackToText $ handleErr len bLen bs' tds + Left (pos, bs'') -> g bs'' . decodeNextUtf8Chunk bs'' startUtf8ParseState $ h invalidUtf8Msg pos startUtf8ParseState + Right s -> stackToText $ h "Data.Text.Internal.Encoding: Incomplete UTF-8 code point" bLen s in g bs $ decodeUtf8Chunk bs @@ -520,34 +565,12 @@ streamDecodeUtf8With :: HasCallStack => #endif OnDecodeError -> ByteString -> Decoding --- streamDecodeUtf8With onErr = go mempty --- where --- go bs1 bs2 = Some txt undecoded (go undecoded) --- where --- (txt, undecoded) = decodeUtf8With2 onErr bs1 bs2 streamDecodeUtf8With onErr bs = - let handleErr errPos errEndPos bss tds - | errPos < errEndPos = - let errPos' = errPos + 1 - (bs'@(B.length -> len) : bss') = bss - h = if errPos' < len - then handleErr errPos' errEndPos bss - else handleErr 0 (errEndPos - len) bss' - in - h $ - case onErr "Data.Text.Internal.Encoding: Invalid UTF-8 stream" . Just $ B.index bs' errPos of - Just c -> pushText (T.singleton c) tds - Nothing -> tds - | otherwise = tds - g bs' s tds = + let g bs' s tds = case decodeNextUtf8Chunk bs' s tds of ((len, eS), tds') -> case eS of - Left (pos, bs'') -> g bs'' startUtf8ParseState $ - ( if len < 0 - then handleErr 0 (pos - len) $ partialCodePoint s ++ [B.take pos bs'] - else handleErr len pos [bs'] - ) tds' + Left (pos, bs'') -> g bs'' startUtf8ParseState $ handleUtf8Err onErr invalidUtf8Msg len pos s bs' tds' Right s' -> let bss' = partialCodePoint s' in Some (stackToText tds') (B.concat bss') $ \ bs'' -> g bs'' s' emptyStack @@ -754,6 +777,3 @@ encodeUtf32LE txt = E.unstream (E.restreamUtf32LE (F.stream txt)) encodeUtf32BE :: Text -> ByteString encodeUtf32BE txt = E.unstream (E.restreamUtf32BE (F.stream txt)) {-# INLINE encodeUtf32BE #-} - -cSizeToInt :: CSize -> Int -cSizeToInt = fromIntegral diff --git a/src/Data/Text/Lazy/Encoding.hs b/src/Data/Text/Lazy/Encoding.hs index ad361af5..19f605a7 100644 --- a/src/Data/Text/Lazy/Encoding.hs +++ b/src/Data/Text/Lazy/Encoding.hs @@ -1,5 +1,6 @@ {-# LANGUAGE BangPatterns,CPP #-} {-# LANGUAGE Trustworthy #-} +{-# LANGUAGE ViewPatterns #-} -- | -- Module : Data.Text.Lazy.Encoding @@ -65,7 +66,6 @@ import qualified Data.ByteString.Builder as B import qualified Data.ByteString.Builder.Prim as BP import qualified Data.ByteString.Lazy as B import qualified Data.ByteString.Lazy.Internal as B -import qualified Data.Text as T import qualified Data.Text.Encoding as TE import qualified Data.Text.Internal.Lazy.Encoding.Fusion as E import qualified Data.Text.Internal.Lazy.Fusion as F @@ -107,22 +107,20 @@ decodeLatin1 = foldr (chunk . TE.decodeLatin1) empty . B.toChunks -- | Decode a 'ByteString' containing UTF-8 encoded text. decodeUtf8With :: OnDecodeError -> B.ByteString -> Text decodeUtf8With onErr (B.Chunk b0 bs0) = - case TE.streamDecodeUtf8With onErr b0 of - TE.Some t l f -> chunk t (go f l bs0) - where - go f0 _ (B.Chunk b bs) = - case f0 b of - TE.Some t l f -> chunk t (go f l bs) - go _ l _ - | S.null l = empty - | otherwise = - let !t = T.pack (skipBytes l) - skipBytes = S.foldr (\x s' -> - case onErr desc (Just x) of - Just c -> c : s' - Nothing -> s') [] in - Chunk t Empty - desc = "Data.Text.Lazy.Encoding.decodeUtf8With: Invalid UTF-8 stream" + let g bs@(S.length -> bLen) lbs s tds diffText = + case TE.decodeNextUtf8Chunk bs s tds of + ((len, eS), tds') -> + let h errMsg pos s' = TE.handleUtf8Err onErr errMsg len pos s' bs tds' in + case eS of + Left (pos, bs') -> g bs' lbs TE.startUtf8ParseState + (h "Data.Text.Internal.Encoding: Invalid UTF-8 stream" pos s) diffText + Right s' -> + case lbs of + B.Chunk bs' lbs' -> + g bs' lbs' s' TE.emptyStack $ diffText . chunk (TE.stackToText tds') + B.Empty -> diffText $ chunk (TE.stackToText $ h "Data.Text.Internal.Encoding: Incomplete UTF-8 code point" bLen s') Empty + in + g b0 bs0 TE.startUtf8ParseState TE.emptyStack id decodeUtf8With _ _ = empty -- | Decode a 'ByteString' containing UTF-8 encoded text that is known From d63effad3ff2c01fa829a9c0dc9e98b427df356d Mon Sep 17 00:00:00 2001 From: david-sledge Date: Tue, 25 Oct 2022 22:54:29 -0600 Subject: [PATCH 52/87] Documenting new functions. --- changelog.md | 16 +- src/Data/Text/Encoding.hs | 193 ++++++++++++++---------- src/Data/Text/Internal/Encoding/Utf8.hs | 2 +- src/Data/Text/Lazy/Encoding.hs | 4 +- tests/Tests/Properties/Transcoding.hs | 44 +++--- 5 files changed, 154 insertions(+), 105 deletions(-) diff --git a/changelog.md b/changelog.md index c2c72420..752c7fbb 100644 --- a/changelog.md +++ b/changelog.md @@ -4,10 +4,22 @@ allow decoding to be aborted on errors without the need to raise an `error` and `catch` it elsewhere: * `decodeAsciiPrefix` + * `TextDataStack` + * `dataStack` + * `stackLen` + * `emptyStack` * `decodeUtf8Chunk` * `decodeNextUtf8Chunk` - * `recoverFromUtf8Error` - * `outAvailableUtf8Text` + * `pushText` + * `stackToText` + +* Added functions to validate `ByteString`s that represent encoded text: + * `Utf8ValidState` + * `partialUtf8CodePoint` + * `utf8CodePointState` + * `validateUtf8Chunk` + * `validateNextUtf8Chunk` + * `startUtf8ValidState` ### 2.0.1 diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index f7bf8997..d3d0396f 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -22,12 +22,15 @@ -- . module Data.Text.Encoding - ( Utf8ParseState - , partialCodePoint - , codePointState - , parseUtf8Chunk - , parseNextUtf8Chunk - , startUtf8ParseState + ( + -- * ByteString validation + -- $validation + Utf8ValidState + , partialUtf8CodePoint + , utf8CodePointState + , validateUtf8Chunk + , validateNextUtf8Chunk + , startUtf8ValidState -- * Decoding ByteStrings to Text -- $strict @@ -124,6 +127,9 @@ import Foreign.C.Types (CInt(..)) import qualified Data.ByteString.Unsafe as B #endif +-- $validation +-- These functions are for validating 'ByteString's as encoded text. + -- $strict -- -- All of the single-parameter functions for decoding bytestrings @@ -227,40 +233,53 @@ decodeLatin1 bs = withBS bs $ \fp len -> runST $ do foreign import ccall unsafe "_hs_text_is_ascii" c_is_ascii :: Ptr Word8 -> Ptr Word8 -> IO CSize -data Utf8ParseState = Utf8ParseState - { partialCodePoint :: [ByteString] - , codePointState :: Utf8CodePointState +-- | This data type represents the state of a 'ByteString' representing +-- UTF-8-encoded text. It consists of a value representing whether or +-- not the last byte is a complete code point, and on incompletion what +-- the 1 to 3 end bytes are that make up the incomplete code point. +data Utf8ValidState = Utf8ValidState + { -- | Get the incomplete UTF-8 code point of the 'ByteString's that + -- have been validated thus far. + partialUtf8CodePoint :: [ByteString] + -- | Get the current UTF-8 code point state of the 'ByteString's + -- that have been validated thus far. + , utf8CodePointState :: Utf8CodePointState } - deriving (Eq, Ord, Show, Read) + deriving (Eq, Ord, Show) -startUtf8ParseState ::Utf8ParseState -startUtf8ParseState = Utf8ParseState [] utf8StartState +-- | This represtents the starting state of a UTF-8 validation check. +startUtf8ValidState :: Utf8ValidState +startUtf8ValidState = Utf8ValidState [] utf8StartState #ifdef SIMDUTF foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8 :: Ptr Word8 -> CSize -> IO CInt #endif -{- -`parseUtf8Chunk chunk = (n, es)` - -n is the end index of the longest prefix of chunk that is valid UTF-8 - -es -* When `es = Left p`, there is an error: the bytes from index n and beyond are - not part of a valid UTF-8 code point, and is the index of the start of the - next (possibly valid) code point. `p - n` is the number of invalid bytes. -* When `es = Right s`, all of the remaining bytes from index n and beyond are the - beginning of an incomplete UTF-8 code point, and s is the corresponding - intermediate decoding state, which can be used to parse the next chunk with - `parseNextUtf8Chunk` --} -parseUtf8Chunk :: +-- | Validate a 'ByteString' as a UTF-8-encoded text. +-- +-- @validateUtf8Chunk chunk = (n, es)@ +-- +-- This function returns two values: +-- +-- * The value 'n' indicates the longest prefix of the 'ByteString' +-- that is valid UTF-8-encoded data. +-- * The value 'es' indicates whether the 'ByteString' +-- +-- * (@Left p@) contains an invalid code point and where the next +-- (potentially valid) code point begins, so that @p - n@ is the +-- number of invalid bytes, or +-- * (@Right s@) is valid, and all of the remaining bytes starting +-- at inbex 'n' are the beginning of an incomplete UTF-8 code +-- point, and 's' is the resulting 'Utf8ValidState' value, which +-- can be used to validate against a following 'ByteString' with +-- 'validateNextUtf8Chunk'. +validateUtf8Chunk :: #if defined(ASSERTS) HasCallStack => #endif - ByteString -> (Int, Either Int Utf8ParseState) -parseUtf8Chunk bs@(B.length -> len) + ByteString -> (Int, Either Int Utf8ValidState) +validateUtf8Chunk bs@(B.length -> len) #if defined(SIMDUTF) || MIN_VERSION_bytestring(0,11,2) | guessUtf8Boundary > 0 && -- the rest of the bytestring valid utf-8 up to the boundary @@ -276,7 +295,7 @@ parseUtf8Chunk bs@(B.length -> len) -- No | otherwise = getEndState 0 where - getEndState ndx = parseUtf8 ndx ndx utf8StartState + getEndState ndx = validateUtf8 ndx ndx utf8StartState w n word8 = len >= n && word8 <= (B.index bs $ len - n) guessUtf8Boundary | w 3 0xf0 = len - 3 -- third to last char starts a four-byte code point @@ -284,53 +303,56 @@ parseUtf8Chunk bs@(B.length -> len) | w 1 0xc2 = len - 1 -- last char starts a two-(or more-)byte code point | otherwise = len #else - = parseUtf8 0 0 utf8StartState + = validateUtf8 0 0 utf8StartState where #endif - parseUtf8 ndx0 ndx s + validateUtf8 ndx0 ndx s | ndx < len = let ndx' = ndx + 1 in case updateUtf8State (B.index bs ndx) s of Just s' -> - parseUtf8 ( + validateUtf8 ( if isUtf8StateIsComplete s' then ndx' else ndx0 ) ndx' s' Nothing -> (ndx0, Left $ if ndx == ndx0 then ndx' else ndx) - | otherwise = (ndx0, Right $ Utf8ParseState (if ndx0 < len then [B.drop ndx0 bs] else []) s) - -{- -parseNextUtf8Chunk chunk s = (n, es) - -n -* When n >= 0, n is the length of the longest prefix of chunk that is valid - UTF-8 starting from state s (i.e., n points after the end of a full codepoint, - to the beginning of an incomplete codepoint). -* When n > 0, the starting code point from the previous input is either still - incomplete with the additonal chunk or in error. - -es -* When es = Left p, there is an error: the bytes from index n and beyond are - not part of a valid UTF-8 code point, and is the index of the start of the - next (possibly valid) code point. `p - n` is the number of invalid bytes. -* When es = Right s', all of the remaining bytes from index n and beyond are the - beginning of an incomplete UTF-8 code point, and s' is the corresponding - intermediate decoding state, which can be used to parse the next chunk with - `parseNextUtf8Chunk` --} -parseNextUtf8Chunk :: + | otherwise = (ndx0, Right $ Utf8ValidState (if ndx0 < len then [B.drop ndx0 bs] else []) s) + +-- | Validate a 'ByteString' as a contiuation of UTF-8-encoded text. +-- +-- @validateNextUtf8Chunk chunk s = (n, es)@ +-- +-- This function returns two values: +-- +-- * The value 'n' indicates the end position of longest prefix of the +-- 'ByteString' that is valid UTF-8-encoded data from the starting +-- state 's'. If 's' contains an incomplete code point, the input +-- 'ByteString' is considered a continuation. As a result 'n' will be +-- negative if the code point is still incomplete or is proven to be +-- invalid. +-- +-- * The value 'es' indicates whether the 'ByteString' +-- +-- * (@Left p@) contains an invalid code point and where the next +-- (potentially valid) code point begins, so that @p - n@ is the +-- number of invalid bytes, or +-- * (@Right s'@) is valid, and all of the remaining bytes starting +-- at inbex 'n' are the beginning of an incomplete UTF-8 code +-- point, and `s'` is the resulting 'Utf8ValidState' value, which +-- can be used to validate against a following 'ByteString'. +validateNextUtf8Chunk :: #if defined(ASSERTS) HasCallStack => #endif - ByteString -> Utf8ParseState -> (Int, Either Int Utf8ParseState) -parseNextUtf8Chunk bs@(B.length -> len) st@(Utf8ParseState lead s) + ByteString -> Utf8ValidState -> (Int, Either Int Utf8ValidState) +validateNextUtf8Chunk bs@(B.length -> len) st@(Utf8ValidState lead s) | len > 0 = let g pos s' -- first things first. let's try to get to the start of the next code point | isUtf8StateIsComplete s' = -- found the beginning of the next code point, hand this off to someone else - case parseUtf8Chunk $ B.drop pos bs of + case validateUtf8Chunk $ B.drop pos bs of (len', mS) -> (pos + len', case mS of Left p -> Left (p + pos); _ -> mS) -- code point is not complete yet -- walk the rest of the code point until error, complete, or no more data @@ -341,26 +363,32 @@ parseNextUtf8Chunk bs@(B.length -> len) st@(Utf8ParseState lead s) -- keep going Just s'' -> g (pos + 1) s'' -- no more data - | otherwise = (leadPos, Right $ Utf8ParseState (lead ++ [bs]) s') + | otherwise = (leadPos, Right $ Utf8ValidState (lead ++ [bs]) s') in g 0 s | otherwise = (leadPos, Right st) where leadPos = -(foldr (\ bs' len' -> len' + B.length bs') 0 lead) +-- | Validated UTF-8 data to be converted into a 'Text' value. data TextDataStack = TextDataStack - { dataStack :: [Either Text ByteString] + { -- | Returns a list of 'Text' and UTF-8-valid 'ByteString' values. + dataStack :: [Either Text ByteString] + -- | Returns total number of UTF-8 valid bytes in the stack. , stackLen :: Int } deriving Show +-- | Empty stack emptyStack :: TextDataStack emptyStack = TextDataStack [] 0 +-- | Push a text value onto the stack pushText :: Text -> TextDataStack -> TextDataStack pushText t@(Text _ _ tLen) tds@(TextDataStack stack sLen) = if tLen > 0 then TextDataStack (Left t : stack) $ sLen + tLen else tds +-- | Create a 'Text' value from the contents of a stack. stackToText :: #if defined(ASSERTS) HasCallStack => @@ -388,16 +416,17 @@ stackToText (TextDataStack stack sLen) pure $ Text arr 0 sLen | otherwise = empty +-- | Decode a 'ByteString' in the context of what has been already been decoded. decodeNextUtf8Chunk :: #if defined(ASSERTS) HasCallStack => #endif ByteString - -> Utf8ParseState + -> Utf8ValidState -> TextDataStack - -> ((Int, Either (Int, ByteString) Utf8ParseState), TextDataStack) + -> ((Int, Either (Int, ByteString) Utf8ValidState), TextDataStack) decodeNextUtf8Chunk bs s tds = - case parseNextUtf8Chunk bs s of + case validateNextUtf8Chunk bs s of (len, res) -> let stackedData' | len >= 0 = @@ -406,7 +435,7 @@ decodeNextUtf8Chunk bs s tds = if bLen > 0 then TextDataStack (Right bs' : stack) $ sLen + bLen else tds' - ) tds $ partialCodePoint s + ) tds $ partialUtf8CodePoint s in if len > 0 then TextDataStack (Right (B.take len bs) : stack') $ sLen' + len @@ -420,36 +449,44 @@ decodeNextUtf8Chunk bs s tds = ) , stackedData') -decodeUtf8Chunk :: ByteString -> ((Int, Either (Int, ByteString) Utf8ParseState), TextDataStack) -decodeUtf8Chunk bs = decodeNextUtf8Chunk bs startUtf8ParseState emptyStack +-- | Decode a 'ByteString'. +-- +-- @decodeUtf8Chunk bs = 'decodeNextUtf8Chunk' bs 'startUtf8ValidState' 'emptyStack'@ +decodeUtf8Chunk :: ByteString -> ((Int, Either (Int, ByteString) Utf8ValidState), TextDataStack) +decodeUtf8Chunk bs = decodeNextUtf8Chunk bs startUtf8ValidState emptyStack +-- | Call an error handler with the give 'String' message for each byte +-- in given 'ByteString' and lead data in the given 'Utf8ValidState' +-- value. The bytes are the positions from 'errStart' (inclusive) to +-- 'errEnd' (exclusive). Any substite characters are pushed onto the +-- supplied 'TextDataStack' argument. handleUtf8Err :: OnDecodeError -> String -> Int -> Int - -> Utf8ParseState + -> Utf8ValidState -> ByteString -> TextDataStack -> TextDataStack -handleUtf8Err onErr errMsg len pos s bs tds = +handleUtf8Err onErr errMsg errStart errEnd s bs tds = let h errPos errEndPos bss tds' | errPos < errEndPos = let errPos' = errPos + 1 in case bss of - bs'@(B.length -> len') : bss' -> - ( if errPos' < len' + bs'@(B.length -> len) : bss' -> + ( if errPos' < len then h errPos' errEndPos bss - else h 0 (errEndPos - len') bss' + else h 0 (errEndPos - len) bss' ) $ case onErr errMsg . Just $ B.index bs' errPos of Just c -> pushText (T.singleton c) tds' Nothing -> tds' [] -> tds' | otherwise = tds' in - ( if len < 0 - then h 0 (pos - len) $ partialCodePoint s ++ [B.take pos bs] - else h len pos [bs] + ( if errStart < 0 + then h 0 (errEnd - errStart) $ partialUtf8CodePoint s ++ [B.take errEnd bs] + else h errStart errEnd [bs] ) tds invalidUtf8Msg :: String @@ -470,7 +507,7 @@ decodeUtf8With onErr bs = ((len, eS), tds) -> let h msg pos s = handleUtf8Err onErr msg len pos s bs' tds in case eS of - Left (pos, bs'') -> g bs'' . decodeNextUtf8Chunk bs'' startUtf8ParseState $ h invalidUtf8Msg pos startUtf8ParseState + Left (pos, bs'') -> g bs'' . decodeNextUtf8Chunk bs'' startUtf8ValidState $ h invalidUtf8Msg pos startUtf8ValidState Right s -> stackToText $ h "Data.Text.Internal.Encoding: Incomplete UTF-8 code point" bLen s in g bs $ decodeUtf8Chunk bs @@ -570,12 +607,12 @@ streamDecodeUtf8With onErr bs = case decodeNextUtf8Chunk bs' s tds of ((len, eS), tds') -> case eS of - Left (pos, bs'') -> g bs'' startUtf8ParseState $ handleUtf8Err onErr invalidUtf8Msg len pos s bs' tds' - Right s' -> let bss' = partialCodePoint s' in + Left (pos, bs'') -> g bs'' startUtf8ValidState $ handleUtf8Err onErr invalidUtf8Msg len pos s bs' tds' + Right s' -> let bss' = partialUtf8CodePoint s' in Some (stackToText tds') (B.concat bss') $ \ bs'' -> g bs'' s' emptyStack in - g bs startUtf8ParseState emptyStack + g bs startUtf8ValidState emptyStack -- | Decode a 'ByteString' containing UTF-8 encoded text that is known -- to be valid. diff --git a/src/Data/Text/Internal/Encoding/Utf8.hs b/src/Data/Text/Internal/Encoding/Utf8.hs index 5d4aaebd..148995a8 100644 --- a/src/Data/Text/Internal/Encoding/Utf8.hs +++ b/src/Data/Text/Internal/Encoding/Utf8.hs @@ -243,7 +243,7 @@ byteToClass n = ByteClass (W8# el#) table# = "\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\b\b\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\n\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\EOT\ETX\ETX\v\ACK\ACK\ACK\ENQ\b\b\b\b\b\b\b\b\b\b\b"# newtype Utf8CodePointState = Utf8CodePointState Word8 - deriving (Eq, Ord, Show, Read) + deriving (Eq, Ord, Show) utf8StartState :: Utf8CodePointState utf8StartState = Utf8CodePointState 0 diff --git a/src/Data/Text/Lazy/Encoding.hs b/src/Data/Text/Lazy/Encoding.hs index 19f605a7..5b5e8bb9 100644 --- a/src/Data/Text/Lazy/Encoding.hs +++ b/src/Data/Text/Lazy/Encoding.hs @@ -112,7 +112,7 @@ decodeUtf8With onErr (B.Chunk b0 bs0) = ((len, eS), tds') -> let h errMsg pos s' = TE.handleUtf8Err onErr errMsg len pos s' bs tds' in case eS of - Left (pos, bs') -> g bs' lbs TE.startUtf8ParseState + Left (pos, bs') -> g bs' lbs TE.startUtf8ValidState (h "Data.Text.Internal.Encoding: Invalid UTF-8 stream" pos s) diffText Right s' -> case lbs of @@ -120,7 +120,7 @@ decodeUtf8With onErr (B.Chunk b0 bs0) = g bs' lbs' s' TE.emptyStack $ diffText . chunk (TE.stackToText tds') B.Empty -> diffText $ chunk (TE.stackToText $ h "Data.Text.Internal.Encoding: Incomplete UTF-8 code point" bLen s') Empty in - g b0 bs0 TE.startUtf8ParseState TE.emptyStack id + g b0 bs0 TE.startUtf8ValidState TE.emptyStack id decodeUtf8With _ _ = empty -- | Decode a 'ByteString' containing UTF-8 encoded text that is known diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs index 2d5395a0..82419614 100644 --- a/tests/Tests/Properties/Transcoding.hs +++ b/tests/Tests/Properties/Transcoding.hs @@ -37,68 +37,68 @@ tl_ascii t = EL.decodeASCII (EL.encodeUtf8 a) === a t_latin1 = E.decodeLatin1 `eq` (T.pack . BC.unpack) tl_latin1 = EL.decodeLatin1 `eq` (TL.pack . BLC.unpack) -t_p_utf8_1 = case E.parseUtf8Chunk (B.pack [0x63]) of +t_p_utf8_1 = case E.validateUtf8Chunk (B.pack [0x63]) of (result, st) -> whenEqProp result 1 . property $ isRight st -t_p_utf8_2 = case E.parseUtf8Chunk (B.pack [0x63, 0x63, 0x63]) of +t_p_utf8_2 = case E.validateUtf8Chunk (B.pack [0x63, 0x63, 0x63]) of (result, st) -> whenEqProp result 3 . property $ isRight st -t_p_utf8_3 = case E.parseUtf8Chunk (B.pack [0x63, 0x63, 0xc2, 0x80, 0x63]) of +t_p_utf8_3 = case E.validateUtf8Chunk (B.pack [0x63, 0x63, 0xc2, 0x80, 0x63]) of (result, st) -> whenEqProp result 5 . property $ isRight st -t_p_utf8_4 = case E.parseUtf8Chunk (B.pack [0x63, 0xe1, 0x80, 0x80, 0x63]) of +t_p_utf8_4 = case E.validateUtf8Chunk (B.pack [0x63, 0xe1, 0x80, 0x80, 0x63]) of (result, st) -> whenEqProp result 5 . property $ isRight st -t_p_utf8_5 = case E.parseUtf8Chunk (B.pack [0xF0, 0x90, 0x80, 0x80, 0x63]) of +t_p_utf8_5 = case E.validateUtf8Chunk (B.pack [0xF0, 0x90, 0x80, 0x80, 0x63]) of (result, st) -> whenEqProp result 5 . property $ isRight st -t_p_utf8_6 = case E.parseUtf8Chunk (B.pack [0x63, 0x63, 0xF0, 0x90, 0x80]) of +t_p_utf8_6 = case E.validateUtf8Chunk (B.pack [0x63, 0x63, 0xF0, 0x90, 0x80]) of (result, st) -> whenEqProp result 2 . property $ isRight st -t_p_utf8_7 = case E.parseUtf8Chunk (B.pack [0x63, 0x63, 0x63, 0xF0, 0x90]) of +t_p_utf8_7 = case E.validateUtf8Chunk (B.pack [0x63, 0x63, 0x63, 0xF0, 0x90]) of (result, st) -> whenEqProp result 3 . property $ isRight st -t_p_utf8_8 = case E.parseUtf8Chunk (B.pack [0xF0, 0x90, 0x80, 0x63, 0x63]) of +t_p_utf8_8 = case E.validateUtf8Chunk (B.pack [0xF0, 0x90, 0x80, 0x63, 0x63]) of (result, st) -> whenEqProp result 0 $ st === Left 3 -t_p_utf8_9 = case E.parseUtf8Chunk (B.pack [0x63, 0x63, 0x80, 0x63, 0x63]) of +t_p_utf8_9 = case E.validateUtf8Chunk (B.pack [0x63, 0x63, 0x80, 0x63, 0x63]) of (result, st) -> whenEqProp result 2 $ st === Left 3 -t_p_utf8_0 = case E.parseUtf8Chunk (B.pack [0x63, 0x63, 0xe1, 0x63, 0x63]) of +t_p_utf8_0 = case E.validateUtf8Chunk (B.pack [0x63, 0x63, 0xe1, 0x63, 0x63]) of (result, st) -> whenEqProp result 2 $ st === Left 3 -t_pn_utf8_1 = case E.parseUtf8Chunk (B.pack [0xF0, 0x90, 0x80]) of +t_pn_utf8_1 = case E.validateUtf8Chunk (B.pack [0xF0, 0x90, 0x80]) of (result0, mS) -> whenEqProp result0 0 $ case mS of Left _ -> counterexample (show mS) False - Right s -> case E.parseNextUtf8Chunk (B.pack [0x80]) s of + Right s -> case E.validateNextUtf8Chunk (B.pack [0x80]) s of (result1, mS1) -> whenEqProp result1 1 $ if isLeft mS1 then counterexample (show mS1) False - else case E.parseNextUtf8Chunk (B.pack [0x7f]) s of + else case E.validateNextUtf8Chunk (B.pack [0x7f]) s of (result2, mS2) -> whenEqProp result2 (-3) $ mS2 === Left 0 -t_pn_utf8_2 = case E.parseUtf8Chunk (B.pack [0xF0]) of +t_pn_utf8_2 = case E.validateUtf8Chunk (B.pack [0xF0]) of (result0, mS0) -> whenEqProp result0 0 $ case mS0 of Left _ -> counterexample (show mS0) False - Right s0 -> case E.parseNextUtf8Chunk (B.pack [0x7f]) s0 of + Right s0 -> case E.validateNextUtf8Chunk (B.pack [0x7f]) s0 of (result1, mS1) -> whenEqProp result1 (-1) . whenEqProp mS1 (Left 0) $ - case E.parseNextUtf8Chunk (B.pack [0x90]) s0 of + case E.validateNextUtf8Chunk (B.pack [0x90]) s0 of (result2, mS2) -> whenEqProp result2 (-1) $ case mS2 of Left _ -> counterexample (show mS2) False - Right s1 -> case E.parseNextUtf8Chunk (B.pack [0x7f]) s1 of + Right s1 -> case E.validateNextUtf8Chunk (B.pack [0x7f]) s1 of (result3, mS3) -> whenEqProp result3 (-2) . whenEqProp mS3 (Left 0) $ - case E.parseNextUtf8Chunk (B.pack [0x80]) s1 of + case E.validateNextUtf8Chunk (B.pack [0x80]) s1 of (result4, mS4) -> whenEqProp result4 (-2) $ case mS4 of Left _ -> counterexample (show mS3) False - Right s2 -> case E.parseNextUtf8Chunk (B.pack [0x7f]) s2 of + Right s2 -> case E.validateNextUtf8Chunk (B.pack [0x7f]) s2 of (result5, mS5) -> whenEqProp result5 (-3) . whenEqProp mS5 (Left 0) $ - case E.parseNextUtf8Chunk (B.pack [0x80]) s2 of + case E.validateNextUtf8Chunk (B.pack [0x80]) s2 of (result6, mS6) -> whenEqProp result6 1 $ property $ isRight mS6 -t_pn_utf8_3 = case E.parseUtf8Chunk $ B.pack [0xc2] of +t_pn_utf8_3 = case E.validateUtf8Chunk $ B.pack [0xc2] of (len1, eS1) -> whenEqProp len1 0 $ case eS1 of Left _ -> counterexample (show eS1) False Right s1 -> whenEqProp (E.partialCodePoint s1) [B.pack [0xc2]] $ if isUtf8StateIsComplete $ E.codePointState s1 then counterexample (show $ E.codePointState s1) False - else case E.parseNextUtf8Chunk (B.pack [0x80, 0x80]) s1 of + else case E.validateNextUtf8Chunk (B.pack [0x80, 0x80]) s1 of (len2, eS2) -> whenEqProp len2 1 $ eS2 === Left 2 t_utf8_c = (E.stackToText . snd . E.decodeUtf8Chunk . E.encodeUtf8) `eq` id From 1f18883ea000df72f0976c95aa7ae75d804cfc2e Mon Sep 17 00:00:00 2001 From: david-sledge Date: Tue, 25 Oct 2022 23:10:12 -0600 Subject: [PATCH 53/87] Check tests before pushing... --- tests/Tests/Properties/Transcoding.hs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs index 82419614..3b9c908f 100644 --- a/tests/Tests/Properties/Transcoding.hs +++ b/tests/Tests/Properties/Transcoding.hs @@ -95,9 +95,9 @@ t_pn_utf8_2 = case E.validateUtf8Chunk (B.pack [0xF0]) of t_pn_utf8_3 = case E.validateUtf8Chunk $ B.pack [0xc2] of (len1, eS1) -> whenEqProp len1 0 $ case eS1 of Left _ -> counterexample (show eS1) False - Right s1 -> whenEqProp (E.partialCodePoint s1) [B.pack [0xc2]] $ - if isUtf8StateIsComplete $ E.codePointState s1 - then counterexample (show $ E.codePointState s1) False + Right s1 -> whenEqProp (E.partialUtf8CodePoint s1) [B.pack [0xc2]] $ + if isUtf8StateIsComplete $ E.utf8CodePointState s1 + then counterexample (show $ E.utf8CodePointState s1) False else case E.validateNextUtf8Chunk (B.pack [0x80, 0x80]) s1 of (len2, eS2) -> whenEqProp len2 1 $ eS2 === Left 2 @@ -281,9 +281,9 @@ t_decode_chunk = case E.decodeUtf8Chunk $ B.pack [0xc2] of ((len1, eS1), tds1) -> whenEqProp (len1, E.dataStack tds1, E.stackLen tds1) (0, [], 0) $ case eS1 of Left _ -> counterexample (show eS1) False - Right s1 -> whenEqProp (E.partialCodePoint s1) [B.pack [0xc2]] $ - if isUtf8StateIsComplete $ E.codePointState s1 - then counterexample (show $ E.codePointState s1) False + Right s1 -> whenEqProp (E.partialUtf8CodePoint s1) [B.pack [0xc2]] $ + if isUtf8StateIsComplete $ E.utf8CodePointState s1 + then counterexample (show $ E.utf8CodePointState s1) False else case E.decodeNextUtf8Chunk (B.pack [0x80, 0x80]) s1 tds1 of ((len2, eS2), tds2) -> whenEqProp (len2, E.dataStack tds2, E.stackLen tds2) (1, [Right $ B.pack [0x80], Right $ B.pack [0xc2]], 2) $ case eS2 of From 8dc84fe7263a6b0713f672db97d3e2d929e72a1a Mon Sep 17 00:00:00 2001 From: david-sledge Date: Mon, 31 Oct 2022 19:39:22 -0600 Subject: [PATCH 54/87] Finish documentation... for now. --- src/Data/Text/Encoding.hs | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index d3d0396f..711457bc 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -388,7 +388,7 @@ pushText t@(Text _ _ tLen) tds@(TextDataStack stack sLen) = then TextDataStack (Left t : stack) $ sLen + tLen else tds --- | Create a 'Text' value from the contents of a stack. +-- | Create a 'Text' value from the contents of a 'TextDataStack'. stackToText :: #if defined(ASSERTS) HasCallStack => @@ -417,6 +417,14 @@ stackToText (TextDataStack stack sLen) | otherwise = empty -- | Decode a 'ByteString' in the context of what has been already been decoded. +-- +-- The 'ByteString' is validated against the 'Utf8ValidState' using the rules +-- governing 'validateNextUtf8Chunk'. The longest valid UTF-8 prefix is added +-- to the input 'TextDataStack' which is returned with the end position of the +-- valid prefix, and either the resulting 'Utf8ValidState' +-- (@Right Utf8ValidState@) or the position of the of the first (potentially) +-- valid byte after the invalid bytes with remainder of the 'ByteString' +-- (@Left (Int, ByteString)@). decodeNextUtf8Chunk :: #if defined(ASSERTS) HasCallStack => @@ -447,9 +455,11 @@ decodeNextUtf8Chunk bs s tds = Left pos -> Left (pos, B.drop pos bs) Right s' -> Right s' ) - , stackedData') + , stackedData' + ) --- | Decode a 'ByteString'. +-- | Decode a 'ByteString' against a start 'Utf8ValidState' with an empty +-- 'TextDataStack'. -- -- @decodeUtf8Chunk bs = 'decodeNextUtf8Chunk' bs 'startUtf8ValidState' 'emptyStack'@ decodeUtf8Chunk :: ByteString -> ((Int, Either (Int, ByteString) Utf8ValidState), TextDataStack) From 37d19dee870067541cf60db7b387a85c4d9d9728 Mon Sep 17 00:00:00 2001 From: david-sledge Date: Sun, 13 Nov 2022 22:40:54 -0700 Subject: [PATCH 55/87] minor changes --- src/Data/Text/Encoding.hs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index 711457bc..0af27b8d 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -193,7 +193,7 @@ decodeAsciiPrefix bs = if B.null bs decodeASCII :: ByteString -> Text decodeASCII bs = case decodeAsciiPrefix bs of - (_, Just errPos) -> error $ "decodeASCII: detected non-ASCII codepoint at " ++ show errPos + (_, Just (word, errPos)) -> error $ "decodeASCII: detected non-ASCII codepoint " ++ show word ++ " at position " ++ show errPos (t, Nothing) -> t -- | Decode a 'ByteString' containing Latin-1 (aka ISO-8859-1) encoded text. @@ -306,7 +306,7 @@ validateUtf8Chunk bs@(B.length -> len) = validateUtf8 0 0 utf8StartState where #endif - validateUtf8 ndx0 ndx s + validateUtf8 !ndx0 ndx s | ndx < len = let ndx' = ndx + 1 in case updateUtf8State (B.index bs ndx) s of From 1e3db5b546b133b8829f32ad785def66ae25922a Mon Sep 17 00:00:00 2001 From: Sledge Date: Fri, 30 Dec 2022 10:18:36 -0700 Subject: [PATCH 56/87] Update src/Data/Text/Encoding.hs typo Co-authored-by: Xia Li-yao --- src/Data/Text/Encoding.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index 0af27b8d..bc9778c3 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -338,7 +338,7 @@ validateUtf8Chunk bs@(B.length -> len) -- (potentially valid) code point begins, so that @p - n@ is the -- number of invalid bytes, or -- * (@Right s'@) is valid, and all of the remaining bytes starting --- at inbex 'n' are the beginning of an incomplete UTF-8 code +-- at index 'n' are the beginning of an incomplete UTF-8 code -- point, and `s'` is the resulting 'Utf8ValidState' value, which -- can be used to validate against a following 'ByteString'. validateNextUtf8Chunk :: From 90a0c5bca12aaadf93764c7a28aefd9044444ae5 Mon Sep 17 00:00:00 2001 From: david-sledge Date: Sat, 14 Jan 2023 11:11:24 -0700 Subject: [PATCH 57/87] encode partial codepoint in Word32 --- changelog.md | 2 + src/Data/Text/Encoding.hs | 157 ++++++++++++++++---------- tests/Tests/Properties/Transcoding.hs | 47 +++++++- 3 files changed, 144 insertions(+), 62 deletions(-) diff --git a/changelog.md b/changelog.md index 752c7fbb..9ea8beef 100644 --- a/changelog.md +++ b/changelog.md @@ -16,6 +16,8 @@ * Added functions to validate `ByteString`s that represent encoded text: * `Utf8ValidState` * `partialUtf8CodePoint` + * `partUtf8CPLen` + * `wordAtPartUft8CP` * `utf8CodePointState` * `validateUtf8Chunk` * `validateNextUtf8Chunk` diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index bc9778c3..69a10658 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -27,6 +27,8 @@ module Data.Text.Encoding -- $validation Utf8ValidState , partialUtf8CodePoint + , partUtf8CPLen + , wordAtPartUft8CP , utf8CodePointState , validateUtf8Chunk , validateNextUtf8Chunk @@ -89,34 +91,35 @@ module Data.Text.Encoding , encodeUtf8BuilderEscaped ) where -import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) - import Control.Exception (evaluate, try) -import Control.Monad.ST (runST) -import Data.Bits (shiftR, (.&.)) +import Control.Monad.ST (ST, runST) +import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) +import Data.Bifunctor (Bifunctor(first)) +import Data.Bits (shiftL, shiftR, (.|.), (.&.)) import Data.ByteString (ByteString) -import qualified Data.ByteString as B -import qualified Data.ByteString.Internal as B +import Data.Maybe (fromJust) import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode) import Data.Text.Internal (Text(..), empty) import Data.Text.Internal.Unsafe (unsafeWithForeignPtr) import Data.Text.Show as T (singleton) import Data.Text.Unsafe (unsafeDupablePerformIO) -import Data.Word (Word8) +import Data.Word (Word32, Word8) import Foreign.C.Types (CSize(..)) import Foreign.Ptr (Ptr, minusPtr, plusPtr) import Foreign.Storable (poke, peekByteOff) import GHC.Exts (byteArrayContents#, unsafeCoerce#) import GHC.ForeignPtr (ForeignPtr(..), ForeignPtrContents(PlainPtr)) +import Data.Text.Internal.ByteStringCompat (withBS) +import Data.Text.Internal.Encoding.Utf8 (Utf8CodePointState, utf8StartState, updateUtf8State, isUtf8StateIsComplete) +import qualified Data.ByteString as B +import qualified Data.ByteString.Internal as B import qualified Data.ByteString.Builder as B import qualified Data.ByteString.Builder.Internal as B hiding (empty, append) import qualified Data.ByteString.Builder.Prim as BP import qualified Data.ByteString.Builder.Prim.Internal as BP -import Data.Text.Internal.Encoding.Utf8 (Utf8CodePointState, utf8StartState, updateUtf8State, isUtf8StateIsComplete) import qualified Data.Text.Array as A import qualified Data.Text.Internal.Encoding.Fusion as E import qualified Data.Text.Internal.Fusion as F -import Data.Text.Internal.ByteStringCompat #if defined(ASSERTS) import GHC.Stack (HasCallStack) #endif @@ -224,7 +227,6 @@ decodeLatin1 bs = withBS bs $ \fp len -> runST $ do unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> unsafeSTToIO $ A.copyFromPointer dst dstOff (src `plusPtr` srcOff) asciiPrefixLen inner (srcOff + asciiPrefixLen) (dstOff + asciiPrefixLen) - actualLen <- inner 0 0 dst' <- A.resizeM dst actualLen arr <- A.unsafeFreeze dst' @@ -239,17 +241,34 @@ foreign import ccall unsafe "_hs_text_is_ascii" c_is_ascii -- the 1 to 3 end bytes are that make up the incomplete code point. data Utf8ValidState = Utf8ValidState { -- | Get the incomplete UTF-8 code point of the 'ByteString's that - -- have been validated thus far. - partialUtf8CodePoint :: [ByteString] + -- have been validated thus far. The first byte of the 'Word32' + -- indicates the number of bytes of the code point are available, + -- and is followed by the bytes of the code point. + partialUtf8CodePoint :: Word32 -- | Get the current UTF-8 code point state of the 'ByteString's -- that have been validated thus far. , utf8CodePointState :: Utf8CodePointState } deriving (Eq, Ord, Show) +partUtf8CPLen :: Word32 -> Int +partUtf8CPLen partCP = fromIntegral $ partCP `shiftR` 24 + +wordAtPartUft8CP :: Int -> Word32 -> Maybe Word8 +wordAtPartUft8CP n partCP + | n < partUtf8CPLen partCP && n >= 0 = + Just . fromIntegral $ partCP `shiftR` (16 - 8 * n) + | otherwise = Nothing + -- | This represtents the starting state of a UTF-8 validation check. startUtf8ValidState :: Utf8ValidState -startUtf8ValidState = Utf8ValidState [] utf8StartState +startUtf8ValidState = Utf8ValidState 0 utf8StartState + +appendPartialCodePoint :: Word32 -> ByteString -> Word32 +appendPartialCodePoint cp bs@(B.length -> len) = + fst $ B.foldl (\ (cp', bsc) word -> + (cp' .|. (fromIntegral word `shiftL` (16 - bsc * 8)), bsc + 1) + ) (cp + (fromIntegral len `shiftL` 24), partUtf8CPLen cp) bs #ifdef SIMDUTF foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8 @@ -317,7 +336,7 @@ validateUtf8Chunk bs@(B.length -> len) else ndx0 ) ndx' s' Nothing -> (ndx0, Left $ if ndx == ndx0 then ndx' else ndx) - | otherwise = (ndx0, Right $ Utf8ValidState (if ndx0 < len then [B.drop ndx0 bs] else []) s) + | otherwise = (ndx0, Right $ Utf8ValidState (appendPartialCodePoint 0 $ B.drop ndx0 bs) s) -- | Validate a 'ByteString' as a contiuation of UTF-8-encoded text. -- @@ -353,7 +372,7 @@ validateNextUtf8Chunk bs@(B.length -> len) st@(Utf8ValidState lead s) | isUtf8StateIsComplete s' = -- found the beginning of the next code point, hand this off to someone else case validateUtf8Chunk $ B.drop pos bs of - (len', mS) -> (pos + len', case mS of Left p -> Left (p + pos); _ -> mS) + (len', mS) -> (pos + len', first (+ pos) mS) -- code point is not complete yet -- walk the rest of the code point until error, complete, or no more data | pos < len = @@ -363,15 +382,15 @@ validateNextUtf8Chunk bs@(B.length -> len) st@(Utf8ValidState lead s) -- keep going Just s'' -> g (pos + 1) s'' -- no more data - | otherwise = (leadPos, Right $ Utf8ValidState (lead ++ [bs]) s') + | otherwise = (leadPos, Right $ Utf8ValidState (appendPartialCodePoint lead bs) s') in g 0 s | otherwise = (leadPos, Right st) - where leadPos = -(foldr (\ bs' len' -> len' + B.length bs') 0 lead) + where leadPos = -(partUtf8CPLen lead) -- | Validated UTF-8 data to be converted into a 'Text' value. data TextDataStack = TextDataStack { -- | Returns a list of 'Text' and UTF-8-valid 'ByteString' values. - dataStack :: [Either Text ByteString] + dataStack :: [Either Text (Either Word32 ByteString)] -- | Returns total number of UTF-8 valid bytes in the stack. , stackLen :: Int } @@ -388,6 +407,35 @@ pushText t@(Text _ _ tLen) tds@(TextDataStack stack sLen) = then TextDataStack (Left t : stack) $ sLen + tLen else tds +copyFromStack :: [Either Text (Either Word32 ByteString)] -> Int -> A.MArray s -> ST s () +copyFromStack (dat : dataStack') tLen dst = + (case dat of + Left (Text arr0 off utf8Len) -> do + let dstOff = tLen - utf8Len + A.copyI utf8Len dst dstOff arr0 off + pure dstOff + Right encoded -> + case encoded of + Left partial -> + let utf8Len = partUtf8CPLen partial + dstOff = tLen - utf8Len + g dstOff' = + case wordAtPartUft8CP (dstOff' - dstOff) partial of + Just w -> do + A.unsafeWrite dst dstOff' w + g $ dstOff' + 1 + Nothing -> pure dstOff + in + g dstOff + Right bs@(B.length -> utf8Len) -> do + let dstOff = tLen - utf8Len + withBS bs $ \ fp _ -> + unsafeIOToST . unsafeWithForeignPtr fp $ \ src -> + unsafeSTToIO $ A.copyFromPointer dst dstOff src utf8Len + pure dstOff) >>= (\ tLen' -> copyFromStack dataStack' tLen' dst) +copyFromStack _ _ _ = pure () +{-# INLINE copyFromStack #-} + -- | Create a 'Text' value from the contents of a 'TextDataStack'. stackToText :: #if defined(ASSERTS) @@ -398,20 +446,7 @@ stackToText (TextDataStack stack sLen) | sLen > 0 = runST $ do dst <- A.new sLen - let g (dat : dataStack') tLen' = - (case dat of - Left (Text arr0 off utf8Len) -> do - let dstOff = tLen' - utf8Len - A.copyI utf8Len dst dstOff arr0 off - pure dstOff - Right bs@(B.length -> utf8Len) -> do - let dstOff = tLen' - utf8Len - withBS bs $ \ fp _ -> - unsafeIOToST . unsafeWithForeignPtr fp $ \ src -> - unsafeSTToIO $ A.copyFromPointer dst dstOff src utf8Len - pure dstOff) >>= g dataStack' - g _ _ = pure () - g stack sLen + copyFromStack stack sLen dst arr <- A.unsafeFreeze dst pure $ Text arr 0 sLen | otherwise = empty @@ -433,20 +468,20 @@ decodeNextUtf8Chunk :: -> Utf8ValidState -> TextDataStack -> ((Int, Either (Int, ByteString) Utf8ValidState), TextDataStack) -decodeNextUtf8Chunk bs s tds = +decodeNextUtf8Chunk bs s tds@(TextDataStack stack sLen) = case validateNextUtf8Chunk bs s of (len, res) -> let stackedData' | len >= 0 = - let stackedData@(TextDataStack stack' sLen') = - foldl (\ tds'@(TextDataStack stack sLen) bs'@(B.length -> bLen) -> - if bLen > 0 - then TextDataStack (Right bs' : stack) $ sLen + bLen - else tds' - ) tds $ partialUtf8CodePoint s + let partCP = partialUtf8CodePoint s + partLen = partUtf8CPLen partCP + stackedData@(TextDataStack stack' sLen') = + if partLen > 0 + then TextDataStack (Right (Left partCP) : stack) $ sLen + partLen + else tds in if len > 0 - then TextDataStack (Right (B.take len bs) : stack') $ sLen' + len + then TextDataStack (Right (Right $ B.take len bs) : stack') $ sLen' + len else stackedData | otherwise = tds in @@ -468,7 +503,7 @@ decodeUtf8Chunk bs = decodeNextUtf8Chunk bs startUtf8ValidState emptyStack -- | Call an error handler with the give 'String' message for each byte -- in given 'ByteString' and lead data in the given 'Utf8ValidState' -- value. The bytes are the positions from 'errStart' (inclusive) to --- 'errEnd' (exclusive). Any substite characters are pushed onto the +-- 'errEnd' (exclusive). Any substitute characters are pushed onto the -- supplied 'TextDataStack' argument. handleUtf8Err :: OnDecodeError @@ -480,23 +515,27 @@ handleUtf8Err -> TextDataStack -> TextDataStack handleUtf8Err onErr errMsg errStart errEnd s bs tds = - let h errPos errEndPos bss tds' - | errPos < errEndPos = - let errPos' = errPos + 1 in - case bss of - bs'@(B.length -> len) : bss' -> - ( if errPos' < len - then h errPos' errEndPos bss - else h 0 (errEndPos - len) bss' - ) $ case onErr errMsg . Just $ B.index bs' errPos of - Just c -> pushText (T.singleton c) tds' - Nothing -> tds' - [] -> tds' + let h errPos tds' + | errPos < errEnd = + h (errPos + 1) $ + case onErr errMsg . Just $ B.index bs errPos of + Just c -> pushText (T.singleton c) tds' + Nothing -> tds' | otherwise = tds' in ( if errStart < 0 - then h 0 (errEnd - errStart) $ partialUtf8CodePoint s ++ [B.take errEnd bs] - else h errStart errEnd [bs] + then + let partCP = partialUtf8CodePoint s + g pNdx tds' = + case wordAtPartUft8CP pNdx partCP of + Nothing -> h (partUtf8CPLen partCP + errStart) tds' + mW -> g (pNdx + 1) $ + case onErr errMsg mW of + Just c -> pushText (T.singleton c) tds' + Nothing -> tds' + in + g 0 + else h errStart ) tds invalidUtf8Msg :: String @@ -613,13 +652,17 @@ streamDecodeUtf8With :: #endif OnDecodeError -> ByteString -> Decoding streamDecodeUtf8With onErr bs = - let g bs' s tds = + let h w32 n ws = + if n >= 0 + then h w32 (n - 1) $ fromJust (wordAtPartUft8CP n w32) : ws + else ws + g bs' s tds = case decodeNextUtf8Chunk bs' s tds of ((len, eS), tds') -> case eS of Left (pos, bs'') -> g bs'' startUtf8ValidState $ handleUtf8Err onErr invalidUtf8Msg len pos s bs' tds' Right s' -> let bss' = partialUtf8CodePoint s' in - Some (stackToText tds') (B.concat bss') $ \ bs'' -> + Some (stackToText tds') (B.pack $ h bss' (partUtf8CPLen bss' - 1) []) $ \ bs'' -> g bs'' s' emptyStack in g bs startUtf8ValidState emptyStack diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs index 9f45638a..5585803e 100644 --- a/tests/Tests/Properties/Transcoding.hs +++ b/tests/Tests/Properties/Transcoding.hs @@ -2,10 +2,12 @@ {-# LANGUAGE OverloadedStrings, ScopedTypeVariables #-} {-# OPTIONS_GHC -fno-warn-missing-signatures #-} +{-# LANGUAGE BangPatterns #-} module Tests.Properties.Transcoding ( testTranscoding ) where +-- import Debug.Trace (trace) import Prelude hiding (head, tail) import Data.Bits ((.&.), shiftR) import Data.Char (chr, ord) @@ -29,6 +31,8 @@ import qualified Data.Text.Encoding.Error as E import Data.Text.Internal.Encoding.Utf8 (isUtf8StateIsComplete) import qualified Data.Text.Lazy as TL import qualified Data.Text.Lazy.Encoding as EL +-- import qualified Data.Text.Internal.Lazy as BL +-- import qualified Data.ByteString.Lazy.Internal as BS t_ascii t = E.decodeASCII (E.encodeUtf8 a) === a where a = T.map (\c -> chr (ord c `mod` 128)) t @@ -96,7 +100,7 @@ t_pn_utf8_2 = case E.validateUtf8Chunk (B.pack [0xF0]) of t_pn_utf8_3 = case E.validateUtf8Chunk $ B.pack [0xc2] of (len1, eS1) -> whenEqProp len1 0 $ case eS1 of Left _ -> counterexample (show eS1) False - Right s1 -> whenEqProp (E.partialUtf8CodePoint s1) [B.pack [0xc2]] $ + Right s1 -> whenEqProp (E.partialUtf8CodePoint s1) 0x01c20000 $ if isUtf8StateIsComplete $ E.utf8CodePointState s1 then counterexample (show $ E.utf8CodePointState s1) False else case E.validateNextUtf8Chunk (B.pack [0x80, 0x80]) s1 of @@ -245,6 +249,11 @@ t_decode_utf8_lenient :: Property t_decode_utf8_lenient = forAllShrinkShow arbitrary shrink (show . BL.toChunks) $ \bs -> decodeLL bs === (TL.fromStrict . decodeL . B.concat . BL.toChunks) bs +-- t_decode_utf8_lenient = +-- let !res = trace "HEREHEREHERE" $ +-- EL.decodeUtf8With E.lenientDecode (BS.Chunk (B.pack [0xe1]) (BS.Chunk (B.pack [0xa0]) BS.Empty)) === TL.fromStrict (T.pack "\xFFFD\xFFFD") in +-- trace "THERETHERETHERE" res + -- See http://unicode.org/faq/utf_bom.html#gen8 -- A sequence such as <110xxxxx2 0xxxxxxx2> is illegal ... -- When faced with this illegal byte sequence ... a UTF-8 conformant process @@ -278,19 +287,46 @@ t_decode_with_error5' = ioProperty $ do Left (_ :: E.UnicodeException) -> True Right{} -> False -t_decode_chunk = case E.decodeUtf8Chunk $ B.pack [0xc2] of +t_decode_chunk1 = case E.decodeUtf8Chunk $ B.pack [0xc2] of ((len1, eS1), tds1) -> whenEqProp (len1, E.dataStack tds1, E.stackLen tds1) (0, [], 0) $ case eS1 of Left _ -> counterexample (show eS1) False - Right s1 -> whenEqProp (E.partialUtf8CodePoint s1) [B.pack [0xc2]] $ + Right s1 -> let partCP = E.partialUtf8CodePoint s1 in + whenEqProp partCP 0x01c20000 . + whenEqProp (E.partUtf8CPLen partCP) 1 . + whenEqProp (E.wordAtPartUft8CP 0 partCP) (Just 0xc2) . + whenEqProp (E.wordAtPartUft8CP 1 partCP) Nothing $ if isUtf8StateIsComplete $ E.utf8CodePointState s1 then counterexample (show $ E.utf8CodePointState s1) False else case E.decodeNextUtf8Chunk (B.pack [0x80, 0x80]) s1 tds1 of - ((len2, eS2), tds2) -> whenEqProp (len2, E.dataStack tds2, E.stackLen tds2) (1, [Right $ B.pack [0x80], Right $ B.pack [0xc2]], 2) $ + ((len2, eS2), tds2) -> whenEqProp (len2, E.dataStack tds2, E.stackLen tds2) (1, [Right $ Right $ B.pack [0x80], Right $ Left 0x1c20000], 2) $ case eS2 of Right _ -> counterexample (show eS2) False Left res -> res === (2, mempty) +t_decode_chunk2 = case E.decodeUtf8Chunk $ B.pack [0xf0] of + ((len1, eS1), tds1) -> whenEqProp (len1, E.dataStack tds1, E.stackLen tds1) (0, [], 0) $ + case eS1 of + Left _ -> counterexample (show eS1) False + Right s1 -> let partCP = E.partialUtf8CodePoint s1 in + whenEqProp partCP 0x01f00000 . + whenEqProp (E.partUtf8CPLen partCP) 1 . + whenEqProp (E.wordAtPartUft8CP 0 partCP) (Just 0xf0) . + whenEqProp (E.wordAtPartUft8CP 1 partCP) Nothing $ + if isUtf8StateIsComplete $ E.utf8CodePointState s1 + then counterexample (show $ E.utf8CodePointState s1) False + else case E.decodeNextUtf8Chunk (B.pack [0x90, 0x80]) s1 tds1 of + ((len2, eS2), tds2) -> whenEqProp (len2, E.dataStack tds2, E.stackLen tds2) (-1, [], 0) $ + case eS2 of + Left _ -> counterexample (show eS2) False + Right s2 -> let partCP2 = E.partialUtf8CodePoint s2 in + whenEqProp partCP2 0x03f09080 . + whenEqProp (E.partUtf8CPLen partCP2) 3 . + whenEqProp (E.wordAtPartUft8CP 0 partCP2) (Just 0xf0) . + whenEqProp (E.wordAtPartUft8CP 1 partCP2) (Just 0x90) . + whenEqProp (E.wordAtPartUft8CP 2 partCP2) (Just 0x80) $ + E.wordAtPartUft8CP 3 partCP2 === Nothing + t_infix_concat bs1 text bs2 = forAll (Blind <$> genDecodeErr Replace) $ \(Blind onErr) -> text `T.isInfixOf` @@ -352,7 +388,8 @@ testTranscoding = testProperty "t_decode_with_error3'" t_decode_with_error3', testProperty "t_decode_with_error4'" t_decode_with_error4', testProperty "t_decode_with_error5'" t_decode_with_error5', - testProperty "t_decode_chunk" t_decode_chunk, + testProperty "t_decode_chunk1" t_decode_chunk1, + testProperty "t_decode_chunk2" t_decode_chunk2, testProperty "t_infix_concat" t_infix_concat ] ] From 2e4988e801cad3cebc34a8fe103eab7ee6f4a1cd Mon Sep 17 00:00:00 2001 From: Andrew Martin Date: Tue, 3 Jan 2023 11:58:43 -0500 Subject: [PATCH 58/87] Rebase against master --- .drone.yml | 45 ---------------------- .github/workflows/emulated.yml | 2 +- .github/workflows/haskell-ci.yml | 5 --- .github/workflows/i386.yml | 26 +++++++++++++ cabal.haskell-ci | 2 +- changelog.md | 4 ++ src/Data/Text/Internal/ByteStringCompat.hs | 28 -------------- tests/Tests/Properties/LowLevel.hs | 3 -- text.cabal | 9 ++--- 9 files changed, 36 insertions(+), 88 deletions(-) delete mode 100644 .drone.yml create mode 100644 .github/workflows/i386.yml diff --git a/.drone.yml b/.drone.yml deleted file mode 100644 index 633e4dd9..00000000 --- a/.drone.yml +++ /dev/null @@ -1,45 +0,0 @@ -kind: pipeline -name: arm64-ghc9.0 -platform: { os: linux, arch: arm64 } -steps: -- name: Test - image: haskell:9.0 - commands: - - uname -a # check platform - - getconf LONG_BIT # check bitness - - cabal update - - cabal new-test ---- -kind: pipeline -name: arm64-ghc9.2 -platform: { os: linux, arch: arm64 } -steps: -- name: Test - image: haskell:9.2 - commands: - - uname -a # check platform - - getconf LONG_BIT # check bitness - - cabal update - - cabal new-test ---- -kind: pipeline -name: i386 -platform: { os: linux, arch: amd64 } -steps: -- name: Test - image: i386/ubuntu - commands: - - export LC_ALL=C.UTF-8 - - apt-get update -y - - apt-get install -y ghc cabal-install zlib1g-dev libgmp-dev build-essential curl - # Build and install a modern cabal, because 1.24 - # does not support cxx-sources or cxx-options. - - cabal --version # 1.24 - - cabal update - - cabal install cabal-install - - export PATH="$HOME/.cabal/bin:$PATH" - - hash -r - # Now with a modern cabal - - cabal --version # 3.0 - - cabal new-update - - cabal new-test diff --git a/.github/workflows/emulated.yml b/.github/workflows/emulated.yml index 29ba6253..9dae4a25 100644 --- a/.github/workflows/emulated.yml +++ b/.github/workflows/emulated.yml @@ -19,7 +19,7 @@ jobs: strategy: fail-fast: true matrix: - arch: ['s390x', 'ppc64le'] + arch: ['s390x', 'ppc64le', 'armv7', 'aarch64'] steps: - uses: actions/checkout@v2 - uses: uraimo/run-on-arch-action@v2.1.1 diff --git a/.github/workflows/haskell-ci.yml b/.github/workflows/haskell-ci.yml index 576842ad..c23ba1d3 100644 --- a/.github/workflows/haskell-ci.yml +++ b/.github/workflows/haskell-ci.yml @@ -68,11 +68,6 @@ jobs: compilerVersion: 8.2.2 setup-method: ghcup allow-failure: false - - compiler: ghc-8.0.2 - compilerKind: ghc - compilerVersion: 8.0.2 - setup-method: ghcup - allow-failure: false fail-fast: false steps: - name: apt diff --git a/.github/workflows/i386.yml b/.github/workflows/i386.yml new file mode 100644 index 00000000..5ece5ea2 --- /dev/null +++ b/.github/workflows/i386.yml @@ -0,0 +1,26 @@ +name: ci-i386 +on: + - push + - pull_request + +defaults: + run: + shell: bash + +jobs: + i386: + runs-on: ubuntu-latest + container: + image: i386/ubuntu:bionic + steps: + - name: Install + run: | + apt-get update -y + apt-get install -y autoconf build-essential zlib1g-dev libgmp-dev curl libncurses5 libtinfo5 libncurses5-dev libtinfo-dev + curl --proto '=https' --tlsv1.2 -sSf https://get-ghcup.haskell.org | BOOTSTRAP_HASKELL_NONINTERACTIVE=1 sh + - uses: actions/checkout@v1 + - name: Test + run: | + source ~/.ghcup/env + cabal update + cabal test diff --git a/cabal.haskell-ci b/cabal.haskell-ci index 9d605ba8..f7425e79 100644 --- a/cabal.haskell-ci +++ b/cabal.haskell-ci @@ -1,4 +1,4 @@ -ghcup-jobs: >=8.0 +ghcup-jobs: >=8.2 docspec: True docspec-options: --timeout 2 haddock: >=8.6 diff --git a/changelog.md b/changelog.md index 9ea8beef..9bba57f1 100644 --- a/changelog.md +++ b/changelog.md @@ -1,3 +1,7 @@ +### Unreleased + +* Remove support for GHC 8.0. + ### 2.0.2 * A suite of functions have been added in `Data.Text.Encoding` that diff --git a/src/Data/Text/Internal/ByteStringCompat.hs b/src/Data/Text/Internal/ByteStringCompat.hs index 4f8b3949..c5f729b6 100644 --- a/src/Data/Text/Internal/ByteStringCompat.hs +++ b/src/Data/Text/Internal/ByteStringCompat.hs @@ -8,13 +8,7 @@ import Data.Word (Word8) import Foreign.ForeignPtr (ForeignPtr) #if !MIN_VERSION_bytestring(0,11,0) -#if MIN_VERSION_base(4,10,0) import GHC.ForeignPtr (plusForeignPtr) -#else -import GHC.ForeignPtr (ForeignPtr(ForeignPtr)) -import GHC.Types (Int (..)) -import GHC.Prim (plusAddr#) -#endif #endif mkBS :: ForeignPtr Word8 -> Int -> ByteString @@ -32,25 +26,3 @@ withBS (BS !sfp !slen) kont = kont sfp slen withBS (PS !sfp !soff !slen) kont = kont (plusForeignPtr sfp soff) slen #endif {-# INLINE withBS #-} - -#if !MIN_VERSION_bytestring(0,11,0) -#if !MIN_VERSION_base(4,10,0) --- |Advances the given address by the given offset in bytes. --- --- The new 'ForeignPtr' shares the finalizer of the original, --- equivalent from a finalization standpoint to just creating another --- reference to the original. That is, the finalizer will not be --- called before the new 'ForeignPtr' is unreachable, nor will it be --- called an additional time due to this call, and the finalizer will --- be called with the same address that it would have had this call --- not happened, *not* the new address. -plusForeignPtr :: ForeignPtr a -> Int -> ForeignPtr b -plusForeignPtr (ForeignPtr addr guts) (I# offset) = ForeignPtr (plusAddr# addr offset) guts -{-# INLINE [0] plusForeignPtr #-} -{-# RULES -"ByteString plusForeignPtr/0" forall fp . - plusForeignPtr fp 0 = fp - #-} -#endif -#endif - diff --git a/tests/Tests/Properties/LowLevel.hs b/tests/Tests/Properties/LowLevel.hs index 92c4e209..c3b0a605 100644 --- a/tests/Tests/Properties/LowLevel.hs +++ b/tests/Tests/Properties/LowLevel.hs @@ -132,10 +132,7 @@ testLowLevel = [ (`hasNoTypes` [''Char, ''[]]) , (`doesNotUseAnyOf` ['T.pack, 'S.unstream, 'T.map, 'safe, 'S.streamList]) , (`doesNotUseAnyOf` ['GHC.unpackCString#, 'GHC.unpackCStringUtf8#]) -#if MIN_VERSION_base(4,10,0) - -- skip this test for GHC 8.0 , (`doesNotUseAnyOf` ['T.unpackCString#, 'T.unpackCStringAscii#]) -#endif ] 't_literal_foo) #endif diff --git a/text.cabal b/text.cabal index 90eca24b..d3176579 100644 --- a/text.cabal +++ b/text.cabal @@ -46,7 +46,6 @@ copyright: 2009-2011 Bryan O'Sullivan, 2008-2009 Tom Harper, 2021 Andrew Le category: Data, Text build-type: Simple tested-with: - GHC == 8.0.2 GHC == 8.2.2 GHC == 8.4.4 GHC == 8.6.5 @@ -117,7 +116,7 @@ library -- Certain version of GHC crash on Windows, when TemplateHaskell encounters C++. -- https://gitlab.haskell.org/ghc/ghc/-/issues/19417 - if flag(simdutf) && os(windows) && impl(ghc == 8.0.1 || >= 8.8 && < 8.10.5 || == 9.0.1) + if flag(simdutf) && os(windows) && impl(ghc >= 8.8 && < 8.10.5 || == 9.0.1) build-depends: base < 0 -- For GHC 8.2, 8.6.3 and 8.10.1 even TH + C crash Windows linker. @@ -187,7 +186,7 @@ library build-depends: array >= 0.3 && < 0.6, - base >= 4.9 && < 5, + base >= 4.10 && < 5, binary >= 0.5 && < 0.9, bytestring >= 0.10.4 && < 0.12, deepseq >= 1.1 && < 1.5, @@ -266,9 +265,9 @@ test-suite tests transformers, text - -- Plugin infrastructure does not work properly in 8.0.1 and 8.6.1, and + -- Plugin infrastructure does not work properly in 8.6.1, and -- ghc-9.2.1 library depends on parsec, which causes a circular dependency. - if impl(ghc >= 8.0.2 && < 8.6 || >= 8.6.2 && < 9.2 || >= 9.2.2) + if impl(ghc >= 8.2.1 && < 8.6 || >= 8.6.2 && < 9.2 || >= 9.2.2) build-depends: tasty-inspection-testing default-language: Haskell2010 From 6e51fddad9f3206a507a0d0de8c8a1d175fcde13 Mon Sep 17 00:00:00 2001 From: david-sledge Date: Wed, 10 Aug 2022 12:44:51 -0600 Subject: [PATCH 59/87] move test utility function whenEqProp --- tests/Tests/QuickCheckUtils.hs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/Tests/QuickCheckUtils.hs b/tests/Tests/QuickCheckUtils.hs index 9834e92c..8dc24ffd 100644 --- a/tests/Tests/QuickCheckUtils.hs +++ b/tests/Tests/QuickCheckUtils.hs @@ -27,6 +27,10 @@ module Tests.QuickCheckUtils , eqPSqrt , write_read +<<<<<<< HEAD +======= + +>>>>>>> 06137ca (move test utility function whenEqProp) , whenEqProp ) where @@ -286,7 +290,7 @@ newtype SkewedBool = Skewed { getSkewed :: Bool } instance Arbitrary SkewedBool where arbitrary = Skewed <$> frequency [(1, pure False), (5, pure True)] --- like 'when' but with 'Property' instead of a monad. +-- like Control.Monad.when, but with properties instead of monad values whenEqProp :: (Eq a, Show a) => a -> a -> Property -> Property whenEqProp a b next = if a == b then next From 47b4dca24dbfb6a7a27b8d4a15876556da397b5a Mon Sep 17 00:00:00 2001 From: david-sledge Date: Sat, 14 Jan 2023 12:03:16 -0700 Subject: [PATCH 60/87] remove sponge left in patient --- tests/Tests/QuickCheckUtils.hs | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/Tests/QuickCheckUtils.hs b/tests/Tests/QuickCheckUtils.hs index 8dc24ffd..58ad3c47 100644 --- a/tests/Tests/QuickCheckUtils.hs +++ b/tests/Tests/QuickCheckUtils.hs @@ -27,10 +27,7 @@ module Tests.QuickCheckUtils , eqPSqrt , write_read -<<<<<<< HEAD -======= ->>>>>>> 06137ca (move test utility function whenEqProp) , whenEqProp ) where From 9fb6205c185b1b79720be31548af3cda6b67ffd3 Mon Sep 17 00:00:00 2001 From: Li-yao Xia Date: Mon, 30 Jan 2023 20:16:04 +0000 Subject: [PATCH 61/87] Refactor decodeASCII --- changelog.md | 2 +- src/Data/Text/Encoding.hs | 69 +++++++++++++++++++++------------------ 2 files changed, 38 insertions(+), 33 deletions(-) diff --git a/changelog.md b/changelog.md index 9bba57f1..136d89a3 100644 --- a/changelog.md +++ b/changelog.md @@ -7,7 +7,7 @@ * A suite of functions have been added in `Data.Text.Encoding` that allow decoding to be aborted on errors without the need to raise an `error` and `catch` it elsewhere: - * `decodeAsciiPrefix` + * `decodeASCIIPrefix` * `TextDataStack` * `dataStack` * `stackLen` diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index 69a10658..ee5d1c69 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -40,7 +40,7 @@ module Data.Text.Encoding -- ** Total Functions #total# -- $total , decodeLatin1 - , decodeAsciiPrefix + , decodeASCIIPrefix , TextDataStack , dataStack , stackLen @@ -97,6 +97,7 @@ import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) import Data.Bifunctor (Bifunctor(first)) import Data.Bits (shiftL, shiftR, (.|.), (.&.)) import Data.ByteString (ByteString) +import qualified Data.ByteString.Short.Internal as SBS import Data.Maybe (fromJust) import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode) import Data.Text.Internal (Text(..), empty) @@ -157,47 +158,51 @@ import qualified Data.ByteString.Unsafe as B -- (preferably not at all). See "Data.Text.Encoding#g:total" for better -- solutions. --- | Decode a 'ByteString' containing 7-bit ASCII encoded text. +-- | Decode a 'ByteString' containing ASCII text. +-- +-- This is a total function which returns a pair of the longest ASCII prefix +-- as 'Text', and the remaining suffix as 'ByteString'. -- --- This is a total function. The 'ByteString' is decoded until either --- the end is reached or it errors with the first non-ASCII 'Word8' is --- encountered. In either case the function will return the 'Text' --- value of the longest prefix that is valid ASCII. On error, the index --- of the non-ASCII 'Word8' is also returned. +-- Important note: the pair is lazy. This lets you check for errors by testing +-- whether the second component is empty, without forcing the first component +-- (which does a copy). +-- To drop references to the input bytestring, force the prefix +-- (using 'seq' or @BangPatterns@) and drop references to the suffix. +-- +-- Properties: +-- - If @(prefix, suffix) = decodeAsciiPrefix s@, then @'encodeUtf8' prefix <> suffix = s@. +-- - Either @suffix@ is empty, or @'B.head' suffix > 127@. -- -- @since 2.0.2 -decodeAsciiPrefix :: -#if defined(ASSERTS) - HasCallStack => -#endif - ByteString -> (Text, Maybe (Word8, Int)) -decodeAsciiPrefix bs = if B.null bs - then (empty, Nothing) - else unsafeDupablePerformIO $ withBS bs $ \ fp len -> - unsafeWithForeignPtr fp $ \src -> do - asciiPrefixLen <- fmap fromIntegral . c_is_ascii src $ src `plusPtr` len - let !prefix = if asciiPrefixLen == 0 - then empty - else runST $ do - dst <- A.new asciiPrefixLen - A.copyFromPointer dst 0 src asciiPrefixLen - arr <- A.unsafeFreeze dst - pure $ Text arr 0 asciiPrefixLen - let suffix = if asciiPrefixLen < len - then Just (B.index bs asciiPrefixLen, asciiPrefixLen) - else Nothing - pure (prefix, suffix) +decodeASCIIPrefix :: ByteString -> (Text, ByteString) +decodeASCIIPrefix bs = if B.null bs + then (empty, B.empty) + else + let len = asciiPrefixLength bs + prefix = + let !(SBS.SBS arr) = SBS.toShort (B.take len bs) in + Text (A.ByteArray arr) 0 len + suffix = B.drop len bs in + (prefix, suffix) + +-- | Length of the longest ASCII prefix. +asciiPrefixLength :: ByteString -> Int +asciiPrefixLength bs = unsafeDupablePerformIO $ withBS bs $ \ fp len -> + unsafeWithForeignPtr fp $ \src -> do + fromIntegral <$> c_is_ascii src (src `plusPtr` len) -- | Decode a 'ByteString' containing 7-bit ASCII encoded text. -- -- This is a partial function: it checks that input does not contain -- anything except ASCII and copies buffer or throws an error otherwise. --- decodeASCII :: ByteString -> Text decodeASCII bs = - case decodeAsciiPrefix bs of - (_, Just (word, errPos)) -> error $ "decodeASCII: detected non-ASCII codepoint " ++ show word ++ " at position " ++ show errPos - (t, Nothing) -> t + let (prefix, suffix) = decodeASCIIPrefix bs in + case B.uncons suffix of + Nothing -> prefix + Just (word, _) -> + let !errPos = B.length bs - B.length suffix in + error $ "decodeASCII: detected non-ASCII codepoint " ++ show word ++ " at position " ++ show errPos -- | Decode a 'ByteString' containing Latin-1 (aka ISO-8859-1) encoded text. -- From fe5b4e69b93878ea8930bdf5c0b50dc9cfab1540 Mon Sep 17 00:00:00 2001 From: Li-yao Xia Date: Tue, 31 Jan 2023 03:57:00 +0000 Subject: [PATCH 62/87] Refactor decodeUtf8 (WIP) --- src/Data/Text/Encoding.hs | 352 +--------------------- src/Data/Text/Internal/Encoding.hs | 402 ++++++++++++++++++++++++++ src/Data/Text/Lazy/Encoding.hs | 4 +- tests/Tests/Properties/Transcoding.hs | 32 +- text.cabal | 1 + 5 files changed, 429 insertions(+), 362 deletions(-) create mode 100644 src/Data/Text/Internal/Encoding.hs diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index ee5d1c69..e8f2941e 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -25,14 +25,10 @@ module Data.Text.Encoding ( -- * ByteString validation -- $validation - Utf8ValidState - , partialUtf8CodePoint - , partUtf8CPLen - , wordAtPartUft8CP - , utf8CodePointState + Utf8State , validateUtf8Chunk , validateNextUtf8Chunk - , startUtf8ValidState + , startUtf8State -- * Decoding ByteStrings to Text -- $strict @@ -92,26 +88,22 @@ module Data.Text.Encoding ) where import Control.Exception (evaluate, try) -import Control.Monad.ST (ST, runST) +import Control.Monad.ST (runST) import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) -import Data.Bifunctor (Bifunctor(first)) -import Data.Bits (shiftL, shiftR, (.|.), (.&.)) +import Data.Bits (shiftR, (.&.)) import Data.ByteString (ByteString) import qualified Data.ByteString.Short.Internal as SBS -import Data.Maybe (fromJust) import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode) import Data.Text.Internal (Text(..), empty) import Data.Text.Internal.Unsafe (unsafeWithForeignPtr) -import Data.Text.Show as T (singleton) import Data.Text.Unsafe (unsafeDupablePerformIO) -import Data.Word (Word32, Word8) +import Data.Word (Word8) import Foreign.C.Types (CSize(..)) import Foreign.Ptr (Ptr, minusPtr, plusPtr) import Foreign.Storable (poke, peekByteOff) import GHC.Exts (byteArrayContents#, unsafeCoerce#) import GHC.ForeignPtr (ForeignPtr(..), ForeignPtrContents(PlainPtr)) import Data.Text.Internal.ByteStringCompat (withBS) -import Data.Text.Internal.Encoding.Utf8 (Utf8CodePointState, utf8StartState, updateUtf8State, isUtf8StateIsComplete) import qualified Data.ByteString as B import qualified Data.ByteString.Internal as B import qualified Data.ByteString.Builder as B @@ -119,6 +111,7 @@ import qualified Data.ByteString.Builder.Internal as B hiding (empty, append) import qualified Data.ByteString.Builder.Prim as BP import qualified Data.ByteString.Builder.Prim.Internal as BP import qualified Data.Text.Array as A +import Data.Text.Internal.Encoding import qualified Data.Text.Internal.Encoding.Fusion as E import qualified Data.Text.Internal.Fusion as F #if defined(ASSERTS) @@ -126,7 +119,6 @@ import GHC.Stack (HasCallStack) #endif #ifdef SIMDUTF -import Foreign.C.Types (CInt(..)) #elif !MIN_VERSION_bytestring(0,11,2) import qualified Data.ByteString.Unsafe as B #endif @@ -240,332 +232,6 @@ decodeLatin1 bs = withBS bs $ \fp len -> runST $ do foreign import ccall unsafe "_hs_text_is_ascii" c_is_ascii :: Ptr Word8 -> Ptr Word8 -> IO CSize --- | This data type represents the state of a 'ByteString' representing --- UTF-8-encoded text. It consists of a value representing whether or --- not the last byte is a complete code point, and on incompletion what --- the 1 to 3 end bytes are that make up the incomplete code point. -data Utf8ValidState = Utf8ValidState - { -- | Get the incomplete UTF-8 code point of the 'ByteString's that - -- have been validated thus far. The first byte of the 'Word32' - -- indicates the number of bytes of the code point are available, - -- and is followed by the bytes of the code point. - partialUtf8CodePoint :: Word32 - -- | Get the current UTF-8 code point state of the 'ByteString's - -- that have been validated thus far. - , utf8CodePointState :: Utf8CodePointState - } - deriving (Eq, Ord, Show) - -partUtf8CPLen :: Word32 -> Int -partUtf8CPLen partCP = fromIntegral $ partCP `shiftR` 24 - -wordAtPartUft8CP :: Int -> Word32 -> Maybe Word8 -wordAtPartUft8CP n partCP - | n < partUtf8CPLen partCP && n >= 0 = - Just . fromIntegral $ partCP `shiftR` (16 - 8 * n) - | otherwise = Nothing - --- | This represtents the starting state of a UTF-8 validation check. -startUtf8ValidState :: Utf8ValidState -startUtf8ValidState = Utf8ValidState 0 utf8StartState - -appendPartialCodePoint :: Word32 -> ByteString -> Word32 -appendPartialCodePoint cp bs@(B.length -> len) = - fst $ B.foldl (\ (cp', bsc) word -> - (cp' .|. (fromIntegral word `shiftL` (16 - bsc * 8)), bsc + 1) - ) (cp + (fromIntegral len `shiftL` 24), partUtf8CPLen cp) bs - -#ifdef SIMDUTF -foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8 - :: Ptr Word8 -> CSize -> IO CInt -#endif - --- | Validate a 'ByteString' as a UTF-8-encoded text. --- --- @validateUtf8Chunk chunk = (n, es)@ --- --- This function returns two values: --- --- * The value 'n' indicates the longest prefix of the 'ByteString' --- that is valid UTF-8-encoded data. --- * The value 'es' indicates whether the 'ByteString' --- --- * (@Left p@) contains an invalid code point and where the next --- (potentially valid) code point begins, so that @p - n@ is the --- number of invalid bytes, or --- * (@Right s@) is valid, and all of the remaining bytes starting --- at inbex 'n' are the beginning of an incomplete UTF-8 code --- point, and 's' is the resulting 'Utf8ValidState' value, which --- can be used to validate against a following 'ByteString' with --- 'validateNextUtf8Chunk'. -validateUtf8Chunk :: -#if defined(ASSERTS) - HasCallStack => -#endif - ByteString -> (Int, Either Int Utf8ValidState) -validateUtf8Chunk bs@(B.length -> len) -#if defined(SIMDUTF) || MIN_VERSION_bytestring(0,11,2) - | guessUtf8Boundary > 0 && - -- the rest of the bytestring valid utf-8 up to the boundary - ( -#ifdef SIMDUTF - withBS bs $ \ fp _ -> unsafeDupablePerformIO $ - unsafeWithForeignPtr fp $ \ptr -> (/= 0) <$> - c_is_valid_utf8 ptr (fromIntegral guessUtf8Boundary) -#else - B.isValidUtf8 $ B.take guessUtf8Boundary bs -#endif - ) = getEndState guessUtf8Boundary - -- No - | otherwise = getEndState 0 - where - getEndState ndx = validateUtf8 ndx ndx utf8StartState - w n word8 = len >= n && word8 <= (B.index bs $ len - n) - guessUtf8Boundary - | w 3 0xf0 = len - 3 -- third to last char starts a four-byte code point - | w 2 0xe0 = len - 2 -- pre-last char starts a three-or-four-byte code point - | w 1 0xc2 = len - 1 -- last char starts a two-(or more-)byte code point - | otherwise = len -#else - = validateUtf8 0 0 utf8StartState - where -#endif - validateUtf8 !ndx0 ndx s - | ndx < len = - let ndx' = ndx + 1 in - case updateUtf8State (B.index bs ndx) s of - Just s' -> - validateUtf8 ( - if isUtf8StateIsComplete s' - then ndx' - else ndx0 - ) ndx' s' - Nothing -> (ndx0, Left $ if ndx == ndx0 then ndx' else ndx) - | otherwise = (ndx0, Right $ Utf8ValidState (appendPartialCodePoint 0 $ B.drop ndx0 bs) s) - --- | Validate a 'ByteString' as a contiuation of UTF-8-encoded text. --- --- @validateNextUtf8Chunk chunk s = (n, es)@ --- --- This function returns two values: --- --- * The value 'n' indicates the end position of longest prefix of the --- 'ByteString' that is valid UTF-8-encoded data from the starting --- state 's'. If 's' contains an incomplete code point, the input --- 'ByteString' is considered a continuation. As a result 'n' will be --- negative if the code point is still incomplete or is proven to be --- invalid. --- --- * The value 'es' indicates whether the 'ByteString' --- --- * (@Left p@) contains an invalid code point and where the next --- (potentially valid) code point begins, so that @p - n@ is the --- number of invalid bytes, or --- * (@Right s'@) is valid, and all of the remaining bytes starting --- at index 'n' are the beginning of an incomplete UTF-8 code --- point, and `s'` is the resulting 'Utf8ValidState' value, which --- can be used to validate against a following 'ByteString'. -validateNextUtf8Chunk :: -#if defined(ASSERTS) - HasCallStack => -#endif - ByteString -> Utf8ValidState -> (Int, Either Int Utf8ValidState) -validateNextUtf8Chunk bs@(B.length -> len) st@(Utf8ValidState lead s) - | len > 0 = - let g pos s' - -- first things first. let's try to get to the start of the next code point - | isUtf8StateIsComplete s' = - -- found the beginning of the next code point, hand this off to someone else - case validateUtf8Chunk $ B.drop pos bs of - (len', mS) -> (pos + len', first (+ pos) mS) - -- code point is not complete yet - -- walk the rest of the code point until error, complete, or no more data - | pos < len = - case updateUtf8State (B.index bs pos) s' of - -- error - Nothing -> (leadPos, Left pos) - -- keep going - Just s'' -> g (pos + 1) s'' - -- no more data - | otherwise = (leadPos, Right $ Utf8ValidState (appendPartialCodePoint lead bs) s') - in g 0 s - | otherwise = (leadPos, Right st) - where leadPos = -(partUtf8CPLen lead) - --- | Validated UTF-8 data to be converted into a 'Text' value. -data TextDataStack = TextDataStack - { -- | Returns a list of 'Text' and UTF-8-valid 'ByteString' values. - dataStack :: [Either Text (Either Word32 ByteString)] - -- | Returns total number of UTF-8 valid bytes in the stack. - , stackLen :: Int - } - deriving Show - --- | Empty stack -emptyStack :: TextDataStack -emptyStack = TextDataStack [] 0 - --- | Push a text value onto the stack -pushText :: Text -> TextDataStack -> TextDataStack -pushText t@(Text _ _ tLen) tds@(TextDataStack stack sLen) = - if tLen > 0 - then TextDataStack (Left t : stack) $ sLen + tLen - else tds - -copyFromStack :: [Either Text (Either Word32 ByteString)] -> Int -> A.MArray s -> ST s () -copyFromStack (dat : dataStack') tLen dst = - (case dat of - Left (Text arr0 off utf8Len) -> do - let dstOff = tLen - utf8Len - A.copyI utf8Len dst dstOff arr0 off - pure dstOff - Right encoded -> - case encoded of - Left partial -> - let utf8Len = partUtf8CPLen partial - dstOff = tLen - utf8Len - g dstOff' = - case wordAtPartUft8CP (dstOff' - dstOff) partial of - Just w -> do - A.unsafeWrite dst dstOff' w - g $ dstOff' + 1 - Nothing -> pure dstOff - in - g dstOff - Right bs@(B.length -> utf8Len) -> do - let dstOff = tLen - utf8Len - withBS bs $ \ fp _ -> - unsafeIOToST . unsafeWithForeignPtr fp $ \ src -> - unsafeSTToIO $ A.copyFromPointer dst dstOff src utf8Len - pure dstOff) >>= (\ tLen' -> copyFromStack dataStack' tLen' dst) -copyFromStack _ _ _ = pure () -{-# INLINE copyFromStack #-} - --- | Create a 'Text' value from the contents of a 'TextDataStack'. -stackToText :: -#if defined(ASSERTS) - HasCallStack => -#endif - TextDataStack -> Text -stackToText (TextDataStack stack sLen) - | sLen > 0 = runST $ - do - dst <- A.new sLen - copyFromStack stack sLen dst - arr <- A.unsafeFreeze dst - pure $ Text arr 0 sLen - | otherwise = empty - --- | Decode a 'ByteString' in the context of what has been already been decoded. --- --- The 'ByteString' is validated against the 'Utf8ValidState' using the rules --- governing 'validateNextUtf8Chunk'. The longest valid UTF-8 prefix is added --- to the input 'TextDataStack' which is returned with the end position of the --- valid prefix, and either the resulting 'Utf8ValidState' --- (@Right Utf8ValidState@) or the position of the of the first (potentially) --- valid byte after the invalid bytes with remainder of the 'ByteString' --- (@Left (Int, ByteString)@). -decodeNextUtf8Chunk :: -#if defined(ASSERTS) - HasCallStack => -#endif - ByteString - -> Utf8ValidState - -> TextDataStack - -> ((Int, Either (Int, ByteString) Utf8ValidState), TextDataStack) -decodeNextUtf8Chunk bs s tds@(TextDataStack stack sLen) = - case validateNextUtf8Chunk bs s of - (len, res) -> - let stackedData' - | len >= 0 = - let partCP = partialUtf8CodePoint s - partLen = partUtf8CPLen partCP - stackedData@(TextDataStack stack' sLen') = - if partLen > 0 - then TextDataStack (Right (Left partCP) : stack) $ sLen + partLen - else tds - in - if len > 0 - then TextDataStack (Right (Right $ B.take len bs) : stack') $ sLen' + len - else stackedData - | otherwise = tds - in - ( ( len - , case res of - Left pos -> Left (pos, B.drop pos bs) - Right s' -> Right s' - ) - , stackedData' - ) - --- | Decode a 'ByteString' against a start 'Utf8ValidState' with an empty --- 'TextDataStack'. --- --- @decodeUtf8Chunk bs = 'decodeNextUtf8Chunk' bs 'startUtf8ValidState' 'emptyStack'@ -decodeUtf8Chunk :: ByteString -> ((Int, Either (Int, ByteString) Utf8ValidState), TextDataStack) -decodeUtf8Chunk bs = decodeNextUtf8Chunk bs startUtf8ValidState emptyStack - --- | Call an error handler with the give 'String' message for each byte --- in given 'ByteString' and lead data in the given 'Utf8ValidState' --- value. The bytes are the positions from 'errStart' (inclusive) to --- 'errEnd' (exclusive). Any substitute characters are pushed onto the --- supplied 'TextDataStack' argument. -handleUtf8Err - :: OnDecodeError - -> String - -> Int - -> Int - -> Utf8ValidState - -> ByteString - -> TextDataStack - -> TextDataStack -handleUtf8Err onErr errMsg errStart errEnd s bs tds = - let h errPos tds' - | errPos < errEnd = - h (errPos + 1) $ - case onErr errMsg . Just $ B.index bs errPos of - Just c -> pushText (T.singleton c) tds' - Nothing -> tds' - | otherwise = tds' - in - ( if errStart < 0 - then - let partCP = partialUtf8CodePoint s - g pNdx tds' = - case wordAtPartUft8CP pNdx partCP of - Nothing -> h (partUtf8CPLen partCP + errStart) tds' - mW -> g (pNdx + 1) $ - case onErr errMsg mW of - Just c -> pushText (T.singleton c) tds' - Nothing -> tds' - in - g 0 - else h errStart - ) tds - -invalidUtf8Msg :: String -invalidUtf8Msg = "Data.Text.Internal.Encoding: Invalid UTF-8 stream" - --- | Decode a 'ByteString' containing UTF-8 encoded text. --- --- Surrogate code points in replacement character returned by 'OnDecodeError' --- will be automatically remapped to the replacement char @U+FFFD@. -decodeUtf8With :: -#if defined(ASSERTS) - HasCallStack => -#endif - OnDecodeError -> ByteString -> Text -decodeUtf8With onErr bs = - let g bs'@(B.length -> bLen) res = - case res of - ((len, eS), tds) -> - let h msg pos s = handleUtf8Err onErr msg len pos s bs' tds in - case eS of - Left (pos, bs'') -> g bs'' . decodeNextUtf8Chunk bs'' startUtf8ValidState $ h invalidUtf8Msg pos startUtf8ValidState - Right s -> stackToText $ h "Data.Text.Internal.Encoding: Incomplete UTF-8 code point" bLen s - in - g bs $ decodeUtf8Chunk bs - -- $stream -- -- The 'streamDecodeUtf8' and 'streamDecodeUtf8With' functions accept @@ -659,18 +325,18 @@ streamDecodeUtf8With :: streamDecodeUtf8With onErr bs = let h w32 n ws = if n >= 0 - then h w32 (n - 1) $ fromJust (wordAtPartUft8CP n w32) : ws + then h w32 (n - 1) $ partUtf8CPUnsafeIndex n w32 : ws else ws g bs' s tds = case decodeNextUtf8Chunk bs' s tds of ((len, eS), tds') -> case eS of - Left (pos, bs'') -> g bs'' startUtf8ValidState $ handleUtf8Err onErr invalidUtf8Msg len pos s bs' tds' + Left (pos, bs'') -> g bs'' startUtf8State $ handleUtf8Err onErr invalidUtf8Msg len pos s bs' tds' Right s' -> let bss' = partialUtf8CodePoint s' in Some (stackToText tds') (B.pack $ h bss' (partUtf8CPLen bss' - 1) []) $ \ bs'' -> g bs'' s' emptyStack in - g bs startUtf8ValidState emptyStack + g bs startUtf8State emptyStack -- | Decode a 'ByteString' containing UTF-8 encoded text that is known -- to be valid. diff --git a/src/Data/Text/Internal/Encoding.hs b/src/Data/Text/Internal/Encoding.hs new file mode 100644 index 00000000..a16ac3bd --- /dev/null +++ b/src/Data/Text/Internal/Encoding.hs @@ -0,0 +1,402 @@ +{-# LANGUAGE BangPatterns, CPP, GeneralizedNewtypeDeriving, MagicHash, + UnliftedFFITypes #-} +{-# LANGUAGE Trustworthy #-} +{-# LANGUAGE TypeApplications #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE ViewPatterns #-} +-- | +module Data.Text.Internal.Encoding where + +import Control.Exception (evaluate, try) +import Control.Monad.ST (ST, runST) +import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) +import Data.Bifunctor (Bifunctor(first)) +import Data.Bits (shiftL, shiftR, (.&.)) +import Data.ByteString (ByteString) +import qualified Data.ByteString.Short.Internal as SBS +import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode) +import Data.Text.Internal (Text(..), empty) +import Data.Text.Internal.Unsafe (unsafeWithForeignPtr) +import Data.Text.Show as T (singleton) +import Data.Text.Unsafe (unsafeDupablePerformIO) +import Data.Word (Word32, Word8) +import Foreign.C.Types (CSize(..)) +import Foreign.Ptr (Ptr, minusPtr, plusPtr) +import Foreign.Storable (poke, peekByteOff) +import GHC.Exts (byteArrayContents#, unsafeCoerce#) +import GHC.ForeignPtr (ForeignPtr(..), ForeignPtrContents(PlainPtr)) +import Data.Text.Internal.ByteStringCompat (withBS) +import Data.Text.Internal.Encoding.Utf8 + (Utf8CodePointState, utf8StartState, updateUtf8State, isUtf8StateIsComplete) +import qualified Data.ByteString as B +import qualified Data.ByteString.Internal as B +import qualified Data.ByteString.Builder as B +import qualified Data.ByteString.Builder.Internal as B hiding (empty, append) +import qualified Data.ByteString.Builder.Prim as BP +import qualified Data.ByteString.Builder.Prim.Internal as BP +import qualified Data.Text.Array as A +import qualified Data.Text.Internal.Encoding.Fusion as E +import qualified Data.Text.Internal.Fusion as F +#if defined(ASSERTS) +import GHC.Stack (HasCallStack) +#endif + +#ifdef SIMDUTF +import Foreign.C.Types (CInt(..)) +#elif !MIN_VERSION_bytestring(0,11,2) +import qualified Data.ByteString.Unsafe as B +#endif + +-- | State of decoding a 'ByteString' in UTF-8. +-- It consists of a value representing whether or +-- not the last byte is a complete code point, and on incompletion what +-- the 1 to 3 end bytes are that make up the incomplete code point. +data Utf8State = Utf8State + { -- | Current UTF-8 code point state of the 'ByteString's + -- that have been validated thus far. + utf8CodePointState :: Utf8CodePointState + -- | Get the incomplete UTF-8 code point of the 'ByteString's that + -- have been validated thus far. The first byte of the 'Word32' + -- indicates the number of bytes of the code point are available, + -- and is followed by the bytes of the code point. + , partialUtf8CodePoint :: PartialUtf8CodePoint + } + deriving (Eq, Ord, Show) + +-- | This represtents the starting state of a UTF-8 validation check. +startUtf8State :: Utf8State +startUtf8State = Utf8State utf8StartState partUtf8CPEmpty + +-- | Prefix of a valid UTF-8 encoded code point. +-- This consists of a length (in bytes) between 1 and 3 stored in the most +-- significant byte, and the actual bytes in the rest of the word. +newtype PartialUtf8CodePoint = PartialUtf8CodePoint Word32 + deriving (Eq, Ord, Show) + +-- | Empty prefix. +partUtf8CPEmpty :: PartialUtf8CodePoint +partUtf8CPEmpty = PartialUtf8CodePoint 0 + +-- | Length of the partial code point, stored in the most significant byte. +partUtf8CPLen :: PartialUtf8CodePoint -> Int +partUtf8CPLen (PartialUtf8CodePoint w) = fromIntegral $ w `shiftR` 24 + +-- | Get the @n@-th byte, assuming it is within bounds: @0 <= n < partUtf8CPLen c@. +partUtf8CPUnsafeIndex :: +#if defined(ASSERTS) + HasCallStack => +#endif + Int -> PartialUtf8CodePoint -> Word8 +partUtf8CPUnsafeIndex n (PartialUtf8CodePoint w) = +#if defined(ASSERTS) + assert (0 <= n && n < partUtf8CPLen w) $ +#endif + fromIntegral $ w `shiftR` (16 - 8 * n) + +partUtf8CPUnsafeAppend :: +#if defined(ASSERTS) + HasCallStack => +#endif + PartialUtf8CodePoint -> ByteString -> PartialUtf8CodePoint +partUtf8CPUnsafeAppend c@(PartialUtf8CodePoint word) bs = +#if defined(ASSERTS) + assert (lenc + lenbs <= 3) $ +#endif + PartialUtf8CodePoint $ + tryPush 0 $ tryPush 1 $ tryPush 2 $ word + (fromIntegral lenbs `shiftL` 24) + where + lenc = partUtf8CPLen c + lenbs = B.length bs + tryPush i w = + if i < lenbs + then w + (fromIntegral (B.index bs i) `shiftL` fromIntegral (16 - 8 * (lenc + i))) + else w + +#ifdef SIMDUTF +foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8 + :: Ptr Word8 -> CSize -> IO CInt +#endif + +-- | Validate a 'ByteString' as a UTF-8-encoded text. +-- +-- @validateUtf8Chunk chunk = (n, es)@ +-- +-- This function returns two values: +-- +-- * The value 'n' indicates the longest prefix of the 'ByteString' +-- that is valid UTF-8-encoded data. +-- * The value 'es' indicates whether the 'ByteString' +-- +-- * (@Left p@) contains an invalid code point and where the next +-- (potentially valid) code point begins, so that @p - n@ is the +-- number of invalid bytes, or +-- * (@Right s@) is valid, and all of the remaining bytes starting +-- at inbex 'n' are the beginning of an incomplete UTF-8 code +-- point, and 's' is the resulting 'Utf8State' value, which +-- can be used to validate against a following 'ByteString' with +-- 'validateNextUtf8Chunk'. +validateUtf8Chunk :: +#if defined(ASSERTS) + HasCallStack => +#endif + ByteString -> (Int, Either Int Utf8State) +validateUtf8Chunk bs@(B.length -> len) +#if defined(SIMDUTF) || MIN_VERSION_bytestring(0,11,2) + | guessUtf8Boundary > 0 && + -- the rest of the bytestring valid utf-8 up to the boundary + ( +#ifdef SIMDUTF + withBS bs $ \ fp _ -> unsafeDupablePerformIO $ + unsafeWithForeignPtr fp $ \ptr -> (/= 0) <$> + c_is_valid_utf8 ptr (fromIntegral guessUtf8Boundary) +#else + B.isValidUtf8 $ B.take guessUtf8Boundary bs +#endif + ) = getEndState guessUtf8Boundary + -- No + | otherwise = getEndState 0 + where + getEndState ndx = validateUtf8 ndx ndx utf8StartState + w n word8 = len >= n && word8 <= (B.index bs $ len - n) + guessUtf8Boundary + | w 3 0xf0 = len - 3 -- third to last char starts a four-byte code point + | w 2 0xe0 = len - 2 -- pre-last char starts a three-or-four-byte code point + | w 1 0xc2 = len - 1 -- last char starts a two-(or more-)byte code point + | otherwise = len +#else + = validateUtf8 0 0 utf8StartState + where +#endif + validateUtf8 !ndx0 ndx s + | ndx < len = + let ndx' = ndx + 1 in + case updateUtf8State (B.index bs ndx) s of + Just s' -> + validateUtf8 ( + if isUtf8StateIsComplete s' + then ndx' + else ndx0 + ) ndx' s' + Nothing -> (ndx0, Left $ if ndx == ndx0 then ndx' else ndx) + | otherwise = (ndx0, Right $ Utf8State s (partUtf8CPUnsafeAppend partUtf8CPEmpty $ B.drop ndx0 bs)) + +-- | Validate a 'ByteString' as a contiuation of UTF-8-encoded text. +-- +-- @validateNextUtf8Chunk chunk s = (n, es)@ +-- +-- This function returns two values: +-- +-- * The value 'n' indicates the end position of longest prefix of the +-- 'ByteString' that is valid UTF-8-encoded data from the starting +-- state 's'. If 's' contains an incomplete code point, the input +-- 'ByteString' is considered a continuation. As a result 'n' will be +-- negative if the code point is still incomplete or is proven to be +-- invalid. +-- +-- * The value 'es' indicates whether the 'ByteString' +-- +-- * (@Left p@) contains an invalid code point and where the next +-- (potentially valid) code point begins, so that @p - n@ is the +-- number of invalid bytes, or +-- * (@Right s'@) is valid, and all of the remaining bytes starting +-- at index 'n' are the beginning of an incomplete UTF-8 code +-- point, and `s'` is the resulting 'Utf8State' value, which +-- can be used to validate against a following 'ByteString'. +validateNextUtf8Chunk :: +#if defined(ASSERTS) + HasCallStack => +#endif + ByteString -> Utf8State -> (Int, Either Int Utf8State) +validateNextUtf8Chunk bs@(B.length -> len) st@(Utf8State s lead) + | len > 0 = + let g pos s' + -- first things first. let's try to get to the start of the next code point + | isUtf8StateIsComplete s' = + -- found the beginning of the next code point, hand this off to someone else + case validateUtf8Chunk $ B.drop pos bs of + (len', mS) -> (pos + len', first (+ pos) mS) + -- code point is not complete yet + -- walk the rest of the code point until error, complete, or no more data + | pos < len = + case updateUtf8State (B.index bs pos) s' of + -- error + Nothing -> (leadPos, Left pos) + -- keep going + Just s'' -> g (pos + 1) s'' + -- no more data + | otherwise = (leadPos, Right $ Utf8State s' (partUtf8CPUnsafeAppend lead bs)) + in g 0 s + | otherwise = (leadPos, Right st) + where leadPos = -(partUtf8CPLen lead) + +-- | Validated UTF-8 data to be converted into a 'Text' value. +data TextDataStack = TextDataStack + { -- | Returns a list of 'Text' and UTF-8-valid 'ByteString' values. + dataStack :: [Either Text (Either PartialUtf8CodePoint ByteString)] + -- | Returns total number of UTF-8 valid bytes in the stack. + , stackLen :: Int + } + deriving Show + +-- | Empty stack +emptyStack :: TextDataStack +emptyStack = TextDataStack [] 0 + +-- | Push a text value onto the stack +pushText :: Text -> TextDataStack -> TextDataStack +pushText t@(Text _ _ tLen) tds@(TextDataStack stack sLen) = + if tLen > 0 + then TextDataStack (Left t : stack) $ sLen + tLen + else tds + +copyFromStack :: [Either Text (Either PartialUtf8CodePoint ByteString)] -> Int -> A.MArray s -> ST s () +copyFromStack (dat : dataStack') tLen dst = + (case dat of + Left (Text arr0 off utf8Len) -> do + let dstOff = tLen - utf8Len + A.copyI utf8Len dst dstOff arr0 off + pure dstOff + Right encoded -> + case encoded of + Left partial -> do + let utf8Len = partUtf8CPLen partial + dstOff = tLen - utf8Len + g i | i < utf8Len = do + A.unsafeWrite dst (dstOff + i) (partUtf8CPUnsafeIndex i partial) + g (i + 1) + | otherwise = pure () + g dstOff + pure utf8Len + Right bs@(B.length -> utf8Len) -> do + let dstOff = tLen - utf8Len + withBS bs $ \ fp _ -> + unsafeIOToST . unsafeWithForeignPtr fp $ \ src -> + unsafeSTToIO $ A.copyFromPointer dst dstOff src utf8Len + pure dstOff) >>= (\ tLen' -> copyFromStack dataStack' tLen' dst) +copyFromStack _ _ _ = pure () +{-# INLINE copyFromStack #-} + +-- | Create a 'Text' value from the contents of a 'TextDataStack'. +stackToText :: +#if defined(ASSERTS) + HasCallStack => +#endif + TextDataStack -> Text +stackToText (TextDataStack stack sLen) + | sLen > 0 = runST $ + do + dst <- A.new sLen + copyFromStack stack sLen dst + arr <- A.unsafeFreeze dst + pure $ Text arr 0 sLen + | otherwise = empty + +-- | Decode a 'ByteString' in the context of what has been already been decoded. +-- +-- The 'ByteString' is validated against the 'Utf8State' using the rules +-- governing 'validateNextUtf8Chunk'. The longest valid UTF-8 prefix is added +-- to the input 'TextDataStack' which is returned with the end position of the +-- valid prefix, and either the resulting 'Utf8State' +-- (@Right Utf8State@) or the position of the of the first (potentially) +-- valid byte after the invalid bytes with remainder of the 'ByteString' +-- (@Left (Int, ByteString)@). +decodeNextUtf8Chunk :: +#if defined(ASSERTS) + HasCallStack => +#endif + ByteString + -> Utf8State + -> TextDataStack + -> ((Int, Either (Int, ByteString) Utf8State), TextDataStack) +decodeNextUtf8Chunk bs s tds@(TextDataStack stack sLen) = + case validateNextUtf8Chunk bs s of + (len, res) -> + let stackedData' + | len >= 0 = + let partCP = partialUtf8CodePoint s + partLen = partUtf8CPLen partCP + stackedData@(TextDataStack stack' sLen') = + if partLen > 0 + then TextDataStack (Right (Left partCP) : stack) $ sLen + partLen + else tds + in + if len > 0 + then TextDataStack (Right (Right $ B.take len bs) : stack') $ sLen' + len + else stackedData + | otherwise = tds + in + ( ( len + , case res of + Left pos -> Left (pos, B.drop pos bs) + Right s' -> Right s' + ) + , stackedData' + ) + +-- | Decode a 'ByteString' against a start 'Utf8State' with an empty +-- 'TextDataStack'. +-- +-- @decodeUtf8Chunk bs = 'decodeNextUtf8Chunk' bs 'startUtf8State' 'emptyStack'@ +decodeUtf8Chunk :: ByteString -> ((Int, Either (Int, ByteString) Utf8State), TextDataStack) +decodeUtf8Chunk bs = decodeNextUtf8Chunk bs startUtf8State emptyStack + +-- | Call an error handler with the give 'String' message for each byte +-- in given 'ByteString' and lead data in the given 'Utf8State' +-- value. The bytes are the positions from 'errStart' (inclusive) to +-- 'errEnd' (exclusive). Any substitute characters are pushed onto the +-- supplied 'TextDataStack' argument. +handleUtf8Err + :: OnDecodeError + -> String + -> Int + -> Int + -> Utf8State + -> ByteString + -> TextDataStack + -> TextDataStack +handleUtf8Err onErr errMsg errStart errEnd s bs tds = + let h errPos tds' + | errPos < errEnd = + h (errPos + 1) $ + case onErr errMsg . Just $ B.index bs errPos of + Just c -> pushText (T.singleton c) tds' + Nothing -> tds' + | otherwise = tds' + in + ( if errStart < 0 + then + let partCP = partialUtf8CodePoint s + partCPLen = partUtf8CPLen partCP + g i tds' + | i < partCPLen = g (i + 1) $ + case onErr errMsg (Just (partUtf8CPUnsafeIndex i partCP)) of + Just c -> pushText (T.singleton c) tds' + Nothing -> tds' + | otherwise = h (partCPLen + errStart) tds' + in + g 0 + else h errStart + ) tds + +invalidUtf8Msg :: String +invalidUtf8Msg = "Data.Text.Internal.Encoding: Invalid UTF-8 stream" + +-- | Decode a 'ByteString' containing UTF-8 encoded text. +-- +-- Surrogate code points in replacement character returned by 'OnDecodeError' +-- will be automatically remapped to the replacement char @U+FFFD@. +decodeUtf8With :: +#if defined(ASSERTS) + HasCallStack => +#endif + OnDecodeError -> ByteString -> Text +decodeUtf8With onErr bs = + let g bs' res = + case res of + ((len, eS), tds) -> + let h msg pos s = handleUtf8Err onErr msg len pos s bs' tds in + case eS of + Left (pos, bs'') -> g bs'' . decodeNextUtf8Chunk bs'' startUtf8State $ h invalidUtf8Msg pos startUtf8State + Right s -> stackToText $ h "Data.Text.Internal.Encoding: Incomplete UTF-8 code point" (B.length bs') s + in + g bs $ decodeUtf8Chunk bs diff --git a/src/Data/Text/Lazy/Encoding.hs b/src/Data/Text/Lazy/Encoding.hs index 5b5e8bb9..418e473e 100644 --- a/src/Data/Text/Lazy/Encoding.hs +++ b/src/Data/Text/Lazy/Encoding.hs @@ -112,7 +112,7 @@ decodeUtf8With onErr (B.Chunk b0 bs0) = ((len, eS), tds') -> let h errMsg pos s' = TE.handleUtf8Err onErr errMsg len pos s' bs tds' in case eS of - Left (pos, bs') -> g bs' lbs TE.startUtf8ValidState + Left (pos, bs') -> g bs' lbs TE.startUtf8State (h "Data.Text.Internal.Encoding: Invalid UTF-8 stream" pos s) diffText Right s' -> case lbs of @@ -120,7 +120,7 @@ decodeUtf8With onErr (B.Chunk b0 bs0) = g bs' lbs' s' TE.emptyStack $ diffText . chunk (TE.stackToText tds') B.Empty -> diffText $ chunk (TE.stackToText $ h "Data.Text.Internal.Encoding: Incomplete UTF-8 code point" bLen s') Empty in - g b0 bs0 TE.startUtf8ValidState TE.emptyStack id + g b0 bs0 TE.startUtf8State TE.emptyStack id decodeUtf8With _ _ = empty -- | Decode a 'ByteString' containing UTF-8 encoded text that is known diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs index 5585803e..d75cfaaa 100644 --- a/tests/Tests/Properties/Transcoding.hs +++ b/tests/Tests/Properties/Transcoding.hs @@ -29,6 +29,7 @@ import qualified Data.Text as T import qualified Data.Text.Encoding as E import qualified Data.Text.Encoding.Error as E import Data.Text.Internal.Encoding.Utf8 (isUtf8StateIsComplete) +import qualified Data.Text.Internal.Encoding as E import qualified Data.Text.Lazy as TL import qualified Data.Text.Lazy.Encoding as EL -- import qualified Data.Text.Internal.Lazy as BL @@ -100,11 +101,11 @@ t_pn_utf8_2 = case E.validateUtf8Chunk (B.pack [0xF0]) of t_pn_utf8_3 = case E.validateUtf8Chunk $ B.pack [0xc2] of (len1, eS1) -> whenEqProp len1 0 $ case eS1 of Left _ -> counterexample (show eS1) False - Right s1 -> whenEqProp (E.partialUtf8CodePoint s1) 0x01c20000 $ + Right s1 -> E.partialUtf8CodePoint s1 === E.PartialUtf8CodePoint 0x01c20000 .&&. if isUtf8StateIsComplete $ E.utf8CodePointState s1 then counterexample (show $ E.utf8CodePointState s1) False else case E.validateNextUtf8Chunk (B.pack [0x80, 0x80]) s1 of - (len2, eS2) -> whenEqProp len2 1 $ eS2 === Left 2 + (len2, eS2) -> len2 === 1 .&&. eS2 === Left 2 t_utf8_c = (E.stackToText . snd . E.decodeUtf8Chunk . E.encodeUtf8) `eq` id t_utf8 = (E.decodeUtf8 . E.encodeUtf8) `eq` id @@ -292,14 +293,13 @@ t_decode_chunk1 = case E.decodeUtf8Chunk $ B.pack [0xc2] of case eS1 of Left _ -> counterexample (show eS1) False Right s1 -> let partCP = E.partialUtf8CodePoint s1 in - whenEqProp partCP 0x01c20000 . - whenEqProp (E.partUtf8CPLen partCP) 1 . - whenEqProp (E.wordAtPartUft8CP 0 partCP) (Just 0xc2) . - whenEqProp (E.wordAtPartUft8CP 1 partCP) Nothing $ + partCP === E.PartialUtf8CodePoint 0x01c20000 .&&. + E.partUtf8CPLen partCP === 1 .&&. + E.partUtf8CPUnsafeIndex 0 partCP === 0xc2 .&&. if isUtf8StateIsComplete $ E.utf8CodePointState s1 then counterexample (show $ E.utf8CodePointState s1) False else case E.decodeNextUtf8Chunk (B.pack [0x80, 0x80]) s1 tds1 of - ((len2, eS2), tds2) -> whenEqProp (len2, E.dataStack tds2, E.stackLen tds2) (1, [Right $ Right $ B.pack [0x80], Right $ Left 0x1c20000], 2) $ + ((len2, eS2), tds2) -> whenEqProp (len2, E.dataStack tds2, E.stackLen tds2) (1, [Right $ Right $ B.pack [0x80], Right $ Left (E.PartialUtf8CodePoint 0x1c20000)], 2) $ case eS2 of Right _ -> counterexample (show eS2) False Left res -> res === (2, mempty) @@ -309,10 +309,9 @@ t_decode_chunk2 = case E.decodeUtf8Chunk $ B.pack [0xf0] of case eS1 of Left _ -> counterexample (show eS1) False Right s1 -> let partCP = E.partialUtf8CodePoint s1 in - whenEqProp partCP 0x01f00000 . - whenEqProp (E.partUtf8CPLen partCP) 1 . - whenEqProp (E.wordAtPartUft8CP 0 partCP) (Just 0xf0) . - whenEqProp (E.wordAtPartUft8CP 1 partCP) Nothing $ + partCP === E.PartialUtf8CodePoint 0x01f00000 .&&. + E.partUtf8CPLen partCP === 1 .&&. + E.partUtf8CPUnsafeIndex 0 partCP === 0xf0 .&&. if isUtf8StateIsComplete $ E.utf8CodePointState s1 then counterexample (show $ E.utf8CodePointState s1) False else case E.decodeNextUtf8Chunk (B.pack [0x90, 0x80]) s1 tds1 of @@ -320,12 +319,11 @@ t_decode_chunk2 = case E.decodeUtf8Chunk $ B.pack [0xf0] of case eS2 of Left _ -> counterexample (show eS2) False Right s2 -> let partCP2 = E.partialUtf8CodePoint s2 in - whenEqProp partCP2 0x03f09080 . - whenEqProp (E.partUtf8CPLen partCP2) 3 . - whenEqProp (E.wordAtPartUft8CP 0 partCP2) (Just 0xf0) . - whenEqProp (E.wordAtPartUft8CP 1 partCP2) (Just 0x90) . - whenEqProp (E.wordAtPartUft8CP 2 partCP2) (Just 0x80) $ - E.wordAtPartUft8CP 3 partCP2 === Nothing + partCP2 === E.PartialUtf8CodePoint 0x03f09080 .&&. + E.partUtf8CPLen partCP2 === 3 .&&. + E.partUtf8CPUnsafeIndex 0 partCP2 === 0xf0 .&&. + E.partUtf8CPUnsafeIndex 1 partCP2 === 0x90 .&&. + E.partUtf8CPUnsafeIndex 2 partCP2 === 0x80 t_infix_concat bs1 text bs2 = forAll (Blind <$> genDecodeErr Replace) $ \(Blind onErr) -> diff --git a/text.cabal b/text.cabal index 445bb6e5..0d420fa2 100644 --- a/text.cabal +++ b/text.cabal @@ -150,6 +150,7 @@ library Data.Text.Internal.Builder.RealFloat.Functions Data.Text.Internal.ByteStringCompat Data.Text.Internal.PrimCompat + Data.Text.Internal.Encoding Data.Text.Internal.Encoding.Fusion Data.Text.Internal.Encoding.Fusion.Common Data.Text.Internal.Encoding.Utf16 From 4d9c09091290aeabfb17ff071f255e253297d1b1 Mon Sep 17 00:00:00 2001 From: Li-yao Xia Date: Tue, 31 Jan 2023 18:07:16 +0000 Subject: [PATCH 63/87] More streaming decodeUtf8 (WIP) --- src/Data/Text/Encoding.hs | 43 ++- src/Data/Text/Internal/Encoding.hs | 383 +++++++++++------------- src/Data/Text/Internal/StrictBuilder.hs | 0 src/Data/Text/Lazy/Encoding.hs | 24 +- 4 files changed, 194 insertions(+), 256 deletions(-) create mode 100644 src/Data/Text/Internal/StrictBuilder.hs diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index e8f2941e..d24942a1 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -37,13 +37,6 @@ module Data.Text.Encoding -- $total , decodeLatin1 , decodeASCIIPrefix - , TextDataStack - , dataStack - , stackLen - , emptyStack - , pushText - , stackToText - , decodeNextUtf8Chunk , decodeUtf8Chunk , decodeUtf8Lenient @@ -51,7 +44,6 @@ module Data.Text.Encoding , decodeUtf8' -- *** Controllable error handling - , handleUtf8Err , decodeUtf8With , decodeUtf16LEWith , decodeUtf16BEWith @@ -114,6 +106,7 @@ import qualified Data.Text.Array as A import Data.Text.Internal.Encoding import qualified Data.Text.Internal.Encoding.Fusion as E import qualified Data.Text.Internal.Fusion as F +import Data.Text.Show () #if defined(ASSERTS) import GHC.Stack (HasCallStack) #endif @@ -322,21 +315,25 @@ streamDecodeUtf8With :: HasCallStack => #endif OnDecodeError -> ByteString -> Decoding -streamDecodeUtf8With onErr bs = - let h w32 n ws = - if n >= 0 - then h w32 (n - 1) $ partUtf8CPUnsafeIndex n w32 : ws - else ws - g bs' s tds = - case decodeNextUtf8Chunk bs' s tds of - ((len, eS), tds') -> - case eS of - Left (pos, bs'') -> g bs'' startUtf8State $ handleUtf8Err onErr invalidUtf8Msg len pos s bs' tds' - Right s' -> let bss' = partialUtf8CodePoint s' in - Some (stackToText tds') (B.pack $ h bss' (partUtf8CPLen bss' - 1) []) $ \ bs'' -> - g bs'' s' emptyStack - in - g bs startUtf8State emptyStack +streamDecodeUtf8With onErr = loop startUtf8State + where + loop s chunk = + let (builder, undecoded, s') = decodeUtf8With2 (onErr invalidUtf8Msg . Just) chunk s + in Some (strictBuilderToText builder) undecoded (loop s') + +-- | Decode a 'ByteString' containing UTF-8 encoded text. +-- +-- Surrogate code points in replacement character returned by 'OnDecodeError' +-- will be automatically remapped to the replacement char @U+FFFD@. +decodeUtf8With :: +#if defined(ASSERTS) + HasCallStack => +#endif + OnDecodeError -> ByteString -> Text +decodeUtf8With onErr = decodeUtf8With1 (onErr invalidUtf8Msg . Just) + +invalidUtf8Msg :: String +invalidUtf8Msg = "Data.Text.Encoding: Invalid UTF-8 stream" -- | Decode a 'ByteString' containing UTF-8 encoded text that is known -- to be valid. diff --git a/src/Data/Text/Internal/Encoding.hs b/src/Data/Text/Internal/Encoding.hs index a16ac3bd..41b69902 100644 --- a/src/Data/Text/Internal/Encoding.hs +++ b/src/Data/Text/Internal/Encoding.hs @@ -1,42 +1,28 @@ {-# LANGUAGE BangPatterns, CPP, GeneralizedNewtypeDeriving, MagicHash, UnliftedFFITypes #-} {-# LANGUAGE Trustworthy #-} -{-# LANGUAGE TypeApplications #-} -{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE RankNTypes #-} {-# LANGUAGE ViewPatterns #-} -- | module Data.Text.Internal.Encoding where -import Control.Exception (evaluate, try) import Control.Monad.ST (ST, runST) import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) -import Data.Bifunctor (Bifunctor(first)) -import Data.Bits (shiftL, shiftR, (.&.)) +import Data.Bits (shiftL, shiftR) import Data.ByteString (ByteString) -import qualified Data.ByteString.Short.Internal as SBS -import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode) -import Data.Text.Internal (Text(..), empty) +import Data.Functor (void) +import Data.Text.Internal (Text(..), empty, safe) import Data.Text.Internal.Unsafe (unsafeWithForeignPtr) -import Data.Text.Show as T (singleton) import Data.Text.Unsafe (unsafeDupablePerformIO) import Data.Word (Word32, Word8) import Foreign.C.Types (CSize(..)) -import Foreign.Ptr (Ptr, minusPtr, plusPtr) -import Foreign.Storable (poke, peekByteOff) -import GHC.Exts (byteArrayContents#, unsafeCoerce#) -import GHC.ForeignPtr (ForeignPtr(..), ForeignPtrContents(PlainPtr)) +import Foreign.Ptr (Ptr) import Data.Text.Internal.ByteStringCompat (withBS) import Data.Text.Internal.Encoding.Utf8 - (Utf8CodePointState, utf8StartState, updateUtf8State, isUtf8StateIsComplete) + (Utf8CodePointState, utf8StartState, updateUtf8State, isUtf8StateIsComplete, utf8Length) import qualified Data.ByteString as B -import qualified Data.ByteString.Internal as B -import qualified Data.ByteString.Builder as B -import qualified Data.ByteString.Builder.Internal as B hiding (empty, append) -import qualified Data.ByteString.Builder.Prim as BP -import qualified Data.ByteString.Builder.Prim.Internal as BP import qualified Data.Text.Array as A -import qualified Data.Text.Internal.Encoding.Fusion as E -import qualified Data.Text.Internal.Fusion as F +import qualified Data.Text.Internal.Unsafe.Char as Char #if defined(ASSERTS) import GHC.Stack (HasCallStack) #endif @@ -86,8 +72,8 @@ partUtf8CPUnsafeIndex :: #if defined(ASSERTS) HasCallStack => #endif - Int -> PartialUtf8CodePoint -> Word8 -partUtf8CPUnsafeIndex n (PartialUtf8CodePoint w) = + PartialUtf8CodePoint -> Int -> Word8 +partUtf8CPUnsafeIndex (PartialUtf8CodePoint w) n = #if defined(ASSERTS) assert (0 <= n && n < partUtf8CPLen w) $ #endif @@ -135,50 +121,57 @@ foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8 -- point, and 's' is the resulting 'Utf8State' value, which -- can be used to validate against a following 'ByteString' with -- 'validateNextUtf8Chunk'. -validateUtf8Chunk :: -#if defined(ASSERTS) - HasCallStack => -#endif - ByteString -> (Int, Either Int Utf8State) -validateUtf8Chunk bs@(B.length -> len) +validateUtf8Chunk :: ByteString -> (Int, Maybe Utf8State) +validateUtf8Chunk = validateUtf8ChunkFrom 0 + +-- | Add an offset to the index returned by 'validateUtf8Chunk'. +-- +-- @ +-- validateUtf8ChunkFrom n = first (+ 1) . 'validateUtf8Chunk' . 'B.drop' n +-- @ +validateUtf8ChunkFrom :: Int -> ByteString -> (Int, Maybe Utf8State) +validateUtf8ChunkFrom ofs bs #if defined(SIMDUTF) || MIN_VERSION_bytestring(0,11,2) | guessUtf8Boundary > 0 && -- the rest of the bytestring valid utf-8 up to the boundary ( #ifdef SIMDUTF - withBS bs $ \ fp _ -> unsafeDupablePerformIO $ + withBS (B.drop ofs bs) $ \ fp _ -> unsafeDupablePerformIO $ unsafeWithForeignPtr fp $ \ptr -> (/= 0) <$> c_is_valid_utf8 ptr (fromIntegral guessUtf8Boundary) #else - B.isValidUtf8 $ B.take guessUtf8Boundary bs + B.isValidUtf8 $ B.take guessUtf8Boundary (B.drop ofs bs) #endif - ) = getEndState guessUtf8Boundary + ) = slowValidateUtf8ChunkFrom (ofs + guessUtf8Boundary) bs -- No - | otherwise = getEndState 0 + | otherwise = slowValidateUtf8ChunkFrom ofs bs where - getEndState ndx = validateUtf8 ndx ndx utf8StartState - w n word8 = len >= n && word8 <= (B.index bs $ len - n) + len = B.length bs + isBoundary n word8 = len >= n && word8 <= B.index bs (ofs + len - n) guessUtf8Boundary - | w 3 0xf0 = len - 3 -- third to last char starts a four-byte code point - | w 2 0xe0 = len - 2 -- pre-last char starts a three-or-four-byte code point - | w 1 0xc2 = len - 1 -- last char starts a two-(or more-)byte code point + | isBoundary 3 0xf0 = len - 3 -- third to last char starts a four-byte code point + | isBoundary 2 0xe0 = len - 2 -- pre-last char starts a three-or-four-byte code point + | isBoundary 1 0xc2 = len - 1 -- last char starts a two-(or more-)byte code point | otherwise = len #else - = validateUtf8 0 0 utf8StartState - where + = slowValidateUtf8ChunkFrom ofs bs #endif - validateUtf8 !ndx0 ndx s - | ndx < len = - let ndx' = ndx + 1 in - case updateUtf8State (B.index bs ndx) s of + +-- | A pure Haskell implementation of validateUtf8Chunk. +-- +-- Ideally the primitives 'B.isValidUtf8' or 'c_is_valid_utf8' should give us +-- indices to let us avoid this function. +slowValidateUtf8ChunkFrom :: Int -> ByteString -> (Int, Maybe Utf8State) +slowValidateUtf8ChunkFrom ofs bs = loop ofs ofs utf8StartState + where + loop !utf8End i s + | i < B.length bs = + case updateUtf8State (B.index bs i) s of Just s' -> - validateUtf8 ( - if isUtf8StateIsComplete s' - then ndx' - else ndx0 - ) ndx' s' - Nothing -> (ndx0, Left $ if ndx == ndx0 then ndx' else ndx) - | otherwise = (ndx0, Right $ Utf8State s (partUtf8CPUnsafeAppend partUtf8CPEmpty $ B.drop ndx0 bs)) + let utf8End' = if isUtf8StateIsComplete s' then i + 1 else utf8End + in loop (i + 1) utf8End' s' + Nothing -> (utf8End, Nothing) + | otherwise = (utf8End, Just (Utf8State s (partUtf8CPUnsafeAppend partUtf8CPEmpty (B.drop utf8End bs)))) -- | Validate a 'ByteString' as a contiuation of UTF-8-encoded text. -- @@ -202,94 +195,78 @@ validateUtf8Chunk bs@(B.length -> len) -- at index 'n' are the beginning of an incomplete UTF-8 code -- point, and `s'` is the resulting 'Utf8State' value, which -- can be used to validate against a following 'ByteString'. -validateNextUtf8Chunk :: -#if defined(ASSERTS) - HasCallStack => -#endif - ByteString -> Utf8State -> (Int, Either Int Utf8State) -validateNextUtf8Chunk bs@(B.length -> len) st@(Utf8State s lead) - | len > 0 = - let g pos s' - -- first things first. let's try to get to the start of the next code point - | isUtf8StateIsComplete s' = - -- found the beginning of the next code point, hand this off to someone else - case validateUtf8Chunk $ B.drop pos bs of - (len', mS) -> (pos + len', first (+ pos) mS) - -- code point is not complete yet - -- walk the rest of the code point until error, complete, or no more data - | pos < len = - case updateUtf8State (B.index bs pos) s' of - -- error - Nothing -> (leadPos, Left pos) - -- keep going - Just s'' -> g (pos + 1) s'' - -- no more data - | otherwise = (leadPos, Right $ Utf8State s' (partUtf8CPUnsafeAppend lead bs)) - in g 0 s - | otherwise = (leadPos, Right st) - where leadPos = -(partUtf8CPLen lead) +validateNextUtf8Chunk :: ByteString -> Utf8State -> (Int, Maybe Utf8State) +validateNextUtf8Chunk bs st@(Utf8State s0 part) + | len > 0 = loop 0 s0 + | otherwise = (0, Just st) + where + len = B.length bs + -- Complete an incomplete code point (if there is one) + -- and then jump to validateUtf8ChunkFrom + loop i s + | isUtf8StateIsComplete s = validateUtf8ChunkFrom i bs + | i < len = + case updateUtf8State (B.index bs i) s of + Nothing -> (0, Nothing) + Just s' -> loop (i + 1) s' + | otherwise = (0, Just (Utf8State s (partUtf8CPUnsafeAppend part bs))) --- | Validated UTF-8 data to be converted into a 'Text' value. -data TextDataStack = TextDataStack - { -- | Returns a list of 'Text' and UTF-8-valid 'ByteString' values. - dataStack :: [Either Text (Either PartialUtf8CodePoint ByteString)] - -- | Returns total number of UTF-8 valid bytes in the stack. - , stackLen :: Int +-- | Construct an Array. This is currently an internal data structure which is +-- only used to construct Text (so for example byteStringToStrictBuilder should +-- only be applied to valid UTF-8 bytestrings). +data StrictBuilder = StrictBuilder + { sbLength :: !Int + , sbWrite :: forall s. A.MArray s -> Int -> ST s () } - deriving Show --- | Empty stack -emptyStack :: TextDataStack -emptyStack = TextDataStack [] 0 +emptyStrictBuilder :: StrictBuilder +emptyStrictBuilder = StrictBuilder 0 (\_ _ -> pure ()) --- | Push a text value onto the stack -pushText :: Text -> TextDataStack -> TextDataStack -pushText t@(Text _ _ tLen) tds@(TextDataStack stack sLen) = - if tLen > 0 - then TextDataStack (Left t : stack) $ sLen + tLen - else tds +-- | Right-biased append: run the right action first. This allows a builder to +-- run tail-recursively when accumulating text left-to-right. +appendRStrictBuilder :: StrictBuilder -> StrictBuilder -> StrictBuilder +appendRStrictBuilder (StrictBuilder n1 write1) (StrictBuilder n2 write2) = + StrictBuilder (n1 + n2) (\arr ofs -> do + write2 arr (ofs + n1) + write1 arr ofs) -copyFromStack :: [Either Text (Either PartialUtf8CodePoint ByteString)] -> Int -> A.MArray s -> ST s () -copyFromStack (dat : dataStack') tLen dst = - (case dat of - Left (Text arr0 off utf8Len) -> do - let dstOff = tLen - utf8Len - A.copyI utf8Len dst dstOff arr0 off - pure dstOff - Right encoded -> - case encoded of - Left partial -> do - let utf8Len = partUtf8CPLen partial - dstOff = tLen - utf8Len - g i | i < utf8Len = do - A.unsafeWrite dst (dstOff + i) (partUtf8CPUnsafeIndex i partial) - g (i + 1) - | otherwise = pure () - g dstOff - pure utf8Len - Right bs@(B.length -> utf8Len) -> do - let dstOff = tLen - utf8Len - withBS bs $ \ fp _ -> - unsafeIOToST . unsafeWithForeignPtr fp $ \ src -> - unsafeSTToIO $ A.copyFromPointer dst dstOff src utf8Len - pure dstOff) >>= (\ tLen' -> copyFromStack dataStack' tLen' dst) -copyFromStack _ _ _ = pure () -{-# INLINE copyFromStack #-} +copyFromByteString :: A.MArray s -> Int -> ByteString -> ST s () +copyFromByteString dst ofs src = withBS src $ \ srcFPtr len -> + unsafeIOToST $ unsafeWithForeignPtr srcFPtr $ \ srcPtr -> do + unsafeSTToIO $ A.copyFromPointer dst ofs srcPtr len --- | Create a 'Text' value from the contents of a 'TextDataStack'. -stackToText :: -#if defined(ASSERTS) - HasCallStack => -#endif - TextDataStack -> Text -stackToText (TextDataStack stack sLen) - | sLen > 0 = runST $ - do - dst <- A.new sLen - copyFromStack stack sLen dst - arr <- A.unsafeFreeze dst - pure $ Text arr 0 sLen - | otherwise = empty +byteStringToStrictBuilder :: ByteString -> StrictBuilder +byteStringToStrictBuilder bs = + StrictBuilder (B.length bs) (\arr ofs -> copyFromByteString arr ofs bs) + +charToStrictBuilder :: Char -> StrictBuilder +charToStrictBuilder c = + StrictBuilder (utf8Length c) (\arr ofs -> void (Char.unsafeWrite arr ofs (safe c))) + +word8ToStrictBuilder :: Word8 -> StrictBuilder +word8ToStrictBuilder w = + StrictBuilder 1 (\arr ofs -> A.unsafeWrite arr ofs w) + +partUtf8CPFoldr :: (Word8 -> a -> a) -> a -> PartialUtf8CodePoint -> a +partUtf8CPFoldr f x0 c = case partUtf8CPLen c of + 0 -> x0 + 1 -> build 0 x0 + 2 -> build 0 (build 1 x0) + _ -> build 0 (build 1 (build 2 x0)) + where + build i x = f (partUtf8CPUnsafeIndex c i) x + +partUtf8CPToStrictBuilder :: PartialUtf8CodePoint -> StrictBuilder +partUtf8CPToStrictBuilder = + partUtf8CPFoldr (appendRStrictBuilder . word8ToStrictBuilder) emptyStrictBuilder + +strictBuilderToText :: StrictBuilder -> Text +strictBuilderToText (StrictBuilder 0 _) = empty +strictBuilderToText (StrictBuilder n write) = runST (do + dst <- A.new n + write dst 0 + arr <- A.unsafeFreeze dst + pure (Text arr 0 n)) -- | Decode a 'ByteString' in the context of what has been already been decoded. -- @@ -300,103 +277,77 @@ stackToText (TextDataStack stack sLen) -- (@Right Utf8State@) or the position of the of the first (potentially) -- valid byte after the invalid bytes with remainder of the 'ByteString' -- (@Left (Int, ByteString)@). -decodeNextUtf8Chunk :: -#if defined(ASSERTS) - HasCallStack => -#endif - ByteString - -> Utf8State - -> TextDataStack - -> ((Int, Either (Int, ByteString) Utf8State), TextDataStack) -decodeNextUtf8Chunk bs s tds@(TextDataStack stack sLen) = +decodeUtf8Chunk :: ByteString -> Utf8State -> (StrictBuilder, ByteString, Maybe Utf8State) +decodeUtf8Chunk bs s = case validateNextUtf8Chunk bs s of - (len, res) -> - let stackedData' - | len >= 0 = - let partCP = partialUtf8CodePoint s - partLen = partUtf8CPLen partCP - stackedData@(TextDataStack stack' sLen') = - if partLen > 0 - then TextDataStack (Right (Left partCP) : stack) $ sLen + partLen - else tds - in - if len > 0 - then TextDataStack (Right (Right $ B.take len bs) : stack') $ sLen' + len - else stackedData - | otherwise = tds - in - ( ( len - , case res of - Left pos -> Left (pos, B.drop pos bs) - Right s' -> Right s' - ) - , stackedData' - ) + (len, s') -> + let builder | len == 0 = emptyStrictBuilder + | otherwise = partUtf8CPToStrictBuilder (partialUtf8CodePoint s) + `appendRStrictBuilder` byteStringToStrictBuilder (B.take len bs) + in (builder, B.drop len bs, s') -- | Decode a 'ByteString' against a start 'Utf8State' with an empty -- 'TextDataStack'. -- -- @decodeUtf8Chunk bs = 'decodeNextUtf8Chunk' bs 'startUtf8State' 'emptyStack'@ -decodeUtf8Chunk :: ByteString -> ((Int, Either (Int, ByteString) Utf8State), TextDataStack) -decodeUtf8Chunk bs = decodeNextUtf8Chunk bs startUtf8State emptyStack +decodeUtf8ChunkStart :: ByteString -> (StrictBuilder, ByteString, Maybe Utf8State) +decodeUtf8ChunkStart bs = decodeUtf8Chunk bs startUtf8State -- | Call an error handler with the give 'String' message for each byte -- in given 'ByteString' and lead data in the given 'Utf8State' -- value. The bytes are the positions from 'errStart' (inclusive) to -- 'errEnd' (exclusive). Any substitute characters are pushed onto the -- supplied 'TextDataStack' argument. -handleUtf8Err - :: OnDecodeError - -> String - -> Int - -> Int - -> Utf8State - -> ByteString - -> TextDataStack - -> TextDataStack -handleUtf8Err onErr errMsg errStart errEnd s bs tds = - let h errPos tds' - | errPos < errEnd = - h (errPos + 1) $ - case onErr errMsg . Just $ B.index bs errPos of - Just c -> pushText (T.singleton c) tds' - Nothing -> tds' - | otherwise = tds' - in - ( if errStart < 0 - then - let partCP = partialUtf8CodePoint s - partCPLen = partUtf8CPLen partCP - g i tds' - | i < partCPLen = g (i + 1) $ - case onErr errMsg (Just (partUtf8CPUnsafeIndex i partCP)) of - Just c -> pushText (T.singleton c) tds' - Nothing -> tds' - | otherwise = h (partCPLen + errStart) tds' - in - g 0 - else h errStart - ) tds +skipIncomplete :: (Word8 -> Maybe Char) -> Utf8State -> StrictBuilder +skipIncomplete onErr s = + partUtf8CPFoldr + (appendRStrictBuilder . handleUtf8Error onErr) + emptyStrictBuilder (partialUtf8CodePoint s) -invalidUtf8Msg :: String -invalidUtf8Msg = "Data.Text.Internal.Encoding: Invalid UTF-8 stream" +handleUtf8Error :: (Word8 -> Maybe Char) -> Word8 -> StrictBuilder +handleUtf8Error onErr w = case onErr w of + Just c -> charToStrictBuilder c + Nothing -> emptyStrictBuilder --- | Decode a 'ByteString' containing UTF-8 encoded text. --- --- Surrogate code points in replacement character returned by 'OnDecodeError' --- will be automatically remapped to the replacement char @U+FFFD@. -decodeUtf8With :: +decodeUtf8With1 :: +#if defined(ASSERTS) + HasCallStack => +#endif + (Word8 -> Maybe Char) -> ByteString -> Text +decodeUtf8With1 onErr bs0 = strictBuilderToText $ + builder `appendRStrictBuilder` skipIncomplete onErr s + where + (builder, _, s) = decodeUtf8With2 onErr bs0 startUtf8State + +-- | Helper for 'decodeUtf8With' and 'streamDecodeUtf8With'. +-- This uses an 'OnDecodeError' to process bad bytes. +-- This is not a very pretty legacy API. +-- See 'decodeUtf8Chunk' for a more flexible alternative. +decodeUtf8With2 :: #if defined(ASSERTS) HasCallStack => #endif - OnDecodeError -> ByteString -> Text -decodeUtf8With onErr bs = - let g bs' res = - case res of - ((len, eS), tds) -> - let h msg pos s = handleUtf8Err onErr msg len pos s bs' tds in - case eS of - Left (pos, bs'') -> g bs'' . decodeNextUtf8Chunk bs'' startUtf8State $ h invalidUtf8Msg pos startUtf8State - Right s -> stackToText $ h "Data.Text.Internal.Encoding: Incomplete UTF-8 code point" (B.length bs') s - in - g bs $ decodeUtf8Chunk bs + (Word8 -> Maybe Char) -> ByteString -> Utf8State -> (StrictBuilder, ByteString, Utf8State) +decodeUtf8With2 onErr bs0 s0 = loop bs0 s0 emptyStrictBuilder + where + loop bs s !builder = + let nonEmptyPrefix len = builder + `appendRStrictBuilder` partUtf8CPToStrictBuilder (partialUtf8CodePoint s) + `appendRStrictBuilder` byteStringToStrictBuilder (B.take len bs) + in case validateNextUtf8Chunk bs s of + (len, Nothing) -> + if len == 0 && partUtf8CPLen (partialUtf8CodePoint s) == 0 + then + -- loop is strict on builder, so if onErr raises an error it will be forced here. + let builder' = builder `appendRStrictBuilder` skipIncomplete onErr s + in loop bs startUtf8State builder' + else + let builder' = nonEmptyPrefix len + `appendRStrictBuilder` handleUtf8Error onErr (B.index bs len) + in loop (B.drop 1 bs) startUtf8State builder' + (len, Just s') -> + let builder' = + if len == 0 + then builder `appendRStrictBuilder` skipIncomplete onErr s' + else nonEmptyPrefix len `appendRStrictBuilder` skipIncomplete onErr s' + in (builder', B.drop len bs, s') diff --git a/src/Data/Text/Internal/StrictBuilder.hs b/src/Data/Text/Internal/StrictBuilder.hs new file mode 100644 index 00000000..e69de29b diff --git a/src/Data/Text/Lazy/Encoding.hs b/src/Data/Text/Lazy/Encoding.hs index 418e473e..a50f9d9e 100644 --- a/src/Data/Text/Lazy/Encoding.hs +++ b/src/Data/Text/Lazy/Encoding.hs @@ -61,12 +61,12 @@ import Data.Monoid (Monoid(..)) import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode) import Data.Text.Internal.Lazy (Text(..), chunk, empty, foldrChunks) import Data.Word (Word8) -import qualified Data.ByteString as S import qualified Data.ByteString.Builder as B import qualified Data.ByteString.Builder.Prim as BP import qualified Data.ByteString.Lazy as B import qualified Data.ByteString.Lazy.Internal as B import qualified Data.Text.Encoding as TE +import qualified Data.Text.Internal.Encoding as TE import qualified Data.Text.Internal.Lazy.Encoding.Fusion as E import qualified Data.Text.Internal.Lazy.Fusion as F import Data.Text.Unsafe (unsafeDupablePerformIO) @@ -106,22 +106,12 @@ decodeLatin1 = foldr (chunk . TE.decodeLatin1) empty . B.toChunks -- | Decode a 'ByteString' containing UTF-8 encoded text. decodeUtf8With :: OnDecodeError -> B.ByteString -> Text -decodeUtf8With onErr (B.Chunk b0 bs0) = - let g bs@(S.length -> bLen) lbs s tds diffText = - case TE.decodeNextUtf8Chunk bs s tds of - ((len, eS), tds') -> - let h errMsg pos s' = TE.handleUtf8Err onErr errMsg len pos s' bs tds' in - case eS of - Left (pos, bs') -> g bs' lbs TE.startUtf8State - (h "Data.Text.Internal.Encoding: Invalid UTF-8 stream" pos s) diffText - Right s' -> - case lbs of - B.Chunk bs' lbs' -> - g bs' lbs' s' TE.emptyStack $ diffText . chunk (TE.stackToText tds') - B.Empty -> diffText $ chunk (TE.stackToText $ h "Data.Text.Internal.Encoding: Incomplete UTF-8 code point" bLen s') Empty - in - g b0 bs0 TE.startUtf8State TE.emptyStack id -decodeUtf8With _ _ = empty +decodeUtf8With onErr = loop TE.startUtf8State + where + loop s (B.Chunk b0 bs0) = case TE.decodeUtf8With2 onErr' b0 s of + (builder, _, s') -> Chunk (TE.strictBuilderToText builder) (loop s' bs0) + loop s B.Empty = Chunk (TE.strictBuilderToText (TE.skipIncomplete onErr' s)) Empty + onErr' = onErr "Data.Text.Internal.Encoding: Invalid UTF-8 stream" . Just -- | Decode a 'ByteString' containing UTF-8 encoded text that is known -- to be valid. From 1a96f2df03e8da76f5bd7663b98f64636013177e Mon Sep 17 00:00:00 2001 From: Li-yao Xia Date: Fri, 3 Feb 2023 19:14:15 +0000 Subject: [PATCH 64/87] Fix bugs, fix tests --- src/Data/Text/Encoding.hs | 4 +- src/Data/Text/Internal/Encoding.hs | 151 ++++++++------ src/Data/Text/Lazy/Encoding.hs | 8 +- tests/Tests/Properties/Transcoding.hs | 285 +++++++++++++------------- tests/Tests/QuickCheckUtils.hs | 8 - 5 files changed, 237 insertions(+), 219 deletions(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index d24942a1..6e52d24d 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -27,7 +27,7 @@ module Data.Text.Encoding -- $validation Utf8State , validateUtf8Chunk - , validateNextUtf8Chunk + , validateUtf8More , startUtf8State -- * Decoding ByteStrings to Text @@ -318,7 +318,7 @@ streamDecodeUtf8With :: streamDecodeUtf8With onErr = loop startUtf8State where loop s chunk = - let (builder, undecoded, s') = decodeUtf8With2 (onErr invalidUtf8Msg . Just) chunk s + let (builder, undecoded, s') = decodeUtf8With2 (onErr invalidUtf8Msg . Just) s chunk in Some (strictBuilderToText builder) undecoded (loop s') -- | Decode a 'ByteString' containing UTF-8 encoded text. diff --git a/src/Data/Text/Internal/Encoding.hs b/src/Data/Text/Internal/Encoding.hs index 41b69902..7ae90d8a 100644 --- a/src/Data/Text/Internal/Encoding.hs +++ b/src/Data/Text/Internal/Encoding.hs @@ -8,7 +8,7 @@ module Data.Text.Internal.Encoding where import Control.Monad.ST (ST, runST) import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) -import Data.Bits (shiftL, shiftR) +import Data.Bits ((.&.), shiftL, shiftR) import Data.ByteString (ByteString) import Data.Functor (void) import Data.Text.Internal (Text(..), empty, safe) @@ -17,10 +17,12 @@ import Data.Text.Unsafe (unsafeDupablePerformIO) import Data.Word (Word32, Word8) import Foreign.C.Types (CSize(..)) import Foreign.Ptr (Ptr) +import Foreign.Storable (pokeElemOff) import Data.Text.Internal.ByteStringCompat (withBS) import Data.Text.Internal.Encoding.Utf8 (Utf8CodePointState, utf8StartState, updateUtf8State, isUtf8StateIsComplete, utf8Length) import qualified Data.ByteString as B +import qualified Data.ByteString.Internal as BI import qualified Data.Text.Array as A import qualified Data.Text.Internal.Unsafe.Char as Char #if defined(ASSERTS) @@ -40,64 +42,91 @@ import qualified Data.ByteString.Unsafe as B data Utf8State = Utf8State { -- | Current UTF-8 code point state of the 'ByteString's -- that have been validated thus far. - utf8CodePointState :: Utf8CodePointState + utf8CodePointState :: {-# UNPACK #-} !Utf8CodePointState -- | Get the incomplete UTF-8 code point of the 'ByteString's that -- have been validated thus far. The first byte of the 'Word32' -- indicates the number of bytes of the code point are available, -- and is followed by the bytes of the code point. - , partialUtf8CodePoint :: PartialUtf8CodePoint + , partialUtf8CodePoint :: {-# UNPACK #-} !PartialUtf8CodePoint } deriving (Eq, Ord, Show) -- | This represtents the starting state of a UTF-8 validation check. startUtf8State :: Utf8State -startUtf8State = Utf8State utf8StartState partUtf8CPEmpty +startUtf8State = Utf8State utf8StartState partUtf8Empty --- | Prefix of a valid UTF-8 encoded code point. +-- | Prefix of a valid UTF-8 encoded code point encoded in 4 bytes. -- This consists of a length (in bytes) between 1 and 3 stored in the most -- significant byte, and the actual bytes in the rest of the word. +-- +-- All of its operations are the functions below. +-- The constructor should never be used outside of those. newtype PartialUtf8CodePoint = PartialUtf8CodePoint Word32 deriving (Eq, Ord, Show) -- | Empty prefix. -partUtf8CPEmpty :: PartialUtf8CodePoint -partUtf8CPEmpty = PartialUtf8CodePoint 0 +partUtf8Empty :: PartialUtf8CodePoint +partUtf8Empty = PartialUtf8CodePoint 0 -- | Length of the partial code point, stored in the most significant byte. -partUtf8CPLen :: PartialUtf8CodePoint -> Int -partUtf8CPLen (PartialUtf8CodePoint w) = fromIntegral $ w `shiftR` 24 +partUtf8Len :: PartialUtf8CodePoint -> Int +partUtf8Len (PartialUtf8CodePoint w) = fromIntegral $ w `shiftR` 24 + +partUtf8CompleteLen :: PartialUtf8CodePoint -> Int +partUtf8CompleteLen c@(PartialUtf8CodePoint w) + | partUtf8Len c == 0 = 0 + | 0xf0 <= firstByte = 4 + | 0xe0 <= firstByte = 3 + | 0xc2 <= firstByte = 2 + | otherwise = 0 + where + firstByte = (w `shiftR` 16) .&. 255 --- | Get the @n@-th byte, assuming it is within bounds: @0 <= n < partUtf8CPLen c@. -partUtf8CPUnsafeIndex :: +-- | Get the @n@-th byte, assuming it is within bounds: @0 <= n < partUtf8Len c@. +partUtf8UnsafeIndex :: #if defined(ASSERTS) HasCallStack => #endif PartialUtf8CodePoint -> Int -> Word8 -partUtf8CPUnsafeIndex (PartialUtf8CodePoint w) n = +partUtf8UnsafeIndex (PartialUtf8CodePoint w) n = #if defined(ASSERTS) - assert (0 <= n && n < partUtf8CPLen w) $ + assert (0 <= n && n < partUtf8Len w) $ #endif fromIntegral $ w `shiftR` (16 - 8 * n) -partUtf8CPUnsafeAppend :: +partUtf8UnsafeAppend :: #if defined(ASSERTS) HasCallStack => #endif PartialUtf8CodePoint -> ByteString -> PartialUtf8CodePoint -partUtf8CPUnsafeAppend c@(PartialUtf8CodePoint word) bs = +partUtf8UnsafeAppend c@(PartialUtf8CodePoint word) bs = #if defined(ASSERTS) assert (lenc + lenbs <= 3) $ #endif PartialUtf8CodePoint $ tryPush 0 $ tryPush 1 $ tryPush 2 $ word + (fromIntegral lenbs `shiftL` 24) where - lenc = partUtf8CPLen c + lenc = partUtf8Len c lenbs = B.length bs tryPush i w = if i < lenbs then w + (fromIntegral (B.index bs i) `shiftL` fromIntegral (16 - 8 * (lenc + i))) else w +-- | This avoids recursion to unfold to straightline code. +partUtf8Foldr :: (Word8 -> a -> a) -> a -> PartialUtf8CodePoint -> a +partUtf8Foldr f x0 c = case partUtf8Len c of + 0 -> x0 + 1 -> build 0 x0 + 2 -> build 0 (build 1 x0) + _ -> build 0 (build 1 (build 2 x0)) + where + build i x = f (partUtf8UnsafeIndex c i) x + +partUtf8ToByteString :: PartialUtf8CodePoint -> B.ByteString +partUtf8ToByteString c = BI.unsafeCreate (partUtf8Len c) $ \ptr -> + partUtf8Foldr (\w k i -> pokeElemOff ptr i w >> k (i+1)) (\_ -> pure ()) c 0 + #ifdef SIMDUTF foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8 :: Ptr Word8 -> CSize -> IO CInt @@ -105,7 +134,7 @@ foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8 -- | Validate a 'ByteString' as a UTF-8-encoded text. -- --- @validateUtf8Chunk chunk = (n, es)@ +-- @validateUtf8More chunk = (n, es)@ -- -- This function returns two values: -- @@ -124,10 +153,10 @@ foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8 validateUtf8Chunk :: ByteString -> (Int, Maybe Utf8State) validateUtf8Chunk = validateUtf8ChunkFrom 0 --- | Add an offset to the index returned by 'validateUtf8Chunk'. +-- | Add an offset to the index returned by 'validateUtf8More'. -- -- @ --- validateUtf8ChunkFrom n = first (+ 1) . 'validateUtf8Chunk' . 'B.drop' n +-- validateUtf8ChunkFrom n = first (+ 1) . 'validateUtf8More' . 'B.drop' n -- @ validateUtf8ChunkFrom :: Int -> ByteString -> (Int, Maybe Utf8State) validateUtf8ChunkFrom ofs bs @@ -146,7 +175,7 @@ validateUtf8ChunkFrom ofs bs -- No | otherwise = slowValidateUtf8ChunkFrom ofs bs where - len = B.length bs + len = B.length bs - ofs isBoundary n word8 = len >= n && word8 <= B.index bs (ofs + len - n) guessUtf8Boundary | isBoundary 3 0xf0 = len - 3 -- third to last char starts a four-byte code point @@ -157,7 +186,7 @@ validateUtf8ChunkFrom ofs bs = slowValidateUtf8ChunkFrom ofs bs #endif --- | A pure Haskell implementation of validateUtf8Chunk. +-- | A pure Haskell implementation of validateUtf8More. -- -- Ideally the primitives 'B.isValidUtf8' or 'c_is_valid_utf8' should give us -- indices to let us avoid this function. @@ -169,9 +198,9 @@ slowValidateUtf8ChunkFrom ofs bs = loop ofs ofs utf8StartState case updateUtf8State (B.index bs i) s of Just s' -> let utf8End' = if isUtf8StateIsComplete s' then i + 1 else utf8End - in loop (i + 1) utf8End' s' + in loop utf8End' (i + 1) s' Nothing -> (utf8End, Nothing) - | otherwise = (utf8End, Just (Utf8State s (partUtf8CPUnsafeAppend partUtf8CPEmpty (B.drop utf8End bs)))) + | otherwise = (utf8End, Just (Utf8State s (partUtf8UnsafeAppend partUtf8Empty (B.drop utf8End bs)))) -- | Validate a 'ByteString' as a contiuation of UTF-8-encoded text. -- @@ -195,27 +224,27 @@ slowValidateUtf8ChunkFrom ofs bs = loop ofs ofs utf8StartState -- at index 'n' are the beginning of an incomplete UTF-8 code -- point, and `s'` is the resulting 'Utf8State' value, which -- can be used to validate against a following 'ByteString'. -validateNextUtf8Chunk :: ByteString -> Utf8State -> (Int, Maybe Utf8State) -validateNextUtf8Chunk bs st@(Utf8State s0 part) +validateUtf8More :: Utf8State -> ByteString -> (Int, Maybe Utf8State) +validateUtf8More st@(Utf8State s0 part) bs | len > 0 = loop 0 s0 | otherwise = (0, Just st) where len = B.length bs -- Complete an incomplete code point (if there is one) -- and then jump to validateUtf8ChunkFrom - loop i s + loop !i s | isUtf8StateIsComplete s = validateUtf8ChunkFrom i bs | i < len = case updateUtf8State (B.index bs i) s of Nothing -> (0, Nothing) Just s' -> loop (i + 1) s' - | otherwise = (0, Just (Utf8State s (partUtf8CPUnsafeAppend part bs))) + | otherwise = (0, Just (Utf8State s (partUtf8UnsafeAppend part bs))) -- | Construct an Array. This is currently an internal data structure which is -- only used to construct Text (so for example byteStringToStrictBuilder should -- only be applied to valid UTF-8 bytestrings). data StrictBuilder = StrictBuilder - { sbLength :: !Int + { sbLength :: {-# UNPACK #-} !Int , sbWrite :: forall s. A.MArray s -> Int -> ST s () } @@ -247,18 +276,9 @@ word8ToStrictBuilder :: Word8 -> StrictBuilder word8ToStrictBuilder w = StrictBuilder 1 (\arr ofs -> A.unsafeWrite arr ofs w) -partUtf8CPFoldr :: (Word8 -> a -> a) -> a -> PartialUtf8CodePoint -> a -partUtf8CPFoldr f x0 c = case partUtf8CPLen c of - 0 -> x0 - 1 -> build 0 x0 - 2 -> build 0 (build 1 x0) - _ -> build 0 (build 1 (build 2 x0)) - where - build i x = f (partUtf8CPUnsafeIndex c i) x - -partUtf8CPToStrictBuilder :: PartialUtf8CodePoint -> StrictBuilder -partUtf8CPToStrictBuilder = - partUtf8CPFoldr (appendRStrictBuilder . word8ToStrictBuilder) emptyStrictBuilder +partUtf8ToStrictBuilder :: PartialUtf8CodePoint -> StrictBuilder +partUtf8ToStrictBuilder = + partUtf8Foldr (appendRStrictBuilder . word8ToStrictBuilder) emptyStrictBuilder strictBuilderToText :: StrictBuilder -> Text strictBuilderToText (StrictBuilder 0 _) = empty @@ -277,21 +297,21 @@ strictBuilderToText (StrictBuilder n write) = runST (do -- (@Right Utf8State@) or the position of the of the first (potentially) -- valid byte after the invalid bytes with remainder of the 'ByteString' -- (@Left (Int, ByteString)@). -decodeUtf8Chunk :: ByteString -> Utf8State -> (StrictBuilder, ByteString, Maybe Utf8State) -decodeUtf8Chunk bs s = - case validateNextUtf8Chunk bs s of +decodeUtf8More :: Utf8State -> ByteString -> (StrictBuilder, ByteString, Maybe Utf8State) +decodeUtf8More s bs = + case validateUtf8More s bs of (len, s') -> let builder | len == 0 = emptyStrictBuilder - | otherwise = partUtf8CPToStrictBuilder (partialUtf8CodePoint s) + | otherwise = partUtf8ToStrictBuilder (partialUtf8CodePoint s) `appendRStrictBuilder` byteStringToStrictBuilder (B.take len bs) in (builder, B.drop len bs, s') -- | Decode a 'ByteString' against a start 'Utf8State' with an empty -- 'TextDataStack'. -- --- @decodeUtf8Chunk bs = 'decodeNextUtf8Chunk' bs 'startUtf8State' 'emptyStack'@ -decodeUtf8ChunkStart :: ByteString -> (StrictBuilder, ByteString, Maybe Utf8State) -decodeUtf8ChunkStart bs = decodeUtf8Chunk bs startUtf8State +-- @decodeUtf8More bs = 'decodeNextUtf8Chunk' bs 'startUtf8State' 'emptyStack'@ +decodeUtf8Chunk :: ByteString -> (StrictBuilder, ByteString, Maybe Utf8State) +decodeUtf8Chunk = decodeUtf8More startUtf8State -- | Call an error handler with the give 'String' message for each byte -- in given 'ByteString' and lead data in the given 'Utf8State' @@ -300,7 +320,7 @@ decodeUtf8ChunkStart bs = decodeUtf8Chunk bs startUtf8State -- supplied 'TextDataStack' argument. skipIncomplete :: (Word8 -> Maybe Char) -> Utf8State -> StrictBuilder skipIncomplete onErr s = - partUtf8CPFoldr + partUtf8Foldr (appendRStrictBuilder . handleUtf8Error onErr) emptyStrictBuilder (partialUtf8CodePoint s) @@ -309,6 +329,7 @@ handleUtf8Error onErr w = case onErr w of Just c -> charToStrictBuilder c Nothing -> emptyStrictBuilder +-- | Helper for 'decodeUtfWith'. decodeUtf8With1 :: #if defined(ASSERTS) HasCallStack => @@ -317,37 +338,37 @@ decodeUtf8With1 :: decodeUtf8With1 onErr bs0 = strictBuilderToText $ builder `appendRStrictBuilder` skipIncomplete onErr s where - (builder, _, s) = decodeUtf8With2 onErr bs0 startUtf8State + (builder, _, s) = decodeUtf8With2 onErr startUtf8State bs0 -- | Helper for 'decodeUtf8With' and 'streamDecodeUtf8With'. -- This uses an 'OnDecodeError' to process bad bytes. -- This is not a very pretty legacy API. --- See 'decodeUtf8Chunk' for a more flexible alternative. +-- See 'decodeUtf8More' for a more flexible alternative. decodeUtf8With2 :: #if defined(ASSERTS) HasCallStack => #endif - (Word8 -> Maybe Char) -> ByteString -> Utf8State -> (StrictBuilder, ByteString, Utf8State) -decodeUtf8With2 onErr bs0 s0 = loop bs0 s0 emptyStrictBuilder + (Word8 -> Maybe Char) -> Utf8State -> ByteString -> (StrictBuilder, ByteString, Utf8State) +decodeUtf8With2 onErr s0 bs = loop s0 0 emptyStrictBuilder where - loop bs s !builder = + loop s i !builder = let nonEmptyPrefix len = builder - `appendRStrictBuilder` partUtf8CPToStrictBuilder (partialUtf8CodePoint s) - `appendRStrictBuilder` byteStringToStrictBuilder (B.take len bs) - in case validateNextUtf8Chunk bs s of + `appendRStrictBuilder` partUtf8ToStrictBuilder (partialUtf8CodePoint s) + `appendRStrictBuilder` byteStringToStrictBuilder (B.take len (B.drop i bs)) + in case validateUtf8More s (B.drop i bs) of (len, Nothing) -> - if len == 0 && partUtf8CPLen (partialUtf8CodePoint s) == 0 + if len == 0 && utf8CodePointState s /= utf8StartState then -- loop is strict on builder, so if onErr raises an error it will be forced here. let builder' = builder `appendRStrictBuilder` skipIncomplete onErr s - in loop bs startUtf8State builder' + in loop startUtf8State i builder' else let builder' = nonEmptyPrefix len - `appendRStrictBuilder` handleUtf8Error onErr (B.index bs len) - in loop (B.drop 1 bs) startUtf8State builder' + `appendRStrictBuilder` handleUtf8Error onErr (B.index bs (i + len)) + in loop startUtf8State (i + len + 1) builder' (len, Just s') -> - let builder' = - if len == 0 - then builder `appendRStrictBuilder` skipIncomplete onErr s' - else nonEmptyPrefix len `appendRStrictBuilder` skipIncomplete onErr s' - in (builder', B.drop len bs, s') + let builder' = if len == 0 then builder else nonEmptyPrefix len + undecoded = if B.length bs - i - len == partUtf8Len (partialUtf8CodePoint s') + then B.drop (i + len) bs -- Reuse bs if possible + else partUtf8ToByteString (partialUtf8CodePoint s') + in (builder', undecoded, s') diff --git a/src/Data/Text/Lazy/Encoding.hs b/src/Data/Text/Lazy/Encoding.hs index a50f9d9e..1def3f51 100644 --- a/src/Data/Text/Lazy/Encoding.hs +++ b/src/Data/Text/Lazy/Encoding.hs @@ -108,9 +108,11 @@ decodeLatin1 = foldr (chunk . TE.decodeLatin1) empty . B.toChunks decodeUtf8With :: OnDecodeError -> B.ByteString -> Text decodeUtf8With onErr = loop TE.startUtf8State where - loop s (B.Chunk b0 bs0) = case TE.decodeUtf8With2 onErr' b0 s of - (builder, _, s') -> Chunk (TE.strictBuilderToText builder) (loop s' bs0) - loop s B.Empty = Chunk (TE.strictBuilderToText (TE.skipIncomplete onErr' s)) Empty + chunkb builder t | TE.sbLength builder == 0 = t + | otherwise = Chunk (TE.strictBuilderToText builder) t + loop s (B.Chunk b bs) = case TE.decodeUtf8With2 onErr' s b of + (builder, _, s') -> chunkb builder (loop s' bs) + loop s B.Empty = chunkb (TE.skipIncomplete onErr' s) Empty onErr' = onErr "Data.Text.Internal.Encoding: Invalid UTF-8 stream" . Just -- | Decode a 'ByteString' containing UTF-8 encoded text that is known diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs index d75cfaaa..8e7a614e 100644 --- a/tests/Tests/Properties/Transcoding.hs +++ b/tests/Tests/Properties/Transcoding.hs @@ -11,10 +11,11 @@ module Tests.Properties.Transcoding import Prelude hiding (head, tail) import Data.Bits ((.&.), shiftR) import Data.Char (chr, ord) -import Data.Either (isLeft, isRight) +import Data.Functor (void) import Test.QuickCheck hiding ((.&.)) import Test.Tasty (TestTree, testGroup) import Test.Tasty.QuickCheck (testProperty) +import Test.Tasty.HUnit ((@?=), assertBool, assertFailure, testCase) import Tests.QuickCheckUtils import qualified Control.Exception as Exception import qualified Data.Bits as Bits (shiftL, shiftR) @@ -28,12 +29,10 @@ import qualified Data.ByteString.Lazy.Char8 as BLC import qualified Data.Text as T import qualified Data.Text.Encoding as E import qualified Data.Text.Encoding.Error as E -import Data.Text.Internal.Encoding.Utf8 (isUtf8StateIsComplete) import qualified Data.Text.Internal.Encoding as E import qualified Data.Text.Lazy as TL import qualified Data.Text.Lazy.Encoding as EL --- import qualified Data.Text.Internal.Lazy as BL --- import qualified Data.ByteString.Lazy.Internal as BS +import Data.Word (Word8) t_ascii t = E.decodeASCII (E.encodeUtf8 a) === a where a = T.map (\c -> chr (ord c `mod` 128)) t @@ -43,71 +42,69 @@ tl_ascii t = EL.decodeASCII (EL.encodeUtf8 a) === a t_latin1 = E.decodeLatin1 `eq` (T.pack . BC.unpack) tl_latin1 = EL.decodeLatin1 `eq` (TL.pack . BLC.unpack) -t_p_utf8_1 = case E.validateUtf8Chunk (B.pack [0x63]) of - (result, st) -> whenEqProp result 1 . property $ isRight st -t_p_utf8_2 = case E.validateUtf8Chunk (B.pack [0x63, 0x63, 0x63]) of - (result, st) -> whenEqProp result 3 . property $ isRight st -t_p_utf8_3 = case E.validateUtf8Chunk (B.pack [0x63, 0x63, 0xc2, 0x80, 0x63]) of - (result, st) -> whenEqProp result 5 . property $ isRight st -t_p_utf8_4 = case E.validateUtf8Chunk (B.pack [0x63, 0xe1, 0x80, 0x80, 0x63]) of - (result, st) -> whenEqProp result 5 . property $ isRight st -t_p_utf8_5 = case E.validateUtf8Chunk (B.pack [0xF0, 0x90, 0x80, 0x80, 0x63]) of - (result, st) -> whenEqProp result 5 . property $ isRight st -t_p_utf8_6 = case E.validateUtf8Chunk (B.pack [0x63, 0x63, 0xF0, 0x90, 0x80]) of - (result, st) -> whenEqProp result 2 . property $ isRight st -t_p_utf8_7 = case E.validateUtf8Chunk (B.pack [0x63, 0x63, 0x63, 0xF0, 0x90]) of - (result, st) -> whenEqProp result 3 . property $ isRight st -t_p_utf8_8 = case E.validateUtf8Chunk (B.pack [0xF0, 0x90, 0x80, 0x63, 0x63]) of - (result, st) -> whenEqProp result 0 $ st === Left 3 -t_p_utf8_9 = case E.validateUtf8Chunk (B.pack [0x63, 0x63, 0x80, 0x63, 0x63]) of - (result, st) -> whenEqProp result 2 $ st === Left 3 -t_p_utf8_0 = case E.validateUtf8Chunk (B.pack [0x63, 0x63, 0xe1, 0x63, 0x63]) of - (result, st) -> whenEqProp result 2 $ st === Left 3 - -t_pn_utf8_1 = case E.validateUtf8Chunk (B.pack [0xF0, 0x90, 0x80]) of - (result0, mS) -> whenEqProp result0 0 $ - case mS of - Left _ -> counterexample (show mS) False - Right s -> case E.validateNextUtf8Chunk (B.pack [0x80]) s of - (result1, mS1) -> whenEqProp result1 1 $ - if isLeft mS1 - then counterexample (show mS1) False - else case E.validateNextUtf8Chunk (B.pack [0x7f]) s of - (result2, mS2) -> whenEqProp result2 (-3) $ mS2 === Left 0 -t_pn_utf8_2 = case E.validateUtf8Chunk (B.pack [0xF0]) of - (result0, mS0) -> whenEqProp result0 0 $ - case mS0 of - Left _ -> counterexample (show mS0) False - Right s0 -> case E.validateNextUtf8Chunk (B.pack [0x7f]) s0 of - (result1, mS1) -> whenEqProp result1 (-1) . - whenEqProp mS1 (Left 0) $ - case E.validateNextUtf8Chunk (B.pack [0x90]) s0 of - (result2, mS2) -> whenEqProp result2 (-1) $ - case mS2 of - Left _ -> counterexample (show mS2) False - Right s1 -> case E.validateNextUtf8Chunk (B.pack [0x7f]) s1 of - (result3, mS3) -> whenEqProp result3 (-2) . - whenEqProp mS3 (Left 0) $ - case E.validateNextUtf8Chunk (B.pack [0x80]) s1 of - (result4, mS4) -> whenEqProp result4 (-2) $ - case mS4 of - Left _ -> counterexample (show mS3) False - Right s2 -> case E.validateNextUtf8Chunk (B.pack [0x7f]) s2 of - (result5, mS5) -> whenEqProp result5 (-3) . - whenEqProp mS5 (Left 0) $ - case E.validateNextUtf8Chunk (B.pack [0x80]) s2 of - (result6, mS6) -> whenEqProp result6 1 $ - property $ isRight mS6 -t_pn_utf8_3 = case E.validateUtf8Chunk $ B.pack [0xc2] of - (len1, eS1) -> whenEqProp len1 0 $ case eS1 of - Left _ -> counterexample (show eS1) False - Right s1 -> E.partialUtf8CodePoint s1 === E.PartialUtf8CodePoint 0x01c20000 .&&. - if isUtf8StateIsComplete $ E.utf8CodePointState s1 - then counterexample (show $ E.utf8CodePointState s1) False - else case E.validateNextUtf8Chunk (B.pack [0x80, 0x80]) s1 of - (len2, eS2) -> len2 === 1 .&&. eS2 === Left 2 - -t_utf8_c = (E.stackToText . snd . E.decodeUtf8Chunk . E.encodeUtf8) `eq` id +t_p_utf8_1 = testValidateUtf8_ [0x63] 1 +t_p_utf8_2 = testValidateUtf8_ [0x63, 0x63, 0x63] 3 +t_p_utf8_3 = testValidateUtf8_ [0x63, 0x63, 0xc2, 0x80, 0x63] 5 +t_p_utf8_4 = testValidateUtf8_ [0x63, 0xe1, 0x80, 0x80, 0x63] 5 +t_p_utf8_5 = testValidateUtf8_ [0xF0, 0x90, 0x80, 0x80, 0x63] 5 +t_p_utf8_6 = testValidateUtf8_ [0x63, 0x63, 0xF0, 0x90, 0x80] 2 +t_p_utf8_7 = testValidateUtf8_ [0x63, 0x63, 0x63, 0xF0, 0x90] 3 +t_p_utf8_8 = testValidateUtf8Fail [0xF0, 0x90, 0x80, 0x63, 0x63] 0 +t_p_utf8_9 = testValidateUtf8Fail [0x63, 0x63, 0x80, 0x63, 0x63] 2 +t_p_utf8_0 = testValidateUtf8Fail [0x63, 0x63, 0xe1, 0x63, 0x63] 2 + +testValidateUtf8With :: + (B.ByteString -> (Int, Maybe E.Utf8State)) -> + (Maybe E.Utf8State -> IO r) -> + [Word8] -> Int -> IO r +testValidateUtf8With validate k xs expectedLen = case validate (B.pack xs) of + (len, s) -> do + len @?= expectedLen + k s + +expectJust :: Maybe a -> IO a +expectJust Nothing = assertFailure "Unexpected Nothing" +expectJust (Just s) = pure s + +expectNothing :: Maybe a -> IO () +expectNothing Nothing = pure () +expectNothing (Just _) = assertFailure "Unexpected Just" + +testValidateUtf8 :: [Word8] -> Int -> IO E.Utf8State +testValidateUtf8 = testValidateUtf8With E.validateUtf8Chunk expectJust + +testValidateUtf8_ :: [Word8] -> Int -> IO () +testValidateUtf8_ = testValidateUtf8With E.validateUtf8Chunk (void . expectJust) + +testValidateUtf8Fail :: [Word8] -> Int -> IO () +testValidateUtf8Fail = testValidateUtf8With E.validateUtf8Chunk expectNothing + +testValidateUtf8More :: E.Utf8State -> [Word8] -> Int -> IO E.Utf8State +testValidateUtf8More s = testValidateUtf8With (E.validateUtf8More s) expectJust + +testValidateUtf8MoreFail :: E.Utf8State -> [Word8] -> Int -> IO () +testValidateUtf8MoreFail s = testValidateUtf8With (E.validateUtf8More s) expectNothing + +t_pn_utf8_1 = do + s <- testValidateUtf8 [0xF0, 0x90, 0x80] 0 + _ <- testValidateUtf8More s [0x80] 1 + testValidateUtf8MoreFail s [0x7f] 0 +t_pn_utf8_2 = do + s0 <- testValidateUtf8 [0xF0] 0 + testValidateUtf8MoreFail s0 [0x7f] 0 + s1 <- testValidateUtf8More s0 [0x90] 0 + testValidateUtf8MoreFail s1 [0x7f] 0 + s2 <- testValidateUtf8More s1 [0x80] 0 + testValidateUtf8MoreFail s2 [0x7f] 0 + _ <- testValidateUtf8More s2 [0x80] 1 + pure () +t_pn_utf8_3 = do + s1 <- testValidateUtf8 [0xc2] 0 + let c = E.partialUtf8CodePoint s1 + assertBool "PartialUtf8 must be partial" $ E.partUtf8Len c < E.partUtf8CompleteLen c + testValidateUtf8MoreFail s1 [0x80, 0x80] 1 + +t_utf8_c = (E.strictBuilderToText . fst3 . E.decodeUtf8Chunk . E.encodeUtf8) `eq` id t_utf8 = (E.decodeUtf8 . E.encodeUtf8) `eq` id t_utf8' = (E.decodeUtf8' . E.encodeUtf8) `eq` (id . Right) tl_utf8 = (EL.decodeUtf8 . EL.encodeUtf8) `eq` id @@ -121,6 +118,9 @@ tl_utf32LE = (EL.decodeUtf32LE . EL.encodeUtf32LE) `eq` id t_utf32BE = (E.decodeUtf32BE . E.encodeUtf32BE) `eq` id tl_utf32BE = (EL.decodeUtf32BE . EL.encodeUtf32BE) `eq` id +fst3 :: (a, b, c) -> a +fst3 (a, _, _) = a + runBuilder :: B.Builder -> B.ByteString runBuilder = -- Use smallish buffers to exercise bufferFull case as well @@ -268,62 +268,63 @@ t_decode_with_error3 = t_decode_with_error4 = E.decodeUtf8With (\_ _ -> Just 'x') (B.pack [0xF0, 97, 97, 97]) === "xaaa" -t_decode_with_error1' = - case E.streamDecodeUtf8With (\_ _ -> Just 'x') (B.pack [0xc2]) of - E.Some x1 bs1 f1 -> whenEqProp x1 "" . whenEqProp bs1 (B.pack [0xc2]) $ - case f1 $ B.pack [0x80, 0x80] of - E.Some x2 bs2 _ -> whenEqProp x2 "\x80x" $ bs2 === mempty +t_decode_with_error1' = do + E.Some x1 bs1 f1 <- pure $ E.streamDecodeUtf8With (\_ _ -> Just 'x') (B.pack [0xc2]) + x1 @?= "" + bs1 @?= B.pack [0xc2] + E.Some x2 bs2 _ <- pure $ f1 $ B.pack [0x80, 0x80] + x2 @?= "\x80x" + bs2 @?= mempty t_decode_with_error2' = case E.streamDecodeUtf8With (\_ _ -> Just 'x') (B.pack [0xC2, 97]) of - E.Some x _ _ -> x === "xa" + E.Some x _ _ -> x @?= "xa" t_decode_with_error3' = case E.streamDecodeUtf8With (\_ _ -> Just 'x') (B.pack [0xC2, 97, 97]) of - E.Some x _ _ -> x === "xaa" + E.Some x _ _ -> x @?= "xaa" t_decode_with_error4' = case E.streamDecodeUtf8With (\_ _ -> Just 'x') (B.pack [0xC2, 97, 97, 97]) of - E.Some x _ _ -> x === "xaaa" -t_decode_with_error5' = ioProperty $ do + E.Some x _ _ -> x @?= "xaaa" +t_decode_with_error5' = do ret <- Exception.try $ Exception.evaluate $ E.streamDecodeUtf8 (B.pack [0x81]) - pure $ case ret of - Left (_ :: E.UnicodeException) -> True - Right{} -> False - -t_decode_chunk1 = case E.decodeUtf8Chunk $ B.pack [0xc2] of - ((len1, eS1), tds1) -> whenEqProp (len1, E.dataStack tds1, E.stackLen tds1) (0, [], 0) $ - case eS1 of - Left _ -> counterexample (show eS1) False - Right s1 -> let partCP = E.partialUtf8CodePoint s1 in - partCP === E.PartialUtf8CodePoint 0x01c20000 .&&. - E.partUtf8CPLen partCP === 1 .&&. - E.partUtf8CPUnsafeIndex 0 partCP === 0xc2 .&&. - if isUtf8StateIsComplete $ E.utf8CodePointState s1 - then counterexample (show $ E.utf8CodePointState s1) False - else case E.decodeNextUtf8Chunk (B.pack [0x80, 0x80]) s1 tds1 of - ((len2, eS2), tds2) -> whenEqProp (len2, E.dataStack tds2, E.stackLen tds2) (1, [Right $ Right $ B.pack [0x80], Right $ Left (E.PartialUtf8CodePoint 0x1c20000)], 2) $ - case eS2 of - Right _ -> counterexample (show eS2) False - Left res -> res === (2, mempty) - -t_decode_chunk2 = case E.decodeUtf8Chunk $ B.pack [0xf0] of - ((len1, eS1), tds1) -> whenEqProp (len1, E.dataStack tds1, E.stackLen tds1) (0, [], 0) $ - case eS1 of - Left _ -> counterexample (show eS1) False - Right s1 -> let partCP = E.partialUtf8CodePoint s1 in - partCP === E.PartialUtf8CodePoint 0x01f00000 .&&. - E.partUtf8CPLen partCP === 1 .&&. - E.partUtf8CPUnsafeIndex 0 partCP === 0xf0 .&&. - if isUtf8StateIsComplete $ E.utf8CodePointState s1 - then counterexample (show $ E.utf8CodePointState s1) False - else case E.decodeNextUtf8Chunk (B.pack [0x90, 0x80]) s1 tds1 of - ((len2, eS2), tds2) -> whenEqProp (len2, E.dataStack tds2, E.stackLen tds2) (-1, [], 0) $ - case eS2 of - Left _ -> counterexample (show eS2) False - Right s2 -> let partCP2 = E.partialUtf8CodePoint s2 in - partCP2 === E.PartialUtf8CodePoint 0x03f09080 .&&. - E.partUtf8CPLen partCP2 === 3 .&&. - E.partUtf8CPUnsafeIndex 0 partCP2 === 0xf0 .&&. - E.partUtf8CPUnsafeIndex 1 partCP2 === 0x90 .&&. - E.partUtf8CPUnsafeIndex 2 partCP2 === 0x80 + case ret of + Left (_ :: E.UnicodeException) -> pure () + Right{} -> assertFailure "Unexpected success" + +testDecodeUtf8With :: (Maybe E.Utf8State -> IO r) -> E.Utf8State -> [Word8] -> T.Text -> IO r +testDecodeUtf8With k s xs expected = + let xs' = B.pack xs in + case E.decodeUtf8More s xs' of + (prefix, bs, s') -> do + let txt = E.strictBuilderToText prefix + txt @?= expected + if T.null txt then + bs @?= xs' + else + let c = E.partUtf8ToByteString (E.partialUtf8CodePoint s) + in E.encodeUtf8 txt <> bs @?= c <> xs' + k s' + +testDecodeUtf8 :: E.Utf8State -> [Word8] -> T.Text -> IO E.Utf8State +testDecodeUtf8 = testDecodeUtf8With (\ms -> case ms of + Just s -> pure s + Nothing -> assertFailure "Unexpected failure") + +testDecodeUtf8Fail :: E.Utf8State -> [Word8] -> T.Text -> IO () +testDecodeUtf8Fail = testDecodeUtf8With (\ms -> case ms of + Just _ -> assertFailure "Unexpected failure" + Nothing -> pure ()) + +t_decode_chunk1 = do + s1 <- testDecodeUtf8 E.startUtf8State [0xc2] "" + let c1 = E.partialUtf8CodePoint s1 + E.partUtf8Len c1 @?= 1 + testDecodeUtf8Fail s1 [0x80, 0x80] "\128" + +t_decode_chunk2 = do + s1 <- testDecodeUtf8 E.startUtf8State [0xf0] "" + s2 <- testDecodeUtf8 s1 [0x90, 0x80] "" + _ <- testDecodeUtf8 s2 [0x80, 0x41] "\65536A" + pure () t_infix_concat bs1 text bs2 = forAll (Blind <$> genDecodeErr Replace) $ \(Blind onErr) -> @@ -337,24 +338,8 @@ testTranscoding = testProperty "tl_ascii" tl_ascii, testProperty "t_latin1" t_latin1, testProperty "tl_latin1" tl_latin1, - testProperty "t_p_utf8_1" t_p_utf8_1, - testProperty "t_p_utf8_2" t_p_utf8_2, - testProperty "t_p_utf8_3" t_p_utf8_3, - testProperty "t_p_utf8_4" t_p_utf8_4, - testProperty "t_p_utf8_5" t_p_utf8_5, - testProperty "t_p_utf8_6" t_p_utf8_6, - testProperty "t_p_utf8_7" t_p_utf8_7, - testProperty "t_p_utf8_8" t_p_utf8_8, - testProperty "t_p_utf8_9" t_p_utf8_9, - testProperty "t_p_utf8_0" t_p_utf8_0, - testProperty "t_pn_utf8_1" t_pn_utf8_1, - testProperty "t_pn_utf8_2" t_pn_utf8_2, - testProperty "t_pn_utf8_3" t_pn_utf8_3, - testProperty "t_utf8_c" t_utf8_c, testProperty "t_utf8" t_utf8, testProperty "t_utf8'" t_utf8', - testProperty "t_utf8_incr" t_utf8_incr, - testProperty "t_utf8_undecoded" t_utf8_undecoded, testProperty "tl_utf8" tl_utf8, testProperty "tl_utf8'" tl_utf8', testProperty "t_utf16LE" t_utf16LE, @@ -381,13 +366,31 @@ testTranscoding = testProperty "t_decode_with_error2" t_decode_with_error2, testProperty "t_decode_with_error3" t_decode_with_error3, testProperty "t_decode_with_error4" t_decode_with_error4, - testProperty "t_decode_with_error1'" t_decode_with_error1', - testProperty "t_decode_with_error2'" t_decode_with_error2', - testProperty "t_decode_with_error3'" t_decode_with_error3', - testProperty "t_decode_with_error4'" t_decode_with_error4', - testProperty "t_decode_with_error5'" t_decode_with_error5', - testProperty "t_decode_chunk1" t_decode_chunk1, - testProperty "t_decode_chunk2" t_decode_chunk2, testProperty "t_infix_concat" t_infix_concat + ], + testGroup "streaming" [ + testProperty "t_utf8_undecoded" t_utf8_undecoded, + testProperty "t_utf8_incr" t_utf8_incr, + testProperty "t_utf8_c" t_utf8_c, + testCase "t_p_utf8_1" t_p_utf8_1, + testCase "t_p_utf8_2" t_p_utf8_2, + testCase "t_p_utf8_3" t_p_utf8_3, + testCase "t_p_utf8_4" t_p_utf8_4, + testCase "t_p_utf8_5" t_p_utf8_5, + testCase "t_p_utf8_6" t_p_utf8_6, + testCase "t_p_utf8_7" t_p_utf8_7, + testCase "t_p_utf8_8" t_p_utf8_8, + testCase "t_p_utf8_9" t_p_utf8_9, + testCase "t_p_utf8_0" t_p_utf8_0, + testCase "t_pn_utf8_1" t_pn_utf8_1, + testCase "t_pn_utf8_2" t_pn_utf8_2, + testCase "t_pn_utf8_3" t_pn_utf8_3, + testCase "t_decode_chunk1" t_decode_chunk1, + testCase "t_decode_chunk2" t_decode_chunk2 + testCase "t_decode_with_error1'" t_decode_with_error1', + testCase "t_decode_with_error2'" t_decode_with_error2', + testCase "t_decode_with_error3'" t_decode_with_error3', + testCase "t_decode_with_error4'" t_decode_with_error4', + testCase "t_decode_with_error5'" t_decode_with_error5', ] ] diff --git a/tests/Tests/QuickCheckUtils.hs b/tests/Tests/QuickCheckUtils.hs index 58ad3c47..8f36f7ec 100644 --- a/tests/Tests/QuickCheckUtils.hs +++ b/tests/Tests/QuickCheckUtils.hs @@ -27,8 +27,6 @@ module Tests.QuickCheckUtils , eqPSqrt , write_read - - , whenEqProp ) where import Control.Arrow ((***)) @@ -286,9 +284,3 @@ newtype SkewedBool = Skewed { getSkewed :: Bool } instance Arbitrary SkewedBool where arbitrary = Skewed <$> frequency [(1, pure False), (5, pure True)] - --- like Control.Monad.when, but with properties instead of monad values -whenEqProp :: (Eq a, Show a) => a -> a -> Property -> Property -whenEqProp a b next = if a == b - then next - else a === b From 6a65ed66a159340a4351383dad33ea772abca359 Mon Sep 17 00:00:00 2001 From: Li-yao Xia Date: Fri, 3 Feb 2023 22:10:21 +0000 Subject: [PATCH 65/87] Rework docs --- src/Data/Text/Encoding.hs | 44 +++-- src/Data/Text/Internal/Encoding.hs | 255 ++++++++++++++++++-------- tests/Tests/Properties/Transcoding.hs | 4 +- 3 files changed, 208 insertions(+), 95 deletions(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index 6e52d24d..9d7ed6d0 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -23,24 +23,14 @@ module Data.Text.Encoding ( - -- * ByteString validation - -- $validation - Utf8State - , validateUtf8Chunk - , validateUtf8More - , startUtf8State - -- * Decoding ByteStrings to Text -- $strict -- ** Total Functions #total# -- $total - , decodeLatin1 + decodeLatin1 , decodeASCIIPrefix - , decodeUtf8Chunk , decodeUtf8Lenient - - -- *** Catchable failure , decodeUtf8' -- *** Controllable error handling @@ -55,6 +45,15 @@ module Data.Text.Encoding , streamDecodeUtf8With , Decoding(..) + -- *** Incremental decoding + -- $incremental + , decodeUtf8Chunk + , decodeUtf8More + , Utf8State + , startUtf8State + , StrictBuilder() + , strictBuilderToText + -- ** Partial Functions -- $partial , decodeASCII @@ -77,6 +76,11 @@ module Data.Text.Encoding -- * Encoding Text using ByteString Builders , encodeUtf8Builder , encodeUtf8BuilderEscaped + + -- * ByteString validation + -- $validation + , validateUtf8Chunk + , validateUtf8More ) where import Control.Exception (evaluate, try) @@ -154,7 +158,8 @@ import qualified Data.ByteString.Unsafe as B -- To drop references to the input bytestring, force the prefix -- (using 'seq' or @BangPatterns@) and drop references to the suffix. -- --- Properties: +-- === Properties +-- -- - If @(prefix, suffix) = decodeAsciiPrefix s@, then @'encodeUtf8' prefix <> suffix = s@. -- - Either @suffix@ is empty, or @'B.head' suffix > 127@. -- @@ -535,3 +540,18 @@ encodeUtf32LE txt = E.unstream (E.restreamUtf32LE (F.stream txt)) encodeUtf32BE :: Text -> ByteString encodeUtf32BE txt = E.unstream (E.restreamUtf32BE (F.stream txt)) {-# INLINE encodeUtf32BE #-} + +-- $incremental +-- The functions 'decodeUtf8Chunk' and 'decodeUtf8More' provide more +-- control for error-handling and streaming. +-- +-- - You get an UTF-8 prefix of the given 'ByteString' up to the next error. +-- For example this lets you insert or delete arbitrary text, or do some +-- stateful operations, before resuming. +-- In contrast, the older stream-oriented interface only lets you substitute +-- a single fixed 'Char' for each invalid byte in 'OnDecodeError'. +-- - The prefix is encoded as a 'StrictBuilder', so you can accumulate chunks +-- before doing the copying work to construct a 'Text', or you can +-- output decoded fragments immediately as a lazy 'Data.Text.Lazy.Text'. +-- +-- For even lower-level primitives, see 'validateUtf8Chunk' and 'validateUtf8More'. diff --git a/src/Data/Text/Internal/Encoding.hs b/src/Data/Text/Internal/Encoding.hs index 7ae90d8a..fc8be607 100644 --- a/src/Data/Text/Internal/Encoding.hs +++ b/src/Data/Text/Internal/Encoding.hs @@ -3,7 +3,12 @@ {-# LANGUAGE Trustworthy #-} {-# LANGUAGE RankNTypes #-} {-# LANGUAGE ViewPatterns #-} + -- | +-- Internals of "Data.Text.Encoding". +-- +-- If you'd like to depend on something from here that's not in "Data.Text.Encoding", +-- please request to have it exported! module Data.Text.Internal.Encoding where import Control.Monad.ST (ST, runST) @@ -11,6 +16,7 @@ import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) import Data.Bits ((.&.), shiftL, shiftR) import Data.ByteString (ByteString) import Data.Functor (void) +import Data.Semigroup (Semigroup(..)) import Data.Text.Internal (Text(..), empty, safe) import Data.Text.Internal.Unsafe (unsafeWithForeignPtr) import Data.Text.Unsafe (unsafeDupablePerformIO) @@ -36,31 +42,46 @@ import qualified Data.ByteString.Unsafe as B #endif -- | State of decoding a 'ByteString' in UTF-8. --- It consists of a value representing whether or --- not the last byte is a complete code point, and on incompletion what --- the 1 to 3 end bytes are that make up the incomplete code point. +-- Enables stream decoding ('validateUtf8Chunk', 'validateUtf8More', +-- 'decodeUtf8Chunk', 'decodeUtf8More'). + +-- Internal invariant: +-- the first component is the initial state if and only if +-- the second component is empty. +-- +-- @ +-- 'utf9CodePointState' s = 'utf8StartState' +-- <=> +-- 'partialUtf8CodePoint' s = 'PartialUtf8CodePoint' 0 +-- @ +-- +-- @since 2.0.2 data Utf8State = Utf8State - { -- | Current UTF-8 code point state of the 'ByteString's - -- that have been validated thus far. + { -- | State of the UTF-8 state machine. utf8CodePointState :: {-# UNPACK #-} !Utf8CodePointState - -- | Get the incomplete UTF-8 code point of the 'ByteString's that - -- have been validated thus far. The first byte of the 'Word32' - -- indicates the number of bytes of the code point are available, - -- and is followed by the bytes of the code point. + -- | Bytes of the currently incomplete code point (if any). , partialUtf8CodePoint :: {-# UNPACK #-} !PartialUtf8CodePoint } deriving (Eq, Ord, Show) --- | This represtents the starting state of a UTF-8 validation check. +-- | Initial 'Utf8State'. +-- +-- @since 2.0.2 startUtf8State :: Utf8State startUtf8State = Utf8State utf8StartState partUtf8Empty --- | Prefix of a valid UTF-8 encoded code point encoded in 4 bytes. --- This consists of a length (in bytes) between 1 and 3 stored in the most --- significant byte, and the actual bytes in the rest of the word. +-- | Prefix of a UTF-8 code point encoded in 4 bytes, +-- possibly empty. +-- +-- - The most significant byte contains the number of bytes, +-- between 0 and 3. +-- - The remaining bytes hold the incomplete code point. +-- - Unused bytes must be 0. -- --- All of its operations are the functions below. +-- All of operations available on it are the functions below. -- The constructor should never be used outside of those. +-- +-- @since 2.0.2 newtype PartialUtf8CodePoint = PartialUtf8CodePoint Word32 deriving (Eq, Ord, Show) @@ -72,6 +93,8 @@ partUtf8Empty = PartialUtf8CodePoint 0 partUtf8Len :: PartialUtf8CodePoint -> Int partUtf8Len (PartialUtf8CodePoint w) = fromIntegral $ w `shiftR` 24 +-- | Length of the code point once completed (it is known in the first byte). +-- 0 if empty. partUtf8CompleteLen :: PartialUtf8CodePoint -> Int partUtf8CompleteLen c@(PartialUtf8CodePoint w) | partUtf8Len c == 0 = 0 @@ -83,6 +106,8 @@ partUtf8CompleteLen c@(PartialUtf8CodePoint w) firstByte = (w `shiftR` 16) .&. 255 -- | Get the @n@-th byte, assuming it is within bounds: @0 <= n < partUtf8Len c@. +-- +-- Unsafe: no bounds checking. partUtf8UnsafeIndex :: #if defined(ASSERTS) HasCallStack => @@ -94,6 +119,9 @@ partUtf8UnsafeIndex (PartialUtf8CodePoint w) n = #endif fromIntegral $ w `shiftR` (16 - 8 * n) +-- | Append some bytes. +-- +-- Unsafe: no bounds checking. partUtf8UnsafeAppend :: #if defined(ASSERTS) HasCallStack => @@ -113,7 +141,7 @@ partUtf8UnsafeAppend c@(PartialUtf8CodePoint word) bs = then w + (fromIntegral (B.index bs i) `shiftL` fromIntegral (16 - 8 * (lenc + i))) else w --- | This avoids recursion to unfold to straightline code. +-- | Fold a 'PartialUtf8CodePoint'. This avoids recursion so it unfolds to straightline code. partUtf8Foldr :: (Word8 -> a -> a) -> a -> PartialUtf8CodePoint -> a partUtf8Foldr f x0 c = case partUtf8Len c of 0 -> x0 @@ -123,6 +151,7 @@ partUtf8Foldr f x0 c = case partUtf8Len c of where build i x = f (partUtf8UnsafeIndex c i) x +-- | Convert 'PartialUtf8CodePoint' to 'ByteString'. partUtf8ToByteString :: PartialUtf8CodePoint -> B.ByteString partUtf8ToByteString c = BI.unsafeCreate (partUtf8Len c) $ \ptr -> partUtf8Foldr (\w k i -> pokeElemOff ptr i w >> k (i+1)) (\_ -> pure ()) c 0 @@ -132,31 +161,37 @@ foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8 :: Ptr Word8 -> CSize -> IO CInt #endif --- | Validate a 'ByteString' as a UTF-8-encoded text. +-- | Validate a 'ByteString' as UTF-8-encoded text. To be continued using 'validateUtf8More'. +-- +-- See also 'validateUtf8More' for details on the result of this function. -- --- @validateUtf8More chunk = (n, es)@ +-- @ +-- 'validateUtf8Chunk' = 'validateUtf8More' 'startUtf8State' +-- @ +-- +-- === Properties +-- +-- Given: +-- +-- @ +-- 'validateUtf8Chunk' chunk = (n, ms) +-- @ -- --- This function returns two values: +-- - The prefix is valid UTF-8. In particular, it should be accepted +-- by this validation: -- --- * The value 'n' indicates the longest prefix of the 'ByteString' --- that is valid UTF-8-encoded data. --- * The value 'es' indicates whether the 'ByteString' +-- @ +-- 'validateUtf8Chunk' ('Data.ByteString.take' n chunk) = (n, Just 'startUtf8State') +-- @ -- --- * (@Left p@) contains an invalid code point and where the next --- (potentially valid) code point begins, so that @p - n@ is the --- number of invalid bytes, or --- * (@Right s@) is valid, and all of the remaining bytes starting --- at inbex 'n' are the beginning of an incomplete UTF-8 code --- point, and 's' is the resulting 'Utf8State' value, which --- can be used to validate against a following 'ByteString' with --- 'validateNextUtf8Chunk'. +-- @since 2.0.2 validateUtf8Chunk :: ByteString -> (Int, Maybe Utf8State) validateUtf8Chunk = validateUtf8ChunkFrom 0 -- | Add an offset to the index returned by 'validateUtf8More'. -- -- @ --- validateUtf8ChunkFrom n = first (+ 1) . 'validateUtf8More' . 'B.drop' n +-- validateUtf8ChunkFrom n = first (+ n) . 'validateUtf8More' . 'B.drop' n -- @ validateUtf8ChunkFrom :: Int -> ByteString -> (Int, Maybe Utf8State) validateUtf8ChunkFrom ofs bs @@ -202,32 +237,51 @@ slowValidateUtf8ChunkFrom ofs bs = loop ofs ofs utf8StartState Nothing -> (utf8End, Nothing) | otherwise = (utf8End, Just (Utf8State s (partUtf8UnsafeAppend partUtf8Empty (B.drop utf8End bs)))) --- | Validate a 'ByteString' as a contiuation of UTF-8-encoded text. +-- | Validate another 'ByteString' chunk in an ongoing stream of UTF-8-encoded text. -- --- @validateNextUtf8Chunk chunk s = (n, es)@ +-- Returns a pair: -- --- This function returns two values: +-- 1. The first component @n@ is the end position, relative to the current +-- chunk, of the longest prefix of the accumulated bytestring which is valid UTF-8. +-- @n@ may be negative: that happens when an incomplete code point started in +-- a previous chunk and is not completed by the current chunk (either +-- that code point is still incomplete, or it is broken by an invalid byte). -- --- * The value 'n' indicates the end position of longest prefix of the --- 'ByteString' that is valid UTF-8-encoded data from the starting --- state 's'. If 's' contains an incomplete code point, the input --- 'ByteString' is considered a continuation. As a result 'n' will be --- negative if the code point is still incomplete or is proven to be --- invalid. --- --- * The value 'es' indicates whether the 'ByteString' +-- 2. The second component @ms@ indicates the following: +-- +-- - if @ms = Nothing@, the remainder of the chunk contains an invalid byte, +-- within four bytes from position @n@; +-- - if @ms = Just s'@, you can carry on validating another chunk +-- by calling 'validateUtf8More' with the new state @s'@. +-- +-- === Properties +-- +-- Given: +-- +-- @ +-- 'validateUtf8More' s chunk = (n, ms) +-- @ -- --- * (@Left p@) contains an invalid code point and where the next --- (potentially valid) code point begins, so that @p - n@ is the --- number of invalid bytes, or --- * (@Right s'@) is valid, and all of the remaining bytes starting --- at index 'n' are the beginning of an incomplete UTF-8 code --- point, and `s'` is the resulting 'Utf8State' value, which --- can be used to validate against a following 'ByteString'. +-- - If the chunk is invalid, it cannot be extended to be valid. +-- +-- @ +-- ms = Nothing +-- ==> 'validateUtf8More' s (chunk '<>' more) = (n, Nothing) +-- @ +-- +-- - Validating two chunks sequentially is the same as validating them +-- together at once: +-- +-- @ +-- ms = Just s' +-- ==> 'validateUtf8More' s (chunk '<>' more) = 'Data.Bifunctor.first' ('Data.ByteString.length' chunk '+') ('validateUtf8More' s' more) +-- @ +-- +-- @since 2.0.2 validateUtf8More :: Utf8State -> ByteString -> (Int, Maybe Utf8State) validateUtf8More st@(Utf8State s0 part) bs | len > 0 = loop 0 s0 - | otherwise = (0, Just st) + | otherwise = (- partUtf8Len part, Just st) where len = B.length bs -- Complete an incomplete code point (if there is one) @@ -238,21 +292,33 @@ validateUtf8More st@(Utf8State s0 part) bs case updateUtf8State (B.index bs i) s of Nothing -> (0, Nothing) Just s' -> loop (i + 1) s' - | otherwise = (0, Just (Utf8State s (partUtf8UnsafeAppend part bs))) + | otherwise = (- partUtf8Len part, Just (Utf8State s (partUtf8UnsafeAppend part bs))) --- | Construct an Array. This is currently an internal data structure which is --- only used to construct Text (so for example byteStringToStrictBuilder should --- only be applied to valid UTF-8 bytestrings). +-- | A delayed representation of strict 'Text'. + +-- For internal purposes, this is instead used as a delayed 'Array': +-- it may not actually be valid 'Text' (e.g., 'word8ToStrictBuilder', +-- 'byteStringToStrictBuilder', 'partUtf8ToStrictBuilder'). +-- +-- @since 2.0.2 data StrictBuilder = StrictBuilder { sbLength :: {-# UNPACK #-} !Int , sbWrite :: forall s. A.MArray s -> Int -> ST s () } +-- | Concatenation of 'StrictBuilder' is right-biased: +-- the right builder will be run first. This allows a builder to +-- run tail-recursively when it was accumulated left-to-right. +instance Semigroup StrictBuilder where + (<>) = appendRStrictBuilder + +instance Monoid StrictBuilder where + mempty = emptyStrictBuilder + mappend = (<>) + emptyStrictBuilder :: StrictBuilder emptyStrictBuilder = StrictBuilder 0 (\_ _ -> pure ()) --- | Right-biased append: run the right action first. This allows a builder to --- run tail-recursively when accumulating text left-to-right. appendRStrictBuilder :: StrictBuilder -> StrictBuilder -> StrictBuilder appendRStrictBuilder (StrictBuilder n1 write1) (StrictBuilder n2 write2) = StrictBuilder (n1 + n2) (\arr ofs -> do @@ -278,8 +344,11 @@ word8ToStrictBuilder w = partUtf8ToStrictBuilder :: PartialUtf8CodePoint -> StrictBuilder partUtf8ToStrictBuilder = - partUtf8Foldr (appendRStrictBuilder . word8ToStrictBuilder) emptyStrictBuilder + partUtf8Foldr ((<>) . word8ToStrictBuilder) emptyStrictBuilder +-- | Use 'StrictBuilder' to build 'Text'. +-- +-- @since 2.0.2 strictBuilderToText :: StrictBuilder -> Text strictBuilderToText (StrictBuilder 0 _) = empty strictBuilderToText (StrictBuilder n write) = runST (do @@ -288,28 +357,50 @@ strictBuilderToText (StrictBuilder n write) = runST (do arr <- A.unsafeFreeze dst pure (Text arr 0 n)) --- | Decode a 'ByteString' in the context of what has been already been decoded. +-- | Decode another chunk in an ongoing UTF-8 stream. +-- +-- Returns a triple: -- --- The 'ByteString' is validated against the 'Utf8State' using the rules --- governing 'validateNextUtf8Chunk'. The longest valid UTF-8 prefix is added --- to the input 'TextDataStack' which is returned with the end position of the --- valid prefix, and either the resulting 'Utf8State' --- (@Right Utf8State@) or the position of the of the first (potentially) --- valid byte after the invalid bytes with remainder of the 'ByteString' --- (@Left (Int, ByteString)@). +-- 1. A 'StrictBuilder' for the decoded chunk of text. You can accumulate +-- chunks with @('<>')@ or output them with 'strictBuilderToText'. +-- 2. The undecoded remainder of the given chunk, for diagnosing errors +-- and resuming (presumably after skipping some bytes). +-- 3. 'Just' the new state, or 'Nothing' if an invalid byte was encountered +-- (it will be within the first 4 bytes of the undecoded remainder). +-- +-- @since 2.0.2 decodeUtf8More :: Utf8State -> ByteString -> (StrictBuilder, ByteString, Maybe Utf8State) decodeUtf8More s bs = case validateUtf8More s bs of (len, s') -> - let builder | len == 0 = emptyStrictBuilder + let builder | len <= 0 = emptyStrictBuilder | otherwise = partUtf8ToStrictBuilder (partialUtf8CodePoint s) - `appendRStrictBuilder` byteStringToStrictBuilder (B.take len bs) + <> byteStringToStrictBuilder (B.take len bs) in (builder, B.drop len bs, s') --- | Decode a 'ByteString' against a start 'Utf8State' with an empty --- 'TextDataStack'. +-- | Decode a chunk of UTF-8 text. To be continued with 'decodeUtf8More'. +-- +-- See 'decodeUtf8More' for details on the result. +-- +-- === Properties -- --- @decodeUtf8More bs = 'decodeNextUtf8Chunk' bs 'startUtf8State' 'emptyStack'@ +-- @ +-- 'decodeUtf8Chunk' = 'decodeUtf8More' 'startUtf8State' +-- @ +-- +-- Given: +-- +-- @ +-- 'decodeUtf8Chunk' chunk = (builder, rest, ms) +-- @ +-- +-- @builder@ is a prefix and @rest@ is a suffix of @chunk@. +-- +-- @ +-- 'Data.Text.Encoding.encodeUtf8' ('strictBuilderToText' builder) '<>' rest = chunk +-- @ +-- +-- @since 2.0.2 decodeUtf8Chunk :: ByteString -> (StrictBuilder, ByteString, Maybe Utf8State) decodeUtf8Chunk = decodeUtf8More startUtf8State @@ -321,7 +412,7 @@ decodeUtf8Chunk = decodeUtf8More startUtf8State skipIncomplete :: (Word8 -> Maybe Char) -> Utf8State -> StrictBuilder skipIncomplete onErr s = partUtf8Foldr - (appendRStrictBuilder . handleUtf8Error onErr) + ((<>) . handleUtf8Error onErr) emptyStrictBuilder (partialUtf8CodePoint s) handleUtf8Error :: (Word8 -> Maybe Char) -> Word8 -> StrictBuilder @@ -336,14 +427,16 @@ decodeUtf8With1 :: #endif (Word8 -> Maybe Char) -> ByteString -> Text decodeUtf8With1 onErr bs0 = strictBuilderToText $ - builder `appendRStrictBuilder` skipIncomplete onErr s + builder <> skipIncomplete onErr s where (builder, _, s) = decodeUtf8With2 onErr startUtf8State bs0 --- | Helper for 'decodeUtf8With' and 'streamDecodeUtf8With'. --- This uses an 'OnDecodeError' to process bad bytes. --- This is not a very pretty legacy API. --- See 'decodeUtf8More' for a more flexible alternative. +-- | Helper for 'Data.Text.Encoding.decodeUtf8With', +-- 'Data.Text.Encoding.streamDecodeUtf8With', and lazy +-- 'Data.Text.Lazy.Encoding.decodeUtf8With', +-- which use an 'OnDecodeError' to process bad bytes. +-- +-- See 'decodeUtf8Chunk' for a more flexible alternative. decodeUtf8With2 :: #if defined(ASSERTS) HasCallStack => @@ -353,21 +446,21 @@ decodeUtf8With2 onErr s0 bs = loop s0 0 emptyStrictBuilder where loop s i !builder = let nonEmptyPrefix len = builder - `appendRStrictBuilder` partUtf8ToStrictBuilder (partialUtf8CodePoint s) - `appendRStrictBuilder` byteStringToStrictBuilder (B.take len (B.drop i bs)) + <> partUtf8ToStrictBuilder (partialUtf8CodePoint s) + <> byteStringToStrictBuilder (B.take len (B.drop i bs)) in case validateUtf8More s (B.drop i bs) of (len, Nothing) -> - if len == 0 && utf8CodePointState s /= utf8StartState + if len < 0 then -- loop is strict on builder, so if onErr raises an error it will be forced here. - let builder' = builder `appendRStrictBuilder` skipIncomplete onErr s + let builder' = builder <> skipIncomplete onErr s in loop startUtf8State i builder' else let builder' = nonEmptyPrefix len - `appendRStrictBuilder` handleUtf8Error onErr (B.index bs (i + len)) + <> handleUtf8Error onErr (B.index bs (i + len)) in loop startUtf8State (i + len + 1) builder' (len, Just s') -> - let builder' = if len == 0 then builder else nonEmptyPrefix len + let builder' = if len <= 0 then builder else nonEmptyPrefix len undecoded = if B.length bs - i - len == partUtf8Len (partialUtf8CodePoint s') then B.drop (i + len) bs -- Reuse bs if possible else partUtf8ToByteString (partialUtf8CodePoint s') diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs index 8e7a614e..d3e85ca9 100644 --- a/tests/Tests/Properties/Transcoding.hs +++ b/tests/Tests/Properties/Transcoding.hs @@ -386,11 +386,11 @@ testTranscoding = testCase "t_pn_utf8_2" t_pn_utf8_2, testCase "t_pn_utf8_3" t_pn_utf8_3, testCase "t_decode_chunk1" t_decode_chunk1, - testCase "t_decode_chunk2" t_decode_chunk2 + testCase "t_decode_chunk2" t_decode_chunk2, testCase "t_decode_with_error1'" t_decode_with_error1', testCase "t_decode_with_error2'" t_decode_with_error2', testCase "t_decode_with_error3'" t_decode_with_error3', testCase "t_decode_with_error4'" t_decode_with_error4', - testCase "t_decode_with_error5'" t_decode_with_error5', + testCase "t_decode_with_error5'" t_decode_with_error5' ] ] From 27d23210c10004e0fcefb8846ea8f16173459687 Mon Sep 17 00:00:00 2001 From: Li-yao Xia Date: Fri, 3 Feb 2023 23:08:40 +0000 Subject: [PATCH 66/87] Fix tests --- src/Data/Text/Encoding.hs | 2 +- src/Data/Text/Internal/Encoding.hs | 11 +++++++---- tests/Tests/Properties/Transcoding.hs | 12 ++++++------ 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index 9d7ed6d0..65a1ce82 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -547,7 +547,7 @@ encodeUtf32BE txt = E.unstream (E.restreamUtf32BE (F.stream txt)) -- -- - You get an UTF-8 prefix of the given 'ByteString' up to the next error. -- For example this lets you insert or delete arbitrary text, or do some --- stateful operations, before resuming. +-- stateful operations before resuming, such as keeping track of error locations. -- In contrast, the older stream-oriented interface only lets you substitute -- a single fixed 'Char' for each invalid byte in 'OnDecodeError'. -- - The prefix is encoded as a 'StrictBuilder', so you can accumulate chunks diff --git a/src/Data/Text/Internal/Encoding.hs b/src/Data/Text/Internal/Encoding.hs index fc8be607..58c79bc3 100644 --- a/src/Data/Text/Internal/Encoding.hs +++ b/src/Data/Text/Internal/Encoding.hs @@ -11,6 +11,9 @@ -- please request to have it exported! module Data.Text.Internal.Encoding where +#if defined(ASSERTS) +import Control.Exception (assert) +#endif import Control.Monad.ST (ST, runST) import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) import Data.Bits ((.&.), shiftL, shiftR) @@ -113,9 +116,9 @@ partUtf8UnsafeIndex :: HasCallStack => #endif PartialUtf8CodePoint -> Int -> Word8 -partUtf8UnsafeIndex (PartialUtf8CodePoint w) n = +partUtf8UnsafeIndex _c@(PartialUtf8CodePoint w) n = #if defined(ASSERTS) - assert (0 <= n && n < partUtf8Len w) $ + assert (0 <= n && n < partUtf8Len _c) $ #endif fromIntegral $ w `shiftR` (16 - 8 * n) @@ -290,7 +293,7 @@ validateUtf8More st@(Utf8State s0 part) bs | isUtf8StateIsComplete s = validateUtf8ChunkFrom i bs | i < len = case updateUtf8State (B.index bs i) s of - Nothing -> (0, Nothing) + Nothing -> (- partUtf8Len part, Nothing) Just s' -> loop (i + 1) s' | otherwise = (- partUtf8Len part, Just (Utf8State s (partUtf8UnsafeAppend part bs))) @@ -461,7 +464,7 @@ decodeUtf8With2 onErr s0 bs = loop s0 0 emptyStrictBuilder in loop startUtf8State (i + len + 1) builder' (len, Just s') -> let builder' = if len <= 0 then builder else nonEmptyPrefix len - undecoded = if B.length bs - i - len == partUtf8Len (partialUtf8CodePoint s') + undecoded = if B.length bs >= partUtf8Len (partialUtf8CodePoint s') then B.drop (i + len) bs -- Reuse bs if possible else partUtf8ToByteString (partialUtf8CodePoint s') in (builder', undecoded, s') diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs index d3e85ca9..038b6506 100644 --- a/tests/Tests/Properties/Transcoding.hs +++ b/tests/Tests/Properties/Transcoding.hs @@ -88,14 +88,14 @@ testValidateUtf8MoreFail s = testValidateUtf8With (E.validateUtf8More s) expectN t_pn_utf8_1 = do s <- testValidateUtf8 [0xF0, 0x90, 0x80] 0 _ <- testValidateUtf8More s [0x80] 1 - testValidateUtf8MoreFail s [0x7f] 0 + testValidateUtf8MoreFail s [0x7f] (-3) t_pn_utf8_2 = do s0 <- testValidateUtf8 [0xF0] 0 - testValidateUtf8MoreFail s0 [0x7f] 0 - s1 <- testValidateUtf8More s0 [0x90] 0 - testValidateUtf8MoreFail s1 [0x7f] 0 - s2 <- testValidateUtf8More s1 [0x80] 0 - testValidateUtf8MoreFail s2 [0x7f] 0 + testValidateUtf8MoreFail s0 [0x7f] (-1) + s1 <- testValidateUtf8More s0 [0x90] (-1) + testValidateUtf8MoreFail s1 [0x7f] (-2) + s2 <- testValidateUtf8More s1 [0x80] (-2) + testValidateUtf8MoreFail s2 [0x7f] (-3) _ <- testValidateUtf8More s2 [0x80] 1 pure () t_pn_utf8_3 = do From 4479e6e054eb7b584f5a1aa35f9edd5a86168e70 Mon Sep 17 00:00:00 2001 From: Li-yao Xia Date: Fri, 3 Feb 2023 23:46:57 +0000 Subject: [PATCH 67/87] Space --- src/Data/Text/Internal/Encoding.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Data/Text/Internal/Encoding.hs b/src/Data/Text/Internal/Encoding.hs index 58c79bc3..de149119 100644 --- a/src/Data/Text/Internal/Encoding.hs +++ b/src/Data/Text/Internal/Encoding.hs @@ -70,7 +70,7 @@ data Utf8State = Utf8State -- | Initial 'Utf8State'. -- -- @since 2.0.2 -startUtf8State :: Utf8State +startUtf8State :: Utf8State startUtf8State = Utf8State utf8StartState partUtf8Empty -- | Prefix of a UTF-8 code point encoded in 4 bytes, From 01d8824a998aa1f67961b4920dd5cf86a833f1bf Mon Sep 17 00:00:00 2001 From: Li-yao Xia Date: Sat, 4 Feb 2023 00:20:30 +0000 Subject: [PATCH 68/87] Fix test for old base --- tests/Tests/Properties/Transcoding.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs index 038b6506..7f02c0f4 100644 --- a/tests/Tests/Properties/Transcoding.hs +++ b/tests/Tests/Properties/Transcoding.hs @@ -301,7 +301,7 @@ testDecodeUtf8With k s xs expected = bs @?= xs' else let c = E.partUtf8ToByteString (E.partialUtf8CodePoint s) - in E.encodeUtf8 txt <> bs @?= c <> xs' + in E.encodeUtf8 txt `B.append` bs @?= c `B.append` xs' k s' testDecodeUtf8 :: E.Utf8State -> [Word8] -> T.Text -> IO E.Utf8State From 25859ad34f9a95a8377c87155d867cc1ec144ee1 Mon Sep 17 00:00:00 2001 From: Li-yao Xia Date: Sat, 4 Feb 2023 00:28:41 +0000 Subject: [PATCH 69/87] Add textToStrictBuilder --- src/Data/Text/Encoding.hs | 3 ++- src/Data/Text/Internal/Encoding.hs | 24 +++++++++++++++++------- tests/Tests/Properties/Transcoding.hs | 6 ++++++ 3 files changed, 25 insertions(+), 8 deletions(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index 65a1ce82..6a7a614f 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -45,7 +45,7 @@ module Data.Text.Encoding , streamDecodeUtf8With , Decoding(..) - -- *** Incremental decoding + -- *** Incremental UTF-8 decoding -- $incremental , decodeUtf8Chunk , decodeUtf8More @@ -53,6 +53,7 @@ module Data.Text.Encoding , startUtf8State , StrictBuilder() , strictBuilderToText + , textToStrictBuilder -- ** Partial Functions -- $partial diff --git a/src/Data/Text/Internal/Encoding.hs b/src/Data/Text/Internal/Encoding.hs index de149119..7d89719f 100644 --- a/src/Data/Text/Internal/Encoding.hs +++ b/src/Data/Text/Internal/Encoding.hs @@ -324,9 +324,9 @@ emptyStrictBuilder = StrictBuilder 0 (\_ _ -> pure ()) appendRStrictBuilder :: StrictBuilder -> StrictBuilder -> StrictBuilder appendRStrictBuilder (StrictBuilder n1 write1) (StrictBuilder n2 write2) = - StrictBuilder (n1 + n2) (\arr ofs -> do - write2 arr (ofs + n1) - write1 arr ofs) + StrictBuilder (n1 + n2) (\dst ofs -> do + write2 dst (ofs + n1) + write1 dst ofs) copyFromByteString :: A.MArray s -> Int -> ByteString -> ST s () copyFromByteString dst ofs src = withBS src $ \ srcFPtr len -> @@ -335,15 +335,15 @@ copyFromByteString dst ofs src = withBS src $ \ srcFPtr len -> byteStringToStrictBuilder :: ByteString -> StrictBuilder byteStringToStrictBuilder bs = - StrictBuilder (B.length bs) (\arr ofs -> copyFromByteString arr ofs bs) + StrictBuilder (B.length bs) (\dst ofs -> copyFromByteString dst ofs bs) charToStrictBuilder :: Char -> StrictBuilder charToStrictBuilder c = - StrictBuilder (utf8Length c) (\arr ofs -> void (Char.unsafeWrite arr ofs (safe c))) + StrictBuilder (utf8Length c) (\dst ofs -> void (Char.unsafeWrite dst ofs (safe c))) word8ToStrictBuilder :: Word8 -> StrictBuilder word8ToStrictBuilder w = - StrictBuilder 1 (\arr ofs -> A.unsafeWrite arr ofs w) + StrictBuilder 1 (\dst ofs -> A.unsafeWrite dst ofs w) partUtf8ToStrictBuilder :: PartialUtf8CodePoint -> StrictBuilder partUtf8ToStrictBuilder = @@ -360,6 +360,13 @@ strictBuilderToText (StrictBuilder n write) = runST (do arr <- A.unsafeFreeze dst pure (Text arr 0 n)) +-- | Copy 'Text' in a 'StrictBuilder' +-- +-- @since 2.0.2 +textToStrictBuilder :: Text -> StrictBuilder +textToStrictBuilder (Text src srcOfs n) = StrictBuilder n (\dst dstOfs -> + A.copyI n dst dstOfs src srcOfs) + -- | Decode another chunk in an ongoing UTF-8 stream. -- -- Returns a triple: @@ -455,8 +462,11 @@ decodeUtf8With2 onErr s0 bs = loop s0 0 emptyStrictBuilder (len, Nothing) -> if len < 0 then - -- loop is strict on builder, so if onErr raises an error it will be forced here. + -- If the first byte cannot complete the partial code point in s, + -- retry from startUtf8State. let builder' = builder <> skipIncomplete onErr s + -- Note: loop is strict on builder, so if onErr raises an error it will + -- be forced here, short-circuiting the loop as desired. in loop startUtf8State i builder' else let builder' = nonEmptyPrefix len diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs index 7f02c0f4..57f09139 100644 --- a/tests/Tests/Properties/Transcoding.hs +++ b/tests/Tests/Properties/Transcoding.hs @@ -331,6 +331,9 @@ t_infix_concat bs1 text bs2 = text `T.isInfixOf` E.decodeUtf8With onErr (B.concat [bs1, E.encodeUtf8 text, bs2]) +t_textToStrictBuilder = + (E.strictBuilderToText . E.textToStrictBuilder) `eq` id + testTranscoding :: TestTree testTranscoding = testGroup "transcoding" [ @@ -392,5 +395,8 @@ testTranscoding = testCase "t_decode_with_error3'" t_decode_with_error3', testCase "t_decode_with_error4'" t_decode_with_error4', testCase "t_decode_with_error5'" t_decode_with_error5' + ], + testGroup "strictBuilder" [ + testProperty "textToStrictBuilder" t_textToStrictBuilder ] ] From ace7c61764fa859a825a58b9a327f7239098733d Mon Sep 17 00:00:00 2001 From: Li-yao Xia Date: Sat, 4 Feb 2023 00:28:50 +0000 Subject: [PATCH 70/87] Update changelog.md --- changelog.md | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/changelog.md b/changelog.md index 136d89a3..e63283c1 100644 --- a/changelog.md +++ b/changelog.md @@ -4,28 +4,22 @@ ### 2.0.2 -* A suite of functions have been added in `Data.Text.Encoding` that - allow decoding to be aborted on errors without the need to raise an - `error` and `catch` it elsewhere: +* Add decoding functions in `Data.Text.Encoding` that allow + more control for error handling and for how to allocate text. + (https://github.com/haskell/text/pull/448 Thanks to David Sledge) * `decodeASCIIPrefix` - * `TextDataStack` - * `dataStack` - * `stackLen` - * `emptyStack` * `decodeUtf8Chunk` - * `decodeNextUtf8Chunk` - * `pushText` - * `stackToText` - -* Added functions to validate `ByteString`s that represent encoded text: + * `decodeUtf8More` * `Utf8ValidState` - * `partialUtf8CodePoint` - * `partUtf8CPLen` - * `wordAtPartUft8CP` - * `utf8CodePointState` - * `validateUtf8Chunk` - * `validateNextUtf8Chunk` * `startUtf8ValidState` + * `StrictBuilder` + * `strictBuilderToText` + * `textToStrictBuilder` + * `validateUtf8Chunk` + * `validateUtf8More` + +* Fix quadratic slowdown when decoding invalid UTF-8 bytestrings + (https://github.com/haskell/text/issues/495) ### 2.0.1 From 2603294e28d5ab6adcc2a4d4e3f74ff392a36c6e Mon Sep 17 00:00:00 2001 From: Li-yao Xia Date: Sat, 4 Feb 2023 03:37:57 +0000 Subject: [PATCH 71/87] Add laws --- src/Data/Text/Internal/Encoding.hs | 64 ++++++++-- tests/Tests/Properties/Transcoding.hs | 165 +++++++++++++++++++++++++- 2 files changed, 217 insertions(+), 12 deletions(-) diff --git a/src/Data/Text/Internal/Encoding.hs b/src/Data/Text/Internal/Encoding.hs index 7d89719f..54c84e34 100644 --- a/src/Data/Text/Internal/Encoding.hs +++ b/src/Data/Text/Internal/Encoding.hs @@ -183,9 +183,9 @@ foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8 -- - The prefix is valid UTF-8. In particular, it should be accepted -- by this validation: -- --- @ --- 'validateUtf8Chunk' ('Data.ByteString.take' n chunk) = (n, Just 'startUtf8State') --- @ +-- @ +-- 'validateUtf8Chunk' ('Data.ByteString.take' n chunk) = (n, Just 'startUtf8State') +-- @ -- -- @since 2.0.2 validateUtf8Chunk :: ByteString -> (Int, Maybe Utf8State) @@ -267,18 +267,18 @@ slowValidateUtf8ChunkFrom ofs bs = loop ofs ofs utf8StartState -- -- - If the chunk is invalid, it cannot be extended to be valid. -- --- @ --- ms = Nothing --- ==> 'validateUtf8More' s (chunk '<>' more) = (n, Nothing) --- @ +-- @ +-- ms = Nothing +-- ==> 'validateUtf8More' s (chunk '<>' more) = (n, Nothing) +-- @ -- -- - Validating two chunks sequentially is the same as validating them -- together at once: -- --- @ --- ms = Just s' --- ==> 'validateUtf8More' s (chunk '<>' more) = 'Data.Bifunctor.first' ('Data.ByteString.length' chunk '+') ('validateUtf8More' s' more) --- @ +-- @ +-- ms = Just s' +-- ==> 'validateUtf8More' s (chunk '<>' more) = 'Data.Bifunctor.first' ('Data.ByteString.length' chunk '+') ('validateUtf8More' s' more) +-- @ -- -- @since 2.0.2 validateUtf8More :: Utf8State -> ByteString -> (Int, Maybe Utf8State) @@ -379,6 +379,48 @@ textToStrictBuilder (Text src srcOfs n) = StrictBuilder n (\dst dstOfs -> -- (it will be within the first 4 bytes of the undecoded remainder). -- -- @since 2.0.2 +-- +-- === Properties +-- +-- Given: +-- +-- @ +-- (pre, suf, ms) = 'decodeUtf8More' s chunk +-- @ +-- +-- 1. If the output @pre@ is nonempty (alternatively, if @length chunk > length suf@) +-- +-- @ +-- s2b pre \`'Data.ByteString.append'\` suf = p2b s \`'Data.ByteString.append'\` chunk +-- @ +-- +-- where +-- +-- @ +-- s2b = 'Data.Text.Encoding.encodeUtf8' . 'Data.Text.Encoding.strictBuilderToText' +-- p2b = 'Data.Text.Internal.Encoding.partUtf8ToByteString' +-- @ +-- +-- 2. If the output @pre@ is empty (alternatively, if @length chunk = length suf@) +-- +-- @suf = chunk@ +-- +-- 3. Decoding chunks separately is equivalent to decoding their concatenation. +-- +-- Given: +-- +-- @ +-- (pre1, suf1, Just s1) = 'decodeUtf8More' s chunk1 +-- (pre2, suf2, ms2) = 'decodeUtf8More' s1 chunk2 +-- (pre3, suf3, ms3) = 'decodeUtf8More' s (chunk1 \`B.append\` chunk2) +-- @ +-- +-- we have: +-- +-- @ +-- s2b (pre1 '<>' pre2) = s2b pre3 +-- ms2 = ms3 +-- @ decodeUtf8More :: Utf8State -> ByteString -> (StrictBuilder, ByteString, Maybe Utf8State) decodeUtf8More s bs = case validateUtf8More s bs of diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs index 57f09139..7bf2067f 100644 --- a/tests/Tests/Properties/Transcoding.hs +++ b/tests/Tests/Properties/Transcoding.hs @@ -12,6 +12,8 @@ import Prelude hiding (head, tail) import Data.Bits ((.&.), shiftR) import Data.Char (chr, ord) import Data.Functor (void) +import Data.Maybe (isNothing) +import Data.Semigroup ((<>)) import Test.QuickCheck hiding ((.&.)) import Test.Tasty (TestTree, testGroup) import Test.Tasty.QuickCheck (testProperty) @@ -104,6 +106,105 @@ t_pn_utf8_3 = do assertBool "PartialUtf8 must be partial" $ E.partUtf8Len c < E.partUtf8CompleteLen c testValidateUtf8MoreFail s1 [0x80, 0x80] 1 +-- Precondition: (i, ms1) = E.validateUtf8More s chunk +-- +-- The index points to the end of the longest valid prefix +-- of prechunk `B.append` chunk +pre_validateUtf8More_validPrefix s chunk i = + let prechunk = E.partUtf8ToByteString (E.partialUtf8CodePoint s) in + -- Note: i <= 0 implies take i = id + let (j, ms2) = E.validateUtf8Chunk (B.take (B.length prechunk + i) (prechunk `B.append` chunk)) in + counterexample (show prechunk) $ + (B.length prechunk + i, ms2) === (j, Just E.startUtf8State) + +-- Precondition: (i, Nothing) = E.validateUtf8More s chunk +-- +-- Appending to an invalid chunk yields another invalid chunk. +pre_validateUtf8More_maximalPrefix s chunk i more = + E.validateUtf8More s (chunk `B.append` more) === (i, Nothing) + +-- Precondition: (i, Just s1) = E.validateUtf8More s chunk +pre_validateUtf8More_suffix s chunk i s1 = + if 0 <= i + then B.drop i chunk === p2b s1 -- The state s1 contains a suffix of the chunk. + else p2b s `B.append` chunk === p2b s1 -- Or the chunk extends the incomplete code point in s1. + +-- Precondition: (i, Just s1) = E.validateUtf8More s chunk1 +-- +-- Validating two chunks sequentially is equivalent to validating them at once. +pre_validateUtf8More_append s chunk1 s1 chunk2 = + let (j, ms2) = E.validateUtf8More s1 chunk2 in + (B.length chunk1 + j, ms2) === E.validateUtf8More s (chunk1 `B.append` chunk2) + +-- These wrappers use custom generators to satisfy the above properties. + +t_validateUtf8More_validPrefix = property $ do + cex@(s, chunk, i, _ms1) <- randomMoreChunk + pure $ counterexample (show cex) $ + pre_validateUtf8More_validPrefix s chunk i + +t_validateUtf8More_maximalPrefix = property $ do + -- We want chunks that fail validation: force their size to be big,.. + cex@(s, chunk, i, ms1) <- scale (* 3) arbitraryMoreChunk + pure $ counterexample (show cex) $ + -- ... and just use rejection sampling + isNothing ms1 ==> + pre_validateUtf8More_maximalPrefix s chunk i + +t_validateUtf8More_valid = property $ do + cex@(s, chunk1, i, s1, chunk2) <- validMoreChunks + pure $ counterexample (show cex) $ + pre_validateUtf8More_suffix s chunk1 i s1 .&&. + pre_validateUtf8More_append s chunk1 s1 chunk2 + +randomMoreChunk, arbitraryMoreChunk, validMoreChunk :: Gen (E.Utf8State, B.ByteString, Int, Maybe E.Utf8State) +randomMoreChunk = oneof [arbitraryMoreChunk, validMoreChunk] + +arbitraryMoreChunk = do + s <- randomUtf8State + chunk <- arbitrary + let (i, ms1) = E.validateUtf8More s chunk + pure (s, chunk, i, ms1) + +-- | Generate a random state by parsing a prefix of a Char +randomUtf8State :: Gen E.Utf8State +randomUtf8State = do + c <- arbitrary + chunk <- elements (B.inits (E.encodeUtf8 (T.singleton c))) + case E.validateUtf8Chunk chunk of + (_, Just s) -> pure s + (_, Nothing) -> error "should not happen" + +-- | Make a valid chunk, i.e., (s, chunk) such that +-- +-- validateUtf8More s chunk = (i, Just s1) +-- +-- Also returning i and s1 to not repeat work. +validMoreChunk = do + (s, chunk, i, s1, _chunk2) <- validMoreChunks + pure (s, chunk, i, Just s1) + +-- | Make a valid chunk by slicing a valid UTF8 bs, +-- and also provide a second chunk which is a valid extension +-- with 0.5 probability. +validMoreChunks :: Gen (E.Utf8State, B.ByteString, Int, E.Utf8State, B.ByteString) +validMoreChunks = do + bs <- E.encodeUtf8 <$> scale (* 3) arbitrary + -- Take an intermediate state. + -- No need to go too far since code points are at most 4 bytes long + i <- choose (0, 3) + let (bs0, bs1) = B.splitAt i bs + case E.validateUtf8Chunk bs0 of + (_, Just s) -> do + j <- choose (0, B.length bs1) + let (chunk1, chunk2') = B.splitAt j bs1 + case E.validateUtf8More s chunk1 of + (n1, Just s1) -> do + chunk2 <- oneof [pure chunk2', arbitrary] + pure (s, chunk1, n1, s1, chunk2) + (_, Nothing) -> error "should not happen" + (_, Nothing) -> error "should not happen" + t_utf8_c = (E.strictBuilderToText . fst3 . E.decodeUtf8Chunk . E.encodeUtf8) `eq` id t_utf8 = (E.decodeUtf8 . E.encodeUtf8) `eq` id t_utf8' = (E.decodeUtf8' . E.encodeUtf8) `eq` (id . Right) @@ -334,6 +435,60 @@ t_infix_concat bs1 text bs2 = t_textToStrictBuilder = (E.strictBuilderToText . E.textToStrictBuilder) `eq` id +-- decodeUtf8Chunk splits the input bytestring +t_decodeUtf8Chunk_split chunk = + let (pre, suf, _ms) = E.decodeUtf8Chunk chunk + in s2b pre `B.append` suf === chunk + +-- decodeUtf8More mostly splits the input bytestring, +-- also inserting bytes from the partial code point in s. +-- +-- This is wrapped by t_decodeUtf8More_split to have more +-- likely valid chunks. +t_decodeUtf8More_split' s chunk = + let (pre, suf, _ms) = E.decodeUtf8More s chunk + in if B.length chunk > B.length suf + then s2b pre `B.append` suf === p2b s `B.append` chunk + else suf === chunk + +-- The output state of decodeUtf8More contains the suffix. +-- +-- Precondition (valid chunk): ms = Just s' +pre_decodeUtf8More_suffix s chunk = + let (_pre, suf, ms) = E.decodeUtf8More s chunk + in case ms of + Nothing -> discard + Just s' -> if B.length chunk > B.length suf + then p2b s' === suf + else p2b s' === p2b s `B.append` suf + +-- Decoding chunks separately is equivalent to decoding their concatenation. +pre_decodeUtf8More_append s chunk1 chunk2 = + let (pre1, suf1, ms1) = E.decodeUtf8More s chunk1 in + case ms1 of + Nothing -> discard + Just s1 -> + let (pre2, suf2, ms2) = E.decodeUtf8More s1 chunk2 in + let (pre3, suf3, ms3) = E.decodeUtf8More s (chunk1 `B.append` chunk2) in + (s2b (pre1 <> pre2), ms2) === (s2b pre3, ms3) + +-- Properties for any chunk +-- (but do try to generate valid chunks often enough) +t_decodeUtf8More1 = property $ do + cex@(s, chunk, _, _) <- randomMoreChunk + pure $ counterexample (show cex) $ + t_decodeUtf8More_split' s chunk + +-- Properties that require valid chunks +t_decodeUtf8More2 = property $ do + cex@(s, chunk, _, _, chunk2) <- validMoreChunks + pure $ counterexample (show cex) $ + pre_decodeUtf8More_suffix s chunk .&&. + pre_decodeUtf8More_append s chunk chunk2 + +s2b = E.encodeUtf8 . E.strictBuilderToText +p2b = E.partUtf8ToByteString . E.partialUtf8CodePoint + testTranscoding :: TestTree testTranscoding = testGroup "transcoding" [ @@ -371,6 +526,11 @@ testTranscoding = testProperty "t_decode_with_error4" t_decode_with_error4, testProperty "t_infix_concat" t_infix_concat ], + testGroup "validate" [ + testProperty "t_validateUtf8More_validPrefix" t_validateUtf8More_validPrefix, + testProperty "t_validateUtf8More_maximalPrefix" t_validateUtf8More_maximalPrefix, + testProperty "t_validateUtf8More_valid" t_validateUtf8More_valid + ], testGroup "streaming" [ testProperty "t_utf8_undecoded" t_utf8_undecoded, testProperty "t_utf8_incr" t_utf8_incr, @@ -394,7 +554,10 @@ testTranscoding = testCase "t_decode_with_error2'" t_decode_with_error2', testCase "t_decode_with_error3'" t_decode_with_error3', testCase "t_decode_with_error4'" t_decode_with_error4', - testCase "t_decode_with_error5'" t_decode_with_error5' + testCase "t_decode_with_error5'" t_decode_with_error5', + testProperty "t_decodeUtf8Chunk_split" t_decodeUtf8Chunk_split, + testProperty "t_decodeUtf8More1" t_decodeUtf8More1, + testProperty "t_decodeUtf8More2" t_decodeUtf8More2 ], testGroup "strictBuilder" [ testProperty "textToStrictBuilder" t_textToStrictBuilder From 01344a27166a0c49350f29ce6fde58d720cdeaa1 Mon Sep 17 00:00:00 2001 From: Li-yao Xia Date: Sat, 4 Feb 2023 19:48:23 +0000 Subject: [PATCH 72/87] Add some inline pragmas --- src/Data/Text/Encoding.hs | 1 + src/Data/Text/Internal/Encoding.hs | 11 +++++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index 6a7a614f..cd3593de 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -175,6 +175,7 @@ decodeASCIIPrefix bs = if B.null bs Text (A.ByteArray arr) 0 len suffix = B.drop len bs in (prefix, suffix) +{-# INLINE decodeASCIIPrefix #-} -- | Length of the longest ASCII prefix. asciiPrefixLength :: ByteString -> Int diff --git a/src/Data/Text/Internal/Encoding.hs b/src/Data/Text/Internal/Encoding.hs index 54c84e34..893f426a 100644 --- a/src/Data/Text/Internal/Encoding.hs +++ b/src/Data/Text/Internal/Encoding.hs @@ -144,7 +144,7 @@ partUtf8UnsafeAppend c@(PartialUtf8CodePoint word) bs = then w + (fromIntegral (B.index bs i) `shiftL` fromIntegral (16 - 8 * (lenc + i))) else w --- | Fold a 'PartialUtf8CodePoint'. This avoids recursion so it unfolds to straightline code. +-- | Fold a 'PartialUtf8CodePoint'. This avoids recursion so it can unfold to straightline code. partUtf8Foldr :: (Word8 -> a -> a) -> a -> PartialUtf8CodePoint -> a partUtf8Foldr f x0 c = case partUtf8Len c of 0 -> x0 @@ -153,6 +153,7 @@ partUtf8Foldr f x0 c = case partUtf8Len c of _ -> build 0 (build 1 (build 2 x0)) where build i x = f (partUtf8UnsafeIndex c i) x +{-# INLINE partUtf8Foldr #-} -- | Convert 'PartialUtf8CodePoint' to 'ByteString'. partUtf8ToByteString :: PartialUtf8CodePoint -> B.ByteString @@ -340,14 +341,16 @@ byteStringToStrictBuilder bs = charToStrictBuilder :: Char -> StrictBuilder charToStrictBuilder c = StrictBuilder (utf8Length c) (\dst ofs -> void (Char.unsafeWrite dst ofs (safe c))) +{-# INLINE charToStrictBuilder #-} word8ToStrictBuilder :: Word8 -> StrictBuilder -word8ToStrictBuilder w = +word8ToStrictBuilder !w = StrictBuilder 1 (\dst ofs -> A.unsafeWrite dst ofs w) +-- Eta-expanded to inline partUtf8Foldr partUtf8ToStrictBuilder :: PartialUtf8CodePoint -> StrictBuilder -partUtf8ToStrictBuilder = - partUtf8Foldr ((<>) . word8ToStrictBuilder) emptyStrictBuilder +partUtf8ToStrictBuilder c = + partUtf8Foldr ((<>) . word8ToStrictBuilder) emptyStrictBuilder c -- | Use 'StrictBuilder' to build 'Text'. -- From c36c95f6f3173d76c55c51e33bfa8cbbebb257bb Mon Sep 17 00:00:00 2001 From: Li-yao Xia Date: Sun, 5 Feb 2023 22:52:56 +0000 Subject: [PATCH 73/87] Optimize first iteration of decodeUtf8With1 --- src/Data/Text/Internal/Encoding.hs | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/src/Data/Text/Internal/Encoding.hs b/src/Data/Text/Internal/Encoding.hs index 893f426a..316cce22 100644 --- a/src/Data/Text/Internal/Encoding.hs +++ b/src/Data/Text/Internal/Encoding.hs @@ -32,6 +32,7 @@ import Data.Text.Internal.Encoding.Utf8 (Utf8CodePointState, utf8StartState, updateUtf8State, isUtf8StateIsComplete, utf8Length) import qualified Data.ByteString as B import qualified Data.ByteString.Internal as BI +import qualified Data.ByteString.Short.Internal as SBS import qualified Data.Text.Array as A import qualified Data.Text.Internal.Unsafe.Char as Char #if defined(ASSERTS) @@ -324,6 +325,8 @@ emptyStrictBuilder :: StrictBuilder emptyStrictBuilder = StrictBuilder 0 (\_ _ -> pure ()) appendRStrictBuilder :: StrictBuilder -> StrictBuilder -> StrictBuilder +appendRStrictBuilder (StrictBuilder 0 _) b2 = b2 +appendRStrictBuilder b1 (StrictBuilder 0 _) = b1 appendRStrictBuilder (StrictBuilder n1 write1) (StrictBuilder n2 write2) = StrictBuilder (n1 + n2) (\dst ofs -> do write2 dst (ofs + n1) @@ -476,15 +479,28 @@ handleUtf8Error onErr w = case onErr w of Nothing -> emptyStrictBuilder -- | Helper for 'decodeUtfWith'. +-- +-- This could be shorter by calling decodeUtf8With2 directly, +-- but we make the first call validateUtf8Chunk directly so that decodeUtf8With1 :: #if defined(ASSERTS) HasCallStack => #endif (Word8 -> Maybe Char) -> ByteString -> Text -decodeUtf8With1 onErr bs0 = strictBuilderToText $ - builder <> skipIncomplete onErr s - where - (builder, _, s) = decodeUtf8With2 onErr startUtf8State bs0 +decodeUtf8With1 onErr bs = case validateUtf8Chunk bs of + (len, Just s) + | len == B.length bs -> + let !(SBS.SBS arr) = SBS.toShort bs in + Text (A.ByteArray arr) 0 len + | otherwise -> strictBuilderToText $ + byteStringToStrictBuilder (B.take len bs) <> skipIncomplete onErr s + (len, Nothing) -> + let (builder, _, s) = decodeUtf8With2 onErr startUtf8State (B.drop (len + 1) bs) in + strictBuilderToText $ + byteStringToStrictBuilder (B.take len bs) <> + handleUtf8Error onErr (B.index bs len) <> + builder <> + skipIncomplete onErr s -- | Helper for 'Data.Text.Encoding.decodeUtf8With', -- 'Data.Text.Encoding.streamDecodeUtf8With', and lazy From 89f22071c219397036319cdd1d4af6ece6c071dd Mon Sep 17 00:00:00 2001 From: Li-yao Xia Date: Sun, 5 Feb 2023 23:48:04 +0000 Subject: [PATCH 74/87] Go CPS --- src/Data/Text/Encoding.hs | 4 +- src/Data/Text/Internal/Encoding.hs | 139 ++++++++++++++++------------- src/Data/Text/Lazy/Encoding.hs | 6 +- 3 files changed, 81 insertions(+), 68 deletions(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index cd3593de..ecd89948 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -325,7 +325,7 @@ streamDecodeUtf8With :: streamDecodeUtf8With onErr = loop startUtf8State where loop s chunk = - let (builder, undecoded, s') = decodeUtf8With2 (onErr invalidUtf8Msg . Just) s chunk + let (builder, undecoded, s') = decodeUtf8With2 onErr invalidUtf8Msg s chunk in Some (strictBuilderToText builder) undecoded (loop s') -- | Decode a 'ByteString' containing UTF-8 encoded text. @@ -337,7 +337,7 @@ decodeUtf8With :: HasCallStack => #endif OnDecodeError -> ByteString -> Text -decodeUtf8With onErr = decodeUtf8With1 (onErr invalidUtf8Msg . Just) +decodeUtf8With onErr = decodeUtf8With1 onErr invalidUtf8Msg invalidUtf8Msg :: String invalidUtf8Msg = "Data.Text.Encoding: Invalid UTF-8 stream" diff --git a/src/Data/Text/Internal/Encoding.hs b/src/Data/Text/Internal/Encoding.hs index 316cce22..5236bf14 100644 --- a/src/Data/Text/Internal/Encoding.hs +++ b/src/Data/Text/Internal/Encoding.hs @@ -1,5 +1,6 @@ {-# LANGUAGE BangPatterns, CPP, GeneralizedNewtypeDeriving, MagicHash, UnliftedFFITypes #-} +{-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE Trustworthy #-} {-# LANGUAGE RankNTypes #-} {-# LANGUAGE ViewPatterns #-} @@ -20,16 +21,17 @@ import Data.Bits ((.&.), shiftL, shiftR) import Data.ByteString (ByteString) import Data.Functor (void) import Data.Semigroup (Semigroup(..)) -import Data.Text.Internal (Text(..), empty, safe) -import Data.Text.Internal.Unsafe (unsafeWithForeignPtr) -import Data.Text.Unsafe (unsafeDupablePerformIO) import Data.Word (Word32, Word8) import Foreign.C.Types (CSize(..)) import Foreign.Ptr (Ptr) import Foreign.Storable (pokeElemOff) +import Data.Text.Encoding.Error (OnDecodeError) +import Data.Text.Internal (Text(..), empty, safe) import Data.Text.Internal.ByteStringCompat (withBS) import Data.Text.Internal.Encoding.Utf8 (Utf8CodePointState, utf8StartState, updateUtf8State, isUtf8StateIsComplete, utf8Length) +import Data.Text.Internal.Unsafe (unsafeWithForeignPtr) +import Data.Text.Unsafe (unsafeDupablePerformIO) import qualified Data.ByteString as B import qualified Data.ByteString.Internal as BI import qualified Data.ByteString.Short.Internal as SBS @@ -146,6 +148,7 @@ partUtf8UnsafeAppend c@(PartialUtf8CodePoint word) bs = else w -- | Fold a 'PartialUtf8CodePoint'. This avoids recursion so it can unfold to straightline code. +{-# INLINE partUtf8Foldr #-} partUtf8Foldr :: (Word8 -> a -> a) -> a -> PartialUtf8CodePoint -> a partUtf8Foldr f x0 c = case partUtf8Len c of 0 -> x0 @@ -154,7 +157,6 @@ partUtf8Foldr f x0 c = case partUtf8Len c of _ -> build 0 (build 1 (build 2 x0)) where build i x = f (partUtf8UnsafeIndex c i) x -{-# INLINE partUtf8Foldr #-} -- | Convert 'PartialUtf8CodePoint' to 'ByteString'. partUtf8ToByteString :: PartialUtf8CodePoint -> B.ByteString @@ -191,15 +193,19 @@ foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8 -- -- @since 2.0.2 validateUtf8Chunk :: ByteString -> (Int, Maybe Utf8State) -validateUtf8Chunk = validateUtf8ChunkFrom 0 +validateUtf8Chunk bs = validateUtf8ChunkFrom 0 bs (,) --- | Add an offset to the index returned by 'validateUtf8More'. +-- Add an offset to the index returned by 'validateUtf8More'. -- -- @ --- validateUtf8ChunkFrom n = first (+ n) . 'validateUtf8More' . 'B.drop' n +-- validateUtf8ChunkFrom n bs (,) = (first (+ n) . 'validateUtf8More' . 'B.drop' n) bs -- @ -validateUtf8ChunkFrom :: Int -> ByteString -> (Int, Maybe Utf8State) -validateUtf8ChunkFrom ofs bs +-- +-- CPS: inlining the continuation lets us avoid allocating a @Maybe@ in the +-- @decode...@ functions. +{-# INLINE validateUtf8ChunkFrom #-} +validateUtf8ChunkFrom :: forall r. Int -> ByteString -> (Int -> Maybe Utf8State -> r) -> r +validateUtf8ChunkFrom ofs bs k #if defined(SIMDUTF) || MIN_VERSION_bytestring(0,11,2) | guessUtf8Boundary > 0 && -- the rest of the bytestring valid utf-8 up to the boundary @@ -211,36 +217,36 @@ validateUtf8ChunkFrom ofs bs #else B.isValidUtf8 $ B.take guessUtf8Boundary (B.drop ofs bs) #endif - ) = slowValidateUtf8ChunkFrom (ofs + guessUtf8Boundary) bs + ) = slowValidateUtf8ChunkFrom (ofs + guessUtf8Boundary) -- No - | otherwise = slowValidateUtf8ChunkFrom ofs bs - where - len = B.length bs - ofs - isBoundary n word8 = len >= n && word8 <= B.index bs (ofs + len - n) - guessUtf8Boundary - | isBoundary 3 0xf0 = len - 3 -- third to last char starts a four-byte code point - | isBoundary 2 0xe0 = len - 2 -- pre-last char starts a three-or-four-byte code point - | isBoundary 1 0xc2 = len - 1 -- last char starts a two-(or more-)byte code point - | otherwise = len + | otherwise = if guessUtf8Boundary == len then k len (Just startUtf8State) else slowValidateUtf8ChunkFrom ofs + where + len = B.length bs - ofs + isBoundary n p = len >= n && p (B.index bs (ofs + len - n)) + guessUtf8Boundary + | isBoundary 1 (<= 0x80) = len -- last char is ASCII + | isBoundary 1 (0xc2 <=) = len - 1 -- last char starts a two-(or more-)byte code point + | isBoundary 2 (0xe0 <=) = len - 2 -- pre-last char starts a three-or-four-byte code point + | isBoundary 3 (0xf0 <=) = len - 3 -- third to last char starts a four-byte code point + | otherwise = len #else - = slowValidateUtf8ChunkFrom ofs bs + = slowValidateUtf8ChunkFrom ofs + where #endif + -- A pure Haskell implementation of validateUtf8More. + -- Ideally the primitives 'B.isValidUtf8' or 'c_is_valid_utf8' should give us + -- indices to let us avoid this function. + slowValidateUtf8ChunkFrom :: Int -> r + slowValidateUtf8ChunkFrom ofs1 = slowLoop ofs1 ofs1 utf8StartState --- | A pure Haskell implementation of validateUtf8More. --- --- Ideally the primitives 'B.isValidUtf8' or 'c_is_valid_utf8' should give us --- indices to let us avoid this function. -slowValidateUtf8ChunkFrom :: Int -> ByteString -> (Int, Maybe Utf8State) -slowValidateUtf8ChunkFrom ofs bs = loop ofs ofs utf8StartState - where - loop !utf8End i s + slowLoop !utf8End i s | i < B.length bs = case updateUtf8State (B.index bs i) s of Just s' -> let utf8End' = if isUtf8StateIsComplete s' then i + 1 else utf8End - in loop utf8End' (i + 1) s' - Nothing -> (utf8End, Nothing) - | otherwise = (utf8End, Just (Utf8State s (partUtf8UnsafeAppend partUtf8Empty (B.drop utf8End bs)))) + in slowLoop utf8End' (i + 1) s' + Nothing -> k utf8End Nothing + | otherwise = k utf8End (Just (Utf8State s (partUtf8UnsafeAppend partUtf8Empty (B.drop utf8End bs)))) -- | Validate another 'ByteString' chunk in an ongoing stream of UTF-8-encoded text. -- @@ -284,20 +290,26 @@ slowValidateUtf8ChunkFrom ofs bs = loop ofs ofs utf8StartState -- -- @since 2.0.2 validateUtf8More :: Utf8State -> ByteString -> (Int, Maybe Utf8State) -validateUtf8More st@(Utf8State s0 part) bs +validateUtf8More st bs = validateUtf8MoreCont st bs (,) + +-- CPS: inlining the continuation lets us make more tail calls and avoid +-- allocating a @Maybe@ in @decodeWith1/2@. +{-# INLINE validateUtf8MoreCont #-} +validateUtf8MoreCont :: Utf8State -> ByteString -> (Int -> Maybe Utf8State -> r) -> r +validateUtf8MoreCont st@(Utf8State s0 part) bs k | len > 0 = loop 0 s0 - | otherwise = (- partUtf8Len part, Just st) + | otherwise = k (- partUtf8Len part) (Just st) where len = B.length bs -- Complete an incomplete code point (if there is one) -- and then jump to validateUtf8ChunkFrom loop !i s - | isUtf8StateIsComplete s = validateUtf8ChunkFrom i bs + | isUtf8StateIsComplete s = validateUtf8ChunkFrom i bs k | i < len = case updateUtf8State (B.index bs i) s of - Nothing -> (- partUtf8Len part, Nothing) + Nothing -> k (- partUtf8Len part) Nothing Just s' -> loop (i + 1) s' - | otherwise = (- partUtf8Len part, Just (Utf8State s (partUtf8UnsafeAppend part bs))) + | otherwise = k (- partUtf8Len part) (Just (Utf8State s (partUtf8UnsafeAppend part bs))) -- | A delayed representation of strict 'Text'. @@ -341,10 +353,10 @@ byteStringToStrictBuilder :: ByteString -> StrictBuilder byteStringToStrictBuilder bs = StrictBuilder (B.length bs) (\dst ofs -> copyFromByteString dst ofs bs) +{-# INLINE charToStrictBuilder #-} charToStrictBuilder :: Char -> StrictBuilder charToStrictBuilder c = StrictBuilder (utf8Length c) (\dst ofs -> void (Char.unsafeWrite dst ofs (safe c))) -{-# INLINE charToStrictBuilder #-} word8ToStrictBuilder :: Word8 -> StrictBuilder word8ToStrictBuilder !w = @@ -429,12 +441,11 @@ textToStrictBuilder (Text src srcOfs n) = StrictBuilder n (\dst dstOfs -> -- @ decodeUtf8More :: Utf8State -> ByteString -> (StrictBuilder, ByteString, Maybe Utf8State) decodeUtf8More s bs = - case validateUtf8More s bs of - (len, s') -> - let builder | len <= 0 = emptyStrictBuilder - | otherwise = partUtf8ToStrictBuilder (partialUtf8CodePoint s) - <> byteStringToStrictBuilder (B.take len bs) - in (builder, B.drop len bs, s') + validateUtf8MoreCont s bs $ \len ms -> + let builder | len <= 0 = emptyStrictBuilder + | otherwise = partUtf8ToStrictBuilder (partialUtf8CodePoint s) + <> byteStringToStrictBuilder (B.take len bs) + in (builder, B.drop len bs, ms) -- | Decode a chunk of UTF-8 text. To be continued with 'decodeUtf8More'. -- @@ -467,14 +478,16 @@ decodeUtf8Chunk = decodeUtf8More startUtf8State -- value. The bytes are the positions from 'errStart' (inclusive) to -- 'errEnd' (exclusive). Any substitute characters are pushed onto the -- supplied 'TextDataStack' argument. -skipIncomplete :: (Word8 -> Maybe Char) -> Utf8State -> StrictBuilder -skipIncomplete onErr s = +{-# INLINE skipIncomplete #-} +skipIncomplete :: OnDecodeError -> String -> Utf8State -> StrictBuilder +skipIncomplete onErr msg s = partUtf8Foldr - ((<>) . handleUtf8Error onErr) + ((<>) . handleUtf8Error onErr msg) emptyStrictBuilder (partialUtf8CodePoint s) -handleUtf8Error :: (Word8 -> Maybe Char) -> Word8 -> StrictBuilder -handleUtf8Error onErr w = case onErr w of +{-# INLINE handleUtf8Error #-} +handleUtf8Error :: OnDecodeError -> String -> Word8 -> StrictBuilder +handleUtf8Error onErr msg w = case onErr msg (Just w) of Just c -> charToStrictBuilder c Nothing -> emptyStrictBuilder @@ -486,21 +499,21 @@ decodeUtf8With1 :: #if defined(ASSERTS) HasCallStack => #endif - (Word8 -> Maybe Char) -> ByteString -> Text -decodeUtf8With1 onErr bs = case validateUtf8Chunk bs of - (len, Just s) + OnDecodeError -> String -> ByteString -> Text +decodeUtf8With1 onErr msg bs = validateUtf8ChunkFrom 0 bs $ \len ms -> case ms of + Just s | len == B.length bs -> let !(SBS.SBS arr) = SBS.toShort bs in Text (A.ByteArray arr) 0 len | otherwise -> strictBuilderToText $ - byteStringToStrictBuilder (B.take len bs) <> skipIncomplete onErr s - (len, Nothing) -> - let (builder, _, s) = decodeUtf8With2 onErr startUtf8State (B.drop (len + 1) bs) in + byteStringToStrictBuilder (B.take len bs) <> skipIncomplete onErr msg s + Nothing -> + let (builder, _, s) = decodeUtf8With2 onErr msg startUtf8State (B.drop (len + 1) bs) in strictBuilderToText $ byteStringToStrictBuilder (B.take len bs) <> - handleUtf8Error onErr (B.index bs len) <> + handleUtf8Error onErr msg (B.index bs len) <> builder <> - skipIncomplete onErr s + skipIncomplete onErr msg s -- | Helper for 'Data.Text.Encoding.decodeUtf8With', -- 'Data.Text.Encoding.streamDecodeUtf8With', and lazy @@ -512,28 +525,28 @@ decodeUtf8With2 :: #if defined(ASSERTS) HasCallStack => #endif - (Word8 -> Maybe Char) -> Utf8State -> ByteString -> (StrictBuilder, ByteString, Utf8State) -decodeUtf8With2 onErr s0 bs = loop s0 0 emptyStrictBuilder + OnDecodeError -> String -> Utf8State -> ByteString -> (StrictBuilder, ByteString, Utf8State) +decodeUtf8With2 onErr msg s0 bs = loop s0 0 emptyStrictBuilder where loop s i !builder = let nonEmptyPrefix len = builder <> partUtf8ToStrictBuilder (partialUtf8CodePoint s) <> byteStringToStrictBuilder (B.take len (B.drop i bs)) - in case validateUtf8More s (B.drop i bs) of - (len, Nothing) -> + in validateUtf8MoreCont s (B.drop i bs) $ \len ms -> case ms of + Nothing -> if len < 0 then -- If the first byte cannot complete the partial code point in s, -- retry from startUtf8State. - let builder' = builder <> skipIncomplete onErr s + let builder' = builder <> skipIncomplete onErr msg s -- Note: loop is strict on builder, so if onErr raises an error it will -- be forced here, short-circuiting the loop as desired. in loop startUtf8State i builder' else let builder' = nonEmptyPrefix len - <> handleUtf8Error onErr (B.index bs (i + len)) + <> handleUtf8Error onErr msg (B.index bs (i + len)) in loop startUtf8State (i + len + 1) builder' - (len, Just s') -> + Just s' -> let builder' = if len <= 0 then builder else nonEmptyPrefix len undecoded = if B.length bs >= partUtf8Len (partialUtf8CodePoint s') then B.drop (i + len) bs -- Reuse bs if possible diff --git a/src/Data/Text/Lazy/Encoding.hs b/src/Data/Text/Lazy/Encoding.hs index 1def3f51..5b6607bf 100644 --- a/src/Data/Text/Lazy/Encoding.hs +++ b/src/Data/Text/Lazy/Encoding.hs @@ -110,10 +110,10 @@ decodeUtf8With onErr = loop TE.startUtf8State where chunkb builder t | TE.sbLength builder == 0 = t | otherwise = Chunk (TE.strictBuilderToText builder) t - loop s (B.Chunk b bs) = case TE.decodeUtf8With2 onErr' s b of + loop s (B.Chunk b bs) = case TE.decodeUtf8With2 onErr msg s b of (builder, _, s') -> chunkb builder (loop s' bs) - loop s B.Empty = chunkb (TE.skipIncomplete onErr' s) Empty - onErr' = onErr "Data.Text.Internal.Encoding: Invalid UTF-8 stream" . Just + loop s B.Empty = chunkb (TE.skipIncomplete onErr msg s) Empty + msg = "Data.Text.Internal.Encoding: Invalid UTF-8 stream" -- | Decode a 'ByteString' containing UTF-8 encoded text that is known -- to be valid. From b3ec653b7394ee9f861f30d9a67825b7dd3aa377 Mon Sep 17 00:00:00 2001 From: Li-yao Xia Date: Mon, 6 Feb 2023 00:19:01 +0000 Subject: [PATCH 75/87] More doc, explicit exports --- src/Data/Text/Internal/Encoding.hs | 54 ++++++++++++++++++++++-------- 1 file changed, 40 insertions(+), 14 deletions(-) diff --git a/src/Data/Text/Internal/Encoding.hs b/src/Data/Text/Internal/Encoding.hs index 5236bf14..96e1e6c4 100644 --- a/src/Data/Text/Internal/Encoding.hs +++ b/src/Data/Text/Internal/Encoding.hs @@ -6,11 +6,27 @@ {-# LANGUAGE ViewPatterns #-} -- | +-- Module : Data.Text.Internal.Builder +-- License : BSD-style (see LICENSE) +-- Stability : experimental +-- +-- /Warning/: this is an internal module, and does not have a stable +-- API or name. Functions in this module may not check or enforce +-- preconditions expected by public modules. Use at your own risk! +-- -- Internals of "Data.Text.Encoding". -- --- If you'd like to depend on something from here that's not in "Data.Text.Encoding", --- please request to have it exported! -module Data.Text.Internal.Encoding where +-- @since 2.0.2 +module Data.Text.Internal.Encoding + ( validateUtf8Chunk + , validateUtf8More + , decodeUtf8Chunk + , decodeUtf8More + , decodeUtf8With1 + , decodeUtf8With2 + , Utf8State + , startUtf8State + ) where #if defined(ASSERTS) import Control.Exception (assert) @@ -20,7 +36,9 @@ import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) import Data.Bits ((.&.), shiftL, shiftR) import Data.ByteString (ByteString) import Data.Functor (void) +#if MIN_VERSION_base(4,11,0) import Data.Semigroup (Semigroup(..)) +#endif import Data.Word (Word32, Word8) import Foreign.C.Types (CSize(..)) import Foreign.Ptr (Ptr) @@ -43,8 +61,6 @@ import GHC.Stack (HasCallStack) #ifdef SIMDUTF import Foreign.C.Types (CInt(..)) -#elif !MIN_VERSION_bytestring(0,11,2) -import qualified Data.ByteString.Unsafe as B #endif -- | State of decoding a 'ByteString' in UTF-8. @@ -312,12 +328,12 @@ validateUtf8MoreCont st@(Utf8State s0 part) bs k | otherwise = k (- partUtf8Len part) (Just (Utf8State s (partUtf8UnsafeAppend part bs))) -- | A delayed representation of strict 'Text'. - --- For internal purposes, this is instead used as a delayed 'Array': --- it may not actually be valid 'Text' (e.g., 'word8ToStrictBuilder', --- 'byteStringToStrictBuilder', 'partUtf8ToStrictBuilder'). -- -- @since 2.0.2 + +-- For internal purposes, this is instead abused as a delayed 'Array': +-- it may not actually be valid 'Text' (e.g., 'word8ToStrictBuilder', +-- 'byteStringToStrictBuilder', 'utf8StateToStrictBuilder'). data StrictBuilder = StrictBuilder { sbLength :: {-# UNPACK #-} !Int , sbWrite :: forall s. A.MArray s -> Int -> ST s () @@ -367,6 +383,9 @@ partUtf8ToStrictBuilder :: PartialUtf8CodePoint -> StrictBuilder partUtf8ToStrictBuilder c = partUtf8Foldr ((<>) . word8ToStrictBuilder) emptyStrictBuilder c +utf8StateToStrictBuilder :: Utf8State -> StrictBuilder +utf8StateToStrictBuilder = partUtf8ToStrictBuilder . partialUtf8CodePoint + -- | Use 'StrictBuilder' to build 'Text'. -- -- @since 2.0.2 @@ -439,11 +458,13 @@ textToStrictBuilder (Text src srcOfs n) = StrictBuilder n (\dst dstOfs -> -- s2b (pre1 '<>' pre2) = s2b pre3 -- ms2 = ms3 -- @ +-- +-- @since 2.0.2 decodeUtf8More :: Utf8State -> ByteString -> (StrictBuilder, ByteString, Maybe Utf8State) decodeUtf8More s bs = validateUtf8MoreCont s bs $ \len ms -> let builder | len <= 0 = emptyStrictBuilder - | otherwise = partUtf8ToStrictBuilder (partialUtf8CodePoint s) + | otherwise = utf8StateToStrictBuilder s <> byteStringToStrictBuilder (B.take len bs) in (builder, B.drop len bs, ms) @@ -491,10 +512,13 @@ handleUtf8Error onErr msg w = case onErr msg (Just w) of Just c -> charToStrictBuilder c Nothing -> emptyStrictBuilder --- | Helper for 'decodeUtfWith'. +-- | Helper for 'Data.Text.Encoding.decodeUtfWith'. +-- +-- This could be shorter by calling decodeUtf8With2 directly, but we make the +-- first call validateUtf8Chunk directly to return even faster in successful +-- cases. -- --- This could be shorter by calling decodeUtf8With2 directly, --- but we make the first call validateUtf8Chunk directly so that +-- @since 2.0.2 decodeUtf8With1 :: #if defined(ASSERTS) HasCallStack => @@ -521,6 +545,8 @@ decodeUtf8With1 onErr msg bs = validateUtf8ChunkFrom 0 bs $ \len ms -> case ms o -- which use an 'OnDecodeError' to process bad bytes. -- -- See 'decodeUtf8Chunk' for a more flexible alternative. +-- +-- @since 2.0.2 decodeUtf8With2 :: #if defined(ASSERTS) HasCallStack => @@ -530,7 +556,7 @@ decodeUtf8With2 onErr msg s0 bs = loop s0 0 emptyStrictBuilder where loop s i !builder = let nonEmptyPrefix len = builder - <> partUtf8ToStrictBuilder (partialUtf8CodePoint s) + <> utf8StateToStrictBuilder s <> byteStringToStrictBuilder (B.take len (B.drop i bs)) in validateUtf8MoreCont s (B.drop i bs) $ \len ms -> case ms of Nothing -> From 7656b018bafd90a9e9de3a7a92567bbcce58cf1b Mon Sep 17 00:00:00 2001 From: Li-yao Xia Date: Mon, 6 Feb 2023 00:51:34 +0000 Subject: [PATCH 76/87] Make StrictBuilder module, explicit exports --- src/Data/Text/Internal/Encoding.hs | 135 ++++++++---------------- src/Data/Text/Internal/StrictBuilder.hs | 107 +++++++++++++++++++ src/Data/Text/Lazy/Encoding.hs | 3 +- tests/Tests/Properties/Transcoding.hs | 29 ++--- text.cabal | 1 + 5 files changed, 165 insertions(+), 110 deletions(-) diff --git a/src/Data/Text/Internal/Encoding.hs b/src/Data/Text/Internal/Encoding.hs index 96e1e6c4..df1fe903 100644 --- a/src/Data/Text/Internal/Encoding.hs +++ b/src/Data/Text/Internal/Encoding.hs @@ -26,17 +26,22 @@ module Data.Text.Internal.Encoding , decodeUtf8With2 , Utf8State , startUtf8State + , StrictBuilder() + , strictBuilderToText + , textToStrictBuilder + + -- * Internal + , skipIncomplete + , getCompleteLen + , getPartialUtf8 ) where #if defined(ASSERTS) import Control.Exception (assert) #endif -import Control.Monad.ST (ST, runST) -import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) import Data.Bits ((.&.), shiftL, shiftR) import Data.ByteString (ByteString) -import Data.Functor (void) -#if MIN_VERSION_base(4,11,0) +#if !MIN_VERSION_base(4,11,0) import Data.Semigroup (Semigroup(..)) #endif import Data.Word (Word32, Word8) @@ -44,17 +49,18 @@ import Foreign.C.Types (CSize(..)) import Foreign.Ptr (Ptr) import Foreign.Storable (pokeElemOff) import Data.Text.Encoding.Error (OnDecodeError) -import Data.Text.Internal (Text(..), empty, safe) +import Data.Text.Internal (Text(..)) import Data.Text.Internal.ByteStringCompat (withBS) import Data.Text.Internal.Encoding.Utf8 - (Utf8CodePointState, utf8StartState, updateUtf8State, isUtf8StateIsComplete, utf8Length) + (Utf8CodePointState, utf8StartState, updateUtf8State, isUtf8StateIsComplete) +import Data.Text.Internal.StrictBuilder (StrictBuilder) import Data.Text.Internal.Unsafe (unsafeWithForeignPtr) import Data.Text.Unsafe (unsafeDupablePerformIO) import qualified Data.ByteString as B import qualified Data.ByteString.Internal as BI import qualified Data.ByteString.Short.Internal as SBS import qualified Data.Text.Array as A -import qualified Data.Text.Internal.Unsafe.Char as Char +import qualified Data.Text.Internal.StrictBuilder as SB #if defined(ASSERTS) import GHC.Stack (HasCallStack) #endif @@ -63,6 +69,12 @@ import GHC.Stack (HasCallStack) import Foreign.C.Types (CInt(..)) #endif +strictBuilderToText :: StrictBuilder -> Text +strictBuilderToText = SB.toText + +textToStrictBuilder :: Text -> StrictBuilder +textToStrictBuilder = SB.fromText + -- | State of decoding a 'ByteString' in UTF-8. -- Enables stream decoding ('validateUtf8Chunk', 'validateUtf8More', -- 'decodeUtf8Chunk', 'decodeUtf8More'). @@ -179,6 +191,14 @@ partUtf8ToByteString :: PartialUtf8CodePoint -> B.ByteString partUtf8ToByteString c = BI.unsafeCreate (partUtf8Len c) $ \ptr -> partUtf8Foldr (\w k i -> pokeElemOff ptr i w >> k (i+1)) (\_ -> pure ()) c 0 +-- Exported for testing. +getCompleteLen :: Utf8State -> Int +getCompleteLen = partUtf8CompleteLen . partialUtf8CodePoint + +-- | Exported for testing. +getPartialUtf8 :: Utf8State -> B.ByteString +getPartialUtf8 = partUtf8ToByteString . partialUtf8CodePoint + #ifdef SIMDUTF foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8 :: Ptr Word8 -> CSize -> IO CInt @@ -327,89 +347,20 @@ validateUtf8MoreCont st@(Utf8State s0 part) bs k Just s' -> loop (i + 1) s' | otherwise = k (- partUtf8Len part) (Just (Utf8State s (partUtf8UnsafeAppend part bs))) --- | A delayed representation of strict 'Text'. --- --- @since 2.0.2 - --- For internal purposes, this is instead abused as a delayed 'Array': --- it may not actually be valid 'Text' (e.g., 'word8ToStrictBuilder', --- 'byteStringToStrictBuilder', 'utf8StateToStrictBuilder'). -data StrictBuilder = StrictBuilder - { sbLength :: {-# UNPACK #-} !Int - , sbWrite :: forall s. A.MArray s -> Int -> ST s () - } - --- | Concatenation of 'StrictBuilder' is right-biased: --- the right builder will be run first. This allows a builder to --- run tail-recursively when it was accumulated left-to-right. -instance Semigroup StrictBuilder where - (<>) = appendRStrictBuilder - -instance Monoid StrictBuilder where - mempty = emptyStrictBuilder - mappend = (<>) - -emptyStrictBuilder :: StrictBuilder -emptyStrictBuilder = StrictBuilder 0 (\_ _ -> pure ()) - -appendRStrictBuilder :: StrictBuilder -> StrictBuilder -> StrictBuilder -appendRStrictBuilder (StrictBuilder 0 _) b2 = b2 -appendRStrictBuilder b1 (StrictBuilder 0 _) = b1 -appendRStrictBuilder (StrictBuilder n1 write1) (StrictBuilder n2 write2) = - StrictBuilder (n1 + n2) (\dst ofs -> do - write2 dst (ofs + n1) - write1 dst ofs) - -copyFromByteString :: A.MArray s -> Int -> ByteString -> ST s () -copyFromByteString dst ofs src = withBS src $ \ srcFPtr len -> - unsafeIOToST $ unsafeWithForeignPtr srcFPtr $ \ srcPtr -> do - unsafeSTToIO $ A.copyFromPointer dst ofs srcPtr len - -byteStringToStrictBuilder :: ByteString -> StrictBuilder -byteStringToStrictBuilder bs = - StrictBuilder (B.length bs) (\dst ofs -> copyFromByteString dst ofs bs) - -{-# INLINE charToStrictBuilder #-} -charToStrictBuilder :: Char -> StrictBuilder -charToStrictBuilder c = - StrictBuilder (utf8Length c) (\dst ofs -> void (Char.unsafeWrite dst ofs (safe c))) - -word8ToStrictBuilder :: Word8 -> StrictBuilder -word8ToStrictBuilder !w = - StrictBuilder 1 (\dst ofs -> A.unsafeWrite dst ofs w) - -- Eta-expanded to inline partUtf8Foldr partUtf8ToStrictBuilder :: PartialUtf8CodePoint -> StrictBuilder partUtf8ToStrictBuilder c = - partUtf8Foldr ((<>) . word8ToStrictBuilder) emptyStrictBuilder c + partUtf8Foldr ((<>) . SB.unsafeFromWord8) mempty c utf8StateToStrictBuilder :: Utf8State -> StrictBuilder utf8StateToStrictBuilder = partUtf8ToStrictBuilder . partialUtf8CodePoint --- | Use 'StrictBuilder' to build 'Text'. --- --- @since 2.0.2 -strictBuilderToText :: StrictBuilder -> Text -strictBuilderToText (StrictBuilder 0 _) = empty -strictBuilderToText (StrictBuilder n write) = runST (do - dst <- A.new n - write dst 0 - arr <- A.unsafeFreeze dst - pure (Text arr 0 n)) - --- | Copy 'Text' in a 'StrictBuilder' --- --- @since 2.0.2 -textToStrictBuilder :: Text -> StrictBuilder -textToStrictBuilder (Text src srcOfs n) = StrictBuilder n (\dst dstOfs -> - A.copyI n dst dstOfs src srcOfs) - -- | Decode another chunk in an ongoing UTF-8 stream. -- -- Returns a triple: -- -- 1. A 'StrictBuilder' for the decoded chunk of text. You can accumulate --- chunks with @('<>')@ or output them with 'strictBuilderToText'. +-- chunks with @('<>')@ or output them with 'SB.toText'. -- 2. The undecoded remainder of the given chunk, for diagnosing errors -- and resuming (presumably after skipping some bytes). -- 3. 'Just' the new state, or 'Nothing' if an invalid byte was encountered @@ -434,7 +385,7 @@ textToStrictBuilder (Text src srcOfs n) = StrictBuilder n (\dst dstOfs -> -- where -- -- @ --- s2b = 'Data.Text.Encoding.encodeUtf8' . 'Data.Text.Encoding.strictBuilderToText' +-- s2b = 'Data.Text.Encoding.encodeUtf8' . 'Data.Text.Encoding.toText' -- p2b = 'Data.Text.Internal.Encoding.partUtf8ToByteString' -- @ -- @@ -463,9 +414,9 @@ textToStrictBuilder (Text src srcOfs n) = StrictBuilder n (\dst dstOfs -> decodeUtf8More :: Utf8State -> ByteString -> (StrictBuilder, ByteString, Maybe Utf8State) decodeUtf8More s bs = validateUtf8MoreCont s bs $ \len ms -> - let builder | len <= 0 = emptyStrictBuilder + let builder | len <= 0 = mempty | otherwise = utf8StateToStrictBuilder s - <> byteStringToStrictBuilder (B.take len bs) + <> SB.unsafeFromByteString (B.take len bs) in (builder, B.drop len bs, ms) -- | Decode a chunk of UTF-8 text. To be continued with 'decodeUtf8More'. @@ -487,7 +438,7 @@ decodeUtf8More s bs = -- @builder@ is a prefix and @rest@ is a suffix of @chunk@. -- -- @ --- 'Data.Text.Encoding.encodeUtf8' ('strictBuilderToText' builder) '<>' rest = chunk +-- 'Data.Text.Encoding.encodeUtf8' ('Data.Text.Encoding.strictBuilderToText' builder) '<>' rest = chunk -- @ -- -- @since 2.0.2 @@ -499,18 +450,20 @@ decodeUtf8Chunk = decodeUtf8More startUtf8State -- value. The bytes are the positions from 'errStart' (inclusive) to -- 'errEnd' (exclusive). Any substitute characters are pushed onto the -- supplied 'TextDataStack' argument. +-- +-- Exported for lazy 'Data.Text.Lazy.Encoding.decodeUtf8With'. {-# INLINE skipIncomplete #-} skipIncomplete :: OnDecodeError -> String -> Utf8State -> StrictBuilder skipIncomplete onErr msg s = partUtf8Foldr ((<>) . handleUtf8Error onErr msg) - emptyStrictBuilder (partialUtf8CodePoint s) + mempty (partialUtf8CodePoint s) {-# INLINE handleUtf8Error #-} handleUtf8Error :: OnDecodeError -> String -> Word8 -> StrictBuilder handleUtf8Error onErr msg w = case onErr msg (Just w) of - Just c -> charToStrictBuilder c - Nothing -> emptyStrictBuilder + Just c -> SB.fromChar c + Nothing -> mempty -- | Helper for 'Data.Text.Encoding.decodeUtfWith'. -- @@ -529,12 +482,12 @@ decodeUtf8With1 onErr msg bs = validateUtf8ChunkFrom 0 bs $ \len ms -> case ms o | len == B.length bs -> let !(SBS.SBS arr) = SBS.toShort bs in Text (A.ByteArray arr) 0 len - | otherwise -> strictBuilderToText $ - byteStringToStrictBuilder (B.take len bs) <> skipIncomplete onErr msg s + | otherwise -> SB.toText $ + SB.unsafeFromByteString (B.take len bs) <> skipIncomplete onErr msg s Nothing -> let (builder, _, s) = decodeUtf8With2 onErr msg startUtf8State (B.drop (len + 1) bs) in - strictBuilderToText $ - byteStringToStrictBuilder (B.take len bs) <> + SB.toText $ + SB.unsafeFromByteString (B.take len bs) <> handleUtf8Error onErr msg (B.index bs len) <> builder <> skipIncomplete onErr msg s @@ -552,12 +505,12 @@ decodeUtf8With2 :: HasCallStack => #endif OnDecodeError -> String -> Utf8State -> ByteString -> (StrictBuilder, ByteString, Utf8State) -decodeUtf8With2 onErr msg s0 bs = loop s0 0 emptyStrictBuilder +decodeUtf8With2 onErr msg s0 bs = loop s0 0 mempty where loop s i !builder = let nonEmptyPrefix len = builder <> utf8StateToStrictBuilder s - <> byteStringToStrictBuilder (B.take len (B.drop i bs)) + <> SB.unsafeFromByteString (B.take len (B.drop i bs)) in validateUtf8MoreCont s (B.drop i bs) $ \len ms -> case ms of Nothing -> if len < 0 diff --git a/src/Data/Text/Internal/StrictBuilder.hs b/src/Data/Text/Internal/StrictBuilder.hs index e69de29b..a8cdcad0 100644 --- a/src/Data/Text/Internal/StrictBuilder.hs +++ b/src/Data/Text/Internal/StrictBuilder.hs @@ -0,0 +1,107 @@ +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE RankNTypes #-} + +-- | +-- Module : Data.Text.Internal.Builder +-- License : BSD-style (see LICENSE) +-- Stability : experimental +-- +-- /Warning/: this is an internal module, and does not have a stable +-- API or name. Functions in this module may not check or enforce +-- preconditions expected by public modules. Use at your own risk! +-- +-- @since 2.0.2 + +module Data.Text.Internal.StrictBuilder + ( StrictBuilder(..) + , toText + , fromChar + , fromText + + -- * Unsafe + , unsafeFromByteString + , unsafeFromWord8 + ) where + +import Control.Monad.ST (ST, runST) +import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) +import Data.Functor (void) +import Data.Word (Word8) +import Data.ByteString (ByteString) +import Data.Text.Internal (Text(..), empty, safe) +import Data.Text.Internal.ByteStringCompat (withBS) +import Data.Text.Internal.Encoding.Utf8 (utf8Length) +import Data.Text.Internal.Unsafe (unsafeWithForeignPtr) +import qualified Data.ByteString as B +import qualified Data.Text.Array as A +import qualified Data.Text.Internal.Unsafe.Char as Char + +-- | A delayed representation of strict 'Text'. +-- +-- @since 2.0.2 + +-- For internal purposes, this is instead abused as a delayed 'Array': +-- it may not actually be valid 'Text' (e.g., 'unsafeFromWord8', +-- 'unsafeFromByteString'). +data StrictBuilder = StrictBuilder + { sbLength :: {-# UNPACK #-} !Int + , sbWrite :: forall s. A.MArray s -> Int -> ST s () + } + +-- | Use 'StrictBuilder' to build 'Text'. +-- +-- @since 2.0.2 +toText :: StrictBuilder -> Text +toText (StrictBuilder 0 _) = empty +toText (StrictBuilder n write) = runST (do + dst <- A.new n + write dst 0 + arr <- A.unsafeFreeze dst + pure (Text arr 0 n)) + +-- | Concatenation of 'StrictBuilder' is right-biased: +-- the right builder will be run first. This allows a builder to +-- run tail-recursively when it was accumulated left-to-right. +instance Semigroup StrictBuilder where + (<>) = appendRStrictBuilder + +instance Monoid StrictBuilder where + mempty = emptyStrictBuilder + mappend = (<>) + +emptyStrictBuilder :: StrictBuilder +emptyStrictBuilder = StrictBuilder 0 (\_ _ -> pure ()) + +appendRStrictBuilder :: StrictBuilder -> StrictBuilder -> StrictBuilder +appendRStrictBuilder (StrictBuilder 0 _) b2 = b2 +appendRStrictBuilder b1 (StrictBuilder 0 _) = b1 +appendRStrictBuilder (StrictBuilder n1 write1) (StrictBuilder n2 write2) = + StrictBuilder (n1 + n2) (\dst ofs -> do + write2 dst (ofs + n1) + write1 dst ofs) + +copyFromByteString :: A.MArray s -> Int -> ByteString -> ST s () +copyFromByteString dst ofs src = withBS src $ \ srcFPtr len -> + unsafeIOToST $ unsafeWithForeignPtr srcFPtr $ \ srcPtr -> do + unsafeSTToIO $ A.copyFromPointer dst ofs srcPtr len + +-- | Copy a 'ByteString'. Note: This may not be valid text. +unsafeFromByteString :: ByteString -> StrictBuilder +unsafeFromByteString bs = + StrictBuilder (B.length bs) (\dst ofs -> copyFromByteString dst ofs bs) + +{-# INLINE fromChar #-} +fromChar :: Char -> StrictBuilder +fromChar c = + StrictBuilder (utf8Length c) (\dst ofs -> void (Char.unsafeWrite dst ofs (safe c))) + +unsafeFromWord8 :: Word8 -> StrictBuilder +unsafeFromWord8 !w = + StrictBuilder 1 (\dst ofs -> A.unsafeWrite dst ofs w) + +-- | Copy 'Text' in a 'StrictBuilder' +-- +-- @since 2.0.2 +fromText :: Text -> StrictBuilder +fromText (Text src srcOfs n) = StrictBuilder n (\dst dstOfs -> + A.copyI n dst dstOfs src srcOfs) diff --git a/src/Data/Text/Lazy/Encoding.hs b/src/Data/Text/Lazy/Encoding.hs index 5b6607bf..06a06224 100644 --- a/src/Data/Text/Lazy/Encoding.hs +++ b/src/Data/Text/Lazy/Encoding.hs @@ -69,6 +69,7 @@ import qualified Data.Text.Encoding as TE import qualified Data.Text.Internal.Encoding as TE import qualified Data.Text.Internal.Lazy.Encoding.Fusion as E import qualified Data.Text.Internal.Lazy.Fusion as F +import qualified Data.Text.Internal.StrictBuilder as SB import Data.Text.Unsafe (unsafeDupablePerformIO) -- $strict @@ -108,7 +109,7 @@ decodeLatin1 = foldr (chunk . TE.decodeLatin1) empty . B.toChunks decodeUtf8With :: OnDecodeError -> B.ByteString -> Text decodeUtf8With onErr = loop TE.startUtf8State where - chunkb builder t | TE.sbLength builder == 0 = t + chunkb builder t | SB.sbLength builder == 0 = t | otherwise = Chunk (TE.strictBuilderToText builder) t loop s (B.Chunk b bs) = case TE.decodeUtf8With2 onErr msg s b of (builder, _, s') -> chunkb builder (loop s' bs) diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs index 7bf2067f..49ce4f2e 100644 --- a/tests/Tests/Properties/Transcoding.hs +++ b/tests/Tests/Properties/Transcoding.hs @@ -1,19 +1,20 @@ -- | Tests for encoding and decoding -{-# LANGUAGE OverloadedStrings, ScopedTypeVariables #-} +{-# LANGUAGE CPP, OverloadedStrings, ScopedTypeVariables #-} {-# OPTIONS_GHC -fno-warn-missing-signatures #-} {-# LANGUAGE BangPatterns #-} module Tests.Properties.Transcoding ( testTranscoding ) where --- import Debug.Trace (trace) import Prelude hiding (head, tail) import Data.Bits ((.&.), shiftR) import Data.Char (chr, ord) import Data.Functor (void) import Data.Maybe (isNothing) +#if !MIN_VERSION_base(4,11,0) import Data.Semigroup ((<>)) +#endif import Test.QuickCheck hiding ((.&.)) import Test.Tasty (TestTree, testGroup) import Test.Tasty.QuickCheck (testProperty) @@ -102,8 +103,7 @@ t_pn_utf8_2 = do pure () t_pn_utf8_3 = do s1 <- testValidateUtf8 [0xc2] 0 - let c = E.partialUtf8CodePoint s1 - assertBool "PartialUtf8 must be partial" $ E.partUtf8Len c < E.partUtf8CompleteLen c + assertBool "PartialUtf8 must be partial" $ B.length (E.getPartialUtf8 s1) < E.getCompleteLen s1 testValidateUtf8MoreFail s1 [0x80, 0x80] 1 -- Precondition: (i, ms1) = E.validateUtf8More s chunk @@ -111,7 +111,7 @@ t_pn_utf8_3 = do -- The index points to the end of the longest valid prefix -- of prechunk `B.append` chunk pre_validateUtf8More_validPrefix s chunk i = - let prechunk = E.partUtf8ToByteString (E.partialUtf8CodePoint s) in + let prechunk = E.getPartialUtf8 s in -- Note: i <= 0 implies take i = id let (j, ms2) = E.validateUtf8Chunk (B.take (B.length prechunk + i) (prechunk `B.append` chunk)) in counterexample (show prechunk) $ @@ -351,11 +351,6 @@ t_decode_utf8_lenient :: Property t_decode_utf8_lenient = forAllShrinkShow arbitrary shrink (show . BL.toChunks) $ \bs -> decodeLL bs === (TL.fromStrict . decodeL . B.concat . BL.toChunks) bs --- t_decode_utf8_lenient = --- let !res = trace "HEREHEREHERE" $ --- EL.decodeUtf8With E.lenientDecode (BS.Chunk (B.pack [0xe1]) (BS.Chunk (B.pack [0xa0]) BS.Empty)) === TL.fromStrict (T.pack "\xFFFD\xFFFD") in --- trace "THERETHERETHERE" res - -- See http://unicode.org/faq/utf_bom.html#gen8 -- A sequence such as <110xxxxx2 0xxxxxxx2> is illegal ... -- When faced with this illegal byte sequence ... a UTF-8 conformant process @@ -401,8 +396,7 @@ testDecodeUtf8With k s xs expected = if T.null txt then bs @?= xs' else - let c = E.partUtf8ToByteString (E.partialUtf8CodePoint s) - in E.encodeUtf8 txt `B.append` bs @?= c `B.append` xs' + E.encodeUtf8 txt `B.append` bs @?= E.getPartialUtf8 s `B.append` xs' k s' testDecodeUtf8 :: E.Utf8State -> [Word8] -> T.Text -> IO E.Utf8State @@ -417,8 +411,7 @@ testDecodeUtf8Fail = testDecodeUtf8With (\ms -> case ms of t_decode_chunk1 = do s1 <- testDecodeUtf8 E.startUtf8State [0xc2] "" - let c1 = E.partialUtf8CodePoint s1 - E.partUtf8Len c1 @?= 1 + B.length (E.getPartialUtf8 s1) @?= 1 testDecodeUtf8Fail s1 [0x80, 0x80] "\128" t_decode_chunk2 = do @@ -464,12 +457,12 @@ pre_decodeUtf8More_suffix s chunk = -- Decoding chunks separately is equivalent to decoding their concatenation. pre_decodeUtf8More_append s chunk1 chunk2 = - let (pre1, suf1, ms1) = E.decodeUtf8More s chunk1 in + let (pre1, _, ms1) = E.decodeUtf8More s chunk1 in case ms1 of Nothing -> discard Just s1 -> - let (pre2, suf2, ms2) = E.decodeUtf8More s1 chunk2 in - let (pre3, suf3, ms3) = E.decodeUtf8More s (chunk1 `B.append` chunk2) in + let (pre2, _, ms2) = E.decodeUtf8More s1 chunk2 in + let (pre3, _, ms3) = E.decodeUtf8More s (chunk1 `B.append` chunk2) in (s2b (pre1 <> pre2), ms2) === (s2b pre3, ms3) -- Properties for any chunk @@ -487,7 +480,7 @@ t_decodeUtf8More2 = property $ do pre_decodeUtf8More_append s chunk chunk2 s2b = E.encodeUtf8 . E.strictBuilderToText -p2b = E.partUtf8ToByteString . E.partialUtf8CodePoint +p2b = E.getPartialUtf8 testTranscoding :: TestTree testTranscoding = diff --git a/text.cabal b/text.cabal index 0d420fa2..05946247 100644 --- a/text.cabal +++ b/text.cabal @@ -169,6 +169,7 @@ library Data.Text.Internal.Private Data.Text.Internal.Read Data.Text.Internal.Search + Data.Text.Internal.StrictBuilder Data.Text.Internal.Unsafe Data.Text.Internal.Unsafe.Char Data.Text.Lazy From f663a6e64beb726a23ada780798afa0fb094ac87 Mon Sep 17 00:00:00 2001 From: Li-yao Xia Date: Mon, 6 Feb 2023 01:20:57 +0000 Subject: [PATCH 77/87] Revert changes to Data.Text.Internal.Encoding.Utf8 --- changelog.md | 6 +++ src/Data/Text/Internal/Encoding.hs | 29 ++++++----- src/Data/Text/Internal/Encoding/Utf8.hs | 66 ++++++++++++++++++------- 3 files changed, 69 insertions(+), 32 deletions(-) diff --git a/changelog.md b/changelog.md index e63283c1..2b712fc6 100644 --- a/changelog.md +++ b/changelog.md @@ -21,6 +21,12 @@ * Fix quadratic slowdown when decoding invalid UTF-8 bytestrings (https://github.com/haskell/text/issues/495) +* Add internal module `Data.Text.Internal.StrictBuilder` + +* Add internal module `Data.Text.Internal.Encoding` + +* Add `Data.Text.Internal.Encoding.Utf8.updateDecoderState` + ### 2.0.1 * Improve portability of C and C++ code. diff --git a/src/Data/Text/Internal/Encoding.hs b/src/Data/Text/Internal/Encoding.hs index df1fe903..95ec8f3c 100644 --- a/src/Data/Text/Internal/Encoding.hs +++ b/src/Data/Text/Internal/Encoding.hs @@ -52,7 +52,7 @@ import Data.Text.Encoding.Error (OnDecodeError) import Data.Text.Internal (Text(..)) import Data.Text.Internal.ByteStringCompat (withBS) import Data.Text.Internal.Encoding.Utf8 - (Utf8CodePointState, utf8StartState, updateUtf8State, isUtf8StateIsComplete) + (DecoderState, utf8AcceptState, utf8RejectState, updateDecoderState) import Data.Text.Internal.StrictBuilder (StrictBuilder) import Data.Text.Internal.Unsafe (unsafeWithForeignPtr) import Data.Text.Unsafe (unsafeDupablePerformIO) @@ -92,17 +92,17 @@ textToStrictBuilder = SB.fromText -- @since 2.0.2 data Utf8State = Utf8State { -- | State of the UTF-8 state machine. - utf8CodePointState :: {-# UNPACK #-} !Utf8CodePointState + utf8CodePointState :: {-# UNPACK #-} !DecoderState -- | Bytes of the currently incomplete code point (if any). , partialUtf8CodePoint :: {-# UNPACK #-} !PartialUtf8CodePoint } - deriving (Eq, Ord, Show) + deriving (Eq, Show) -- | Initial 'Utf8State'. -- -- @since 2.0.2 startUtf8State :: Utf8State -startUtf8State = Utf8State utf8StartState partUtf8Empty +startUtf8State = Utf8State utf8AcceptState partUtf8Empty -- | Prefix of a UTF-8 code point encoded in 4 bytes, -- possibly empty. @@ -117,7 +117,7 @@ startUtf8State = Utf8State utf8StartState partUtf8Empty -- -- @since 2.0.2 newtype PartialUtf8CodePoint = PartialUtf8CodePoint Word32 - deriving (Eq, Ord, Show) + deriving (Eq, Show) -- | Empty prefix. partUtf8Empty :: PartialUtf8CodePoint @@ -273,15 +273,14 @@ validateUtf8ChunkFrom ofs bs k -- Ideally the primitives 'B.isValidUtf8' or 'c_is_valid_utf8' should give us -- indices to let us avoid this function. slowValidateUtf8ChunkFrom :: Int -> r - slowValidateUtf8ChunkFrom ofs1 = slowLoop ofs1 ofs1 utf8StartState + slowValidateUtf8ChunkFrom ofs1 = slowLoop ofs1 ofs1 utf8AcceptState slowLoop !utf8End i s | i < B.length bs = - case updateUtf8State (B.index bs i) s of - Just s' -> - let utf8End' = if isUtf8StateIsComplete s' then i + 1 else utf8End - in slowLoop utf8End' (i + 1) s' - Nothing -> k utf8End Nothing + case updateDecoderState (B.index bs i) s of + s' | s' == utf8RejectState -> k utf8End Nothing + | s' == utf8AcceptState -> slowLoop (i + 1) (i + 1) s' + | otherwise -> slowLoop utf8End (i + 1) s' | otherwise = k utf8End (Just (Utf8State s (partUtf8UnsafeAppend partUtf8Empty (B.drop utf8End bs)))) -- | Validate another 'ByteString' chunk in an ongoing stream of UTF-8-encoded text. @@ -340,11 +339,11 @@ validateUtf8MoreCont st@(Utf8State s0 part) bs k -- Complete an incomplete code point (if there is one) -- and then jump to validateUtf8ChunkFrom loop !i s - | isUtf8StateIsComplete s = validateUtf8ChunkFrom i bs k + | s == utf8AcceptState = validateUtf8ChunkFrom i bs k | i < len = - case updateUtf8State (B.index bs i) s of - Nothing -> k (- partUtf8Len part) Nothing - Just s' -> loop (i + 1) s' + case updateDecoderState (B.index bs i) s of + s' | s' == utf8RejectState -> k (- partUtf8Len part) Nothing + | otherwise -> loop (i + 1) s' | otherwise = k (- partUtf8Len part) (Just (Utf8State s (partUtf8UnsafeAppend part bs))) -- Eta-expanded to inline partUtf8Foldr diff --git a/src/Data/Text/Internal/Encoding/Utf8.hs b/src/Data/Text/Internal/Encoding/Utf8.hs index 148995a8..c8357677 100644 --- a/src/Data/Text/Internal/Encoding/Utf8.hs +++ b/src/Data/Text/Internal/Encoding/Utf8.hs @@ -34,10 +34,14 @@ module Data.Text.Internal.Encoding.Utf8 , validate3 , validate4 -- * Naive decoding - , Utf8CodePointState - , utf8StartState - , updateUtf8State - , isUtf8StateIsComplete + , DecoderState(..) + , utf8AcceptState + , utf8RejectState + , updateDecoderState + , DecoderResult(..) + , CodePoint(..) + , utf8DecodeStart + , utf8DecodeContinue ) where #if defined(ASSERTS) @@ -45,7 +49,7 @@ import Control.Exception (assert) import GHC.Stack (HasCallStack) #endif import Data.Bits (Bits(..), FiniteBits(..)) -import Data.Char (ord) +import Data.Char (ord, chr) import GHC.Exts import GHC.Word (Word8(..)) @@ -242,14 +246,17 @@ byteToClass n = ByteClass (W8# el#) table# :: Addr# table# = "\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\b\b\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\n\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\EOT\ETX\ETX\v\ACK\ACK\ACK\ENQ\b\b\b\b\b\b\b\b\b\b\b"# -newtype Utf8CodePointState = Utf8CodePointState Word8 - deriving (Eq, Ord, Show) +newtype DecoderState = DecoderState Word8 + deriving (Eq, Show) -utf8StartState :: Utf8CodePointState -utf8StartState = Utf8CodePointState 0 +utf8AcceptState :: DecoderState +utf8AcceptState = DecoderState 0 -transitionUtf8State :: ByteClass -> Utf8CodePointState -> Utf8CodePointState -transitionUtf8State (ByteClass c) (Utf8CodePointState s) = Utf8CodePointState (W8# el#) +utf8RejectState :: DecoderState +utf8RejectState = DecoderState 12 + +updateState :: ByteClass -> DecoderState -> DecoderState +updateState (ByteClass c) (DecoderState s) = DecoderState (W8# el#) where !(I# n#) = word8ToInt (c + s) el# = indexWord8OffAddr# table# n# @@ -257,10 +264,35 @@ transitionUtf8State (ByteClass c) (Utf8CodePointState s) = Utf8CodePointState (W table# :: Addr# table# = "\NUL\f\CAN$<`T\f\f\f0H\f\f\f\f\f\f\f\f\f\f\f\f\f\NUL\f\f\f\f\f\NUL\f\NUL\f\f\f\CAN\f\f\f\f\f\CAN\f\CAN\f\f\f\f\f\f\f\f\f\CAN\f\f\f\f\f\CAN\f\f\f\f\f\f\f\CAN\f\f\f\f\f\f\f\f\f$\f$\f\f\f$\f\f\f\f\f$\f$\f\f\f$\f\f\f\f\f\f\f\f\f\f"# -updateUtf8State :: Word8 -> Utf8CodePointState -> Maybe Utf8CodePointState -updateUtf8State w st = case transitionUtf8State (byteToClass w) st of - Utf8CodePointState 12 -> Nothing - st' -> Just st' +updateDecoderState :: Word8 -> DecoderState -> DecoderState +updateDecoderState b s = updateState (byteToClass b) s + +newtype CodePoint = CodePoint Int -isUtf8StateIsComplete :: Utf8CodePointState -> Bool -isUtf8StateIsComplete (Utf8CodePointState s) = s == 0 +-- | @since 2.0 +data DecoderResult + = Accept !Char + | Incomplete !DecoderState !CodePoint + | Reject + +-- | @since 2.0 +utf8DecodeStart :: Word8 -> DecoderResult +utf8DecodeStart !w + | st == utf8AcceptState = Accept (chr (word8ToInt w)) + | st == utf8RejectState = Reject + | otherwise = Incomplete st (CodePoint cp) + where + cl@(ByteClass cl') = byteToClass w + st = updateState cl utf8AcceptState + cp = word8ToInt $ (0xff `unsafeShiftR` word8ToInt cl') .&. w + +-- | @since 2.0 +utf8DecodeContinue :: Word8 -> DecoderState -> CodePoint -> DecoderResult +utf8DecodeContinue !w !st (CodePoint !cp) + | st' == utf8AcceptState = Accept (chr cp') + | st' == utf8RejectState = Reject + | otherwise = Incomplete st' (CodePoint cp') + where + cl = byteToClass w + st' = updateState cl st + cp' = (cp `shiftL` 6) .|. word8ToInt (w .&. 0x3f) From bc659212ee2e916f97c749f8e5264f09153344be Mon Sep 17 00:00:00 2001 From: Li-yao Xia Date: Mon, 6 Feb 2023 01:32:24 +0000 Subject: [PATCH 78/87] Docs --- src/Data/Text/Encoding.hs | 4 ++-- src/Data/Text/Internal/Encoding.hs | 21 ++++++++++----------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index ecd89948..498ff889 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -547,12 +547,12 @@ encodeUtf32BE txt = E.unstream (E.restreamUtf32BE (F.stream txt)) -- The functions 'decodeUtf8Chunk' and 'decodeUtf8More' provide more -- control for error-handling and streaming. -- --- - You get an UTF-8 prefix of the given 'ByteString' up to the next error. +-- - Those functions return an UTF-8 prefix of the given 'ByteString' up to the next error. -- For example this lets you insert or delete arbitrary text, or do some -- stateful operations before resuming, such as keeping track of error locations. -- In contrast, the older stream-oriented interface only lets you substitute -- a single fixed 'Char' for each invalid byte in 'OnDecodeError'. --- - The prefix is encoded as a 'StrictBuilder', so you can accumulate chunks +-- - That prefix is encoded as a 'StrictBuilder', so you can accumulate chunks -- before doing the copying work to construct a 'Text', or you can -- output decoded fragments immediately as a lazy 'Data.Text.Lazy.Text'. -- diff --git a/src/Data/Text/Internal/Encoding.hs b/src/Data/Text/Internal/Encoding.hs index 95ec8f3c..6bec2215 100644 --- a/src/Data/Text/Internal/Encoding.hs +++ b/src/Data/Text/Internal/Encoding.hs @@ -76,7 +76,7 @@ textToStrictBuilder :: Text -> StrictBuilder textToStrictBuilder = SB.fromText -- | State of decoding a 'ByteString' in UTF-8. --- Enables stream decoding ('validateUtf8Chunk', 'validateUtf8More', +-- Enables incremental decoding ('validateUtf8Chunk', 'validateUtf8More', -- 'decodeUtf8Chunk', 'decodeUtf8More'). -- Internal invariant: @@ -444,13 +444,12 @@ decodeUtf8More s bs = decodeUtf8Chunk :: ByteString -> (StrictBuilder, ByteString, Maybe Utf8State) decodeUtf8Chunk = decodeUtf8More startUtf8State --- | Call an error handler with the give 'String' message for each byte --- in given 'ByteString' and lead data in the given 'Utf8State' --- value. The bytes are the positions from 'errStart' (inclusive) to --- 'errEnd' (exclusive). Any substitute characters are pushed onto the --- supplied 'TextDataStack' argument. +-- | Call the error handler on each byte of the partial code point stored in +-- 'Utf8State' and append the results. -- --- Exported for lazy 'Data.Text.Lazy.Encoding.decodeUtf8With'. +-- Exported for use in lazy 'Data.Text.Lazy.Encoding.decodeUtf8With'. +-- +-- @since 2.0.2 {-# INLINE skipIncomplete #-} skipIncomplete :: OnDecodeError -> String -> Utf8State -> StrictBuilder skipIncomplete onErr msg s = @@ -464,13 +463,13 @@ handleUtf8Error onErr msg w = case onErr msg (Just w) of Just c -> SB.fromChar c Nothing -> mempty --- | Helper for 'Data.Text.Encoding.decodeUtfWith'. +-- | Helper for 'Data.Text.Encoding.decodeUtf8With'. -- --- This could be shorter by calling decodeUtf8With2 directly, but we make the +-- @since 2.0.2 + +-- This could be shorter by calling 'decodeUtf8With2' directly, but we make the -- first call validateUtf8Chunk directly to return even faster in successful -- cases. --- --- @since 2.0.2 decodeUtf8With1 :: #if defined(ASSERTS) HasCallStack => From 28a7460555485ba806f440dd4c59b86b5e181204 Mon Sep 17 00:00:00 2001 From: Li-yao Xia Date: Mon, 6 Feb 2023 02:24:01 +0000 Subject: [PATCH 79/87] Fix short-circuit --- src/Data/Text/Internal/Encoding.hs | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/src/Data/Text/Internal/Encoding.hs b/src/Data/Text/Internal/Encoding.hs index 6bec2215..1a90e626 100644 --- a/src/Data/Text/Internal/Encoding.hs +++ b/src/Data/Text/Internal/Encoding.hs @@ -231,20 +231,16 @@ foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8 validateUtf8Chunk :: ByteString -> (Int, Maybe Utf8State) validateUtf8Chunk bs = validateUtf8ChunkFrom 0 bs (,) --- Add an offset to the index returned by 'validateUtf8More'. +-- Assume bytes up to offset @ofs@ have been validated already. -- --- @ --- validateUtf8ChunkFrom n bs (,) = (first (+ n) . 'validateUtf8More' . 'B.drop' n) bs --- @ --- --- CPS: inlining the continuation lets us avoid allocating a @Maybe@ in the --- @decode...@ functions. +-- Using CPS lets us inline the continuation and avoid allocating a @Maybe@ +-- in the @decode...@ functions. {-# INLINE validateUtf8ChunkFrom #-} validateUtf8ChunkFrom :: forall r. Int -> ByteString -> (Int -> Maybe Utf8State -> r) -> r validateUtf8ChunkFrom ofs bs k #if defined(SIMDUTF) || MIN_VERSION_bytestring(0,11,2) | guessUtf8Boundary > 0 && - -- the rest of the bytestring valid utf-8 up to the boundary + -- the rest of the bytestring is valid utf-8 up to the boundary ( #ifdef SIMDUTF withBS (B.drop ofs bs) $ \ fp _ -> unsafeDupablePerformIO $ @@ -253,14 +249,15 @@ validateUtf8ChunkFrom ofs bs k #else B.isValidUtf8 $ B.take guessUtf8Boundary (B.drop ofs bs) #endif - ) = slowValidateUtf8ChunkFrom (ofs + guessUtf8Boundary) + ) = if guessUtf8Boundary == len then k (B.length bs) (Just startUtf8State) + else slowValidateUtf8ChunkFrom (ofs + guessUtf8Boundary) -- No - | otherwise = if guessUtf8Boundary == len then k len (Just startUtf8State) else slowValidateUtf8ChunkFrom ofs + | otherwise = slowValidateUtf8ChunkFrom ofs where len = B.length bs - ofs isBoundary n p = len >= n && p (B.index bs (ofs + len - n)) guessUtf8Boundary - | isBoundary 1 (<= 0x80) = len -- last char is ASCII + | isBoundary 1 (<= 0x80) = len -- last char is ASCII (common short-circuit) | isBoundary 1 (0xc2 <=) = len - 1 -- last char starts a two-(or more-)byte code point | isBoundary 2 (0xe0 <=) = len - 2 -- pre-last char starts a three-or-four-byte code point | isBoundary 3 (0xf0 <=) = len - 3 -- third to last char starts a four-byte code point From 9e8e0f959055166b07ae798604fa7e4623698315 Mon Sep 17 00:00:00 2001 From: Li-yao Xia Date: Mon, 6 Feb 2023 02:31:36 +0000 Subject: [PATCH 80/87] Doc --- src/Data/Text/Internal/Encoding.hs | 26 ++++++++++++++----------- src/Data/Text/Internal/StrictBuilder.hs | 20 ++++++++++++++----- 2 files changed, 30 insertions(+), 16 deletions(-) diff --git a/src/Data/Text/Internal/Encoding.hs b/src/Data/Text/Internal/Encoding.hs index 1a90e626..bcb98004 100644 --- a/src/Data/Text/Internal/Encoding.hs +++ b/src/Data/Text/Internal/Encoding.hs @@ -69,15 +69,23 @@ import GHC.Stack (HasCallStack) import Foreign.C.Types (CInt(..)) #endif +-- | Use 'StrictBuilder' to build 'Text'. +-- +-- @since 2.0.2 strictBuilderToText :: StrictBuilder -> Text strictBuilderToText = SB.toText +-- | Copy 'Text' in a 'StrictBuilder' +-- +-- @since 2.0.2 textToStrictBuilder :: Text -> StrictBuilder textToStrictBuilder = SB.fromText -- | State of decoding a 'ByteString' in UTF-8. -- Enables incremental decoding ('validateUtf8Chunk', 'validateUtf8More', -- 'decodeUtf8Chunk', 'decodeUtf8More'). +-- +-- @since 2.0.2 -- Internal invariant: -- the first component is the initial state if and only if @@ -88,8 +96,6 @@ textToStrictBuilder = SB.fromText -- <=> -- 'partialUtf8CodePoint' s = 'PartialUtf8CodePoint' 0 -- @ --- --- @since 2.0.2 data Utf8State = Utf8State { -- | State of the UTF-8 state machine. utf8CodePointState :: {-# UNPACK #-} !DecoderState @@ -191,7 +197,7 @@ partUtf8ToByteString :: PartialUtf8CodePoint -> B.ByteString partUtf8ToByteString c = BI.unsafeCreate (partUtf8Len c) $ \ptr -> partUtf8Foldr (\w k i -> pokeElemOff ptr i w >> k (i+1)) (\_ -> pure ()) c 0 --- Exported for testing. +-- | Exported for testing. getCompleteLen :: Utf8State -> Int getCompleteLen = partUtf8CompleteLen . partialUtf8CodePoint @@ -212,6 +218,8 @@ foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8 -- 'validateUtf8Chunk' = 'validateUtf8More' 'startUtf8State' -- @ -- +-- @since 2.0.2 +-- -- === Properties -- -- Given: @@ -226,8 +234,6 @@ foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8 -- @ -- 'validateUtf8Chunk' ('Data.ByteString.take' n chunk) = (n, Just 'startUtf8State') -- @ --- --- @since 2.0.2 validateUtf8Chunk :: ByteString -> (Int, Maybe Utf8State) validateUtf8Chunk bs = validateUtf8ChunkFrom 0 bs (,) @@ -297,6 +303,8 @@ validateUtf8ChunkFrom ofs bs k -- - if @ms = Just s'@, you can carry on validating another chunk -- by calling 'validateUtf8More' with the new state @s'@. -- +-- @since 2.0.2 +-- -- === Properties -- -- Given: @@ -319,8 +327,6 @@ validateUtf8ChunkFrom ofs bs k -- ms = Just s' -- ==> 'validateUtf8More' s (chunk '<>' more) = 'Data.Bifunctor.first' ('Data.ByteString.length' chunk '+') ('validateUtf8More' s' more) -- @ --- --- @since 2.0.2 validateUtf8More :: Utf8State -> ByteString -> (Int, Maybe Utf8State) validateUtf8More st bs = validateUtf8MoreCont st bs (,) @@ -405,8 +411,6 @@ utf8StateToStrictBuilder = partUtf8ToStrictBuilder . partialUtf8CodePoint -- s2b (pre1 '<>' pre2) = s2b pre3 -- ms2 = ms3 -- @ --- --- @since 2.0.2 decodeUtf8More :: Utf8State -> ByteString -> (StrictBuilder, ByteString, Maybe Utf8State) decodeUtf8More s bs = validateUtf8MoreCont s bs $ \len ms -> @@ -419,6 +423,8 @@ decodeUtf8More s bs = -- -- See 'decodeUtf8More' for details on the result. -- +-- @since 2.0.2 +-- -- === Properties -- -- @ @@ -436,8 +442,6 @@ decodeUtf8More s bs = -- @ -- 'Data.Text.Encoding.encodeUtf8' ('Data.Text.Encoding.strictBuilderToText' builder) '<>' rest = chunk -- @ --- --- @since 2.0.2 decodeUtf8Chunk :: ByteString -> (StrictBuilder, ByteString, Maybe Utf8State) decodeUtf8Chunk = decodeUtf8More startUtf8State diff --git a/src/Data/Text/Internal/StrictBuilder.hs b/src/Data/Text/Internal/StrictBuilder.hs index a8cdcad0..d1c44248 100644 --- a/src/Data/Text/Internal/StrictBuilder.hs +++ b/src/Data/Text/Internal/StrictBuilder.hs @@ -19,6 +19,7 @@ module Data.Text.Internal.StrictBuilder , fromText -- * Unsafe + -- $unsafe , unsafeFromByteString , unsafeFromWord8 ) where @@ -39,10 +40,6 @@ import qualified Data.Text.Internal.Unsafe.Char as Char -- | A delayed representation of strict 'Text'. -- -- @since 2.0.2 - --- For internal purposes, this is instead abused as a delayed 'Array': --- it may not actually be valid 'Text' (e.g., 'unsafeFromWord8', --- 'unsafeFromByteString'). data StrictBuilder = StrictBuilder { sbLength :: {-# UNPACK #-} !Int , sbWrite :: forall s. A.MArray s -> Int -> ST s () @@ -85,16 +82,29 @@ copyFromByteString dst ofs src = withBS src $ \ srcFPtr len -> unsafeIOToST $ unsafeWithForeignPtr srcFPtr $ \ srcPtr -> do unsafeSTToIO $ A.copyFromPointer dst ofs srcPtr len --- | Copy a 'ByteString'. Note: This may not be valid text. +-- | Copy a 'ByteString'. +-- +-- Unsafe: This may not be valid UTF-8 text. +-- +-- @since 2.0.2 unsafeFromByteString :: ByteString -> StrictBuilder unsafeFromByteString bs = StrictBuilder (B.length bs) (\dst ofs -> copyFromByteString dst ofs bs) +-- | +-- @since 2.0.2 {-# INLINE fromChar #-} fromChar :: Char -> StrictBuilder fromChar c = StrictBuilder (utf8Length c) (\dst ofs -> void (Char.unsafeWrite dst ofs (safe c))) +-- $unsafe +-- For internal purposes, we abuse 'StrictBuilder' as a delayed 'Array' rather +-- than 'Text': it may not actually be valid 'Text'. + +-- | Unsafe: This may not be valid UTF-8 text. +-- +-- @since 2.0.2 unsafeFromWord8 :: Word8 -> StrictBuilder unsafeFromWord8 !w = StrictBuilder 1 (\dst ofs -> A.unsafeWrite dst ofs w) From 1f5a873aae8b6a5a3b70ae69cd3aa66bfc3710bb Mon Sep 17 00:00:00 2001 From: Li-yao Xia Date: Mon, 6 Feb 2023 02:59:13 +0000 Subject: [PATCH 81/87] Sort imports --- src/Data/Text/Encoding.hs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index 498ff889..c0f99f7b 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -88,30 +88,30 @@ import Control.Exception (evaluate, try) import Control.Monad.ST (runST) import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) import Data.Bits (shiftR, (.&.)) -import Data.ByteString (ByteString) -import qualified Data.ByteString.Short.Internal as SBS -import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode) -import Data.Text.Internal (Text(..), empty) -import Data.Text.Internal.Unsafe (unsafeWithForeignPtr) -import Data.Text.Unsafe (unsafeDupablePerformIO) import Data.Word (Word8) import Foreign.C.Types (CSize(..)) import Foreign.Ptr (Ptr, minusPtr, plusPtr) import Foreign.Storable (poke, peekByteOff) import GHC.Exts (byteArrayContents#, unsafeCoerce#) import GHC.ForeignPtr (ForeignPtr(..), ForeignPtrContents(PlainPtr)) +import Data.ByteString (ByteString) +import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode) +import Data.Text.Internal (Text(..), empty) import Data.Text.Internal.ByteStringCompat (withBS) +import Data.Text.Internal.Encoding +import Data.Text.Internal.Unsafe (unsafeWithForeignPtr) +import Data.Text.Unsafe (unsafeDupablePerformIO) +import Data.Text.Show () import qualified Data.ByteString as B import qualified Data.ByteString.Internal as B import qualified Data.ByteString.Builder as B import qualified Data.ByteString.Builder.Internal as B hiding (empty, append) import qualified Data.ByteString.Builder.Prim as BP import qualified Data.ByteString.Builder.Prim.Internal as BP +import qualified Data.ByteString.Short.Internal as SBS import qualified Data.Text.Array as A -import Data.Text.Internal.Encoding import qualified Data.Text.Internal.Encoding.Fusion as E import qualified Data.Text.Internal.Fusion as F -import Data.Text.Show () #if defined(ASSERTS) import GHC.Stack (HasCallStack) #endif From 72f91ee96e06fe9632b03874fe3a9749292f3191 Mon Sep 17 00:00:00 2001 From: Li-yao Xia Date: Mon, 6 Feb 2023 03:09:06 +0000 Subject: [PATCH 82/87] Undo useless optimization --- src/Data/Text/Internal/Encoding.hs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/Data/Text/Internal/Encoding.hs b/src/Data/Text/Internal/Encoding.hs index bcb98004..31d81103 100644 --- a/src/Data/Text/Internal/Encoding.hs +++ b/src/Data/Text/Internal/Encoding.hs @@ -255,8 +255,7 @@ validateUtf8ChunkFrom ofs bs k #else B.isValidUtf8 $ B.take guessUtf8Boundary (B.drop ofs bs) #endif - ) = if guessUtf8Boundary == len then k (B.length bs) (Just startUtf8State) - else slowValidateUtf8ChunkFrom (ofs + guessUtf8Boundary) + ) = slowValidateUtf8ChunkFrom (ofs + guessUtf8Boundary) -- No | otherwise = slowValidateUtf8ChunkFrom ofs where From d347bba6a0cc34393c0b503689558c1088c1598d Mon Sep 17 00:00:00 2001 From: Li-yao Xia Date: Mon, 6 Feb 2023 03:10:12 +0000 Subject: [PATCH 83/87] import Semigroup for old base --- src/Data/Text/Internal/StrictBuilder.hs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/Data/Text/Internal/StrictBuilder.hs b/src/Data/Text/Internal/StrictBuilder.hs index d1c44248..84a57264 100644 --- a/src/Data/Text/Internal/StrictBuilder.hs +++ b/src/Data/Text/Internal/StrictBuilder.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE CPP #-} {-# LANGUAGE BangPatterns #-} {-# LANGUAGE RankNTypes #-} @@ -29,6 +30,9 @@ import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) import Data.Functor (void) import Data.Word (Word8) import Data.ByteString (ByteString) +#if !MIN_VERSION_base(4,11,0) +import Data.Semigroup (Semigroup(..)) +#endif import Data.Text.Internal (Text(..), empty, safe) import Data.Text.Internal.ByteStringCompat (withBS) import Data.Text.Internal.Encoding.Utf8 (utf8Length) From 9a0c8a80f17efc3ebf73b922f562942e49e1fcad Mon Sep 17 00:00:00 2001 From: Li-yao Xia Date: Mon, 6 Feb 2023 07:13:53 +0000 Subject: [PATCH 84/87] Clean up imports --- src/Data/Text/Encoding.hs | 5 ----- src/Data/Text/Internal/Encoding.hs | 10 +++++----- tests/Tests/Properties/Transcoding.hs | 1 - 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index c0f99f7b..500461a1 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -116,11 +116,6 @@ import qualified Data.Text.Internal.Fusion as F import GHC.Stack (HasCallStack) #endif -#ifdef SIMDUTF -#elif !MIN_VERSION_bytestring(0,11,2) -import qualified Data.ByteString.Unsafe as B -#endif - -- $validation -- These functions are for validating 'ByteString's as encoded text. diff --git a/src/Data/Text/Internal/Encoding.hs b/src/Data/Text/Internal/Encoding.hs index 31d81103..26a24afb 100644 --- a/src/Data/Text/Internal/Encoding.hs +++ b/src/Data/Text/Internal/Encoding.hs @@ -45,17 +45,12 @@ import Data.ByteString (ByteString) import Data.Semigroup (Semigroup(..)) #endif import Data.Word (Word32, Word8) -import Foreign.C.Types (CSize(..)) -import Foreign.Ptr (Ptr) import Foreign.Storable (pokeElemOff) import Data.Text.Encoding.Error (OnDecodeError) import Data.Text.Internal (Text(..)) -import Data.Text.Internal.ByteStringCompat (withBS) import Data.Text.Internal.Encoding.Utf8 (DecoderState, utf8AcceptState, utf8RejectState, updateDecoderState) import Data.Text.Internal.StrictBuilder (StrictBuilder) -import Data.Text.Internal.Unsafe (unsafeWithForeignPtr) -import Data.Text.Unsafe (unsafeDupablePerformIO) import qualified Data.ByteString as B import qualified Data.ByteString.Internal as BI import qualified Data.ByteString.Short.Internal as SBS @@ -66,7 +61,12 @@ import GHC.Stack (HasCallStack) #endif #ifdef SIMDUTF +import Data.Text.Internal.ByteStringCompat (withBS) +import Data.Text.Internal.Unsafe (unsafeWithForeignPtr) +import Data.Text.Unsafe (unsafeDupablePerformIO) +import Foreign.C.Types (CSize(..)) import Foreign.C.Types (CInt(..)) +import Foreign.Ptr (Ptr) #endif -- | Use 'StrictBuilder' to build 'Text'. diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs index 49ce4f2e..7f907d28 100644 --- a/tests/Tests/Properties/Transcoding.hs +++ b/tests/Tests/Properties/Transcoding.hs @@ -2,7 +2,6 @@ {-# LANGUAGE CPP, OverloadedStrings, ScopedTypeVariables #-} {-# OPTIONS_GHC -fno-warn-missing-signatures #-} -{-# LANGUAGE BangPatterns #-} module Tests.Properties.Transcoding ( testTranscoding ) where From 64fd0291c44e8655c7d662478e801447cc4fab5b Mon Sep 17 00:00:00 2001 From: Li-yao Xia Date: Mon, 6 Feb 2023 07:17:59 +0000 Subject: [PATCH 85/87] Minimize test diff --- src/Data/Text/Encoding.hs | 1 - tests/Tests/Properties/Transcoding.hs | 14 +++++++------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index 500461a1..6f355ac6 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -3,7 +3,6 @@ {-# LANGUAGE Trustworthy #-} {-# LANGUAGE TypeApplications #-} {-# LANGUAGE ScopedTypeVariables #-} -{-# LANGUAGE ViewPatterns #-} -- | -- Module : Data.Text.Encoding -- Copyright : (c) 2009, 2010, 2011 Bryan O'Sullivan, diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs index 7f907d28..9c1f108e 100644 --- a/tests/Tests/Properties/Transcoding.hs +++ b/tests/Tests/Properties/Transcoding.hs @@ -490,6 +490,8 @@ testTranscoding = testProperty "tl_latin1" tl_latin1, testProperty "t_utf8" t_utf8, testProperty "t_utf8'" t_utf8', + testProperty "t_utf8_undecoded" t_utf8_undecoded, + testProperty "t_utf8_incr" t_utf8_incr, testProperty "tl_utf8" tl_utf8, testProperty "tl_utf8'" tl_utf8', testProperty "t_utf16LE" t_utf16LE, @@ -516,6 +518,11 @@ testTranscoding = testProperty "t_decode_with_error2" t_decode_with_error2, testProperty "t_decode_with_error3" t_decode_with_error3, testProperty "t_decode_with_error4" t_decode_with_error4, + testCase "t_decode_with_error1'" t_decode_with_error1', + testCase "t_decode_with_error2'" t_decode_with_error2', + testCase "t_decode_with_error3'" t_decode_with_error3', + testCase "t_decode_with_error4'" t_decode_with_error4', + testCase "t_decode_with_error5'" t_decode_with_error5', testProperty "t_infix_concat" t_infix_concat ], testGroup "validate" [ @@ -524,8 +531,6 @@ testTranscoding = testProperty "t_validateUtf8More_valid" t_validateUtf8More_valid ], testGroup "streaming" [ - testProperty "t_utf8_undecoded" t_utf8_undecoded, - testProperty "t_utf8_incr" t_utf8_incr, testProperty "t_utf8_c" t_utf8_c, testCase "t_p_utf8_1" t_p_utf8_1, testCase "t_p_utf8_2" t_p_utf8_2, @@ -542,11 +547,6 @@ testTranscoding = testCase "t_pn_utf8_3" t_pn_utf8_3, testCase "t_decode_chunk1" t_decode_chunk1, testCase "t_decode_chunk2" t_decode_chunk2, - testCase "t_decode_with_error1'" t_decode_with_error1', - testCase "t_decode_with_error2'" t_decode_with_error2', - testCase "t_decode_with_error3'" t_decode_with_error3', - testCase "t_decode_with_error4'" t_decode_with_error4', - testCase "t_decode_with_error5'" t_decode_with_error5', testProperty "t_decodeUtf8Chunk_split" t_decodeUtf8Chunk_split, testProperty "t_decodeUtf8More1" t_decodeUtf8More1, testProperty "t_decodeUtf8More2" t_decodeUtf8More2 From e136cadb12fa599e68b5d8dbdfc9cd9a43f3cf38 Mon Sep 17 00:00:00 2001 From: Li-yao Xia Date: Mon, 6 Feb 2023 07:19:14 +0000 Subject: [PATCH 86/87] test: sort imports --- tests/Tests/Properties/Transcoding.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs index 9c1f108e..bf8d1ee8 100644 --- a/tests/Tests/Properties/Transcoding.hs +++ b/tests/Tests/Properties/Transcoding.hs @@ -14,6 +14,7 @@ import Data.Maybe (isNothing) #if !MIN_VERSION_base(4,11,0) import Data.Semigroup ((<>)) #endif +import Data.Word (Word8) import Test.QuickCheck hiding ((.&.)) import Test.Tasty (TestTree, testGroup) import Test.Tasty.QuickCheck (testProperty) @@ -34,7 +35,6 @@ import qualified Data.Text.Encoding.Error as E import qualified Data.Text.Internal.Encoding as E import qualified Data.Text.Lazy as TL import qualified Data.Text.Lazy.Encoding as EL -import Data.Word (Word8) t_ascii t = E.decodeASCII (E.encodeUtf8 a) === a where a = T.map (\c -> chr (ord c `mod` 128)) t From 74db3ebede83c903dcf274210d8ae31e59b010bf Mon Sep 17 00:00:00 2001 From: Li-yao Xia Date: Mon, 6 Feb 2023 23:51:45 +0000 Subject: [PATCH 87/87] Apply suggestions --- changelog.md | 2 +- src/Data/Text/Encoding.hs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/changelog.md b/changelog.md index 2b712fc6..19deee72 100644 --- a/changelog.md +++ b/changelog.md @@ -25,7 +25,7 @@ * Add internal module `Data.Text.Internal.Encoding` -* Add `Data.Text.Internal.Encoding.Utf8.updateDecoderState` +* Add `Data.Text.Internal.Encoding.Utf8.updateDecoderState` and export `utf8{Accept,Reject}State` from the same module. ### 2.0.1 diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index 6f355ac6..65f32b91 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -50,7 +50,7 @@ module Data.Text.Encoding , decodeUtf8More , Utf8State , startUtf8State - , StrictBuilder() + , StrictBuilder , strictBuilderToText , textToStrictBuilder