diff --git a/changelog.md b/changelog.md index b0fda4bd..19deee72 100644 --- a/changelog.md +++ b/changelog.md @@ -2,6 +2,31 @@ * Remove support for GHC 8.0. +### 2.0.2 + +* Add decoding functions in `Data.Text.Encoding` that allow + more control for error handling and for how to allocate text. + (https://github.com/haskell/text/pull/448 Thanks to David Sledge) + * `decodeASCIIPrefix` + * `decodeUtf8Chunk` + * `decodeUtf8More` + * `Utf8ValidState` + * `startUtf8ValidState` + * `StrictBuilder` + * `strictBuilderToText` + * `textToStrictBuilder` + * `validateUtf8Chunk` + * `validateUtf8More` + +* Fix quadratic slowdown when decoding invalid UTF-8 bytestrings + (https://github.com/haskell/text/issues/495) + +* Add internal module `Data.Text.Internal.StrictBuilder` + +* Add internal module `Data.Text.Internal.Encoding` + +* Add `Data.Text.Internal.Encoding.Utf8.updateDecoderState` and export `utf8{Accept,Reject}State` from the same module. + ### 2.0.1 * Improve portability of C and C++ code. diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index 54f16722..65f32b91 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -3,7 +3,6 @@ {-# LANGUAGE Trustworthy #-} {-# LANGUAGE TypeApplications #-} {-# LANGUAGE ScopedTypeVariables #-} -{-# LANGUAGE ViewPatterns #-} -- | -- Module : Data.Text.Encoding -- Copyright : (c) 2009, 2010, 2011 Bryan O'Sullivan, @@ -29,9 +28,8 @@ module Data.Text.Encoding -- ** Total Functions #total# -- $total decodeLatin1 + , decodeASCIIPrefix , decodeUtf8Lenient - - -- *** Catchable failure , decodeUtf8' -- *** Controllable error handling @@ -46,6 +44,16 @@ module Data.Text.Encoding , streamDecodeUtf8With , Decoding(..) + -- *** Incremental UTF-8 decoding + -- $incremental + , decodeUtf8Chunk + , decodeUtf8More + , Utf8State + , startUtf8State + , StrictBuilder + , strictBuilderToText + , textToStrictBuilder + -- ** Partial Functions -- $partial , decodeASCII @@ -68,48 +76,47 @@ module Data.Text.Encoding -- * Encoding Text using ByteString Builders , encodeUtf8Builder , encodeUtf8BuilderEscaped - ) where -import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) + -- * ByteString validation + -- $validation + , validateUtf8Chunk + , validateUtf8More + ) where import Control.Exception (evaluate, try) -import Control.Monad.ST (runST, ST) +import Control.Monad.ST (runST) +import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) import Data.Bits (shiftR, (.&.)) -import Data.ByteString (ByteString) -import qualified Data.ByteString as B -import qualified Data.ByteString.Internal as B -import qualified Data.ByteString.Short.Internal as SBS -import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode) -import Data.Text.Internal (Text(..), safe, empty, append) -import Data.Text.Internal.Unsafe (unsafeWithForeignPtr) -import Data.Text.Internal.Unsafe.Char (unsafeWrite) -import Data.Text.Show as T (singleton) -import Data.Text.Unsafe (unsafeDupablePerformIO) import Data.Word (Word8) import Foreign.C.Types (CSize(..)) import Foreign.Ptr (Ptr, minusPtr, plusPtr) import Foreign.Storable (poke, peekByteOff) import GHC.Exts (byteArrayContents#, unsafeCoerce#) import GHC.ForeignPtr (ForeignPtr(..), ForeignPtrContents(PlainPtr)) +import Data.ByteString (ByteString) +import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode) +import Data.Text.Internal (Text(..), empty) +import Data.Text.Internal.ByteStringCompat (withBS) +import Data.Text.Internal.Encoding +import Data.Text.Internal.Unsafe (unsafeWithForeignPtr) +import Data.Text.Unsafe (unsafeDupablePerformIO) +import Data.Text.Show () +import qualified Data.ByteString as B +import qualified Data.ByteString.Internal as B import qualified Data.ByteString.Builder as B import qualified Data.ByteString.Builder.Internal as B hiding (empty, append) import qualified Data.ByteString.Builder.Prim as BP import qualified Data.ByteString.Builder.Prim.Internal as BP -import Data.Text.Internal.Encoding.Utf8 (utf8DecodeStart, utf8DecodeContinue, DecoderResult(..)) +import qualified Data.ByteString.Short.Internal as SBS import qualified Data.Text.Array as A import qualified Data.Text.Internal.Encoding.Fusion as E import qualified Data.Text.Internal.Fusion as F -import Data.Text.Internal.ByteStringCompat #if defined(ASSERTS) import GHC.Stack (HasCallStack) #endif -#ifdef SIMDUTF -import Foreign.C.Types (CInt(..)) -#elif !MIN_VERSION_bytestring(0,11,2) -import qualified Data.ByteString.Unsafe as B -import Data.Text.Internal.Encoding.Utf8 (CodePoint(..)) -#endif +-- $validation +-- These functions are for validating 'ByteString's as encoded text. -- $strict -- @@ -135,20 +142,53 @@ import Data.Text.Internal.Encoding.Utf8 (CodePoint(..)) -- (preferably not at all). See "Data.Text.Encoding#g:total" for better -- solutions. --- | Decode a 'ByteString' containing 7-bit ASCII --- encoded text. +-- | Decode a 'ByteString' containing ASCII text. +-- +-- This is a total function which returns a pair of the longest ASCII prefix +-- as 'Text', and the remaining suffix as 'ByteString'. +-- +-- Important note: the pair is lazy. This lets you check for errors by testing +-- whether the second component is empty, without forcing the first component +-- (which does a copy). +-- To drop references to the input bytestring, force the prefix +-- (using 'seq' or @BangPatterns@) and drop references to the suffix. +-- +-- === Properties +-- +-- - If @(prefix, suffix) = decodeAsciiPrefix s@, then @'encodeUtf8' prefix <> suffix = s@. +-- - Either @suffix@ is empty, or @'B.head' suffix > 127@. +-- +-- @since 2.0.2 +decodeASCIIPrefix :: ByteString -> (Text, ByteString) +decodeASCIIPrefix bs = if B.null bs + then (empty, B.empty) + else + let len = asciiPrefixLength bs + prefix = + let !(SBS.SBS arr) = SBS.toShort (B.take len bs) in + Text (A.ByteArray arr) 0 len + suffix = B.drop len bs in + (prefix, suffix) +{-# INLINE decodeASCIIPrefix #-} + +-- | Length of the longest ASCII prefix. +asciiPrefixLength :: ByteString -> Int +asciiPrefixLength bs = unsafeDupablePerformIO $ withBS bs $ \ fp len -> + unsafeWithForeignPtr fp $ \src -> do + fromIntegral <$> c_is_ascii src (src `plusPtr` len) + +-- | Decode a 'ByteString' containing 7-bit ASCII encoded text. -- -- This is a partial function: it checks that input does not contain -- anything except ASCII and copies buffer or throws an error otherwise. --- decodeASCII :: ByteString -> Text -decodeASCII bs = withBS bs $ \fp len -> if len == 0 then empty else runST $ do - asciiPrefixLen <- fmap cSizeToInt $ unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> - c_is_ascii src (src `plusPtr` len) - if asciiPrefixLen == len - then let !(SBS.SBS arr) = SBS.toShort bs in - return (Text (A.ByteArray arr) 0 len) - else error $ "decodeASCII: detected non-ASCII codepoint at " ++ show asciiPrefixLen +decodeASCII bs = + let (prefix, suffix) = decodeASCIIPrefix bs in + case B.uncons suffix of + Nothing -> prefix + Just (word, _) -> + let !errPos = B.length bs - B.length suffix in + error $ "decodeASCII: detected non-ASCII codepoint " ++ show word ++ " at position " ++ show errPos -- | Decode a 'ByteString' containing Latin-1 (aka ISO-8859-1) encoded text. -- @@ -166,7 +206,7 @@ decodeLatin1 :: decodeLatin1 bs = withBS bs $ \fp len -> runST $ do dst <- A.new (2 * len) let inner srcOff dstOff = if srcOff >= len then return dstOff else do - asciiPrefixLen <- fmap cSizeToInt $ unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> + asciiPrefixLen <- fmap fromIntegral $ unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> c_is_ascii (src `plusPtr` srcOff) (src `plusPtr` len) if asciiPrefixLen == 0 then do @@ -178,7 +218,6 @@ decodeLatin1 bs = withBS bs $ \fp len -> runST $ do unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> unsafeSTToIO $ A.copyFromPointer dst dstOff (src `plusPtr` srcOff) asciiPrefixLen inner (srcOff + asciiPrefixLen) (dstOff + asciiPrefixLen) - actualLen <- inner 0 0 dst' <- A.resizeM dst actualLen arr <- A.unsafeFreeze dst' @@ -187,135 +226,6 @@ decodeLatin1 bs = withBS bs $ \fp len -> runST $ do foreign import ccall unsafe "_hs_text_is_ascii" c_is_ascii :: Ptr Word8 -> Ptr Word8 -> IO CSize -isValidBS :: ByteString -> Bool -#ifdef SIMDUTF -isValidBS bs = withBS bs $ \fp len -> unsafeDupablePerformIO $ - unsafeWithForeignPtr fp $ \ptr -> (/= 0) <$> c_is_valid_utf8 ptr (fromIntegral len) -#else -#if MIN_VERSION_bytestring(0,11,2) -isValidBS = B.isValidUtf8 -#else -isValidBS bs = start 0 - where - start ix - | ix >= B.length bs = True - | otherwise = case utf8DecodeStart (B.unsafeIndex bs ix) of - Accept{} -> start (ix + 1) - Reject{} -> False - Incomplete st _ -> step (ix + 1) st - step ix st - | ix >= B.length bs = False - -- We do not use decoded code point, so passing a dummy value to save an argument. - | otherwise = case utf8DecodeContinue (B.unsafeIndex bs ix) st (CodePoint 0) of - Accept{} -> start (ix + 1) - Reject{} -> False - Incomplete st' _ -> step (ix + 1) st' -#endif -#endif - --- | Decode a 'ByteString' containing UTF-8 encoded text. --- --- Surrogate code points in replacement character returned by 'OnDecodeError' --- will be automatically remapped to the replacement char @U+FFFD@. -decodeUtf8With :: -#if defined(ASSERTS) - HasCallStack => -#endif - OnDecodeError -> ByteString -> Text -decodeUtf8With onErr bs - | isValidBS bs = - let !(SBS.SBS arr) = SBS.toShort bs in - (Text (A.ByteArray arr) 0 (B.length bs)) - | B.null undecoded = txt - | otherwise = txt `append` (case onErr desc (Just (B.head undecoded)) of - Nothing -> txt' - Just c -> T.singleton c `append` txt') - where - (txt, undecoded) = decodeUtf8With2 onErr mempty bs - txt' = decodeUtf8With onErr (B.tail undecoded) - desc = "Data.Text.Internal.Encoding: Invalid UTF-8 stream" - --- | Decode two consecutive bytestrings, returning Text and undecoded remainder. -decodeUtf8With2 :: -#if defined(ASSERTS) - HasCallStack => -#endif - OnDecodeError -> ByteString -> ByteString -> (Text, ByteString) -decodeUtf8With2 onErr bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do - marr <- A.new len' - outer marr len' 0 0 - where - len = len1 + len2 - len' = len + 4 - - index i - | i < len1 = B.index bs1 i - | otherwise = B.index bs2 (i - len1) - - -- We need Data.ByteString.findIndexEnd, but it is unavailable before bytestring-0.10.12.0 - guessUtf8Boundary :: Int - guessUtf8Boundary - | len2 >= 1 && w0 < 0x80 = len2 -- last char is ASCII - | len2 >= 1 && w0 >= 0xC0 = len2 - 1 -- last char starts a code point - | len2 >= 2 && w1 >= 0xC0 = len2 - 2 -- pre-last char starts a code point - | len2 >= 3 && w2 >= 0xC0 = len2 - 3 - | len2 >= 4 && w3 >= 0xC0 = len2 - 4 - | otherwise = 0 - where - w0 = B.index bs2 (len2 - 1) - w1 = B.index bs2 (len2 - 2) - w2 = B.index bs2 (len2 - 3) - w3 = B.index bs2 (len2 - 4) - - decodeFrom :: Int -> DecoderResult - decodeFrom off = step (off + 1) (utf8DecodeStart (index off)) - where - step i (Incomplete a b) - | i < len = step (i + 1) (utf8DecodeContinue (index i) a b) - step _ st = st - - outer :: forall s. A.MArray s -> Int -> Int -> Int -> ST s (Text, ByteString) - outer dst dstLen = inner - where - inner srcOff dstOff - | srcOff >= len = do - A.shrinkM dst dstOff - arr <- A.unsafeFreeze dst - return (Text arr 0 dstOff, mempty) - - | srcOff >= len1 - , srcOff < len1 + guessUtf8Boundary - , dstOff + (len1 + guessUtf8Boundary - srcOff) <= dstLen - , bs <- B.drop (srcOff - len1) (B.take guessUtf8Boundary bs2) - , isValidBS bs = do - withBS bs $ \fp _ -> unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> - unsafeSTToIO $ A.copyFromPointer dst dstOff src (len1 + guessUtf8Boundary - srcOff) - inner (len1 + guessUtf8Boundary) (dstOff + (len1 + guessUtf8Boundary - srcOff)) - - | dstOff + 4 > dstLen = do - let dstLen' = dstLen + 4 - dst' <- A.resizeM dst dstLen' - outer dst' dstLen' srcOff dstOff - - | otherwise = case decodeFrom srcOff of - Accept c -> do - d <- unsafeWrite dst dstOff c - inner (srcOff + d) (dstOff + d) - Reject -> case onErr desc (Just (index srcOff)) of - Nothing -> inner (srcOff + 1) dstOff - Just c -> do - d <- unsafeWrite dst dstOff (safe c) - inner (srcOff + 1) (dstOff + d) - Incomplete{} -> do - A.shrinkM dst dstOff - arr <- A.unsafeFreeze dst - let bs = if srcOff >= len1 - then B.drop (srcOff - len1) bs2 - else B.drop srcOff (bs1 `B.append` bs2) - return (Text arr 0 dstOff, bs) - - desc = "Data.Text.Internal.Encoding: Invalid UTF-8 stream" - -- $stream -- -- The 'streamDecodeUtf8' and 'streamDecodeUtf8With' functions accept @@ -406,11 +316,25 @@ streamDecodeUtf8With :: HasCallStack => #endif OnDecodeError -> ByteString -> Decoding -streamDecodeUtf8With onErr = go mempty +streamDecodeUtf8With onErr = loop startUtf8State where - go bs1 bs2 = Some txt undecoded (go undecoded) - where - (txt, undecoded) = decodeUtf8With2 onErr bs1 bs2 + loop s chunk = + let (builder, undecoded, s') = decodeUtf8With2 onErr invalidUtf8Msg s chunk + in Some (strictBuilderToText builder) undecoded (loop s') + +-- | Decode a 'ByteString' containing UTF-8 encoded text. +-- +-- Surrogate code points in replacement character returned by 'OnDecodeError' +-- will be automatically remapped to the replacement char @U+FFFD@. +decodeUtf8With :: +#if defined(ASSERTS) + HasCallStack => +#endif + OnDecodeError -> ByteString -> Text +decodeUtf8With onErr = decodeUtf8With1 onErr invalidUtf8Msg + +invalidUtf8Msg :: String +invalidUtf8Msg = "Data.Text.Encoding: Invalid UTF-8 stream" -- | Decode a 'ByteString' containing UTF-8 encoded text that is known -- to be valid. @@ -613,10 +537,17 @@ encodeUtf32BE :: Text -> ByteString encodeUtf32BE txt = E.unstream (E.restreamUtf32BE (F.stream txt)) {-# INLINE encodeUtf32BE #-} -cSizeToInt :: CSize -> Int -cSizeToInt = fromIntegral - -#ifdef SIMDUTF -foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8 - :: Ptr Word8 -> CSize -> IO CInt -#endif +-- $incremental +-- The functions 'decodeUtf8Chunk' and 'decodeUtf8More' provide more +-- control for error-handling and streaming. +-- +-- - Those functions return an UTF-8 prefix of the given 'ByteString' up to the next error. +-- For example this lets you insert or delete arbitrary text, or do some +-- stateful operations before resuming, such as keeping track of error locations. +-- In contrast, the older stream-oriented interface only lets you substitute +-- a single fixed 'Char' for each invalid byte in 'OnDecodeError'. +-- - That prefix is encoded as a 'StrictBuilder', so you can accumulate chunks +-- before doing the copying work to construct a 'Text', or you can +-- output decoded fragments immediately as a lazy 'Data.Text.Lazy.Text'. +-- +-- For even lower-level primitives, see 'validateUtf8Chunk' and 'validateUtf8More'. diff --git a/src/Data/Text/Internal/Encoding.hs b/src/Data/Text/Internal/Encoding.hs new file mode 100644 index 00000000..26a24afb --- /dev/null +++ b/src/Data/Text/Internal/Encoding.hs @@ -0,0 +1,531 @@ +{-# LANGUAGE BangPatterns, CPP, GeneralizedNewtypeDeriving, MagicHash, + UnliftedFFITypes #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE Trustworthy #-} +{-# LANGUAGE RankNTypes #-} +{-# LANGUAGE ViewPatterns #-} + +-- | +-- Module : Data.Text.Internal.Builder +-- License : BSD-style (see LICENSE) +-- Stability : experimental +-- +-- /Warning/: this is an internal module, and does not have a stable +-- API or name. Functions in this module may not check or enforce +-- preconditions expected by public modules. Use at your own risk! +-- +-- Internals of "Data.Text.Encoding". +-- +-- @since 2.0.2 +module Data.Text.Internal.Encoding + ( validateUtf8Chunk + , validateUtf8More + , decodeUtf8Chunk + , decodeUtf8More + , decodeUtf8With1 + , decodeUtf8With2 + , Utf8State + , startUtf8State + , StrictBuilder() + , strictBuilderToText + , textToStrictBuilder + + -- * Internal + , skipIncomplete + , getCompleteLen + , getPartialUtf8 + ) where + +#if defined(ASSERTS) +import Control.Exception (assert) +#endif +import Data.Bits ((.&.), shiftL, shiftR) +import Data.ByteString (ByteString) +#if !MIN_VERSION_base(4,11,0) +import Data.Semigroup (Semigroup(..)) +#endif +import Data.Word (Word32, Word8) +import Foreign.Storable (pokeElemOff) +import Data.Text.Encoding.Error (OnDecodeError) +import Data.Text.Internal (Text(..)) +import Data.Text.Internal.Encoding.Utf8 + (DecoderState, utf8AcceptState, utf8RejectState, updateDecoderState) +import Data.Text.Internal.StrictBuilder (StrictBuilder) +import qualified Data.ByteString as B +import qualified Data.ByteString.Internal as BI +import qualified Data.ByteString.Short.Internal as SBS +import qualified Data.Text.Array as A +import qualified Data.Text.Internal.StrictBuilder as SB +#if defined(ASSERTS) +import GHC.Stack (HasCallStack) +#endif + +#ifdef SIMDUTF +import Data.Text.Internal.ByteStringCompat (withBS) +import Data.Text.Internal.Unsafe (unsafeWithForeignPtr) +import Data.Text.Unsafe (unsafeDupablePerformIO) +import Foreign.C.Types (CSize(..)) +import Foreign.C.Types (CInt(..)) +import Foreign.Ptr (Ptr) +#endif + +-- | Use 'StrictBuilder' to build 'Text'. +-- +-- @since 2.0.2 +strictBuilderToText :: StrictBuilder -> Text +strictBuilderToText = SB.toText + +-- | Copy 'Text' in a 'StrictBuilder' +-- +-- @since 2.0.2 +textToStrictBuilder :: Text -> StrictBuilder +textToStrictBuilder = SB.fromText + +-- | State of decoding a 'ByteString' in UTF-8. +-- Enables incremental decoding ('validateUtf8Chunk', 'validateUtf8More', +-- 'decodeUtf8Chunk', 'decodeUtf8More'). +-- +-- @since 2.0.2 + +-- Internal invariant: +-- the first component is the initial state if and only if +-- the second component is empty. +-- +-- @ +-- 'utf9CodePointState' s = 'utf8StartState' +-- <=> +-- 'partialUtf8CodePoint' s = 'PartialUtf8CodePoint' 0 +-- @ +data Utf8State = Utf8State + { -- | State of the UTF-8 state machine. + utf8CodePointState :: {-# UNPACK #-} !DecoderState + -- | Bytes of the currently incomplete code point (if any). + , partialUtf8CodePoint :: {-# UNPACK #-} !PartialUtf8CodePoint + } + deriving (Eq, Show) + +-- | Initial 'Utf8State'. +-- +-- @since 2.0.2 +startUtf8State :: Utf8State +startUtf8State = Utf8State utf8AcceptState partUtf8Empty + +-- | Prefix of a UTF-8 code point encoded in 4 bytes, +-- possibly empty. +-- +-- - The most significant byte contains the number of bytes, +-- between 0 and 3. +-- - The remaining bytes hold the incomplete code point. +-- - Unused bytes must be 0. +-- +-- All of operations available on it are the functions below. +-- The constructor should never be used outside of those. +-- +-- @since 2.0.2 +newtype PartialUtf8CodePoint = PartialUtf8CodePoint Word32 + deriving (Eq, Show) + +-- | Empty prefix. +partUtf8Empty :: PartialUtf8CodePoint +partUtf8Empty = PartialUtf8CodePoint 0 + +-- | Length of the partial code point, stored in the most significant byte. +partUtf8Len :: PartialUtf8CodePoint -> Int +partUtf8Len (PartialUtf8CodePoint w) = fromIntegral $ w `shiftR` 24 + +-- | Length of the code point once completed (it is known in the first byte). +-- 0 if empty. +partUtf8CompleteLen :: PartialUtf8CodePoint -> Int +partUtf8CompleteLen c@(PartialUtf8CodePoint w) + | partUtf8Len c == 0 = 0 + | 0xf0 <= firstByte = 4 + | 0xe0 <= firstByte = 3 + | 0xc2 <= firstByte = 2 + | otherwise = 0 + where + firstByte = (w `shiftR` 16) .&. 255 + +-- | Get the @n@-th byte, assuming it is within bounds: @0 <= n < partUtf8Len c@. +-- +-- Unsafe: no bounds checking. +partUtf8UnsafeIndex :: +#if defined(ASSERTS) + HasCallStack => +#endif + PartialUtf8CodePoint -> Int -> Word8 +partUtf8UnsafeIndex _c@(PartialUtf8CodePoint w) n = +#if defined(ASSERTS) + assert (0 <= n && n < partUtf8Len _c) $ +#endif + fromIntegral $ w `shiftR` (16 - 8 * n) + +-- | Append some bytes. +-- +-- Unsafe: no bounds checking. +partUtf8UnsafeAppend :: +#if defined(ASSERTS) + HasCallStack => +#endif + PartialUtf8CodePoint -> ByteString -> PartialUtf8CodePoint +partUtf8UnsafeAppend c@(PartialUtf8CodePoint word) bs = +#if defined(ASSERTS) + assert (lenc + lenbs <= 3) $ +#endif + PartialUtf8CodePoint $ + tryPush 0 $ tryPush 1 $ tryPush 2 $ word + (fromIntegral lenbs `shiftL` 24) + where + lenc = partUtf8Len c + lenbs = B.length bs + tryPush i w = + if i < lenbs + then w + (fromIntegral (B.index bs i) `shiftL` fromIntegral (16 - 8 * (lenc + i))) + else w + +-- | Fold a 'PartialUtf8CodePoint'. This avoids recursion so it can unfold to straightline code. +{-# INLINE partUtf8Foldr #-} +partUtf8Foldr :: (Word8 -> a -> a) -> a -> PartialUtf8CodePoint -> a +partUtf8Foldr f x0 c = case partUtf8Len c of + 0 -> x0 + 1 -> build 0 x0 + 2 -> build 0 (build 1 x0) + _ -> build 0 (build 1 (build 2 x0)) + where + build i x = f (partUtf8UnsafeIndex c i) x + +-- | Convert 'PartialUtf8CodePoint' to 'ByteString'. +partUtf8ToByteString :: PartialUtf8CodePoint -> B.ByteString +partUtf8ToByteString c = BI.unsafeCreate (partUtf8Len c) $ \ptr -> + partUtf8Foldr (\w k i -> pokeElemOff ptr i w >> k (i+1)) (\_ -> pure ()) c 0 + +-- | Exported for testing. +getCompleteLen :: Utf8State -> Int +getCompleteLen = partUtf8CompleteLen . partialUtf8CodePoint + +-- | Exported for testing. +getPartialUtf8 :: Utf8State -> B.ByteString +getPartialUtf8 = partUtf8ToByteString . partialUtf8CodePoint + +#ifdef SIMDUTF +foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8 + :: Ptr Word8 -> CSize -> IO CInt +#endif + +-- | Validate a 'ByteString' as UTF-8-encoded text. To be continued using 'validateUtf8More'. +-- +-- See also 'validateUtf8More' for details on the result of this function. +-- +-- @ +-- 'validateUtf8Chunk' = 'validateUtf8More' 'startUtf8State' +-- @ +-- +-- @since 2.0.2 +-- +-- === Properties +-- +-- Given: +-- +-- @ +-- 'validateUtf8Chunk' chunk = (n, ms) +-- @ +-- +-- - The prefix is valid UTF-8. In particular, it should be accepted +-- by this validation: +-- +-- @ +-- 'validateUtf8Chunk' ('Data.ByteString.take' n chunk) = (n, Just 'startUtf8State') +-- @ +validateUtf8Chunk :: ByteString -> (Int, Maybe Utf8State) +validateUtf8Chunk bs = validateUtf8ChunkFrom 0 bs (,) + +-- Assume bytes up to offset @ofs@ have been validated already. +-- +-- Using CPS lets us inline the continuation and avoid allocating a @Maybe@ +-- in the @decode...@ functions. +{-# INLINE validateUtf8ChunkFrom #-} +validateUtf8ChunkFrom :: forall r. Int -> ByteString -> (Int -> Maybe Utf8State -> r) -> r +validateUtf8ChunkFrom ofs bs k +#if defined(SIMDUTF) || MIN_VERSION_bytestring(0,11,2) + | guessUtf8Boundary > 0 && + -- the rest of the bytestring is valid utf-8 up to the boundary + ( +#ifdef SIMDUTF + withBS (B.drop ofs bs) $ \ fp _ -> unsafeDupablePerformIO $ + unsafeWithForeignPtr fp $ \ptr -> (/= 0) <$> + c_is_valid_utf8 ptr (fromIntegral guessUtf8Boundary) +#else + B.isValidUtf8 $ B.take guessUtf8Boundary (B.drop ofs bs) +#endif + ) = slowValidateUtf8ChunkFrom (ofs + guessUtf8Boundary) + -- No + | otherwise = slowValidateUtf8ChunkFrom ofs + where + len = B.length bs - ofs + isBoundary n p = len >= n && p (B.index bs (ofs + len - n)) + guessUtf8Boundary + | isBoundary 1 (<= 0x80) = len -- last char is ASCII (common short-circuit) + | isBoundary 1 (0xc2 <=) = len - 1 -- last char starts a two-(or more-)byte code point + | isBoundary 2 (0xe0 <=) = len - 2 -- pre-last char starts a three-or-four-byte code point + | isBoundary 3 (0xf0 <=) = len - 3 -- third to last char starts a four-byte code point + | otherwise = len +#else + = slowValidateUtf8ChunkFrom ofs + where +#endif + -- A pure Haskell implementation of validateUtf8More. + -- Ideally the primitives 'B.isValidUtf8' or 'c_is_valid_utf8' should give us + -- indices to let us avoid this function. + slowValidateUtf8ChunkFrom :: Int -> r + slowValidateUtf8ChunkFrom ofs1 = slowLoop ofs1 ofs1 utf8AcceptState + + slowLoop !utf8End i s + | i < B.length bs = + case updateDecoderState (B.index bs i) s of + s' | s' == utf8RejectState -> k utf8End Nothing + | s' == utf8AcceptState -> slowLoop (i + 1) (i + 1) s' + | otherwise -> slowLoop utf8End (i + 1) s' + | otherwise = k utf8End (Just (Utf8State s (partUtf8UnsafeAppend partUtf8Empty (B.drop utf8End bs)))) + +-- | Validate another 'ByteString' chunk in an ongoing stream of UTF-8-encoded text. +-- +-- Returns a pair: +-- +-- 1. The first component @n@ is the end position, relative to the current +-- chunk, of the longest prefix of the accumulated bytestring which is valid UTF-8. +-- @n@ may be negative: that happens when an incomplete code point started in +-- a previous chunk and is not completed by the current chunk (either +-- that code point is still incomplete, or it is broken by an invalid byte). +-- +-- 2. The second component @ms@ indicates the following: +-- +-- - if @ms = Nothing@, the remainder of the chunk contains an invalid byte, +-- within four bytes from position @n@; +-- - if @ms = Just s'@, you can carry on validating another chunk +-- by calling 'validateUtf8More' with the new state @s'@. +-- +-- @since 2.0.2 +-- +-- === Properties +-- +-- Given: +-- +-- @ +-- 'validateUtf8More' s chunk = (n, ms) +-- @ +-- +-- - If the chunk is invalid, it cannot be extended to be valid. +-- +-- @ +-- ms = Nothing +-- ==> 'validateUtf8More' s (chunk '<>' more) = (n, Nothing) +-- @ +-- +-- - Validating two chunks sequentially is the same as validating them +-- together at once: +-- +-- @ +-- ms = Just s' +-- ==> 'validateUtf8More' s (chunk '<>' more) = 'Data.Bifunctor.first' ('Data.ByteString.length' chunk '+') ('validateUtf8More' s' more) +-- @ +validateUtf8More :: Utf8State -> ByteString -> (Int, Maybe Utf8State) +validateUtf8More st bs = validateUtf8MoreCont st bs (,) + +-- CPS: inlining the continuation lets us make more tail calls and avoid +-- allocating a @Maybe@ in @decodeWith1/2@. +{-# INLINE validateUtf8MoreCont #-} +validateUtf8MoreCont :: Utf8State -> ByteString -> (Int -> Maybe Utf8State -> r) -> r +validateUtf8MoreCont st@(Utf8State s0 part) bs k + | len > 0 = loop 0 s0 + | otherwise = k (- partUtf8Len part) (Just st) + where + len = B.length bs + -- Complete an incomplete code point (if there is one) + -- and then jump to validateUtf8ChunkFrom + loop !i s + | s == utf8AcceptState = validateUtf8ChunkFrom i bs k + | i < len = + case updateDecoderState (B.index bs i) s of + s' | s' == utf8RejectState -> k (- partUtf8Len part) Nothing + | otherwise -> loop (i + 1) s' + | otherwise = k (- partUtf8Len part) (Just (Utf8State s (partUtf8UnsafeAppend part bs))) + +-- Eta-expanded to inline partUtf8Foldr +partUtf8ToStrictBuilder :: PartialUtf8CodePoint -> StrictBuilder +partUtf8ToStrictBuilder c = + partUtf8Foldr ((<>) . SB.unsafeFromWord8) mempty c + +utf8StateToStrictBuilder :: Utf8State -> StrictBuilder +utf8StateToStrictBuilder = partUtf8ToStrictBuilder . partialUtf8CodePoint + +-- | Decode another chunk in an ongoing UTF-8 stream. +-- +-- Returns a triple: +-- +-- 1. A 'StrictBuilder' for the decoded chunk of text. You can accumulate +-- chunks with @('<>')@ or output them with 'SB.toText'. +-- 2. The undecoded remainder of the given chunk, for diagnosing errors +-- and resuming (presumably after skipping some bytes). +-- 3. 'Just' the new state, or 'Nothing' if an invalid byte was encountered +-- (it will be within the first 4 bytes of the undecoded remainder). +-- +-- @since 2.0.2 +-- +-- === Properties +-- +-- Given: +-- +-- @ +-- (pre, suf, ms) = 'decodeUtf8More' s chunk +-- @ +-- +-- 1. If the output @pre@ is nonempty (alternatively, if @length chunk > length suf@) +-- +-- @ +-- s2b pre \`'Data.ByteString.append'\` suf = p2b s \`'Data.ByteString.append'\` chunk +-- @ +-- +-- where +-- +-- @ +-- s2b = 'Data.Text.Encoding.encodeUtf8' . 'Data.Text.Encoding.toText' +-- p2b = 'Data.Text.Internal.Encoding.partUtf8ToByteString' +-- @ +-- +-- 2. If the output @pre@ is empty (alternatively, if @length chunk = length suf@) +-- +-- @suf = chunk@ +-- +-- 3. Decoding chunks separately is equivalent to decoding their concatenation. +-- +-- Given: +-- +-- @ +-- (pre1, suf1, Just s1) = 'decodeUtf8More' s chunk1 +-- (pre2, suf2, ms2) = 'decodeUtf8More' s1 chunk2 +-- (pre3, suf3, ms3) = 'decodeUtf8More' s (chunk1 \`B.append\` chunk2) +-- @ +-- +-- we have: +-- +-- @ +-- s2b (pre1 '<>' pre2) = s2b pre3 +-- ms2 = ms3 +-- @ +decodeUtf8More :: Utf8State -> ByteString -> (StrictBuilder, ByteString, Maybe Utf8State) +decodeUtf8More s bs = + validateUtf8MoreCont s bs $ \len ms -> + let builder | len <= 0 = mempty + | otherwise = utf8StateToStrictBuilder s + <> SB.unsafeFromByteString (B.take len bs) + in (builder, B.drop len bs, ms) + +-- | Decode a chunk of UTF-8 text. To be continued with 'decodeUtf8More'. +-- +-- See 'decodeUtf8More' for details on the result. +-- +-- @since 2.0.2 +-- +-- === Properties +-- +-- @ +-- 'decodeUtf8Chunk' = 'decodeUtf8More' 'startUtf8State' +-- @ +-- +-- Given: +-- +-- @ +-- 'decodeUtf8Chunk' chunk = (builder, rest, ms) +-- @ +-- +-- @builder@ is a prefix and @rest@ is a suffix of @chunk@. +-- +-- @ +-- 'Data.Text.Encoding.encodeUtf8' ('Data.Text.Encoding.strictBuilderToText' builder) '<>' rest = chunk +-- @ +decodeUtf8Chunk :: ByteString -> (StrictBuilder, ByteString, Maybe Utf8State) +decodeUtf8Chunk = decodeUtf8More startUtf8State + +-- | Call the error handler on each byte of the partial code point stored in +-- 'Utf8State' and append the results. +-- +-- Exported for use in lazy 'Data.Text.Lazy.Encoding.decodeUtf8With'. +-- +-- @since 2.0.2 +{-# INLINE skipIncomplete #-} +skipIncomplete :: OnDecodeError -> String -> Utf8State -> StrictBuilder +skipIncomplete onErr msg s = + partUtf8Foldr + ((<>) . handleUtf8Error onErr msg) + mempty (partialUtf8CodePoint s) + +{-# INLINE handleUtf8Error #-} +handleUtf8Error :: OnDecodeError -> String -> Word8 -> StrictBuilder +handleUtf8Error onErr msg w = case onErr msg (Just w) of + Just c -> SB.fromChar c + Nothing -> mempty + +-- | Helper for 'Data.Text.Encoding.decodeUtf8With'. +-- +-- @since 2.0.2 + +-- This could be shorter by calling 'decodeUtf8With2' directly, but we make the +-- first call validateUtf8Chunk directly to return even faster in successful +-- cases. +decodeUtf8With1 :: +#if defined(ASSERTS) + HasCallStack => +#endif + OnDecodeError -> String -> ByteString -> Text +decodeUtf8With1 onErr msg bs = validateUtf8ChunkFrom 0 bs $ \len ms -> case ms of + Just s + | len == B.length bs -> + let !(SBS.SBS arr) = SBS.toShort bs in + Text (A.ByteArray arr) 0 len + | otherwise -> SB.toText $ + SB.unsafeFromByteString (B.take len bs) <> skipIncomplete onErr msg s + Nothing -> + let (builder, _, s) = decodeUtf8With2 onErr msg startUtf8State (B.drop (len + 1) bs) in + SB.toText $ + SB.unsafeFromByteString (B.take len bs) <> + handleUtf8Error onErr msg (B.index bs len) <> + builder <> + skipIncomplete onErr msg s + +-- | Helper for 'Data.Text.Encoding.decodeUtf8With', +-- 'Data.Text.Encoding.streamDecodeUtf8With', and lazy +-- 'Data.Text.Lazy.Encoding.decodeUtf8With', +-- which use an 'OnDecodeError' to process bad bytes. +-- +-- See 'decodeUtf8Chunk' for a more flexible alternative. +-- +-- @since 2.0.2 +decodeUtf8With2 :: +#if defined(ASSERTS) + HasCallStack => +#endif + OnDecodeError -> String -> Utf8State -> ByteString -> (StrictBuilder, ByteString, Utf8State) +decodeUtf8With2 onErr msg s0 bs = loop s0 0 mempty + where + loop s i !builder = + let nonEmptyPrefix len = builder + <> utf8StateToStrictBuilder s + <> SB.unsafeFromByteString (B.take len (B.drop i bs)) + in validateUtf8MoreCont s (B.drop i bs) $ \len ms -> case ms of + Nothing -> + if len < 0 + then + -- If the first byte cannot complete the partial code point in s, + -- retry from startUtf8State. + let builder' = builder <> skipIncomplete onErr msg s + -- Note: loop is strict on builder, so if onErr raises an error it will + -- be forced here, short-circuiting the loop as desired. + in loop startUtf8State i builder' + else + let builder' = nonEmptyPrefix len + <> handleUtf8Error onErr msg (B.index bs (i + len)) + in loop startUtf8State (i + len + 1) builder' + Just s' -> + let builder' = if len <= 0 then builder else nonEmptyPrefix len + undecoded = if B.length bs >= partUtf8Len (partialUtf8CodePoint s') + then B.drop (i + len) bs -- Reuse bs if possible + else partUtf8ToByteString (partialUtf8CodePoint s') + in (builder', undecoded, s') diff --git a/src/Data/Text/Internal/Encoding/Utf8.hs b/src/Data/Text/Internal/Encoding/Utf8.hs index bcf6a778..c8357677 100644 --- a/src/Data/Text/Internal/Encoding/Utf8.hs +++ b/src/Data/Text/Internal/Encoding/Utf8.hs @@ -34,8 +34,11 @@ module Data.Text.Internal.Encoding.Utf8 , validate3 , validate4 -- * Naive decoding - , DecoderResult(..) , DecoderState(..) + , utf8AcceptState + , utf8RejectState + , updateDecoderState + , DecoderResult(..) , CodePoint(..) , utf8DecodeStart , utf8DecodeContinue @@ -244,7 +247,7 @@ byteToClass n = ByteClass (W8# el#) table# = "\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\b\b\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\n\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\EOT\ETX\ETX\v\ACK\ACK\ACK\ENQ\b\b\b\b\b\b\b\b\b\b\b"# newtype DecoderState = DecoderState Word8 - deriving (Eq) + deriving (Eq, Show) utf8AcceptState :: DecoderState utf8AcceptState = DecoderState 0 @@ -261,6 +264,9 @@ updateState (ByteClass c) (DecoderState s) = DecoderState (W8# el#) table# :: Addr# table# = "\NUL\f\CAN$<`T\f\f\f0H\f\f\f\f\f\f\f\f\f\f\f\f\f\NUL\f\f\f\f\f\NUL\f\NUL\f\f\f\CAN\f\f\f\f\f\CAN\f\CAN\f\f\f\f\f\f\f\f\f\CAN\f\f\f\f\f\CAN\f\f\f\f\f\f\f\CAN\f\f\f\f\f\f\f\f\f$\f$\f\f\f$\f\f\f\f\f$\f$\f\f\f$\f\f\f\f\f\f\f\f\f\f"# +updateDecoderState :: Word8 -> DecoderState -> DecoderState +updateDecoderState b s = updateState (byteToClass b) s + newtype CodePoint = CodePoint Int -- | @since 2.0 diff --git a/src/Data/Text/Internal/StrictBuilder.hs b/src/Data/Text/Internal/StrictBuilder.hs new file mode 100644 index 00000000..84a57264 --- /dev/null +++ b/src/Data/Text/Internal/StrictBuilder.hs @@ -0,0 +1,121 @@ +{-# LANGUAGE CPP #-} +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE RankNTypes #-} + +-- | +-- Module : Data.Text.Internal.Builder +-- License : BSD-style (see LICENSE) +-- Stability : experimental +-- +-- /Warning/: this is an internal module, and does not have a stable +-- API or name. Functions in this module may not check or enforce +-- preconditions expected by public modules. Use at your own risk! +-- +-- @since 2.0.2 + +module Data.Text.Internal.StrictBuilder + ( StrictBuilder(..) + , toText + , fromChar + , fromText + + -- * Unsafe + -- $unsafe + , unsafeFromByteString + , unsafeFromWord8 + ) where + +import Control.Monad.ST (ST, runST) +import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) +import Data.Functor (void) +import Data.Word (Word8) +import Data.ByteString (ByteString) +#if !MIN_VERSION_base(4,11,0) +import Data.Semigroup (Semigroup(..)) +#endif +import Data.Text.Internal (Text(..), empty, safe) +import Data.Text.Internal.ByteStringCompat (withBS) +import Data.Text.Internal.Encoding.Utf8 (utf8Length) +import Data.Text.Internal.Unsafe (unsafeWithForeignPtr) +import qualified Data.ByteString as B +import qualified Data.Text.Array as A +import qualified Data.Text.Internal.Unsafe.Char as Char + +-- | A delayed representation of strict 'Text'. +-- +-- @since 2.0.2 +data StrictBuilder = StrictBuilder + { sbLength :: {-# UNPACK #-} !Int + , sbWrite :: forall s. A.MArray s -> Int -> ST s () + } + +-- | Use 'StrictBuilder' to build 'Text'. +-- +-- @since 2.0.2 +toText :: StrictBuilder -> Text +toText (StrictBuilder 0 _) = empty +toText (StrictBuilder n write) = runST (do + dst <- A.new n + write dst 0 + arr <- A.unsafeFreeze dst + pure (Text arr 0 n)) + +-- | Concatenation of 'StrictBuilder' is right-biased: +-- the right builder will be run first. This allows a builder to +-- run tail-recursively when it was accumulated left-to-right. +instance Semigroup StrictBuilder where + (<>) = appendRStrictBuilder + +instance Monoid StrictBuilder where + mempty = emptyStrictBuilder + mappend = (<>) + +emptyStrictBuilder :: StrictBuilder +emptyStrictBuilder = StrictBuilder 0 (\_ _ -> pure ()) + +appendRStrictBuilder :: StrictBuilder -> StrictBuilder -> StrictBuilder +appendRStrictBuilder (StrictBuilder 0 _) b2 = b2 +appendRStrictBuilder b1 (StrictBuilder 0 _) = b1 +appendRStrictBuilder (StrictBuilder n1 write1) (StrictBuilder n2 write2) = + StrictBuilder (n1 + n2) (\dst ofs -> do + write2 dst (ofs + n1) + write1 dst ofs) + +copyFromByteString :: A.MArray s -> Int -> ByteString -> ST s () +copyFromByteString dst ofs src = withBS src $ \ srcFPtr len -> + unsafeIOToST $ unsafeWithForeignPtr srcFPtr $ \ srcPtr -> do + unsafeSTToIO $ A.copyFromPointer dst ofs srcPtr len + +-- | Copy a 'ByteString'. +-- +-- Unsafe: This may not be valid UTF-8 text. +-- +-- @since 2.0.2 +unsafeFromByteString :: ByteString -> StrictBuilder +unsafeFromByteString bs = + StrictBuilder (B.length bs) (\dst ofs -> copyFromByteString dst ofs bs) + +-- | +-- @since 2.0.2 +{-# INLINE fromChar #-} +fromChar :: Char -> StrictBuilder +fromChar c = + StrictBuilder (utf8Length c) (\dst ofs -> void (Char.unsafeWrite dst ofs (safe c))) + +-- $unsafe +-- For internal purposes, we abuse 'StrictBuilder' as a delayed 'Array' rather +-- than 'Text': it may not actually be valid 'Text'. + +-- | Unsafe: This may not be valid UTF-8 text. +-- +-- @since 2.0.2 +unsafeFromWord8 :: Word8 -> StrictBuilder +unsafeFromWord8 !w = + StrictBuilder 1 (\dst ofs -> A.unsafeWrite dst ofs w) + +-- | Copy 'Text' in a 'StrictBuilder' +-- +-- @since 2.0.2 +fromText :: Text -> StrictBuilder +fromText (Text src srcOfs n) = StrictBuilder n (\dst dstOfs -> + A.copyI n dst dstOfs src srcOfs) diff --git a/src/Data/Text/Lazy/Encoding.hs b/src/Data/Text/Lazy/Encoding.hs index ad361af5..06a06224 100644 --- a/src/Data/Text/Lazy/Encoding.hs +++ b/src/Data/Text/Lazy/Encoding.hs @@ -1,5 +1,6 @@ {-# LANGUAGE BangPatterns,CPP #-} {-# LANGUAGE Trustworthy #-} +{-# LANGUAGE ViewPatterns #-} -- | -- Module : Data.Text.Lazy.Encoding @@ -60,15 +61,15 @@ import Data.Monoid (Monoid(..)) import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode) import Data.Text.Internal.Lazy (Text(..), chunk, empty, foldrChunks) import Data.Word (Word8) -import qualified Data.ByteString as S import qualified Data.ByteString.Builder as B import qualified Data.ByteString.Builder.Prim as BP import qualified Data.ByteString.Lazy as B import qualified Data.ByteString.Lazy.Internal as B -import qualified Data.Text as T import qualified Data.Text.Encoding as TE +import qualified Data.Text.Internal.Encoding as TE import qualified Data.Text.Internal.Lazy.Encoding.Fusion as E import qualified Data.Text.Internal.Lazy.Fusion as F +import qualified Data.Text.Internal.StrictBuilder as SB import Data.Text.Unsafe (unsafeDupablePerformIO) -- $strict @@ -106,24 +107,14 @@ decodeLatin1 = foldr (chunk . TE.decodeLatin1) empty . B.toChunks -- | Decode a 'ByteString' containing UTF-8 encoded text. decodeUtf8With :: OnDecodeError -> B.ByteString -> Text -decodeUtf8With onErr (B.Chunk b0 bs0) = - case TE.streamDecodeUtf8With onErr b0 of - TE.Some t l f -> chunk t (go f l bs0) +decodeUtf8With onErr = loop TE.startUtf8State where - go f0 _ (B.Chunk b bs) = - case f0 b of - TE.Some t l f -> chunk t (go f l bs) - go _ l _ - | S.null l = empty - | otherwise = - let !t = T.pack (skipBytes l) - skipBytes = S.foldr (\x s' -> - case onErr desc (Just x) of - Just c -> c : s' - Nothing -> s') [] in - Chunk t Empty - desc = "Data.Text.Lazy.Encoding.decodeUtf8With: Invalid UTF-8 stream" -decodeUtf8With _ _ = empty + chunkb builder t | SB.sbLength builder == 0 = t + | otherwise = Chunk (TE.strictBuilderToText builder) t + loop s (B.Chunk b bs) = case TE.decodeUtf8With2 onErr msg s b of + (builder, _, s') -> chunkb builder (loop s' bs) + loop s B.Empty = chunkb (TE.skipIncomplete onErr msg s) Empty + msg = "Data.Text.Internal.Encoding: Invalid UTF-8 stream" -- | Decode a 'ByteString' containing UTF-8 encoded text that is known -- to be valid. diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs index 2422ddd7..bf8d1ee8 100644 --- a/tests/Tests/Properties/Transcoding.hs +++ b/tests/Tests/Properties/Transcoding.hs @@ -1,6 +1,6 @@ -- | Tests for encoding and decoding -{-# LANGUAGE OverloadedStrings, ScopedTypeVariables #-} +{-# LANGUAGE CPP, OverloadedStrings, ScopedTypeVariables #-} {-# OPTIONS_GHC -fno-warn-missing-signatures #-} module Tests.Properties.Transcoding ( testTranscoding @@ -9,9 +9,16 @@ module Tests.Properties.Transcoding import Prelude hiding (head, tail) import Data.Bits ((.&.), shiftR) import Data.Char (chr, ord) +import Data.Functor (void) +import Data.Maybe (isNothing) +#if !MIN_VERSION_base(4,11,0) +import Data.Semigroup ((<>)) +#endif +import Data.Word (Word8) import Test.QuickCheck hiding ((.&.)) import Test.Tasty (TestTree, testGroup) import Test.Tasty.QuickCheck (testProperty) +import Test.Tasty.HUnit ((@?=), assertBool, assertFailure, testCase) import Tests.QuickCheckUtils import qualified Control.Exception as Exception import qualified Data.Bits as Bits (shiftL, shiftR) @@ -25,6 +32,7 @@ import qualified Data.ByteString.Lazy.Char8 as BLC import qualified Data.Text as T import qualified Data.Text.Encoding as E import qualified Data.Text.Encoding.Error as E +import qualified Data.Text.Internal.Encoding as E import qualified Data.Text.Lazy as TL import qualified Data.Text.Lazy.Encoding as EL @@ -36,6 +44,167 @@ tl_ascii t = EL.decodeASCII (EL.encodeUtf8 a) === a t_latin1 = E.decodeLatin1 `eq` (T.pack . BC.unpack) tl_latin1 = EL.decodeLatin1 `eq` (TL.pack . BLC.unpack) +t_p_utf8_1 = testValidateUtf8_ [0x63] 1 +t_p_utf8_2 = testValidateUtf8_ [0x63, 0x63, 0x63] 3 +t_p_utf8_3 = testValidateUtf8_ [0x63, 0x63, 0xc2, 0x80, 0x63] 5 +t_p_utf8_4 = testValidateUtf8_ [0x63, 0xe1, 0x80, 0x80, 0x63] 5 +t_p_utf8_5 = testValidateUtf8_ [0xF0, 0x90, 0x80, 0x80, 0x63] 5 +t_p_utf8_6 = testValidateUtf8_ [0x63, 0x63, 0xF0, 0x90, 0x80] 2 +t_p_utf8_7 = testValidateUtf8_ [0x63, 0x63, 0x63, 0xF0, 0x90] 3 +t_p_utf8_8 = testValidateUtf8Fail [0xF0, 0x90, 0x80, 0x63, 0x63] 0 +t_p_utf8_9 = testValidateUtf8Fail [0x63, 0x63, 0x80, 0x63, 0x63] 2 +t_p_utf8_0 = testValidateUtf8Fail [0x63, 0x63, 0xe1, 0x63, 0x63] 2 + +testValidateUtf8With :: + (B.ByteString -> (Int, Maybe E.Utf8State)) -> + (Maybe E.Utf8State -> IO r) -> + [Word8] -> Int -> IO r +testValidateUtf8With validate k xs expectedLen = case validate (B.pack xs) of + (len, s) -> do + len @?= expectedLen + k s + +expectJust :: Maybe a -> IO a +expectJust Nothing = assertFailure "Unexpected Nothing" +expectJust (Just s) = pure s + +expectNothing :: Maybe a -> IO () +expectNothing Nothing = pure () +expectNothing (Just _) = assertFailure "Unexpected Just" + +testValidateUtf8 :: [Word8] -> Int -> IO E.Utf8State +testValidateUtf8 = testValidateUtf8With E.validateUtf8Chunk expectJust + +testValidateUtf8_ :: [Word8] -> Int -> IO () +testValidateUtf8_ = testValidateUtf8With E.validateUtf8Chunk (void . expectJust) + +testValidateUtf8Fail :: [Word8] -> Int -> IO () +testValidateUtf8Fail = testValidateUtf8With E.validateUtf8Chunk expectNothing + +testValidateUtf8More :: E.Utf8State -> [Word8] -> Int -> IO E.Utf8State +testValidateUtf8More s = testValidateUtf8With (E.validateUtf8More s) expectJust + +testValidateUtf8MoreFail :: E.Utf8State -> [Word8] -> Int -> IO () +testValidateUtf8MoreFail s = testValidateUtf8With (E.validateUtf8More s) expectNothing + +t_pn_utf8_1 = do + s <- testValidateUtf8 [0xF0, 0x90, 0x80] 0 + _ <- testValidateUtf8More s [0x80] 1 + testValidateUtf8MoreFail s [0x7f] (-3) +t_pn_utf8_2 = do + s0 <- testValidateUtf8 [0xF0] 0 + testValidateUtf8MoreFail s0 [0x7f] (-1) + s1 <- testValidateUtf8More s0 [0x90] (-1) + testValidateUtf8MoreFail s1 [0x7f] (-2) + s2 <- testValidateUtf8More s1 [0x80] (-2) + testValidateUtf8MoreFail s2 [0x7f] (-3) + _ <- testValidateUtf8More s2 [0x80] 1 + pure () +t_pn_utf8_3 = do + s1 <- testValidateUtf8 [0xc2] 0 + assertBool "PartialUtf8 must be partial" $ B.length (E.getPartialUtf8 s1) < E.getCompleteLen s1 + testValidateUtf8MoreFail s1 [0x80, 0x80] 1 + +-- Precondition: (i, ms1) = E.validateUtf8More s chunk +-- +-- The index points to the end of the longest valid prefix +-- of prechunk `B.append` chunk +pre_validateUtf8More_validPrefix s chunk i = + let prechunk = E.getPartialUtf8 s in + -- Note: i <= 0 implies take i = id + let (j, ms2) = E.validateUtf8Chunk (B.take (B.length prechunk + i) (prechunk `B.append` chunk)) in + counterexample (show prechunk) $ + (B.length prechunk + i, ms2) === (j, Just E.startUtf8State) + +-- Precondition: (i, Nothing) = E.validateUtf8More s chunk +-- +-- Appending to an invalid chunk yields another invalid chunk. +pre_validateUtf8More_maximalPrefix s chunk i more = + E.validateUtf8More s (chunk `B.append` more) === (i, Nothing) + +-- Precondition: (i, Just s1) = E.validateUtf8More s chunk +pre_validateUtf8More_suffix s chunk i s1 = + if 0 <= i + then B.drop i chunk === p2b s1 -- The state s1 contains a suffix of the chunk. + else p2b s `B.append` chunk === p2b s1 -- Or the chunk extends the incomplete code point in s1. + +-- Precondition: (i, Just s1) = E.validateUtf8More s chunk1 +-- +-- Validating two chunks sequentially is equivalent to validating them at once. +pre_validateUtf8More_append s chunk1 s1 chunk2 = + let (j, ms2) = E.validateUtf8More s1 chunk2 in + (B.length chunk1 + j, ms2) === E.validateUtf8More s (chunk1 `B.append` chunk2) + +-- These wrappers use custom generators to satisfy the above properties. + +t_validateUtf8More_validPrefix = property $ do + cex@(s, chunk, i, _ms1) <- randomMoreChunk + pure $ counterexample (show cex) $ + pre_validateUtf8More_validPrefix s chunk i + +t_validateUtf8More_maximalPrefix = property $ do + -- We want chunks that fail validation: force their size to be big,.. + cex@(s, chunk, i, ms1) <- scale (* 3) arbitraryMoreChunk + pure $ counterexample (show cex) $ + -- ... and just use rejection sampling + isNothing ms1 ==> + pre_validateUtf8More_maximalPrefix s chunk i + +t_validateUtf8More_valid = property $ do + cex@(s, chunk1, i, s1, chunk2) <- validMoreChunks + pure $ counterexample (show cex) $ + pre_validateUtf8More_suffix s chunk1 i s1 .&&. + pre_validateUtf8More_append s chunk1 s1 chunk2 + +randomMoreChunk, arbitraryMoreChunk, validMoreChunk :: Gen (E.Utf8State, B.ByteString, Int, Maybe E.Utf8State) +randomMoreChunk = oneof [arbitraryMoreChunk, validMoreChunk] + +arbitraryMoreChunk = do + s <- randomUtf8State + chunk <- arbitrary + let (i, ms1) = E.validateUtf8More s chunk + pure (s, chunk, i, ms1) + +-- | Generate a random state by parsing a prefix of a Char +randomUtf8State :: Gen E.Utf8State +randomUtf8State = do + c <- arbitrary + chunk <- elements (B.inits (E.encodeUtf8 (T.singleton c))) + case E.validateUtf8Chunk chunk of + (_, Just s) -> pure s + (_, Nothing) -> error "should not happen" + +-- | Make a valid chunk, i.e., (s, chunk) such that +-- +-- validateUtf8More s chunk = (i, Just s1) +-- +-- Also returning i and s1 to not repeat work. +validMoreChunk = do + (s, chunk, i, s1, _chunk2) <- validMoreChunks + pure (s, chunk, i, Just s1) + +-- | Make a valid chunk by slicing a valid UTF8 bs, +-- and also provide a second chunk which is a valid extension +-- with 0.5 probability. +validMoreChunks :: Gen (E.Utf8State, B.ByteString, Int, E.Utf8State, B.ByteString) +validMoreChunks = do + bs <- E.encodeUtf8 <$> scale (* 3) arbitrary + -- Take an intermediate state. + -- No need to go too far since code points are at most 4 bytes long + i <- choose (0, 3) + let (bs0, bs1) = B.splitAt i bs + case E.validateUtf8Chunk bs0 of + (_, Just s) -> do + j <- choose (0, B.length bs1) + let (chunk1, chunk2') = B.splitAt j bs1 + case E.validateUtf8More s chunk1 of + (n1, Just s1) -> do + chunk2 <- oneof [pure chunk2', arbitrary] + pure (s, chunk1, n1, s1, chunk2) + (_, Nothing) -> error "should not happen" + (_, Nothing) -> error "should not happen" + +t_utf8_c = (E.strictBuilderToText . fst3 . E.decodeUtf8Chunk . E.encodeUtf8) `eq` id t_utf8 = (E.decodeUtf8 . E.encodeUtf8) `eq` id t_utf8' = (E.decodeUtf8' . E.encodeUtf8) `eq` (id . Right) tl_utf8 = (EL.decodeUtf8 . EL.encodeUtf8) `eq` id @@ -49,6 +218,9 @@ tl_utf32LE = (EL.decodeUtf32LE . EL.encodeUtf32LE) `eq` id t_utf32BE = (E.decodeUtf32BE . E.encodeUtf32BE) `eq` id tl_utf32BE = (EL.decodeUtf32BE . EL.encodeUtf32BE) `eq` id +fst3 :: (a, b, c) -> a +fst3 (a, _, _) = a + runBuilder :: B.Builder -> B.ByteString runBuilder = -- Use smallish buffers to exercise bufferFull case as well @@ -191,26 +363,124 @@ t_decode_with_error3 = t_decode_with_error4 = E.decodeUtf8With (\_ _ -> Just 'x') (B.pack [0xF0, 97, 97, 97]) === "xaaa" +t_decode_with_error1' = do + E.Some x1 bs1 f1 <- pure $ E.streamDecodeUtf8With (\_ _ -> Just 'x') (B.pack [0xc2]) + x1 @?= "" + bs1 @?= B.pack [0xc2] + E.Some x2 bs2 _ <- pure $ f1 $ B.pack [0x80, 0x80] + x2 @?= "\x80x" + bs2 @?= mempty t_decode_with_error2' = case E.streamDecodeUtf8With (\_ _ -> Just 'x') (B.pack [0xC2, 97]) of - E.Some x _ _ -> x === "xa" + E.Some x _ _ -> x @?= "xa" t_decode_with_error3' = case E.streamDecodeUtf8With (\_ _ -> Just 'x') (B.pack [0xC2, 97, 97]) of - E.Some x _ _ -> x === "xaa" + E.Some x _ _ -> x @?= "xaa" t_decode_with_error4' = case E.streamDecodeUtf8With (\_ _ -> Just 'x') (B.pack [0xC2, 97, 97, 97]) of - E.Some x _ _ -> x === "xaaa" -t_decode_with_error5' = ioProperty $ do + E.Some x _ _ -> x @?= "xaaa" +t_decode_with_error5' = do ret <- Exception.try $ Exception.evaluate $ E.streamDecodeUtf8 (B.pack [0x81]) - pure $ case ret of - Left (_ :: E.UnicodeException) -> True - Right{} -> False + case ret of + Left (_ :: E.UnicodeException) -> pure () + Right{} -> assertFailure "Unexpected success" + +testDecodeUtf8With :: (Maybe E.Utf8State -> IO r) -> E.Utf8State -> [Word8] -> T.Text -> IO r +testDecodeUtf8With k s xs expected = + let xs' = B.pack xs in + case E.decodeUtf8More s xs' of + (prefix, bs, s') -> do + let txt = E.strictBuilderToText prefix + txt @?= expected + if T.null txt then + bs @?= xs' + else + E.encodeUtf8 txt `B.append` bs @?= E.getPartialUtf8 s `B.append` xs' + k s' + +testDecodeUtf8 :: E.Utf8State -> [Word8] -> T.Text -> IO E.Utf8State +testDecodeUtf8 = testDecodeUtf8With (\ms -> case ms of + Just s -> pure s + Nothing -> assertFailure "Unexpected failure") + +testDecodeUtf8Fail :: E.Utf8State -> [Word8] -> T.Text -> IO () +testDecodeUtf8Fail = testDecodeUtf8With (\ms -> case ms of + Just _ -> assertFailure "Unexpected failure" + Nothing -> pure ()) + +t_decode_chunk1 = do + s1 <- testDecodeUtf8 E.startUtf8State [0xc2] "" + B.length (E.getPartialUtf8 s1) @?= 1 + testDecodeUtf8Fail s1 [0x80, 0x80] "\128" + +t_decode_chunk2 = do + s1 <- testDecodeUtf8 E.startUtf8State [0xf0] "" + s2 <- testDecodeUtf8 s1 [0x90, 0x80] "" + _ <- testDecodeUtf8 s2 [0x80, 0x41] "\65536A" + pure () t_infix_concat bs1 text bs2 = forAll (Blind <$> genDecodeErr Replace) $ \(Blind onErr) -> text `T.isInfixOf` E.decodeUtf8With onErr (B.concat [bs1, E.encodeUtf8 text, bs2]) +t_textToStrictBuilder = + (E.strictBuilderToText . E.textToStrictBuilder) `eq` id + +-- decodeUtf8Chunk splits the input bytestring +t_decodeUtf8Chunk_split chunk = + let (pre, suf, _ms) = E.decodeUtf8Chunk chunk + in s2b pre `B.append` suf === chunk + +-- decodeUtf8More mostly splits the input bytestring, +-- also inserting bytes from the partial code point in s. +-- +-- This is wrapped by t_decodeUtf8More_split to have more +-- likely valid chunks. +t_decodeUtf8More_split' s chunk = + let (pre, suf, _ms) = E.decodeUtf8More s chunk + in if B.length chunk > B.length suf + then s2b pre `B.append` suf === p2b s `B.append` chunk + else suf === chunk + +-- The output state of decodeUtf8More contains the suffix. +-- +-- Precondition (valid chunk): ms = Just s' +pre_decodeUtf8More_suffix s chunk = + let (_pre, suf, ms) = E.decodeUtf8More s chunk + in case ms of + Nothing -> discard + Just s' -> if B.length chunk > B.length suf + then p2b s' === suf + else p2b s' === p2b s `B.append` suf + +-- Decoding chunks separately is equivalent to decoding their concatenation. +pre_decodeUtf8More_append s chunk1 chunk2 = + let (pre1, _, ms1) = E.decodeUtf8More s chunk1 in + case ms1 of + Nothing -> discard + Just s1 -> + let (pre2, _, ms2) = E.decodeUtf8More s1 chunk2 in + let (pre3, _, ms3) = E.decodeUtf8More s (chunk1 `B.append` chunk2) in + (s2b (pre1 <> pre2), ms2) === (s2b pre3, ms3) + +-- Properties for any chunk +-- (but do try to generate valid chunks often enough) +t_decodeUtf8More1 = property $ do + cex@(s, chunk, _, _) <- randomMoreChunk + pure $ counterexample (show cex) $ + t_decodeUtf8More_split' s chunk + +-- Properties that require valid chunks +t_decodeUtf8More2 = property $ do + cex@(s, chunk, _, _, chunk2) <- validMoreChunks + pure $ counterexample (show cex) $ + pre_decodeUtf8More_suffix s chunk .&&. + pre_decodeUtf8More_append s chunk chunk2 + +s2b = E.encodeUtf8 . E.strictBuilderToText +p2b = E.getPartialUtf8 + testTranscoding :: TestTree testTranscoding = testGroup "transcoding" [ @@ -220,8 +490,8 @@ testTranscoding = testProperty "tl_latin1" tl_latin1, testProperty "t_utf8" t_utf8, testProperty "t_utf8'" t_utf8', - testProperty "t_utf8_incr" t_utf8_incr, testProperty "t_utf8_undecoded" t_utf8_undecoded, + testProperty "t_utf8_incr" t_utf8_incr, testProperty "tl_utf8" tl_utf8, testProperty "tl_utf8'" tl_utf8', testProperty "t_utf16LE" t_utf16LE, @@ -248,10 +518,40 @@ testTranscoding = testProperty "t_decode_with_error2" t_decode_with_error2, testProperty "t_decode_with_error3" t_decode_with_error3, testProperty "t_decode_with_error4" t_decode_with_error4, - testProperty "t_decode_with_error2'" t_decode_with_error2', - testProperty "t_decode_with_error3'" t_decode_with_error3', - testProperty "t_decode_with_error4'" t_decode_with_error4', - testProperty "t_decode_with_error5'" t_decode_with_error5', + testCase "t_decode_with_error1'" t_decode_with_error1', + testCase "t_decode_with_error2'" t_decode_with_error2', + testCase "t_decode_with_error3'" t_decode_with_error3', + testCase "t_decode_with_error4'" t_decode_with_error4', + testCase "t_decode_with_error5'" t_decode_with_error5', testProperty "t_infix_concat" t_infix_concat + ], + testGroup "validate" [ + testProperty "t_validateUtf8More_validPrefix" t_validateUtf8More_validPrefix, + testProperty "t_validateUtf8More_maximalPrefix" t_validateUtf8More_maximalPrefix, + testProperty "t_validateUtf8More_valid" t_validateUtf8More_valid + ], + testGroup "streaming" [ + testProperty "t_utf8_c" t_utf8_c, + testCase "t_p_utf8_1" t_p_utf8_1, + testCase "t_p_utf8_2" t_p_utf8_2, + testCase "t_p_utf8_3" t_p_utf8_3, + testCase "t_p_utf8_4" t_p_utf8_4, + testCase "t_p_utf8_5" t_p_utf8_5, + testCase "t_p_utf8_6" t_p_utf8_6, + testCase "t_p_utf8_7" t_p_utf8_7, + testCase "t_p_utf8_8" t_p_utf8_8, + testCase "t_p_utf8_9" t_p_utf8_9, + testCase "t_p_utf8_0" t_p_utf8_0, + testCase "t_pn_utf8_1" t_pn_utf8_1, + testCase "t_pn_utf8_2" t_pn_utf8_2, + testCase "t_pn_utf8_3" t_pn_utf8_3, + testCase "t_decode_chunk1" t_decode_chunk1, + testCase "t_decode_chunk2" t_decode_chunk2, + testProperty "t_decodeUtf8Chunk_split" t_decodeUtf8Chunk_split, + testProperty "t_decodeUtf8More1" t_decodeUtf8More1, + testProperty "t_decodeUtf8More2" t_decodeUtf8More2 + ], + testGroup "strictBuilder" [ + testProperty "textToStrictBuilder" t_textToStrictBuilder ] ] diff --git a/text.cabal b/text.cabal index 445bb6e5..05946247 100644 --- a/text.cabal +++ b/text.cabal @@ -150,6 +150,7 @@ library Data.Text.Internal.Builder.RealFloat.Functions Data.Text.Internal.ByteStringCompat Data.Text.Internal.PrimCompat + Data.Text.Internal.Encoding Data.Text.Internal.Encoding.Fusion Data.Text.Internal.Encoding.Fusion.Common Data.Text.Internal.Encoding.Utf16 @@ -168,6 +169,7 @@ library Data.Text.Internal.Private Data.Text.Internal.Read Data.Text.Internal.Search + Data.Text.Internal.StrictBuilder Data.Text.Internal.Unsafe Data.Text.Internal.Unsafe.Char Data.Text.Lazy