haskell · Bodigrim · Sep 8, 2021 · Jul 14, 2021 · May 22, 2021 · May 22, 2021
diff --git a/changelog.md b/changelog.md
@@ -1,5 +1,24 @@
-### 1.3
-
+### 2.0
+
+* [Switch internal representation of text from UTF-16 to UTF-8](https://github.com/haskell/text/pull/365):
+  * Functions in `Data.Text.Array` now operate over arrays of `Word8` instead of `Word16`.
+  * Rename constructors of `Array` and `MArray` to `ByteArray` and `MutableByteArray`.
+  * Rename functions and types in `Data.Text.Foreign` to reflect switch
+    from `Word16` to `Word8`.
+  * Rename slicing functions in `Data.Text.Unsafe` to reflect switch
+    from `Word16` to `Word8`.
+  * Rename `Data.Text.Internal.Unsafe.Char.unsafeChr` to `unsafeChr16`.
+  * Change semantics and order of arguments of `Data.Text.Array.copyI`:
+    pass length, not end offset.
+  * Extend `Data.Text.Internal.Encoding.Utf8` to provide more UTF-8 related routines.
+  * Extend interface of `Data.Text.Array` with more utility functions.
+  * Add `instance Show Data.Text.Unsafe.Iter`.
+  * Add `Data.Text.measureOff`.
+  * Extend `Data.Text.Unsafe` with `iterArray` and `reverseIterArray`.
+  * Export `Data.Text.Internal.Lazy.equal`.
+  * Export `Data.Text.Internal.append`.
+  * Add `Data.Text.Internal.Private.spanAscii_`.
+  * Replacement characters in `decodeUtf8With` are no longer limited to Basic Multilingual Plane.
 * [Disable implicit fusion rules](https://github.com/haskell/text/pull/348)
 * [Add `Data.Text.Encoding.decodeUtf8Lenient`](https://github.com/haskell/text/pull/342)
 * [Remove `Data.Text.Internal.Unsafe.Shift`](https://github.com/haskell/text/pull/343)

diff --git a/src/Data/Text.hs b/src/Data/Text.hs
@@ -1225,6 +1225,8 @@ take n t@(Text arr off len)
 --
 -- This function is used to implement 'take', 'drop', 'splitAt' and 'length'
 -- and is useful on its own in streaming and parsing libraries.
+--
+-- @since 2.0
 measureOff :: Int -> Text -> Int
 measureOff !n (Text (A.ByteArray arr) off len) = if len == 0 then 0 else
   cSsizeToInt $ unsafeDupablePerformIO $

diff --git a/src/Data/Text/Array.hs b/src/Data/Text/Array.hs
@@ -80,6 +80,8 @@ new (I# len#)
 {-# INLINE new #-}
 
 -- | Create an uninitialized mutable pinned array.
+--
+-- @since 2.0
 newPinned :: forall s. Int -> ST s (MArray s)
 newPinned (I# len#)
 #if defined(ASSERTS)
@@ -90,13 +92,15 @@ newPinned (I# len#)
       (# s2#, marr# #) -> (# s2#, MutableByteArray marr# #)
 {-# INLINE newPinned #-}
 
+-- | @since 2.0
 newFilled :: Int -> Int -> ST s (MArray s)
 newFilled (I# len#) (I# c#) = ST $ \s1# ->
   case newByteArray# len# s1# of
     (# s2#, marr# #) -> case setByteArray# marr# 0# len# c# s2# of
       s3# -> (# s3#, MutableByteArray marr# #)
 {-# INLINE newFilled #-}
 
+-- | @since 2.0
 tile :: MArray s -> Int -> ST s ()
 tile marr tileLen = do
   totalLen <- getSizeofMArray marr
@@ -130,8 +134,10 @@ unsafeIndex (ByteArray arr) i@(I# i#) =
   case indexWord8Array# arr i# of r# -> (W8# r#)
 {-# INLINE unsafeIndex #-}
 
--- sizeofMutableByteArray# is deprecated, because it is unsafe in the presence of
--- shrinkMutableByteArray# and resizeMutableByteArray#.
+-- | 'sizeofMutableByteArray#' is deprecated, because it is unsafe in the presence of
+-- 'shrinkMutableByteArray#' and 'resizeMutableByteArray#'.
+--
+-- @since 2.0
 getSizeofMArray :: MArray s -> ST s Int
 getSizeofMArray (MutableByteArray marr) = ST $ \s0# ->
   case getSizeofMutableByteArray# marr s0# of
@@ -185,12 +191,14 @@ run2 k = runST (do
                  return (arr,b))
 {-# INLINE run2 #-}
 
+-- | @since 2.0
 resizeM :: MArray s -> Int -> ST s (MArray s)
 resizeM (MutableByteArray ma) i@(I# i#) = ST $ \s1# ->
   case resizeMutableByteArray# ma i# s1# of
     (# s2#, newArr #) -> (# s2#, MutableByteArray newArr #)
 {-# INLINE resizeM #-}
 
+-- | @since 2.0
 shrinkM ::
 #if defined(ASSERTS)
   HasCallStack =>
@@ -253,6 +261,8 @@ copyI count@(I# count#) (MutableByteArray dst#) dstOff@(I# dstOff#) (ByteArray s
 {-# INLINE copyI #-}
 
 -- | Copy from pointer.
+--
+-- @since 2.0
 copyFromPointer
   :: MArray s               -- ^ Destination
   -> Int                    -- ^ Destination offset
@@ -270,6 +280,8 @@ copyFromPointer (MutableByteArray dst#) dstOff@(I# dstOff#) (Ptr src#) count@(I#
 {-# INLINE copyFromPointer #-}
 
 -- | Copy to pointer.
+--
+-- @since 2.0
 copyToPointer
   :: Array                  -- ^ Source
   -> Int                    -- ^ Source offset
@@ -293,6 +305,8 @@ equal src1 off1 src2 off2 count = compareInternal src1 off1 src2 off2 count == 0
 {-# INLINE equal #-}
 
 -- | Compare portions of two arrays. No bounds checking is performed.
+--
+-- @since 2.0
 compare :: Array -> Int -> Array -> Int -> Int -> Ordering
 compare src1 off1 src2 off2 count = compareInternal src1 off1 src2 off2 count `Prelude.compare` 0
 {-# INLINE compare #-}

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
@@ -234,7 +234,7 @@ decodeUtf8With2 onErr bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do
             , bs <- B.drop (srcOff - len1) (B.take guessUtf8Boundary bs2)
             , isValidBS bs = do
               withBS bs $ \fp _ -> unsafeIOToST $ unsafeWithForeignPtr fp $ \src ->
-                unsafeSTToIO $ A.copyP dst dstOff src (len1 + guessUtf8Boundary - srcOff)
+                unsafeSTToIO $ A.copyFromPointer dst dstOff src (len1 + guessUtf8Boundary - srcOff)
               inner (len1 + guessUtf8Boundary) (dstOff + (len1 + guessUtf8Boundary - srcOff))
 
             | dstOff + 4 > dstLen = do

diff --git a/src/Data/Text/Foreign.hs b/src/Data/Text/Foreign.hs
@@ -59,6 +59,8 @@ import qualified Data.Text.Array as A
 -- the functions in the 'Data.Text.Encoding' module.
 
 -- | A type representing a number of UTF-8 code units.
+--
+-- @since 2.0
 newtype I8 = I8 Int
     deriving (Bounded, Enum, Eq, Integral, Num, Ord, Read, Real, Show)
 
@@ -86,6 +88,8 @@ fromPtr ptr (I8 len) = unsafeSTToIO $ do
 -- If @n@ would cause the 'Text' to end inside a code point, the
 -- end of the prefix will be advanced by several additional 'Word8' units
 -- to maintain its validity.
+--
+-- @since 2.0
 takeWord8 :: I8 -> Text -> Text
 takeWord8 = (fst .) . splitAtWord8
 
@@ -95,6 +99,8 @@ takeWord8 = (fst .) . splitAtWord8
 -- If @n@ would cause the 'Text' to begin inside a code point, the
 -- beginning of the suffix will be advanced by several additional 'Word8'
 -- unit to maintain its validity.
+--
+-- @since 2.0
 dropWord8 :: I8 -> Text -> Text
 dropWord8 = (snd .) . splitAtWord8
 

diff --git a/src/Data/Text/Internal/Encoding/Utf8.hs b/src/Data/Text/Internal/Encoding/Utf8.hs
@@ -68,6 +68,8 @@ between x y z = x >= y && x <= z
 --   | ord c < 0x800   = 2
 --   | ord c < 0x10000 = 3
 --   | otherwise       = 4
+
+-- | @since 2.0
 utf8Length :: Char -> Int
 utf8Length (C# c) = I# ((1# +# geChar# c (chr# 0x80#)) +# (geChar# c (chr# 0x800#) +# geChar# c (chr# 0x10000#)))
 {-# INLINE utf8Length #-}
@@ -82,6 +84,8 @@ utf8Length (C# c) = I# ((1# +# geChar# c (chr# 0x80#)) +# (geChar# c (chr# 0x800
 -- c `xor` I# (c# <=# 0#) is a branchless equivalent of c `max` 1.
 -- It is crucial to write c# <=# 0# and not c# ==# 0#, otherwise
 -- GHC is tempted to "optimize" by introduction of branches.
+
+-- | @since 2.0
 utf8LengthByLeader :: Word8 -> Int
 utf8LengthByLeader w = c `xor` I# (c# <=# 0#)
   where
@@ -256,11 +260,13 @@ updateState (ByteClass c) (DecoderState s) = DecoderState (W8# el#)
 
 newtype CodePoint = CodePoint Int
 
+-- | @since 2.0
 data DecoderResult
   = Accept !Char
   | Incomplete !DecoderState !CodePoint
   | Reject
 
+-- | @since 2.0
 utf8DecodeStart :: Word8 -> DecoderResult
 utf8DecodeStart w
   | st == utf8AcceptState = Accept (chr (word8ToInt w))
@@ -271,6 +277,7 @@ utf8DecodeStart w
     st = updateState cl utf8AcceptState
     cp = word8ToInt $ (0xff `shiftR` word8ToInt cl') .&. w
 
+-- | @since 2.0
 utf8DecodeContinue :: Word8 -> DecoderState -> CodePoint -> DecoderResult
 utf8DecodeContinue w st (CodePoint cp)
   | st' == utf8AcceptState = Accept (chr cp')

diff --git a/src/Data/Text/Internal/Private.hs b/src/Data/Text/Internal/Private.hs
@@ -38,6 +38,8 @@ span_ p t@(Text arr off len) = (# hd,tl #)
 
 -- | For the sake of performance this function does not check
 -- that a char is in ASCII range; it is a responsibility of @p@.
+--
+-- @since 2.0
 spanAscii_ :: (Word8 -> Bool) -> Text -> (# Text, Text #)
 spanAscii_ p (Text arr off len) = (# hd, tl #)
   where hd = text arr off k

diff --git a/src/Data/Text/Internal/Unsafe/Char.hs b/src/Data/Text/Internal/Unsafe/Char.hs
@@ -39,6 +39,7 @@ ord :: Char -> Int
 ord (C# c#) = I# (ord# c#)
 {-# INLINE ord #-}
 
+-- | @since 2.0
 unsafeChr16 :: Word16 -> Char
 unsafeChr16 (W16# w#) = C# (chr# (word2Int# (word16ToWord# w#)))
 {-# INLINE unsafeChr16 #-}

diff --git a/src/Data/Text/Unsafe.hs b/src/Data/Text/Unsafe.hs
@@ -79,6 +79,7 @@ iter ::
 iter (Text arr off _len) i = iterArray arr (off + i)
 {-# INLINE iter #-}
 
+-- | @since 2.0
 iterArray :: A.Array -> Int -> Iter
 iterArray arr j = Iter chr l
   where m0 = A.unsafeIndex arr j
@@ -107,6 +108,7 @@ reverseIter :: Text -> Int -> Iter
 reverseIter (Text arr off _len) i = reverseIterArray arr (off + i)
 {-# INLINE reverseIter #-}
 
+-- | @since 2.0
 reverseIterArray :: A.Array -> Int -> Iter
 reverseIterArray arr j
     | m0 <  0x80 = Iter (unsafeChr8 m0) (-1)
@@ -139,16 +141,22 @@ reverseIter_ (Text arr off _len) i
 -- | /O(1)/ Return the length of a 'Text' in units of 'Word8'.  This
 -- is useful for sizing a target array appropriately before using
 -- 'unsafeCopyToPtr'.
+--
+-- @since 2.0
 lengthWord8 :: Text -> Int
 lengthWord8 (Text _arr _off len) = len
 {-# INLINE lengthWord8 #-}
 
 -- | /O(1)/ Unchecked take of 'k' 'Word8's from the front of a 'Text'.
+--
+-- @since 2.0
 takeWord8 :: Int -> Text -> Text
 takeWord8 k (Text arr off _len) = Text arr off k
 {-# INLINE takeWord8 #-}
 
 -- | /O(1)/ Unchecked drop of 'k' 'Word8's from the front of a 'Text'.
+--
+-- @since 2.0
 dropWord8 :: Int -> Text -> Text
 dropWord8 k (Text arr off len) = Text arr (off+k) (len-k)
 {-# INLINE dropWord8 #-}
diff --git a/text.cabal b/text.cabal
@@ -1,6 +1,6 @@
 cabal-version:  2.2
 name:           text
-version:        1.2.5.0
+version:        2.0
 
 homepage:       https://github.com/haskell/text
 bug-reports:    https://github.com/haskell/text/issues