Permalink
Browse files

Rewrite encodeUtf8 for speed

This was inspired by a patch from Simon Meier, who wrote a direct
implementation of encodeUtf8 using his 'blaze-builder' package.  His code
showed a very impressive speedup.  My code is similar in both structure
and performance, its chief difference being that it doesn't require
'blaze-builder'.

--HG--
extra : convert_revision : 1b338ee3a345ac1e437be1f5d8cd0919d9690c14
  • Loading branch information...
1 parent 7b070bd commit 8dbe7ad8163d4d6cbe00ffa73b67cb1c36ef855f @bos bos committed Oct 14, 2010
Showing with 60 additions and 5 deletions.
  1. +60 −5 Data/Text/Encoding.hs
View
@@ -1,3 +1,4 @@
+{-# LANGUAGE BangPatterns #-}
-- |
-- Module : Data.Text.Encoding
-- Copyright : (c) Tom Harper 2008-2009,
@@ -34,17 +35,29 @@ module Data.Text.Encoding
-- * Encoding Text to ByteStrings
, encodeUtf8
+ , encodeUtf8'
, encodeUtf16LE
, encodeUtf16BE
, encodeUtf32LE
, encodeUtf32BE
) where
-import Data.ByteString (ByteString)
-import qualified Data.Text.Fusion as F
+import Data.Bits ((.&.))
+import Data.ByteString as B
+import Data.ByteString.Internal as B
import Data.Text.Encoding.Error (OnDecodeError, strictDecode)
+import Data.Text.Encoding.Utf16 (chr2)
+import Data.Text.Internal (Text(..))
+import Data.Text.UnsafeChar (ord)
+import Data.Text.UnsafeShift (shiftL, shiftR)
+import Data.Word (Word8)
+import Foreign.ForeignPtr (withForeignPtr)
+import Foreign.Ptr (plusPtr)
+import Foreign.Storable (poke)
+import System.IO.Unsafe (unsafePerformIO)
+import qualified Data.Text.Array as A
import qualified Data.Text.Encoding.Fusion as E
-import Data.Text.Internal (Text)
+import qualified Data.Text.Fusion as F
-- | Decode a 'ByteString' containing 7-bit ASCII encoded text.
decodeASCII :: ByteString -> Text
@@ -62,9 +75,51 @@ decodeUtf8 = decodeUtf8With strictDecode
{-# INLINE decodeUtf8 #-}
-- | Encode text using UTF-8 encoding.
+encodeUtf8' :: Text -> ByteString
+encodeUtf8' txt = E.unstream (E.restreamUtf8 (F.stream txt))
+{-# INLINE encodeUtf8' #-}
+
+-- | Encode text using UTF-8 encoding.
encodeUtf8 :: Text -> ByteString
-encodeUtf8 txt = E.unstream (E.restreamUtf8 (F.stream txt))
-{-# INLINE encodeUtf8 #-}
+encodeUtf8 (Text arr off len) = unsafePerformIO $ do
+ let size0 = min len 4
+ mallocByteString size0 >>= start size0 off 0
+ where
+ start size n0 m0 fp = withForeignPtr fp $ loop n0 m0
+ where
+ loop n1 m1 ptr = go n1 m1
+ where
+ go !n !m
+ | n-off == len = return $! PS fp 0 m
+ | size-m < 4 = {-# SCC "encodeUtf8/resize" #-} do
+ let newSize = size `shiftL` 1
+ fp' <- mallocByteString newSize
+ withForeignPtr fp' $ \ptr' -> memcpy ptr' ptr (fromIntegral m)
+ start newSize n m fp'
+ | otherwise = do
+ let poke8 k v = poke (ptr `plusPtr` k) (fromIntegral v :: Word8)
+ w = A.unsafeIndex arr n
+ case undefined of
+ _| w <= 0x7F -> do
+ poke8 m w
+ go (n+1) (m+1)
+ | w <= 0x7FF -> do
+ poke8 m $ (w `shiftR` 6) + 0xC0
+ poke8 (m+1) $ (w .&. 0x3f) + 0x80
+ go (n+1) (m+2)
+ | 0xD800 <= w && w <= 0xDBFF -> do
+ let c = ord $ chr2 w (A.unsafeIndex arr (n+1))
+ poke8 m $ (c `shiftR` 18) + 0xF0
+ poke8 (m+1) $ ((c `shiftR` 12) .&. 0x3F) + 0x80
+ poke8 (m+2) $ ((c `shiftR` 6) .&. 0x3F) + 0x80
+ poke8 (m+3) $ (c .&. 0x3F) + 0x80
+ go (n+2) (m+4)
+ | otherwise -> do
+ poke8 m $ (w `shiftR` 12) + 0xE0
+ poke8 (m+1) $ ((w `shiftR` 6) .&. 0x3F) + 0x80
+ poke8 (m+2) $ (w .&. 0x3F) + 0x80
+ go (n+1) (m+3)
+{- INLINE encodeUtf8 #-}
-- | Decode text from little endian UTF-16 encoding.
decodeUtf16LEWith :: OnDecodeError -> ByteString -> Text

0 comments on commit 8dbe7ad

Please sign in to comment.