diff --git a/Blaze/ByteString/Builder.hs b/Blaze/ByteString/Builder.hs index 6e04505..a1e8db7 100644 --- a/Blaze/ByteString/Builder.hs +++ b/Blaze/ByteString/Builder.hs @@ -69,8 +69,10 @@ module Blaze.ByteString.Builder -- * Executing builders , toLazyByteString - -- , toLazyByteStringWith - -- , toByteString + , toLazyByteStringWith + , toByteString + , toByteStringIO + , toByteStringIOWith ) where diff --git a/Blaze/ByteString/Builder/Char/Utf8.hs b/Blaze/ByteString/Builder/Char/Utf8.hs index 5e432d5..6270687 100644 --- a/Blaze/ByteString/Builder/Char/Utf8.hs +++ b/Blaze/ByteString/Builder/Char/Utf8.hs @@ -35,45 +35,26 @@ import qualified Data.Text.Lazy as TL import qualified Data.Text.Lazy.Encoding as TS -- imported for documentation links import Blaze.ByteString.Builder.Internal -import Blaze.ByteString.Builder.Write -- | Write a UTF-8 encoded Unicode character to a buffer. -- --- Note that the control flow of 'writeChar' is more complicated than the one --- of 'writeWord8', as the size of the write depends on the 'Char' written. --- Therefore, --- --- > fromWrite $ writeChar a `mappend` writeChar b --- --- must not always be faster than --- --- > fromChar a `mappend` fromChar b --- --- Use benchmarking to make informed decisions. --- - --- FIXME: Use a Write that always checks if 4 bytes are available and only take --- care of the precise pointer advance once the data has been written. Either --- formulate it using continuation passing or returning the increment using the --- IO action. The latter is probably simpler and better understandable. --- +{-# INLINE writeChar #-} writeChar :: Char -> Write -writeChar = encodeCharUtf8 f1 f2 f3 f4 +writeChar c = boundedWrite 4 (encodeCharUtf8 f1 f2 f3 f4 c) where - f1 x = Write 1 $ \ptr -> poke ptr x + f1 x1 = writeN 1 $ \op -> do pokeByteOff op 0 x1 - f2 x1 x2 = Write 2 $ \ptr -> do poke ptr x1 - poke (ptr `plusPtr` 1) x2 + f2 x1 x2 = writeN 2 $ \op -> do pokeByteOff op 0 x1 + pokeByteOff op 1 x2 + + f3 x1 x2 x3 = writeN 3 $ \op -> do pokeByteOff op 0 x1 + pokeByteOff op 1 x2 + pokeByteOff op 2 x3 - f3 x1 x2 x3 = Write 3 $ \ptr -> do poke ptr x1 - poke (ptr `plusPtr` 1) x2 - poke (ptr `plusPtr` 2) x3 - - f4 x1 x2 x3 x4 = Write 4 $ \ptr -> do poke ptr x1 - poke (ptr `plusPtr` 1) x2 - poke (ptr `plusPtr` 2) x3 - poke (ptr `plusPtr` 3) x4 -{-# INLINE writeChar #-} + f4 x1 x2 x3 x4 = writeN 4 $ \op -> do pokeByteOff op 0 x1 + pokeByteOff op 1 x2 + pokeByteOff op 2 x3 + pokeByteOff op 3 x4 -- | Encode a Unicode character to another datatype, using UTF-8. This function -- acts as an abstract way of encoding characters, as it is unaware of what @@ -113,7 +94,7 @@ fromChar = fromWriteSingleton writeChar -- | /O(n)/. Serialize a Unicode 'String' using the UTF-8 encoding. -- fromString :: String -> Builder -fromString = fromWrite1List writeChar +fromString = fromWriteList writeChar -- Performance note: ^^^ -- -- fromWrite2List made things slightly worse for the blaze-html benchmarks diff --git a/Blaze/ByteString/Builder/Html/Utf8.hs b/Blaze/ByteString/Builder/Html/Utf8.hs index bbbaf9a..bfa3a3e 100644 --- a/Blaze/ByteString/Builder/Html/Utf8.hs +++ b/Blaze/ByteString/Builder/Html/Utf8.hs @@ -36,17 +36,22 @@ import qualified Data.Text as TS import qualified Data.Text.Lazy as TL import Blaze.ByteString.Builder +import Blaze.ByteString.Builder.Internal import Blaze.ByteString.Builder.Char.Utf8 -- | Write a HTML escaped and UTF-8 encoded Unicode character to a bufffer. -- writeHtmlEscapedChar :: Char -> Write -writeHtmlEscapedChar '<' = writeByteString "<" -writeHtmlEscapedChar '>' = writeByteString ">" -writeHtmlEscapedChar '&' = writeByteString "&" -writeHtmlEscapedChar '"' = writeByteString """ -writeHtmlEscapedChar '\'' = writeByteString "'" -writeHtmlEscapedChar c = writeChar c +writeHtmlEscapedChar c0 = + boundedWrite 6 (io c0) + -- WARNING: Don't forget to change the bound if you change the bytestrings. + where + io '<' = runWrite $ writeByteString "<" + io '>' = runWrite $ writeByteString ">" + io '&' = runWrite $ writeByteString "&" + io '"' = runWrite $ writeByteString """ + io '\'' = runWrite $ writeByteString "'" + io c = runWrite $ writeChar c {-# INLINE writeHtmlEscapedChar #-} -- | /O(1)./ Serialize a HTML escaped Unicode character using the UTF-8 @@ -59,7 +64,7 @@ fromHtmlEscapedChar = fromWriteSingleton writeHtmlEscapedChar -- encoding. -- fromHtmlEscapedString :: String -> Builder -fromHtmlEscapedString = fromWrite1List writeHtmlEscapedChar +fromHtmlEscapedString = fromWriteList writeHtmlEscapedChar -- | /O(n)/. Serialize a value by 'Show'ing it and then, HTML escaping and -- UTF-8 encoding the resulting 'String'. diff --git a/Blaze/ByteString/Builder/Internal.hs b/Blaze/ByteString/Builder/Internal.hs index bc856c7..77f4cfe 100644 --- a/Blaze/ByteString/Builder/Internal.hs +++ b/Blaze/ByteString/Builder/Internal.hs @@ -32,24 +32,18 @@ module Blaze.ByteString.Builder.Internal ( , putBuildStepCont -- * Writes - , Write - , WriteIO - , writeN - , exactWrite - , boundedWrite - , fromWrite - , fromWriteSingleton - , fromWriteList - - -- ** Writing storables - , writeStorable - , fromStorable - , fromStorables + , module Blaze.ByteString.Builder.Internal.Write -- * Execution , toLazyByteString + , toLazyByteStringWith + , toByteString + , toByteStringIO + , toByteStringIOWith -- * Deafult Sizes + , defaultFirstBufferSize + , defaultMinimalBufferSize , defaultBufferSize , defaultMaximalCopySize ) where @@ -62,10 +56,8 @@ import qualified Data.ByteString.Lazy as L import qualified Data.ByteString.Lazy.Internal as L import Blaze.ByteString.Builder.Internal.Types -import Blaze.ByteString.Builder.Internal.Buffer import Blaze.ByteString.Builder.Internal.Write - ------------------------------------------------------------------------------ -- Internal global constants. ------------------------------------------------------------------------------ @@ -121,12 +113,238 @@ defaultMaximalCopySize = 2 * defaultMinimalBufferSize -- 'L.ByteString'. The remaining part of the buffer is spilled, if the -- reamining free space is smaller than the minimal desired buffer size. -- +{-# INLINE flush #-} flush :: Builder flush = fromBuildStepCont step where step k !(BufRange op _) = return $ insertByteString op S.empty k -{-# INLINE flush #-} +-- | Run a 'Builder' with the given buffer sizes. +-- +-- Use this function for integrating the 'Builder' type with other libraries +-- that generate lazy bytestrings. +-- +-- Note that the builders should guarantee that on average the desired chunk +-- size is attained. Builders may decide to start a new buffer and not +-- completely fill the existing buffer, if this is faster. However, they should +-- not spill too much of the buffer, if they cannot compensate for it. +-- +-- A call @toLazyByteStringWith bufSize minBufSize firstBufSize@ will generate +-- a lazy bytestring according to the following strategy. First, we allocate +-- a buffer of size @firstBufSize@ and start filling it. If it overflows, we +-- allocate a buffer of size @minBufSize@ and copy the first buffer to it in +-- order to avoid generating a too small chunk. Finally, every next buffer will +-- be of size @bufSize@. This, slow startup strategy is required to achieve +-- good speed for short (<200 bytes) resulting bytestrings, as for them the +-- allocation cost is of a large buffer cannot be compensated. Moreover, this +-- strategy also allows us to avoid spilling too much memory for short +-- resulting bytestrings. +-- +-- Note that setting @firstBufSize >= minBufSize@ implies that the first buffer +-- is no longer copied but allocated and filled directly. Hence, setting +-- @firstBufSize = bufSize@ means that all chunks will use an underlying buffer +-- of size @bufSize@. This is recommended, if you know that you always output +-- more than @minBufSize@ bytes. +toLazyByteStringWith + :: Int -- ^ Buffer size (upper-bounds the resulting chunk size). + -> Int -- ^ Minimal free buffer space for continuing filling + -- the same buffer after a 'flush' or a direct bytestring + -- insertion. This corresponds to the minimal desired + -- chunk size. + -> Int -- ^ Size of the first buffer to be used and copied for + -- larger resulting sequences + -> Builder -- ^ Builder to run. + -> L.ByteString -- ^ Lazy bytestring to output after the builder is + -- finished. + -> L.ByteString -- ^ Resulting lazy bytestring +toLazyByteStringWith bufSize minBufSize firstBufSize (Builder b) k = + S.inlinePerformIO $ fillFirstBuffer (b (buildStep finalStep)) + where + finalStep (BufRange pf _) = return $ Done pf () + -- fill a first very small buffer, if we need more space then copy it + -- to the new buffer of size 'minBufSize'. This way we don't pay the + -- allocation cost of the big 'bufSize' buffer, when outputting only + -- small sequences. + fillFirstBuffer !step0 + | minBufSize <= firstBufSize = fillNewBuffer firstBufSize step0 + | otherwise = do + fpbuf <- S.mallocByteString firstBufSize + withForeignPtr fpbuf $ \pf -> do + let !pe = pf `plusPtr` firstBufSize + mkbs pf' = S.PS fpbuf 0 (pf' `minusPtr` pf) + {-# INLINE mkbs #-} + next <- runBuildStep step0 (BufRange pf pe) + case next of + Done pf' _ + | pf' == pf -> return k + | otherwise -> return $ L.Chunk (mkbs pf') k + + BufferFull newSize pf' nextStep -> do + let !l = pf' `minusPtr` pf + fillNewBuffer (max (l + newSize) minBufSize) $ buildStep $ + \(BufRange pfNew peNew) -> do + copyBytes pfNew pf l + let !br' = BufRange (pfNew `plusPtr` l) peNew + runBuildStep nextStep br' + + InsertByteString pf' bs nextStep + | pf' == pf -> + return $ L.Chunk bs (S.inlinePerformIO $ fillNewBuffer bufSize nextStep) + | otherwise -> + return $ L.Chunk (mkbs pf') + (L.Chunk bs (S.inlinePerformIO $ fillNewBuffer bufSize nextStep)) + + -- allocate and fill a new buffer + fillNewBuffer !size !step0 = do + fpbuf <- S.mallocByteString size + withForeignPtr fpbuf $ fillBuffer fpbuf + where + fillBuffer fpbuf !pbuf = fill pbuf step0 + where + !pe = pbuf `plusPtr` size + fill !pf !step = do + next <- runBuildStep step (BufRange pf pe) + let mkbs pf' = S.PS fpbuf (pf `minusPtr` pbuf) (pf' `minusPtr` pf) + {-# INLINE mkbs #-} + case next of + Done pf' _ + | pf' == pf -> return k + | otherwise -> return $ L.Chunk (mkbs pf') k + + BufferFull newSize pf' nextStep + | pf' == pf -> + fillNewBuffer (max newSize bufSize) nextStep + | otherwise -> + return $ L.Chunk (mkbs pf') + (S.inlinePerformIO $ + fillNewBuffer (max newSize bufSize) nextStep) + + InsertByteString pf' bs nextStep + | pf' == pf -> + return $ L.Chunk bs (S.inlinePerformIO $ fill pf' nextStep) + | minBufSize < pe `minusPtr` pf' -> + return $ L.Chunk (mkbs pf') + (L.Chunk bs (S.inlinePerformIO $ fill pf' nextStep)) + | otherwise -> + return $ L.Chunk (mkbs pf') + (L.Chunk bs (S.inlinePerformIO $ fillNewBuffer bufSize nextStep)) + + +-- | Extract the lazy 'L.ByteString' from the builder by running it with default +-- buffer sizes. Use this function, if you do not have any special +-- considerations with respect to buffer sizes. +-- +-- @ 'toLazyByteString' b = 'toLazyByteStringWith' 'defaultBufferSize' 'defaultMinimalBufferSize' 'defaultFirstBufferSize' b L.empty@ +-- +-- Note that @'toLazyByteString'@ is a 'Monoid' homomorphism. +-- +-- > toLazyByteString mempty == mempty +-- > toLazyByteString (x `mappend` y) == toLazyByteString x `mappend` toLazyByteString y +-- +-- However, in the second equation, the left-hand-side is generally faster to +-- execute. +-- +toLazyByteString :: Builder -> L.ByteString +toLazyByteString b = toLazyByteStringWith + defaultBufferSize defaultMinimalBufferSize defaultFirstBufferSize b L.empty +{-# INLINE toLazyByteString #-} + +-- | Pack the chunks of a lazy bytestring into a single strict bytestring. +packChunks :: L.ByteString -> S.ByteString +packChunks lbs = do + S.unsafeCreate (fromIntegral $ L.length lbs) (copyChunks lbs) + where + copyChunks !L.Empty !_pf = return () + copyChunks !(L.Chunk (S.PS fpbuf o l) lbs') !pf = do + withForeignPtr fpbuf $ \pbuf -> + copyBytes pf (pbuf `plusPtr` o) l + copyChunks lbs' (pf `plusPtr` l) + +-- | Run the builder to construct a strict bytestring containing the sequence +-- of bytes denoted by the builder. This is done by first serializing to a lazy bytestring and then packing its +-- chunks to a appropriately sized strict bytestring. +-- +-- > toByteString = packChunks . toLazyByteString +-- +-- Note that @'toByteString'@ is a 'Monoid' homomorphism. +-- +-- > toByteString mempty == mempty +-- > toByteString (x `mappend` y) == toByteString x `mappend` toByteString y +-- +-- However, in the second equation, the left-hand-side is generally faster to +-- execute. +-- +toByteString :: Builder -> S.ByteString +toByteString = packChunks . toLazyByteString + + +-- | @toByteStringIOWith bufSize io b@ runs the builder @b@ with a buffer of +-- at least the size @bufSize@ and executes the 'IO' action @io@ whenever the +-- buffer is full. +-- +-- Compared to 'toLazyByteStringWith' this function requires less allocation, +-- as the output buffer is only allocated once at the start of the +-- serialization and whenever something bigger than the current buffer size has +-- to be copied into the buffer, which should happen very seldomly for the +-- default buffer size of 32kb. Hence, the pressure on the garbage collector is +-- reduced, which can be an advantage when building long sequences of bytes. +-- +toByteStringIOWith :: Int -- ^ Buffer size (upper bounds + -- the number of bytes forced + -- per call to the 'IO' action). + -> (S.ByteString -> IO ()) -- ^ 'IO' action to execute per + -- full buffer, which is + -- referenced by a strict + -- 'S.ByteString'. + -> Builder -- ^ 'Builder' to run. + -> IO () -- ^ Resulting 'IO' action. +toByteStringIOWith bufSize io (Builder b) = + fillBuffer bufSize (b (buildStep finalStep)) + where + finalStep !(BufRange pf _) = return $ Done pf () + + fillBuffer !size step = do + S.mallocByteString size >>= fill + where + fill fpbuf = do + let !pf = unsafeForeignPtrToPtr fpbuf + !br = BufRange pf (pf `plusPtr` size) + -- safe due to later reference of fpbuf + -- BETTER than withForeignPtr, as we lose a tail call otherwise + signal <- runBuildStep step br + case signal of + Done pf' _ -> io $ S.PS fpbuf 0 (pf' `minusPtr` pf) + + BufferFull minSize pf' nextStep -> do + io $ S.PS fpbuf 0 (pf' `minusPtr` pf) + fillBuffer (max bufSize minSize) nextStep + + InsertByteString pf' bs nextStep -> do + io $ S.PS fpbuf 0 (pf' `minusPtr` pf) + io bs + fillBuffer bufSize nextStep + +-- | Run the builder with a 'defaultBufferSize'd buffer and execute the given +-- 'IO' action whenever the buffer is full or gets flushed. +-- +-- @ 'toByteStringIO' = 'toByteStringIOWith' 'defaultBufferSize'@ +-- +-- This is a 'Monoid' homomorphism in the following sense. +-- +-- > toByteStringIO io mempty == return () +-- > toByteStringIO io (x `mappend` y) == toByteStringIO io x >> toByteStringIO io y +-- +toByteStringIO :: (S.ByteString -> IO ()) -> Builder -> IO () +toByteStringIO = toByteStringIOWith defaultBufferSize +{-# INLINE toByteStringIO #-} + + +------------------------------------------------------------------------------ +-- Draft of new builder/put execution code +------------------------------------------------------------------------------ + +{- FIXME: Generalize this code such that it can replace the above clunky + - implementations. -- | A monad for lazily composing lazy bytestrings using continuations. newtype LBSM a = LBSM { unLBSM :: (a, L.ByteString -> L.ByteString) } @@ -181,3 +399,5 @@ test2 x = L.toChunks $ toLazyByteString2 $ fromBuilder $ mconcat ] -} + +-} diff --git a/Blaze/ByteString/Builder/Internal/Buffer.hs b/Blaze/ByteString/Builder/Internal/Buffer.hs index 997c271..a5115c8 100644 --- a/Blaze/ByteString/Builder/Internal/Buffer.hs +++ b/Blaze/ByteString/Builder/Internal/Buffer.hs @@ -24,6 +24,8 @@ module Blaze.ByteString.Builder.Internal.Buffer ( , allocBuffer , reuseBuffer , nextSlice + , updateEndOfSlice + , execBuildStep -- ** Conversion to bytestings , unsafeFreezeBuffer diff --git a/Blaze/ByteString/Builder/Internal/Write.hs b/Blaze/ByteString/Builder/Internal/Write.hs index 895a474..443cfeb 100644 --- a/Blaze/ByteString/Builder/Internal/Write.hs +++ b/Blaze/ByteString/Builder/Internal/Write.hs @@ -2,6 +2,7 @@ -- | -- Module : Blaze.ByteString.Builder.Internal.WriteIO -- Copyright : (c) 2010 Simon Meier +-- (c) 2010 Jasper van der Jeugt -- License : BSD3-style (see LICENSE) -- -- Maintainer : Simon Meier @@ -18,6 +19,7 @@ module Blaze.ByteString.Builder.Internal.Write ( , writeN , exactWrite , boundedWrite + , runWrite -- * Constructing builders from writes , fromWrite @@ -33,7 +35,6 @@ module Blaze.ByteString.Builder.Internal.Write ( import Foreign -import Data.Char import Data.Monoid import Control.Monad @@ -56,12 +57,18 @@ import Blaze.ByteString.Builder.Internal.Types -- -- | A write to a buffer. +-- +-- FIXME: Find better name: what about 'Poke' ? newtype WriteIO = WriteIO { runWriteIO :: Ptr Word8 -> IO (Ptr Word8) } -- | A write of a bounded number of bytes. data Write = Write {-# UNPACK #-} !Int WriteIO +-- | Extract the 'WriteIO' action of a write. +{-# INLINE runWrite #-} +runWrite :: Write -> WriteIO +runWrite (Write _ wio) = wio instance Monoid WriteIO where mempty = WriteIO $ return @@ -157,66 +164,6 @@ fromWriteList write = -{- ------------------------------------------------------------------------------- --- Testing the abstraction ------------------------------------------------------------------------------- - --- TODO: Move! - --- Utf-8 encoding ------------------ - -bwriteChar :: Char -> Write -bwriteChar c = Write 4 (encodeCharUtf8 f1 f2 f3 f4 c) - where - f1 x1 = writeN 1 $ \op -> do pokeByteOff op 0 x1 - - f2 x1 x2 = writeN 2 $ \op -> do pokeByteOff op 0 x1 - pokeByteOff op 1 x2 - - f3 x1 x2 x3 = writeN 3 $ \op -> do pokeByteOff op 0 x1 - pokeByteOff op 1 x2 - pokeByteOff op 2 x3 - - f4 x1 x2 x3 x4 = writeN 4 $ \op -> do pokeByteOff op 0 x1 - pokeByteOff op 1 x2 - pokeByteOff op 2 x3 - pokeByteOff op 3 x4 -{-# INLINE bwriteChar #-} - --- | Encode a Unicode character to another datatype, using UTF-8. This function --- acts as an abstract way of encoding characters, as it is unaware of what --- needs to happen with the resulting bytes: you have to specify functions to --- deal with those. --- -encodeCharUtf8 :: (Word8 -> a) -- ^ 1-byte UTF-8 - -> (Word8 -> Word8 -> a) -- ^ 2-byte UTF-8 - -> (Word8 -> Word8 -> Word8 -> a) -- ^ 3-byte UTF-8 - -> (Word8 -> Word8 -> Word8 -> Word8 -> a) -- ^ 4-byte UTF-8 - -> Char -- ^ Input 'Char' - -> a -- ^ Result -encodeCharUtf8 f1 f2 f3 f4 c = case ord c of - x | x <= 0x7F -> f1 $ fromIntegral x - | x <= 0x07FF -> - let x1 = fromIntegral $ (x `shiftR` 6) + 0xC0 - x2 = fromIntegral $ (x .&. 0x3F) + 0x80 - in f2 x1 x2 - | x <= 0xFFFF -> - let x1 = fromIntegral $ (x `shiftR` 12) + 0xE0 - x2 = fromIntegral $ ((x `shiftR` 6) .&. 0x3F) + 0x80 - x3 = fromIntegral $ (x .&. 0x3F) + 0x80 - in f3 x1 x2 x3 - | otherwise -> - let x1 = fromIntegral $ (x `shiftR` 18) + 0xF0 - x2 = fromIntegral $ ((x `shiftR` 12) .&. 0x3F) + 0x80 - x3 = fromIntegral $ ((x `shiftR` 6) .&. 0x3F) + 0x80 - x4 = fromIntegral $ (x .&. 0x3F) + 0x80 - in f4 x1 x2 x3 x4 -{-# INLINE encodeCharUtf8 #-} - --} - ------------------------------------------------------------------------------ -- Writing storables ------------------------------------------------------------------------------