Skip to content

Commit

Permalink
[RFC] Builder: Efficiently handle literal strings (#132)
Browse files Browse the repository at this point in the history
* Test naive String Builder

* Test and benchmark cstring

* Efficiently copy CStrings

* Benchmark UTF-8 strings

* Test cstringUtf8 and encoding of NULL

* Really test encoding of NULL

* Fix compatibility with older GHCs

* Fix encoding of NULL

Co-authored-by: Bodigrim <andrew.lelechenko@gmail.com>
  • Loading branch information
bgamari and Bodigrim committed Aug 26, 2020
1 parent fc9409e commit 155bf8a
Show file tree
Hide file tree
Showing 4 changed files with 111 additions and 6 deletions.
32 changes: 29 additions & 3 deletions Data/ByteString/Builder.hs
@@ -1,4 +1,4 @@
{-# LANGUAGE CPP, BangPatterns #-}
{-# LANGUAGE CPP, BangPatterns, MagicHash #-}
{-# OPTIONS_GHC -fno-warn-unused-imports -fno-warn-orphans #-}
#if __GLASGOW_HASKELL__ >= 701
{-# LANGUAGE Trustworthy #-}
Expand Down Expand Up @@ -263,6 +263,8 @@ import Data.ByteString.Builder.ASCII
import Data.String (IsString(..))
import System.IO (Handle)
import Foreign
import GHC.Base (unpackCString#, unpackCStringUtf8#,
unpackFoldrCString#, build)

-- HADDOCK only imports
import qualified Data.ByteString as S (concat)
Expand Down Expand Up @@ -432,10 +434,20 @@ char8 :: Char -> Builder
char8 = P.primFixed P.char8

-- | Char8 encode a 'String'.
{-# INLINE string8 #-}
{-# INLINE [1] string8 #-} -- phased to allow P.cstring rewrite
string8 :: String -> Builder
string8 = P.primMapListFixed P.char8

-- GHC desugars string literals with unpackCString# which the simplifier tends
-- to promptly turn into build (unpackFoldrCString# s), so we match on both.
{-# RULES
"string8/unpackCString#" forall s.
string8 (unpackCString# s) = P.cstring s

"string8/unpackFoldrCString#" forall s.
string8 (build (unpackFoldrCString# s)) = P.cstring s
#-}

------------------------------------------------------------------------------
-- UTF-8 encoding
------------------------------------------------------------------------------
Expand All @@ -446,9 +458,23 @@ charUtf8 :: Char -> Builder
charUtf8 = P.primBounded P.charUtf8

-- | UTF-8 encode a 'String'.
{-# INLINE stringUtf8 #-}
--
-- Note that 'stringUtf8' performs no codepoint validation and consequently may
-- emit invalid UTF-8 if asked (e.g. single surrogates).
{-# INLINE [1] stringUtf8 #-} -- phased to allow P.cstring rewrite
stringUtf8 :: String -> Builder
stringUtf8 = P.primMapListBounded P.charUtf8

{-# RULES
"stringUtf8/unpackCStringUtf8#" forall s.
stringUtf8 (unpackCStringUtf8# s) = P.cstringUtf8 s

"stringUtf8/unpackCString#" forall s.
stringUtf8 (unpackCString# s) = P.cstring s

"stringUtf8/unpackFoldrCString#" forall s.
stringUtf8 (build (unpackFoldrCString# s)) = P.cstring s
#-}

instance IsString Builder where
fromString = stringUtf8
62 changes: 61 additions & 1 deletion Data/ByteString/Builder/Prim.hs
@@ -1,4 +1,5 @@
{-# LANGUAGE CPP, BangPatterns, ScopedTypeVariables #-}
{-# LANGUAGE MagicHash, UnboxedTuples, PatternGuards #-}
{-# OPTIONS_GHC -fno-warn-unused-imports #-}
#if __GLASGOW_HASKELL__ == 700
-- This is needed as a workaround for an old bug in GHC 7.0.1 (Trac #4498)
Expand Down Expand Up @@ -438,6 +439,9 @@ module Data.ByteString.Builder.Prim (
-- a decimal number with UTF-8 encoded characters.
, charUtf8

, cstring
, cstringUtf8

{-
-- * Testing support
-- | The following four functions are intended for testing use
Expand Down Expand Up @@ -473,13 +477,16 @@ import Data.ByteString.Builder.Prim.ASCII
#if MIN_VERSION_base(4,4,0)
#if MIN_VERSION_base(4,7,0)
import Foreign
import Foreign.C.Types
#else
import Foreign hiding (unsafeForeignPtrToPtr)
#endif
import Foreign.ForeignPtr.Unsafe (unsafeForeignPtrToPtr)
#else
import Foreign
#endif
import GHC.Exts
import GHC.IO

------------------------------------------------------------------------------
-- Creating Builders from bounded primitives
Expand Down Expand Up @@ -672,6 +679,60 @@ primMapLazyByteStringBounded w =
L.foldrChunks (\x b -> primMapByteStringBounded w x `mappend` b) mempty


------------------------------------------------------------------------------
-- Raw CString encoding
------------------------------------------------------------------------------

#if !MIN_VERSION_base(4,7,0)
-- eqWord# et al. return Bools prior to GHC 7.6
isTrue# :: Bool -> Bool
isTrue# x = x
#endif

-- | A null-terminated ASCII encoded 'CString'. Null characters are not representable.
cstring :: Addr# -> Builder
cstring =
\addr0 -> builder $ step addr0
where
step :: Addr# -> BuildStep r -> BuildStep r
step !addr !k !br@(BufferRange op0@(Ptr op0#) ope)
| isTrue# (ch `eqWord#` 0##) = k br
| op0 == ope =
return $ bufferFull defaultChunkSize op0 (step addr k)
| otherwise = do
IO $ \s -> case writeWord8OffAddr# op0# 0# ch s of
s' -> (# s', () #)
let br' = BufferRange (op0 `plusPtr` 1) ope
step (addr `plusAddr#` 1#) k br'
where
!ch = indexWord8OffAddr# addr 0#

-- | A null-terminated UTF-8 encoded 'CString'. Null characters can be encoded as
-- @0xc0 0x80@.
cstringUtf8 :: Addr# -> Builder
cstringUtf8 =
\addr0 -> builder $ step addr0
where
step :: Addr# -> BuildStep r -> BuildStep r
step !addr !k !br@(BufferRange op0@(Ptr op0#) ope)
| isTrue# (ch `eqWord#` 0##) = k br
| op0 == ope =
return $ bufferFull defaultChunkSize op0 (step addr k)
-- NULL is encoded as 0xc0 0x80
| isTrue# (ch `eqWord#` 0xc0##)
, isTrue# (indexWord8OffAddr# addr 1# `eqWord#` 0x80##) = do
IO $ \s -> case writeWord8OffAddr# op0# 0# 0## s of
s' -> (# s', () #)
let br' = BufferRange (op0 `plusPtr` 1) ope
step (addr `plusAddr#` 2#) k br'
| otherwise = do
IO $ \s -> case writeWord8OffAddr# op0# 0# ch s of
s' -> (# s', () #)
let br' = BufferRange (op0 `plusPtr` 1) ope
step (addr `plusAddr#` 1#) k br'
where
!ch = indexWord8OffAddr# addr 0#

------------------------------------------------------------------------------
-- Char8 encoding
------------------------------------------------------------------------------
Expand Down Expand Up @@ -736,4 +797,3 @@ encodeCharUtf8 f1 f2 f3 f4 c = case ord c of
x3 = fromIntegral $ ((x `shiftR` 6) .&. 0x3F) + 0x80
x4 = fromIntegral $ (x .&. 0x3F) + 0x80
in f4 x1 x2 x3 x4

8 changes: 8 additions & 0 deletions bench/BenchAll.hs
@@ -1,6 +1,7 @@
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE PackageImports #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE MagicHash #-}
-- |
-- Copyright : (c) 2011 Simon Meier
-- License : BSD3-style (see LICENSE)
Expand All @@ -14,6 +15,7 @@ module Main (main) where

import Data.Foldable (foldMap)
import Data.Monoid
import Data.String
import Gauge
import Prelude hiding (words)

Expand All @@ -37,6 +39,8 @@ import qualified Blaze.Text as Blaze
import qualified "bytestring" Data.ByteString as OldS
import qualified "bytestring" Data.ByteString.Lazy as OldL

import Paths_bench_bytestring

import Foreign

import System.Random
Expand Down Expand Up @@ -240,6 +244,10 @@ main = do
[ benchB' "mempty" () (const mempty)
, benchB' "ensureFree 8" () (const (ensureFree 8))
, benchB' "intHost 1" 1 intHost
, benchB' "UTF-8 String (naive)" "hello world\0" fromString
, benchB' "UTF-8 String" () $ \() -> P.cstringUtf8 "hello world\0"#
, benchB' "String (naive)" "hello world!" fromString
, benchB' "String" () $ \() -> P.cstring "hello world!"#
]

, bgroup "Encoding wrappers"
Expand Down
15 changes: 13 additions & 2 deletions tests/builder/Data/ByteString/Builder/Prim/Tests.hs
@@ -1,4 +1,4 @@
{-# LANGUAGE CPP, ScopedTypeVariables #-}
{-# LANGUAGE CPP, ScopedTypeVariables, MagicHash #-}

-- |
-- Copyright : (c) 2011 Simon Meier
Expand All @@ -14,21 +14,32 @@ module Data.ByteString.Builder.Prim.Tests (tests) where

import Data.Char (ord)
import qualified Data.ByteString.Lazy as L
import qualified Data.ByteString.Lazy.Char8 as LC
import Data.ByteString.Builder
import qualified Data.ByteString.Builder.Prim as BP
import Data.ByteString.Builder.Prim.TestUtils

#if defined(HAVE_TEST_FRAMEWORK)
import Test.Framework
import Test.Framework.Providers.QuickCheck2
#else
import TestFramework
#endif


tests :: [Test]
tests = concat [ testsBinary, testsASCII, testsChar8, testsUtf8
, testsCombinatorsB ]
, testsCombinatorsB, [testCString, testCStringUtf8] ]

testCString :: Test
testCString = testProperty "cstring" $
toLazyByteString (BP.cstring "hello world!"#) ==
LC.pack "hello" `L.append` L.singleton 0x20 `L.append` LC.pack "world!"

testCStringUtf8 :: Test
testCStringUtf8 = testProperty "cstringUtf8" $
toLazyByteString (BP.cstringUtf8 "hello\xc0\x80world!"#) ==
LC.pack "hello" `L.append` L.singleton 0x00 `L.append` LC.pack "world!"

------------------------------------------------------------------------------
-- Binary
Expand Down

0 comments on commit 155bf8a

Please sign in to comment.