Skip to content

Commit

Permalink
Make Data.ByteString.Lazy.Char8.lines less strict
Browse files Browse the repository at this point in the history
The current implementation of `lines` in Data.ByteString.Lazy.Char8 is too
strict.  When a "line" spans multiple chunks it traverses all the chunks
to the first line boundary before constructing the list head.

For example, `lines <$> getContents` reading a large file with no line breaks
does not make the first chunk of the (only) line available until the entire
file is read into memory.

Now that `Data.ByteString.break` is optimised for the `(== c)` case, we can get
efficient code for the common many lines per-chunk use-case, without being
needlessly strict.  Tests added to make sure that the first chunk is available
promptly without looking further.
  • Loading branch information
hs-viktor committed Dec 7, 2022
1 parent d4933c6 commit 827df30
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 54 deletions.
74 changes: 20 additions & 54 deletions Data/ByteString/Lazy/Char8.hs
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,7 @@ import qualified Data.ByteString.Lazy as L
import qualified Data.ByteString as S (ByteString) -- typename only
import qualified Data.ByteString as B
import qualified Data.ByteString.Unsafe as B
import Data.List.NonEmpty (NonEmpty(..), (<|))
import Data.ByteString.Lazy.Internal
import Data.ByteString.Lazy.ReadInt
import Data.ByteString.Lazy.ReadNat
Expand Down Expand Up @@ -856,59 +857,31 @@ unzip :: [(Char, Char)] -> (ByteString, ByteString)
-- Split the pairs into their first and second components and pack each
-- component list into its own ByteString.
unzip pairs = (pack firsts, pack seconds)
  where
    firsts  = fmap fst pairs
    seconds = fmap snd pairs
{-# INLINE unzip #-}

-- | 'lines' breaks a ByteString up into a list of ByteStrings at
-- | 'lines' lazily splits a ByteString into a list of ByteStrings at
-- newline Chars (@'\\n'@). The resulting strings do not contain newlines.
--
-- As of bytestring 0.9.0.3, this function is stricter than its
-- list cousin.
-- The first chunk of the result is only strict in the first chunk of the
-- input.
--
-- Note that it __does not__ regard CR (@'\\r'@) as a newline character.
--
lines :: ByteString -> [ByteString]
lines Empty = []
lines (Chunk c0 cs0) = loop0 c0 cs0
where
-- This is a really performance-sensitive function, but the chunked
-- representation makes the general case a bit expensive.  However,
-- assuming a large chunk size and normal-ish line lengths, we will find
-- line endings much more frequently than chunk endings, so it makes
-- sense to optimise for that common case.
-- So we partition into two special cases, depending on whether we are
-- holding back a list of chunks that will eventually be output once we
-- get to the end of the current line.

-- The common special case: no chunks of the current line are being held
-- back.  @c@ is the strict chunk being scanned, @cs@ the remaining input.
loop0 :: S.ByteString -> ByteString -> [ByteString]
loop0 c cs =
case B.elemIndex (c2w '\n') c of
-- No newline in @c@: either the input ends here, or the current line
-- continues into the next chunk and we switch to 'loop'.
Nothing -> case cs of
Empty | B.null c -> []
| otherwise -> [Chunk c Empty]
(Chunk c' cs')
| B.null c -> loop0 c' cs'
| otherwise -> loop c' [c] cs'

-- Newline at offset @n@: emit the bytes before it as one line (an
-- 'Empty' line when @n@ is 0) and keep scanning after the newline.
Just n | n /= 0 -> Chunk (B.unsafeTake n c) Empty
: loop0 (B.unsafeDrop (n+1) c) cs
| otherwise -> Empty
: loop0 (B.unsafeTail c) cs

-- The general case: we are building up a list of chunks that are part of
-- the same line.  @line@ holds the earlier chunks of that line, most
-- recent first; 'revChunks' puts them back in order.
-- Note that no list element is produced until a newline or the end of the
-- input is reached, however many chunks that takes.
loop :: S.ByteString -> [S.ByteString] -> ByteString -> [ByteString]
loop c line cs =
case B.elemIndex (c2w '\n') c of
Nothing ->
case cs of
-- End of input: everything held back forms the final line.
Empty -> let !c' = revChunks (c : line)
in [c']

-- Still no newline: hold @c@ back too and move to the next chunk.
(Chunk c' cs') -> loop c' (c : line) cs'

-- Newline found: finish the held-back line with the bytes before the
-- newline, then return to the common case for the rest of @c@.
Just n ->
let !c' = revChunks (B.unsafeTake n c : line)
in c' : loop0 (B.unsafeDrop (n+1) c) cs
-- The helpers work with 'NonEmpty' because a non-empty input always yields
-- at least one line; the result is converted back to an ordinary list here.
lines (Chunk c0 cs0) = let l :| ls = lines1 c0 cs0 in l : ls
where
-- 'lines1' splits the input made of the strict chunk @c@ followed by the
-- remaining lazy chunks @cs@.
lines1 :: S.ByteString -> ByteString -> NonEmpty ByteString
lines1 c cs
-- A newline is followed by more bytes in @c@: emit the line before it
-- and keep splitting the remainder of this chunk.
| len > 1 = c1 <| lines1 (B.unsafeDrop 1 t0) cs
-- The newline is the last byte of @c@: emit the line and continue with
-- the remaining chunks.
| len == 1 = c1 :| lines cs
-- No newline in @c@: the whole chunk becomes the first chunk of the
-- first line.  The rest of that line (@l@) and the following lines
-- (@ls@) come from the remaining input, which is not inspected here.
| otherwise = Chunk c l :| ls
where
-- @h0@ is the part of @c@ before the first newline byte (0x0a); @t0@
-- starts at that newline, or is empty when @c@ contains none.
(h0, t0) = B.break (== 0x0a) c
len = B.length t0
-- The line ending at the newline; an empty line is represented by
-- 'Empty' rather than by a 'Chunk' holding an empty strict ByteString.
c1 = if B.null h0 then Empty else Chunk h0 Empty
-- Bound lazily: 'lines2 cs' is only evaluated once @l@ or @ls@ is
-- demanded.
~(l:|ls) = lines2 cs

-- 'lines2' continues a line that spans a chunk boundary: at the end of
-- the input the current line simply ends, otherwise splitting resumes
-- with the next chunk.
lines2 :: ByteString -> NonEmpty ByteString
lines2 Empty = Empty :| []
lines2 (Chunk c cs) = lines1 c cs

-- | 'unlines' joins lines, appending a terminating newline after each.
--
Expand Down Expand Up @@ -950,10 +923,3 @@ hPutStrLn h ps = hPut h ps >> hPut h (L.singleton 0x0a)
--
putStrLn :: ByteString -> IO ()
putStrLn bs = hPutStrLn stdout bs  -- 'hPutStrLn' with the handle fixed to stdout

-- ---------------------------------------------------------------------
-- Internal utilities

-- Turn a list of possibly-empty strict chunks, given in reverse order, into
-- a lazy ByteString by consing each one onto the front of the accumulator.
revChunks :: [S.ByteString] -> ByteString
revChunks = List.foldl' prepend Empty
  where
    prepend acc c = chunk c acc
23 changes: 23 additions & 0 deletions tests/Properties.hs
Original file line number Diff line number Diff line change
Expand Up @@ -82,9 +82,29 @@ prop_unsafeTail xs = not (P.null xs) ==> P.tail xs === P.unsafeTail xs
-- For non-empty input, 'P.unsafeLast' must agree with the checked 'P.last'.
prop_unsafeLast xs = not (P.null xs) ==> P.last xs === P.unsafeLast xs
-- For non-empty input, 'P.unsafeInit' must agree with the checked 'P.init'.
prop_unsafeInit xs = not (P.null xs) ==> P.init xs === P.unsafeInit xs

-- A leading newline must produce a first line that is exactly the 'Empty'
-- constructor, not a 'Chunk'.
prop_lines_empty_invariant =
    True === startsWithEmpty (LC.lines (LC.pack "\nfoo\n"))
  where
    startsWithEmpty (Empty : _) = True
    startsWithEmpty _           = False

-- The first two lines must be obtainable even though everything after the
-- second newline is @undefined@.
prop_lines_lazy =
    take 2 (LC.lines input) === [LC.pack "a", LC.pack "b"]
  where
    input = LC.append (LC.pack "a\nb\n") undefined

-- The first chunk of the first line must be available from the first input
-- chunk alone: that chunk contains no newline and the rest of the input is
-- @undefined@, so looking any further would make the test crash.
prop_lines_lazy2 =
    c === case LC.lines (Chunk c undefined) of
        -- Bind a fresh name so the where-bound @c@ is not shadowed.
        Chunk c' _ : _ -> c'
        _              -> P.empty
  where
    c = C.pack "etc..."

-- Same check on an infinite input: @d@ repeats the newline-free chunk @c@
-- forever, so the test can only terminate if the first chunk of the first
-- line is produced without searching for a line boundary.
prop_lines_lazy3 =
    c === case LC.lines d of
        -- Bind a fresh name so the where-bound @c@ is not shadowed.
        Chunk c' _ : _ -> c'
        _              -> P.empty
  where
    c = C.pack "etc..."
    d = Chunk c d

-- 'C.strip' must agree with dropping leading space from both ends via
-- 'C.reverse'.
prop_strip x = C.strip x == stripBothEnds x
  where
    stripBothEnds = C.dropSpace . C.reverse . C.dropSpace . C.reverse

class (Bounded a, Integral a, Show a) => RdInt a where
Expand Down Expand Up @@ -684,6 +704,9 @@ misc_tests =
, testProperty "unsafeIndex" prop_unsafeIndexBB

, testProperty "lines_lazy" prop_lines_lazy
, testProperty "lines_lazy2" prop_lines_lazy2
, testProperty "lines_lazy3" prop_lines_lazy3
, testProperty "lines_invar" prop_lines_empty_invariant
, testProperty "strip" prop_strip
, testProperty "isSpace" prop_isSpaceWord8

Expand Down

0 comments on commit 827df30

Please sign in to comment.