Make Data.ByteString.Lazy.Char8.lines less strict

The current implementation of `lines` in Data.ByteString.Lazy.Char8 is too strict. When a "line" spans multiple chunks, it traverses all the chunks up to the first line boundary before constructing the list head. For example, `lines <$> getContents` reading a large file with no line breaks does not make the first chunk of the (only) line available until the entire file has been read into memory. Now that `Data.ByteString.break` is optimised for the `(== c)` case, we can get efficient code for the common many-lines-per-chunk use case without being needlessly strict. Tests are added to make sure that the first chunk is available promptly, without looking any further into the input.
haskell · Dec 7, 2022 · 827df30 · 827df30
1 parent d4933c6
commit 827df30
Show file tree

Hide file tree

Showing 2 changed files with 43 additions and 54 deletions.
diff --git a/Data/ByteString/Lazy/Char8.hs b/Data/ByteString/Lazy/Char8.hs
@@ -247,6 +247,7 @@ import qualified Data.ByteString.Lazy as L
 import qualified Data.ByteString as S (ByteString) -- typename only
 import qualified Data.ByteString as B
 import qualified Data.ByteString.Unsafe as B
+import Data.List.NonEmpty (NonEmpty(..), (<|))
 import Data.ByteString.Lazy.Internal
 import Data.ByteString.Lazy.ReadInt
 import Data.ByteString.Lazy.ReadNat
@@ -856,59 +857,31 @@ unzip :: [(Char, Char)] -> (ByteString, ByteString)
 unzip ls = (pack (fmap fst ls), pack (fmap snd ls))
 {-# INLINE unzip #-}
 
--- | 'lines' breaks a ByteString up into a list of ByteStrings at
+-- | 'lines' lazily splits a ByteString into a list of ByteStrings at
 -- newline Chars (@'\\n'@). The resulting strings do not contain newlines.
---
--- As of bytestring 0.9.0.3, this function is stricter than its
--- list cousin.
+-- The first chunk of the result is only strict in the first chunk of the
+-- input.
 --
 -- Note that it __does not__ regard CR (@'\\r'@) as a newline character.
 --
 lines :: ByteString -> [ByteString]
 lines Empty          = []
-lines (Chunk c0 cs0) = loop0 c0 cs0
-    where
-    -- this is a really performance sensitive function but the
-    -- chunked representation makes the general case a bit expensive
-    -- however assuming a large chunk size and normalish line lengths
-    -- we will find line endings much more frequently than chunk
-    -- endings so it makes sense to optimise for that common case.
-    -- So we partition into two special cases depending on whether we
-    -- are keeping back a list of chunks that will eventually be output
-    -- once we get to the end of the current line.
-
-    -- the common special case where we have no existing chunks of
-    -- the current line
-    loop0 :: S.ByteString -> ByteString -> [ByteString]
-    loop0 c cs =
-        case B.elemIndex (c2w '\n') c of
-            Nothing -> case cs of
-                           Empty  | B.null c  -> []
-                                  | otherwise -> [Chunk c Empty]
-                           (Chunk c' cs')
-                               | B.null c  -> loop0 c'     cs'
-                               | otherwise -> loop  c' [c] cs'
-
-            Just n | n /= 0    -> Chunk (B.unsafeTake n c) Empty
-                                : loop0 (B.unsafeDrop (n+1) c) cs
-                   | otherwise -> Empty
-                                : loop0 (B.unsafeTail c) cs
-
-    -- the general case when we are building a list of chunks that are
-    -- part of the same line
-    loop :: S.ByteString -> [S.ByteString] -> ByteString -> [ByteString]
-    loop c line cs =
-        case B.elemIndex (c2w '\n') c of
-            Nothing ->
-                case cs of
-                    Empty -> let !c' = revChunks (c : line)
-                              in [c']
-
-                    (Chunk c' cs') -> loop c' (c : line) cs'
-
-            Just n ->
-                let !c' = revChunks (B.unsafeTake n c : line)
-                 in c' : loop0 (B.unsafeDrop (n+1) c) cs
+lines (Chunk c0 cs0) = let l :| ls = lines1 c0 cs0 in l : ls
+  where
+    lines1 :: S.ByteString -> ByteString -> NonEmpty ByteString
+    lines1 c cs
+        | len > 1   = c1 <| lines1 (B.unsafeDrop 1 t0) cs
+        | len == 1  = c1 :| lines cs
+        | otherwise = Chunk c l :| ls
+      where
+        (h0, t0) = B.break (== 0x0a) c
+        len      = B.length t0
+        c1       = if B.null h0 then Empty else Chunk h0 Empty
+        ~(l:|ls) = lines2 cs
+
+    lines2 :: ByteString -> NonEmpty ByteString
+    lines2 Empty        = Empty :| []
+    lines2 (Chunk c cs) = lines1 c cs
 
 -- | 'unlines' joins lines, appending a terminating newline after each.
 --
@@ -950,10 +923,3 @@ hPutStrLn h ps = hPut h ps >> hPut h (L.singleton 0x0a)
 --
 putStrLn :: ByteString -> IO ()
 putStrLn = hPutStrLn stdout
-
--- ---------------------------------------------------------------------
--- Internal utilities
-
--- reverse a list of possibly-empty chunks into a lazy ByteString
-revChunks :: [S.ByteString] -> ByteString
-revChunks = List.foldl' (flip chunk) Empty
diff --git a/tests/Properties.hs b/tests/Properties.hs
@@ -82,9 +82,29 @@ prop_unsafeTail xs = not (P.null xs) ==> P.tail xs === P.unsafeTail xs
 prop_unsafeLast xs = not (P.null xs) ==> P.last xs === P.unsafeLast xs
 prop_unsafeInit xs = not (P.null xs) ==> P.init xs === P.unsafeInit xs
 
+prop_lines_empty_invariant =
+     True === case LC.lines (LC.pack "\nfoo\n") of
+        Empty : _ -> True
+        _         -> False
+
 prop_lines_lazy =
     take 2 (LC.lines (LC.append (LC.pack "a\nb\n") undefined)) === [LC.pack "a", LC.pack "b"]
 
+prop_lines_lazy2 =
+     c === case LC.lines (Chunk c undefined) of
+        Chunk c _ : _ -> c
+        _             -> P.empty
+  where
+    c = C.pack "etc..."
+
+prop_lines_lazy3 =
+     c === case LC.lines d of
+        Chunk c _ : _ -> c
+        _             -> P.empty
+  where
+    c = C.pack "etc..."
+    d = Chunk c d
+
 prop_strip x = C.strip x == (C.dropSpace . C.reverse . C.dropSpace . C.reverse) x
 
 class (Bounded a, Integral a, Show a) => RdInt a where
@@ -684,6 +704,9 @@ misc_tests =
     , testProperty "unsafeIndex"    prop_unsafeIndexBB
 
     , testProperty "lines_lazy"     prop_lines_lazy
+    , testProperty "lines_lazy2"    prop_lines_lazy2
+    , testProperty "lines_lazy3"    prop_lines_lazy3
+    , testProperty "lines_invar"    prop_lines_empty_invariant
     , testProperty "strip"          prop_strip
     , testProperty "isSpace"        prop_isSpaceWord8