Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add lazy dropEnd and friends #395

Merged
merged 17 commits into from
Jul 28, 2021
144 changes: 144 additions & 0 deletions Data/ByteString/Lazy.hs
Original file line number Diff line number Diff line change
Expand Up @@ -127,12 +127,18 @@ module Data.ByteString.Lazy (

-- ** Breaking strings
take,
takeEnd,
drop,
dropEnd,
splitAt,
takeWhile,
takeWhileEnd,
dropWhile,
dropWhileEnd,
span,
spanEnd,
break,
breakEnd,
group,
groupBy,
inits,
Expand Down Expand Up @@ -220,13 +226,15 @@ import Prelude hiding
,getContents,getLine,putStr,putStrLn ,zip,zipWith,unzip,notElem)

import qualified Data.List as L -- L for list/lazy
import qualified Data.Bifunctor as BF
import qualified Data.ByteString as P (ByteString) -- type name only
import qualified Data.ByteString as S -- S for strict (hmm...)
import qualified Data.ByteString.Internal as S
import qualified Data.ByteString.Unsafe as S
import Data.ByteString.Lazy.Internal

import Control.Monad (mplus)
import Data.Maybe (listToMaybe)
import Data.Word (Word8)
import Data.Int (Int64)
import System.IO (Handle,openBinaryFile,stdin,stdout,withBinaryFile,IOMode(..)
Expand Down Expand Up @@ -684,6 +692,28 @@ take i cs0 = take' i cs0
then Chunk (S.take (fromIntegral n) c) Empty
else Chunk c (take' (n - fromIntegral (S.length c)) cs)

-- | /O(c)/ @'takeEnd' n xs@ is equivalent to @'drop' ('length' xs - n) xs@.
-- Takes @n@ elements from end of bytestring.
--
-- >>> takeEnd 3 "abcdefg"
-- "efg"
-- >>> takeEnd 0 "abcdefg"
-- ""
-- >>> takeEnd 4 "abc"
-- "abc"
--
-- @since 0.11.2.0
takeEnd :: Int64 -> ByteString -> ByteString
takeEnd i _ | i <= 0 = Empty
takeEnd i cs0 = takeEnd' i cs0
where takeEnd' 0 _ = Empty
takeEnd' n cs =
snd $ foldrChunks takeTuple (n,Empty) cs
takeTuple _ (0, cs) = (0, cs)
takeTuple c (n, cs)
| n > fromIntegral (S.length c) = (n - fromIntegral (S.length c), Chunk c cs)
| otherwise = (0, Chunk (S.takeEnd (fromIntegral n) c) cs)

-- | /O(n\/c)/ 'drop' @n xs@ returns the suffix of @xs@ after the first @n@
-- elements, or @[]@ if @n > 'length' xs@.
drop :: Int64 -> ByteString -> ByteString
Expand All @@ -696,6 +726,43 @@ drop i cs0 = drop' i cs0
then Chunk (S.drop (fromIntegral n) c) cs
else drop' (n - fromIntegral (S.length c)) cs

-- | /O(n)/ @'dropEnd' n xs@ is equivalent to @'take' ('length' xs - n) xs@.
-- Drops @n@ elements from end of bytestring.
--
-- >>> dropEnd 3 "abcdefg"
-- "abcd"
-- >>> dropEnd 0 "abcdefg"
-- "abcdefg"
-- >>> dropEnd 4 "abc"
-- ""
--
-- @since 0.11.2.0
dropEnd :: Int64 -> ByteString -> ByteString
dropEnd i p | i <= 0 = p
dropEnd i p = go [] [] 0 0 p
where go hss tss acc h (Chunk c cs)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you please add a comment, explaining meaning of arguments? AFAIU hss and tss model a bidirectional queue (aka deque), acc tracks total length of hss + tss, and h tracks the length of head hss.

Computing length of head hss is O(1) operation, so I think we can simplify a bit by not passing h argument.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have removed the need for the head of hss as its length is already included in the accumulated length. Furthermore, in a Deque, calculating the length of itshead might trigger a re-balancing which I think would not be ideal (e.g. it would always re balance on the second chunk). This seems a bit cleaner, although we might trigger getOutput a bit more often than before.

| h >= acc - i = go hss (c : tss) (acc + len c) h cs
| otherwise =
let (output, hss', tss', acc') = getOutput [] hss (c : tss) ( acc + len c)
in L.foldl (flip chunk) (go hss' tss' acc' (hLen hss') cs) output
go hss tss _ _ Empty = dropChunks (tss ++ L.reverse hss) (fromIntegral i)

len c = fromIntegral (S.length c)
hLen cs = maybe 0 len (listToMaybe cs)

getOutput out [] [] acc = (out, [], [], acc)
Copy link
Contributor

@Bodigrim Bodigrim Jul 17, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Again, it would be nice to document meaning of arguments and return values.

What I can suggest is creating data Deque = Deque { hss :: [S.ByteString], tss :: [S.ByteString], acc :: Int64 } (bonus points for better fields names). Then go becomes go :: Deque -> ByteString -> ByteString, and getOutput :: [S.ByteString] -> Deque -> ([S.ByteString], Deque). This reduces cognitive load for a reader.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe now with the use of the Deque, arguments are quite easy to follow. I'd happily add comments though if necessary

getOutput out [] bss acc = getOutput out (L.reverse bss) [] acc
getOutput out (x:xs) bss acc =
if len x <= acc - i - len x
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks suspicious, there are len x on both sides. Wouldn't len x <= acc - i be enough?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, its redundant

then getOutput (x:out) xs bss (acc - len x)
else (out, x:xs, bss, acc)

dropChunks [] _ = Empty
dropChunks (c : cs) n =
case S.length c of
l | l <= fromIntegral n -> dropChunks cs (fromIntegral n - l)
| otherwise -> L.foldl' (flip chunk) Empty (S.dropEnd (fromIntegral n) c : cs)

-- | /O(n\/c)/ 'splitAt' @n xs@ is equivalent to @('take' n xs, 'drop' n xs)@.
splitAt :: Int64 -> ByteString -> (ByteString, ByteString)
splitAt i cs0 | i <= 0 = (Empty, cs0)
Expand All @@ -722,6 +789,23 @@ takeWhile f = takeWhile'
n | n < S.length c -> Chunk (S.take n c) Empty
| otherwise -> Chunk c (takeWhile' cs)

-- | Returns the longest (possibly empty) suffix of elements
-- satisfying the predicate.
--
-- @'takeWhileEnd' p@ is equivalent to @'reverse' . 'takeWhile' p . 'reverse'@.
Comment on lines +806 to +809
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

An example would be nice to have here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've used Chunk (pack [1,2]) (Chunk (pack [3,4,6])) Empty as an example of a lazy bytestring. Hope it's not too verbose, but textual representation of a bytestring is always tricky.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think that would be helpful for users who will mostly not be aware of ByteString's internal constructors. If you want to represent the bytes as numbers, you can use the OverloadedLists syntax, e.g. [1,2,3,4,6].

--
-- @since 0.11.2.0
takeWhileEnd :: (Word8 -> Bool) -> ByteString -> ByteString
takeWhileEnd f = takeWhileEnd'
where takeWhileEnd' Empty = Empty
takeWhileEnd' cs =
snd $ foldrChunks takeTuple (True,Empty) cs
takeTuple _ (False, bs) = (False,bs)
takeTuple c (True,bs) =
case S.takeWhileEnd f c of
c' | S.length c' == S.length c -> (True, Chunk c bs)
| otherwise -> (False, fromStrict c' `append` bs)

-- | Similar to 'P.dropWhile',
-- drops the longest (possibly empty) prefix of elements
-- satisfying the predicate and returns the remainder.
Expand All @@ -733,6 +817,25 @@ dropWhile f = dropWhile'
n | n < S.length c -> Chunk (S.drop n c) cs
| otherwise -> dropWhile' cs

-- | Similar to 'P.dropWhileEnd',
-- drops the longest (possibly empty) suffix of elements
-- satisfying the predicate and returns the remainder.
--
-- @'dropWhileEnd' p@ is equivalent to @'reverse' . 'dropWhile' p . 'reverse'@.
3kyro marked this conversation as resolved.
Show resolved Hide resolved
--
-- @since 0.11.2.0
dropWhileEnd :: (Word8 -> Bool) -> ByteString -> ByteString
dropWhileEnd f = go []
where go acc (Chunk c cs)
| f (S.last c) = go (c : acc) cs
| otherwise = L.foldl (flip Chunk) (go [] cs) (c : acc)
go acc Empty = dropAcc acc
dropAcc [] = Empty
dropAcc (x : xs) =
case S.dropWhileEnd f x of
x' | S.null x' -> dropAcc xs
| otherwise -> L.foldl' (flip Chunk) Empty (x' : xs)

-- | Similar to 'P.break',
-- returns the longest (possibly empty) prefix of elements which __do not__
-- satisfy the predicate and the remainder of the string.
Expand All @@ -750,6 +853,28 @@ break f = break'
| otherwise -> let (cs', cs'') = break' cs
in (Chunk c cs', cs'')


-- | Returns the longest (possibly empty) suffix of elements which __do not__
-- satisfy the predicate and the remainder of the string.
--
-- 'breakEnd' @p@ is equivalent to @'spanEnd' (not . p)@ and to @('takeWhileEnd' (not . p) &&& 'dropWhileEnd' (not . p))@.
--
-- @since 0.11.2.0
breakEnd :: (Word8 -> Bool) -> ByteString -> (ByteString, ByteString)
breakEnd f = go []
where go acc (Chunk c cs)
| f (S.last c) = L.foldl (flip $ BF.first . Chunk) (go [] cs) (c : acc)
| otherwise = go (c : acc) cs
go acc Empty = dropAcc acc
dropAcc [] = (Empty, Empty)
dropAcc (x : xs) =
case S.breakEnd f x of
(x', x'') | S.null x' -> let (y, y') = dropAcc xs
in (y, y' `append` fromStrict x)
| otherwise ->
L.foldl' (flip $ BF.first . Chunk) (fromStrict x', fromStrict x'') xs


--
-- TODO
--
Expand Down Expand Up @@ -799,6 +924,25 @@ spanByte c (LPS ps) = case (spanByte' ps) of (a,b) -> (LPS a, LPS b)
span :: (Word8 -> Bool) -> ByteString -> (ByteString, ByteString)
span p = break (not . p)

-- | Returns the longest (possibly empty) suffix of elements
-- satisfying the predicate and the remainder of the string.
--
-- 'spanEnd' @p@ is equivalent to @'breakEnd' (not . p)@ and to @('takeWhileEnd' p &&& 'dropWhileEnd' p)@.
--
-- We have
--
-- > spanEnd (not . isSpace) "x y z" == ("x y ", "z")
--
-- and
--
-- > spanEnd (not . isSpace) ps
-- > ==
-- > let (x, y) = span (not . isSpace) (reverse ps) in (reverse y, reverse x)
--
-- @since 0.11.2.0
spanEnd :: (Word8 -> Bool) -> ByteString -> (ByteString, ByteString)
spanEnd p = breakEnd (not . p)

-- | /O(n)/ Splits a 'ByteString' into components delimited by
-- separators, where the predicate returns True for a separator element.
-- The resulting components do not contain the separators. Two adjacent
Expand Down
54 changes: 53 additions & 1 deletion Data/ByteString/Lazy/Char8.hs
Original file line number Diff line number Diff line change
Expand Up @@ -105,12 +105,18 @@ module Data.ByteString.Lazy.Char8 (

-- ** Breaking strings
take,
takeEnd,
drop,
dropEnd,
splitAt,
takeWhile,
takeWhileEnd,
dropWhile,
dropWhileEnd,
span,
spanEnd,
break,
breakEnd,
group,
groupBy,
inits,
Expand Down Expand Up @@ -203,7 +209,7 @@ module Data.ByteString.Lazy.Char8 (
import Data.ByteString.Lazy
(fromChunks, toChunks
,empty,null,length,tail,init,append,reverse,transpose,cycle
,concat,take,drop,splitAt,intercalate
,concat,take,takeEnd,drop,dropEnd,splitAt,intercalate
,isPrefixOf,isSuffixOf,group,inits,tails,copy
,stripPrefix,stripSuffix
,hGetContents, hGet, hPut, getContents
Expand Down Expand Up @@ -456,22 +462,68 @@ takeWhile :: (Char -> Bool) -> ByteString -> ByteString
takeWhile f = L.takeWhile (f . w2c)
{-# INLINE takeWhile #-}

-- | Returns the longest (possibly empty) suffix of elements
-- satisfying the predicate.
--
-- @'takeWhileEnd' p@ is equivalent to @'reverse' . 'takeWhile' p . 'reverse'@.
--
-- @since 0.11.2.0
takeWhileEnd :: (Char -> Bool) -> ByteString -> ByteString
takeWhileEnd f = L.takeWhileEnd (f . w2c)
{-# INLINE takeWhileEnd #-}

-- | 'dropWhile' @p xs@ returns the suffix remaining after 'takeWhile' @p xs@.
dropWhile :: (Char -> Bool) -> ByteString -> ByteString
dropWhile f = L.dropWhile (f . w2c)
{-# INLINE dropWhile #-}

-- | Similar to 'P.dropWhileEnd',
-- drops the longest (possibly empty) suffix of elements
-- satisfying the predicate and returns the remainder.
--
-- @'dropWhileEnd' p@ is equivalent to @'reverse' . 'dropWhile' p . 'reverse'@.
--
-- @since 0.11.2.0
dropWhileEnd :: (Char -> Bool) -> ByteString -> ByteString
dropWhileEnd f = L.dropWhileEnd (f . w2c)
{-# INLINE dropWhileEnd #-}

-- | 'break' @p@ is equivalent to @'span' ('not' . p)@.
break :: (Char -> Bool) -> ByteString -> (ByteString, ByteString)
break f = L.break (f . w2c)
{-# INLINE break #-}

-- | 'breakEnd' behaves like 'break' but from the end of the 'ByteString'
--
-- breakEnd p == spanEnd (not.p)
--
-- @since 0.11.2.0
breakEnd :: (Char -> Bool) -> ByteString -> (ByteString, ByteString)
breakEnd f = L.breakEnd (f . w2c)
{-# INLINE breakEnd #-}

-- | 'span' @p xs@ breaks the ByteString into two segments. It is
-- equivalent to @('takeWhile' p xs, 'dropWhile' p xs)@
span :: (Char -> Bool) -> ByteString -> (ByteString, ByteString)
span f = L.span (f . w2c)
{-# INLINE span #-}

-- | 'spanEnd' behaves like 'span' but from the end of the 'ByteString'.
-- We have
--
-- > spanEnd (not.isSpace) "x y z" == ("x y ","z")
--
-- and
--
-- > spanEnd (not . isSpace) ps
-- > ==
-- > let (x,y) = span (not.isSpace) (reverse ps) in (reverse y, reverse x)
--
-- @since 0.11.2.0
spanEnd :: (Char -> Bool) -> ByteString -> (ByteString, ByteString)
spanEnd f = L.spanEnd (f . w2c)
{-# INLINE spanEnd #-}

{-
-- | 'breakChar' breaks its ByteString argument at the first occurence
-- of the specified Char. It is more efficient than 'break' as it is
Expand Down
15 changes: 10 additions & 5 deletions tests/Properties/ByteString.hs
Original file line number Diff line number Diff line change
Expand Up @@ -177,11 +177,12 @@ tests =
\(toElem -> c) x -> (B.unpack *** B.unpack) (B.break (/= c) x) === break (/= c) (B.unpack x)
, testProperty "break span" $
\f x -> B.break f x === B.span (not . f) x
#ifndef BYTESTRING_LAZY
, testProperty "breakEnd" $
\f x -> B.breakEnd f x === swap ((B.reverse *** B.reverse) (B.break f (B.reverse x)))
, testProperty "breakEnd" $
\f x -> B.breakEnd f x === B.spanEnd (not . f) x

#ifndef BYTESTRING_LAZY
, testProperty "break breakSubstring" $
\(toElem -> c) x -> B.break (== c) x === B.breakSubstring (B.singleton c) x
, testProperty "breakSubstring" $
Expand Down Expand Up @@ -248,7 +249,6 @@ tests =
\x -> B.unpack (B.takeWhile isSpace x) === takeWhile isSpace (B.unpack x)
#endif

#ifndef BYTESTRING_LAZY
, testProperty "dropEnd" $
\n x -> B.dropEnd n x === B.take (B.length x - n) x
, testProperty "dropWhileEnd" $
Expand All @@ -257,7 +257,6 @@ tests =
\n x -> B.takeEnd n x === B.drop (B.length x - n) x
, testProperty "takeWhileEnd" $
\f x -> B.takeWhileEnd f x === B.reverse (B.takeWhile f (B.reverse x))
#endif

#ifdef BYTESTRING_LAZY
, testProperty "invariant" $
Expand All @@ -278,6 +277,14 @@ tests =
\x (toElem -> c) -> B.compareLength (B.snoc x c <> undefined) (B.length x) === GT
, testProperty "compareLength 5" $
\x n -> B.compareLength x n === compare (B.length x) n
, testProperty "dropEnd lazy" $
\(toElem -> c) -> B.take 1 (B.dropEnd 1 (B.singleton c <> B.singleton c <> B.singleton c <> undefined)) === B.singleton c
, testProperty "dropWhileEnd lazy" $
\(toElem -> c) -> B.take 1 (B.dropWhileEnd (const False) (B.singleton c <> undefined)) === B.singleton c
, testProperty "breakEnd lazy" $
\(toElem -> c) -> B.take 1 (fst $ B.breakEnd (const True) (B.singleton c <> undefined)) === B.singleton c
, testProperty "spanEnd lazy" $
\(toElem -> c) -> B.take 1 (fst $ B.spanEnd (const False) (B.singleton c <> undefined)) === B.singleton c
#endif

, testProperty "length" $
Expand Down Expand Up @@ -350,10 +357,8 @@ tests =
\(toElem -> c) x -> (B.unpack *** B.unpack) (B.span (== c) x) === span (== c) (B.unpack x)
, testProperty "span /=" $
\(toElem -> c) x -> (B.unpack *** B.unpack) (B.span (/= c) x) === span (/= c) (B.unpack x)
#ifndef BYTESTRING_LAZY
, testProperty "spanEnd" $
\f x -> B.spanEnd f x === swap ((B.reverse *** B.reverse) (B.span f (B.reverse x)))
#endif
, testProperty "split" $
\(toElem -> c) x -> map B.unpack (B.split c x) === split c (B.unpack x)
, testProperty "split empty" $
Expand Down