Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove findSubstring and findSubstrings #181

Merged
merged 2 commits into from
Aug 23, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 2 additions & 58 deletions Data/ByteString.hs
Original file line number Diff line number Diff line change
Expand Up @@ -138,8 +138,6 @@ module Data.ByteString (

-- ** Search for arbitrary substrings
breakSubstring, -- :: ByteString -> ByteString -> (ByteString,ByteString)
findSubstring, -- :: ByteString -> ByteString -> Maybe Int
findSubstrings, -- :: ByteString -> ByteString -> [Int]

-- * Searching ByteStrings

Expand Down Expand Up @@ -1337,10 +1335,9 @@ stripSuffix bs1@(PS _ _ l1) bs2@(PS _ _ l2)
| bs1 `isSuffixOf` bs2 = Just (unsafeTake (l2 - l1) bs2)
| otherwise = Nothing

-- | Check whether one string is a substring of another. @isInfixOf
-- p s@ is equivalent to @not (null (findSubstrings p s))@.
-- | Check whether one string is a substring of another.
isInfixOf :: ByteString -> ByteString -> Bool
isInfixOf p s = isJust (findSubstring p s)
isInfixOf p s = null p || not (null $ snd $ breakSubstring p s)

-- | Break a string on a substring, returning a pair of the part of the
-- string prior to the match, and the rest of the string.
Expand All @@ -1349,14 +1346,6 @@ isInfixOf p s = isJust (findSubstring p s)
--
-- > break (== c) l == breakSubstring (singleton c) l
--
-- and:
--
-- > findSubstring s l ==
-- > if null s then Just 0
-- > else case breakSubstring s l of
-- > (x,y) | null y -> Nothing
-- > | otherwise -> Just (length x)
--
-- For example, to tokenise a string, dropping delimiters:
--
-- > tokenise x y = h : if null t then [] else tokenise x (drop (length x) t)
Expand Down Expand Up @@ -1426,51 +1415,6 @@ breakSubstring pat =
w' = mask .&. ((w `shiftL` 8) .|. b)
{-# INLINE shift #-}

-- | Locate the first occurrence of a substring inside another string,
-- yielding its index, or 'Nothing' when the substring does not occur.
-- @findSubstring p s@ is equivalent to @listToMaybe (findSubstrings p s)@.
findSubstring :: ByteString -- ^ String to search for.
              -> ByteString -- ^ String to search in.
              -> Maybe Int
findSubstring pat src
    -- An empty @src@ can only leave an empty remainder, which the general
    -- case below reads as "no match"; report the empty-in-empty match here.
    | null pat && null src = Just 0
    | otherwise =
        case breakSubstring pat src of
            (prefix, match)
                | null match -> Nothing
                | otherwise  -> Just (length prefix)

{-# DEPRECATED findSubstring "findSubstring is deprecated in favour of breakSubstring." #-}

-- | Find the indices of all non-overlapping occurrences of a substring in a
-- string. An empty pattern yields every index from @0@ up to and including
-- the length of the searched string.
--
-- Note, prior to @0.10.6.0@ this function returned the indices of all
-- possibly-overlapping matches.
findSubstrings :: ByteString -- ^ String to search for.
               -> ByteString -- ^ String to search in.
               -> [Int]
findSubstrings pat src
    | null pat  = [0 .. srcLen]
    | otherwise = go 0
  where
    patLen    = length pat
    srcLen    = length src
    -- last offset at which a full copy of the pattern could still fit
    lastStart = srcLen - patLen

    go !start
        | start > lastStart = []
        | otherwise =
            case breakSubstring pat (unsafeDrop start src) of
                (skipped, found)
                    | null found -> []
                    | otherwise  ->
                        -- resume after the whole match, so reported
                        -- occurrences never overlap
                        let hit = start + length skipped
                        in  hit : go (hit + patLen)

-- In
-- [0.10.6.0](<https://github.com/haskell/bytestring/commit/2160e091e215fecc9177d55a37cd50fc253ba86a?w=1>)
-- 'findSubstrings' was refactored to call an improved 'breakSubstring'
-- implementation, but the refactored code no longer matches overlapping
-- strings. The behaviour change appears to be inadvertent, but the function
-- had already been deprecated for more than seven years. At this time
-- (@0.10.10.1@), the deprecation was twelve years in the past.
--
{-# DEPRECATED findSubstrings "findSubstrings is deprecated in favour of breakSubstring." #-}

-- ---------------------------------------------------------------------
-- Zipping

Expand Down
4 changes: 1 addition & 3 deletions Data/ByteString/Char8.hs
Original file line number Diff line number Diff line change
Expand Up @@ -148,8 +148,6 @@ module Data.ByteString.Char8 (

-- ** Search for arbitrary substrings
breakSubstring, -- :: ByteString -> ByteString -> (ByteString,ByteString)
findSubstring, -- :: ByteString -> ByteString -> Maybe Int
findSubstrings, -- :: ByteString -> ByteString -> [Int]

-- * Searching ByteStrings

Expand Down Expand Up @@ -249,7 +247,7 @@ import Data.ByteString (empty,null,length,tail,init,append
,concat,take,drop,splitAt,intercalate
,sort,isPrefixOf,isSuffixOf,isInfixOf
,stripPrefix,stripSuffix
,findSubstring,findSubstrings,breakSubstring,copy,group
,breakSubstring,copy,group

,getLine, getContents, putStr, interact
,readFile, writeFile, appendFile
Expand Down
2 changes: 0 additions & 2 deletions Data/ByteString/Lazy.hs
Original file line number Diff line number Diff line change
Expand Up @@ -151,8 +151,6 @@ module Data.ByteString.Lazy (

-- ** Search for arbitrary substrings
-- isSubstringOf, -- :: ByteString -> ByteString -> Bool
-- findSubstring, -- :: ByteString -> ByteString -> Maybe Int
-- findSubstrings, -- :: ByteString -> ByteString -> [Int]

-- * Searching ByteStrings

Expand Down
74 changes: 0 additions & 74 deletions bench/BenchAll.hs
Original file line number Diff line number Diff line change
Expand Up @@ -280,80 +280,6 @@ main = do
]
]

, bgroup "substrings"
[ bgroup "easy"
[ bench "easy1" . nf (uncurry S.findSubstrings)
$ easySubstrings 1 1000000
, bench "easy4" . nf (uncurry S.findSubstrings)
$ easySubstrings 4 1000000
, bench "easy16" . nf (uncurry S.findSubstrings)
$ easySubstrings 16 1000000
, bench "easy64" . nf (uncurry S.findSubstrings)
$ easySubstrings 64 1000000
, bench "easy128" . nf (uncurry S.findSubstrings)
$ easySubstrings 128 1000000
, bench "easy1024" . nf (uncurry S.findSubstrings)
$ easySubstrings 1024 1000000
]
, bgroup "random"
[ bench "random1" . nf (uncurry S.findSubstrings)
$ randomSubstrings 1 1000000
, bench "random4" . nf (uncurry S.findSubstrings)
$ randomSubstrings 4 1000000
, bench "random16" . nf (uncurry S.findSubstrings)
$ randomSubstrings 16 1000000
, bench "random64" . nf (uncurry S.findSubstrings)
$ randomSubstrings 64 1000000
, bench "random128" . nf (uncurry S.findSubstrings)
$ randomSubstrings 128 1000000
, bench "random1024" . nf (uncurry S.findSubstrings)
$ randomSubstrings 1024 1000000

]
, bgroup "hard"
[ bench "hard1" . nf (uncurry S.findSubstrings)
$ hardSubstrings 1 1000000
, bench "hard4" . nf (uncurry S.findSubstrings)
$ hardSubstrings 4 1000000
, bench "hard16" . nf (uncurry S.findSubstrings)
$ hardSubstrings 16 1000000
, bench "hard64" . nf (uncurry S.findSubstrings)
$ hardSubstrings 64 1000000
, bench "hard128" . nf (uncurry S.findSubstrings)
$ hardSubstrings 128 1000000
, bench "hard1024" . nf (uncurry S.findSubstrings)
$ hardSubstrings 1024 1000000
]
, bgroup "pathological"
[ bench "pathological1" . nf (uncurry S.findSubstrings)
$ pathologicalSubstrings 1 1000000
, bench "pathological4" . nf (uncurry S.findSubstrings)
$ pathologicalSubstrings 4 1000000
, bench "pathological16" . nf (uncurry S.findSubstrings)
$ pathologicalSubstrings 16 1000000
, bench "pathological64" . nf (uncurry S.findSubstrings)
$ pathologicalSubstrings 64 1000000
, bench "pathological128" . nf (uncurry S.findSubstrings)
$ pathologicalSubstrings 128 1000000
, bench "pathological1024" . nf (uncurry S.findSubstrings)
$ pathologicalSubstrings 1024 1000000
]
, bgroup "html"
[ bench "html1" . nfIO . fmap (uncurry S.findSubstrings)
$ htmlSubstrings wikiPage 1 1000000
, bench "html4" . nfIO . fmap (uncurry S.findSubstrings)
$ htmlSubstrings wikiPage 4 1000000
, bench "html16" . nfIO . fmap (uncurry S.findSubstrings)
$ htmlSubstrings wikiPage 16 1000000
, bench "html64" . nfIO . fmap (uncurry S.findSubstrings)
$ htmlSubstrings wikiPage 64 1000000
, bench "html128" . nfIO . fmap (uncurry S.findSubstrings)
$ htmlSubstrings wikiPage 128 1000000
, bench "html1024" . nfIO . fmap (uncurry S.findSubstrings)
$ htmlSubstrings wikiPage 1024 1000000
]
]

, bgroup "Data.ByteString.Builder.Prim"
[ benchFE "char7" $ toEnum >$< P.char7
, benchFE "char8" $ toEnum >$< P.char8
Expand Down
51 changes: 0 additions & 51 deletions tests/Properties.hs
Original file line number Diff line number Diff line change
Expand Up @@ -1252,48 +1252,6 @@ prop_initsBB xs = inits xs == map P.unpack (P.inits (P.pack xs))

-- ByteString 'P.tails' agrees with list 'tails' on the packed input.
prop_tailsBB xs = (map P.unpack . P.tails . P.pack) xs == tails xs

-- The mapping from the test 'ByteString' to the naive test 'String' has to
-- be injective; otherwise the ByteString search could report matches at
-- positions that do not match in the "corresponding" string. For that
-- reason the input is a Word8 list that gets packed, rather than a unicode
-- String.
--
prop_findSubstringsBB :: [Word8] -> Int -> Int -> Bool
prop_findSubstringsBB ws x l =
    C.findSubstrings needle haystack
        == naive_findSubstrings (C.unpack needle) (C.unpack haystack)
  where
    haystack = P.pack ws
    -- the pattern is some random substring of the test string
    needle = C.take l (C.drop x haystack)

    -- Naive reference implementation.
    -- Note, overlapping matches have been broken since 2015, so at this
    -- point just test for the current (non-overlapping) behaviour.
    naive_findSubstrings :: String -> String -> [Int]
    naive_findSubstrings p q
        | null p    = [0 .. length q]
        | otherwise = scan 0 (length q) q
      where
        lp = length p
        -- @n@ is the current index, @lq@ the length of what is left to scan
        scan n !lq rest
            | lp > lq             = []
            | p `isPrefixOf` rest = n : scan (n + lp) (lq - lp) (drop lp rest)
            -- @rest@ is non-empty here because 1 <= lp <= lq
            | otherwise           = scan (n + 1) (lq - 1) (drop 1 rest)

-- The test string is packed from a Word8 list (not a unicode String) so
-- that the string -> bytestring correspondence stays injective.
prop_findSubstringBB :: [Word8] -> Int -> Int -> Bool
prop_findSubstringBB ws x l =
    C.findSubstring needle haystack
        == naive_findSubstring (C.unpack needle) (C.unpack haystack)
  where
    haystack = P.pack ws
    -- the pattern is some random substring of the test string
    needle = C.take l (C.drop x haystack)

    -- Naive reference implementation: the index of the first suffix of
    -- @q@ (the empty suffix included) that starts with @p@.
    naive_findSubstring :: String -> String -> Maybe Int
    naive_findSubstring p q =
        listToMaybe [i | (i, suffix) <- zip [0 ..] (tails q), p `isPrefixOf` suffix]

-- correspondence between break and breakSubstring: breaking on a
-- single-byte pattern is the same as breaking on that byte
prop_breakSubstringBB c l =
    P.breakSubstring (P.singleton c) l == P.break (== c) l
Expand All @@ -1304,12 +1262,6 @@ prop_breakSubstring_isInfixOf s l
(x,y) | P.null y -> False
| otherwise -> True

-- 'P.findSubstring' can be expressed through 'P.breakSubstring': an empty
-- pattern is found at index 0, otherwise the match index is the length of
-- the part before the match (no match when the remainder is empty).
prop_breakSubstring_findSubstring s l = P.findSubstring s l == expected
  where
    expected
        | P.null s  = Just 0
        | otherwise =
            case P.breakSubstring s l of
                (before, match)
                    | P.null match -> Nothing
                    | otherwise    -> Just (P.length before)

-- ByteString 'P.replicate' agrees with list 'replicate' for counts drawn
-- from 'arbitrarySizedIntegral'.
prop_replicate1BB c = forAll arbitrarySizedIntegral agrees
  where
    agrees n = P.unpack (P.replicate n c) == replicate n c
prop_replicate2BB c = forAll arbitrarySizedIntegral $ \n ->
Expand Down Expand Up @@ -2277,10 +2229,7 @@ bb_tests =
, testProperty "copy" prop_copyLL
, testProperty "inits" prop_initsBB
, testProperty "tails" prop_tailsBB
, testProperty "findSubstrings "prop_findSubstringsBB
, testProperty "findSubstring "prop_findSubstringBB
, testProperty "breakSubstring 1"prop_breakSubstringBB
, testProperty "breakSubstring 2"prop_breakSubstring_findSubstring
, testProperty "breakSubstring 3"prop_breakSubstring_isInfixOf

, testProperty "replicate1" prop_replicate1BB
Expand Down