Skip to content

Commit

Permalink
Merge pull request #181 from bgamari/wip/remove-findSubstring
Browse files Browse the repository at this point in the history
Remove findSubstring and findSubstrings
  • Loading branch information
Bodigrim authored Aug 23, 2020
2 parents 7ec0f76 + 5839612 commit 829771b
Show file tree
Hide file tree
Showing 5 changed files with 3 additions and 188 deletions.
60 changes: 2 additions & 58 deletions Data/ByteString.hs
Original file line number Diff line number Diff line change
Expand Up @@ -138,8 +138,6 @@ module Data.ByteString (

-- ** Search for arbitrary substrings
breakSubstring, -- :: ByteString -> ByteString -> (ByteString,ByteString)
findSubstring, -- :: ByteString -> ByteString -> Maybe Int
findSubstrings, -- :: ByteString -> ByteString -> [Int]

-- * Searching ByteStrings

Expand Down Expand Up @@ -1336,10 +1334,9 @@ stripSuffix bs1@(BS _ l1) bs2@(BS _ l2)
| bs1 `isSuffixOf` bs2 = Just (unsafeTake (l2 - l1) bs2)
| otherwise = Nothing

-- | Check whether one string is a substring of another. @isInfixOf
-- p s@ is equivalent to @not (null (findSubstrings p s))@.
-- | Check whether one string is a substring of another.
isInfixOf :: ByteString -> ByteString -> Bool
isInfixOf p s = isJust (findSubstring p s)
isInfixOf p s = null p || not (null $ snd $ breakSubstring p s)

-- | Break a string on a substring, returning a pair of the part of the
-- string prior to the match, and the rest of the string.
Expand All @@ -1348,14 +1345,6 @@ isInfixOf p s = isJust (findSubstring p s)
--
-- > break (== c) l == breakSubstring (singleton c) l
--
-- and:
--
-- > findSubstring s l ==
-- > if null s then Just 0
-- > else case breakSubstring s l of
-- > (x,y) | null y -> Nothing
-- > | otherwise -> Just (length x)
--
-- For example, to tokenise a string, dropping delimiters:
--
-- > tokenise x y = h : if null t then [] else tokenise x (drop (length x) t)
Expand Down Expand Up @@ -1425,51 +1414,6 @@ breakSubstring pat =
w' = mask .&. ((w `shiftL` 8) .|. b)
{-# INLINE shift #-}

-- | Locate the first occurrence of a pattern inside a string.
--
-- Returns @'Just' i@, where @i@ is the number of bytes preceding the first
-- match, or 'Nothing' when the pattern does not occur.
-- @findSubstring p s@ is equivalent to @listToMaybe (findSubstrings p s)@.
findSubstring :: ByteString -- ^ String to search for.
              -> ByteString -- ^ String to search in.
              -> Maybe Int
findSubstring pat src =
    case breakSubstring pat src of
      (before, rest)
        -- An empty pattern in an empty string counts as a match at offset 0.
        | null pat && null src -> Just 0
        -- 'breakSubstring' leaves nothing after the split point when the
        -- pattern was not found.
        | null rest            -> Nothing
        -- Everything before the split point precedes the match.
        | otherwise            -> Just (length before)

{-# DEPRECATED findSubstring "findSubstring is deprecated in favour of breakSubstring." #-}

-- | Find the indices of all non-overlapping occurrences of a substring in a
-- string.
--
-- An empty pattern yields every index from @0@ up to and including the
-- length of the searched string.
--
-- Note, prior to @0.10.6.0@ this function returned the indices of all
-- possibly-overlapping matches.
findSubstrings :: ByteString -- ^ String to search for.
               -> ByteString -- ^ String to search in.
               -> [Int]
findSubstrings pat src
    | null pat  = [0 .. srcLen]
    | otherwise = go 0
  where
    patLen = length pat
    srcLen = length src

    -- @go off@ lists the matches found at or after offset @off@.
    go !off
        -- Too few bytes left for another match. This test runs before the
        -- 'unsafeDrop' below is ever evaluated.
        | off > srcLen - patLen = []
        | otherwise =
            case breakSubstring pat (unsafeDrop off src) of
              (skipped, rest)
                | null rest -> []
                | otherwise ->
                    -- Resume just past this match, so reported matches
                    -- never overlap.
                    let hit = off + length skipped
                    in  hit : go (hit + patLen)

-- In
-- [0.10.6.0](<https://github.com/haskell/bytestring/commit/2160e091e215fecc9177d55a37cd50fc253ba86a?w=1>)
-- 'findSubstrings' was refactored to call an improved 'breakSubstring'
-- implementation, but the refactored code no longer matches overlapping
-- strings. The behaviour change appears to be inadvertent, but the function
-- had already been deprecated for more than seven years. At this time
-- (@0.10.10.1@), the deprecation was twelve years in the past.
--
{-# DEPRECATED findSubstrings "findSubstrings is deprecated in favour of breakSubstring." #-}

-- ---------------------------------------------------------------------
-- Zipping

Expand Down
4 changes: 1 addition & 3 deletions Data/ByteString/Char8.hs
Original file line number Diff line number Diff line change
Expand Up @@ -148,8 +148,6 @@ module Data.ByteString.Char8 (

-- ** Search for arbitrary substrings
breakSubstring, -- :: ByteString -> ByteString -> (ByteString,ByteString)
findSubstring, -- :: ByteString -> ByteString -> Maybe Int
findSubstrings, -- :: ByteString -> ByteString -> [Int]

-- * Searching ByteStrings

Expand Down Expand Up @@ -249,7 +247,7 @@ import Data.ByteString (empty,null,length,tail,init,append
,concat,take,drop,splitAt,intercalate
,sort,isPrefixOf,isSuffixOf,isInfixOf
,stripPrefix,stripSuffix
,findSubstring,findSubstrings,breakSubstring,copy,group
,breakSubstring,copy,group

,getLine, getContents, putStr, interact
,readFile, writeFile, appendFile
Expand Down
2 changes: 0 additions & 2 deletions Data/ByteString/Lazy.hs
Original file line number Diff line number Diff line change
Expand Up @@ -151,8 +151,6 @@ module Data.ByteString.Lazy (

-- ** Search for arbitrary substrings
-- isSubstringOf, -- :: ByteString -> ByteString -> Bool
-- findSubstring, -- :: ByteString -> ByteString -> Maybe Int
-- findSubstrings, -- :: ByteString -> ByteString -> [Int]

-- * Searching ByteStrings

Expand Down
74 changes: 0 additions & 74 deletions bench/BenchAll.hs
Original file line number Diff line number Diff line change
Expand Up @@ -280,80 +280,6 @@ main = do
]
]

, bgroup "substrings"
[ bgroup "easy"
[ bench "easy1" . nf (uncurry S.findSubstrings)
$ easySubstrings 1 1000000
, bench "easy4" . nf (uncurry S.findSubstrings)
$ easySubstrings 4 1000000
, bench "easy16" . nf (uncurry S.findSubstrings)
$ easySubstrings 16 1000000
, bench "easy64" . nf (uncurry S.findSubstrings)
$ easySubstrings 64 1000000
, bench "easy128" . nf (uncurry S.findSubstrings)
$ easySubstrings 128 1000000
, bench "easy1024" . nf (uncurry S.findSubstrings)
$ easySubstrings 1024 1000000
]
, bgroup "random"
[ bench "random1" . nf (uncurry S.findSubstrings)
$ randomSubstrings 1 1000000
, bench "random4" . nf (uncurry S.findSubstrings)
$ randomSubstrings 4 1000000
, bench "random16" . nf (uncurry S.findSubstrings)
$ randomSubstrings 16 1000000
, bench "random64" . nf (uncurry S.findSubstrings)
$ randomSubstrings 64 1000000
, bench "random128" . nf (uncurry S.findSubstrings)
$ randomSubstrings 128 1000000
, bench "random1024" . nf (uncurry S.findSubstrings)
$ randomSubstrings 1024 1000000

]
, bgroup "hard"
[ bench "hard1" . nf (uncurry S.findSubstrings)
$ hardSubstrings 1 1000000
, bench "hard4" . nf (uncurry S.findSubstrings)
$ hardSubstrings 4 1000000
, bench "hard16" . nf (uncurry S.findSubstrings)
$ hardSubstrings 16 1000000
, bench "hard64" . nf (uncurry S.findSubstrings)
$ hardSubstrings 64 1000000
, bench "hard128" . nf (uncurry S.findSubstrings)
$ hardSubstrings 128 1000000
, bench "hard1024" . nf (uncurry S.findSubstrings)
$ hardSubstrings 1024 1000000
]
, bgroup "pathological"
[ bench "pathological1" . nf (uncurry S.findSubstrings)
$ pathologicalSubstrings 1 1000000
, bench "pathological4" . nf (uncurry S.findSubstrings)
$ pathologicalSubstrings 4 1000000
, bench "pathological16" . nf (uncurry S.findSubstrings)
$ pathologicalSubstrings 16 1000000
, bench "pathological64" . nf (uncurry S.findSubstrings)
$ pathologicalSubstrings 64 1000000
, bench "pathological128" . nf (uncurry S.findSubstrings)
$ pathologicalSubstrings 128 1000000
, bench "pathological1024" . nf (uncurry S.findSubstrings)
$ pathologicalSubstrings 1024 1000000
]
, bgroup "html"
[ bench "html1" . nfIO . fmap (uncurry S.findSubstrings)
$ htmlSubstrings wikiPage 1 1000000
, bench "html4" . nfIO . fmap (uncurry S.findSubstrings)
$ htmlSubstrings wikiPage 4 1000000
, bench "html16" . nfIO . fmap (uncurry S.findSubstrings)
$ htmlSubstrings wikiPage 16 1000000
, bench "html64" . nfIO . fmap (uncurry S.findSubstrings)
$ htmlSubstrings wikiPage 64 1000000
, bench "html128" . nfIO . fmap (uncurry S.findSubstrings)
$ htmlSubstrings wikiPage 128 1000000
, bench "html1024" . nfIO . fmap (uncurry S.findSubstrings)
$ htmlSubstrings wikiPage 1024 1000000
]
]

, bgroup "Data.ByteString.Builder.Prim"
[ benchFE "char7" $ toEnum >$< P.char7
, benchFE "char8" $ toEnum >$< P.char8
Expand Down
51 changes: 0 additions & 51 deletions tests/Properties.hs
Original file line number Diff line number Diff line change
Expand Up @@ -1252,48 +1252,6 @@ prop_initsBB xs = inits xs == map P.unpack (P.inits (P.pack xs))

-- | 'P.tails' on a packed list agrees with the list 'tails' once every
-- resulting ByteString is unpacked again.
prop_tailsBB xs =
    map P.unpack (P.tails (P.pack xs)) == tails xs

-- | 'C.findSubstrings' agrees with a naive 'String'-based reference search.
--
-- The correspondence between the test 'ByteString' and the naive test
-- 'String' must be injective, otherwise the ByteString may find matches at
-- positions that don't match in the "corresponding" string. To that end, we
-- start with and pack a Word8 array, rather than a unicode String.
prop_findSubstringsBB :: [Word8] -> Int -> Int -> Bool
prop_findSubstringsBB ws x l =
    C.findSubstrings needle haystack
        == naive_findSubstrings (C.unpack needle) (C.unpack haystack)
  where
    haystack = P.pack ws
    -- we look for some random substring of the test string
    needle   = C.take l (C.drop x haystack)

    -- Naive reference implementation.
    -- Note, overlapping matches have been broken since 2015, so at this
    -- point just test for the current behaviour.
    naive_findSubstrings :: String -> String -> [Int]
    naive_findSubstrings p q
      | null p    = [0 .. length q]
      | otherwise = scan 0 (length q) q
      where
        lp = length p

        -- @scan n lq rest@: @rest@ is the unsearched suffix, @lq@ its
        -- length, and @n@ its offset in the original string.
        scan n !lq rest
          | lp > lq             = []
          -- On a hit, skip the whole pattern so matches never overlap.
          | p `isPrefixOf` rest = n : scan (n + lp) (lq - lp) (drop lp rest)
          | otherwise           = scan (n + 1) (lq - 1) (drop 1 rest)

-- | 'C.findSubstring' agrees with a naive 'String'-based reference search.
--
-- The input starts out as a 'Word8' list, not a unicode 'String', so that
-- the string -> bytestring correspondence is injective.
prop_findSubstringBB :: [Word8] -> Int -> Int -> Bool
prop_findSubstringBB ws x l =
    C.findSubstring needle haystack
        == naive_findSubstring (C.unpack needle) (C.unpack haystack)
  where
    haystack = P.pack ws
    -- we look for some random substring of the test string
    needle   = C.take l (C.drop x haystack)

    -- Naive reference implementation: the first offset (from 0 up to and
    -- including the string length) at which the pattern is a prefix of the
    -- remaining string.
    naive_findSubstring :: String -> String -> Maybe Int
    naive_findSubstring p q =
        listToMaybe (filter (\i -> p `isPrefixOf` drop i q) [0 .. length q])

-- Correspondence between break and breakSubstring: breaking on a
-- single-byte pattern gives the same split as breaking at the first byte
-- equal to it.
prop_breakSubstringBB c l =
    P.breakSubstring (P.singleton c) l == P.break (== c) l
Expand All @@ -1304,12 +1262,6 @@ prop_breakSubstring_isInfixOf s l
(x,y) | P.null y -> False
| otherwise -> True

-- | 'P.findSubstring' can be expressed through 'P.breakSubstring': an empty
-- pattern is found at offset 0; otherwise the pattern is found exactly when
-- the second component of the split is non-empty, at an offset equal to the
-- length of the first component.
prop_breakSubstring_findSubstring s l =
    P.findSubstring s l == expected
  where
    (before, after) = P.breakSubstring s l

    expected
      | P.null s     = Just 0
      | P.null after = Nothing
      | otherwise    = Just (P.length before)

-- | For generated counts @n@, unpacking @P.replicate n c@ gives the same
-- list as the list 'replicate'.
prop_replicate1BB c =
    forAll arbitrarySizedIntegral
           (\n -> P.unpack (P.replicate n c) == replicate n c)
prop_replicate2BB c = forAll arbitrarySizedIntegral $ \n ->
Expand Down Expand Up @@ -2277,10 +2229,7 @@ bb_tests =
, testProperty "copy" prop_copyLL
, testProperty "inits" prop_initsBB
, testProperty "tails" prop_tailsBB
, testProperty "findSubstrings "prop_findSubstringsBB
, testProperty "findSubstring "prop_findSubstringBB
, testProperty "breakSubstring 1"prop_breakSubstringBB
, testProperty "breakSubstring 2"prop_breakSubstring_findSubstring
, testProperty "breakSubstring 3"prop_breakSubstring_isInfixOf

, testProperty "replicate1" prop_replicate1BB
Expand Down

0 comments on commit 829771b

Please sign in to comment.