Skip to content

Commit

Permalink
Merge pull request #200
Browse files Browse the repository at this point in the history
Fix usage of size hints which resulted in serious bugs
such as operations like `(== 1) . length . filter (== ',')`  (see #197)
giving wrong results.
  • Loading branch information
hvr committed Dec 16, 2017
2 parents ccbfabe + cfb8278 commit 548ca22
Show file tree
Hide file tree
Showing 6 changed files with 91 additions and 33 deletions.
15 changes: 15 additions & 0 deletions Data/Text.hs
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
-- with Unicode text (including normalization, regular expressions,
-- non-standard encodings, text breaking, and locales), see
-- <http://hackage.haskell.org/package/text-icu the text-icu package >.
--

module Data.Text
(
Expand All @@ -43,6 +44,9 @@ module Data.Text
-- * Acceptable data
-- $replacement

-- * Definition of character
-- $character_definition

-- * Fusion
-- $fusion

Expand Down Expand Up @@ -247,6 +251,17 @@ import qualified GHC.Exts as Exts
import Text.Printf (PrintfArg, formatArg, formatString)
#endif

-- $character-definition
--
-- This package uses the word /character/ to denote Unicode code points.
--
-- Note that this is not the same thing as a grapheme (e.g. a composition of
-- code points that form one visual symbol). For instance, consider the grapheme
-- @ä@. This symbol has two Unicode representations: a single code-point
-- representation @U+00E4@ (the @LATIN SMALL LETTER A WITH DIAERESIS@ code
-- point), and a two code point representation @U+0061@ (the @A@ code point) and
-- @U+0308@ (the @COMBINING DIAERESIS@ code point).

-- $strict
--
-- This package provides both strict and lazy 'Text' types. The
Expand Down
50 changes: 28 additions & 22 deletions Data/Text/Internal/Fusion/Common.hs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{-# LANGUAGE BangPatterns, MagicHash, Rank2Types #-}
{-# LANGUAGE PatternGuards, BangPatterns, MagicHash, Rank2Types #-}
-- |
-- Module : Data.Text.Internal.Fusion.Common
-- Copyright : (c) Bryan O'Sullivan 2009, 2012
Expand Down Expand Up @@ -117,7 +117,7 @@ import GHC.Prim (Addr#, chr#, indexCharOffAddr#, ord#)
import GHC.Types (Char(..), Int(..))

singleton :: Char -> Stream Char
singleton c = Stream next False 1
singleton c = Stream next False (codePointsSize 1)
where next False = Yield c True
next True = Done
{-# INLINE [0] singleton #-}
Expand Down Expand Up @@ -175,7 +175,7 @@ data C s = C0 !s

-- | /O(n)/ Adds a character to the front of a Stream Char.
cons :: Char -> Stream Char -> Stream Char
cons !w (Stream next0 s0 len) = Stream next (C1 s0) (len+1)
cons !w (Stream next0 s0 len) = Stream next (C1 s0) (len + codePointsSize 1)
where
next (C1 s) = Yield w (C0 s)
next (C0 s) = case next0 s of
Expand All @@ -189,7 +189,7 @@ data Snoc a = N

-- | /O(n)/ Adds a character to the end of a stream.
snoc :: Stream Char -> Char -> Stream Char
snoc (Stream next0 xs0 len) w = Stream next (J xs0) (len+1)
snoc (Stream next0 xs0 len) w = Stream next (J xs0) (len + codePointsSize 1)
where
next (J xs) = case next0 xs of
Done -> Yield w N
Expand Down Expand Up @@ -237,7 +237,7 @@ uncons :: Stream Char -> Maybe (Char, Stream Char)
uncons (Stream next s0 len) = loop_uncons s0
where
loop_uncons !s = case next s of
Yield x s1 -> Just (x, Stream next s1 (len-1))
Yield x s1 -> Just (x, Stream next s1 (len - codePointsSize 1))
Skip s' -> loop_uncons s'
Done -> Nothing
{-# INLINE [0] uncons #-}
Expand All @@ -260,7 +260,7 @@ last (Stream next s0 _len) = loop0_last s0
-- | /O(1)/ Returns all characters after the head of a Stream Char, which must
-- be non-empty.
tail :: Stream Char -> Stream Char
tail (Stream next0 s0 len) = Stream next (C0 s0) (len-1)
tail (Stream next0 s0 len) = Stream next (C0 s0) (len - codePointsSize 1)
where
next (C0 s) = case next0 s of
Done -> emptyError "tail"
Expand All @@ -278,7 +278,7 @@ data Init s = Init0 !s
-- | /O(1)/ Returns all but the last character of a Stream Char, which
-- must be non-empty.
init :: Stream Char -> Stream Char
init (Stream next0 s0 len) = Stream next (Init0 s0) (len-1)
init (Stream next0 s0 len) = Stream next (Init0 s0) (len - codePointsSize 1)
where
next (Init0 s) = case next0 s of
Done -> emptyError "init"
Expand Down Expand Up @@ -318,11 +318,14 @@ lengthI (Stream next s0 _len) = loop_length 0 s0
-- greater than the number or if the stream can't possibly be as long
-- as the number supplied, and hence be more efficient.
compareLengthI :: Integral a => Stream Char -> a -> Ordering
compareLengthI (Stream next s0 len) n =
case compareSize len (fromIntegral n) of
Just o -> o
Nothing -> loop_cmp 0 s0
compareLengthI (Stream next s0 len) n
-- Note that @len@ tracks code units whereas we want to compare the length
-- in code points. Specifically, a stream with hint @len@ may consist of
-- anywhere from @len/2@ to @len@ code points.
| Just r <- compareSize len n' = r
| otherwise = loop_cmp 0 s0
where
n' = codePointsSize $ fromIntegral n
loop_cmp !z s = case next s of
Done -> compare z n
Skip s' -> loop_cmp z s'
Expand Down Expand Up @@ -368,7 +371,7 @@ data I s = I1 !s
-- | /O(n)/ Take a character and place it between each of the
-- characters of a 'Stream Char'.
intersperse :: Char -> Stream Char -> Stream Char
intersperse c (Stream next0 s0 len) = Stream next (I1 s0) len
intersperse c (Stream next0 s0 len) = Stream next (I1 s0) (len + unknownSize)
where
next (I1 s) = case next0 s of
Done -> Done
Expand All @@ -393,9 +396,11 @@ intersperse c (Stream next0 s0 len) = Stream next (I1 s0) len
-- functions may map one input character to two or three output
-- characters.

-- | Map a 'Stream' through the given case-mapping function.
caseConvert :: (forall s. Char -> s -> Step (CC s) Char)
-> Stream Char -> Stream Char
caseConvert remap (Stream next0 s0 len) = Stream next (CC s0 '\0' '\0') len
caseConvert remap (Stream next0 s0 len) =
Stream next (CC s0 '\0' '\0') (len `unionSize` 3*len)
where
next (CC s '\0' _) =
case next0 s of
Expand Down Expand Up @@ -458,7 +463,7 @@ toLower = caseConvert lowerMapping
-- Fox\" is correctly title cased&#x2014;but this function will
-- capitalize /every/ word.
toTitle :: Stream Char -> Stream Char
toTitle (Stream next0 s0 len) = Stream next (CC (False :*: s0) '\0' '\0') len
toTitle (Stream next0 s0 len) = Stream next (CC (False :*: s0) '\0' '\0') (len + unknownSize)
where
next (CC (letter :*: s) '\0' _) =
case next0 s of
Expand All @@ -479,7 +484,7 @@ data Justify i s = Just1 !i !s

justifyLeftI :: Integral a => a -> Char -> Stream Char -> Stream Char
justifyLeftI k c (Stream next0 s0 len) =
Stream next (Just1 0 s0) (larger (fromIntegral k) len)
Stream next (Just1 0 s0) (larger (fromIntegral k * charSize c + len) len)
where
next (Just1 n s) =
case next0 s of
Expand Down Expand Up @@ -699,7 +704,7 @@ replicateI n (Stream next0 s0 len) =
-- (a,b), in which case, a is the next Char in the string, and b is
-- the seed value for further production.
unfoldr :: (a -> Maybe (Char,a)) -> a -> Stream Char
unfoldr f s0 = Stream next s0 1 -- HINT maybe too low
unfoldr f s0 = Stream next s0 unknownSize
where
{-# INLINE next #-}
next !s = case f s of
Expand All @@ -713,7 +718,7 @@ unfoldr f s0 = Stream next s0 1 -- HINT maybe too low
-- 'unfoldr' when the length of the result is known.
unfoldrNI :: Integral a => a -> (b -> Maybe (Char,b)) -> b -> Stream Char
unfoldrNI n f s0 | n < 0 = empty
| otherwise = Stream next (0 :*: s0) (fromIntegral (n*2)) -- HINT maybe too high
| otherwise = Stream next (0 :*: s0) (maxSize $ fromIntegral (n*2))
where
{-# INLINE next #-}
next (z :*: s) = case f s of
Expand All @@ -730,7 +735,7 @@ unfoldrNI n f s0 | n < 0 = empty
-- length of the stream.
take :: Integral a => a -> Stream Char -> Stream Char
take n0 (Stream next0 s0 len) =
Stream next (n0 :*: s0) (smaller len (fromIntegral (max 0 n0)))
Stream next (n0 :*: s0) (smaller len (codePointsSize $ fromIntegral n0))
where
{-# INLINE next #-}
next (n :*: s) | n <= 0 = Done
Expand All @@ -748,7 +753,7 @@ data Drop a s = NS !s
-- is greater than the length of the stream.
drop :: Integral a => a -> Stream Char -> Stream Char
drop n0 (Stream next0 s0 len) =
Stream next (JS n0 s0) (len - fromIntegral (max 0 n0))
Stream next (JS n0 s0) (len - codePointsSize (fromIntegral n0))
where
{-# INLINE next #-}
next (JS n s)
Expand All @@ -766,7 +771,7 @@ drop n0 (Stream next0 s0 len) =
-- | 'takeWhile', applied to a predicate @p@ and a stream, returns the
-- longest prefix (possibly empty) of elements that satisfy @p@.
takeWhile :: (Char -> Bool) -> Stream Char -> Stream Char
takeWhile p (Stream next0 s0 len) = Stream next s0 len -- HINT maybe too high
takeWhile p (Stream next0 s0 len) = Stream next s0 (len - unknownSize)
where
{-# INLINE next #-}
next !s = case next0 s of
Expand All @@ -778,7 +783,7 @@ takeWhile p (Stream next0 s0 len) = Stream next s0 len -- HINT maybe too high

-- | @'dropWhile' p xs@ returns the suffix remaining after @'takeWhile' p xs@.
dropWhile :: (Char -> Bool) -> Stream Char -> Stream Char
dropWhile p (Stream next0 s0 len) = Stream next (L s0) len -- HINT maybe too high
dropWhile p (Stream next0 s0 len) = Stream next (L s0) (len - unknownSize)
where
{-# INLINE next #-}
next (L s) = case next0 s of
Expand Down Expand Up @@ -857,7 +862,8 @@ indexI (Stream next s0 _len) n0
-- returns a stream containing those characters that satisfy the
-- predicate.
filter :: (Char -> Bool) -> Stream Char -> Stream Char
filter p (Stream next0 s0 len) = Stream next s0 len -- HINT maybe too high
filter p (Stream next0 s0 len) =
Stream next s0 (len - unknownSize) -- HINT maybe too high
where
next !s = case next0 s of
Done -> Done
Expand Down
38 changes: 32 additions & 6 deletions Data/Text/Internal/Fusion/Size.hs
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,16 @@
module Data.Text.Internal.Fusion.Size
(
Size
, exactly
-- * Sizes
, exactSize
, maxSize
, betweenSize
, unknownSize
, unionSize
, charSize
, codePointsSize
-- * Querying sizes
, exactly
, smaller
, larger
, upperBound
Expand All @@ -32,11 +37,13 @@ module Data.Text.Internal.Fusion.Size
, isEmpty
) where

import Data.Char (ord)
import Data.Text.Internal (mul)
#if defined(ASSERTS)
import Control.Exception (assert)
#endif

-- | A size in UTF-16 code units.
data Size = Between {-# UNPACK #-} !Int {-# UNPACK #-} !Int -- ^ Lower and upper bounds on size.
| Unknown -- ^ Unknown size.
deriving (Eq, Show)
Expand All @@ -46,6 +53,17 @@ exactly (Between na nb) | na == nb = Just na
exactly _ = Nothing
{-# INLINE exactly #-}

-- | The 'Size' of the given code point.
charSize :: Char -> Size
charSize c
| ord c < 0x10000 = exactSize 1
| otherwise = exactSize 2

-- | The 'Size' of @n@ code points.
codePointsSize :: Int -> Size
codePointsSize n = Between n (2*n)
{-# INLINE codePointsSize #-}

exactSize :: Int -> Size
exactSize n =
#if defined(ASSERTS)
Expand All @@ -71,6 +89,10 @@ betweenSize m n =
Between m n
{-# INLINE betweenSize #-}

unionSize :: Size -> Size -> Size
unionSize (Between a b) (Between c d) = Between (min a c) (max b d)
unionSize _ _ = Unknown

unknownSize :: Size
unknownSize = Unknown
{-# INLINE unknownSize #-}
Expand Down Expand Up @@ -140,11 +162,15 @@ lowerBound _ (Between n _) = n
lowerBound k _ = k
{-# INLINE lowerBound #-}

compareSize :: Size -> Int -> Maybe Ordering
compareSize (Between ma mb) n
| mb < n = Just LT
| ma > n = Just GT
| ma == n && mb == n = Just EQ
-- | Determine the ordering relationship between two 'Size's, or 'Nothing' in
-- the indeterminate case.
compareSize :: Size -> Size -> Maybe Ordering
compareSize (Between ma mb) (Between na nb)
| mb < na = Just LT
| ma > nb = Just GT
| ma == mb
, ma == na
, ma == nb = Just EQ
compareSize _ _ = Nothing


Expand Down
8 changes: 4 additions & 4 deletions Data/Text/Internal/Fusion/Types.hs
Original file line number Diff line number Diff line change
Expand Up @@ -75,16 +75,16 @@ instance (Ord a) => Ord (Stream a) where
-- unstreaming functions must be able to cope with the hint being too
-- small or too large.
--
-- The size hint tries to track the UTF-16 code points in a stream,
-- but often counts the number of characters instead. It can easily
-- The size hint tries to track the UTF-16 code units in a stream,
-- but often counts the number of code points instead. It can easily
-- undercount if, for instance, a transformed stream contains astral
-- plane characters (those above 0x10000).
-- plane code points (those above 0x10000).

data Stream a =
forall s. Stream
(s -> Step s a) -- stepper function
!s -- current state
!Size -- size hint
!Size -- size hint in code units

-- | /O(n)/ Determines if two streams are equal.
eq :: (Eq a) => Stream a -> Stream a -> Bool
Expand Down
11 changes: 11 additions & 0 deletions tests/Tests/Regressions.hs
Original file line number Diff line number Diff line change
Expand Up @@ -72,11 +72,22 @@ mapAccumL_resize = do
assertEqual "mapAccumL should correctly size buffers for two-word results"
(count * 2) (T.lengthWord16 (snd val))

-- See GitHub #197
t197 :: IO ()
t197 =
assertBool "length (filter (==',') \"0,00\") should be 1" (currencyParser "0,00")
where
currencyParser x = cond == 1
where
cond = length fltr
fltr = filter (== ',') x

tests :: F.Test
tests = F.testGroup "Regressions"
[ F.testCase "hGetContents_crash" hGetContents_crash
, F.testCase "lazy_encode_crash" lazy_encode_crash
, F.testCase "mapAccumL_resize" mapAccumL_resize
, F.testCase "replicate_crash" replicate_crash
, F.testCase "utf8_decode_unsafe" utf8_decode_unsafe
, F.testCase "t197" t197
]
2 changes: 1 addition & 1 deletion text.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ test-suite tests
include-dirs: include

ghc-options:
-Wall -threaded -O0 -rtsopts
-Wall -threaded -O1 -rtsopts

cpp-options:
-DASSERTS -DHAVE_DEEPSEQ -DTEST_SUITE
Expand Down

0 comments on commit 548ca22

Please sign in to comment.