Permalink
Browse files

First stab at lazy text support

--HG--
extra : convert_revision : afe709f5182304af650e67bc669334531ee12016
  • Loading branch information...
1 parent 9f53a93 commit 1175a7401efda26dd64ab675cd6f17ff23f75350 @bos bos committed Feb 28, 2009
Showing with 330 additions and 274 deletions.
  1. +5 −23 Data/Text.hs
  2. +5 −51 Data/Text/Fusion.hs
  3. +8 −0 Data/Text/Internal.hs
  4. +95 −0 Data/Text/Lazy/Internal.hs
  5. +204 −200 tests/Properties.hs
  6. +9 −0 tests/QuickCheckUtils.hs
  7. +4 −0 text.cabal
View
@@ -158,8 +158,10 @@ import Data.Word (Word16)
import Data.String (IsString(..))
import qualified Data.Text.Fusion as S
+import qualified Data.Text.Fusion.Internal as S
import Data.Text.Fusion (Stream(..), Step(..), stream, reverseStream, unstream)
-import Data.Text.Internal (Text(..), empty, text)
+
+import Data.Text.Internal (Text(..), empty, text, textP)
import qualified Prelude as P
import Data.Text.Unsafe (iter, iter_, unsafeHead, unsafeTail)
import Data.Text.UnsafeChar (unsafeChr)
@@ -195,26 +197,13 @@ instance IsString Text where
--
-- This function is subject to array fusion.
pack :: String -> Text
-pack str = (unstream (stream_list str))
- where
- stream_list s0 = S.Stream next s0 (P.length s0) -- total guess
- where
- next [] = S.Done
- next (x:xs) = S.Yield x xs
+pack = unstream . S.streamList
{-# INLINE [1] pack #-}
--- TODO: Has to do validation! -- No, it doesn't, the
-- | /O(n)/ Convert a Text into a String.
-- Subject to array fusion.
unpack :: Text -> String
-unpack txt = (unstream_list (stream txt))
- where
- unstream_list (S.Stream next s0 _len) = unfold s0
- where
- unfold !s = case next s of
- S.Done -> []
- S.Skip s' -> unfold s'
- S.Yield x s' -> x : unfold s'
+unpack = S.unstreamList . stream
{-# INLINE [1] unpack #-}
-- | /O(1)/ Convert a character into a Text.
@@ -312,13 +301,6 @@ last (Text arr off len)
S.last (stream t) = last t
#-}
--- | Construct a 'Text' without invisibly pinning its byte array in
--- memory if its length has dwindled to zero.
-textP :: A.Array Word16 -> Int -> Int -> Text
-textP arr off len | len == 0 = empty
- | otherwise = text arr off len
-{-# INLINE textP #-}
-
-- | /O(1)/ Returns all characters after the head of a 'Text', which
-- must be non-empty. Subject to array fusion.
tail :: Text -> Text
View
@@ -1,4 +1,4 @@
-{-# LANGUAGE ExistentialQuantification, BangPatterns, MagicHash #-}
+{-# LANGUAGE BangPatterns, MagicHash #-}
-- |
-- Module : Data.Text.Fusion
@@ -22,6 +22,7 @@ module Data.Text.Fusion
-- * Creation and elimination
, stream
+ , streamList
, unstream
, reverseStream
, empty
@@ -114,40 +115,13 @@ import Data.Char (ord)
import Data.Text.Internal (Text(..))
import Data.Text.UnsafeChar (unsafeChr, unsafeWrite, unsafeWriteRev)
import qualified Data.Text.Array as A
+import Data.Text.Fusion.Internal
import qualified Data.Text.Internal as I
import qualified Data.Text.Encoding.Utf16 as U16
import qualified Prelude as P
default(Int)
-infixl 2 :!:
-data PairS a b = !a :!: !b
-
--- | Allow a function over a stream to switch between two states.
-data Switch = S1 | S2
-
-data Stream a =
- forall s. Stream
- (s -> Step s a) -- stepper function
- !s -- current state
- {-# UNPACK #-}!Int -- length hint
-
--- The length hint in a Stream has two roles. If its value is zero,
--- we trust it, and treat the stream as empty. Otherwise, we treat it
--- as a hint: it should usually be accurate, so we use it when
--- unstreaming to decide what size array to allocate. However, the
--- unstreaming functions must be able to cope with the hint being too
--- small or too large.
---
--- The size hint tries to track the UTF-16 code points in a stream,
--- but often counts the number of characters instead. It can easily
--- undercount if, for instance, a transformed stream contains astral
--- plane characters (those above 0x10000).
-
-data Step s a = Done
- | Skip !s
- | Yield !a !s
-
-- | /O(n)/ Convert a 'Text' into a 'Stream Char'.
stream :: Text -> Stream Char
stream (Text arr off len) = Stream next off len
@@ -182,7 +156,7 @@ reverseStream (Text arr off len) = Stream next (off+len-1) len
unstream :: Stream Char -> Text
unstream (Stream next0 s0 len)
| len == 0 = I.empty
- | otherwise = Text (P.fst a) 0 (P.snd a)
+ | otherwise = I.textP (P.fst a) 0 (P.snd a)
where
a = runST (A.unsafeNew len >>= (\arr -> loop arr len s0 0))
loop arr !top !s !i
@@ -198,26 +172,6 @@ unstream (Stream next0 s0 len)
{-# INLINE [0] unstream #-}
{-# RULES "STREAM stream/unstream fusion" forall s. stream (unstream s) = s #-}
--- | The empty stream.
-empty :: Stream Char
-empty = Stream next () 0
- where next _ = Done
-{-# INLINE [0] empty #-}
-
--- | /O(n)/ Determines if two streams are equal.
-eq :: Ord a => Stream a -> Stream a -> Bool
-eq (Stream next1 s1 _) (Stream next2 s2 _) = cmp (next1 s1) (next2 s2)
- where
- cmp Done Done = True
- cmp Done _ = False
- cmp _ Done = False
- cmp (Skip s1') (Skip s2') = cmp (next1 s1') (next2 s2')
- cmp (Skip s1') x2 = cmp (next1 s1') x2
- cmp x1 (Skip s2') = cmp x1 (next2 s2')
- cmp (Yield x1 s1') (Yield x2 s2') = x1 == x2 &&
- cmp (next1 s1') (next2 s2')
-{-# SPECIALISE eq :: Stream Char -> Stream Char -> Bool #-}
-
streamError :: String -> String -> a
streamError func msg = P.error $ "Data.Text.Fusion." ++ func ++ ": " ++ msg
@@ -403,7 +357,7 @@ intersperse c (Stream next0 s0 len) = Stream next (s0 :!: Nothing :!: S1) len
reverse :: Stream Char -> Text
reverse (Stream next s len0)
| len0 == 0 = I.empty
- | otherwise = Text arr off' len'
+ | otherwise = I.textP arr off' len'
where
len0' = max len0 4
(arr, (off', len')) = A.run2 (A.unsafeNew len0' >>= loop s (len0'-1) len0')
View
@@ -21,6 +21,7 @@ module Data.Text.Internal
Text(..)
-- * Construction
, text
+ , textP
-- * Code that must be here for accessibility
, empty
-- * Debugging
@@ -56,6 +57,13 @@ empty :: Text
empty = Text A.empty 0 0
{-# INLINE [1] empty #-}
+-- | Construct a 'Text' without invisibly pinning its byte array in
+-- memory if its length has dwindled to zero.
+textP :: A.Array Word16 -> Int -> Int -> Text
+textP arr off len | len == 0 = empty
+ | otherwise = text arr off len
+{-# INLINE textP #-}
+
-- | A useful 'show'-like function for debugging purposes.
showText :: Text -> String
showText (Text arr off len) =
View
@@ -0,0 +1,95 @@
+{-# LANGUAGE BangPatterns, DeriveDataTypeable #-}
+-- |
+-- Module : Data.Text.Lazy.Internal
+-- Copyright : (c) Bryan O'Sullivan 2009
+--
+-- License : BSD-style
+-- Maintainer : bos@serpentine.com
+-- Stability : experimental
+-- Portability : GHC
+--
+-- A module containing semi-public 'Text' internals. This exposes the
+-- 'Text' representation and low level construction functions.
+-- Modules which extend the 'Text' system may need to use this module.
+-- Regular users should not.
+module Data.Text.Lazy.Internal
+ (
+ Text(..)
+ , chunk
+ , foldrChunks
+ , foldlChunks
+ -- * Data type invariant and abstraction functions
+ , invariant
+ , checkInvariant
+ , showStructure
+ -- * Chunk allocation sizes
+ , defaultChunkSize
+ , smallChunkSize
+ , chunkOverhead
+ ) where
+
+import qualified Data.Text.Internal as T
+import qualified Data.Text as T
+import Data.Typeable (Typeable)
+import Data.Word (Word16)
+import Foreign.Storable (sizeOf)
+
+data Text = Empty
+ | Chunk {-# UNPACK #-} !T.Text Text
+ deriving (Typeable)
+
+-- | The data type invariant: Every 'Text' is either 'Empty' or
+-- consists of non-null 'T.Text's. All functions must preserve this,
+-- and the QC properties must check this.
+invariant :: Text -> Bool
+invariant Empty = True
+invariant (Chunk (T.Text _ _ len) cs) = len > 0 && invariant cs
+
+showStructure :: Text -> String
+showStructure Empty = "Empty"
+showStructure (Chunk t Empty) = "Chunk " ++ show t ++ " Empty"
+showStructure (Chunk t ts) =
+ "Chunk " ++ show t ++ " (" ++ showStructure ts ++ ")"
+
+-- | In a form that checks the invariant lazily.
+checkInvariant :: Text -> Text
+checkInvariant Empty = Empty
+checkInvariant (Chunk c@(T.Text _ _ len) cs)
+ | len > 0 = Chunk c (checkInvariant cs)
+ | otherwise = error $ "Data.Text.Lazy: invariant violation: "
+ ++ showStructure (Chunk c cs)
+
+-- | Smart constructor for 'Chunk'. Guarantees the data type invariant.
+chunk :: T.Text -> Text -> Text
+{-# INLINE chunk #-}
+chunk t@(T.Text _ _ len) ts | len == 0 = ts
+ | otherwise = Chunk t ts
+
+-- | Consume the chunks of a lazy 'Text' with a natural right fold.
+foldrChunks :: (T.Text -> a -> a) -> a -> Text -> a
+foldrChunks f z = go
+ where go Empty = z
+ go (Chunk c cs) = f c (go cs)
+{-# INLINE foldrChunks #-}
+
+-- | Consume the chunks of a lazy 'Text' with a strict, tail-recursive,
+-- accumulating left fold.
+foldlChunks :: (a -> T.Text -> a) -> a -> Text -> a
+foldlChunks f z = go z
+ where go !a Empty = a
+ go !a (Chunk c cs) = go (f a c) cs
+{-# INLINE foldlChunks #-}
+
+-- | Currently set to 32k, less the memory management overhead.
+defaultChunkSize :: Int
+defaultChunkSize = 32 * k - chunkOverhead
+ where k = 1024 `div` sizeOf (undefined :: Word16)
+
+-- | Currently set to 4k, less the memory management overhead.
+smallChunkSize :: Int
+smallChunkSize = 4 * k - chunkOverhead
+ where k = 1024 `div` sizeOf (undefined :: Word16)
+
+-- | The memory management overhead. Currently this is tuned for GHC only.
+chunkOverhead :: Int
+chunkOverhead = 2 * sizeOf (undefined :: Int)
Oops, something went wrong.

0 comments on commit 1175a74

Please sign in to comment.