Permalink
Browse files

Reduce the amount of code generated for string literals

Previously, every Text literal generated a big wad of inefficient
code:

    {-# LANGUAGE OverloadedStrings #-}
    foo :: Text
    foo = "foo"

This would first convert to a String, then to a Text.  To make
matters worse, the code for conversion from String to Text was
inlined at every site where a string literal occurred (expected,
but undesired, behaviour).

In this change, we introduce a direct conversion from Addr# to each
of the Text types, and we ensure that uses of these never result in
excessive code generation.
  • Loading branch information...
1 parent 0d6d979 commit 2b2cb084c4689c06f1a7851ff8eb1e412eb02c1b @bos bos committed Apr 8, 2012
Showing with 69 additions and 6 deletions.
  1. +17 −2 Data/Text.hs
  2. +35 −2 Data/Text/Fusion/Common.hs
  3. +17 −2 Data/Text/Lazy.hs
View
@@ -1,9 +1,9 @@
-{-# LANGUAGE BangPatterns, CPP, Rank2Types, UnboxedTuples #-}
+{-# LANGUAGE BangPatterns, CPP, MagicHash, Rank2Types, UnboxedTuples #-}
{-# OPTIONS_GHC -fno-warn-orphans #-}
-- |
-- Module : Data.Text
--- Copyright : (c) 2009, 2010, 2011 Bryan O'Sullivan,
+-- Copyright : (c) 2009, 2010, 2011, 2012 Bryan O'Sullivan,
-- (c) 2009 Duncan Coutts,
-- (c) 2008, 2009 Tom Harper
--
@@ -222,6 +222,8 @@ import Data.ByteString (ByteString)
import qualified Data.Text.Lazy as L
import Data.Int (Int64)
#endif
+import qualified GHC.CString as GHC
+import GHC.Prim (Addr#)
-- $strict
--
@@ -372,6 +374,19 @@ unpack :: Text -> String
unpack = S.unstreamList . stream
{-# INLINE [1] unpack #-}
+-- | /O(n)/ Convert a literal string into a Text.
+unpackCString# :: Addr# -> Text
+unpackCString# addr# = unstream (S.streamCString# addr#)
+{-# NOINLINE unpackCString# #-}
+
+{-# RULES "TEXT literal" forall a.
+ unstream (S.streamList (L.map safe (GHC.unpackCString# a)))
+ = unpackCString# a #-}
+
+{-# RULES "TEXT literal UTF8" forall a.
+ unstream (S.streamList (L.map safe (GHC.unpackCStringUtf8# a)))
+ = unpackCString# a #-}
+
-- | /O(1)/ Convert a character into a Text. Subject to fusion.
-- Performs replacement on invalid scalar values.
singleton :: Char -> Text
View
@@ -1,7 +1,7 @@
-{-# LANGUAGE BangPatterns, Rank2Types #-}
+{-# LANGUAGE BangPatterns, MagicHash, Rank2Types #-}
-- |
-- Module : Data.Text.Fusion.Common
--- Copyright : (c) Bryan O'Sullivan 2009
+-- Copyright : (c) Bryan O'Sullivan 2009, 2012
--
-- License : BSD-style
-- Maintainer : bos@serpentine.com, rtomharper@googlemail.com,
@@ -17,6 +17,7 @@ module Data.Text.Fusion.Common
singleton
, streamList
, unstreamList
+ , streamCString#
-- * Basic interface
, cons
@@ -104,10 +105,13 @@ import Prelude (Bool(..), Char, Eq(..), Int, Integral, Maybe(..),
(&&), fromIntegral, otherwise)
import qualified Data.List as L
import qualified Prelude as P
+import Data.Bits (shiftL)
import Data.Int (Int64)
import Data.Text.Fusion.Internal
import Data.Text.Fusion.CaseMapping (foldMapping, lowerMapping, upperMapping)
import Data.Text.Fusion.Size
+import GHC.Prim (Addr#, chr#, indexCharOffAddr#, ord#)
+import GHC.Types (Char(..), Int(..))
singleton :: Char -> Stream Char
singleton c = Stream next False 1
@@ -131,6 +135,35 @@ unstreamList (Stream next s0 _len) = unfold s0
{-# RULES "STREAM streamList/unstreamList fusion" forall s. streamList (unstreamList s) = s #-}
+-- | Stream the UTF-8-like packed encoding used by GHC to represent
+-- constant strings in generated code.
+--
+-- This encoding uses the byte sequence "\xc0\x80" to represent NUL,
+-- and the string is NUL-terminated.
+streamCString# :: Addr# -> Stream Char
+streamCString# addr = Stream step 0 unknownSize
+ where
+ step !i
+ | b == 0 = Done
+ | b <= 0x7f = Yield (C# b#) (i+1)
+ | b <= 0xdf = let !c = chr $ ((b-0xc0) `shiftL` 6) + next 1
+ in Yield c (i+2)
+ | b <= 0xef = let !c = chr $ ((b-0xe0) `shiftL` 12) +
+ (next 1 `shiftL` 6) +
+ next 2
+ in Yield c (i+3)
+ | otherwise = let !c = chr $ ((b-0xf0) `shiftL` 18) +
+ (next 1 `shiftL` 12) +
+ (next 2 `shiftL` 6) +
+ next 3
+ in Yield c (i+4)
+ where b = I# (ord# b#)
+ next n = I# (ord# (at# (i+n))) - 0x80
+ b# = at# i
+ at# (I# i#) = indexCharOffAddr# addr i#
+ chr (I# i#) = C# (chr# i#)
+{-# INLINE [0] streamCString# #-}
+
-- ----------------------------------------------------------------------------
-- * Basic stream functions
View
@@ -1,8 +1,8 @@
{-# OPTIONS_GHC -fno-warn-orphans #-}
-{-# LANGUAGE BangPatterns, CPP #-}
+{-# LANGUAGE BangPatterns, MagicHash, CPP #-}
-- |
-- Module : Data.Text.Lazy
--- Copyright : (c) 2009, 2010 Bryan O'Sullivan
+-- Copyright : (c) 2009, 2010, 2012 Bryan O'Sullivan
--
-- License : BSD-style
-- Maintainer : bos@serpentine.com, rtomharper@googlemail.com,
@@ -217,6 +217,8 @@ import Data.Text.Lazy.Internal (Text(..), chunk, empty, foldlChunks, foldrChunks
import Data.Text.Internal (firstf, safe, textP)
import qualified Data.Text.Util as U
import Data.Text.Lazy.Search (indices)
+import qualified GHC.CString as GHC
+import GHC.Prim (Addr#)
-- $fusion
--
@@ -351,6 +353,19 @@ unpack :: Text -> String
unpack t = S.unstreamList (stream t)
{-# INLINE [1] unpack #-}
+-- | /O(n)/ Convert a literal string into a Text.
+unpackCString# :: Addr# -> Text
+unpackCString# addr# = unstream (S.streamCString# addr#)
+{-# NOINLINE unpackCString# #-}
+
+{-# RULES "TEXT literal" forall a.
+ unstream (S.streamList (L.map safe (GHC.unpackCString# a)))
+ = unpackCString# a #-}
+
+{-# RULES "TEXT literal UTF8" forall a.
+ unstream (S.streamList (L.map safe (GHC.unpackCStringUtf8# a)))
+ = unpackCString# a #-}
+
-- | /O(1)/ Convert a character into a Text. Subject to fusion.
-- Performs replacement on invalid scalar values.
singleton :: Char -> Text

0 comments on commit 2b2cb08

Please sign in to comment.