Interleave type conversion and parsing

This drastically reduces memory usage of decode as we now need to hold
on to O(n * sizeof(target_type)) bytes instead of O(n * sizeof(Vector
ByteString)) bytes. The latter is typically much larger, due to the high
constant-factor overhead of Vector and ByteString.
1 parent d926bd1 · commit 76451b47331b0558ecdb2ad8bebbe22bdaabb3d9 · @tibbe committed on Feb 13, 2013
Showing with 99 additions and 47 deletions.
  1. +79 −39 Data/Csv/Encoding.hs
  2. +1 −1 Data/Csv/Incremental.hs
  3. +1 −6 Data/Csv/Parser.hs
  4. +7 −0 Data/Csv/Types.hs
  5. +11 −1 Data/Csv/Util.hs
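
To make the memory claim in the commit message concrete, here is a small usage sketch. The Person type and the inline CSV data are invented for illustration; decodeWith, defaultDecodeOptions, and (.!) are assumed to be exported from Data.Csv in this version of the library, and decodeWith's shape (options, skip-header flag, lazy ByteString) is taken from the hunk below. With conversion interleaved into parsing, each row is turned into a Person as soon as it is parsed, so only the compact Person values are accumulated rather than one Vector of ByteString fields per row.

module Main (main) where

import Control.Applicative ((<$>), (<*>))
import Control.Monad (mzero)
import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy.Char8 as BL8
import Data.Csv (FromRecord(..), decodeWith, defaultDecodeOptions, (.!))
import qualified Data.Vector as V

-- Invented target type: two small fields per row instead of a whole
-- Vector of ByteString fields.
data Person = Person
    { personName :: !B.ByteString
    , personAge  :: !Int
    }

instance FromRecord Person where
    parseRecord v
        | V.length v == 2 = Person <$> v .! 0 <*> v .! 1
        | otherwise       = mzero

main :: IO ()
main = do
    -- Invented input; no header line, so the skip-header flag is False.
    let csvData = BL8.pack "John,27\nJane,28\n"
    case decodeWith defaultDecodeOptions False csvData of
        Left err -> putStrLn ("decode failed: " ++ err)
        Right ps -> print (V.length (ps :: V.Vector Person))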
Data/Csv/Encoding.hs
@@ -10,7 +10,7 @@
--
-- Encoding and decoding of data types into CSV.
module Data.Csv.Encoding
- (
+ (
-- * Encoding and decoding
decode
, decodeByName
@@ -31,7 +31,8 @@ module Data.Csv.Encoding
import Blaze.ByteString.Builder (Builder, fromByteString, fromWord8,
toLazyByteString)
import Blaze.ByteString.Builder.Char8 (fromString)
-import Control.Applicative ((*>), (<$>), (<*>), pure)
+import Control.Applicative ((*>), (<|>), optional, pure)
+import Data.Attoparsec.Char8 (endOfInput, endOfLine)
import qualified Data.Attoparsec.ByteString.Lazy as AL
import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as B8
@@ -45,10 +46,14 @@ import Data.Word (Word8)
import Prelude hiding (unlines)
import Data.Csv.Compat.Monoid ((<>))
-import Data.Csv.Conversion
-import Data.Csv.Parser
-import Data.Csv.Types
-import Data.Csv.Util ((<$!>))
+import Data.Csv.Conversion (FromNamedRecord, FromRecord, ToNamedRecord,
+ ToRecord, parseNamedRecord, parseRecord, runParser,
+ toNamedRecord, toRecord)
+import Data.Csv.Parser hiding (csv, csvWithHeader)
+import qualified Data.Csv.Parser as Parser
+import Data.Csv.Types hiding (toNamedRecord)
+import qualified Data.Csv.Types as Types
+import Data.Csv.Util (blankLine)
-- TODO: 'encode' isn't as efficient as it could be.
@@ -97,22 +102,9 @@ decodeWith :: FromRecord a
-- skipped
-> L.ByteString -- ^ CSV data
-> Either String (Vector a)
-decodeWith = decodeWithC (runParser . parseCsv)
+decodeWith = decodeWithC csv
{-# INLINE [1] decodeWith #-}
-parseCsv :: FromRecord a => Csv -> Parser (Vector a)
-parseCsv xs = V.fromList <$!> mapM' parseRecord (V.toList xs)
-
-mapM' :: Monad m => (a -> m b) -> [a] -> m [b]
-mapM' f = go
- where
- go [] = return []
- go (x:xs) = do
- !y <- f x
- ys <- go xs
- return (y : ys)
-{-# INLINE mapM' #-}
-
{-# RULES
"idDecodeWith" decodeWith = idDecodeWith
#-}
@@ -121,14 +113,17 @@ mapM' f = go
-- conversion is performed.
idDecodeWith :: DecodeOptions -> Bool -> L.ByteString
-> Either String (Vector (Vector B.ByteString))
-idDecodeWith = decodeWithC pure
-
-decodeWithC :: (Csv -> Either String a) -> DecodeOptions -> Bool -> L.ByteString
- -> Either String a
-decodeWithC convert !opts skipHeader = decodeWithP parser convert
+idDecodeWith = decodeWithC Parser.csv
+
+-- | Decode CSV data using the provided parser, skipping a leading
+-- header if 'skipHeader' is 'True'. Returns 'Left' @errMsg@ on
+-- failure.
+decodeWithC :: (DecodeOptions -> AL.Parser a) -> DecodeOptions -> Bool
+ -> BL8.ByteString -> Either String a
+decodeWithC p !opts skipHeader = decodeWithP parser
where parser
- | skipHeader = header (decDelimiter opts) *> csv opts
- | otherwise = csv opts
+ | skipHeader = header (decDelimiter opts) *> p opts
+ | otherwise = p opts
{-# INLINE decodeWithC #-}
-- | Like 'decodeByName', but lets you customize how the CSV data is
@@ -137,12 +132,7 @@ decodeByNameWith :: FromNamedRecord a
=> DecodeOptions -- ^ Decoding options
-> L.ByteString -- ^ CSV data
-> Either String (Header, Vector a)
-decodeByNameWith !opts =
- decodeWithP (csvWithHeader opts)
- (\ (hdr, vs) -> (,) <$> pure hdr <*> (runParser $ parseNamedCsv vs))
-
-parseNamedCsv :: FromNamedRecord a => Vector NamedRecord -> Parser (Vector a)
-parseNamedCsv xs = V.fromList <$!> mapM' parseNamedRecord (V.toList xs)
+decodeByNameWith !opts = decodeWithP (csvWithHeader opts)
-- | Options that control how data is encoded. These options can be
-- used to e.g. encode data in a tab-separated format instead of in a
@@ -237,12 +227,62 @@ prependToAll :: Builder -> [Builder] -> [Builder]
prependToAll _ [] = []
prependToAll sep (x:xs) = sep <> x : prependToAll sep xs
-decodeWithP :: AL.Parser a -> (a -> Either String b) -> L.ByteString -> Either String b
-decodeWithP p to s =
+decodeWithP :: AL.Parser a -> L.ByteString -> Either String a
+decodeWithP p s =
case AL.parse p s of
- AL.Done _ v -> case to v of
- Right a -> Right a
- Left msg -> Left $ "conversion error: " ++ msg
+ AL.Done _ v -> Right v
AL.Fail left _ msg -> Left $ "parse error (" ++ msg ++ ") at " ++
show (BL8.unpack left)
-{-# INLINE decodeWithP #-}
+{-# INLINE decodeWithP #-}
+
+-- These alternative implementations of the 'csv' and 'csvWithHeader'
+-- parsers from the 'Parser' module perform the
+-- 'FromRecord'/'FromNamedRecord' conversions on-the-fly, thereby
+-- avoiding the need to hold a big 'Csv' value in memory. The 'Csv'
+-- type has quite a large memory overhead due to the high constant
+-- overheads of 'B.ByteString' and 'V.Vector'.
+
+-- TODO: Check that the error messages don't duplicate prefixes, as in
+-- "parse error: conversion error: ...".
+
+-- | Parse a CSV file that does not include a header.
+csv :: FromRecord a => DecodeOptions -> AL.Parser (V.Vector a)
+csv !opts = do
+ vals <- records
+ _ <- optional endOfLine
+ endOfInput
+ return $! V.fromList vals
+ where
+ records = do
+ !r <- record (decDelimiter opts)
+ if blankLine r
+ then (endOfLine *> records) <|> pure []
+ else case runParser (parseRecord r) of
+ Left msg -> fail $ "conversion error: " ++ msg
+ Right val -> do
+ !vals <- (endOfLine *> records) <|> pure []
+ return (val : vals)
+{-# INLINE csv #-}
+
+-- | Parse a CSV file that includes a header.
+csvWithHeader :: FromNamedRecord a => DecodeOptions
+ -> AL.Parser (Header, V.Vector a)
+csvWithHeader !opts = do
+ !hdr <- header (decDelimiter opts)
+ vals <- records hdr
+ _ <- optional endOfLine
+ endOfInput
+ let !v = V.fromList vals
+ return (hdr, v)
+ where
+ records hdr = do
+ !r <- record (decDelimiter opts)
+ if blankLine r
+ then (endOfLine *> records hdr) <|> pure []
+ else case runParser (convert hdr r) of
+ Left msg -> fail $ "conversion error: " ++ msg
+ Right val -> do
+ !vals <- (endOfLine *> records hdr) <|> pure []
+ return (val : vals)
+
+ convert hdr = parseNamedRecord . Types.toNamedRecord hdr
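
The header-based path gets the same treatment: the csvWithHeader parser above converts each row with parseNamedRecord as it parses. A hypothetical caller sketch, analogous to the one after the file list (the Person type and inline data are again invented; decodeByNameWith's signature is taken from the hunk above):

import Control.Applicative ((<$>), (<*>))
import qualified Data.ByteString.Char8 as B8
import qualified Data.ByteString.Lazy.Char8 as BL8
import Data.Csv (FromNamedRecord(..), decodeByNameWith, defaultDecodeOptions, (.:))
import qualified Data.Vector as V

data Person = Person
    { personName :: !B8.ByteString
    , personAge  :: !Int
    }

instance FromNamedRecord Person where
    parseNamedRecord m =
        Person <$> m .: B8.pack "name" <*> m .: B8.pack "age"

main :: IO ()
main = do
    let csvData = BL8.pack "name,age\nJohn,27\nJane,28\n"
    case decodeByNameWith defaultDecodeOptions csvData of
        Left err         -> putStrLn ("decode failed: " ++ err)
        Right (_hdr, ps) -> print (V.length (ps :: V.Vector Person))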
Data/Csv/Incremental.hs
@@ -45,7 +45,7 @@ import qualified Data.Vector as V
import Data.Csv.Conversion hiding (Parser, record, toNamedRecord)
import qualified Data.Csv.Conversion as Conversion
import Data.Csv.Parser
-import Data.Csv.Types
+import Data.Csv.Types hiding (toNamedRecord)
-- $feed-header
--
Data/Csv/Parser.hs
@@ -35,13 +35,12 @@ import Data.Attoparsec.Types (Parser)
import qualified Data.Attoparsec.Zepto as Z
import qualified Data.ByteString as S
import qualified Data.ByteString.Unsafe as S
-import qualified Data.HashMap.Strict as HM
import Data.Monoid (mappend, mempty)
import qualified Data.Vector as V
import Data.Word (Word8)
import Data.Csv.Types
-import Data.Csv.Util ((<$!>))
+import Data.Csv.Util ((<$!>), blankLine)
-- | Options that control how data is decoded. These options can be
-- used to e.g. decode tab-separated data instead of comma-separated
@@ -103,9 +102,6 @@ csvWithHeader !opts = do
let !v = V.fromList vals
return (hdr, v)
-toNamedRecord :: Header -> Record -> NamedRecord
-toNamedRecord hdr v = HM.fromList . V.toList $ V.zip hdr v
-
-- | Parse a header, including the terminating line separator.
header :: Word8 -- ^ Field delimiter
-> AL.Parser Header
@@ -118,7 +114,6 @@ name !delim = field delim
removeBlankLines :: [Record] -> [Record]
removeBlankLines = filter (not . blankLine)
- where blankLine v = V.length v == 1 && (S.null (V.head v))
-- | Parse a record, not including the terminating line separator. The
-- terminating line separator is not included as the last record in a
Data/Csv/Types.hs
@@ -7,11 +7,13 @@ module Data.Csv.Types
, Name
, NamedRecord
, Field
+ , toNamedRecord
) where
import qualified Data.ByteString as S
import qualified Data.HashMap.Strict as HM
import Data.Vector (Vector)
+import qualified Data.Vector as V
-- | CSV data represented as a Haskell vector of vector of
-- bytestrings.
@@ -34,3 +36,8 @@ type NamedRecord = HM.HashMap S.ByteString S.ByteString
-- | A single field within a record.
type Field = S.ByteString
+
+-- | Convert a 'Record' to a 'NamedRecord' by attaching column names.
+-- The 'Header' and 'Record' must be of the same length.
+toNamedRecord :: Header -> Record -> NamedRecord
+toNamedRecord hdr v = HM.fromList . V.toList $ V.zip hdr v
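
A quick, invented illustration of the new helper (B8 stands for a qualified import of Data.ByteString.Char8, which this module does not actually import):

toNamedRecord (V.fromList [B8.pack "name", B8.pack "age"])
              (V.fromList [B8.pack "John", B8.pack "27"])
    == HM.fromList [ (B8.pack "name", B8.pack "John")
                   , (B8.pack "age",  B8.pack "27") ]
-- evaluates to True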
Data/Csv/Util.hs
@@ -1,4 +1,10 @@
-module Data.Csv.Util ((<$!>)) where
+module Data.Csv.Util
+ ((<$!>)
+ , blankLine
+ ) where
+
+import qualified Data.ByteString as B
+import qualified Data.Vector as V
-- | A strict version of 'Data.Functor.<$>' for monads.
(<$!>) :: Monad m => (a -> b) -> m a -> m b
@@ -8,3 +14,7 @@ f <$!> m = do
{-# INLINE (<$!>) #-}
infixl 4 <$!>
+
+-- | Is this an empty record (i.e. a blank line)?
+blankLine :: V.Vector B.ByteString -> Bool
+blankLine v = V.length v == 1 && (B.null (V.head v))
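
Two invented cases to pin down what the predicate accepts: the parser represents an empty line as a record with exactly one empty field, so that is the only shape treated as blank:

blankLine (V.singleton B.empty)           == True   -- one empty field: an empty line
blankLine (V.fromList [B.empty, B.empty]) == False  -- two empty fields: a line containing just ","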
