Skip to content

Commit

Permalink
Interleave type conversion and parsing
Browse files Browse the repository at this point in the history
This drastically reduces memory usage of decode as we now need to hold
on to O(n * sizeof(target_type)) bytes instead of O(n * sizeof(Vector
ByteString)) bytes. The latter is typically much larger, due to the high
constant-factor overhead of Vector and ByteString.
  • Loading branch information
tibbe committed Feb 24, 2013
1 parent d926bd1 commit 76451b4
Show file tree
Hide file tree
Showing 5 changed files with 99 additions and 47 deletions.
118 changes: 79 additions & 39 deletions Data/Csv/Encoding.hs
Expand Up @@ -10,7 +10,7 @@
--
-- Encoding and decoding of data types into CSV.
module Data.Csv.Encoding
(
(
-- * Encoding and decoding
decode
, decodeByName
Expand All @@ -31,7 +31,8 @@ module Data.Csv.Encoding
import Blaze.ByteString.Builder (Builder, fromByteString, fromWord8,
toLazyByteString)
import Blaze.ByteString.Builder.Char8 (fromString)
import Control.Applicative ((*>), (<$>), (<*>), pure)
import Control.Applicative ((*>), (<|>), optional, pure)
import Data.Attoparsec.Char8 (endOfInput, endOfLine)
import qualified Data.Attoparsec.ByteString.Lazy as AL
import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as B8
Expand All @@ -45,10 +46,14 @@ import Data.Word (Word8)
import Prelude hiding (unlines)

import Data.Csv.Compat.Monoid ((<>))
import Data.Csv.Conversion
import Data.Csv.Parser
import Data.Csv.Types
import Data.Csv.Util ((<$!>))
import Data.Csv.Conversion (FromNamedRecord, FromRecord, ToNamedRecord,
ToRecord, parseNamedRecord, parseRecord, runParser,
toNamedRecord, toRecord)
import Data.Csv.Parser hiding (csv, csvWithHeader)
import qualified Data.Csv.Parser as Parser
import Data.Csv.Types hiding (toNamedRecord)
import qualified Data.Csv.Types as Types
import Data.Csv.Util (blankLine)

-- TODO: 'encode' isn't as efficient as it could be.

Expand Down Expand Up @@ -97,22 +102,9 @@ decodeWith :: FromRecord a
-- skipped
-> L.ByteString -- ^ CSV data
-> Either String (Vector a)
decodeWith = decodeWithC (runParser . parseCsv)
decodeWith = decodeWithC csv
{-# INLINE [1] decodeWith #-}

parseCsv :: FromRecord a => Csv -> Parser (Vector a)
parseCsv xs = V.fromList <$!> mapM' parseRecord (V.toList xs)

mapM' :: Monad m => (a -> m b) -> [a] -> m [b]
mapM' f = go
where
go [] = return []
go (x:xs) = do
!y <- f x
ys <- go xs
return (y : ys)
{-# INLINE mapM' #-}

{-# RULES
"idDecodeWith" decodeWith = idDecodeWith
#-}
Expand All @@ -121,14 +113,17 @@ mapM' f = go
-- conversion is performed.
idDecodeWith :: DecodeOptions -> Bool -> L.ByteString
-> Either String (Vector (Vector B.ByteString))
idDecodeWith = decodeWithC pure

decodeWithC :: (Csv -> Either String a) -> DecodeOptions -> Bool -> L.ByteString
-> Either String a
decodeWithC convert !opts skipHeader = decodeWithP parser convert
idDecodeWith = decodeWithC Parser.csv

-- | Decode CSV data using the provided parser, skipping a leading
-- header if 'skipHeader' is 'True'. Returns 'Left' @errMsg@ on
-- failure.
decodeWithC :: (DecodeOptions -> AL.Parser a) -> DecodeOptions -> Bool
-> BL8.ByteString -> Either String a
decodeWithC p !opts skipHeader = decodeWithP parser
where parser
| skipHeader = header (decDelimiter opts) *> csv opts
| otherwise = csv opts
| skipHeader = header (decDelimiter opts) *> p opts
| otherwise = p opts
{-# INLINE decodeWithC #-}

-- | Like 'decodeByName', but lets you customize how the CSV data is
Expand All @@ -137,12 +132,7 @@ decodeByNameWith :: FromNamedRecord a
=> DecodeOptions -- ^ Decoding options
-> L.ByteString -- ^ CSV data
-> Either String (Header, Vector a)
decodeByNameWith !opts =
decodeWithP (csvWithHeader opts)
(\ (hdr, vs) -> (,) <$> pure hdr <*> (runParser $ parseNamedCsv vs))

parseNamedCsv :: FromNamedRecord a => Vector NamedRecord -> Parser (Vector a)
parseNamedCsv xs = V.fromList <$!> mapM' parseNamedRecord (V.toList xs)
decodeByNameWith !opts = decodeWithP (csvWithHeader opts)

-- | Options that controls how data is encoded. These options can be
-- used to e.g. encode data in a tab-separated format instead of in a
Expand Down Expand Up @@ -237,12 +227,62 @@ prependToAll :: Builder -> [Builder] -> [Builder]
prependToAll _ [] = []
prependToAll sep (x:xs) = sep <> x : prependToAll sep xs

decodeWithP :: AL.Parser a -> (a -> Either String b) -> L.ByteString -> Either String b
decodeWithP p to s =
decodeWithP :: AL.Parser a -> L.ByteString -> Either String a
decodeWithP p s =
case AL.parse p s of
AL.Done _ v -> case to v of
Right a -> Right a
Left msg -> Left $ "conversion error: " ++ msg
AL.Done _ v -> Right v
AL.Fail left _ msg -> Left $ "parse error (" ++ msg ++ ") at " ++
show (BL8.unpack left)
{-# INLINE decodeWithP #-}
{-# INLINE decodeWithP #-}

-- These alternative implementation of the 'csv' and 'csvWithHeader'
-- parsers from the 'Parser' module performs the
-- 'FromRecord'/'FromNamedRecord' conversions ont-the-fly, thereby

This comment has been minimized.

Copy link
@gregorycollins

gregorycollins Feb 25, 2013

s/ont/on/ :)

This comment has been minimized.

Copy link
@tibbe

tibbe Feb 25, 2013

Author Collaborator

Done

-- avoiding the need to hold a big 'CSV' value in memory. The 'CSV'
-- type has a quite large memory overhead due to high constant
-- overheads of 'B.ByteString' and 'V.Vector'.

-- TODO: Check that the error messages don't duplicate prefixes, as in
-- "parse error: conversion error: ...".

-- | Parse a CSV file that does not include a header.
csv :: FromRecord a => DecodeOptions -> AL.Parser (V.Vector a)
csv !opts = do
vals <- records
_ <- optional endOfLine
endOfInput
return $! V.fromList vals
where
records = do
!r <- record (decDelimiter opts)
if blankLine r
then (endOfLine *> records) <|> pure []
else case runParser (parseRecord r) of
Left msg -> fail $ "conversion error: " ++ msg
Right val -> do
!vals <- (endOfLine *> records) <|> pure []
return (val : vals)
{-# INLINE csv #-}

-- | Parse a CSV file that includes a header.
csvWithHeader :: FromNamedRecord a => DecodeOptions
-> AL.Parser (Header, V.Vector a)
csvWithHeader !opts = do
!hdr <- header (decDelimiter opts)
vals <- records hdr
_ <- optional endOfLine
endOfInput
let !v = V.fromList vals
return (hdr, v)
where
records hdr = do
!r <- record (decDelimiter opts)
if blankLine r
then (endOfLine *> records hdr) <|> pure []
else case runParser (convert hdr r) of
Left msg -> fail $ "conversion error: " ++ msg
Right val -> do
!vals <- (endOfLine *> records hdr) <|> pure []
return (val : vals)

convert hdr = parseNamedRecord . Types.toNamedRecord hdr
2 changes: 1 addition & 1 deletion Data/Csv/Incremental.hs
Expand Up @@ -45,7 +45,7 @@ import qualified Data.Vector as V
import Data.Csv.Conversion hiding (Parser, record, toNamedRecord)
import qualified Data.Csv.Conversion as Conversion
import Data.Csv.Parser
import Data.Csv.Types
import Data.Csv.Types hiding (toNamedRecord)

-- $feed-header
--
Expand Down
7 changes: 1 addition & 6 deletions Data/Csv/Parser.hs
Expand Up @@ -35,13 +35,12 @@ import Data.Attoparsec.Types (Parser)
import qualified Data.Attoparsec.Zepto as Z
import qualified Data.ByteString as S
import qualified Data.ByteString.Unsafe as S
import qualified Data.HashMap.Strict as HM
import Data.Monoid (mappend, mempty)
import qualified Data.Vector as V
import Data.Word (Word8)

import Data.Csv.Types
import Data.Csv.Util ((<$!>))
import Data.Csv.Util ((<$!>), blankLine)

-- | Options that controls how data is decoded. These options can be
-- used to e.g. decode tab-separated data instead of comma-separated
Expand Down Expand Up @@ -103,9 +102,6 @@ csvWithHeader !opts = do
let !v = V.fromList vals
return (hdr, v)

toNamedRecord :: Header -> Record -> NamedRecord
toNamedRecord hdr v = HM.fromList . V.toList $ V.zip hdr v

-- | Parse a header, including the terminating line separator.
header :: Word8 -- ^ Field delimiter
-> AL.Parser Header
Expand All @@ -118,7 +114,6 @@ name !delim = field delim

removeBlankLines :: [Record] -> [Record]
removeBlankLines = filter (not . blankLine)
where blankLine v = V.length v == 1 && (S.null (V.head v))

-- | Parse a record, not including the terminating line separator. The
-- terminating line separate is not included as the last record in a
Expand Down
7 changes: 7 additions & 0 deletions Data/Csv/Types.hs
Expand Up @@ -7,11 +7,13 @@ module Data.Csv.Types
, Name
, NamedRecord
, Field
, toNamedRecord
) where

import qualified Data.ByteString as S
import qualified Data.HashMap.Strict as HM
import Data.Vector (Vector)
import qualified Data.Vector as V

-- | CSV data represented as a Haskell vector of vector of
-- bytestrings.
Expand All @@ -34,3 +36,8 @@ type NamedRecord = HM.HashMap S.ByteString S.ByteString

-- | A single field within a record.
type Field = S.ByteString

-- | Convert a 'Record' to a 'NamedRecord' by attaching column names.
-- The 'Header' and 'Record' must be of the same length.
toNamedRecord :: Header -> Record -> NamedRecord
toNamedRecord hdr v = HM.fromList . V.toList $ V.zip hdr v
12 changes: 11 additions & 1 deletion Data/Csv/Util.hs
@@ -1,4 +1,10 @@
module Data.Csv.Util ((<$!>)) where
module Data.Csv.Util
((<$!>)
, blankLine
) where

import qualified Data.ByteString as B
import qualified Data.Vector as V

-- | A strict version of 'Data.Functor.<$>' for monads.
(<$!>) :: Monad m => (a -> b) -> m a -> m b
Expand All @@ -8,3 +14,7 @@ f <$!> m = do
{-# INLINE (<$!>) #-}

infixl 4 <$!>

-- | Is this an empty record (i.e. a blank line)?
blankLine :: V.Vector B.ByteString -> Bool
blankLine v = V.length v == 1 && (B.null (V.head v))

0 comments on commit 76451b4

Please sign in to comment.