Interleave type conversion and parsing

This drastically reduces memory usage of decode as we now need to hold on to O(n * sizeof(target_type)) bytes instead of O(n * sizeof(Vector ByteString)) bytes. The latter is typically much larger, due to the high constant-factor overhead of Vector and ByteString.
haskell-hvr · Feb 24, 2013 · 76451b4 · gregorycollins · Feb 25, 2013 · tibbe
1 parent d926bd1
commit 76451b4
Show file tree

Hide file tree

Showing 5 changed files with 99 additions and 47 deletions.
diff --git a/Data/Csv/Encoding.hs b/Data/Csv/Encoding.hs
@@ -10,7 +10,7 @@
 --
 -- Encoding and decoding of data types into CSV.
 module Data.Csv.Encoding
-    (     
+    (
     -- * Encoding and decoding
       decode
     , decodeByName
@@ -31,7 +31,8 @@ module Data.Csv.Encoding
 import Blaze.ByteString.Builder (Builder, fromByteString, fromWord8,
                                  toLazyByteString)
 import Blaze.ByteString.Builder.Char8 (fromString)
-import Control.Applicative ((*>), (<$>), (<*>), pure)
+import Control.Applicative ((*>), (<|>), optional, pure)
+import Data.Attoparsec.Char8 (endOfInput, endOfLine)
 import qualified Data.Attoparsec.ByteString.Lazy as AL
 import qualified Data.ByteString as B
 import qualified Data.ByteString.Char8 as B8
@@ -45,10 +46,14 @@ import Data.Word (Word8)
 import Prelude hiding (unlines)
 
 import Data.Csv.Compat.Monoid ((<>))
-import Data.Csv.Conversion
-import Data.Csv.Parser
-import Data.Csv.Types
-import Data.Csv.Util ((<$!>))
+import Data.Csv.Conversion (FromNamedRecord, FromRecord, ToNamedRecord,
+                            ToRecord, parseNamedRecord, parseRecord, runParser,
+                            toNamedRecord, toRecord)
+import Data.Csv.Parser hiding (csv, csvWithHeader)
+import qualified Data.Csv.Parser as Parser
+import Data.Csv.Types hiding (toNamedRecord)
+import qualified Data.Csv.Types as Types
+import Data.Csv.Util (blankLine)
 
 -- TODO: 'encode' isn't as efficient as it could be.
 
@@ -97,22 +102,9 @@ decodeWith :: FromRecord a
                              -- skipped
            -> L.ByteString   -- ^ CSV data
            -> Either String (Vector a)
-decodeWith = decodeWithC (runParser . parseCsv)
+decodeWith = decodeWithC csv
 {-# INLINE [1] decodeWith #-}
 
-parseCsv :: FromRecord a => Csv -> Parser (Vector a)
-parseCsv xs = V.fromList <$!> mapM' parseRecord (V.toList xs)
-
-mapM' :: Monad m => (a -> m b) -> [a] -> m [b]
-mapM' f = go
-  where
-    go [] = return []
-    go (x:xs) = do
-        !y <- f x
-        ys <- go xs
-        return (y : ys)
-{-# INLINE mapM' #-}
-
 {-# RULES
     "idDecodeWith" decodeWith = idDecodeWith
  #-}
@@ -121,14 +113,17 @@ mapM' f = go
 -- conversion is performed.
 idDecodeWith :: DecodeOptions -> Bool -> L.ByteString
              -> Either String (Vector (Vector B.ByteString))
-idDecodeWith = decodeWithC pure
-
-decodeWithC :: (Csv -> Either String a) -> DecodeOptions -> Bool -> L.ByteString
-            -> Either String a
-decodeWithC convert !opts skipHeader = decodeWithP parser convert
+idDecodeWith = decodeWithC Parser.csv
+
+-- | Decode CSV data using the provided parser, skipping a leading
+-- header if 'skipHeader' is 'True'. Returns 'Left' @errMsg@ on
+-- failure.
+decodeWithC :: (DecodeOptions -> AL.Parser a) -> DecodeOptions -> Bool
+            -> BL8.ByteString -> Either String a
+decodeWithC p !opts skipHeader = decodeWithP parser
   where parser
-            | skipHeader = header (decDelimiter opts) *> csv opts
-            | otherwise  = csv opts
+            | skipHeader = header (decDelimiter opts) *> p opts
+            | otherwise  = p opts
 {-# INLINE decodeWithC #-}
 
 -- | Like 'decodeByName', but lets you customize how the CSV data is
@@ -137,12 +132,7 @@ decodeByNameWith :: FromNamedRecord a
                  => DecodeOptions  -- ^ Decoding options
                  -> L.ByteString   -- ^ CSV data
                  -> Either String (Header, Vector a)
-decodeByNameWith !opts =
-    decodeWithP (csvWithHeader opts)
-    (\ (hdr, vs) -> (,) <$> pure hdr <*> (runParser $ parseNamedCsv vs))
-
-parseNamedCsv :: FromNamedRecord a => Vector NamedRecord -> Parser (Vector a)
-parseNamedCsv xs = V.fromList <$!> mapM' parseNamedRecord (V.toList xs)
+decodeByNameWith !opts = decodeWithP (csvWithHeader opts)
 
 -- | Options that controls how data is encoded. These options can be
 -- used to e.g. encode data in a tab-separated format instead of in a
@@ -237,12 +227,62 @@ prependToAll :: Builder -> [Builder] -> [Builder]
 prependToAll _   []     = []
 prependToAll sep (x:xs) = sep <> x : prependToAll sep xs
 
-decodeWithP :: AL.Parser a -> (a -> Either String b) -> L.ByteString -> Either String b
-decodeWithP p to s =
+decodeWithP :: AL.Parser a -> L.ByteString -> Either String a
+decodeWithP p s =
     case AL.parse p s of
-      AL.Done _ v     -> case to v of
-          Right a  -> Right a
-          Left msg -> Left $ "conversion error: " ++ msg
+      AL.Done _ v     -> Right v
       AL.Fail left _ msg -> Left $ "parse error (" ++ msg ++ ") at " ++
                             show (BL8.unpack left)
-{-# INLINE decodeWithP #-}
+{-# INLINE decodeWithP #-}
+
+-- These alternative implementation of the 'csv' and 'csvWithHeader'
+-- parsers from the 'Parser' module performs the
+-- 'FromRecord'/'FromNamedRecord' conversions ont-the-fly, thereby
+-- avoiding the need to hold a big 'CSV' value in memory. The 'CSV'
+-- type has a quite large memory overhead due to high constant
+-- overheads of 'B.ByteString' and 'V.Vector'.
+
+-- TODO: Check that the error messages don't duplicate prefixes, as in
+-- "parse error: conversion error: ...".
+
+-- | Parse a CSV file that does not include a header.
+csv :: FromRecord a => DecodeOptions -> AL.Parser (V.Vector a)
+csv !opts = do
+    vals <- records
+    _ <- optional endOfLine
+    endOfInput
+    return $! V.fromList vals
+  where
+    records = do
+        !r <- record (decDelimiter opts)
+        if blankLine r
+            then (endOfLine *> records) <|> pure []
+            else case runParser (parseRecord r) of
+                Left msg  -> fail $ "conversion error: " ++ msg
+                Right val -> do
+                    !vals <- (endOfLine *> records) <|> pure []
+                    return (val : vals)
+{-# INLINE csv #-}
+
+-- | Parse a CSV file that includes a header.
+csvWithHeader :: FromNamedRecord a => DecodeOptions
+              -> AL.Parser (Header, V.Vector a)
+csvWithHeader !opts = do
+    !hdr <- header (decDelimiter opts)
+    vals <- records hdr
+    _ <- optional endOfLine
+    endOfInput
+    let !v = V.fromList vals
+    return (hdr, v)
+  where
+    records hdr = do
+        !r <- record (decDelimiter opts)
+        if blankLine r
+            then (endOfLine *> records hdr) <|> pure []
+            else case runParser (convert hdr r) of
+                Left msg  -> fail $ "conversion error: " ++ msg
+                Right val -> do
+                    !vals <- (endOfLine *> records hdr) <|> pure []
+                    return (val : vals)
+
+    convert hdr = parseNamedRecord . Types.toNamedRecord hdr
diff --git a/Data/Csv/Incremental.hs b/Data/Csv/Incremental.hs
@@ -45,7 +45,7 @@ import qualified Data.Vector as V
 import Data.Csv.Conversion hiding (Parser, record, toNamedRecord)
 import qualified Data.Csv.Conversion as Conversion
 import Data.Csv.Parser
-import Data.Csv.Types
+import Data.Csv.Types hiding (toNamedRecord)
 
 -- $feed-header
 --

diff --git a/Data/Csv/Parser.hs b/Data/Csv/Parser.hs
@@ -35,13 +35,12 @@ import Data.Attoparsec.Types (Parser)
 import qualified Data.Attoparsec.Zepto as Z
 import qualified Data.ByteString as S
 import qualified Data.ByteString.Unsafe as S
-import qualified Data.HashMap.Strict as HM
 import Data.Monoid (mappend, mempty)
 import qualified Data.Vector as V
 import Data.Word (Word8)
 
 import Data.Csv.Types
-import Data.Csv.Util ((<$!>))
+import Data.Csv.Util ((<$!>), blankLine)
 
 -- | Options that controls how data is decoded. These options can be
 -- used to e.g. decode tab-separated data instead of comma-separated
@@ -103,9 +102,6 @@ csvWithHeader !opts = do
     let !v = V.fromList vals
     return (hdr, v)
 
-toNamedRecord :: Header -> Record -> NamedRecord
-toNamedRecord hdr v = HM.fromList . V.toList $ V.zip hdr v
-
 -- | Parse a header, including the terminating line separator.
 header :: Word8  -- ^ Field delimiter
        -> AL.Parser Header
@@ -118,7 +114,6 @@ name !delim = field delim
 
 removeBlankLines :: [Record] -> [Record]
 removeBlankLines = filter (not . blankLine)
-  where blankLine v = V.length v == 1 && (S.null (V.head v))
 
 -- | Parse a record, not including the terminating line separator. The
 -- terminating line separate is not included as the last record in a

diff --git a/Data/Csv/Types.hs b/Data/Csv/Types.hs
@@ -7,11 +7,13 @@ module Data.Csv.Types
     , Name
     , NamedRecord
     , Field
+    , toNamedRecord
     ) where
 
 import qualified Data.ByteString as S
 import qualified Data.HashMap.Strict as HM
 import Data.Vector (Vector)
+import qualified Data.Vector as V
 
 -- | CSV data represented as a Haskell vector of vector of
 -- bytestrings.
@@ -34,3 +36,8 @@ type NamedRecord = HM.HashMap S.ByteString S.ByteString
 
 -- | A single field within a record.
 type Field = S.ByteString
+
+-- | Convert a 'Record' to a 'NamedRecord' by attaching column names.
+-- The 'Header' and 'Record' must be of the same length.
+toNamedRecord :: Header -> Record -> NamedRecord
+toNamedRecord hdr v = HM.fromList . V.toList $ V.zip hdr v
diff --git a/Data/Csv/Util.hs b/Data/Csv/Util.hs
@@ -1,4 +1,10 @@
-module Data.Csv.Util ((<$!>)) where
+module Data.Csv.Util
+      ((<$!>)
+    , blankLine
+    ) where
+
+import qualified Data.ByteString as B
+import qualified Data.Vector as V
 
 -- | A strict version of 'Data.Functor.<$>' for monads.
 (<$!>) :: Monad m => (a -> b) -> m a -> m b
@@ -8,3 +14,7 @@ f <$!> m = do
 {-# INLINE (<$!>) #-}
 
 infixl 4 <$!>
+
+-- | Is this an empty record (i.e. a blank line)?
+blankLine :: V.Vector B.ByteString -> Bool
+blankLine v = V.length v == 1 && (B.null (V.head v))