From 359b69153bcc3f187c485e90f733b74d4e9a5c19 Mon Sep 17 00:00:00 2001 From: Alexey Khudyakov Date: Sat, 20 Oct 2012 20:05:42 +0400 Subject: [PATCH 01/21] Add parsers for space-delimited data files --- Data/Csv/Parser.hs | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/Data/Csv/Parser.hs b/Data/Csv/Parser.hs index aaba190..5d5e1d5 100644 --- a/Data/Csv/Parser.hs +++ b/Data/Csv/Parser.hs @@ -16,12 +16,16 @@ module Data.Csv.Parser ( DecodeOptions(..) , defaultDecodeOptions + -- * CSV , csv , csvWithHeader , header , record , name , field + -- * Tables + , table + , recordTable ) where import Blaze.ByteString.Builder (fromByteString, toByteString) @@ -133,6 +137,37 @@ unescapedField !delim = A.takeWhile (\ c -> c /= doubleQuote && c /= delim && c /= cr) +-- | +table :: AL.Parser Csv +table = do + vals <- recordTable `sepBy1` endOfLine + _ <- optional endOfLine + endOfInput + return $ V.fromList $ removeBlankLines vals +{-# INLINE table #-} + +-- | Parse record for space-separated files. +recordTable :: AL.Parser Record +recordTable = V.fromList <$> fieldTable `sepBy1` delimTable +{-# INLINE recordTable #-} + +fieldTable :: AL.Parser Field +fieldTable = do + mb <- A.peekWord8 + case mb of + Just b | b == doubleQuote -> escapedField + _ -> unescapedFieldTable + +unescapedFieldTable :: AL.Parser Field +unescapedFieldTable = A.takeWhile (\c -> c /= doubleQuote && + c /= newline && + c /= cr && + c /= 9 && + c /= 32 ) + +delimTable :: AL.Parser () +delimTable = () <$ AL.many1 (A.satisfy $ \c -> c == 32 || c == 9) + dquote :: AL.Parser Char dquote = char '"' From 6800c8b1ed4c560acd831ab4df50a2e70d751acc Mon Sep 17 00:00:00 2001 From: Alexey Khudyakov Date: Sat, 20 Oct 2012 20:20:17 +0400 Subject: [PATCH 02/21] Add function to decode space-delimited files --- Data/Csv.hs | 3 +++ Data/Csv/Encoding.hs | 12 +++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/Data/Csv.hs b/Data/Csv.hs index dfb4dcb..e839a0e 100644 --- a/Data/Csv.hs +++ b/Data/Csv.hs @@ -32,6 +32,9 @@ module Data.Csv , encodeWith , encodeByNameWith + -- * Space-delimited + , decodeTable + -- * Core CSV types , Csv , Record diff --git a/Data/Csv/Encoding.hs b/Data/Csv/Encoding.hs index 1ced964..b6503d8 100644 --- a/Data/Csv/Encoding.hs +++ b/Data/Csv/Encoding.hs @@ -26,6 +26,9 @@ module Data.Csv.Encoding , defaultEncodeOptions , encodeWith , encodeByNameWith + + -- * Space-delimited files + , decodeTable ) where import Blaze.ByteString.Builder @@ -184,4 +187,11 @@ decodeWithP p to s = Error msg -> Left $ "conversion error: " ++ msg AL.Fail left _ msg -> Left $ "parse error (" ++ msg ++ ") at " ++ show (BL8.unpack left) -{-# INLINE decodeWithP #-} \ No newline at end of file +{-# INLINE decodeWithP #-} + + +------------------------------------------------------------------------ +-- * Space-delimited files + +decodeTable :: FromRecord a => L.ByteString -> Either String (Vector a) +decodeTable = decodeWithP table (parse . traverse parseRecord) From 05398ba364f245a2b3fcc6f0b58f9278d44dc95b Mon Sep 17 00:00:00 2001 From: Alexey Khudyakov Date: Mon, 22 Oct 2012 19:52:19 +0400 Subject: [PATCH 03/21] Handle leading and trailing spaces --- Data/Csv/Parser.hs | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/Data/Csv/Parser.hs b/Data/Csv/Parser.hs index 5d5e1d5..5fc3f41 100644 --- a/Data/Csv/Parser.hs +++ b/Data/Csv/Parser.hs @@ -146,9 +146,24 @@ table = do return $ V.fromList $ removeBlankLines vals {-# INLINE table #-} --- | Parse record for space-separated files. +-- | Parse record for space-separated files. It's more complicated +-- that CSV parser because we need to drop both recordTable :: AL.Parser Record -recordTable = V.fromList <$> fieldTable `sepBy1` delimTable +recordTable + = V.fromList <$> + ((delimTable <|> pure ()) *> + (fieldTable `sepBy11` delimTable) + ) + where + sepBy11 p s = liftA2 (:) p scan + where + scan = s *> (([] <$ eol) <|> (liftA2 (:) p scan)) + <|> pure [] + eol = ([] <$ endOfInput) <|> do + mb <- AL.peekWord8 + case mb of + Just b | b == newline || b == cr -> pure [] + _ -> empty {-# INLINE recordTable #-} fieldTable :: AL.Parser Field From 805bb75d7820324498f0ac3748fb3eff0f9225a8 Mon Sep 17 00:00:00 2001 From: Alexey Khudyakov Date: Tue, 23 Oct 2012 14:55:02 +0400 Subject: [PATCH 04/21] Add ghc-prof-options to gather sensible profiling information --- cassava.cabal | 1 + 1 file changed, 1 insertion(+) diff --git a/cassava.cabal b/cassava.cabal index 424943d..7c3bc00 100644 --- a/cassava.cabal +++ b/cassava.cabal @@ -40,6 +40,7 @@ Library vector ghc-options: -Wall -O2 + ghc-prof-options: -auto-all if impl(ghc >= 7.2.1) cpp-options: -DGENERICS From b5061961cb2f124892f7ee5fe8c81817024b5627 Mon Sep 17 00:00:00 2001 From: Alexey Khudyakov Date: Tue, 23 Oct 2012 15:04:10 +0400 Subject: [PATCH 05/21] Use named constants --- Data/Csv/Parser.hs | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/Data/Csv/Parser.hs b/Data/Csv/Parser.hs index 5fc3f41..dd45930 100644 --- a/Data/Csv/Parser.hs +++ b/Data/Csv/Parser.hs @@ -175,13 +175,13 @@ fieldTable = do unescapedFieldTable :: AL.Parser Field unescapedFieldTable = A.takeWhile (\c -> c /= doubleQuote && - c /= newline && - c /= cr && - c /= 9 && - c /= 32 ) + c /= newline && + c /= cr && + c /= tab && + c /= wspace ) delimTable :: AL.Parser () -delimTable = () <$ AL.many1 (A.satisfy $ \c -> c == 32 || c == 9) +delimTable = () <$ AL.many1 (A.satisfy $ \c -> c == wspace || c == tab) dquote :: AL.Parser Char dquote = char '"' @@ -201,7 +201,9 @@ unescape = toByteString <$> go mempty where then return (acc `mappend` fromByteString h) else rest -doubleQuote, newline, cr :: Word8 +doubleQuote, newline, cr, tab, wspace :: Word8 doubleQuote = 34 -newline = 10 -cr = 13 +newline = 10 +cr = 13 +tab = 9 +wspace = 32 From 9185c6ff5b1d9b7d05a1c44167c1c8ee8036f6ae Mon Sep 17 00:00:00 2001 From: Alexey Khudyakov Date: Tue, 23 Oct 2012 15:27:08 +0400 Subject: [PATCH 06/21] Use takeWhile1 it slightly improves performance --- Data/Csv/Parser.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Data/Csv/Parser.hs b/Data/Csv/Parser.hs index dd45930..27ad102 100644 --- a/Data/Csv/Parser.hs +++ b/Data/Csv/Parser.hs @@ -181,7 +181,7 @@ unescapedFieldTable = A.takeWhile (\c -> c /= doubleQuote && c /= wspace ) delimTable :: AL.Parser () -delimTable = () <$ AL.many1 (A.satisfy $ \c -> c == wspace || c == tab) +delimTable = () <$ A.takeWhile1 (\c -> c == wspace || c == tab) dquote :: AL.Parser Char dquote = char '"' From 5910b82395d5267e348bd50c0f4ee08627165f0f Mon Sep 17 00:00:00 2001 From: Alexey Khudyakov Date: Fri, 26 Oct 2012 22:07:34 +0400 Subject: [PATCH 07/21] Add named decoders for tables --- Data/Csv.hs | 2 +- Data/Csv/Parser.hs | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/Data/Csv.hs b/Data/Csv.hs index e839a0e..ebd7a74 100644 --- a/Data/Csv.hs +++ b/Data/Csv.hs @@ -32,7 +32,7 @@ module Data.Csv , encodeWith , encodeByNameWith - -- * Space-delimited + -- * Space-delimited files , decodeTable -- * Core CSV types diff --git a/Data/Csv/Parser.hs b/Data/Csv/Parser.hs index 27ad102..d2f51e5 100644 --- a/Data/Csv/Parser.hs +++ b/Data/Csv/Parser.hs @@ -25,7 +25,11 @@ module Data.Csv.Parser , field -- * Tables , table + , tableWithHeader + , tableHeader , recordTable + , tableName + , fieldTable ) where import Blaze.ByteString.Builder (fromByteString, toByteString) @@ -146,6 +150,21 @@ table = do return $ V.fromList $ removeBlankLines vals {-# INLINE table #-} +tableWithHeader :: AL.Parser (Header, V.Vector NamedRecord) +tableWithHeader = do + hdr <- tableHeader + vals <- map (toNamedRecord hdr) . removeBlankLines <$> + recordTable `sepBy1` endOfLine + _ <- optional endOfLine + endOfInput + return (hdr, V.fromList vals) + +tableHeader :: AL.Parser Record +tableHeader = recordTable <* endOfLine + +tableName :: AL.Parser Field +tableName = fieldTable + -- | Parse record for space-separated files. It's more complicated -- that CSV parser because we need to drop both recordTable :: AL.Parser Record From cbae628ed8213901ac545cc4c2ad05bdc38f144c Mon Sep 17 00:00:00 2001 From: Alexey Khudyakov Date: Tue, 25 Dec 2012 18:30:19 +0400 Subject: [PATCH 08/21] Fix build --- Data/Csv/Encoding.hs | 22 +++++++++++++--------- Data/Csv/Parser.hs | 11 ++++++----- 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/Data/Csv/Encoding.hs b/Data/Csv/Encoding.hs index 89c3c1d..60997b0 100644 --- a/Data/Csv/Encoding.hs +++ b/Data/Csv/Encoding.hs @@ -100,7 +100,8 @@ decodeWith :: FromRecord a -- skipped -> L.ByteString -- ^ CSV data -> Either String (Vector a) -decodeWith = decodeWithC (runParser . parseCsv) +decodeWith !opt = + decodeWithC (header $ decDelimiter opt) (csv opt) (runParser . parseCsv) {-# INLINE [1] decodeWith #-} parseCsv :: FromRecord a => Csv -> Parser (Vector a) @@ -124,14 +125,16 @@ mapM' f = go -- conversion is performed. idDecodeWith :: DecodeOptions -> Bool -> L.ByteString -> Either String (Vector (Vector B.ByteString)) -idDecodeWith = decodeWithC pure +idDecodeWith !opt = decodeWithC (header $ decDelimiter opt) (csv opt) pure -decodeWithC :: (Csv -> Either String a) -> DecodeOptions -> Bool -> L.ByteString - -> Either String a -decodeWithC convert !opts skipHeader = decodeWithP parser convert +decodeWithC :: AL.Parser Header -> AL.Parser Csv + -> (Csv -> Either String a) + -> Bool -> L.ByteString -> Either String a +decodeWithC headerP body convert skipHeader + = decodeWithP parser convert where parser - | skipHeader = header (decDelimiter opts) *> csv opts - | otherwise = csv opts + | skipHeader = headerP *> body + | otherwise = body {-# INLINE decodeWithC #-} -- | Like 'decodeByName', but lets you customize how the CSV data is @@ -254,5 +257,6 @@ decodeWithP p to s = ------------------------------------------------------------------------ -- * Space-delimited files -decodeTable :: FromRecord a => L.ByteString -> Either String (Vector a) -decodeTable = decodeWithP table (parse . traverse parseRecord) +decodeTable :: FromRecord a => Bool -> L.ByteString -> Either String (Vector a) +decodeTable = + decodeWithC tableHeader table (runParser . parseCsv) diff --git a/Data/Csv/Parser.hs b/Data/Csv/Parser.hs index 72d55cc..e8f16cf 100644 --- a/Data/Csv/Parser.hs +++ b/Data/Csv/Parser.hs @@ -34,8 +34,8 @@ module Data.Csv.Parser import Blaze.ByteString.Builder (fromByteString, toByteString) import Blaze.ByteString.Builder.Char.Utf8 (fromChar) -import Control.Applicative (Alternative, (*>), (<$>), (<*), (<|>), optional, - pure) +import Control.Applicative (Alternative, (*>), (<$>), (<*), (<|>), (<$), optional, + pure, liftA2, empty) import Data.Attoparsec.Char8 (char, endOfInput, endOfLine) import qualified Data.Attoparsec as A import qualified Data.Attoparsec.Lazy as AL @@ -176,7 +176,7 @@ unescapedField !delim = A.takeWhile (\ c -> c /= doubleQuote && -- | table :: AL.Parser Csv table = do - vals <- recordTable `sepBy1` endOfLine + vals <- recordTable `AL.sepBy1` endOfLine _ <- optional endOfLine endOfInput return $ V.fromList $ removeBlankLines vals @@ -186,7 +186,7 @@ tableWithHeader :: AL.Parser (Header, V.Vector NamedRecord) tableWithHeader = do hdr <- tableHeader vals <- map (toNamedRecord hdr) . removeBlankLines <$> - recordTable `sepBy1` endOfLine + recordTable `AL.sepBy1` endOfLine _ <- optional endOfLine endOfInput return (hdr, V.fromList vals) @@ -198,7 +198,8 @@ tableName :: AL.Parser Field tableName = fieldTable -- | Parse record for space-separated files. It's more complicated --- that CSV parser because we need to drop both +-- that CSV parser because we need to drop both leading and trailing +-- spaces. recordTable :: AL.Parser Record recordTable = V.fromList <$> From 09b73f739e633c15ba9e6a272da5d5eb28a1c22b Mon Sep 17 00:00:00 2001 From: Alexey Khudyakov Date: Tue, 25 Dec 2012 18:44:10 +0400 Subject: [PATCH 09/21] Add decoding of space-delimited data with header --- Data/Csv/Encoding.hs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/Data/Csv/Encoding.hs b/Data/Csv/Encoding.hs index 60997b0..d5559a2 100644 --- a/Data/Csv/Encoding.hs +++ b/Data/Csv/Encoding.hs @@ -29,6 +29,7 @@ module Data.Csv.Encoding -- * Space-delimited files , decodeTable + , decodeTableByName ) where import Blaze.ByteString.Builder (Builder, fromByteString, fromWord8, @@ -259,4 +260,9 @@ decodeWithP p to s = decodeTable :: FromRecord a => Bool -> L.ByteString -> Either String (Vector a) decodeTable = - decodeWithC tableHeader table (runParser . parseCsv) + decodeWithC tableHeader table (runParser . parseCsv) + +decodeTableByName :: FromNamedRecord a => L.ByteString -> Either String (Header, Vector a) +decodeTableByName = + decodeWithP tableWithHeader + (\(hdr, vs) -> (,) <$> pure hdr <*> (runParser $ parseNamedCsv vs)) From 953b09d0caa3583296e266da25c251ee580d124d Mon Sep 17 00:00:00 2001 From: Alexey Khudyakov Date: Tue, 25 Dec 2012 18:45:15 +0400 Subject: [PATCH 10/21] Escape tab too --- Data/Csv/Encoding.hs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Data/Csv/Encoding.hs b/Data/Csv/Encoding.hs index d5559a2..2f59fc7 100644 --- a/Data/Csv/Encoding.hs +++ b/Data/Csv/Encoding.hs @@ -191,7 +191,7 @@ encodeRecord delim = mconcat . intersperse (fromWord8 delim) escape :: B.ByteString -> B.ByteString escape s | B.find (\ b -> b == dquote || b == comma || b == nl || b == cr || - b == sp) s == Nothing = s + b == sp || b == tab) s == Nothing = s | otherwise = B.concat ["\"", B.concatMap @@ -203,6 +203,7 @@ escape s nl = 10 cr = 13 sp = 32 + tab = 9 -- | Like 'encodeByName', but lets you customize how the CSV data is -- encoded. From 7927d9cbf697d456da6ca9835cc498ea652253e7 Mon Sep 17 00:00:00 2001 From: Alexey Khudyakov Date: Tue, 25 Dec 2012 18:57:13 +0400 Subject: [PATCH 11/21] Add encoding of space-delimited data --- Data/Csv/Encoding.hs | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/Data/Csv/Encoding.hs b/Data/Csv/Encoding.hs index 2f59fc7..4008133 100644 --- a/Data/Csv/Encoding.hs +++ b/Data/Csv/Encoding.hs @@ -30,6 +30,8 @@ module Data.Csv.Encoding -- * Space-delimited files , decodeTable , decodeTableByName + , encodeTable + , encodeTableByName ) where import Blaze.ByteString.Builder (Builder, fromByteString, fromWord8, @@ -267,3 +269,26 @@ decodeTableByName :: FromNamedRecord a => L.ByteString -> Either String (Header, decodeTableByName = decodeWithP tableWithHeader (\(hdr, vs) -> (,) <$> pure hdr <*> (runParser $ parseNamedCsv vs)) + +encodeTable :: ToRecord a => V.Vector a -> L.ByteString +encodeTable = toLazyByteString + . unlines + . map (encodeTableRow . toRecord) + . V.toList +{-# INLINE encodeTable #-} + +encodeTableByName :: ToNamedRecord a => Header -> V.Vector a -> L.ByteString +encodeTableByName hdr v = + toLazyByteString ( encodeTableRow hdr + <> fromByteString "\r\n" + <> records ) + where + records = unlines + . map (encodeTableRow . namedRecordToRecord hdr . toNamedRecord) + . V.toList $ v +{-# INLINE encodeTableByName #-} + +encodeTableRow :: Record -> Builder +encodeTableRow = mconcat . intersperse (fromWord8 9) + . map fromByteString . map escape . V.toList +{-# INLINE encodeTableRow #-} From 1bf95bd35d5489f6393df2c9432a3600edf7d3c0 Mon Sep 17 00:00:00 2001 From: Alexey Khudyakov Date: Wed, 26 Dec 2012 20:44:01 +0400 Subject: [PATCH 12/21] Use more consistent naming --- Data/Csv/Parser.hs | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/Data/Csv/Parser.hs b/Data/Csv/Parser.hs index e8f16cf..595234b 100644 --- a/Data/Csv/Parser.hs +++ b/Data/Csv/Parser.hs @@ -27,9 +27,9 @@ module Data.Csv.Parser , table , tableWithHeader , tableHeader - , recordTable + , tableRecord , tableName - , fieldTable + , tableField ) where import Blaze.ByteString.Builder (fromByteString, toByteString) @@ -176,7 +176,7 @@ unescapedField !delim = A.takeWhile (\ c -> c /= doubleQuote && -- | table :: AL.Parser Csv table = do - vals <- recordTable `AL.sepBy1` endOfLine + vals <- tableRecord `AL.sepBy1` endOfLine _ <- optional endOfLine endOfInput return $ V.fromList $ removeBlankLines vals @@ -186,25 +186,25 @@ tableWithHeader :: AL.Parser (Header, V.Vector NamedRecord) tableWithHeader = do hdr <- tableHeader vals <- map (toNamedRecord hdr) . removeBlankLines <$> - recordTable `AL.sepBy1` endOfLine + tableRecord `AL.sepBy1` endOfLine _ <- optional endOfLine endOfInput return (hdr, V.fromList vals) tableHeader :: AL.Parser Record -tableHeader = recordTable <* endOfLine +tableHeader = tableRecord <* endOfLine tableName :: AL.Parser Field -tableName = fieldTable +tableName = tableField -- | Parse record for space-separated files. It's more complicated -- that CSV parser because we need to drop both leading and trailing -- spaces. -recordTable :: AL.Parser Record -recordTable +tableRecord :: AL.Parser Record +tableRecord = V.fromList <$> ((delimTable <|> pure ()) *> - (fieldTable `sepBy11` delimTable) + (tableField `sepBy11` delimTable) ) where sepBy11 p s = liftA2 (:) p scan @@ -216,10 +216,10 @@ recordTable case mb of Just b | b == newline || b == cr -> pure [] _ -> empty -{-# INLINE recordTable #-} +{-# INLINE tableRecord #-} -fieldTable :: AL.Parser Field -fieldTable = do +tableField :: AL.Parser Field +tableField = do mb <- A.peekWord8 case mb of Just b | b == doubleQuote -> escapedField From 5dec6799f1ed874ffd658856ab51a68c75d466da Mon Sep 17 00:00:00 2001 From: Alexey Khudyakov Date: Wed, 26 Dec 2012 21:05:03 +0400 Subject: [PATCH 13/21] Add space-delimited data decoding for incremental API --- Data/Csv/Incremental.hs | 66 +++++++++++++++++++++++++++++++++-------- 1 file changed, 53 insertions(+), 13 deletions(-) diff --git a/Data/Csv/Incremental.hs b/Data/Csv/Incremental.hs index 8188ce4..89ec17b 100644 --- a/Data/Csv/Incremental.hs +++ b/Data/Csv/Incremental.hs @@ -8,8 +8,11 @@ module Data.Csv.Incremental ( -- * Decoding headers HeaderParser(..) + -- ** CSV , decodeHeader , decodeHeaderWith + -- ** + , decodeTableHeader -- ** Providing input -- $feed-header , feedChunkH @@ -23,11 +26,13 @@ module Data.Csv.Incremental -- $indexbased , decode , decodeWith + , decodeTable -- ** Name-based record conversion -- $namebased , decodeByName , decodeByNameWith + , decodeTableByName -- ** Providing input -- $feed-records @@ -132,10 +137,16 @@ decodeHeader = decodeHeaderWith defaultDecodeOptions -- | Like 'decodeHeader', but lets you customize how the CSV data is -- parsed. decodeHeaderWith :: DecodeOptions -> HeaderParser B.ByteString -decodeHeaderWith !opts = PartialH (go . parser) - where - parser = A.parse (header $ decDelimiter opts) +decodeHeaderWith !opts = parseWithH (header $ decDelimiter opts) + +-- | Parse header for space-delimited data. It have same semantic as +-- 'decodeHeader'. +decodeTableHeader :: HeaderParser B.ByteString +decodeTableHeader = parseWithH tableHeader +parseWithH :: A.Parser Header -> HeaderParser B.ByteString +parseWithH parser = PartialH (go . A.parse parser) + where go (A.Fail rest _ msg) = FailH rest err where err = "parse error (" ++ msg ++ ")" -- TODO: Check empty and give attoparsec one last chance to return @@ -143,6 +154,8 @@ decodeHeaderWith !opts = PartialH (go . parser) go (A.Partial k) = PartialH $ \ s -> go (k s) go (A.Done rest r) = DoneH r rest + + ------------------------------------------------------------------------ -- * Decoding records @@ -229,12 +242,27 @@ decodeWith :: FromRecord a -> Bool -- ^ Data contains header that should be -- skipped -> Parser a -decodeWith !opts skipHeader - | skipHeader = Partial $ \ s -> go (decodeHeaderWith opts `feedChunkH` s) - | otherwise = Partial (decodeWithP parseRecord opts) +decodeWith !opts = parseSimple (header d) (record d) + where + d = decDelimiter opts + +-- | Efficiently deserialize space-delimited data. +decodeTable :: FromRecord a + => Bool -- ^ Data contains header that should be + -- skipped + -> Parser a +decodeTable = parseSimple tableHeader tableRecord + +parseSimple :: FromRecord a => A.Parser Header -> A.Parser Record -> Bool -> Parser a +parseSimple headP body skipHeader + | skipHeader = Partial $ \ s -> go (parseWithH headP `feedChunkH` s) + | otherwise = Partial (decodeWithP body parseRecord) where go (FailH rest msg) = Fail rest msg go (PartialH k) = Partial $ \ s' -> go (k s') - go (DoneH _ rest) = decodeWithP parseRecord opts rest + go (DoneH _ rest) = decodeWithP body parseRecord rest + + + ------------------------------------------------------------------------ @@ -251,13 +279,23 @@ decodeByName = decodeByNameWith defaultDecodeOptions decodeByNameWith :: FromNamedRecord a => DecodeOptions -- ^ Decoding options -> HeaderParser (Parser a) -decodeByNameWith !opts = - PartialH (go . (decodeHeaderWith opts `feedChunkH`)) +decodeByNameWith !opts = parseNamed (header d) (record d) + where + d = decDelimiter opts + +decodeTableByName :: FromNamedRecord a + => HeaderParser (Parser a) +decodeTableByName = undefined + +parseNamed :: FromNamedRecord a + => A.Parser Header -> A.Parser Record -> HeaderParser (Parser a) +parseNamed headP body = + PartialH (go . (parseWithH headP `feedChunkH`)) where go (FailH rest msg) = FailH rest msg go (PartialH k) = PartialH $ \ s -> go (k s) go (DoneH hdr rest) = - DoneH hdr (decodeWithP (parseNamedRecord . toNamedRecord hdr) opts rest) + DoneH hdr (decodeWithP body (parseNamedRecord . toNamedRecord hdr) rest) -- Copied from Data.Csv.Parser toNamedRecord :: Header -> Record -> NamedRecord @@ -266,9 +304,11 @@ toNamedRecord hdr v = HM.fromList . V.toList $ V.zip hdr v ------------------------------------------------------------------------ -- | Like 'decode', but lets you customize how the CSV data is parsed. -decodeWithP :: (Record -> Conversion.Parser a) -> DecodeOptions -> B.ByteString +decodeWithP :: A.Parser Record + -> (Record -> Conversion.Parser a) + -> B.ByteString -> Parser a -decodeWithP p !opts = go Incomplete [] . parser +decodeWithP rowParser p = go Incomplete [] . parser where go !_ !acc (A.Fail rest _ msg) | null acc = Fail rest err @@ -295,7 +335,7 @@ decodeWithP p !opts = go Incomplete [] . parser acc' | blankLine r = acc | otherwise = convert r : acc - parser = A.parse (record (decDelimiter opts) <* (endOfLine <|> endOfInput)) + parser = A.parse (rowParser <* (endOfLine <|> endOfInput)) convert = runParser . p {-# INLINE decodeWithP #-} From 2ace4142df48c98a405dd82e887950aa71ec22eb Mon Sep 17 00:00:00 2001 From: Alexey Khudyakov Date: Wed, 26 Dec 2012 23:05:21 +0400 Subject: [PATCH 14/21] Add decoding of space-delimited data for streaming API --- Data/Csv/Streaming.hs | 49 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 42 insertions(+), 7 deletions(-) diff --git a/Data/Csv/Streaming.hs b/Data/Csv/Streaming.hs index b226d19..999fb0b 100644 --- a/Data/Csv/Streaming.hs +++ b/Data/Csv/Streaming.hs @@ -20,11 +20,13 @@ module Data.Csv.Streaming -- $indexbased , decode , decodeWith + , decodeTable -- ** Name-based record conversion -- $namebased , decodeByName , decodeByNameWith + , decodeTableByName ) where import Control.Applicative ((<$>), (<*>), pure) @@ -37,7 +39,7 @@ import Prelude hiding (foldr) import Data.Csv.Conversion import Data.Csv.Incremental hiding (decode, decodeByName, decodeByNameWith, - decodeWith) + decodeWith, decodeTable, decodeTableByName) import qualified Data.Csv.Incremental as I import Data.Csv.Parser import Data.Csv.Types @@ -132,9 +134,24 @@ decodeWith :: FromRecord a -- skipped -> BL.ByteString -- ^ CSV data -> Records a -decodeWith !opts skipHeader s0 = case BL.toChunks s0 of - [] -> go [] (feedEndOfInput $ I.decodeWith opts skipHeader) - (s:ss) -> go ss (I.decodeWith opts skipHeader `feedChunk` s) +decodeWith !opts skipHeader = + streamData $ I.decodeWith opts skipHeader + +-- | Efficiently deserialize space-delimited records in a streaming +-- fashion. +decodeTable :: FromRecord a + => Bool -- ^ Data contains header that should be + -- skipped + -> BL.ByteString -- ^ Space-delimited data + -> Records a +decodeTable skipHeader = + streamData $ I.decodeTable skipHeader + + +streamData :: I.Parser a -> BL.ByteString -> Records a +streamData parse s0 = case BL.toChunks s0 of + [] -> go [] (feedEndOfInput parse) + (s:ss) -> go ss (parse `feedChunk` s ) where go ss (Done xs) = foldr Cons (Nil Nothing (BL.fromChunks ss)) xs go ss (Fail rest err) = Nil (Just err) (BL.fromChunks (rest:ss)) @@ -143,6 +160,9 @@ decodeWith !opts skipHeader s0 = case BL.toChunks s0 of go [] (Some xs k) = foldr Cons (go [] (k B.empty)) xs go (s:ss) (Some xs k) = foldr Cons (go ss (k s)) xs + + + -- | Efficiently deserialize CSV in a streaming fashion. The data is -- assumed to be preceeded by a header. Returns @'Left' errMsg@ if -- parsing the header fails. Equivalent to @'decodeByNameWith' @@ -160,9 +180,24 @@ decodeByNameWith :: FromNamedRecord a => DecodeOptions -- ^ Decoding options -> BL.ByteString -- ^ CSV data -> Either String (Header, Records a) -decodeByNameWith !opts s0 = case BL.toChunks s0 of - [] -> go [] (feedEndOfInputH $ I.decodeByNameWith opts) - (s:ss) -> go ss (I.decodeByNameWith opts `feedChunkH` s) +decodeByNameWith !opts = + streamDataByName (I.decodeByNameWith opts) + + +-- | Like 'decodeByName', but lets you customize how the CSV data is +-- parsed. +decodeTableByName :: FromNamedRecord a + => BL.ByteString -- ^ Space-delimited data + -> Either String (Header, Records a) +decodeTableByName = + streamDataByName I.decodeTableByName + + +streamDataByName :: HeaderParser (I.Parser a) + -> BL.ByteString -> Either String (Header, Records a) +streamDataByName parser s0 = case BL.toChunks s0 of + [] -> go [] (feedEndOfInputH parser) + (s:ss) -> go ss (parser `feedChunkH` s) where go ss (DoneH hdr p) = Right (hdr, go2 ss p) go ss (FailH rest err) = Left $ err ++ " at " ++ From bd7b4bad94b55f4ed0f936fc290d2957d44fe56e Mon Sep 17 00:00:00 2001 From: Alexey Khudyakov Date: Wed, 2 Jan 2013 14:46:04 +0400 Subject: [PATCH 15/21] Add documentation --- Data/Csv.hs | 25 +++++++++++++++++++++---- Data/Csv/Encoding.hs | 17 ++++++++++++++++- Data/Csv/Incremental.hs | 3 +++ Data/Csv/Parser.hs | 31 +++++++++++++++++++++++-------- Data/Csv/Streaming.hs | 5 +++-- cassava.cabal | 2 +- 6 files changed, 67 insertions(+), 16 deletions(-) diff --git a/Data/Csv.hs b/Data/Csv.hs index d7d1ec7..903b952 100644 --- a/Data/Csv.hs +++ b/Data/Csv.hs @@ -1,5 +1,9 @@ --- | This module implements encoding and decoding of CSV data. The --- implementation is RFC 4180 compliant, with the following +-- | This module implements encoding and decoding of CSV and +-- space-delimited data. APIs for both format are essentially the +-- same. They only differ in function names and space-delimited data +-- doesn't have encoding\/decoding options. +-- +-- The CSV implementation is RFC 4180 compliant, with the following -- extensions: -- -- * Empty lines are ignored. @@ -9,6 +13,16 @@ -- -- * Escaped fields may contain any characters (but double-quotes -- need to be escaped). +-- +-- Space-delimited data don't have specification so following format +-- is assumed: +-- +-- * Fields are delimited by one or more tabs and spaces at the +-- beginning and end of line are ignored. +-- +-- * Empty lines are ignored. +-- +-- * Escaping rules are same as with CSV module Data.Csv ( -- * Usage example @@ -20,7 +34,7 @@ module Data.Csv -- * Custom type conversions -- $customtypeconversions - -- * Encoding and decoding + -- * Encoding and decoding CSV -- $encoding decode , decodeByName @@ -38,8 +52,11 @@ module Data.Csv , encodeWith , encodeByNameWith - -- * Space-delimited files + -- * Space-delimited data , decodeTable + , decodeTableByName + , encodeTable + , encodeTableByName -- * Core CSV types , Csv diff --git a/Data/Csv/Encoding.hs b/Data/Csv/Encoding.hs index 4008133..02322ff 100644 --- a/Data/Csv/Encoding.hs +++ b/Data/Csv/Encoding.hs @@ -261,15 +261,27 @@ decodeWithP p to s = ------------------------------------------------------------------------ -- * Space-delimited files -decodeTable :: FromRecord a => Bool -> L.ByteString -> Either String (Vector a) +-- | Efficiently deserialize space-delimited records from a lazy +-- ByteString. If this fails due to incomplete or invalid input, +-- @'Left' msg@ is returned. +decodeTable :: FromRecord a + => Bool -- ^ Data contains header that should be skipped + -> L.ByteString -- ^ Raw data + -> Either String (Vector a) decodeTable = decodeWithC tableHeader table (runParser . parseCsv) +-- | Efficiently deserialize space-delimited records from a lazy +-- ByteString. If this fails due to incomplete or invalid input, +-- @'Left' msg@ is returned. The data is assumed to be preceeded by a +-- header. decodeTableByName :: FromNamedRecord a => L.ByteString -> Either String (Header, Vector a) decodeTableByName = decodeWithP tableWithHeader (\(hdr, vs) -> (,) <$> pure hdr <*> (runParser $ parseNamedCsv vs)) +-- | Efficiently serialize space-delimited records as a lazy +-- ByteString. Single tab is used as separator. encodeTable :: ToRecord a => V.Vector a -> L.ByteString encodeTable = toLazyByteString . unlines @@ -277,6 +289,9 @@ encodeTable = toLazyByteString . V.toList {-# INLINE encodeTable #-} +-- | Efficiently serialize space-delimited records as a lazy +-- ByteString. The header is written before any records and dictates +-- the field order. Single tab is used as separator. encodeTableByName :: ToNamedRecord a => Header -> V.Vector a -> L.ByteString encodeTableByName hdr v = toLazyByteString ( encodeTableRow hdr diff --git a/Data/Csv/Incremental.hs b/Data/Csv/Incremental.hs index 89ec17b..1155aac 100644 --- a/Data/Csv/Incremental.hs +++ b/Data/Csv/Incremental.hs @@ -283,6 +283,9 @@ decodeByNameWith !opts = parseNamed (header d) (record d) where d = decDelimiter opts +-- | Efficiently deserialize space-delimited data in incremental +-- fashion. The data is assumed to be preceeded by a header. Have same +-- semantics as 'decodeByName'. decodeTableByName :: FromNamedRecord a => HeaderParser (Parser a) decodeTableByName = undefined diff --git a/Data/Csv/Parser.hs b/Data/Csv/Parser.hs index 595234b..08e4a73 100644 --- a/Data/Csv/Parser.hs +++ b/Data/Csv/Parser.hs @@ -1,7 +1,7 @@ {-# LANGUAGE BangPatterns, CPP #-} --- | A CSV parser. The parser defined here is RFC 4180 compliant, with --- the following extensions: +-- | Parsers for CSV and space-delimited data. The CSV parser defined +-- here is RFC 4180 compliant, with the following extensions: -- -- * Empty lines are ignored. -- @@ -11,6 +11,16 @@ -- * Escaped fields may contain any characters (but double-quotes -- need to be escaped). -- +-- Space-delimited data don't have specification so following format +-- is assumed: +-- +-- * Fields are delimited by one or more tabs and spaces at the +-- beginning and end of line are ignored. +-- +-- * Empty lines are ignored. +-- +-- * Escaping rules are same as with CSV +-- -- The functions in this module can be used to implement e.g. a -- resumable parser that is fed input incrementally. module Data.Csv.Parser @@ -23,7 +33,7 @@ module Data.Csv.Parser , record , name , field - -- * Tables + -- * Space-delimited data , table , tableWithHeader , tableHeader @@ -51,7 +61,7 @@ import Data.Word (Word8) import Data.Csv.Types import Data.Csv.Util ((<$!>)) --- | Options that controls how data is decoded. These options can be +-- | Options that controls how CSV data is decoded. These options can be -- used to e.g. decode tab-separated data instead of comma-separated -- data. -- @@ -173,7 +183,7 @@ unescapedField !delim = A.takeWhile (\ c -> c /= doubleQuote && c /= delim && c /= cr) --- | +-- | Parse space-delimited data which doesn't include header. table :: AL.Parser Csv table = do vals <- tableRecord `AL.sepBy1` endOfLine @@ -182,6 +192,7 @@ table = do return $ V.fromList $ removeBlankLines vals {-# INLINE table #-} +-- | Parse space-delimited data with header. tableWithHeader :: AL.Parser (Header, V.Vector NamedRecord) tableWithHeader = do hdr <- tableHeader @@ -191,15 +202,17 @@ tableWithHeader = do endOfInput return (hdr, V.fromList vals) +-- | Parse header name. tableHeader :: AL.Parser Record tableHeader = tableRecord <* endOfLine +-- | Parse header name. They have same format as regular 'tableField's. tableName :: AL.Parser Field tableName = tableField --- | Parse record for space-separated files. It's more complicated --- that CSV parser because we need to drop both leading and trailing --- spaces. +-- | Parse row for space-delimited data not including terminating line +-- separator. It's more complicated that CSV parser because we need +-- to drop both leading and trailing spaces. tableRecord :: AL.Parser Record tableRecord = V.fromList <$> @@ -218,6 +231,8 @@ tableRecord _ -> empty {-# INLINE tableRecord #-} +-- | Parse field. It could be escaped or not escaped. The return value +-- is escaped. tableField :: AL.Parser Field tableField = do mb <- A.peekWord8 diff --git a/Data/Csv/Streaming.hs b/Data/Csv/Streaming.hs index 999fb0b..fcfbcd8 100644 --- a/Data/Csv/Streaming.hs +++ b/Data/Csv/Streaming.hs @@ -184,8 +184,9 @@ decodeByNameWith !opts = streamDataByName (I.decodeByNameWith opts) --- | Like 'decodeByName', but lets you customize how the CSV data is --- parsed. +-- | Efficiently deserialize space-delimited in a streaming +-- fashion. The data is assumed to be preceeded by a header. Returns +-- @'Left' errMsg@ if parsing the header fails. decodeTableByName :: FromNamedRecord a => BL.ByteString -- ^ Space-delimited data -> Either String (Header, Records a) diff --git a/cassava.cabal b/cassava.cabal index 2630456..c04810f 100644 --- a/cassava.cabal +++ b/cassava.cabal @@ -3,7 +3,7 @@ Version: 0.2.1.2 Synopsis: A CSV parsing and encoding library Description: A CSV parsing and encoding library optimized for ease of use and high - performance. + performance. It can encode/decode space-delimited data as well. Homepage: https://github.com/tibbe/cassava License: BSD3 License-file: LICENSE From 7f6a544c0d617f7cea9424b7c3ec7e4be83df132 Mon Sep 17 00:00:00 2001 From: Alexey Khudyakov Date: Wed, 2 Jan 2013 17:05:43 +0400 Subject: [PATCH 16/21] Add tests --- tests/UnitTests.hs | 53 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 10 deletions(-) diff --git a/tests/UnitTests.hs b/tests/UnitTests.hs index 2231e0c..76647d3 100644 --- a/tests/UnitTests.hs +++ b/tests/UnitTests.hs @@ -26,8 +26,8 @@ import qualified Data.Csv.Streaming as S ------------------------------------------------------------------------ -- Parse tests -decodesAs :: BL.ByteString -> [[B.ByteString]] -> Assertion -decodesAs input expected = assertResult input expected $ decode False input +decodesAs :: (Bool -> BL.ByteString -> Either String Csv) -> BL.ByteString -> [[B.ByteString]] -> Assertion +decodesAs dec input expected = assertResult input expected $ dec False input decodesWithAs :: DecodeOptions -> BL.ByteString -> [[B.ByteString]] -> Assertion decodesWithAs opts input expected = @@ -41,9 +41,9 @@ assertResult input expected res = case res of " input: " ++ show (BL8.unpack input) ++ "\n" ++ "parse error: " ++ err -encodesAs :: [[B.ByteString]] -> BL.ByteString -> Assertion -encodesAs input expected = - encode (V.fromList (map V.fromList input)) @?= expected +encodesAs :: (Csv -> BL.ByteString) -> [[B.ByteString]] -> BL.ByteString -> Assertion +encodesAs enc input expected = + enc (V.fromList (map V.fromList input)) @?= expected encodesWithAs :: EncodeOptions -> [[B.ByteString]] -> BL.ByteString -> Assertion encodesWithAs opts input expected = @@ -100,12 +100,14 @@ namedDecodesStreamingAs input ehdr expected = case S.decodeByName input of positionalTests :: [TF.Test] positionalTests = + -- Encode CSV [ testGroup "encode" $ map encodeTest [ ("simple", [["abc"]], "abc\r\n") , ("quoted", [["\"abc\""]], "\"\"\"abc\"\"\"\r\n") , ("quote", [["a\"b"]], "\"a\"\"b\"\r\n") , ("quotedQuote", [["\"a\"b\""]], "\"\"\"a\"\"b\"\"\"\r\n") , ("leadingSpace", [[" abc"]], "\" abc\"\r\n") + , ("leadingTab", [["\tabc"]], "\"\tabc\"\r\n") , ("comma", [["abc,def"]], "\"abc,def\"\r\n") , ("twoFields", [["abc","def"]], "abc,def\r\n") , ("twoRecords", [["abc"], ["def"]], "abc\r\ndef\r\n") @@ -115,12 +117,29 @@ positionalTests = [ testCase "tab-delim" $ encodesWithAs (defEnc { encDelimiter = 9 }) [["1", "2"]] "1\t2\r\n" ] - , testGroup "decode" $ map decodeTest decodeTests + -- Encode space-delimited data + , testGroup "encode table" $ map encodeTableTest + [ ("simple", [["abc"]], "abc\r\n") + , ("empty", [[""]], "\"\"\r\n") + , ("quoted", [["\"abc\""]], "\"\"\"abc\"\"\"\r\n") + , ("quote", [["a\"b"]], "\"a\"\"b\"\r\n") + , ("quotedQuote", [["\"a\"b\""]], "\"\"\"a\"\"b\"\"\"\r\n") + , ("leadingSpace", [[" abc"]], "\" abc\"\r\n") + , ("leadingTab", [["\tabc"]], "\"\tabc\"\r\n") + , ("comma", [["abc,def"]], "\"abc,def\"\r\n") + , ("twoFields", [["abc","def"]], "abc\tdef\r\n") + , ("twoRecords", [["abc"], ["def"]], "abc\r\ndef\r\n") + , ("newline", [["abc\ndef"]], "\"abc\ndef\"\r\n") + ] + -- Decode CSV + , testGroup "decode" $ map decodeTest decodeTests , testGroup "decodeWith" $ map decodeWithTest decodeWithTests , testGroup "streaming" - [ testGroup "decode" $ map streamingDecodeTest decodeTests + [ testGroup "decode" $ map streamingDecodeTest decodeTests , testGroup "decodeWith" $ map streamingDecodeWithTest decodeWithTests ] + -- Decode space-delimited data + , testGroup "decode table" $ map decodeTableTest decodeTableTests ] where rfc4180Input = BL8.pack $ @@ -145,11 +164,25 @@ positionalTests = decodeWithTests = [ ("tab-delim", defDec { decDelimiter = 9 }, "1\t2", [["1", "2"]]) ] + decodeTableTests = + [ ("simple", "a b c\n", [["a", "b", "c"]]) + , ("crlf", "a b\r\nc d\r\n", [["a", "b"], ["c", "d"]]) + , ("noEol", "a b c", [["a", "b", "c"]]) + , ("blankLine", "a b c\n\nd e f\n\n", + [["a", "b", "c"], ["d", "e", "f"]]) + , ("leadingSpace", " a b c\n", [["a", "b", "c"]]) + , ("trailingSpace", "a b c \n", [["a", "b", "c"]]) + , ("emptyField", "a \"\"\n", [["a", ""]]) + ] encodeTest (name, input, expected) = - testCase name $ input `encodesAs` expected + testCase name $ encodesAs encode input expected + encodeTableTest (name, input, expected) = + testCase name $ encodesAs encodeTable input expected decodeTest (name, input, expected) = - testCase name $ input `decodesAs` expected + testCase name $ decodesAs decode input expected + decodeTableTest (name, input, expected) = + testCase name $ decodesAs decodeTable input expected decodeWithTest (name, opts, input, expected) = testCase name $ decodesWithAs opts input expected streamingDecodeTest (name, input, expected) = @@ -209,7 +242,7 @@ instance Arbitrary LT.Text where -- empty line (which we will ignore.) We therefore encode at least two -- columns. roundTrip :: (Eq a, FromField a, ToField a) => a -> Bool -roundTrip x = Right record == decode False (encode record) +roundTrip x = Right record == decode False (encode record) where record = V.singleton (x, dummy) dummy = 'a' From 371bdd296a36a6b59abca7834d6e431d255d2d59 Mon Sep 17 00:00:00 2001 From: Alexey Khudyakov Date: Wed, 2 Jan 2013 17:10:41 +0400 Subject: [PATCH 17/21] Fix escaping bug for empty strings --- Data/Csv/Encoding.hs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Data/Csv/Encoding.hs b/Data/Csv/Encoding.hs index 02322ff..f13d4e3 100644 --- a/Data/Csv/Encoding.hs +++ b/Data/Csv/Encoding.hs @@ -305,5 +305,12 @@ encodeTableByName hdr v = encodeTableRow :: Record -> Builder encodeTableRow = mconcat . intersperse (fromWord8 9) - . map fromByteString . map escape . V.toList + . map fromByteString . map escapeT . V.toList + where + -- We need to escape empty strings. Otherwise we'll get: + -- > encode ["a","","b"] = 'a b' + -- instead of + -- > encode ["a","","b"] = 'a "" b' + escapeT b | B.null b = "\"\"" + | otherwise = escape b {-# INLINE encodeTableRow #-} From 2ea024a362baaf33050e80d52ac8ea220bcdcc60 Mon Sep 17 00:00:00 2001 From: Alexey Khudyakov Date: Wed, 2 Jan 2013 17:28:32 +0400 Subject: [PATCH 18/21] Test for streaming --- tests/UnitTests.hs | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/tests/UnitTests.hs b/tests/UnitTests.hs index 76647d3..ccfcd46 100644 --- a/tests/UnitTests.hs +++ b/tests/UnitTests.hs @@ -73,10 +73,10 @@ recordsToList (S.Cons (Right x) rs) = case recordsToList rs of l@(Left _) -> l (Right xs) -> Right (x : xs) -decodesStreamingAs :: BL.ByteString -> [[B.ByteString]] -> Assertion -decodesStreamingAs input expected = +-- decodesStreamingAs :: BL.ByteString -> [[B.ByteString]] -> Assertion +decodesStreamingAs dec input expected = assertResult input expected $ fmap (V.fromList . map V.fromList) $ - recordsToList $ S.decode False input + recordsToList $ dec False input decodesWithStreamingAs :: DecodeOptions -> BL.ByteString -> [[B.ByteString]] -> Assertion @@ -131,15 +131,15 @@ positionalTests = , ("twoRecords", [["abc"], ["def"]], "abc\r\ndef\r\n") , ("newline", [["abc\ndef"]], "\"abc\ndef\"\r\n") ] - -- Decode CSV + -- Decode CSV & space delimited data , testGroup "decode" $ map decodeTest decodeTests , testGroup "decodeWith" $ map decodeWithTest decodeWithTests + , testGroup "decode table" $ map decodeTableTest decodeTableTests , testGroup "streaming" - [ testGroup "decode" $ map streamingDecodeTest decodeTests - , testGroup "decodeWith" $ map streamingDecodeWithTest decodeWithTests + [ testGroup "decode" $ map streamingDecodeTest decodeTests + , testGroup "decodeWith" $ map streamingDecodeWithTest decodeWithTests + , testGroup "table" $ map streamingDecodeTableTest decodeTableTests ] - -- Decode space-delimited data - , testGroup "decode table" $ map decodeTableTest decodeTableTests ] where rfc4180Input = BL8.pack $ @@ -175,18 +175,23 @@ positionalTests = , ("emptyField", "a \"\"\n", [["a", ""]]) ] + -- Encode encodeTest (name, input, expected) = testCase name $ encodesAs encode input expected encodeTableTest (name, input, expected) = testCase name $ encodesAs encodeTable input expected + -- Decode decodeTest (name, input, expected) = testCase name $ decodesAs decode input expected decodeTableTest (name, input, expected) = testCase name $ decodesAs decodeTable input expected decodeWithTest (name, opts, input, expected) = testCase name $ decodesWithAs opts input expected + -- Streaming streamingDecodeTest (name, input, expected) = - testCase name $ input `decodesStreamingAs` expected + testCase name $ decodesStreamingAs S.decode input expected + streamingDecodeTableTest (name, input, expected) = + testCase name $ decodesStreamingAs S.decodeTable input expected streamingDecodeWithTest (name, opts, input, expected) = testCase name $ decodesWithStreamingAs opts input expected defEnc = defaultEncodeOptions From 6e4ecc565a1690ca4848e50ebc8a535724d5bfcc Mon Sep 17 00:00:00 2001 From: Alexey Khudyakov Date: Wed, 2 Jan 2013 17:35:48 +0400 Subject: [PATCH 19/21] Copy optimization from CSV functions * Add rule to rewrite to specialized function * Inline decoding functions --- Data/Csv/Encoding.hs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Data/Csv/Encoding.hs b/Data/Csv/Encoding.hs index f13d4e3..63ba498 100644 --- a/Data/Csv/Encoding.hs +++ b/Data/Csv/Encoding.hs @@ -270,6 +270,17 @@ decodeTable :: FromRecord a -> Either String (Vector a) decodeTable = decodeWithC tableHeader table (runParser . parseCsv) +{-# INLINE decodeTable #-} + +-- | Same as 'decodeWith', but more efficient as no type +-- conversion is performed. +idDecodeTable :: Bool -> L.ByteString -> Either String (Vector (Vector B.ByteString)) +idDecodeTable = decodeWithC tableHeader table pure + +{-# RULES + "idDecodeTable" decodeTable = idDecodeTable + #-} + -- | Efficiently deserialize space-delimited records from a lazy -- ByteString. If this fails due to incomplete or invalid input, @@ -279,6 +290,7 @@ decodeTableByName :: FromNamedRecord a => L.ByteString -> Either String (Header, decodeTableByName = decodeWithP tableWithHeader (\(hdr, vs) -> (,) <$> pure hdr <*> (runParser $ parseNamedCsv vs)) +{-# INLINE decodeTableByName #-} -- | Efficiently serialize space-delimited records as a lazy -- ByteString. Single tab is used as separator. From 53a1d23635860ed14f18cd38d1ea7800dea5ac3d Mon Sep 17 00:00:00 2001 From: Alexey Khudyakov Date: Wed, 2 Jan 2013 17:40:19 +0400 Subject: [PATCH 20/21] Move common functionality and helpers to the end of module --- Data/Csv/Encoding.hs | 144 ++++++++++++++++++++++--------------------- 1 file changed, 74 insertions(+), 70 deletions(-) diff --git a/Data/Csv/Encoding.hs b/Data/Csv/Encoding.hs index 63ba498..b1f9862 100644 --- a/Data/Csv/Encoding.hs +++ b/Data/Csv/Encoding.hs @@ -107,19 +107,6 @@ decodeWith !opt = decodeWithC (header $ decDelimiter opt) (csv opt) (runParser . parseCsv) {-# INLINE [1] decodeWith #-} -parseCsv :: FromRecord a => Csv -> Parser (Vector a) -parseCsv xs = V.fromList <$!> mapM' parseRecord (V.toList xs) - -mapM' :: Monad m => (a -> m b) -> [a] -> m [b] -mapM' f = go - where - go [] = return [] - go (x:xs) = do - !y <- f x - ys <- go xs - return (y : ys) -{-# INLINE mapM' #-} - {-# RULES "idDecodeWith" decodeWith = idDecodeWith #-} @@ -150,9 +137,6 @@ decodeByNameWith !opts = decodeWithP (csvWithHeader opts) (\ (hdr, vs) -> (,) <$> pure hdr <*> (runParser $ parseNamedCsv vs)) -parseNamedCsv :: FromNamedRecord a => Vector NamedRecord -> Parser (Vector a) -parseNamedCsv xs = V.fromList <$!> mapM' parseNamedRecord (V.toList xs) - -- | Options that controls how data is encoded. These options can be -- used to e.g. encode data in a tab-separated format instead of in a -- comma-separated format. @@ -189,24 +173,6 @@ encodeRecord delim = mconcat . intersperse (fromWord8 delim) . map fromByteString . map escape . V.toList {-# INLINE encodeRecord #-} --- TODO: Optimize -escape :: B.ByteString -> B.ByteString -escape s - | B.find (\ b -> b == dquote || b == comma || b == nl || b == cr || - b == sp || b == tab) s == Nothing = s - | otherwise = - B.concat ["\"", - B.concatMap - (\ b -> if b == dquote then "\"\"" else B.singleton b) s, - "\""] - where - dquote = 34 - comma = 44 - nl = 10 - cr = 13 - sp = 32 - tab = 9 - -- | Like 'encodeByName', but lets you customize how the CSV data is -- encoded. encodeByNameWith :: ToNamedRecord a => EncodeOptions -> Header -> V.Vector a @@ -222,42 +188,6 @@ encodeByNameWith opts hdr v = {-# INLINE encodeByNameWith #-} -namedRecordToRecord :: Header -> NamedRecord -> Record -namedRecordToRecord hdr nr = V.map find hdr - where - find n = case HM.lookup n nr of - Nothing -> moduleError "namedRecordToRecord" $ - "header contains name " ++ show (B8.unpack n) ++ - " which is not present in the named record" - Just v -> v - -moduleError :: String -> String -> a -moduleError func msg = error $ "Data.Csv.Encoding." ++ func ++ ": " ++ msg -{-# NOINLINE moduleError #-} - -unlines :: [Builder] -> Builder -unlines [] = mempty -unlines (b:bs) = b <> fromString "\r\n" <> unlines bs - -intersperse :: Builder -> [Builder] -> [Builder] -intersperse _ [] = [] -intersperse sep (x:xs) = x : prependToAll sep xs - -prependToAll :: Builder -> [Builder] -> [Builder] -prependToAll _ [] = [] -prependToAll sep (x:xs) = sep <> x : prependToAll sep xs - -decodeWithP :: AL.Parser a -> (a -> Either String b) -> L.ByteString -> Either String b -decodeWithP p to s = - case AL.parse p s of - AL.Done _ v -> case to v of - Right a -> Right a - Left msg -> Left $ "conversion error: " ++ msg - AL.Fail left _ msg -> Left $ "parse error (" ++ msg ++ ") at " ++ - show (BL8.unpack left) -{-# INLINE decodeWithP #-} - - ------------------------------------------------------------------------ -- * Space-delimited files @@ -326,3 +256,77 @@ encodeTableRow = mconcat . intersperse (fromWord8 9) escapeT b | B.null b = "\"\"" | otherwise = escape b {-# INLINE encodeTableRow #-} + + +------------------------------------------------------------------------ +-- * Common functionality and helpers + +parseCsv :: FromRecord a => Csv -> Parser (Vector a) +parseCsv xs = V.fromList <$!> mapM' parseRecord (V.toList xs) + +parseNamedCsv :: FromNamedRecord a => Vector NamedRecord -> Parser (Vector a) +parseNamedCsv xs = V.fromList <$!> mapM' parseNamedRecord (V.toList xs) + +-- TODO: Optimize +escape :: B.ByteString -> B.ByteString +escape s + | B.find (\ b -> b == dquote || b == comma || b == nl || b == cr || + b == sp || b == tab) s == Nothing = s + | otherwise = + B.concat ["\"", + B.concatMap + (\ b -> if b == dquote then "\"\"" else B.singleton b) s, + "\""] + where + dquote = 34 + comma = 44 + nl = 10 + cr = 13 + sp = 32 + tab = 9 + +namedRecordToRecord :: Header -> NamedRecord -> Record +namedRecordToRecord hdr nr = V.map find hdr + where + find n = case HM.lookup n nr of + Nothing -> moduleError "namedRecordToRecord" $ + "header contains name " ++ show (B8.unpack n) ++ + " which is not present in the named record" + Just v -> v + +moduleError :: String -> String -> a +moduleError func msg = error $ "Data.Csv.Encoding." ++ func ++ ": " ++ msg +{-# NOINLINE moduleError #-} + +unlines :: [Builder] -> Builder +unlines [] = mempty +unlines (b:bs) = b <> fromString "\r\n" <> unlines bs + +intersperse :: Builder -> [Builder] -> [Builder] +intersperse _ [] = [] +intersperse sep (x:xs) = x : prependToAll sep xs + +prependToAll :: Builder -> [Builder] -> [Builder] +prependToAll _ [] = [] +prependToAll sep (x:xs) = sep <> x : prependToAll sep xs + +decodeWithP :: AL.Parser a -> (a -> Either String b) -> L.ByteString -> Either String b +decodeWithP p to s = + case AL.parse p s of + AL.Done _ v -> case to v of + Right a -> Right a + Left msg -> Left $ "conversion error: " ++ msg + AL.Fail left _ msg -> Left $ "parse error (" ++ msg ++ ") at " ++ + show (BL8.unpack left) +{-# INLINE decodeWithP #-} + +mapM' :: Monad m => (a -> m b) -> [a] -> m [b] +mapM' f = go + where + go [] = return [] + go (x:xs) = do + !y <- f x + ys <- go xs + return (y : ys) +{-# INLINE mapM' #-} + From 4f503c18f57445ad391ff2032490d1eea7200ee5 Mon Sep 17 00:00:00 2001 From: Alexey Khudyakov Date: Wed, 2 Jan 2013 17:57:54 +0400 Subject: [PATCH 21/21] Missing type signature --- tests/UnitTests.hs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/UnitTests.hs b/tests/UnitTests.hs index ccfcd46..c47a2b3 100644 --- a/tests/UnitTests.hs +++ b/tests/UnitTests.hs @@ -74,6 +74,8 @@ recordsToList (S.Cons (Right x) rs) = case recordsToList rs of (Right xs) -> Right (x : xs) -- decodesStreamingAs :: BL.ByteString -> [[B.ByteString]] -> Assertion +decodesStreamingAs :: (Bool -> BL8.ByteString -> S.Records [B.ByteString]) + -> BL8.ByteString -> [[B.ByteString]] -> Assertion decodesStreamingAs dec input expected = assertResult input expected $ fmap (V.fromList . map V.fromList) $ recordsToList $ dec False input