From f2bf695a6bba8b26a2e8b33f19021dc1d6b629d8 Mon Sep 17 00:00:00 2001 From: Johan Tibell Date: Fri, 22 Feb 2013 16:54:25 -0800 Subject: [PATCH 01/13] Check-point commit Space-delimited parsing is mostly working, except that trailing whitespace is not dropped correctly. --- Data/Csv/Incremental.hs | 2 +- Data/Csv/Parser.hs | 52 ++++++++++++++++++++++++++++++++++------- tests/UnitTests.hs | 6 +++++ 3 files changed, 50 insertions(+), 10 deletions(-) diff --git a/Data/Csv/Incremental.hs b/Data/Csv/Incremental.hs index 9eeada5..6b42ee3 100644 --- a/Data/Csv/Incremental.hs +++ b/Data/Csv/Incremental.hs @@ -290,7 +290,7 @@ decodeWithP p !opts = go Incomplete [] . parser acc' | blankLine r = acc | otherwise = let !r' = convert r in r' : acc - parser = A.parse (record (decDelimiter opts) <* (endOfLine <|> endOfInput)) + parser = A.parse (record opts <* (endOfLine <|> endOfInput)) convert = runParser . p {-# INLINE decodeWithP #-} diff --git a/Data/Csv/Parser.hs b/Data/Csv/Parser.hs index 0a8c80e..b2921cf 100644 --- a/Data/Csv/Parser.hs +++ b/Data/Csv/Parser.hs @@ -16,6 +16,7 @@ module Data.Csv.Parser ( DecodeOptions(..) , defaultDecodeOptions + , spaceDecodeOptions , csv , csvWithHeader , header @@ -26,9 +27,10 @@ module Data.Csv.Parser import Blaze.ByteString.Builder (fromByteString, toByteString) import Blaze.ByteString.Builder.Char.Utf8 (fromChar) -import Control.Applicative (Alternative, (*>), (<$>), (<*), (<|>), optional, - pure) -import Data.Attoparsec.Char8 (char, endOfInput, endOfLine) +import Control.Applicative (Alternative, (*>), (<$>), (<*), (<|>), many, + optional, pure) +import Control.Monad (when) +import Data.Attoparsec.Char8 (char, endOfInput, endOfLine, space) import qualified Data.Attoparsec as A import qualified Data.Attoparsec.Lazy as AL import Data.Attoparsec.Types (Parser) @@ -56,18 +58,40 @@ import Data.Csv.Util ((<$!>), blankLine) data DecodeOptions = DecodeOptions { -- | Field delimiter. decDelimiter :: {-# UNPACK #-} !Word8 + + -- | Runs of consecutive delimiters are regarded as a single + -- delimiter. This is useful e.g. when parsing white space + -- separated data. + , decMergeDelimiters :: !Bool + + -- | Trim leading and trailing whitespace at the begining and + -- end of each record (but not at the begining and end of each + -- field). + , decTrimRecordSpace :: !Bool } deriving (Eq, Show) +-- TODO: Document default values in defaultDecodeOptions + -- | Decoding options for parsing CSV files. defaultDecodeOptions :: DecodeOptions defaultDecodeOptions = DecodeOptions { decDelimiter = 44 -- comma + , decMergeDelimiters = False + , decTrimRecordSpace = False + } + +-- | Decoding options for parsing space-delimited files. +spaceDecodeOptions :: DecodeOptions +spaceDecodeOptions = DecodeOptions + { decDelimiter = 32 -- space + , decMergeDelimiters = True + , decTrimRecordSpace = True } -- | Parse a CSV file that does not include a header. csv :: DecodeOptions -> AL.Parser Csv csv !opts = do - vals <- record (decDelimiter opts) `sepBy1'` endOfLine + vals <- record opts `sepBy1'` endOfLine _ <- optional endOfLine endOfInput let nonEmpty = removeBlankLines vals @@ -96,7 +120,7 @@ csvWithHeader :: DecodeOptions -> AL.Parser (Header, V.Vector NamedRecord) csvWithHeader !opts = do !hdr <- header (decDelimiter opts) vals <- map (toNamedRecord hdr) . removeBlankLines <$> - (record (decDelimiter opts)) `sepBy1'` endOfLine + (record opts) `sepBy1'` endOfLine _ <- optional endOfLine endOfInput let !v = V.fromList vals @@ -120,13 +144,23 @@ removeBlankLines = filter (not . blankLine) -- CSV file is allowed to not have a terminating line separator. You -- most likely want to use the 'endOfLine' parser in combination with -- this parser. -record :: Word8 -- ^ Field delimiter - -> AL.Parser Record -record !delim = do - fs <- field delim `sepBy1'` (A.word8 delim) +record :: DecodeOptions -> AL.Parser Record +record !opts = do + when trim skipSpaces + fs <- field delim `sepBy1'` delimiter + when trim skipSpaces return $! V.fromList fs + where + trim = decTrimRecordSpace opts + delim = decDelimiter opts + delimiter | decMergeDelimiters opts = A.skipMany1 (A.word8 delim) *> pure delim + | otherwise = A.word8 delim {-# INLINE record #-} +skipSpaces :: AL.Parser () +skipSpaces = scan + where scan = (space *> scan) <|> pure () + -- | Parse a field. The field may be in either the escaped or -- non-escaped format. The return value is unescaped. field :: Word8 -> AL.Parser Field diff --git a/tests/UnitTests.hs b/tests/UnitTests.hs index 2231e0c..b9cc458 100644 --- a/tests/UnitTests.hs +++ b/tests/UnitTests.hs @@ -144,6 +144,7 @@ positionalTests = ] decodeWithTests = [ ("tab-delim", defDec { decDelimiter = 9 }, "1\t2", [["1", "2"]]) + , ("mixed-space", spaceDec, " 88 c \t 0.4 ", [["88", "c", "0.4"]]) ] encodeTest (name, input, expected) = @@ -158,6 +159,11 @@ positionalTests = testCase name $ decodesWithStreamingAs opts input expected defEnc = defaultEncodeOptions defDec = defaultDecodeOptions + spaceDec = defaultDecodeOptions -- TODO: Use spaceDecodeOptions + { decDelimiter = 32 -- space + , decMergeDelimiters = True + , decTrimRecordSpace = True + } nameBasedTests :: [TF.Test] nameBasedTests = From 9eeea09549395c261e040b25c561b23b6437373f Mon Sep 17 00:00:00 2001 From: Alexey Khudyakov Date: Mon, 25 Feb 2013 18:52:21 +0400 Subject: [PATCH 02/13] Fix build --- Data/Csv/Encoding.hs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Data/Csv/Encoding.hs b/Data/Csv/Encoding.hs index ba99d68..5eb8f8b 100644 --- a/Data/Csv/Encoding.hs +++ b/Data/Csv/Encoding.hs @@ -254,7 +254,7 @@ csv !opts = do return $! V.fromList vals where records = do - !r <- record (decDelimiter opts) + !r <- record opts if blankLine r then (endOfLine *> records) <|> pure [] else case runParser (parseRecord r) of @@ -276,7 +276,7 @@ csvWithHeader !opts = do return (hdr, v) where records hdr = do - !r <- record (decDelimiter opts) + !r <- record opts if blankLine r then (endOfLine *> records hdr) <|> pure [] else case runParser (convert hdr r) of From d217e27d335a3fb589ef8b081df85b20b5696037 Mon Sep 17 00:00:00 2001 From: Alexey Khudyakov Date: Mon, 25 Feb 2013 18:59:26 +0400 Subject: [PATCH 03/13] Add more failing tests --- tests/UnitTests.hs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/UnitTests.hs b/tests/UnitTests.hs index b9cc458..6cf36e4 100644 --- a/tests/UnitTests.hs +++ b/tests/UnitTests.hs @@ -145,6 +145,8 @@ positionalTests = decodeWithTests = [ ("tab-delim", defDec { decDelimiter = 9 }, "1\t2", [["1", "2"]]) , ("mixed-space", spaceDec, " 88 c \t 0.4 ", [["88", "c", "0.4"]]) + , ("multiline-space", spaceDec, " 11 22 \n 11 22", [ ["11","22"] + , ["11","22"]]) ] encodeTest (name, input, expected) = From 6b8e1ddbf564561686446fe9343dbd55893815c8 Mon Sep 17 00:00:00 2001 From: Alexey Khudyakov Date: Mon, 25 Feb 2013 20:48:38 +0400 Subject: [PATCH 04/13] Correctly leading/trailing spaces It's hard to strip whitespaces correctly because a) It's valid part of field for CSV so "a,b,c " -> ["a","b","c "] b) If we're using spaces as delimiter we get spurious empty field at the end fo the line "a b c " -> ["a","b","c",""] Only reliable way to strip them is to read whole line, strip spaces and parse stripped line. --- Data/Csv/Parser.hs | 50 +++++++++++++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/Data/Csv/Parser.hs b/Data/Csv/Parser.hs index b2921cf..3e745c2 100644 --- a/Data/Csv/Parser.hs +++ b/Data/Csv/Parser.hs @@ -27,10 +27,9 @@ module Data.Csv.Parser import Blaze.ByteString.Builder (fromByteString, toByteString) import Blaze.ByteString.Builder.Char.Utf8 (fromChar) -import Control.Applicative (Alternative, (*>), (<$>), (<*), (<|>), many, +import Control.Applicative (Alternative, (*>), (<$), (<$>), (<*), (<|>), optional, pure) -import Control.Monad (when) -import Data.Attoparsec.Char8 (char, endOfInput, endOfLine, space) +import Data.Attoparsec.Char8 (char, endOfInput, endOfLine) import qualified Data.Attoparsec as A import qualified Data.Attoparsec.Lazy as AL import Data.Attoparsec.Types (Parser) @@ -145,22 +144,31 @@ removeBlankLines = filter (not . blankLine) -- most likely want to use the 'endOfLine' parser in combination with -- this parser. record :: DecodeOptions -> AL.Parser Record -record !opts = do - when trim skipSpaces - fs <- field delim `sepBy1'` delimiter - when trim skipSpaces - return $! V.fromList fs +record !opts + -- If we need to trim spaces from line only robust way to do so is + -- to read whole line, remove spaces and run record parser on + -- trimmed line. For example: + -- + -- + "a,b,c " will be parsed as ["a","b","c "] since spaces are + -- allowed in field + -- + "a b c " will be parsed as ["a","b","c",""] if we use space + -- as separator. + | decTrimRecordSpace opts = do + AL.skipMany $ AL.satisfy isSpace + line <- AL.takeWhile $ \c -> c /= newline && c /= cr + let (dat,_) = S.spanEnd isSpace line + case AL.parseOnly parser dat of + Left e -> fail e + Right x -> return x + | otherwise = parser where - trim = decTrimRecordSpace opts delim = decDelimiter opts - delimiter | decMergeDelimiters opts = A.skipMany1 (A.word8 delim) *> pure delim - | otherwise = A.word8 delim + delimiter | decMergeDelimiters opts = A.skipMany1 (A.word8 delim) + | otherwise = () <$ A.word8 delim + parser = do fs <- field delim `sepBy1'` delimiter + return $! V.fromList fs {-# INLINE record #-} -skipSpaces :: AL.Parser () -skipSpaces = scan - where scan = (space *> scan) <|> pure () - -- | Parse a field. The field may be in either the escaped or -- non-escaped format. The return value is unescaped. field :: Word8 -> AL.Parser Field @@ -212,7 +220,13 @@ unescape = toByteString <$!> go mempty where then return (acc `mappend` fromByteString h) else rest -doubleQuote, newline, cr :: Word8 +doubleQuote, newline, cr, space, tab :: Word8 doubleQuote = 34 -newline = 10 -cr = 13 +newline = 10 +cr = 13 +space = 32 +tab = 9 + +isSpace :: Word8 -> Bool +isSpace c = c == space || c == tab +{-# INLINE isSpace #-} From 6fbc77f89a5576ac35cb9b891e81641d3626acab Mon Sep 17 00:00:00 2001 From: Alexey Khudyakov Date: Mon, 25 Feb 2013 21:00:15 +0400 Subject: [PATCH 05/13] parser for header is same as parser for record --- Data/Csv/Encoding.hs | 4 ++-- Data/Csv/Incremental.hs | 2 +- Data/Csv/Parser.hs | 7 ++++--- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/Data/Csv/Encoding.hs b/Data/Csv/Encoding.hs index 5eb8f8b..d0e846d 100644 --- a/Data/Csv/Encoding.hs +++ b/Data/Csv/Encoding.hs @@ -122,7 +122,7 @@ decodeWithC :: (DecodeOptions -> AL.Parser a) -> DecodeOptions -> Bool -> BL8.ByteString -> Either String a decodeWithC p !opts skipHeader = decodeWithP parser where parser - | skipHeader = header (decDelimiter opts) *> p opts + | skipHeader = header opts *> p opts | otherwise = p opts {-# INLINE decodeWithC #-} @@ -268,7 +268,7 @@ csv !opts = do csvWithHeader :: FromNamedRecord a => DecodeOptions -> AL.Parser (Header, V.Vector a) csvWithHeader !opts = do - !hdr <- header (decDelimiter opts) + !hdr <- header opts vals <- records hdr _ <- optional endOfLine endOfInput diff --git a/Data/Csv/Incremental.hs b/Data/Csv/Incremental.hs index 6b42ee3..17c0660 100644 --- a/Data/Csv/Incremental.hs +++ b/Data/Csv/Incremental.hs @@ -133,7 +133,7 @@ decodeHeader = decodeHeaderWith defaultDecodeOptions decodeHeaderWith :: DecodeOptions -> HeaderParser B.ByteString decodeHeaderWith !opts = PartialH (go . parser) where - parser = A.parse (header $ decDelimiter opts) + parser = A.parse (header opts) go (A.Fail rest _ msg) = FailH rest err where err = "parse error (" ++ msg ++ ")" diff --git a/Data/Csv/Parser.hs b/Data/Csv/Parser.hs index 3e745c2..f792072 100644 --- a/Data/Csv/Parser.hs +++ b/Data/Csv/Parser.hs @@ -117,7 +117,7 @@ sepBy1' p s = go -- | Parse a CSV file that includes a header. csvWithHeader :: DecodeOptions -> AL.Parser (Header, V.Vector NamedRecord) csvWithHeader !opts = do - !hdr <- header (decDelimiter opts) + !hdr <- header opts vals <- map (toNamedRecord hdr) . removeBlankLines <$> (record opts) `sepBy1'` endOfLine _ <- optional endOfLine @@ -126,9 +126,10 @@ csvWithHeader !opts = do return (hdr, v) -- | Parse a header, including the terminating line separator. -header :: Word8 -- ^ Field delimiter +header :: DecodeOptions -- ^ Field delimiter -> AL.Parser Header -header !delim = V.fromList <$!> name delim `sepBy1'` (A.word8 delim) <* endOfLine +header = record +{-# INLINE header #-} -- | Parse a header name. Header names have the same format as regular -- 'field's. From 0a60b79f57afe05a0f30eead3f212952d3f87a7b Mon Sep 17 00:00:00 2001 From: Alexey Khudyakov Date: Mon, 25 Feb 2013 21:06:20 +0400 Subject: [PATCH 06/13] Pass DecodeOptions to the field and name parsers It's necessary for implementing delimiters which are not single character --- Data/Csv/Parser.hs | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/Data/Csv/Parser.hs b/Data/Csv/Parser.hs index f792072..34c950f 100644 --- a/Data/Csv/Parser.hs +++ b/Data/Csv/Parser.hs @@ -133,8 +133,8 @@ header = record -- | Parse a header name. Header names have the same format as regular -- 'field's. -name :: Word8 -> AL.Parser Name -name !delim = field delim +name :: DecodeOptions -> AL.Parser Name +name = field removeBlankLines :: [Record] -> [Record] removeBlankLines = filter (not . blankLine) @@ -166,20 +166,20 @@ record !opts delim = decDelimiter opts delimiter | decMergeDelimiters opts = A.skipMany1 (A.word8 delim) | otherwise = () <$ A.word8 delim - parser = do fs <- field delim `sepBy1'` delimiter + parser = do fs <- field opts `sepBy1'` delimiter return $! V.fromList fs {-# INLINE record #-} -- | Parse a field. The field may be in either the escaped or -- non-escaped format. The return value is unescaped. -field :: Word8 -> AL.Parser Field -field !delim = do +field :: DecodeOptions -> AL.Parser Field +field !opt = do mb <- A.peekWord8 -- We purposely don't use <|> as we want to commit to the first -- choice if we see a double quote. case mb of Just b | b == doubleQuote -> escapedField - _ -> unescapedField delim + _ -> unescapedField opt {-# INLINE field #-} escapedField :: AL.Parser S.ByteString @@ -197,11 +197,13 @@ escapedField = do Left err -> fail err else return s -unescapedField :: Word8 -> AL.Parser S.ByteString -unescapedField !delim = A.takeWhile (\ c -> c /= doubleQuote && - c /= newline && - c /= delim && - c /= cr) +unescapedField :: DecodeOptions -> AL.Parser S.ByteString +unescapedField !opt = A.takeWhile (\ c -> c /= doubleQuote && + c /= newline && + c /= delim && + c /= cr) + where + delim = decDelimiter opt dquote :: AL.Parser Char dquote = char '"' From 24f375ea05c87b474b9f36d6ec0fc290c6ec0a3c Mon Sep 17 00:00:00 2001 From: Alexey Khudyakov Date: Thu, 28 Feb 2013 12:39:19 +0400 Subject: [PATCH 07/13] Use predicate on delimiter character. Now all tests are passing. --- Data/Csv/Parser.hs | 18 +++++++++--------- tests/UnitTests.hs | 16 ++++++++-------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/Data/Csv/Parser.hs b/Data/Csv/Parser.hs index 34c950f..c79e1a8 100644 --- a/Data/Csv/Parser.hs +++ b/Data/Csv/Parser.hs @@ -56,7 +56,7 @@ import Data.Csv.Util ((<$!>), blankLine) -- > } data DecodeOptions = DecodeOptions { -- | Field delimiter. - decDelimiter :: {-# UNPACK #-} !Word8 + decDelimiter :: Word8 -> Bool -- | Runs of consecutive delimiters are regarded as a single -- delimiter. This is useful e.g. when parsing white space @@ -67,14 +67,14 @@ data DecodeOptions = DecodeOptions -- end of each record (but not at the begining and end of each -- field). , decTrimRecordSpace :: !Bool - } deriving (Eq, Show) + } -- TODO: Document default values in defaultDecodeOptions -- | Decoding options for parsing CSV files. defaultDecodeOptions :: DecodeOptions defaultDecodeOptions = DecodeOptions - { decDelimiter = 44 -- comma + { decDelimiter = (==44) -- comma , decMergeDelimiters = False , decTrimRecordSpace = False } @@ -82,7 +82,7 @@ defaultDecodeOptions = DecodeOptions -- | Decoding options for parsing space-delimited files. spaceDecodeOptions :: DecodeOptions spaceDecodeOptions = DecodeOptions - { decDelimiter = 32 -- space + { decDelimiter = \c -> c == space || c == tab , decMergeDelimiters = True , decTrimRecordSpace = True } @@ -164,8 +164,8 @@ record !opts | otherwise = parser where delim = decDelimiter opts - delimiter | decMergeDelimiters opts = A.skipMany1 (A.word8 delim) - | otherwise = () <$ A.word8 delim + delimiter | decMergeDelimiters opts = A.skipMany1 (A.satisfy delim) + | otherwise = () <$ A.satisfy delim parser = do fs <- field opts `sepBy1'` delimiter return $! V.fromList fs {-# INLINE record #-} @@ -199,9 +199,9 @@ escapedField = do unescapedField :: DecodeOptions -> AL.Parser S.ByteString unescapedField !opt = A.takeWhile (\ c -> c /= doubleQuote && - c /= newline && - c /= delim && - c /= cr) + c /= newline && + c /= cr && + not (delim c)) where delim = decDelimiter opt diff --git a/tests/UnitTests.hs b/tests/UnitTests.hs index 6cf36e4..919d582 100644 --- a/tests/UnitTests.hs +++ b/tests/UnitTests.hs @@ -22,6 +22,8 @@ import Test.Framework.Providers.QuickCheck2 as TF import Data.Csv hiding (record) import qualified Data.Csv.Streaming as S +import Data.Csv.Parser (spaceDecodeOptions) + ------------------------------------------------------------------------ -- Parse tests @@ -143,10 +145,12 @@ positionalTests = , ("rfc4180", rfc4180Input, rfc4180Output) ] decodeWithTests = - [ ("tab-delim", defDec { decDelimiter = 9 }, "1\t2", [["1", "2"]]) + [ ("tab-delim", defDec { decDelimiter = (==9) }, "1\t2", [["1", "2"]]) , ("mixed-space", spaceDec, " 88 c \t 0.4 ", [["88", "c", "0.4"]]) , ("multiline-space", spaceDec, " 11 22 \n 11 22", [ ["11","22"] , ["11","22"]]) + , ("blankLine-space", spaceDec, "1 2\n\n3 4\n", [ ["1","2"] + , ["3","4"]]) ] encodeTest (name, input, expected) = @@ -159,13 +163,9 @@ positionalTests = testCase name $ input `decodesStreamingAs` expected streamingDecodeWithTest (name, opts, input, expected) = testCase name $ decodesWithStreamingAs opts input expected - defEnc = defaultEncodeOptions - defDec = defaultDecodeOptions - spaceDec = defaultDecodeOptions -- TODO: Use spaceDecodeOptions - { decDelimiter = 32 -- space - , decMergeDelimiters = True - , decTrimRecordSpace = True - } + defEnc = defaultEncodeOptions + defDec = defaultDecodeOptions + spaceDec = spaceDecodeOptions nameBasedTests :: [TF.Test] nameBasedTests = From f7d8d8a1f04fbbd8df606ffc9f5e5c6cd831a467 Mon Sep 17 00:00:00 2001 From: Alexey Khudyakov Date: Thu, 28 Feb 2013 18:54:09 +0400 Subject: [PATCH 08/13] Reexport spaceDecodeOptions --- Data/Csv.hs | 1 + Data/Csv/Encoding.hs | 1 + tests/UnitTests.hs | 1 - 3 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Data/Csv.hs b/Data/Csv.hs index 7cb45ec..454c62f 100644 --- a/Data/Csv.hs +++ b/Data/Csv.hs @@ -31,6 +31,7 @@ module Data.Csv -- $options , DecodeOptions(..) , defaultDecodeOptions + , spaceDecodeOptions , decodeWith , decodeByNameWith , EncodeOptions(..) diff --git a/Data/Csv/Encoding.hs b/Data/Csv/Encoding.hs index d0e846d..c75868c 100644 --- a/Data/Csv/Encoding.hs +++ b/Data/Csv/Encoding.hs @@ -20,6 +20,7 @@ module Data.Csv.Encoding -- ** Encoding and decoding options , DecodeOptions(..) , defaultDecodeOptions + , spaceDecodeOptions , decodeWith , decodeByNameWith , EncodeOptions(..) diff --git a/tests/UnitTests.hs b/tests/UnitTests.hs index 919d582..24ac684 100644 --- a/tests/UnitTests.hs +++ b/tests/UnitTests.hs @@ -22,7 +22,6 @@ import Test.Framework.Providers.QuickCheck2 as TF import Data.Csv hiding (record) import qualified Data.Csv.Streaming as S -import Data.Csv.Parser (spaceDecodeOptions) ------------------------------------------------------------------------ From 1c18502f5b74db39439f26c1c3a8047a4b84b130 Mon Sep 17 00:00:00 2001 From: Alexey Khudyakov Date: Thu, 28 Feb 2013 20:31:28 +0400 Subject: [PATCH 09/13] Inline unescapedField Now it depends on rather complex data structure and needs to be specialized by compiler --- Data/Csv/Parser.hs | 1 + 1 file changed, 1 insertion(+) diff --git a/Data/Csv/Parser.hs b/Data/Csv/Parser.hs index c79e1a8..dee0429 100644 --- a/Data/Csv/Parser.hs +++ b/Data/Csv/Parser.hs @@ -204,6 +204,7 @@ unescapedField !opt = A.takeWhile (\ c -> c /= doubleQuote && not (delim c)) where delim = decDelimiter opt +{-# INLINE unescapedField #-} dquote :: AL.Parser Char dquote = char '"' From 361703c875209a49d168ba9fdbdc3bc44a2b76f9 Mon Sep 17 00:00:00 2001 From: Alexey Khudyakov Date: Thu, 28 Feb 2013 20:22:11 +0400 Subject: [PATCH 10/13] Add options for encoding of space-delimited data Always escape delimiter. Now it's passed to escape function as parameter and not hardcoded as comma. Note correct encoding of space-delimited data depends on escaping of space character. --- Data/Csv/Encoding.hs | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/Data/Csv/Encoding.hs b/Data/Csv/Encoding.hs index c75868c..9defba4 100644 --- a/Data/Csv/Encoding.hs +++ b/Data/Csv/Encoding.hs @@ -25,6 +25,7 @@ module Data.Csv.Encoding , decodeByNameWith , EncodeOptions(..) , defaultEncodeOptions + , spaceEncodeOptions , encodeWith , encodeByNameWith ) where @@ -151,10 +152,16 @@ data EncodeOptions = EncodeOptions encDelimiter :: {-# UNPACK #-} !Word8 } deriving (Eq, Show) --- | Encoding options for CSV files. +-- | Encoding options for CSV files. Comma is used as separator. defaultEncodeOptions :: EncodeOptions defaultEncodeOptions = EncodeOptions - { encDelimiter = 44 -- comma + { encDelimiter = 44 -- comma + } + +-- | Encode options for space-delimited files. Tab is used as separator. +spaceEncodeOptions :: EncodeOptions +spaceEncodeOptions = EncodeOptions + { encDelimiter = 9 -- tab } -- | Like 'encode', but lets you customize how the CSV data is @@ -168,13 +175,13 @@ encodeWith opts = toLazyByteString encodeRecord :: Word8 -> Record -> Builder encodeRecord delim = mconcat . intersperse (fromWord8 delim) - . map fromByteString . map escape . V.toList + . map fromByteString . map (escape delim) . V.toList {-# INLINE encodeRecord #-} -- TODO: Optimize -escape :: B.ByteString -> B.ByteString -escape s - | B.find (\ b -> b == dquote || b == comma || b == nl || b == cr || +escape :: Word8 -> B.ByteString -> B.ByteString +escape delim s + | B.find (\ b -> b == dquote || b == delim || b == nl || b == cr || b == sp) s == Nothing = s | otherwise = B.concat ["\"", @@ -182,11 +189,11 @@ escape s (\ b -> if b == dquote then "\"\"" else B.singleton b) s, "\""] where + sp = 32 dquote = 34 - comma = 44 nl = 10 cr = 13 - sp = 32 + -- | Like 'encodeByName', but lets you customize how the CSV data is -- encoded. From fa7fe10a094534af71c1ad94098dfc888e50f9bc Mon Sep 17 00:00:00 2001 From: Alexey Khudyakov Date: Fri, 8 Mar 2013 18:58:47 +0400 Subject: [PATCH 11/13] Reexport spaceEncodingOptions from Data.Csv --- Data/Csv.hs | 1 + 1 file changed, 1 insertion(+) diff --git a/Data/Csv.hs b/Data/Csv.hs index 454c62f..c65ae51 100644 --- a/Data/Csv.hs +++ b/Data/Csv.hs @@ -36,6 +36,7 @@ module Data.Csv , decodeByNameWith , EncodeOptions(..) , defaultEncodeOptions + , spaceEncodeOptions , encodeWith , encodeByNameWith From 1ca1784aa50f19d14124cfd7ae8b2058c0f042b6 Mon Sep 17 00:00:00 2001 From: Alexey Khudyakov Date: Fri, 8 Mar 2013 18:59:27 +0400 Subject: [PATCH 12/13] Add tests for encoding of space delimited data All tests are passing --- tests/UnitTests.hs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/UnitTests.hs b/tests/UnitTests.hs index 24ac684..4e1a607 100644 --- a/tests/UnitTests.hs +++ b/tests/UnitTests.hs @@ -116,6 +116,14 @@ positionalTests = [ testCase "tab-delim" $ encodesWithAs (defEnc { encDelimiter = 9 }) [["1", "2"]] "1\t2\r\n" ] + , testGroup "encodeSpace" $ map (\(n,a,b) -> testCase n $ encodesWithAs spaceEncodeOptions a b) + [ ("simple", [["abc"]], "abc\r\n") + , ("leadingSpace", [[" abc"]], "\" abc\"\r\n") + , ("comma", [["abc,def"]], "abc,def\r\n") + , ("space", [["abc def"]], "\"abc def\"\r\n") + , ("tab", [["abc\tdef"]], "\"abc\tdef\"\r\n") + , ("twoFields", [["abc","def"]], "abc\tdef\r\n") + ] , testGroup "decode" $ map decodeTest decodeTests , testGroup "decodeWith" $ map decodeWithTest decodeWithTests , testGroup "streaming" From b0090850ab86a143bdfee20fff0b009f35b6c4bb Mon Sep 17 00:00:00 2001 From: Alexey Khudyakov Date: Fri, 8 Mar 2013 19:46:43 +0400 Subject: [PATCH 13/13] Document field values in defaultDecodeOptions and spaceDecodeOptions --- Data/Csv/Parser.hs | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/Data/Csv/Parser.hs b/Data/Csv/Parser.hs index dee0429..7002dc6 100644 --- a/Data/Csv/Parser.hs +++ b/Data/Csv/Parser.hs @@ -69,20 +69,30 @@ data DecodeOptions = DecodeOptions , decTrimRecordSpace :: !Bool } --- TODO: Document default values in defaultDecodeOptions - --- | Decoding options for parsing CSV files. +-- | Decoding options for parsing CSV files. Fields' values are set to: +-- +-- [@'decDelimiter'@] comma +-- +-- [@'decMergeDelimiters'@] false +-- +-- [@'decTrimRecordSpace'@] false defaultDecodeOptions :: DecodeOptions defaultDecodeOptions = DecodeOptions - { decDelimiter = (==44) -- comma + { decDelimiter = (==44) -- comma , decMergeDelimiters = False , decTrimRecordSpace = False } --- | Decoding options for parsing space-delimited files. +-- | Decoding options for parsing space-delimited files. Fields' values are set to: +-- +-- [@'decDelimiter'@] space or tab character. +-- +-- [@'decMergeDelimiters'@] true +-- +-- [@'decTrimRecordSpace'@] true spaceDecodeOptions :: DecodeOptions spaceDecodeOptions = DecodeOptions - { decDelimiter = \c -> c == space || c == tab + { decDelimiter = \c -> c == space || c == tab , decMergeDelimiters = True , decTrimRecordSpace = True }