haskell-hvr · Shimuuar · Feb 23, 2013 · Feb 25, 2013 · Feb 25, 2013 · Feb 25, 2013
diff --git a/Data/Csv.hs b/Data/Csv.hs
@@ -32,10 +32,12 @@ module Data.Csv
     -- $options
     , DecodeOptions(..)
     , defaultDecodeOptions
+    , spaceDecodeOptions
     , decodeWith
     , decodeByNameWith
     , EncodeOptions(..)
     , defaultEncodeOptions
+    , spaceEncodeOptions
     , encodeWith
     , encodeByNameWith
 

diff --git a/Data/Csv/Encoding.hs b/Data/Csv/Encoding.hs
@@ -21,10 +21,12 @@ module Data.Csv.Encoding
     -- ** Encoding and decoding options
     , DecodeOptions(..)
     , defaultDecodeOptions
+    , spaceDecodeOptions
     , decodeWith
     , decodeByNameWith
     , EncodeOptions(..)
     , defaultEncodeOptions
+    , spaceEncodeOptions
     , encodeWith
     , encodeByNameWith
     ) where
@@ -123,7 +125,7 @@ decodeWithC :: (DecodeOptions -> AL.Parser a) -> DecodeOptions -> HasHeader
             -> BL8.ByteString -> Either String a
 decodeWithC p !opts hasHeader = decodeWithP parser
   where parser = case hasHeader of
-            HasHeader -> header (decDelimiter opts) *> p opts
+            HasHeader -> header opts *> p opts
             NoHeader  -> p opts
 {-# INLINE decodeWithC #-}
 
@@ -151,10 +153,16 @@ data EncodeOptions = EncodeOptions
       encDelimiter  :: {-# UNPACK #-} !Word8
     } deriving (Eq, Show)
 
--- | Encoding options for CSV files.
+-- | Encoding options for CSV files. Comma is used as separator.
 defaultEncodeOptions :: EncodeOptions
 defaultEncodeOptions = EncodeOptions
-    { encDelimiter = 44  -- comma
+    { encDelimiter   = 44  -- comma
+    }
+
+-- | Encode options for space-delimited files. Tab is used as separator.
+spaceEncodeOptions :: EncodeOptions
+spaceEncodeOptions = EncodeOptions
+    { encDelimiter   = 9       -- tab
     }
 
 -- | Like 'encode', but lets you customize how the CSV data is
@@ -167,13 +175,13 @@ encodeWith opts = toLazyByteString
 
 encodeRecord :: Word8 -> Record -> Builder
 encodeRecord delim = mconcat . intersperse (fromWord8 delim)
-                     . map fromByteString . map escape . V.toList
+                     . map fromByteString . map (escape delim) . V.toList
 {-# INLINE encodeRecord #-}
 
 -- TODO: Optimize
-escape :: B.ByteString -> B.ByteString
-escape s
-    | B.any (\ b -> b == dquote || b == comma || b == nl || b == cr || b == sp)
+escape :: Word8 -> B.ByteString -> B.ByteString
+escape delim s
+    | B.any (\ b -> b == dquote || b == delim || b == nl || b == cr || b == sp)
         s = toByteString $
             fromWord8 dquote
             <> B.foldl
@@ -185,11 +193,11 @@ escape s
             <> fromWord8 dquote
     | otherwise = s
   where
+    sp     = 32
     dquote = 34
-    comma  = 44
     nl     = 10
     cr     = 13
-    sp     = 32
+
 
 -- | Like 'encodeByName', but lets you customize how the CSV data is
 -- encoded.
@@ -262,7 +270,7 @@ csv !opts = do
     return $! V.fromList vals
   where
     records = do
-        !r <- record (decDelimiter opts)
+        !r <- record opts
         if blankLine r
             then (endOfLine *> records) <|> pure []
             else case runParser (parseRecord r) of
@@ -276,15 +284,15 @@ csv !opts = do
 csvWithHeader :: FromNamedRecord a => DecodeOptions
               -> AL.Parser (Header, V.Vector a)
 csvWithHeader !opts = do
-    !hdr <- header (decDelimiter opts)
+    !hdr <- header  opts
     vals <- records hdr
     _ <- optional endOfLine
     endOfInput
     let !v = V.fromList vals
     return (hdr, v)
   where
     records hdr = do
-        !r <- record (decDelimiter opts)
+        !r <- record opts
         if blankLine r
             then (endOfLine *> records hdr) <|> pure []
             else case runParser (convert hdr r) of

diff --git a/Data/Csv/Incremental.hs b/Data/Csv/Incremental.hs
@@ -133,7 +133,7 @@ decodeHeader = decodeHeaderWith defaultDecodeOptions
 decodeHeaderWith :: DecodeOptions -> HeaderParser B.ByteString
 decodeHeaderWith !opts = PartialH (go . parser)
   where
-    parser = A.parse (header $ decDelimiter opts)
+    parser = A.parse (header opts)
 
     go (A.Fail rest _ msg) = FailH rest err
       where err = "parse error (" ++ msg ++ ")"
@@ -290,7 +290,7 @@ decodeWithP p !opts = go Incomplete [] . parser
             acc' | blankLine r = acc
                  | otherwise   = let !r' = convert r in r' : acc
 
-    parser = A.parse (record (decDelimiter opts) <* (endOfLine <|> endOfInput))
+    parser = A.parse (record opts <* (endOfLine <|> endOfInput))
     convert = runParser . p
 {-# INLINE decodeWithP #-}
 

diff --git a/Data/Csv/Parser.hs b/Data/Csv/Parser.hs
@@ -16,6 +16,7 @@
 module Data.Csv.Parser
     ( DecodeOptions(..)
     , defaultDecodeOptions
+    , spaceDecodeOptions
     , csv
     , csvWithHeader
     , header
@@ -26,8 +27,8 @@ module Data.Csv.Parser
 
 import Blaze.ByteString.Builder (fromByteString, toByteString)
 import Blaze.ByteString.Builder.Char.Utf8 (fromChar)
-import Control.Applicative (Alternative, (*>), (<$>), (<*), (<|>), optional,
-                            pure)
+import Control.Applicative (Alternative, (*>), (<$), (<$>), (<*), (<|>),
+                            optional, pure)
 import Data.Attoparsec.Char8 (char, endOfInput, endOfLine)
 import qualified Data.Attoparsec as A
 import qualified Data.Attoparsec.Lazy as AL
@@ -55,19 +56,51 @@ import Data.Csv.Util ((<$!>), blankLine)
 -- >     }
 data DecodeOptions = DecodeOptions
     { -- | Field delimiter.
-      decDelimiter  :: {-# UNPACK #-} !Word8
-    } deriving (Eq, Show)
+      decDelimiter  :: Word8 -> Bool
 
--- | Decoding options for parsing CSV files.
+      -- | Runs of consecutive delimiters are regarded as a single
+      -- delimiter. This is useful e.g. when parsing white space
+      -- separated data.
+    , decMergeDelimiters :: !Bool
+
+      -- | Trim leading and trailing whitespace at the begining and
+      -- end of each record (but not at the begining and end of each
+      -- field).
+    , decTrimRecordSpace :: !Bool
+    }
+
+-- | Decoding options for parsing CSV files. Fields' values are set to:
+--
+-- [@'decDelimiter'@] comma
+--
+-- [@'decMergeDelimiters'@] false
+--
+-- [@'decTrimRecordSpace'@] false
 defaultDecodeOptions :: DecodeOptions
 defaultDecodeOptions = DecodeOptions
-    { decDelimiter = 44  -- comma
+    { decDelimiter       = (==44)  -- comma
+    , decMergeDelimiters = False
+    , decTrimRecordSpace = False
+    }
+
+-- | Decoding options for parsing space-delimited files. Fields' values are set to:
+--
+-- [@'decDelimiter'@] space or tab character.
+--
+-- [@'decMergeDelimiters'@] true
+--
+-- [@'decTrimRecordSpace'@] true
+spaceDecodeOptions :: DecodeOptions
+spaceDecodeOptions = DecodeOptions
+    { decDelimiter       = \c -> c == space || c == tab
+    , decMergeDelimiters = True
+    , decTrimRecordSpace = True
     }
 
 -- | Parse a CSV file that does not include a header.
 csv :: DecodeOptions -> AL.Parser Csv
 csv !opts = do
-    vals <- record (decDelimiter opts) `sepBy1'` endOfLine
+    vals <- record opts `sepBy1'` endOfLine
     _ <- optional endOfLine
     endOfInput
     let nonEmpty = removeBlankLines vals
@@ -94,23 +127,24 @@ sepBy1' p s = go
 -- | Parse a CSV file that includes a header.
 csvWithHeader :: DecodeOptions -> AL.Parser (Header, V.Vector NamedRecord)
 csvWithHeader !opts = do
-    !hdr <- header (decDelimiter opts)
+    !hdr <- header opts
     vals <- map (toNamedRecord hdr) . removeBlankLines <$>
-            (record (decDelimiter opts)) `sepBy1'` endOfLine
+            (record opts) `sepBy1'` endOfLine
     _ <- optional endOfLine
     endOfInput
     let !v = V.fromList vals
     return (hdr, v)
 
 -- | Parse a header, including the terminating line separator.
-header :: Word8  -- ^ Field delimiter
+header :: DecodeOptions  -- ^ Field delimiter
        -> AL.Parser Header
-header !delim = V.fromList <$!> name delim `sepBy1'` (A.word8 delim) <* endOfLine
+header = record
+{-# INLINE header #-}
 
 -- | Parse a header name. Header names have the same format as regular
 -- 'field's.
-name :: Word8 -> AL.Parser Name
-name !delim = field delim
+name :: DecodeOptions -> AL.Parser Name
+name = field
 
 removeBlankLines :: [Record] -> [Record]
 removeBlankLines = filter (not . blankLine)
@@ -120,23 +154,42 @@ removeBlankLines = filter (not . blankLine)
 -- CSV file is allowed to not have a terminating line separator. You
 -- most likely want to use the 'endOfLine' parser in combination with
 -- this parser.
-record :: Word8  -- ^ Field delimiter
-       -> AL.Parser Record
-record !delim = do
-    fs <- field delim `sepBy1'` (A.word8 delim)
-    return $! V.fromList fs
+record :: DecodeOptions -> AL.Parser Record
+record !opts
+    -- If we need to trim spaces from line only robust way to do so is
+    -- to read whole line, remove spaces and run record parser on
+    -- trimmed line. For example:
+    --
+    -- + "a,b,c " will be parsed as ["a","b","c "] since spaces are
+    --   allowed in field
+    -- + "a b c " will be parsed as ["a","b","c",""] if we use space
+    --   as separator.
+    | decTrimRecordSpace opts = do
+        AL.skipMany $ AL.satisfy isSpace
+        line <- AL.takeWhile $ \c -> c /= newline && c /= cr
+        let (dat,_) = S.spanEnd isSpace line
+        case AL.parseOnly parser dat of
+          Left  e -> fail e
+          Right x -> return x
+    | otherwise              = parser
+  where
+    delim = decDelimiter opts
+    delimiter | decMergeDelimiters opts = A.skipMany1 (A.satisfy delim)
+              | otherwise               = () <$ A.satisfy delim
+    parser = do fs <- field opts `sepBy1'` delimiter
+                return $! V.fromList fs
 {-# INLINE record #-}
 
 -- | Parse a field. The field may be in either the escaped or
 -- non-escaped format. The return value is unescaped.
-field :: Word8 -> AL.Parser Field
-field !delim = do
+field :: DecodeOptions -> AL.Parser Field
+field !opt = do
     mb <- A.peekWord8
     -- We purposely don't use <|> as we want to commit to the first
     -- choice if we see a double quote.
     case mb of
         Just b | b == doubleQuote -> escapedField
-        _                         -> unescapedField delim
+        _                         -> unescapedField opt
 {-# INLINE field #-}
 
 escapedField :: AL.Parser S.ByteString
@@ -154,11 +207,14 @@ escapedField = do
             Left err -> fail err
         else return s
 
-unescapedField :: Word8 -> AL.Parser S.ByteString
-unescapedField !delim = A.takeWhile (\ c -> c /= doubleQuote &&
-                                            c /= newline &&
-                                            c /= delim &&
-                                            c /= cr)
+unescapedField :: DecodeOptions -> AL.Parser S.ByteString
+unescapedField !opt = A.takeWhile (\ c -> c /= doubleQuote &&
+                                          c /= newline     &&
+                                          c /= cr          &&
+                                          not (delim c))
+  where
+    delim = decDelimiter opt
+{-# INLINE unescapedField #-}
 
 dquote :: AL.Parser Char
 dquote = char '"'
@@ -178,7 +234,13 @@ unescape = toByteString <$!> go mempty where
       then return (acc `mappend` fromByteString h)
       else rest
 
-doubleQuote, newline, cr :: Word8
+doubleQuote, newline, cr, space, tab :: Word8
 doubleQuote = 34
-newline = 10
-cr = 13
+newline     = 10
+cr          = 13
+space       = 32
+tab         = 9
+
+isSpace :: Word8 -> Bool
+isSpace c = c == space || c == tab
+{-# INLINE isSpace #-}
diff --git a/tests/UnitTests.hs b/tests/UnitTests.hs
@@ -23,6 +23,7 @@ import Test.Framework.Providers.QuickCheck2 as TF
 import Data.Csv hiding (record)
 import qualified Data.Csv.Streaming as S
 
+
 ------------------------------------------------------------------------
 -- Parse tests
 
@@ -114,6 +115,14 @@ positionalTests =
       [ testCase "tab-delim" $ encodesWithAs (defEnc { encDelimiter = 9 })
         [["1", "2"]] "1\t2\r\n"
       ]
+    , testGroup "encodeSpace" $ map (\(n,a,b) -> testCase n $ encodesWithAs spaceEncodeOptions a b)
+      [ ("simple",       [["abc"]],          "abc\r\n")
+      , ("leadingSpace", [[" abc"]],         "\" abc\"\r\n")
+      , ("comma",        [["abc,def"]],      "abc,def\r\n")
+      , ("space",        [["abc def"]],      "\"abc def\"\r\n")
+      , ("tab",          [["abc\tdef"]],     "\"abc\tdef\"\r\n")
+      , ("twoFields",    [["abc","def"]],    "abc\tdef\r\n")
+      ]
     , testGroup "decode" $ map decodeTest decodeTests
     , testGroup "decodeWith" $ map decodeWithTest decodeWithTests
     , testGroup "streaming"
@@ -142,7 +151,12 @@ positionalTests =
         , ("rfc4180", rfc4180Input, rfc4180Output)
         ]
     decodeWithTests =
-        [ ("tab-delim", defDec { decDelimiter = 9 }, "1\t2", [["1", "2"]])
+        [ ("tab-delim", defDec { decDelimiter = (==9) }, "1\t2", [["1", "2"]])
+        , ("mixed-space", spaceDec, "  88 c \t  0.4  ", [["88", "c", "0.4"]])
+        , ("multiline-space", spaceDec, " 11 22 \n 11 22", [ ["11","22"]
+                                                           , ["11","22"]])
+        , ("blankLine-space", spaceDec, "1 2\n\n3 4\n", [ ["1","2"]
+                                                        , ["3","4"]])
         ]
 
     encodeTest (name, input, expected) =
@@ -155,8 +169,9 @@ positionalTests =
         testCase name $ input `decodesStreamingAs` expected
     streamingDecodeWithTest (name, opts, input, expected) =
         testCase name $ decodesWithStreamingAs opts input expected
-    defEnc = defaultEncodeOptions
-    defDec = defaultDecodeOptions
+    defEnc   = defaultEncodeOptions
+    defDec   = defaultDecodeOptions
+    spaceDec = spaceDecodeOptions
 
 nameBasedTests :: [TF.Test]
 nameBasedTests =