From f2bf695a6bba8b26a2e8b33f19021dc1d6b629d8 Mon Sep 17 00:00:00 2001
From: Johan Tibell <johan.tibell@gmail.com>
Date: Fri, 22 Feb 2013 16:54:25 -0800
Subject: [PATCH 01/13] Check-point commit

Space-delimited parsing is mostly working, except that trailing
whitespace is not dropped correctly.
---
 Data/Csv/Incremental.hs |  2 +-
 Data/Csv/Parser.hs      | 52 ++++++++++++++++++++++++++++++++++-------
 tests/UnitTests.hs      |  6 +++++
 3 files changed, 50 insertions(+), 10 deletions(-)

diff --git a/Data/Csv/Incremental.hs b/Data/Csv/Incremental.hs
index 9eeada5..6b42ee3 100644
--- a/Data/Csv/Incremental.hs
+++ b/Data/Csv/Incremental.hs
@@ -290,7 +290,7 @@ decodeWithP p !opts = go Incomplete [] . parser
             acc' | blankLine r = acc
                  | otherwise   = let !r' = convert r in r' : acc
 
-    parser = A.parse (record (decDelimiter opts) <* (endOfLine <|> endOfInput))
+    parser = A.parse (record opts <* (endOfLine <|> endOfInput))
     convert = runParser . p
 {-# INLINE decodeWithP #-}
 
diff --git a/Data/Csv/Parser.hs b/Data/Csv/Parser.hs
index 0a8c80e..b2921cf 100644
--- a/Data/Csv/Parser.hs
+++ b/Data/Csv/Parser.hs
@@ -16,6 +16,7 @@
 module Data.Csv.Parser
     ( DecodeOptions(..)
     , defaultDecodeOptions
+    , spaceDecodeOptions
     , csv
     , csvWithHeader
     , header
@@ -26,9 +27,10 @@ module Data.Csv.Parser
 
 import Blaze.ByteString.Builder (fromByteString, toByteString)
 import Blaze.ByteString.Builder.Char.Utf8 (fromChar)
-import Control.Applicative (Alternative, (*>), (<$>), (<*), (<|>), optional,
-                            pure)
-import Data.Attoparsec.Char8 (char, endOfInput, endOfLine)
+import Control.Applicative (Alternative, (*>), (<$>), (<*), (<|>), many,
+                            optional, pure)
+import Control.Monad (when)
+import Data.Attoparsec.Char8 (char, endOfInput, endOfLine, space)
 import qualified Data.Attoparsec as A
 import qualified Data.Attoparsec.Lazy as AL
 import Data.Attoparsec.Types (Parser)
@@ -56,18 +58,40 @@ import Data.Csv.Util ((<$!>), blankLine)
 data DecodeOptions = DecodeOptions
     { -- | Field delimiter.
       decDelimiter  :: {-# UNPACK #-} !Word8
+
+      -- | Runs of consecutive delimiters are regarded as a single
+      -- delimiter. This is useful e.g. when parsing white space
+      -- separated data.
+    , decMergeDelimiters :: !Bool
+
+      -- | Trim leading and trailing whitespace at the begining and
+      -- end of each record (but not at the begining and end of each
+      -- field).
+    , decTrimRecordSpace :: !Bool
     } deriving (Eq, Show)
 
+-- TODO: Document default values in defaultDecodeOptions
+
 -- | Decoding options for parsing CSV files.
 defaultDecodeOptions :: DecodeOptions
 defaultDecodeOptions = DecodeOptions
     { decDelimiter = 44  -- comma
+    , decMergeDelimiters = False
+    , decTrimRecordSpace = False
+    }
+
+-- | Decoding options for parsing space-delimited files.
+spaceDecodeOptions :: DecodeOptions
+spaceDecodeOptions = DecodeOptions
+    { decDelimiter = 32  -- space
+    , decMergeDelimiters = True
+    , decTrimRecordSpace = True
     }
 
 -- | Parse a CSV file that does not include a header.
 csv :: DecodeOptions -> AL.Parser Csv
 csv !opts = do
-    vals <- record (decDelimiter opts) `sepBy1'` endOfLine
+    vals <- record opts `sepBy1'` endOfLine
     _ <- optional endOfLine
     endOfInput
     let nonEmpty = removeBlankLines vals
@@ -96,7 +120,7 @@ csvWithHeader :: DecodeOptions -> AL.Parser (Header, V.Vector NamedRecord)
 csvWithHeader !opts = do
     !hdr <- header (decDelimiter opts)
     vals <- map (toNamedRecord hdr) . removeBlankLines <$>
-            (record (decDelimiter opts)) `sepBy1'` endOfLine
+            (record opts) `sepBy1'` endOfLine
     _ <- optional endOfLine
     endOfInput
     let !v = V.fromList vals
@@ -120,13 +144,23 @@ removeBlankLines = filter (not . blankLine)
 -- CSV file is allowed to not have a terminating line separator. You
 -- most likely want to use the 'endOfLine' parser in combination with
 -- this parser.
-record :: Word8  -- ^ Field delimiter
-       -> AL.Parser Record
-record !delim = do
-    fs <- field delim `sepBy1'` (A.word8 delim)
+record :: DecodeOptions -> AL.Parser Record
+record !opts = do
+    when trim skipSpaces
+    fs <- field delim `sepBy1'` delimiter
+    when trim skipSpaces
     return $! V.fromList fs
+  where
+    trim = decTrimRecordSpace opts
+    delim = decDelimiter opts
+    delimiter | decMergeDelimiters opts = A.skipMany1 (A.word8 delim) *> pure delim
+              | otherwise               = A.word8 delim
 {-# INLINE record #-}
 
+skipSpaces :: AL.Parser ()
+skipSpaces = scan
+  where scan = (space *> scan) <|> pure ()
+
 -- | Parse a field. The field may be in either the escaped or
 -- non-escaped format. The return value is unescaped.
 field :: Word8 -> AL.Parser Field
diff --git a/tests/UnitTests.hs b/tests/UnitTests.hs
index 2231e0c..b9cc458 100644
--- a/tests/UnitTests.hs
+++ b/tests/UnitTests.hs
@@ -144,6 +144,7 @@ positionalTests =
         ]
     decodeWithTests =
         [ ("tab-delim", defDec { decDelimiter = 9 }, "1\t2", [["1", "2"]])
+        , ("mixed-space", spaceDec, "  88 c \t  0.4  ", [["88", "c", "0.4"]])
         ]
 
     encodeTest (name, input, expected) =
@@ -158,6 +159,11 @@ positionalTests =
         testCase name $ decodesWithStreamingAs opts input expected
     defEnc = defaultEncodeOptions
     defDec = defaultDecodeOptions
+    spaceDec = defaultDecodeOptions  -- TODO: Use spaceDecodeOptions
+        { decDelimiter       = 32  -- space
+        , decMergeDelimiters = True
+        , decTrimRecordSpace = True
+        }
 
 nameBasedTests :: [TF.Test]
 nameBasedTests =

From 9eeea09549395c261e040b25c561b23b6437373f Mon Sep 17 00:00:00 2001
From: Alexey Khudyakov <alexey.skladnoy@gmail.com>
Date: Mon, 25 Feb 2013 18:52:21 +0400
Subject: [PATCH 02/13] Fix build

---
 Data/Csv/Encoding.hs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Data/Csv/Encoding.hs b/Data/Csv/Encoding.hs
index ba99d68..5eb8f8b 100644
--- a/Data/Csv/Encoding.hs
+++ b/Data/Csv/Encoding.hs
@@ -254,7 +254,7 @@ csv !opts = do
     return $! V.fromList vals
   where
     records = do
-        !r <- record (decDelimiter opts)
+        !r <- record opts
         if blankLine r
             then (endOfLine *> records) <|> pure []
             else case runParser (parseRecord r) of
@@ -276,7 +276,7 @@ csvWithHeader !opts = do
     return (hdr, v)
   where
     records hdr = do
-        !r <- record (decDelimiter opts)
+        !r <- record opts
         if blankLine r
             then (endOfLine *> records hdr) <|> pure []
             else case runParser (convert hdr r) of

From d217e27d335a3fb589ef8b081df85b20b5696037 Mon Sep 17 00:00:00 2001
From: Alexey Khudyakov <alexey.skladnoy@gmail.com>
Date: Mon, 25 Feb 2013 18:59:26 +0400
Subject: [PATCH 03/13] Add more failing tests

---
 tests/UnitTests.hs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/UnitTests.hs b/tests/UnitTests.hs
index b9cc458..6cf36e4 100644
--- a/tests/UnitTests.hs
+++ b/tests/UnitTests.hs
@@ -145,6 +145,8 @@ positionalTests =
     decodeWithTests =
         [ ("tab-delim", defDec { decDelimiter = 9 }, "1\t2", [["1", "2"]])
         , ("mixed-space", spaceDec, "  88 c \t  0.4  ", [["88", "c", "0.4"]])
+        , ("multiline-space", spaceDec, " 11 22 \n 11 22", [ ["11","22"]
+                                                           , ["11","22"]])
         ]
 
     encodeTest (name, input, expected) =

From 6b8e1ddbf564561686446fe9343dbd55893815c8 Mon Sep 17 00:00:00 2001
From: Alexey Khudyakov <alexey.skladnoy@gmail.com>
Date: Mon, 25 Feb 2013 20:48:38 +0400
Subject: [PATCH 04/13] Correctly leading/trailing spaces

It's hard to strip whitespaces correctly because
 a) It's valid part of field for CSV so
    "a,b,c " -> ["a","b","c "]
 b) If we're using spaces as delimiter we get spurious empty field
    at the end fo the line
    "a b c " -> ["a","b","c",""]

Only reliable way to strip them is to read whole line, strip spaces
and parse stripped line.
---
 Data/Csv/Parser.hs | 50 +++++++++++++++++++++++++++++-----------------
 1 file changed, 32 insertions(+), 18 deletions(-)

diff --git a/Data/Csv/Parser.hs b/Data/Csv/Parser.hs
index b2921cf..3e745c2 100644
--- a/Data/Csv/Parser.hs
+++ b/Data/Csv/Parser.hs
@@ -27,10 +27,9 @@ module Data.Csv.Parser
 
 import Blaze.ByteString.Builder (fromByteString, toByteString)
 import Blaze.ByteString.Builder.Char.Utf8 (fromChar)
-import Control.Applicative (Alternative, (*>), (<$>), (<*), (<|>), many,
+import Control.Applicative (Alternative, (*>), (<$), (<$>), (<*), (<|>),
                             optional, pure)
-import Control.Monad (when)
-import Data.Attoparsec.Char8 (char, endOfInput, endOfLine, space)
+import Data.Attoparsec.Char8 (char, endOfInput, endOfLine)
 import qualified Data.Attoparsec as A
 import qualified Data.Attoparsec.Lazy as AL
 import Data.Attoparsec.Types (Parser)
@@ -145,22 +144,31 @@ removeBlankLines = filter (not . blankLine)
 -- most likely want to use the 'endOfLine' parser in combination with
 -- this parser.
 record :: DecodeOptions -> AL.Parser Record
-record !opts = do
-    when trim skipSpaces
-    fs <- field delim `sepBy1'` delimiter
-    when trim skipSpaces
-    return $! V.fromList fs
+record !opts
+    -- If we need to trim spaces from line only robust way to do so is
+    -- to read whole line, remove spaces and run record parser on
+    -- trimmed line. For example:
+    --
+    -- + "a,b,c " will be parsed as ["a","b","c "] since spaces are
+    --   allowed in field
+    -- + "a b c " will be parsed as ["a","b","c",""] if we use space
+    --   as separator.
+    | decTrimRecordSpace opts = do
+        AL.skipMany $ AL.satisfy isSpace
+        line <- AL.takeWhile $ \c -> c /= newline && c /= cr
+        let (dat,_) = S.spanEnd isSpace line
+        case AL.parseOnly parser dat of
+          Left  e -> fail e
+          Right x -> return x
+    | otherwise              = parser
   where
-    trim = decTrimRecordSpace opts
     delim = decDelimiter opts
-    delimiter | decMergeDelimiters opts = A.skipMany1 (A.word8 delim) *> pure delim
-              | otherwise               = A.word8 delim
+    delimiter | decMergeDelimiters opts = A.skipMany1 (A.word8 delim)
+              | otherwise               = () <$ A.word8 delim
+    parser = do fs <- field delim `sepBy1'` delimiter
+                return $! V.fromList fs
 {-# INLINE record #-}
 
-skipSpaces :: AL.Parser ()
-skipSpaces = scan
-  where scan = (space *> scan) <|> pure ()
-
 -- | Parse a field. The field may be in either the escaped or
 -- non-escaped format. The return value is unescaped.
 field :: Word8 -> AL.Parser Field
@@ -212,7 +220,13 @@ unescape = toByteString <$!> go mempty where
       then return (acc `mappend` fromByteString h)
       else rest
 
-doubleQuote, newline, cr :: Word8
+doubleQuote, newline, cr, space, tab :: Word8
 doubleQuote = 34
-newline = 10
-cr = 13
+newline     = 10
+cr          = 13
+space       = 32
+tab         = 9
+
+isSpace :: Word8 -> Bool
+isSpace c = c == space || c == tab
+{-# INLINE isSpace #-}

From 6fbc77f89a5576ac35cb9b891e81641d3626acab Mon Sep 17 00:00:00 2001
From: Alexey Khudyakov <alexey.skladnoy@gmail.com>
Date: Mon, 25 Feb 2013 21:00:15 +0400
Subject: [PATCH 05/13] parser for header is same as parser for record

---
 Data/Csv/Encoding.hs    | 4 ++--
 Data/Csv/Incremental.hs | 2 +-
 Data/Csv/Parser.hs      | 7 ++++---
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/Data/Csv/Encoding.hs b/Data/Csv/Encoding.hs
index 5eb8f8b..d0e846d 100644
--- a/Data/Csv/Encoding.hs
+++ b/Data/Csv/Encoding.hs
@@ -122,7 +122,7 @@ decodeWithC :: (DecodeOptions -> AL.Parser a) -> DecodeOptions -> Bool
             -> BL8.ByteString -> Either String a
 decodeWithC p !opts skipHeader = decodeWithP parser
   where parser
-            | skipHeader = header (decDelimiter opts) *> p opts
+            | skipHeader = header opts *> p opts
             | otherwise  = p opts
 {-# INLINE decodeWithC #-}
 
@@ -268,7 +268,7 @@ csv !opts = do
 csvWithHeader :: FromNamedRecord a => DecodeOptions
               -> AL.Parser (Header, V.Vector a)
 csvWithHeader !opts = do
-    !hdr <- header (decDelimiter opts)
+    !hdr <- header  opts
     vals <- records hdr
     _ <- optional endOfLine
     endOfInput
diff --git a/Data/Csv/Incremental.hs b/Data/Csv/Incremental.hs
index 6b42ee3..17c0660 100644
--- a/Data/Csv/Incremental.hs
+++ b/Data/Csv/Incremental.hs
@@ -133,7 +133,7 @@ decodeHeader = decodeHeaderWith defaultDecodeOptions
 decodeHeaderWith :: DecodeOptions -> HeaderParser B.ByteString
 decodeHeaderWith !opts = PartialH (go . parser)
   where
-    parser = A.parse (header $ decDelimiter opts)
+    parser = A.parse (header opts)
 
     go (A.Fail rest _ msg) = FailH rest err
       where err = "parse error (" ++ msg ++ ")"
diff --git a/Data/Csv/Parser.hs b/Data/Csv/Parser.hs
index 3e745c2..f792072 100644
--- a/Data/Csv/Parser.hs
+++ b/Data/Csv/Parser.hs
@@ -117,7 +117,7 @@ sepBy1' p s = go
 -- | Parse a CSV file that includes a header.
 csvWithHeader :: DecodeOptions -> AL.Parser (Header, V.Vector NamedRecord)
 csvWithHeader !opts = do
-    !hdr <- header (decDelimiter opts)
+    !hdr <- header opts
     vals <- map (toNamedRecord hdr) . removeBlankLines <$>
             (record opts) `sepBy1'` endOfLine
     _ <- optional endOfLine
@@ -126,9 +126,10 @@ csvWithHeader !opts = do
     return (hdr, v)
 
 -- | Parse a header, including the terminating line separator.
-header :: Word8  -- ^ Field delimiter
+header :: DecodeOptions  -- ^ Field delimiter
        -> AL.Parser Header
-header !delim = V.fromList <$!> name delim `sepBy1'` (A.word8 delim) <* endOfLine
+header = record
+{-# INLINE header #-}
 
 -- | Parse a header name. Header names have the same format as regular
 -- 'field's.

From 0a60b79f57afe05a0f30eead3f212952d3f87a7b Mon Sep 17 00:00:00 2001
From: Alexey Khudyakov <alexey.skladnoy@gmail.com>
Date: Mon, 25 Feb 2013 21:06:20 +0400
Subject: [PATCH 06/13] Pass DecodeOptions to the field and name parsers

It's necessary for implementing delimiters which are not single
character
---
 Data/Csv/Parser.hs | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/Data/Csv/Parser.hs b/Data/Csv/Parser.hs
index f792072..34c950f 100644
--- a/Data/Csv/Parser.hs
+++ b/Data/Csv/Parser.hs
@@ -133,8 +133,8 @@ header = record
 
 -- | Parse a header name. Header names have the same format as regular
 -- 'field's.
-name :: Word8 -> AL.Parser Name
-name !delim = field delim
+name :: DecodeOptions -> AL.Parser Name
+name = field
 
 removeBlankLines :: [Record] -> [Record]
 removeBlankLines = filter (not . blankLine)
@@ -166,20 +166,20 @@ record !opts
     delim = decDelimiter opts
     delimiter | decMergeDelimiters opts = A.skipMany1 (A.word8 delim)
               | otherwise               = () <$ A.word8 delim
-    parser = do fs <- field delim `sepBy1'` delimiter
+    parser = do fs <- field opts `sepBy1'` delimiter
                 return $! V.fromList fs
 {-# INLINE record #-}
 
 -- | Parse a field. The field may be in either the escaped or
 -- non-escaped format. The return value is unescaped.
-field :: Word8 -> AL.Parser Field
-field !delim = do
+field :: DecodeOptions -> AL.Parser Field
+field !opt = do
     mb <- A.peekWord8
     -- We purposely don't use <|> as we want to commit to the first
     -- choice if we see a double quote.
     case mb of
         Just b | b == doubleQuote -> escapedField
-        _                         -> unescapedField delim
+        _                         -> unescapedField opt
 {-# INLINE field #-}
 
 escapedField :: AL.Parser S.ByteString
@@ -197,11 +197,13 @@ escapedField = do
             Left err -> fail err
         else return s
 
-unescapedField :: Word8 -> AL.Parser S.ByteString
-unescapedField !delim = A.takeWhile (\ c -> c /= doubleQuote &&
-                                            c /= newline &&
-                                            c /= delim &&
-                                            c /= cr)
+unescapedField :: DecodeOptions -> AL.Parser S.ByteString
+unescapedField !opt = A.takeWhile (\ c -> c /= doubleQuote &&
+                                          c /= newline &&
+                                          c /= delim &&
+                                          c /= cr)
+  where
+    delim = decDelimiter opt
 
 dquote :: AL.Parser Char
 dquote = char '"'

From 24f375ea05c87b474b9f36d6ec0fc290c6ec0a3c Mon Sep 17 00:00:00 2001
From: Alexey Khudyakov <alexey.skladnoy@gmail.com>
Date: Thu, 28 Feb 2013 12:39:19 +0400
Subject: [PATCH 07/13] Use predicate on delimiter character.

Now all tests are passing.
---
 Data/Csv/Parser.hs | 18 +++++++++---------
 tests/UnitTests.hs | 16 ++++++++--------
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/Data/Csv/Parser.hs b/Data/Csv/Parser.hs
index 34c950f..c79e1a8 100644
--- a/Data/Csv/Parser.hs
+++ b/Data/Csv/Parser.hs
@@ -56,7 +56,7 @@ import Data.Csv.Util ((<$!>), blankLine)
 -- >     }
 data DecodeOptions = DecodeOptions
     { -- | Field delimiter.
-      decDelimiter  :: {-# UNPACK #-} !Word8
+      decDelimiter  :: Word8 -> Bool
 
       -- | Runs of consecutive delimiters are regarded as a single
       -- delimiter. This is useful e.g. when parsing white space
@@ -67,14 +67,14 @@ data DecodeOptions = DecodeOptions
       -- end of each record (but not at the begining and end of each
       -- field).
     , decTrimRecordSpace :: !Bool
-    } deriving (Eq, Show)
+    }
 
 -- TODO: Document default values in defaultDecodeOptions
 
 -- | Decoding options for parsing CSV files.
 defaultDecodeOptions :: DecodeOptions
 defaultDecodeOptions = DecodeOptions
-    { decDelimiter = 44  -- comma
+    { decDelimiter = (==44)  -- comma
     , decMergeDelimiters = False
     , decTrimRecordSpace = False
     }
@@ -82,7 +82,7 @@ defaultDecodeOptions = DecodeOptions
 -- | Decoding options for parsing space-delimited files.
 spaceDecodeOptions :: DecodeOptions
 spaceDecodeOptions = DecodeOptions
-    { decDelimiter = 32  -- space
+    { decDelimiter = \c -> c == space || c == tab
     , decMergeDelimiters = True
     , decTrimRecordSpace = True
     }
@@ -164,8 +164,8 @@ record !opts
     | otherwise              = parser
   where
     delim = decDelimiter opts
-    delimiter | decMergeDelimiters opts = A.skipMany1 (A.word8 delim)
-              | otherwise               = () <$ A.word8 delim
+    delimiter | decMergeDelimiters opts = A.skipMany1 (A.satisfy delim)
+              | otherwise               = () <$ A.satisfy delim
     parser = do fs <- field opts `sepBy1'` delimiter
                 return $! V.fromList fs
 {-# INLINE record #-}
@@ -199,9 +199,9 @@ escapedField = do
 
 unescapedField :: DecodeOptions -> AL.Parser S.ByteString
 unescapedField !opt = A.takeWhile (\ c -> c /= doubleQuote &&
-                                          c /= newline &&
-                                          c /= delim &&
-                                          c /= cr)
+                                          c /= newline     &&
+                                          c /= cr          &&
+                                          not (delim c))
   where
     delim = decDelimiter opt
 
diff --git a/tests/UnitTests.hs b/tests/UnitTests.hs
index 6cf36e4..919d582 100644
--- a/tests/UnitTests.hs
+++ b/tests/UnitTests.hs
@@ -22,6 +22,8 @@ import Test.Framework.Providers.QuickCheck2 as TF
 
 import Data.Csv hiding (record)
 import qualified Data.Csv.Streaming as S
+import Data.Csv.Parser (spaceDecodeOptions)
+
 
 ------------------------------------------------------------------------
 -- Parse tests
@@ -143,10 +145,12 @@ positionalTests =
         , ("rfc4180", rfc4180Input, rfc4180Output)
         ]
     decodeWithTests =
-        [ ("tab-delim", defDec { decDelimiter = 9 }, "1\t2", [["1", "2"]])
+        [ ("tab-delim", defDec { decDelimiter = (==9) }, "1\t2", [["1", "2"]])
         , ("mixed-space", spaceDec, "  88 c \t  0.4  ", [["88", "c", "0.4"]])
         , ("multiline-space", spaceDec, " 11 22 \n 11 22", [ ["11","22"]
                                                            , ["11","22"]])
+        , ("blankLine-space", spaceDec, "1 2\n\n3 4\n", [ ["1","2"]
+                                                        , ["3","4"]])
         ]
 
     encodeTest (name, input, expected) =
@@ -159,13 +163,9 @@ positionalTests =
         testCase name $ input `decodesStreamingAs` expected
     streamingDecodeWithTest (name, opts, input, expected) =
         testCase name $ decodesWithStreamingAs opts input expected
-    defEnc = defaultEncodeOptions
-    defDec = defaultDecodeOptions
-    spaceDec = defaultDecodeOptions  -- TODO: Use spaceDecodeOptions
-        { decDelimiter       = 32  -- space
-        , decMergeDelimiters = True
-        , decTrimRecordSpace = True
-        }
+    defEnc   = defaultEncodeOptions
+    defDec   = defaultDecodeOptions
+    spaceDec = spaceDecodeOptions
 
 nameBasedTests :: [TF.Test]
 nameBasedTests =

From f7d8d8a1f04fbbd8df606ffc9f5e5c6cd831a467 Mon Sep 17 00:00:00 2001
From: Alexey Khudyakov <alexey.skladnoy@gmail.com>
Date: Thu, 28 Feb 2013 18:54:09 +0400
Subject: [PATCH 08/13] Reexport spaceDecodeOptions

---
 Data/Csv.hs          | 1 +
 Data/Csv/Encoding.hs | 1 +
 tests/UnitTests.hs   | 1 -
 3 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/Data/Csv.hs b/Data/Csv.hs
index 7cb45ec..454c62f 100644
--- a/Data/Csv.hs
+++ b/Data/Csv.hs
@@ -31,6 +31,7 @@ module Data.Csv
     -- $options
     , DecodeOptions(..)
     , defaultDecodeOptions
+    , spaceDecodeOptions
     , decodeWith
     , decodeByNameWith
     , EncodeOptions(..)
diff --git a/Data/Csv/Encoding.hs b/Data/Csv/Encoding.hs
index d0e846d..c75868c 100644
--- a/Data/Csv/Encoding.hs
+++ b/Data/Csv/Encoding.hs
@@ -20,6 +20,7 @@ module Data.Csv.Encoding
     -- ** Encoding and decoding options
     , DecodeOptions(..)
     , defaultDecodeOptions
+    , spaceDecodeOptions
     , decodeWith
     , decodeByNameWith
     , EncodeOptions(..)
diff --git a/tests/UnitTests.hs b/tests/UnitTests.hs
index 919d582..24ac684 100644
--- a/tests/UnitTests.hs
+++ b/tests/UnitTests.hs
@@ -22,7 +22,6 @@ import Test.Framework.Providers.QuickCheck2 as TF
 
 import Data.Csv hiding (record)
 import qualified Data.Csv.Streaming as S
-import Data.Csv.Parser (spaceDecodeOptions)
 
 
 ------------------------------------------------------------------------

From 1c18502f5b74db39439f26c1c3a8047a4b84b130 Mon Sep 17 00:00:00 2001
From: Alexey Khudyakov <alexey.skladnoy@gmail.com>
Date: Thu, 28 Feb 2013 20:31:28 +0400
Subject: [PATCH 09/13] Inline unescapedField

Now it depends on rather complex data structure and needs to be
specialized by compiler
---
 Data/Csv/Parser.hs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Data/Csv/Parser.hs b/Data/Csv/Parser.hs
index c79e1a8..dee0429 100644
--- a/Data/Csv/Parser.hs
+++ b/Data/Csv/Parser.hs
@@ -204,6 +204,7 @@ unescapedField !opt = A.takeWhile (\ c -> c /= doubleQuote &&
                                           not (delim c))
   where
     delim = decDelimiter opt
+{-# INLINE unescapedField #-}
 
 dquote :: AL.Parser Char
 dquote = char '"'

From 361703c875209a49d168ba9fdbdc3bc44a2b76f9 Mon Sep 17 00:00:00 2001
From: Alexey Khudyakov <alexey.skladnoy@gmail.com>
Date: Thu, 28 Feb 2013 20:22:11 +0400
Subject: [PATCH 10/13] Add options for encoding of space-delimited data

Always escape delimiter. Now it's passed to escape function as
parameter and not hardcoded as comma.

Note correct encoding of space-delimited data depends on escaping
of space character.
---
 Data/Csv/Encoding.hs | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/Data/Csv/Encoding.hs b/Data/Csv/Encoding.hs
index c75868c..9defba4 100644
--- a/Data/Csv/Encoding.hs
+++ b/Data/Csv/Encoding.hs
@@ -25,6 +25,7 @@ module Data.Csv.Encoding
     , decodeByNameWith
     , EncodeOptions(..)
     , defaultEncodeOptions
+    , spaceEncodeOptions
     , encodeWith
     , encodeByNameWith
     ) where
@@ -151,10 +152,16 @@ data EncodeOptions = EncodeOptions
       encDelimiter  :: {-# UNPACK #-} !Word8
     } deriving (Eq, Show)
 
--- | Encoding options for CSV files.
+-- | Encoding options for CSV files. Comma is used as separator.
 defaultEncodeOptions :: EncodeOptions
 defaultEncodeOptions = EncodeOptions
-    { encDelimiter = 44  -- comma
+    { encDelimiter   = 44  -- comma
+    }
+
+-- | Encode options for space-delimited files. Tab is used as separator.
+spaceEncodeOptions :: EncodeOptions
+spaceEncodeOptions = EncodeOptions
+    { encDelimiter   = 9       -- tab
     }
 
 -- | Like 'encode', but lets you customize how the CSV data is
@@ -168,13 +175,13 @@ encodeWith opts = toLazyByteString
 
 encodeRecord :: Word8 -> Record -> Builder
 encodeRecord delim = mconcat . intersperse (fromWord8 delim)
-                     . map fromByteString . map escape . V.toList
+                     . map fromByteString . map (escape delim) . V.toList
 {-# INLINE encodeRecord #-}
 
 -- TODO: Optimize
-escape :: B.ByteString -> B.ByteString
-escape s
-    | B.find (\ b -> b == dquote || b == comma || b == nl || b == cr ||
+escape :: Word8 -> B.ByteString -> B.ByteString
+escape delim s
+    | B.find (\ b -> b == dquote || b == delim || b == nl || b == cr ||
                      b == sp) s == Nothing = s
     | otherwise =
         B.concat ["\"",
@@ -182,11 +189,11 @@ escape s
                   (\ b -> if b == dquote then "\"\"" else B.singleton b) s,
                   "\""]
   where
+    sp     = 32
     dquote = 34
-    comma  = 44
     nl     = 10
     cr     = 13
-    sp     = 32
+
 
 -- | Like 'encodeByName', but lets you customize how the CSV data is
 -- encoded.

From fa7fe10a094534af71c1ad94098dfc888e50f9bc Mon Sep 17 00:00:00 2001
From: Alexey Khudyakov <alexey.skladnoy@gmail.com>
Date: Fri, 8 Mar 2013 18:58:47 +0400
Subject: [PATCH 11/13] Reexport spaceEncodingOptions from Data.Csv

---
 Data/Csv.hs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Data/Csv.hs b/Data/Csv.hs
index 454c62f..c65ae51 100644
--- a/Data/Csv.hs
+++ b/Data/Csv.hs
@@ -36,6 +36,7 @@ module Data.Csv
     , decodeByNameWith
     , EncodeOptions(..)
     , defaultEncodeOptions
+    , spaceEncodeOptions
     , encodeWith
     , encodeByNameWith
 

From 1ca1784aa50f19d14124cfd7ae8b2058c0f042b6 Mon Sep 17 00:00:00 2001
From: Alexey Khudyakov <alexey.skladnoy@gmail.com>
Date: Fri, 8 Mar 2013 18:59:27 +0400
Subject: [PATCH 12/13] Add tests for encoding of space delimited data

All tests are passing
---
 tests/UnitTests.hs | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tests/UnitTests.hs b/tests/UnitTests.hs
index 24ac684..4e1a607 100644
--- a/tests/UnitTests.hs
+++ b/tests/UnitTests.hs
@@ -116,6 +116,14 @@ positionalTests =
       [ testCase "tab-delim" $ encodesWithAs (defEnc { encDelimiter = 9 })
         [["1", "2"]] "1\t2\r\n"
       ]
+    , testGroup "encodeSpace" $ map (\(n,a,b) -> testCase n $ encodesWithAs spaceEncodeOptions a b)
+      [ ("simple",       [["abc"]],          "abc\r\n")
+      , ("leadingSpace", [[" abc"]],         "\" abc\"\r\n")
+      , ("comma",        [["abc,def"]],      "abc,def\r\n")
+      , ("space",        [["abc def"]],      "\"abc def\"\r\n")
+      , ("tab",          [["abc\tdef"]],     "\"abc\tdef\"\r\n")
+      , ("twoFields",    [["abc","def"]],    "abc\tdef\r\n")
+      ]
     , testGroup "decode" $ map decodeTest decodeTests
     , testGroup "decodeWith" $ map decodeWithTest decodeWithTests
     , testGroup "streaming"

From b0090850ab86a143bdfee20fff0b009f35b6c4bb Mon Sep 17 00:00:00 2001
From: Alexey Khudyakov <alexey.skladnoy@gmail.com>
Date: Fri, 8 Mar 2013 19:46:43 +0400
Subject: [PATCH 13/13] Document field values in defaultDecodeOptions and
 spaceDecodeOptions

---
 Data/Csv/Parser.hs | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/Data/Csv/Parser.hs b/Data/Csv/Parser.hs
index dee0429..7002dc6 100644
--- a/Data/Csv/Parser.hs
+++ b/Data/Csv/Parser.hs
@@ -69,20 +69,30 @@ data DecodeOptions = DecodeOptions
     , decTrimRecordSpace :: !Bool
     }
 
--- TODO: Document default values in defaultDecodeOptions
-
--- | Decoding options for parsing CSV files.
+-- | Decoding options for parsing CSV files. Fields' values are set to:
+--
+-- [@'decDelimiter'@] comma
+--
+-- [@'decMergeDelimiters'@] false
+--
+-- [@'decTrimRecordSpace'@] false
 defaultDecodeOptions :: DecodeOptions
 defaultDecodeOptions = DecodeOptions
-    { decDelimiter = (==44)  -- comma
+    { decDelimiter       = (==44)  -- comma
     , decMergeDelimiters = False
     , decTrimRecordSpace = False
     }
 
--- | Decoding options for parsing space-delimited files.
+-- | Decoding options for parsing space-delimited files. Fields' values are set to:
+--
+-- [@'decDelimiter'@] space or tab character.
+--
+-- [@'decMergeDelimiters'@] true
+--
+-- [@'decTrimRecordSpace'@] true
 spaceDecodeOptions :: DecodeOptions
 spaceDecodeOptions = DecodeOptions
-    { decDelimiter = \c -> c == space || c == tab
+    { decDelimiter       = \c -> c == space || c == tab
     , decMergeDelimiters = True
     , decTrimRecordSpace = True
     }