Check-point commit

Space-delimited parsing is mostly working, except that trailing whitespace is not dropped correctly.
haskell-hvr · Feb 23, 2013 · 75c23fd · 75c23fd
1 parent 9de2961
commit 75c23fd
Show file tree

Hide file tree

Showing 3 changed files with 50 additions and 10 deletions.
diff --git a/Data/Csv/Incremental.hs b/Data/Csv/Incremental.hs
@@ -295,7 +295,7 @@ decodeWithP p !opts = go Incomplete [] . parser
             acc' | blankLine r = acc
                  | otherwise   = convert r : acc
 
-    parser = A.parse (record (decDelimiter opts) <* (endOfLine <|> endOfInput))
+    parser = A.parse (record opts <* (endOfLine <|> endOfInput))
     convert = runParser . p
 {-# INLINE decodeWithP #-}
 

diff --git a/Data/Csv/Parser.hs b/Data/Csv/Parser.hs
@@ -16,6 +16,7 @@
 module Data.Csv.Parser
     ( DecodeOptions(..)
     , defaultDecodeOptions
+    , spaceDecodeOptions
     , csv
     , csvWithHeader
     , header
@@ -26,9 +27,10 @@ module Data.Csv.Parser
 
 import Blaze.ByteString.Builder (fromByteString, toByteString)
 import Blaze.ByteString.Builder.Char.Utf8 (fromChar)
-import Control.Applicative (Alternative, (*>), (<$>), (<*), (<|>), optional,
-                            pure)
-import Data.Attoparsec.Char8 (char, endOfInput, endOfLine)
+import Control.Applicative (Alternative, (*>), (<$>), (<*), (<|>), many,
+                            optional, pure)
+import Control.Monad (when)
+import Data.Attoparsec.Char8 (char, endOfInput, endOfLine, space)
 import qualified Data.Attoparsec as A
 import qualified Data.Attoparsec.Lazy as AL
 import Data.Attoparsec.Types (Parser)
@@ -57,18 +59,40 @@ import Data.Csv.Util ((<$!>))
 data DecodeOptions = DecodeOptions
     { -- | Field delimiter.
       decDelimiter  :: {-# UNPACK #-} !Word8
+
+      -- | Runs of consecutive delimiters are regarded as a single
+      -- delimiter. This is useful e.g. when parsing white space
+      -- separated data.
+    , decMergeDelimiters :: !Bool
+
+      -- | Trim leading and trailing whitespace at the begining and
+      -- end of each record (but not at the begining and end of each
+      -- field).
+    , decTrimRecordSpace :: !Bool
     } deriving (Eq, Show)
 
+-- TODO: Document default values in defaultDecodeOptions
+
 -- | Decoding options for parsing CSV files.
 defaultDecodeOptions :: DecodeOptions
 defaultDecodeOptions = DecodeOptions
     { decDelimiter = 44  -- comma
+    , decMergeDelimiters = False
+    , decTrimRecordSpace = False
+    }
+
+-- | Decoding options for parsing space-delimited files.
+spaceDecodeOptions :: DecodeOptions
+spaceDecodeOptions = DecodeOptions
+    { decDelimiter = 32  -- space
+    , decMergeDelimiters = True
+    , decTrimRecordSpace = True
     }
 
 -- | Parse a CSV file that does not include a header.
 csv :: DecodeOptions -> AL.Parser Csv
 csv !opts = do
-    vals <- record (decDelimiter opts) `sepBy1'` endOfLine
+    vals <- record opts `sepBy1'` endOfLine
     _ <- optional endOfLine
     endOfInput
     let nonEmpty = removeBlankLines vals
@@ -97,7 +121,7 @@ csvWithHeader :: DecodeOptions -> AL.Parser (Header, V.Vector NamedRecord)
 csvWithHeader !opts = do
     !hdr <- header (decDelimiter opts)
     vals <- map (toNamedRecord hdr) . removeBlankLines <$>
-            (record (decDelimiter opts)) `sepBy1'` endOfLine
+            (record opts) `sepBy1'` endOfLine
     _ <- optional endOfLine
     endOfInput
     let !v = V.fromList vals
@@ -125,13 +149,23 @@ removeBlankLines = filter (not . blankLine)
 -- CSV file is allowed to not have a terminating line separator. You
 -- most likely want to use the 'endOfLine' parser in combination with
 -- this parser.
-record :: Word8  -- ^ Field delimiter
-       -> AL.Parser Record
-record !delim = do
-    fs <- field delim `sepBy1'` (A.word8 delim)
+record :: DecodeOptions -> AL.Parser Record
+record !opts = do
+    when trim skipSpaces
+    fs <- field delim `sepBy1'` delimiter
+    when trim skipSpaces
     return $! V.fromList fs
+  where
+    trim = decTrimRecordSpace opts
+    delim = decDelimiter opts
+    delimiter | decMergeDelimiters opts = A.skipMany1 (A.word8 delim) *> pure delim
+              | otherwise               = A.word8 delim
 {-# INLINE record #-}
 
+skipSpaces :: AL.Parser ()
+skipSpaces = scan
+  where scan = (space *> scan) <|> pure ()
+
 -- | Parse a field. The field may be in either the escaped or
 -- non-escaped format. The return value is unescaped.
 field :: Word8 -> AL.Parser Field

diff --git a/tests/UnitTests.hs b/tests/UnitTests.hs
@@ -144,6 +144,7 @@ positionalTests =
         ]
     decodeWithTests =
         [ ("tab-delim", defDec { decDelimiter = 9 }, "1\t2", [["1", "2"]])
+        , ("mixed-space", spaceDec, "  88 c \t  0.4  ", [["88", "c", "0.4"]])
         ]
 
     encodeTest (name, input, expected) =
@@ -158,6 +159,11 @@ positionalTests =
         testCase name $ decodesWithStreamingAs opts input expected
     defEnc = defaultEncodeOptions
     defDec = defaultDecodeOptions
+    spaceDec = defaultDecodeOptions  -- TODO: Use spaceDecodeOptions
+        { decDelimiter       = 32  -- space
+        , decMergeDelimiters = True
+        , decTrimRecordSpace = True
+        }
 
 nameBasedTests :: [TF.Test]
 nameBasedTests =