Skip to content

Commit

Permalink
Check-point commit
Browse files Browse the repository at this point in the history
Space-delimited parsing is mostly working, except that trailing
whitespace is not dropped correctly.
  • Loading branch information
tibbe committed Feb 23, 2013
1 parent 9de2961 commit 75c23fd
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 10 deletions.
2 changes: 1 addition & 1 deletion Data/Csv/Incremental.hs
Expand Up @@ -295,7 +295,7 @@ decodeWithP p !opts = go Incomplete [] . parser
acc' | blankLine r = acc
| otherwise = convert r : acc

parser = A.parse (record (decDelimiter opts) <* (endOfLine <|> endOfInput))
parser = A.parse (record opts <* (endOfLine <|> endOfInput))
convert = runParser . p
{-# INLINE decodeWithP #-}

Expand Down
52 changes: 43 additions & 9 deletions Data/Csv/Parser.hs
Expand Up @@ -16,6 +16,7 @@
module Data.Csv.Parser
( DecodeOptions(..)
, defaultDecodeOptions
, spaceDecodeOptions
, csv
, csvWithHeader
, header
Expand All @@ -26,9 +27,10 @@ module Data.Csv.Parser

import Blaze.ByteString.Builder (fromByteString, toByteString)
import Blaze.ByteString.Builder.Char.Utf8 (fromChar)
import Control.Applicative (Alternative, (*>), (<$>), (<*), (<|>), optional,
pure)
import Data.Attoparsec.Char8 (char, endOfInput, endOfLine)
import Control.Applicative (Alternative, (*>), (<$>), (<*), (<|>), many,
optional, pure)
import Control.Monad (when)
import Data.Attoparsec.Char8 (char, endOfInput, endOfLine, space)
import qualified Data.Attoparsec as A
import qualified Data.Attoparsec.Lazy as AL
import Data.Attoparsec.Types (Parser)
Expand Down Expand Up @@ -57,18 +59,40 @@ import Data.Csv.Util ((<$!>))
data DecodeOptions = DecodeOptions
{ -- | Field delimiter.
decDelimiter :: {-# UNPACK #-} !Word8

-- | Runs of consecutive delimiters are regarded as a single
-- delimiter. This is useful e.g. when parsing white space
-- separated data.
, decMergeDelimiters :: !Bool

-- | Trim leading and trailing whitespace at the begining and
-- end of each record (but not at the begining and end of each
-- field).
, decTrimRecordSpace :: !Bool
} deriving (Eq, Show)

-- TODO: Document default values in defaultDecodeOptions

-- | Decoding options for parsing CSV files.
defaultDecodeOptions :: DecodeOptions
defaultDecodeOptions = DecodeOptions
{ decDelimiter = 44 -- comma
, decMergeDelimiters = False
, decTrimRecordSpace = False
}

-- | Decoding options for parsing space-delimited files.
spaceDecodeOptions :: DecodeOptions
spaceDecodeOptions = DecodeOptions
{ decDelimiter = 32 -- space
, decMergeDelimiters = True
, decTrimRecordSpace = True
}

-- | Parse a CSV file that does not include a header.
csv :: DecodeOptions -> AL.Parser Csv
csv !opts = do
vals <- record (decDelimiter opts) `sepBy1'` endOfLine
vals <- record opts `sepBy1'` endOfLine
_ <- optional endOfLine
endOfInput
let nonEmpty = removeBlankLines vals
Expand Down Expand Up @@ -97,7 +121,7 @@ csvWithHeader :: DecodeOptions -> AL.Parser (Header, V.Vector NamedRecord)
csvWithHeader !opts = do
!hdr <- header (decDelimiter opts)
vals <- map (toNamedRecord hdr) . removeBlankLines <$>
(record (decDelimiter opts)) `sepBy1'` endOfLine
(record opts) `sepBy1'` endOfLine
_ <- optional endOfLine
endOfInput
let !v = V.fromList vals
Expand Down Expand Up @@ -125,13 +149,23 @@ removeBlankLines = filter (not . blankLine)
-- CSV file is allowed to not have a terminating line separator. You
-- most likely want to use the 'endOfLine' parser in combination with
-- this parser.
record :: Word8 -- ^ Field delimiter
-> AL.Parser Record
record !delim = do
fs <- field delim `sepBy1'` (A.word8 delim)
record :: DecodeOptions -> AL.Parser Record
record !opts = do
when trim skipSpaces
fs <- field delim `sepBy1'` delimiter
when trim skipSpaces
return $! V.fromList fs
where
trim = decTrimRecordSpace opts
delim = decDelimiter opts
delimiter | decMergeDelimiters opts = A.skipMany1 (A.word8 delim) *> pure delim
| otherwise = A.word8 delim
{-# INLINE record #-}

skipSpaces :: AL.Parser ()
skipSpaces = scan
where scan = (space *> scan) <|> pure ()

-- | Parse a field. The field may be in either the escaped or
-- non-escaped format. The return value is unescaped.
field :: Word8 -> AL.Parser Field
Expand Down
6 changes: 6 additions & 0 deletions tests/UnitTests.hs
Expand Up @@ -144,6 +144,7 @@ positionalTests =
]
decodeWithTests =
[ ("tab-delim", defDec { decDelimiter = 9 }, "1\t2", [["1", "2"]])
, ("mixed-space", spaceDec, " 88 c \t 0.4 ", [["88", "c", "0.4"]])
]

encodeTest (name, input, expected) =
Expand All @@ -158,6 +159,11 @@ positionalTests =
testCase name $ decodesWithStreamingAs opts input expected
defEnc = defaultEncodeOptions
defDec = defaultDecodeOptions
spaceDec = defaultDecodeOptions -- TODO: Use spaceDecodeOptions
{ decDelimiter = 32 -- space
, decMergeDelimiters = True
, decTrimRecordSpace = True
}

nameBasedTests :: [TF.Test]
nameBasedTests =
Expand Down

0 comments on commit 75c23fd

Please sign in to comment.