Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Second implementation of parsing for space-delimited data #36

Open
wants to merge 14 commits into
base: master
Choose a base branch
from
2 changes: 2 additions & 0 deletions Data/Csv.hs
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,12 @@ module Data.Csv
-- $options
, DecodeOptions(..)
, defaultDecodeOptions
, spaceDecodeOptions
, decodeWith
, decodeByNameWith
, EncodeOptions(..)
, defaultEncodeOptions
, spaceEncodeOptions
, encodeWith
, encodeByNameWith

Expand Down
32 changes: 20 additions & 12 deletions Data/Csv/Encoding.hs
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,12 @@ module Data.Csv.Encoding
-- ** Encoding and decoding options
, DecodeOptions(..)
, defaultDecodeOptions
, spaceDecodeOptions
, decodeWith
, decodeByNameWith
, EncodeOptions(..)
, defaultEncodeOptions
, spaceEncodeOptions
, encodeWith
, encodeByNameWith
) where
Expand Down Expand Up @@ -123,7 +125,7 @@ decodeWithC :: (DecodeOptions -> AL.Parser a) -> DecodeOptions -> HasHeader
-> BL8.ByteString -> Either String a
decodeWithC p !opts hasHeader = decodeWithP parser
where parser = case hasHeader of
HasHeader -> header (decDelimiter opts) *> p opts
HasHeader -> header opts *> p opts
NoHeader -> p opts
{-# INLINE decodeWithC #-}

Expand Down Expand Up @@ -151,10 +153,16 @@ data EncodeOptions = EncodeOptions
encDelimiter :: {-# UNPACK #-} !Word8
} deriving (Eq, Show)

-- | Encoding options for CSV files.
-- | Encoding options for CSV files. Comma is used as separator.
defaultEncodeOptions :: EncodeOptions
defaultEncodeOptions = EncodeOptions
{ encDelimiter = 44 -- comma
{ encDelimiter = 44 -- comma
}

-- | Encode options for space-delimited files. Tab is used as separator.
spaceEncodeOptions :: EncodeOptions
spaceEncodeOptions = EncodeOptions
{ encDelimiter = 9 -- tab
}

-- | Like 'encode', but lets you customize how the CSV data is
Expand All @@ -167,13 +175,13 @@ encodeWith opts = toLazyByteString

encodeRecord :: Word8 -> Record -> Builder
encodeRecord delim = mconcat . intersperse (fromWord8 delim)
. map fromByteString . map escape . V.toList
. map fromByteString . map (escape delim) . V.toList
{-# INLINE encodeRecord #-}

-- TODO: Optimize
escape :: B.ByteString -> B.ByteString
escape s
| B.any (\ b -> b == dquote || b == comma || b == nl || b == cr || b == sp)
escape :: Word8 -> B.ByteString -> B.ByteString
escape delim s
| B.any (\ b -> b == dquote || b == delim || b == nl || b == cr || b == sp)
s = toByteString $
fromWord8 dquote
<> B.foldl
Expand All @@ -185,11 +193,11 @@ escape s
<> fromWord8 dquote
| otherwise = s
where
sp = 32
dquote = 34
comma = 44
nl = 10
cr = 13
sp = 32


-- | Like 'encodeByName', but lets you customize how the CSV data is
-- encoded.
Expand Down Expand Up @@ -262,7 +270,7 @@ csv !opts = do
return $! V.fromList vals
where
records = do
!r <- record (decDelimiter opts)
!r <- record opts
if blankLine r
then (endOfLine *> records) <|> pure []
else case runParser (parseRecord r) of
Expand All @@ -276,15 +284,15 @@ csv !opts = do
csvWithHeader :: FromNamedRecord a => DecodeOptions
-> AL.Parser (Header, V.Vector a)
csvWithHeader !opts = do
!hdr <- header (decDelimiter opts)
!hdr <- header opts
vals <- records hdr
_ <- optional endOfLine
endOfInput
let !v = V.fromList vals
return (hdr, v)
where
records hdr = do
!r <- record (decDelimiter opts)
!r <- record opts
if blankLine r
then (endOfLine *> records hdr) <|> pure []
else case runParser (convert hdr r) of
Expand Down
4 changes: 2 additions & 2 deletions Data/Csv/Incremental.hs
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ decodeHeader = decodeHeaderWith defaultDecodeOptions
decodeHeaderWith :: DecodeOptions -> HeaderParser B.ByteString
decodeHeaderWith !opts = PartialH (go . parser)
where
parser = A.parse (header $ decDelimiter opts)
parser = A.parse (header opts)

go (A.Fail rest _ msg) = FailH rest err
where err = "parse error (" ++ msg ++ ")"
Expand Down Expand Up @@ -290,7 +290,7 @@ decodeWithP p !opts = go Incomplete [] . parser
acc' | blankLine r = acc
| otherwise = let !r' = convert r in r' : acc

parser = A.parse (record (decDelimiter opts) <* (endOfLine <|> endOfInput))
parser = A.parse (record opts <* (endOfLine <|> endOfInput))
convert = runParser . p
{-# INLINE decodeWithP #-}

Expand Down
120 changes: 91 additions & 29 deletions Data/Csv/Parser.hs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
module Data.Csv.Parser
( DecodeOptions(..)
, defaultDecodeOptions
, spaceDecodeOptions
, csv
, csvWithHeader
, header
Expand All @@ -26,8 +27,8 @@ module Data.Csv.Parser

import Blaze.ByteString.Builder (fromByteString, toByteString)
import Blaze.ByteString.Builder.Char.Utf8 (fromChar)
import Control.Applicative (Alternative, (*>), (<$>), (<*), (<|>), optional,
pure)
import Control.Applicative (Alternative, (*>), (<$), (<$>), (<*), (<|>),
optional, pure)
import Data.Attoparsec.Char8 (char, endOfInput, endOfLine)
import qualified Data.Attoparsec as A
import qualified Data.Attoparsec.Lazy as AL
Expand Down Expand Up @@ -55,19 +56,51 @@ import Data.Csv.Util ((<$!>), blankLine)
-- > }
data DecodeOptions = DecodeOptions
{ -- | Field delimiter.
decDelimiter :: {-# UNPACK #-} !Word8
} deriving (Eq, Show)
decDelimiter :: Word8 -> Bool

-- | Decoding options for parsing CSV files.
-- | Runs of consecutive delimiters are regarded as a single
-- delimiter. This is useful e.g. when parsing white space
-- separated data.
, decMergeDelimiters :: !Bool

-- | Trim leading and trailing whitespace at the begining and
-- end of each record (but not at the begining and end of each
-- field).
, decTrimRecordSpace :: !Bool
}

-- | Decoding options for parsing CSV files. Fields' values are set to:
--
-- [@'decDelimiter'@] comma
--
-- [@'decMergeDelimiters'@] false
--
-- [@'decTrimRecordSpace'@] false
defaultDecodeOptions :: DecodeOptions
defaultDecodeOptions = DecodeOptions
{ decDelimiter = 44 -- comma
{ decDelimiter = (==44) -- comma
, decMergeDelimiters = False
, decTrimRecordSpace = False
}

-- | Decoding options for parsing space-delimited files. Fields' values are set to:
--
-- [@'decDelimiter'@] space or tab character.
--
-- [@'decMergeDelimiters'@] true
--
-- [@'decTrimRecordSpace'@] true
spaceDecodeOptions :: DecodeOptions
spaceDecodeOptions = DecodeOptions
{ decDelimiter = \c -> c == space || c == tab
, decMergeDelimiters = True
, decTrimRecordSpace = True
}

-- | Parse a CSV file that does not include a header.
csv :: DecodeOptions -> AL.Parser Csv
csv !opts = do
vals <- record (decDelimiter opts) `sepBy1'` endOfLine
vals <- record opts `sepBy1'` endOfLine
_ <- optional endOfLine
endOfInput
let nonEmpty = removeBlankLines vals
Expand All @@ -94,23 +127,24 @@ sepBy1' p s = go
-- | Parse a CSV file that includes a header.
csvWithHeader :: DecodeOptions -> AL.Parser (Header, V.Vector NamedRecord)
csvWithHeader !opts = do
!hdr <- header (decDelimiter opts)
!hdr <- header opts
vals <- map (toNamedRecord hdr) . removeBlankLines <$>
(record (decDelimiter opts)) `sepBy1'` endOfLine
(record opts) `sepBy1'` endOfLine
_ <- optional endOfLine
endOfInput
let !v = V.fromList vals
return (hdr, v)

-- | Parse a header, including the terminating line separator.
header :: Word8 -- ^ Field delimiter
header :: DecodeOptions -- ^ Field delimiter
-> AL.Parser Header
header !delim = V.fromList <$!> name delim `sepBy1'` (A.word8 delim) <* endOfLine
header = record
{-# INLINE header #-}

-- | Parse a header name. Header names have the same format as regular
-- 'field's.
name :: Word8 -> AL.Parser Name
name !delim = field delim
name :: DecodeOptions -> AL.Parser Name
name = field

removeBlankLines :: [Record] -> [Record]
removeBlankLines = filter (not . blankLine)
Expand All @@ -120,23 +154,42 @@ removeBlankLines = filter (not . blankLine)
-- CSV file is allowed to not have a terminating line separator. You
-- most likely want to use the 'endOfLine' parser in combination with
-- this parser.
record :: Word8 -- ^ Field delimiter
-> AL.Parser Record
record !delim = do
fs <- field delim `sepBy1'` (A.word8 delim)
return $! V.fromList fs
record :: DecodeOptions -> AL.Parser Record
record !opts
-- If we need to trim spaces from line only robust way to do so is
-- to read whole line, remove spaces and run record parser on
-- trimmed line. For example:
--
-- + "a,b,c " will be parsed as ["a","b","c "] since spaces are
-- allowed in field
-- + "a b c " will be parsed as ["a","b","c",""] if we use space
-- as separator.
| decTrimRecordSpace opts = do
AL.skipMany $ AL.satisfy isSpace
line <- AL.takeWhile $ \c -> c /= newline && c /= cr
let (dat,_) = S.spanEnd isSpace line
case AL.parseOnly parser dat of
Left e -> fail e
Right x -> return x
| otherwise = parser
where
delim = decDelimiter opts
delimiter | decMergeDelimiters opts = A.skipMany1 (A.satisfy delim)
| otherwise = () <$ A.satisfy delim
parser = do fs <- field opts `sepBy1'` delimiter
return $! V.fromList fs
{-# INLINE record #-}

-- | Parse a field. The field may be in either the escaped or
-- non-escaped format. The return value is unescaped.
field :: Word8 -> AL.Parser Field
field !delim = do
field :: DecodeOptions -> AL.Parser Field
field !opt = do
mb <- A.peekWord8
-- We purposely don't use <|> as we want to commit to the first
-- choice if we see a double quote.
case mb of
Just b | b == doubleQuote -> escapedField
_ -> unescapedField delim
_ -> unescapedField opt
{-# INLINE field #-}

escapedField :: AL.Parser S.ByteString
Expand All @@ -154,11 +207,14 @@ escapedField = do
Left err -> fail err
else return s

unescapedField :: Word8 -> AL.Parser S.ByteString
unescapedField !delim = A.takeWhile (\ c -> c /= doubleQuote &&
c /= newline &&
c /= delim &&
c /= cr)
unescapedField :: DecodeOptions -> AL.Parser S.ByteString
unescapedField !opt = A.takeWhile (\ c -> c /= doubleQuote &&
c /= newline &&
c /= cr &&
not (delim c))
where
delim = decDelimiter opt
{-# INLINE unescapedField #-}

dquote :: AL.Parser Char
dquote = char '"'
Expand All @@ -178,7 +234,13 @@ unescape = toByteString <$!> go mempty where
then return (acc `mappend` fromByteString h)
else rest

doubleQuote, newline, cr :: Word8
doubleQuote, newline, cr, space, tab :: Word8
doubleQuote = 34
newline = 10
cr = 13
newline = 10
cr = 13
space = 32
tab = 9

isSpace :: Word8 -> Bool
isSpace c = c == space || c == tab
{-# INLINE isSpace #-}
21 changes: 18 additions & 3 deletions tests/UnitTests.hs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import Test.Framework.Providers.QuickCheck2 as TF
import Data.Csv hiding (record)
import qualified Data.Csv.Streaming as S


------------------------------------------------------------------------
-- Parse tests

Expand Down Expand Up @@ -114,6 +115,14 @@ positionalTests =
[ testCase "tab-delim" $ encodesWithAs (defEnc { encDelimiter = 9 })
[["1", "2"]] "1\t2\r\n"
]
, testGroup "encodeSpace" $ map (\(n,a,b) -> testCase n $ encodesWithAs spaceEncodeOptions a b)
[ ("simple", [["abc"]], "abc\r\n")
, ("leadingSpace", [[" abc"]], "\" abc\"\r\n")
, ("comma", [["abc,def"]], "abc,def\r\n")
, ("space", [["abc def"]], "\"abc def\"\r\n")
, ("tab", [["abc\tdef"]], "\"abc\tdef\"\r\n")
, ("twoFields", [["abc","def"]], "abc\tdef\r\n")
]
, testGroup "decode" $ map decodeTest decodeTests
, testGroup "decodeWith" $ map decodeWithTest decodeWithTests
, testGroup "streaming"
Expand Down Expand Up @@ -142,7 +151,12 @@ positionalTests =
, ("rfc4180", rfc4180Input, rfc4180Output)
]
decodeWithTests =
[ ("tab-delim", defDec { decDelimiter = 9 }, "1\t2", [["1", "2"]])
[ ("tab-delim", defDec { decDelimiter = (==9) }, "1\t2", [["1", "2"]])
, ("mixed-space", spaceDec, " 88 c \t 0.4 ", [["88", "c", "0.4"]])
, ("multiline-space", spaceDec, " 11 22 \n 11 22", [ ["11","22"]
, ["11","22"]])
, ("blankLine-space", spaceDec, "1 2\n\n3 4\n", [ ["1","2"]
, ["3","4"]])
]

encodeTest (name, input, expected) =
Expand All @@ -155,8 +169,9 @@ positionalTests =
testCase name $ input `decodesStreamingAs` expected
streamingDecodeWithTest (name, opts, input, expected) =
testCase name $ decodesWithStreamingAs opts input expected
defEnc = defaultEncodeOptions
defDec = defaultDecodeOptions
defEnc = defaultEncodeOptions
defDec = defaultDecodeOptions
spaceDec = spaceDecodeOptions

nameBasedTests :: [TF.Test]
nameBasedTests =
Expand Down