Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implementation of space-delimited data #30

Open
wants to merge 22 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 23 additions & 3 deletions Data/Csv.hs
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
-- | This module implements encoding and decoding of CSV data. The
-- implementation is RFC 4180 compliant, with the following
-- | This module implements encoding and decoding of CSV and
-- space-delimited data. APIs for both format are essentially the
-- same. They only differ in function names and space-delimited data
-- doesn't have encoding\/decoding options.
--
-- The CSV implementation is RFC 4180 compliant, with the following
-- extensions:
--
-- * Empty lines are ignored.
Expand All @@ -9,6 +13,16 @@
--
-- * Escaped fields may contain any characters (but double-quotes
-- need to be escaped).
--
-- Space-delimited data don't have specification so following format
-- is assumed:
--
-- * Fields are delimited by one or more tabs and spaces at the
-- beginning and end of line are ignored.
--
-- * Empty lines are ignored.
--
-- * Escaping rules are same as with CSV
module Data.Csv
(
-- * Usage example
Expand All @@ -20,7 +34,7 @@ module Data.Csv
-- * Custom type conversions
-- $customtypeconversions

-- * Encoding and decoding
-- * Encoding and decoding CSV
-- $encoding
decode
, decodeByName
Expand All @@ -38,6 +52,12 @@ module Data.Csv
, encodeWith
, encodeByNameWith

-- * Space-delimited data
, decodeTable
, decodeTableByName
, encodeTable
, encodeTableByName

-- * Core CSV types
, Csv
, Record
Expand Down
164 changes: 124 additions & 40 deletions Data/Csv/Encoding.hs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,12 @@ module Data.Csv.Encoding
, defaultEncodeOptions
, encodeWith
, encodeByNameWith

-- * Space-delimited files
, decodeTable
, decodeTableByName
, encodeTable
, encodeTableByName
) where

import Blaze.ByteString.Builder (Builder, fromByteString, fromWord8,
Expand Down Expand Up @@ -97,22 +103,10 @@ decodeWith :: FromRecord a
-- skipped
-> L.ByteString -- ^ CSV data
-> Either String (Vector a)
decodeWith = decodeWithC (runParser . parseCsv)
decodeWith !opt =
decodeWithC (header $ decDelimiter opt) (csv opt) (runParser . parseCsv)
{-# INLINE [1] decodeWith #-}

parseCsv :: FromRecord a => Csv -> Parser (Vector a)
parseCsv xs = V.fromList <$!> mapM' parseRecord (V.toList xs)

mapM' :: Monad m => (a -> m b) -> [a] -> m [b]
mapM' f = go
where
go [] = return []
go (x:xs) = do
!y <- f x
ys <- go xs
return (y : ys)
{-# INLINE mapM' #-}

{-# RULES
"idDecodeWith" decodeWith = idDecodeWith
#-}
Expand All @@ -121,14 +115,16 @@ mapM' f = go
-- conversion is performed.
idDecodeWith :: DecodeOptions -> Bool -> L.ByteString
-> Either String (Vector (Vector B.ByteString))
idDecodeWith = decodeWithC pure
idDecodeWith !opt = decodeWithC (header $ decDelimiter opt) (csv opt) pure

decodeWithC :: (Csv -> Either String a) -> DecodeOptions -> Bool -> L.ByteString
-> Either String a
decodeWithC convert !opts skipHeader = decodeWithP parser convert
decodeWithC :: AL.Parser Header -> AL.Parser Csv
-> (Csv -> Either String a)
-> Bool -> L.ByteString -> Either String a
decodeWithC headerP body convert skipHeader
= decodeWithP parser convert
where parser
| skipHeader = header (decDelimiter opts) *> csv opts
| otherwise = csv opts
| skipHeader = headerP *> body
| otherwise = body
{-# INLINE decodeWithC #-}

-- | Like 'decodeByName', but lets you customize how the CSV data is
Expand All @@ -141,9 +137,6 @@ decodeByNameWith !opts =
decodeWithP (csvWithHeader opts)
(\ (hdr, vs) -> (,) <$> pure hdr <*> (runParser $ parseNamedCsv vs))

parseNamedCsv :: FromNamedRecord a => Vector NamedRecord -> Parser (Vector a)
parseNamedCsv xs = V.fromList <$!> mapM' parseNamedRecord (V.toList xs)

-- | Options that controls how data is encoded. These options can be
-- used to e.g. encode data in a tab-separated format instead of in a
-- comma-separated format.
Expand Down Expand Up @@ -180,11 +173,105 @@ encodeRecord delim = mconcat . intersperse (fromWord8 delim)
. map fromByteString . map escape . V.toList
{-# INLINE encodeRecord #-}

-- | Like 'encodeByName', but lets you customize how the CSV data is
-- encoded.
encodeByNameWith :: ToNamedRecord a => EncodeOptions -> Header -> V.Vector a
-> L.ByteString
encodeByNameWith opts hdr v =
toLazyByteString ((encodeRecord (encDelimiter opts) hdr) <>
fromByteString "\r\n" <> records)
where
records = unlines
. map (encodeRecord (encDelimiter opts)
. namedRecordToRecord hdr . toNamedRecord)
. V.toList $ v
{-# INLINE encodeByNameWith #-}


------------------------------------------------------------------------
-- * Space-delimited files

-- | Efficiently deserialize space-delimited records from a lazy
-- ByteString. If this fails due to incomplete or invalid input,
-- @'Left' msg@ is returned.
decodeTable :: FromRecord a
=> Bool -- ^ Data contains header that should be skipped
-> L.ByteString -- ^ Raw data
-> Either String (Vector a)
decodeTable =
decodeWithC tableHeader table (runParser . parseCsv)
{-# INLINE decodeTable #-}

-- | Same as 'decodeWith', but more efficient as no type
-- conversion is performed.
idDecodeTable :: Bool -> L.ByteString -> Either String (Vector (Vector B.ByteString))
idDecodeTable = decodeWithC tableHeader table pure

{-# RULES
"idDecodeTable" decodeTable = idDecodeTable
#-}


-- | Efficiently deserialize space-delimited records from a lazy
-- ByteString. If this fails due to incomplete or invalid input,
-- @'Left' msg@ is returned. The data is assumed to be preceeded by a
-- header.
decodeTableByName :: FromNamedRecord a => L.ByteString -> Either String (Header, Vector a)
decodeTableByName =
decodeWithP tableWithHeader
(\(hdr, vs) -> (,) <$> pure hdr <*> (runParser $ parseNamedCsv vs))
{-# INLINE decodeTableByName #-}

-- | Efficiently serialize space-delimited records as a lazy
-- ByteString. Single tab is used as separator.
encodeTable :: ToRecord a => V.Vector a -> L.ByteString
encodeTable = toLazyByteString
. unlines
. map (encodeTableRow . toRecord)
. V.toList
{-# INLINE encodeTable #-}

-- | Efficiently serialize space-delimited records as a lazy
-- ByteString. The header is written before any records and dictates
-- the field order. Single tab is used as separator.
encodeTableByName :: ToNamedRecord a => Header -> V.Vector a -> L.ByteString
encodeTableByName hdr v =
toLazyByteString ( encodeTableRow hdr
<> fromByteString "\r\n"
<> records )
where
records = unlines
. map (encodeTableRow . namedRecordToRecord hdr . toNamedRecord)
. V.toList $ v
{-# INLINE encodeTableByName #-}

encodeTableRow :: Record -> Builder
encodeTableRow = mconcat . intersperse (fromWord8 9)
. map fromByteString . map escapeT . V.toList
where
-- We need to escape empty strings. Otherwise we'll get:
-- > encode ["a","","b"] = 'a b'
-- instead of
-- > encode ["a","","b"] = 'a "" b'
escapeT b | B.null b = "\"\""
| otherwise = escape b
{-# INLINE encodeTableRow #-}


------------------------------------------------------------------------
-- * Common functionality and helpers

parseCsv :: FromRecord a => Csv -> Parser (Vector a)
parseCsv xs = V.fromList <$!> mapM' parseRecord (V.toList xs)

parseNamedCsv :: FromNamedRecord a => Vector NamedRecord -> Parser (Vector a)
parseNamedCsv xs = V.fromList <$!> mapM' parseNamedRecord (V.toList xs)

-- TODO: Optimize
escape :: B.ByteString -> B.ByteString
escape s
| B.find (\ b -> b == dquote || b == comma || b == nl || b == cr ||
b == sp) s == Nothing = s
b == sp || b == tab) s == Nothing = s
| otherwise =
B.concat ["\"",
B.concatMap
Expand All @@ -196,21 +283,7 @@ escape s
nl = 10
cr = 13
sp = 32

-- | Like 'encodeByName', but lets you customize how the CSV data is
-- encoded.
encodeByNameWith :: ToNamedRecord a => EncodeOptions -> Header -> V.Vector a
-> L.ByteString
encodeByNameWith opts hdr v =
toLazyByteString ((encodeRecord (encDelimiter opts) hdr) <>
fromByteString "\r\n" <> records)
where
records = unlines
. map (encodeRecord (encDelimiter opts)
. namedRecordToRecord hdr . toNamedRecord)
. V.toList $ v
{-# INLINE encodeByNameWith #-}

tab = 9

namedRecordToRecord :: Header -> NamedRecord -> Record
namedRecordToRecord hdr nr = V.map find hdr
Expand Down Expand Up @@ -245,4 +318,15 @@ decodeWithP p to s =
Left msg -> Left $ "conversion error: " ++ msg
AL.Fail left _ msg -> Left $ "parse error (" ++ msg ++ ") at " ++
show (BL8.unpack left)
{-# INLINE decodeWithP #-}
{-# INLINE decodeWithP #-}

mapM' :: Monad m => (a -> m b) -> [a] -> m [b]
mapM' f = go
where
go [] = return []
go (x:xs) = do
!y <- f x
ys <- go xs
return (y : ys)
{-# INLINE mapM' #-}

Loading