haskell-hvr · Shimuuar · Oct 20, 2012 · Oct 20, 2012 · Oct 22, 2012 · Oct 23, 2012
diff --git a/Data/Csv.hs b/Data/Csv.hs
@@ -1,5 +1,9 @@
--- | This module implements encoding and decoding of CSV data. The
--- implementation is RFC 4180 compliant, with the following
+-- | This module implements encoding and decoding of CSV and
+-- space-delimited data. APIs for both format are essentially the
+-- same. They only differ in function names and space-delimited data
+-- doesn't have encoding\/decoding options.
+--
+-- The CSV implementation is RFC 4180 compliant, with the following
 -- extensions:
 --
 --  * Empty lines are ignored.
@@ -9,6 +13,16 @@
 --
 --  * Escaped fields may contain any characters (but double-quotes
 --    need to be escaped).
+--
+-- Space-delimited data don't have specification so following format
+-- is assumed:
+--
+--  * Fields are delimited by one or more tabs and spaces at the
+--    beginning and end of line are ignored.
+--
+--  * Empty lines are ignored.
+--
+--  * Escaping rules are same as with CSV
 module Data.Csv
     (
     -- * Usage example
@@ -20,7 +34,7 @@ module Data.Csv
     -- * Custom type conversions
     -- $customtypeconversions
 
-    -- * Encoding and decoding
+    -- * Encoding and decoding CSV
     -- $encoding
       decode
     , decodeByName
@@ -38,6 +52,12 @@ module Data.Csv
     , encodeWith
     , encodeByNameWith
 
+    -- * Space-delimited data
+    , decodeTable
+    , decodeTableByName
+    , encodeTable
+    , encodeTableByName
+
     -- * Core CSV types
     , Csv
     , Record

diff --git a/Data/Csv/Encoding.hs b/Data/Csv/Encoding.hs
@@ -26,6 +26,12 @@ module Data.Csv.Encoding
     , defaultEncodeOptions
     , encodeWith
     , encodeByNameWith
+
+    -- * Space-delimited files
+    , decodeTable
+    , decodeTableByName
+    , encodeTable
+    , encodeTableByName
     ) where
 
 import Blaze.ByteString.Builder (Builder, fromByteString, fromWord8,
@@ -97,22 +103,10 @@ decodeWith :: FromRecord a
                              -- skipped
            -> L.ByteString   -- ^ CSV data
            -> Either String (Vector a)
-decodeWith = decodeWithC (runParser . parseCsv)
+decodeWith !opt =
+  decodeWithC (header $ decDelimiter opt) (csv opt) (runParser . parseCsv)
 {-# INLINE [1] decodeWith #-}
 
-parseCsv :: FromRecord a => Csv -> Parser (Vector a)
-parseCsv xs = V.fromList <$!> mapM' parseRecord (V.toList xs)
-
-mapM' :: Monad m => (a -> m b) -> [a] -> m [b]
-mapM' f = go
-  where
-    go [] = return []
-    go (x:xs) = do
-        !y <- f x
-        ys <- go xs
-        return (y : ys)
-{-# INLINE mapM' #-}
-
 {-# RULES
     "idDecodeWith" decodeWith = idDecodeWith
  #-}
@@ -121,14 +115,16 @@ mapM' f = go
 -- conversion is performed.
 idDecodeWith :: DecodeOptions -> Bool -> L.ByteString
              -> Either String (Vector (Vector B.ByteString))
-idDecodeWith = decodeWithC pure
+idDecodeWith !opt = decodeWithC (header $ decDelimiter opt) (csv opt) pure
 
-decodeWithC :: (Csv -> Either String a) -> DecodeOptions -> Bool -> L.ByteString
-            -> Either String a
-decodeWithC convert !opts skipHeader = decodeWithP parser convert
+decodeWithC :: AL.Parser Header -> AL.Parser Csv
+            -> (Csv -> Either String a)
+            -> Bool -> L.ByteString -> Either String a
+decodeWithC headerP body convert skipHeader
+  = decodeWithP parser convert
   where parser
-            | skipHeader = header (decDelimiter opts) *> csv opts
-            | otherwise  = csv opts
+            | skipHeader = headerP *> body
+            | otherwise  = body
 {-# INLINE decodeWithC #-}
 
 -- | Like 'decodeByName', but lets you customize how the CSV data is
@@ -141,9 +137,6 @@ decodeByNameWith !opts =
     decodeWithP (csvWithHeader opts)
     (\ (hdr, vs) -> (,) <$> pure hdr <*> (runParser $ parseNamedCsv vs))
 
-parseNamedCsv :: FromNamedRecord a => Vector NamedRecord -> Parser (Vector a)
-parseNamedCsv xs = V.fromList <$!> mapM' parseNamedRecord (V.toList xs)
-
 -- | Options that controls how data is encoded. These options can be
 -- used to e.g. encode data in a tab-separated format instead of in a
 -- comma-separated format.
@@ -180,11 +173,105 @@ encodeRecord delim = mconcat . intersperse (fromWord8 delim)
                      . map fromByteString . map escape . V.toList
 {-# INLINE encodeRecord #-}
 
+-- | Like 'encodeByName', but lets you customize how the CSV data is
+-- encoded.
+encodeByNameWith :: ToNamedRecord a => EncodeOptions -> Header -> V.Vector a
+                 -> L.ByteString
+encodeByNameWith opts hdr v =
+    toLazyByteString ((encodeRecord (encDelimiter opts) hdr) <>
+                      fromByteString "\r\n" <> records)
+  where
+    records = unlines
+              . map (encodeRecord (encDelimiter opts)
+                     . namedRecordToRecord hdr . toNamedRecord)
+              . V.toList $ v
+{-# INLINE encodeByNameWith #-}
+
+
+------------------------------------------------------------------------
+-- * Space-delimited files
+
+-- | Efficiently deserialize space-delimited records from a lazy
+-- ByteString. If this fails due to incomplete or invalid input,
+-- @'Left' msg@ is returned.
+decodeTable :: FromRecord a
+            => Bool             -- ^ Data contains header that should be skipped
+            -> L.ByteString     -- ^ Raw data
+            -> Either String (Vector a)
+decodeTable =
+    decodeWithC tableHeader table (runParser . parseCsv)
+{-# INLINE decodeTable #-}
+
+-- | Same as 'decodeWith', but more efficient as no type
+-- conversion is performed.
+idDecodeTable :: Bool -> L.ByteString -> Either String (Vector (Vector B.ByteString))
+idDecodeTable = decodeWithC tableHeader table pure
+
+{-# RULES
+    "idDecodeTable" decodeTable = idDecodeTable
+ #-}
+
+
+-- | Efficiently deserialize space-delimited records from a lazy
+-- ByteString. If this fails due to incomplete or invalid input,
+-- @'Left' msg@ is returned. The data is assumed to be preceeded by a
+-- header.
+decodeTableByName :: FromNamedRecord a => L.ByteString -> Either String (Header, Vector a)
+decodeTableByName =
+    decodeWithP tableWithHeader
+    (\(hdr, vs) -> (,) <$> pure hdr <*> (runParser $ parseNamedCsv vs))
+{-# INLINE decodeTableByName #-}
+
+-- | Efficiently serialize space-delimited records as a lazy
+-- ByteString. Single tab is used as separator.
+encodeTable :: ToRecord a => V.Vector a -> L.ByteString
+encodeTable = toLazyByteString
+            . unlines
+            . map (encodeTableRow . toRecord)
+            . V.toList
+{-# INLINE encodeTable #-}
+
+-- | Efficiently serialize space-delimited records as a lazy
+-- ByteString. The header is written before any records and dictates
+-- the field order. Single tab is used as separator.
+encodeTableByName :: ToNamedRecord a => Header -> V.Vector a -> L.ByteString
+encodeTableByName hdr v =
+    toLazyByteString (  encodeTableRow hdr
+                     <> fromByteString "\r\n"
+                     <> records )
+  where
+    records = unlines
+            . map (encodeTableRow . namedRecordToRecord hdr . toNamedRecord)
+            . V.toList $ v
+{-# INLINE encodeTableByName #-}
+
+encodeTableRow :: Record -> Builder
+encodeTableRow = mconcat . intersperse (fromWord8 9)
+               . map fromByteString . map escapeT . V.toList
+  where
+    -- We need to escape empty strings. Otherwise we'll get:
+    --  > encode ["a","","b"] = 'a  b'
+    -- instead of
+    --  > encode ["a","","b"] = 'a "" b'
+    escapeT b | B.null b  = "\"\""
+              | otherwise = escape b
+{-# INLINE encodeTableRow #-}
+
+
+------------------------------------------------------------------------
+-- * Common functionality and helpers
+
+parseCsv :: FromRecord a => Csv -> Parser (Vector a)
+parseCsv xs = V.fromList <$!> mapM' parseRecord (V.toList xs)
+
+parseNamedCsv :: FromNamedRecord a => Vector NamedRecord -> Parser (Vector a)
+parseNamedCsv xs = V.fromList <$!> mapM' parseNamedRecord (V.toList xs)
+
 -- TODO: Optimize
 escape :: B.ByteString -> B.ByteString
 escape s
     | B.find (\ b -> b == dquote || b == comma || b == nl || b == cr ||
-                     b == sp) s == Nothing = s
+                     b == sp     || b == tab) s == Nothing = s
     | otherwise =
         B.concat ["\"",
                   B.concatMap
@@ -196,21 +283,7 @@ escape s
     nl     = 10
     cr     = 13
     sp     = 32
-
--- | Like 'encodeByName', but lets you customize how the CSV data is
--- encoded.
-encodeByNameWith :: ToNamedRecord a => EncodeOptions -> Header -> V.Vector a
-                 -> L.ByteString
-encodeByNameWith opts hdr v =
-    toLazyByteString ((encodeRecord (encDelimiter opts) hdr) <>
-                      fromByteString "\r\n" <> records)
-  where
-    records = unlines
-              . map (encodeRecord (encDelimiter opts)
-                     . namedRecordToRecord hdr . toNamedRecord)
-              . V.toList $ v
-{-# INLINE encodeByNameWith #-}
-
+    tab    =  9
 
 namedRecordToRecord :: Header -> NamedRecord -> Record
 namedRecordToRecord hdr nr = V.map find hdr
@@ -245,4 +318,15 @@ decodeWithP p to s =
           Left msg -> Left $ "conversion error: " ++ msg
       AL.Fail left _ msg -> Left $ "parse error (" ++ msg ++ ") at " ++
                             show (BL8.unpack left)
-{-# INLINE decodeWithP #-}
+{-# INLINE decodeWithP #-}
+
+mapM' :: Monad m => (a -> m b) -> [a] -> m [b]
+mapM' f = go
+  where
+    go [] = return []
+    go (x:xs) = do
+        !y <- f x
+        ys <- go xs
+        return (y : ys)
+{-# INLINE mapM' #-}
+