Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP

Loading…

Allow HTML tables to contain lists #379

Closed
wants to merge 8 commits into from

2 participants

@qzchenwl

For file table.html

<table>
  <tr>
    <td>
      <ul>
        <li>item1</li>
        <li>item2</li>
      </ul>
    </td>
  </tr>
</table>

pandoc.old -f html -t html table.html

<ul>
<li>item1</li>
<li>item2</li>
</ul>

pandoc.new -f html -t html table.html

<table>
<tbody>
<tr class="odd">
<td align="left"><ul>
<li>item1</li>
<li>item2</li>
</ul></td>
</tr>
</tbody>
</table>
@jgm
Owner

If you apply the patch, then do

pandoc -f html -t markdown | pandoc

on this input, you'll see why I had pPlain instead of block.

The problem is that we can't extract information about the widths of the table columns from the HTML. So we just set them all to 0, which pandoc interprets as meaning "just put each cell on one line and create a simple table rather than a multiline table".

I suppose that more general tables could be supported in HTML by assuming that all the columns are equal width, but that will often produce funny results.

@jgm
Owner
jgm commented

The latest pandoc handles the original input as expected. Closing.

@jgm jgm closed this
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Commits on Jan 18, 2012
  1. @qzchenwl
Commits on Feb 14, 2012
Commits on Feb 15, 2012
  1. @qzchenwl

    docx hello world

    qzchenwl authored
  2. @qzchenwl

    extract files from docx

    qzchenwl authored
Commits on Feb 17, 2012
  1. @qzchenwl

    extract text

    qzchenwl authored
  2. @qzchenwl

    extract plain text from docx

    qzchenwl authored
Commits on Feb 18, 2012
  1. @qzchenwl

    parse bullet list

    qzchenwl authored
Commits on Feb 19, 2012
  1. @qzchenwl

    parse rich table

    qzchenwl authored
This page is out of date. Refresh to see the latest.
View
0  hsmarkdown 100755 → 100644
File mode changed
View
0  make_osx_package.sh 100755 → 100644
File mode changed
View
32 src/Text/Pandoc.hs
@@ -120,6 +120,7 @@ import Text.Pandoc.Readers.LaTeX
import Text.Pandoc.Readers.HTML
import Text.Pandoc.Readers.Textile
import Text.Pandoc.Readers.Native
+import Text.Pandoc.Readers.Docx
import Text.Pandoc.Writers.Native
import Text.Pandoc.Writers.Markdown
import Text.Pandoc.Writers.RST
@@ -141,6 +142,8 @@ import Text.Pandoc.Templates
import Text.Pandoc.Parsing
import Text.Pandoc.Shared
import Data.Version (showVersion)
+import Data.ByteString.Lazy.Char8 (ByteString)
+import Data.ByteString.Lazy.UTF8 (toString)
import Text.JSON.Generic
import Paths_pandoc (version)
@@ -149,20 +152,21 @@ pandocVersion :: String
pandocVersion = showVersion version
-- | Association list of formats and readers.
-readers :: [(String, ParserState -> String -> Pandoc)]
-readers = [("native" , \_ -> readNative)
- ,("json" , \_ -> decodeJSON)
- ,("markdown" , readMarkdown)
- ,("markdown+lhs" , \st ->
- readMarkdown st{ stateLiterateHaskell = True})
- ,("rst" , readRST)
- ,("rst+lhs" , \st ->
- readRST st{ stateLiterateHaskell = True})
- ,("textile" , readTextile) -- TODO : textile+lhs
- ,("html" , readHtml)
- ,("latex" , readLaTeX)
- ,("latex+lhs" , \st ->
- readLaTeX st{ stateLiterateHaskell = True})
+-- Each reader receives the raw input as a lazy 'ByteString'. Every reader
+-- except "docx" first decodes it to a 'String' with 'toString'; "docx"
+-- passes the bytes to 'readDocx' unchanged.
+readers :: [(String, ParserState -> ByteString -> Pandoc)]
+readers = [("native" , \_ -> readNative . toString)
+ ,("json" , \_ -> decodeJSON . toString)
+ ,("markdown" , \st bs -> readMarkdown st (toString bs))
+ ,("markdown+lhs" , \st bs ->
+ readMarkdown st{ stateLiterateHaskell = True} (toString bs))
+ ,("rst" , \st bs -> readRST st (toString bs))
+ ,("rst+lhs" , \st bs ->
+ readRST st{ stateLiterateHaskell = True} (toString bs))
+ ,("textile" , \st bs -> readTextile st (toString bs)) -- TODO : textile+lhs
+ ,("html" , \st bs -> readHtml st (toString bs))
+ ,("latex" , \st bs -> readLaTeX st (toString bs))
+ ,("latex+lhs" , \st bs ->
+ readLaTeX st{ stateLiterateHaskell = True} (toString bs))
+ ,("docx" , \st bs -> readDocx st bs)
]
-- | Association list of formats and writers (omitting the
View
147 src/Text/Pandoc/Readers/Docx.hs
@@ -0,0 +1,147 @@
+module Text.Pandoc.Readers.Docx (readDocx) where
+
+import Text.ParserCombinators.Parsec
+import Text.Pandoc.Parsing
+import Text.Pandoc.Definition
+import Text.HTML.TagSoup
+import Data.ByteString.Lazy.Char8 (ByteString, pack)
+import Data.ByteString.Lazy.UTF8 (toString)
+import Debug.Trace
+import Codec.Archive.Zip (toArchive, findEntryByPath, fromEntry)
+import Control.Monad (liftM)
+
+-- | Convert the contents of a docx file to a 'Pandoc' document.
+-- The input bytes are treated as a zip archive: the entry
+-- @word/document.xml@ is extracted with 'readFromZip', decoded with
+-- 'toString', split into tags with 'parseTags' and then parsed with
+-- 'parseDocument'. The resulting metadata is always empty.
+readDocx :: ParserState -- ^ Parser state
+ -> ByteString -- ^ Raw bytes of the docx (zip) archive
+ -> Pandoc
+readDocx st bs = Pandoc meta doc
+ where
+ meta = Meta [] [] []
+ doc = readWith parseDocument st tags
+ tags = parseTags $ toString document
+ document = readFromZip "word/document.xml" bs
+
+-- | Extract the entry stored at the given path from a zip archive that is
+-- supplied as a lazy 'ByteString'. Returns an empty 'ByteString' when the
+-- archive has no entry with that path.
+readFromZip :: FilePath -> ByteString -> ByteString
+readFromZip path bs = maybe (pack "") fromEntry (findEntryByPath path $ toArchive bs)
+
+-- | Parser whose tokens are TagSoup tags (with 'String' content) and whose
+-- user state is pandoc's 'ParserState'.
+type TagParser = GenParser (Tag String) ParserState
+
+-- | Skip tags until an opening @w:document@ tag is next, then parse the
+-- contents of that element with 'parseBody'.
+parseDocument :: TagParser [Block]
+parseDocument = try $ do
+ skipMany $ pSatisfy (not . (~== TagOpen "w:document" []))
+ pInTags "w:document" parseBody
+
+-- | Skip tags until an opening @w:body@ tag is next, then parse the
+-- contents of that element with 'blocks'.
+parseBody :: TagParser [Block]
+parseBody = try $ do
+ skipMany $ pSatisfy (not . (~== TagOpen "w:body" []))
+ pInTags "w:body" blocks
+
+-- | Parse one or more 'block's and concatenate their results.
+blocks :: TagParser [Block]
+blocks = liftM concat $ many1 block
+
+-- | Parse a single block-level construct. The alternatives are tried in
+-- order: bullet list, then table, then plain text as the fallback.
+block :: TagParser [Block]
+block = choice
+ [ pList
+ , pTable
+ , pPlain
+ ]
+-- | Parse one or more consecutive list 'item's into a single 'BulletList'.
+pList :: TagParser [Block]
+pList = try $ do
+ items <- many1 item
+ return $ [BulletList items]
+
+-- | Parse one list item: a @w:p@ paragraph that contains a @w:ilvl@ tag.
+-- The text that follows the @w:ilvl@ tag, up to the closing @w:p@, is
+-- returned as a single 'Str' inside a 'Plain' block.
+item :: TagParser [Block]
+item = try $ do
+ pSatisfy (~== TagOpen "w:p" [])
+ -- Skip ahead to the w:ilvl tag; stopping at the closing w:p makes the
+ -- next step fail (and 'try' backtrack) for paragraphs without w:ilvl.
+ let non = pSatisfy (\t -> (t ~/= TagClose "w:p") && (t ~/= TagOpen "w:ilvl" []))
+ skipMany non
+ pSatisfy (~== TagOpen "w:ilvl" [])
+ str <- liftM concat $ many1Till plainText (pSatisfy (~== TagClose "w:p"))
+ return [Plain [Str str]]
+
+-- | Parse a @w:tbl@ element into a 'Table' with an empty caption and an
+-- empty header row. Every column is left-aligned and has width 0; the
+-- number of columns is the length of the longest row (0 if the table
+-- has no rows).
+pTable :: TagParser [Block]
+pTable = try $ do
+ pSatisfy (~== TagOpen "w:tbl" [])
+ -- Skip any tags that precede the first row.
+ let non = pSatisfy (\t -> (t ~/= TagClose "w:tbl") && (t ~/= TagOpen "w:tr" []))
+ skipMany non
+ rows <- manyTill pTableRow (pCloses "w:tbl")
+ -- 'maximum' is partial: seed it with 0 so that a table without any rows
+ -- yields zero columns instead of an "empty list" exception.
+ let cols = maximum $ 0 : map length rows
+ let aligns = replicate cols AlignLeft
+ let widths = replicate cols 0
+ return [Table [] aligns widths [] rows]
+
+-- | Parse a @w:tr@ element into the list of its cells. Tags between the
+-- opening @w:tr@ and the first @w:tc@ are skipped.
+pTableRow :: TagParser [TableCell]
+pTableRow = try $ do
+ pSatisfy (~== TagOpen "w:tr" [])
+ let non = pSatisfy (\t -> (t ~/= TagClose "w:tr") && (t ~/= TagOpen "w:tc" []))
+ skipMany non
+ cells <- manyTill pTableCell (pCloses "w:tr")
+ return cells
+
+-- | Parse a @w:tc@ element into a table cell. Tags between the opening
+-- @w:tc@ and the first @w:p@ are skipped; the remaining content is parsed
+-- as a sequence of 'block's whose results are concatenated.
+pTableCell :: TagParser TableCell
+pTableCell = try $ do
+ pSatisfy (~== TagOpen "w:tc" [])
+ let non = pSatisfy (\t -> (t ~/= TagClose "w:tc") && (t ~/= TagOpen "w:p" []))
+ skipMany non
+ cell <- manyTill block (pCloses "w:tc")
+ return $ concat cell
+
+-- | Parse one piece of text with 'plainText' and wrap it as a single
+-- 'Str' inside a 'Plain' block.
+pPlain :: TagParser [Block]
+pPlain = do
+ str <- plainText
+ return [Plain [Str str]]
+
+-- | Extract text starting at the next tag, chosen by peeking at that tag:
+--
+-- * opening tag: consume through the first closing tag of the same name
+-- (or end of input) and return the text of the text tags in between;
+-- * text tag: consume it and return its text;
+-- * any other tag: consume it and return the empty string.
+plainText :: TagParser String
+plainText = try $ do
+ tag <- lookAhead anyTag
+ pPlain' tag
+ where pPlain' (TagOpen n _) = pInTags n getText
+ pPlain' (TagText str) = do anyTag
+ return str
+ pPlain' _ = do anyTag
+ return ""
+
+-- | Debugging variant of 'pPlain' that uses 'plainTextDebug', which emits
+-- 'trace' messages while parsing.
+pPlainDebug :: TagParser [Block]
+pPlainDebug = do
+ str <- plainTextDebug
+ return [Plain [Str str]]
+
+-- | Debugging variant of 'plainText': same three cases (opening tag, text
+-- tag, any other tag), but each case and the final result are reported
+-- through 'trace'.
+plainTextDebug :: TagParser String
+plainTextDebug = try $ do
+ tag <- lookAhead anyTag
+ txt <- pPlain' tag
+ trace ("plainText: return " ++ txt) (return txt)
+ where pPlain' (TagOpen n _) = trace ("plainText: open " ++ n) $ pInTags n getTextDebug
+ pPlain' (TagText str) = do anyTag
+ return $ trace ("plainText: text " ++ str) str
+ pPlain' _ = do anyTag
+ return $ trace ("plainText: other ") ""
+
+-- | Debugging variant of 'getText' that also reports the consumed tag
+-- through 'trace'.
+getTextDebug :: TagParser String
+getTextDebug = do
+ x <- anyTag
+ return $ if isTagText (trace ("getText: " ++ show x) x) then innerText [x] else []
+
+-- | Consume one tag of any kind; return its text if it is a text tag and
+-- the empty string otherwise.
+getText :: TagParser String
+getText = do
+ x <- anyTag
+ return $ if isTagText x then innerText [x] else []
+
+-- | Consume and return the next tag, whatever it is.
+anyTag :: TagParser (Tag String)
+anyTag = pSatisfy (const True)
+
+-- | Consume and return the next tag if it satisfies the predicate; fail
+-- otherwise. Tags carry no source positions, so the position's line number
+-- is simply incremented by one for every tag consumed.
+pSatisfy :: (Tag String -> Bool) -> TagParser (Tag String)
+pSatisfy f = tokenPrim show
+ (\pos _ _ -> setSourceLine pos (1 + sourceLine pos))
+ (\t -> if f t then Just t else Nothing)
+
+-- | Match an opening tag with the given name, then apply the parser
+-- repeatedly until a closing tag with that name or the end of input is
+-- reached, concatenating the results.
+pInTags :: String -> TagParser [a]
+ -> TagParser [a]
+pInTags tagtype parser = try $ do
+ pSatisfy (~== TagOpen tagtype [])
+ liftM concat $ manyTill parser (pCloses tagtype <|> eof)
+
+-- | Consume the next tag if it is a closing tag with the given name;
+-- otherwise fail without consuming input.
+pCloses :: String -> TagParser ()
+pCloses tagtype = try $ do
+ pSatisfy (~== TagClose tagtype)
+ return ()
View
2  src/Text/Pandoc/Readers/HTML.hs
@@ -228,7 +228,7 @@ pSimpleTable = try $ do
pCell :: String -> TagParser [TableCell]
pCell celltype = try $ do
skipMany pBlank
- res <- pInTags celltype pPlain
+ res <- pInTags celltype block
skipMany pBlank
return [res]
View
13 src/pandoc.hs
@@ -1,3 +1,4 @@
+{-# LANGUAGE OverloadedStrings #-}
{-
Copyright (C) 2006-2011 John MacFarlane <jgm@berkeley.edu>
@@ -40,7 +41,7 @@ import System.Exit ( exitWith, ExitCode (..) )
import System.FilePath
import System.Console.GetOpt
import Data.Char ( toLower )
-import Data.List ( intercalate, isSuffixOf, isPrefixOf )
+import Data.List ( isSuffixOf, isPrefixOf )
import System.Directory ( getAppUserDataDirectory, doesFileExist )
import System.IO ( stdout, stderr )
import System.IO.Error ( isDoesNotExistError )
@@ -52,7 +53,6 @@ import Control.Monad (when, unless, liftM)
import Network.HTTP (simpleHTTP, mkRequest, getResponseBody, RequestMethod(..))
import Network.URI (parseURI, isURI, URI(..))
import qualified Data.ByteString.Lazy as B
-import Data.ByteString.Lazy.UTF8 (toString )
import Text.HTML.TagSoup.Entity (lookupEntity)
import Codec.Binary.UTF8.String (decodeString, encodeString)
import Text.CSL.Reference (Reference(..))
@@ -916,17 +916,16 @@ main = do
let readSources [] = mapM readSource ["-"]
readSources srcs = mapM readSource srcs
- readSource "-" = UTF8.getContents
+ readSource "-" = B.getContents
readSource src = case parseURI src of
Just u | uriScheme u `elem` ["http:","https:"] ->
readURI u
- _ -> UTF8.readFile src
- readURI uri = simpleHTTP (mkRequest GET uri) >>= getResponseBody >>=
- return . toString -- treat all as UTF8
+ _ -> B.readFile src
+ readURI uri = simpleHTTP (mkRequest GET uri) >>= getResponseBody
let convertTabs = tabFilter (if preserveTabs then 0 else tabStop)
- doc <- fmap (reader startParserState . convertTabs . intercalate "\n") (readSources sources)
+ doc <- fmap (reader startParserState . B.intercalate "\n") (readSources sources)
let doc0 = foldr ($) doc transforms
View
0  stats.sh 100755 → 100644
File mode changed
View
0  stripansi.sh 100755 → 100644
File mode changed
View
0  tests/MarkdownTest_1.0.3/MarkdownTest.pl 100755 → 100644
File mode changed
Something went wrong with that request. Please try again.