Skip to content

Commit

Permalink
Issue jgm#1692: Initial docx reader support
Browse files Browse the repository at this point in the history
Signed-off-by: Jesse Rosenthal <jrosenthal@jhu.edu>
  • Loading branch information
lierdakil authored and jkr committed Oct 14, 2014
1 parent 8b60d43 commit 5880b93
Show file tree
Hide file tree
Showing 2 changed files with 70 additions and 48 deletions.
21 changes: 2 additions & 19 deletions src/Text/Pandoc/Readers/Docx.hs
Expand Up @@ -203,13 +203,6 @@ blockQuoteDivs = ["Quote", "BlockQuote", "BlockQuotation"]
codeDivs :: [String]
codeDivs = ["SourceCode"]


-- For the moment, we have English, Danish, German, and French. This
-- is fairly ad-hoc, and there might be a more systematic way to do
-- it, but it's better than nothing.
headerPrefixes :: [String]
headerPrefixes = ["Heading", "Overskrift", "berschrift", "Titre"]

runElemToInlines :: RunElem -> Inlines
runElemToInlines (TextRun s) = text s
runElemToInlines (LnBrk) = linebreak
Expand Down Expand Up @@ -467,12 +460,11 @@ bodyPartToBlocks (Paragraph pPr parparts)
$ parStyleToTransform pPr
$ codeBlock
$ concatMap parPartToString parparts
| (c : cs) <- filter (isJust . isHeaderClass) $ pStyle pPr
, Just (prefix, n) <- isHeaderClass c = do
| Just (style, n) <- pHeading pPr = do
ils <- local (\s-> s{docxInHeaderBlock=True}) $
(concatReduce <$> mapM parPartToInlines parparts)
makeHeaderAnchor $
headerWith ("", delete (prefix ++ show n) cs, []) n ils
headerWith ("", delete style (pStyle pPr), []) n ils
| otherwise = do
ils <- concatReduce <$> mapM parPartToInlines parparts >>=
(return . fromList . trimLineBreaks . normalizeSpaces . toList)
Expand Down Expand Up @@ -559,12 +551,3 @@ docxToOutput :: ReaderOptions -> Docx -> (Meta, [Block], MediaBag)
docxToOutput opts (Docx (Document _ body)) =
let dEnv = def { docxOptions = opts} in
evalDocxContext (bodyToOutput body) dEnv def

isHeaderClass :: String -> Maybe (String, Int)
isHeaderClass s | (pref:_) <- filter (\h -> isPrefixOf h s) headerPrefixes
, Just s' <- stripPrefix pref s =
case reads s' :: [(Int, String)] of
[] -> Nothing
((n, "") : []) -> Just (pref, n)
_ -> Nothing
isHeaderClass _ = Nothing
97 changes: 68 additions & 29 deletions src/Text/Pandoc/Readers/Docx/Parse.hs
Expand Up @@ -65,14 +65,15 @@ import Text.Pandoc.Compat.Except
import Text.TeXMath.Readers.OMML (readOMML)
import Text.Pandoc.Readers.Docx.Fonts (getUnicode, Font(..))
import Text.TeXMath (Exp)
import Data.Char (readLitChar, ord, chr)
import Data.Char (readLitChar, ord, chr, isDigit)

data ReaderEnv = ReaderEnv { envNotes :: Notes
, envNumbering :: Numbering
, envRelationships :: [Relationship]
, envMedia :: Media
, envFont :: Maybe Font
, envCharStyles :: CharStyleMap
, envParStyles :: CharStyleMap
}
deriving Show

Expand Down Expand Up @@ -152,13 +153,15 @@ data ParIndentation = ParIndentation { leftParIndent :: Maybe Integer
data ParagraphStyle = ParagraphStyle { pStyle :: [String]
, indentation :: Maybe ParIndentation
, dropCap :: Bool
, pHeading :: Maybe (String,Int)
}
deriving Show

defaultParagraphStyle :: ParagraphStyle
defaultParagraphStyle = ParagraphStyle { pStyle = []
, indentation = Nothing
, dropCap = False
, pHeading = Nothing
}


Expand Down Expand Up @@ -210,6 +213,7 @@ data RunStyle = RunStyle { isBold :: Maybe Bool
, isStrike :: Maybe Bool
, rVertAlign :: Maybe VertAlign
, rUnderline :: Maybe String
, rHeading :: Maybe (String,Int)
, rStyle :: Maybe CharStyle}
deriving Show

Expand All @@ -220,6 +224,7 @@ defaultRunStyle = RunStyle { isBold = Nothing
, isStrike = Nothing
, rVertAlign = Nothing
, rUnderline = Nothing
, rHeading = Nothing
, rStyle = Nothing}


Expand All @@ -242,8 +247,9 @@ archiveToDocx archive = do
numbering = archiveToNumbering archive
rels = archiveToRelationships archive
media = archiveToMedia archive
styles = archiveToStyles archive
rEnv = ReaderEnv notes numbering rels media Nothing styles
styles = archiveToStyles archive "character"
pstyles = archiveToStyles archive "paragraph"
rEnv = ReaderEnv notes numbering rels media Nothing styles pstyles
doc <- runD (archiveToDocument archive) rEnv
return $ Docx doc

Expand All @@ -263,8 +269,8 @@ elemToBody ns element | isElem ns "w" "body" element =
(\bps -> return $ Body bps)
elemToBody _ _ = throwError WrongElem

archiveToStyles :: Archive -> CharStyleMap
archiveToStyles zf =
archiveToStyles :: Archive -> String -> CharStyleMap
archiveToStyles zf sType =
let stylesElem = findEntryByPath "word/styles.xml" zf >>=
(parseXMLDoc . UTF8.toStringLazy . fromEntry)
in
Expand All @@ -273,42 +279,45 @@ archiveToStyles zf =
Just styElem ->
let namespaces = mapMaybe attrToNSPair (elAttribs styElem)
in
M.fromList $ buildBasedOnList namespaces styElem Nothing
M.fromList $ buildBasedOnList namespaces styElem Nothing sType

isBasedOnStyle :: NameSpaces -> Element -> Maybe CharStyle -> Bool
isBasedOnStyle ns element parentStyle
isBasedOnStyle :: NameSpaces -> Element -> Maybe CharStyle -> String -> Bool
isBasedOnStyle ns element parentStyle sType
| isElem ns "w" "style" element
, Just "character" <- findAttr (elemName ns "w" "type") element
, Just ctype <- findAttr (elemName ns "w" "type") element
, ctype == sType
, Just basedOnVal <- findChild (elemName ns "w" "basedOn") element >>=
findAttr (elemName ns "w" "val")
, Just (parentId, _) <- parentStyle = (basedOnVal == parentId)
| isElem ns "w" "style" element
, Just "character" <- findAttr (elemName ns "w" "type") element
, Just ctype <- findAttr (elemName ns "w" "type") element
, ctype == sType
, Nothing <- findChild (elemName ns "w" "basedOn") element
, Nothing <- parentStyle = True
| otherwise = False

elemToCharStyle :: NameSpaces -> Element -> Maybe CharStyle -> Maybe CharStyle
elemToCharStyle ns element parentStyle
elemToCharStyle :: NameSpaces -> Element -> Maybe CharStyle -> String -> Maybe CharStyle
elemToCharStyle ns element parentStyle sType
| isElem ns "w" "style" element
, Just "character" <- findAttr (elemName ns "w" "type") element
, Just ctype <- findAttr (elemName ns "w" "type") element
, ctype == sType
, Just styleId <- findAttr (elemName ns "w" "styleId") element =
Just (styleId, elemToRunStyle ns element parentStyle)
| otherwise = Nothing

getStyleChildren :: NameSpaces -> Element -> Maybe CharStyle -> [CharStyle]
getStyleChildren ns element parentStyle
getStyleChildren :: NameSpaces -> Element -> Maybe CharStyle -> String -> [CharStyle]
getStyleChildren ns element parentStyle sType
| isElem ns "w" "styles" element =
mapMaybe (\e -> elemToCharStyle ns e parentStyle) $
filterChildren (\e' -> isBasedOnStyle ns e' parentStyle) element
mapMaybe (\e -> elemToCharStyle ns e parentStyle sType) $
filterChildren (\e' -> isBasedOnStyle ns e' parentStyle sType) element
| otherwise = []

buildBasedOnList :: NameSpaces -> Element -> Maybe CharStyle -> [CharStyle]
buildBasedOnList ns element rootStyle =
case (getStyleChildren ns element rootStyle) of
buildBasedOnList :: NameSpaces -> Element -> Maybe CharStyle -> String -> [CharStyle]
buildBasedOnList ns element rootStyle sType =
case (getStyleChildren ns element rootStyle sType) of
[] -> []
stys -> stys ++
(concatMap (\s -> buildBasedOnList ns element (Just s)) stys)
(concatMap (\s -> buildBasedOnList ns element (Just s) sType) stys)

archiveToNotes :: Archive -> Notes
archiveToNotes zf =
Expand Down Expand Up @@ -543,15 +552,17 @@ elemToBodyPart ns element
elemToBodyPart ns element
| isElem ns "w" "p" element
, Just (numId, lvl) <- elemToNumInfo ns element = do
let parstyle = elemToParagraphStyle ns element
sty <- asks envParStyles
let parstyle = elemToParagraphStyle ns element sty
parparts <- mapD (elemToParPart ns) (elChildren element)
num <- asks envNumbering
case lookupLevel numId lvl num of
Just levelInfo -> return $ ListItem parstyle numId lvl levelInfo parparts
Nothing -> throwError WrongElem
elemToBodyPart ns element
| isElem ns "w" "p" element = do
let parstyle = elemToParagraphStyle ns element
sty <- asks envParStyles
let parstyle = elemToParagraphStyle ns element sty
parparts <- mapD (elemToParPart ns) (elChildren element)
return $ Paragraph parstyle parparts
elemToBodyPart ns element
Expand Down Expand Up @@ -684,14 +695,15 @@ elemToRun ns element
return $ Run runStyle runElems
elemToRun _ _ = throwError WrongElem

elemToParagraphStyle :: NameSpaces -> Element -> ParagraphStyle
elemToParagraphStyle ns element
elemToParagraphStyle :: NameSpaces -> Element -> CharStyleMap -> ParagraphStyle
elemToParagraphStyle ns element sty
| Just pPr <- findChild (elemName ns "w" "pPr") element =
ParagraphStyle
{pStyle =
let style =
mapMaybe
(findAttr (elemName ns "w" "val"))
(findChildren (elemName ns "w" "pStyle") pPr)
in ParagraphStyle
{pStyle = style
, indentation =
findChild (elemName ns "w" "ind") pPr >>=
elemToParIndentation ns
Expand All @@ -703,8 +715,19 @@ elemToParagraphStyle ns element
Just "none" -> False
Just _ -> True
Nothing -> False
, pHeading =
case mapMaybe (\x -> M.lookup x sty) style of
[] -> Nothing
x -> let getHeading s =
case rHeading s of
Nothing | Just parentStyle <- rStyle s -> getHeading (snd parentStyle)
| otherwise -> Nothing
Just heading -> Just heading
in case mapMaybe getHeading x of
[] -> Nothing
(x:_) -> Just x
}
elemToParagraphStyle _ _ = defaultParagraphStyle
elemToParagraphStyle _ _ _ = defaultParagraphStyle

checkOnOff :: NameSpaces -> Element -> QName -> Maybe Bool
checkOnOff ns rPr tag
Expand Down Expand Up @@ -735,6 +758,21 @@ elemToRunStyleD ns element
return $ elemToRunStyle ns element parentSty
elemToRunStyleD _ _ = return defaultRunStyle

isNumericNotNull :: String -> Bool
isNumericNotNull str = (str /= []) && (all isDigit str)

getHeaderLevel :: NameSpaces -> Element -> Maybe (String,Int)
getHeaderLevel ns element
| Just styleId <- findAttr (elemName ns "w" "styleId") element
, Just index <- stripPrefix "Heading" styleId
, isNumericNotNull index = Just (styleId, read index)
| Just styleId <- findAttr (elemName ns "w" "styleId") element
, Just index <- findChild (elemName ns "w" "name") element >>=
findAttr (elemName ns "w" "val") >>=
stripPrefix "heading "
, isNumericNotNull index = Just (styleId, read index)
getHeaderLevel _ _ = Nothing

elemToRunStyle :: NameSpaces -> Element -> Maybe CharStyle -> RunStyle
elemToRunStyle ns element parentStyle
| Just rPr <- findChild (elemName ns "w" "rPr") element =
Expand All @@ -754,9 +792,10 @@ elemToRunStyle ns element parentStyle
, rUnderline =
findChild (elemName ns "w" "u") rPr >>=
findAttr (elemName ns "w" "val")
, rHeading = getHeaderLevel ns element
, rStyle = parentStyle
}
elemToRunStyle _ _ _ = defaultRunStyle
elemToRunStyle ns element parentStyle = defaultRunStyle { rHeading = getHeaderLevel ns element, rStyle = parentStyle }

elemToRunElem :: NameSpaces -> Element -> D RunElem
elemToRunElem ns element
Expand Down

0 comments on commit 5880b93

Please sign in to comment.