From c693a8107f16ed7304bb8310da897e9303f7cfad Mon Sep 17 00:00:00 2001
From: John MacFarlane
Date: Wed, 5 Jul 2023 23:00:59 -0700
Subject: [PATCH] MediaWiki reader: revise treatment of "link trail."

Previously we only included ASCII letters. That is correct for English
but not for, e.g., Spanish (see comment in #8525). A safer approach is
to include all letters except those in the CJK unified ideograph ranges.
---
 src/Text/Pandoc/Readers/MediaWiki.hs | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/src/Text/Pandoc/Readers/MediaWiki.hs b/src/Text/Pandoc/Readers/MediaWiki.hs
index eafaa8aa4b25..9f3aa24a681c 100644
--- a/src/Text/Pandoc/Readers/MediaWiki.hs
+++ b/src/Text/Pandoc/Readers/MediaWiki.hs
@@ -19,7 +19,7 @@ module Text.Pandoc.Readers.MediaWiki ( readMediaWiki ) where
 
 import Control.Monad
 import Control.Monad.Except (throwError)
-import Data.Char (isAscii, isDigit, isLetter, isSpace)
+import Data.Char (isDigit, isLetter, isSpace)
 import qualified Data.Foldable as F
 import Data.List (intersperse)
 import Data.Maybe (fromMaybe, maybeToList)
@@ -664,7 +664,8 @@ internalLink = try $ do
              -- [[Help:Contents|] -> "Contents"
              <|> return (B.text $ T.drop 1 $ T.dropWhile (/=':') pagename) )
   sym "]]"
-  linktrail <- B.text <$> manyChar (satisfy (\c -> isAscii c && isLetter c))
+  -- see #8525:
+  linktrail <- B.text <$> manyChar (satisfy (\c -> isLetter c && not (isCJK c)))
   let link = B.link (addUnderscores pagename) "wikilink" (label <> linktrail)
   if "Category:" `T.isPrefixOf` pagename
      then do
@@ -672,6 +673,23 @@ internalLink = try $ do
        return mempty
      else return link
 
+isCJK :: Char -> Bool
+isCJK c =
+  (c >= '\x3400' && c <= '\x4DBF') ||
+  (c >= '\x4E00' && c <= '\x9FFF') ||
+  (c >= '\x20000' && c <= '\x2A6DF') ||
+  (c >= '\x2A700' && c <= '\x2B73F') ||
+  (c >= '\x2B740' && c <= '\x2B81F') ||
+  (c >= '\x2B820' && c <= '\x2CEAF') ||
+  (c >= '\x2CEB0' && c <= '\x2EBEF') ||
+  (c >= '\x30000' && c <= '\x3134F') ||
+  (c >= '\x31350' && c <= '\x323AF') ||
+  (c >= '\xF900' && c <= '\xFAFF') ||
+  (c >= '\x2F800' && c <= '\x2FA1F') ||
+  (c >= '\x2F00' && c <= '\x2FDF') ||
+  (c >= '\x2E80' && c <= '\x2EFF') ||
+  (c >= '\x3000' && c <= '\x303F')
+
 externalLink :: PandocMonad m => MWParser m Inlines
 externalLink = try $ do
   char '['