Skip to content

Commit

Permalink
MediaWiki reader: revise treatment of "link trail."
Browse files Browse the repository at this point in the history
Previously we only included ASCII letters. That is correct for
English but not for, e.g., Spanish (see comment in #8525).
A safer approach is to include all letters except those in the
CJK unified ideograph ranges.
  • Loading branch information
jgm committed Jul 6, 2023
1 parent d3e485f commit c693a81
Showing 1 changed file with 20 additions and 2 deletions.
22 changes: 20 additions & 2 deletions src/Text/Pandoc/Readers/MediaWiki.hs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ module Text.Pandoc.Readers.MediaWiki ( readMediaWiki ) where

import Control.Monad
import Control.Monad.Except (throwError)
import Data.Char (isAscii, isDigit, isLetter, isSpace)
import Data.Char (isDigit, isLetter, isSpace)
import qualified Data.Foldable as F
import Data.List (intersperse)
import Data.Maybe (fromMaybe, maybeToList)
Expand Down Expand Up @@ -664,14 +664,32 @@ internalLink = try $ do
-- [[Help:Contents|] -> "Contents"
<|> return (B.text $ T.drop 1 $ T.dropWhile (/=':') pagename) )
sym "]]"
linktrail <- B.text <$> manyChar (satisfy (\c -> isAscii c && isLetter c))
-- see #8525:
linktrail <- B.text <$> manyChar (satisfy (\c -> isLetter c && not (isCJK c)))
let link = B.link (addUnderscores pagename) "wikilink" (label <> linktrail)
if "Category:" `T.isPrefixOf` pagename
then do
updateState $ \st -> st{ mwCategoryLinks = link : mwCategoryLinks st }
return mempty
else return link

-- | Report whether a character lies in one of the CJK code point ranges
-- listed below.  The link-trail parser in 'internalLink' uses this to
-- keep such characters out of the text absorbed after a closing @]]@
-- (see #8525).
isCJK :: Char -> Bool
isCJK c = any within cjkRanges
  where
    -- Both ends of every range are inclusive.
    within (lo, hi) = lo <= c && c <= hi
    cjkRanges =
      [ ('\x3400',  '\x4DBF')   -- CJK Unified Ideographs Extension A
      , ('\x4E00',  '\x9FFF')   -- CJK Unified Ideographs
      , ('\x20000', '\x2A6DF')  -- Extension B
      , ('\x2A700', '\x2B73F')  -- Extension C
      , ('\x2B740', '\x2B81F')  -- Extension D
      , ('\x2B820', '\x2CEAF')  -- Extension E
      , ('\x2CEB0', '\x2EBEF')  -- Extension F
      , ('\x30000', '\x3134F')  -- Extension G
      , ('\x31350', '\x323AF')  -- Extension H
      , ('\xF900',  '\xFAFF')   -- CJK Compatibility Ideographs
      , ('\x2F800', '\x2FA1F')  -- CJK Compatibility Ideographs Supplement
      , ('\x2F00',  '\x2FDF')   -- Kangxi Radicals
      , ('\x2E80',  '\x2EFF')   -- CJK Radicals Supplement
      , ('\x3000',  '\x303F')   -- CJK Symbols and Punctuation
      ]

externalLink :: PandocMonad m => MWParser m Inlines
externalLink = try $ do
char '['
Expand Down

0 comments on commit c693a81

Please sign in to comment.