Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

switch from markdown2 to mistletoe #110

Merged
merged 6 commits into from
Feb 16, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
189 changes: 189 additions & 0 deletions mathjax_editing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
# Ported / adapted from <https://dev.sstatic.net/Js/mathjax-editing.en.js>

# The MIT License (MIT)
#
# Copyright (c) 2016 Stack Exchange
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
"""
This module contains two functions useful for working with markdown strings
that may contain LaTeX that should be processed with MathJax.

`remove_math` strips out parts of the markdown that could potentially contain
math and `replace_math` puts the math back in.
"""

import re
from typing import List, Match, Tuple

# Delimiters that `remove_math` splits on. The single capture group makes
# `re.split` keep the delimiters, so they end up at the odd indices of the
# resulting list. Alternatives, in order: `$` / `$$`, `\begin{...}` /
# `\end{...}`, the escapes `\\`, `\{`, `\}`, `\$`, bare braces, runs of
# newlines (with any following whitespace), and `@@number@@` markers.
SPLIT = re.compile(r'(\$\$?|\\(?:begin|end)\{[a-z]*\*?\}|\\[\\{}$]|[{}]|(?:\n\s*)+|@@\d+@@)', re.I)


def remove_math(text: str, inline: str) -> Tuple[str, List[str]]:
    r"""
    Replace every potential math section of `text` by an `@@number@@` marker.

    text: input markdown string
    inline: symbol for inline math (usually '$')

    returns a tuple (stripped_text, math)

    stripped_text: the input string with math replaced by @@number@@
    math: a list of the removed math strings; `number` is the index into it

    Break up the text into its component parts and search
    through them for math delimiters, braces, linebreaks, etc.
    Math delimiters must match and braces must balance.
    Don't allow math to pass through a double linebreak
    (which will be a paragraph).

    Text that already looks like one of our markers is stored and replaced
    by a fresh marker as well, so it survives the round trip unchanged.

    NOTE(review): the loop below has a branch for backticks (no math inside
    them), but `SPLIT` has no backtick alternative, so a delimiter block
    never starts with a backtick and that branch is not reached.
    """

    def process_math(i: int, j: int) -> None:
        """
        The math is in blocks i through j, so collect it into one string,
        blank out blocks i+1..j, put a marker into block i and push the
        math string onto the storage list. Finally reset the delimiter
        tracking state.
        """
        nonlocal start, end, last
        # TODO: replacing >, <, & by named entities at this point seems to
        # screw up code blocks, so the math is stored verbatim.
        block = "".join(blocks[i:j + 1])
        if indent:
            block = re.sub(r'\n ', '\n', block)
        for k in range(i + 1, j + 1):
            blocks[k] = ""
        blocks[i] = f"@@{len(math)}@@"
        math.append(block)
        start = end = last = None

    def double_escape_delimiters(text: str, inline: str) -> str:
        r"""
        the commonmark renderer will render any `\$` as a simple `$`
        which could become a problem if `$` is used as a mathjax inline
        delimiter because then even escaped equations (starting with `\$`)
        would be detected as mathjax equations. Let's double-escape to make
        sure we still have a `\$` after commonmark did its conversion.
        """
        if not inline.startswith("\\"):
            # Plain str.replace on purpose: with re.sub the replacement
            # template '\\\\$' is itself un-escaped to `\$`, which turned
            # the substitution into a no-op.
            return text.replace('\\$', '\\\\$')
        return text

    # Tracking state for the math (or backtick) span currently open:
    # `start` is the index of the opening delimiter block, `end` the closing
    # delimiter we are waiting for, `last` the most recent index at which
    # the span could be closed if nothing better turns up.
    start = end = last = None
    indent = None  # set once a newline block ending in a space was seen
    braces = None  # brace depth inside the open span; -1 = no balancing
    math: List[str] = []  # stores math strings for later

    blocks: List[str] = re.split(SPLIT, re.sub(r'\r\n?', "\n", text))
    m = len(blocks)

    # Delimiters sit at the odd indices, so step through those only.
    # A `while` loop is required because `i` may be rewound to `last`.
    i = 1
    while i < m:
        block = blocks[i]
        if block[0] == "@":
            #
            #  Things that look like our math markers will get
            #  stored and then retrieved along with the math.
            #
            blocks[i] = f"@@{len(math)}@@"
            math.append(block)
        elif start is not None:
            #
            #  If we are in math or backticks,
            #    look for the end delimiter,
            #    but don't go past double line breaks,
            #    and balance braces within the math,
            #    but don't process math inside backticks.
            #
            if block == end:
                if braces > 0:
                    # closing delimiter inside unbalanced braces:
                    # remember it as a fallback end position
                    last = i
                elif braces == 0:
                    process_math(start, i)
                else:
                    start = end = last = None
            elif re.search(r'\n.*\n', block) or i + 2 >= m:
                # paragraph break or last delimiter of the text:
                # fall back to the remembered end position, if any
                if last is not None:
                    i = last
                    if braces >= 0:
                        process_math(start, i)
                start = end = last = None
                braces = 0
            elif block == "{" and braces >= 0:
                braces += 1
            elif block == "}" and braces > 0:
                braces -= 1
        else:
            #
            #  Look for math start delimiters and when
            #    found, set up the end delimiter.
            #
            if block == inline or block == "$$":
                start = i
                end = block
                braces = 0
            elif block[1:6] == "begin":
                start = i
                end = "\\end" + block[6:]
                braces = 0
            elif block[0] == "`":
                start = i
                last = i
                end = block
                braces = -1  # no brace balancing
            elif block[0] == "\n":
                if re.search(r' $', block):
                    indent = True
        i += 2

    if last is not None:
        process_math(start, last)

    return (double_escape_delimiters("".join(blocks), inline), math)


def replace_math(input: str, math: List[str]) -> str:
    """
    Put back the math strings that were saved by `remove_math`.

    input: a string, already processed into HTML by some markdown renderer;
        may contain @@number@@ blocks indicating where math was removed by
        remove_math.

    math: a list of strings containing math blocks to be spliced back into
        input; `@@number@@` is replaced by `math[number]`.

    returns the input with every marker that has a stored math string
    replaced by that string. A marker whose number is not a valid index
    into `math` is left untouched instead of raising IndexError.
    """

    def replacer(match: Match) -> str:
        index = int(match.group(1))
        if index < len(math):
            return math[index]
        # Not one of ours (e.g. marker-like text that was already spliced
        # back into a code block): keep it as it is.
        return match.group(0)

    # A function replacement is inserted literally, so backslashes in the
    # stored LaTeX are not interpreted as template escapes.
    return re.sub(r'@@(\d+)@@', replacer, input)
112 changes: 112 additions & 0 deletions mistletoe_renderer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
"""
This module contains a class CustomHTMLRenderer, which uses
mistletoe to generate HTML from markdown.

Extra features include:
- Library note links
- Managing LaTeX so that MathJax will be able to process it in the browser
- Syntax highlighting with Pygments
"""
import re

from mistletoe import Document, HTMLRenderer, span_token
from pygments import highlight
from pygments.lexers import get_lexer_by_name as get_lexer
from pygments.formatters.html import HtmlFormatter

from mathjax_editing import remove_math, replace_math


class NoteLink(span_token.SpanToken):
    """
    Detect library note links, i.e. text of the form `Note [name]`
    (matched case-insensitively).

    Attributes set on each token:
    body: the complete matched text, e.g. `Note [name]`
    note: the text between the brackets, e.g. `name`
    """
    # The matched text is used verbatim; no nested span parsing.
    parse_inner = False
    # Non-greedy on purpose: a greedy `.*` would run up to the *last* `]` on
    # the line and merge several notes (or a following markdown link) into
    # a single token.
    pattern = re.compile(r'Note \[(.*?)\]', re.I)

    def __init__(self, match):
        self.body = match.group(0)
        self.note = match.group(1)


class CustomHTMLRenderer(HTMLRenderer):
"""
Call the constructor with `site_root`.

The main rendering function is `render_md`.
"""

def __init__(self, site_root):
    """
    site_root: prefix that `render_note_link` puts in front of
        `notes.html` when it builds library note links.
    """
    self.site_root = site_root
    # `render_block_code` reads `self.math`; start with an empty list so
    # that calling `.render()` directly (without going through
    # `render_md`) does not fail with AttributeError.
    self.math = []
    super().__init__(NoteLink)

def render_md(self, ds):
    """
    Render a markdown string that may contain LaTeX to HTML.

    This wraps the class's `.render()` function: `mathjax_editing` first
    strips out the sections of the text which potentially contain LaTeX,
    the remainder is rendered, and the stripped sections are spliced back
    into the resulting HTML.
    """
    # The removed math is kept on the instance because `render_block_code`
    # has to run `mathjax_editing.replace_math` on the text of code blocks
    # *before* handing it to Pygments; otherwise the syntax highlighting
    # `<span>`s would get in the way and those blocks could not be spliced
    # back in afterwards.
    stripped, self.math = remove_math(ds, '$')
    document = Document(stripped)
    rendered = self.render(document)
    return replace_math(rendered, self.math)
Comment on lines +53 to +60
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't quite follow why this is necessary - does mistletoe.latex_token.Math not work here?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I posted a reply here. But let's see how #116 turns out!


def render_heading(self, token) -> str:
    """
    Render a heading with an `id` and a hover link to itself, like GitHub.

    TODO: populate a list of table of contents in the `.toc_html` field of the body
    """
    inner: str = self.render_inner(token)

    # Build the anchor the way GitHub does: lowercase, drop everything that
    # is not a word character, hyphen or space, then turn spaces into
    # hyphens. See info and links at https://gist.github.com/asabaylus/3071099
    # NOTE(review): `inner` is the rendered heading content, so any inline
    # markup in a heading would leave its tag names in the anchor - confirm
    # whether that matters for existing pages.
    slug = inner.strip().lower()
    slug = re.sub(r'[^\w\- ]+', '', slug)
    slug = slug.replace(' ', '-')

    template = '<h{level} id="{anchor}" class="markdown-heading">{inner} <a class="hover-link" href="#{anchor}">#</a></h{level}>'
    return template.format(level=token.level, inner=inner, anchor=slug)

# Use pygments highlighting.
# https://github.com/miyuchina/mistletoe/blob/8f2f0161b2af92f8dd25a0a55cb7d437a67938bc/contrib/pygments_renderer.py
# HTMLCodeFormatter class copied from markdown2:
# https://github.com/trentm/python-markdown2/blob/2c58d70da0279fe19d04b3269b04d360a56c01ce/lib/markdown2.py#L1826
class HtmlCodeFormatter(HtmlFormatter):
    """
    Pygments HTML formatter that puts the highlighted code inside
    `<code>` within `<pre>`.
    """

    def _wrap_code(self, inner):
        """A function for use in a Pygments Formatter which
        wraps in <code> tags.
        """
        yield 0, "<code>"
        yield from inner
        yield 0, "</code>"

    def wrap(self, source, outfile=None):
        """Return the source wrapped in code and pre (and div, see below).

        `outfile` is optional so that both calling conventions work:
        older Pygments releases call `wrap(source, outfile)` and expect
        the `<div>` to be added here, while newer releases (2.12 and
        later, as far as we know) call `wrap(source)` and add the
        `<div>` themselves.
        """
        wrapped = self._wrap_pre(self._wrap_code(source))
        if outfile is None:
            return wrapped
        return self._wrap_div(wrapped)

# `cssclass` here should agree with what we have in pygments.css
formatter = HtmlCodeFormatter(cssclass='codehilite')

def render_block_code(self, token):
    """
    Render a code block with Pygments syntax highlighting.

    The language given for the block selects the lexer; blocks without a
    language are highlighted as 'lean', and an unknown lexer name falls
    back to plain 'text'.
    """
    # Local import: only this method needs the exception class.
    from pygments.util import ClassNotFound

    # replace math before highlighting
    code = replace_math(token.children[0].content, self.math)
    try:
        # default to 'lean' if no language is specified
        lexer = get_lexer(token.language or 'lean')
    except ClassNotFound:
        # Only "no such lexer" is handled here; anything else (including
        # KeyboardInterrupt) must not be swallowed.
        lexer = get_lexer('text')
    return highlight(code, lexer, self.formatter)

def render_note_link(self, token):
    """
    Render a library note link: the matched text becomes a link to the
    note's anchor in `notes.html` under `site_root`.
    """
    target = f'{self.site_root}notes.html#{token.note}'
    return f'<a href="{target}">{token.body}</a>'
Loading