Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 67 additions & 5 deletions scripts/latex_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,12 +113,74 @@ def replace_si_unit(m: re.Match) -> str:
return text


def _find_brace_content(text: str, start: int) -> tuple[str, int] | None:
"""Find the content of a brace group starting at *start*, handling arbitrary nesting.

*start* must point to the opening ``{``. Returns ``(content, end)``
where *end* is the index just past the closing ``}``, or ``None`` if
the braces are unbalanced.
"""
if start >= len(text) or text[start] != "{":
return None
depth = 1
i = start + 1
while i < len(text) and depth > 0:
if text[i] == "{":
depth += 1
elif text[i] == "}":
depth -= 1
i += 1
if depth != 0:
return None
# content excludes the outer braces
return text[start + 1 : i - 1], i


# Each bracket macro mapped to the (opening, closing) delimiter pair that
# replaces it. Raw strings keep the LaTeX backslashes readable.
_BRACKET_MACROS = {
    r"\PB": (r"\left(", r"\right)"),
    r"\RB": (r"\left[", r"\right]"),
    r"\CB": (r"\left\{", r"\right\}"),
}


def _expand_all_bracket_macros(text: str) -> str:
    r"""Expand all ``\PB``, ``\RB``, ``\CB`` macros in *text*, inside-out.

    The text is scanned left to right. Wherever a macro name from
    ``_BRACKET_MACROS`` is immediately followed by ``{``, the matching
    brace group is located with ``_find_brace_content``, its content is
    expanded recursively, and the macro is replaced by
    ``<left> <content> <right>``. Because inner content is expanded before
    the outer macro is emitted (e.g. ``\PB{a \PB{b}}``), the result holds
    no bracket macros whose braces could be matched.

    A macro whose brace group cannot be matched is copied through
    unchanged, one character at a time, so any well-formed macros nested
    after it are still expanded.
    """
    result: list[str] = []
    i = 0
    while i < len(text):
        # startswith with an offset avoids copying the tail of the string
        # at every position (text[i:] made the scan quadratic).
        matched_macro = next(
            (
                macro
                for macro in _BRACKET_MACROS
                if text.startswith(macro + "{", i)
            ),
            None,
        )
        if matched_macro is not None:
            brace_start = i + len(matched_macro)
            found = _find_brace_content(text, brace_start)
            if found is not None:
                content, end = found
                # Expand nested macros first so the output is macro-free.
                content = _expand_all_bracket_macros(content)
                left, right = _BRACKET_MACROS[matched_macro]
                result.append(f"{left} {content} {right}")
                i = end
                continue
        # Not a macro here (or its braces are unbalanced): copy verbatim.
        result.append(text[i])
        i += 1
    return "".join(result)


def expand_bracket_macros(text: str) -> str:
    r"""Expand \PB{}, \RB{}, \CB{} bracket macros to standard LaTeX.

    Delegates to ``_expand_all_bracket_macros``, which matches braces by
    nesting depth and recurses into macro arguments, so arbitrarily nested
    content such as ``\PB{\frac{\dot{m}_{a}}{\dot{m}_{b}}}`` is handled.

    The earlier regex-based substitutions (built on ``_BRACE_RE``) are
    intentionally gone: leaving them ahead of this call returned early and
    made the nesting-aware expansion unreachable.
    """
    return _expand_all_bracket_macros(text)


def convert_callout_env(text: str) -> str:
Expand Down
64 changes: 47 additions & 17 deletions scripts/markdown_postprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,23 +156,31 @@ def resolve(m: re.Match) -> str:


def clean_equation_labels(text: str) -> str:
    r"""Remove ``\label{...}`` commands from display math blocks.

    Only labels that sit directly against a ``$$`` delimiter are stripped:

    - ``\label{...}$$`` — label immediately before a closing ``$$``
      (any whitespace in front of the label is removed with it);
    - ``$$\label{...}`` — label immediately after an opening ``$$``
      (any whitespace after the label is removed with it).

    Full ``$$...$$`` blocks are deliberately not matched, because adjacent
    inline math can create false ``$$`` boundaries.

    ``<a id="..."></a>`` anchors that precede a ``$$`` block are left
    untouched. Per the review note on this change, the Pandoc Lua filter
    emits labelled equations as ``<a id="label"></a>\n\n$$...`` and that
    anchor is the only in-page target for ``(#label)`` links; stripping it
    would turn every equation reference into a dead link.
    """
    # Label right before a closing $$ (swallow whitespace before the label).
    text = re.sub(r"\s*\\label\{[^}]+\}(\$\$)", r"\1", text)
    # Label right after an opening $$ (swallow whitespace after the label).
    text = re.sub(r"(\$\$)\\label\{[^}]+\}\s*", r"\1", text)
    # NOTE: no substitution for '<a id="..."></a>\n\n$$' — see docstring.
    return text


def fix_heading_dashes(text: str) -> str:
Expand All @@ -192,12 +200,31 @@ def replace_dashes(m: re.Match) -> str:
return re.sub(r"^(#{1,6})\s+(.+)$", replace_dashes, text, flags=re.MULTILINE)


def clean_pandoc_ref_attributes(text: str) -> str:
    r"""Tidy the markdown Pandoc produces for ``\ref{}``.

    Pandoc renders ``\ref{label}`` as
    ``[\[label\]](#label){reference-type="ref" reference="label"}``.
    Two passes are applied, in order:

    1. every ``{reference-type="..." reference="..."}`` attribute span is
       deleted;
    2. escaped-bracket link text ``[\[label\]]`` is rewritten to
       ``[label]``.
    """
    attribute_span = r'\{reference-type="[^"]*"\s+reference="[^"]*"\}'
    escaped_link_text = r"\[\\\[([^\]]*?)\\\]\]"

    without_attributes = re.sub(attribute_span, "", text)
    return re.sub(escaped_link_text, r"[\1]", without_attributes)


def clean_pandoc_artifacts(text: str) -> str:
"""Remove Pandoc artifacts from the converted markdown."""
# Remove {.unnumbered} from headings
text = re.sub(r"\s*\{\.unnumbered\}", "", text)
# Remove {#sec:...} attributes that Pandoc adds
text = re.sub(r"\s*\{#[^}]+\}", "", text)
# Clean Pandoc \ref{} output (attribute spans and escaped brackets)
text = clean_pandoc_ref_attributes(text)
# Fix escaped underscores in non-math contexts
# (be careful not to break underscores in math mode)
# Only fix double-escaped underscores
Expand All @@ -213,6 +240,9 @@ def clean_empty_links(text: str) -> str:
"""Remove empty links and fix malformed link syntax."""
# Remove [](empty) links
text = re.sub(r"\[\]\([^)]*\)", "", text)
# Clean empty bracket artifacts in image alt text: ![caption []]( → ![caption](
# Uses .*? to handle alt text that itself contains brackets (e.g. equation refs)
text = re.sub(r"(!\[.*?)\s*\[\](\]\()", r"\1\2", text)
return text


Expand Down