From 22d7379a8c71f5d2e3d267261ae8f1eebf3b4cf7 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 24 Feb 2026 00:20:28 +0000 Subject: [PATCH 1/2] Fix equation labels, Pandoc ref attributes, and image caption artifacts - Strip \label{} commands from display math blocks (both before closing $$ and after opening $$), fixing ~366 occurrences of raw LaTeX labels in MathJax equations. Use targeted pattern matching instead of fragile $$...$$ block matching to avoid false matches from adjacent inline math. - Strip Pandoc {reference-type="ref" reference="..."} attribute spans (~476 occurrences) and unescape bracket notation in ref link text. - Clean empty [] bracket artifacts in image alt text (~11 occurrences). https://claude.ai/code/session_01AxQpFdDAmHAXuZcjgjjGiC --- scripts/markdown_postprocessor.py | 64 +++++++++++++++++++++++-------- 1 file changed, 47 insertions(+), 17 deletions(-) diff --git a/scripts/markdown_postprocessor.py b/scripts/markdown_postprocessor.py index 5b76d46ac..4db6411c5 100644 --- a/scripts/markdown_postprocessor.py +++ b/scripts/markdown_postprocessor.py @@ -156,23 +156,31 @@ def resolve(m: re.Match) -> str: def clean_equation_labels(text: str) -> str: - r"""Convert equation labels to MathJax \tag{} format.""" - - # Pattern: equation with \label inside $$ blocks - def add_tag(m: re.Match) -> str: - eq_body = m.group(1) - label_match = re.search(r'', eq_body) - if label_match: - label = label_match.group(1) - # Remove the anchor from inside the equation - eq_body = re.sub(r'\s*', "", eq_body) - # Add \tag at the end of the equation - eq_body = eq_body.rstrip() - if not eq_body.endswith(r"\tag"): - eq_body += f" \\tag{{{label}}}" - return f"$$\n{eq_body}\n$$" - - return re.sub(r"\$\$\n(.*?)\n\$\$", add_tag, text, flags=re.DOTALL) + r"""Remove equation labels from display math blocks. + + Labels appear in two forms depending on how Pandoc processes them: + 1. ```` — from the Lua filter's RawInline/RawBlock handler + 2. ``\label{label}`` — when Pandoc passes display math content verbatim + + In both cases we strip the label. The label is already emitted as an + ```` anchor *before* the ``$$`` block (by the Lua filter's + RawBlock handler) or indexed in the label index, so keeping it inside + the math would only confuse MathJax. + + Rather than trying to match full ``$$…$$`` blocks (which is fragile due + to adjacent inline-math creating false ``$$`` boundaries), we target + the specific patterns where labels appear: + - ``\label{…}$$`` — label immediately before a closing ``$$`` + - ```` inside ``$$`` blocks (handled by Lua filter anchor) + """ + # Strip \label{...} that appears before a closing $$ (with optional whitespace) + text = re.sub(r"\s*\\label\{[^}]+\}(\$\$)", r"\1", text) + # Strip \label{...} that appears after an opening $$ (on the same line) + text = re.sub(r"(\$\$)\\label\{[^}]+\}\s*", r"\1", text) + # Strip anchors inside equation blocks + # These appear on a line between $$ markers + text = re.sub(r'()\s*\n\n\$\$', "\n$$", text) + return text def fix_heading_dashes(text: str) -> str: @@ -192,12 +200,31 @@ def replace_dashes(m: re.Match) -> str: return re.sub(r"^(#{1,6})\s+(.+)$", replace_dashes, text, flags=re.MULTILINE) +def clean_pandoc_ref_attributes(text: str) -> str: + r"""Clean Pandoc's ``\ref{}`` output artifacts. + + Pandoc converts ``\ref{label}`` to + ``[\[label\]](#label){reference-type="ref" reference="label"}``. + This function: + 1. Strips the ``{reference-type=... reference=...}`` attribute span. + 2. Unescapes the bracket notation in link text: ``\[label\]`` → ``label``. + """ + # Strip {reference-type="..." reference="..."} attribute spans + text = re.sub(r'\{reference-type="[^"]*"\s+reference="[^"]*"\}', "", text) + # Clean escaped brackets in link text: [\[label\]] → [label] + # Only inside markdown links to avoid false positives + text = re.sub(r"\[\\\[([^\]]*?)\\\]\]", r"[\1]", text) + return text + + def clean_pandoc_artifacts(text: str) -> str: """Remove Pandoc artifacts from the converted markdown.""" # Remove {.unnumbered} from headings text = re.sub(r"\s*\{\.unnumbered\}", "", text) # Remove {#sec:...} attributes that Pandoc adds text = re.sub(r"\s*\{#[^}]+\}", "", text) + # Clean Pandoc \ref{} output (attribute spans and escaped brackets) + text = clean_pandoc_ref_attributes(text) # Fix escaped underscores in non-math contexts # (be careful not to break underscores in math mode) # Only fix double-escaped underscores @@ -213,6 +240,9 @@ def clean_empty_links(text: str) -> str: """Remove empty links and fix malformed link syntax.""" # Remove [](empty) links text = re.sub(r"\[\]\([^)]*\)", "", text) + # Clean empty bracket artifacts in image alt text: ![caption []]( → ![caption]( + # Uses .*? to handle alt text that itself contains brackets (e.g. equation refs) + text = re.sub(r"(!\[.*?)\s*\[\](\]\()", r"\1\2", text) return text From 5272e10bb08b2c304dbd8dcc17068226c92b52cc Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 24 Feb 2026 01:48:35 +0000 Subject: [PATCH 2/2] Fix unexpanded \PB{}/\RB{}/\CB{} bracket macros in math equations Replace the single-pass regex-based bracket macro expansion with a recursive stack-based approach that properly handles arbitrary brace nesting depth. The old regex could only match one level of nested braces, leaving macros like \PB{\frac{\dot{m}_{a}}{\dot{m}_{b}}} unexpanded (21 occurrences across 8 files). The new implementation uses a brace-counting parser that recursively expands inner macros first, eliminating all remaining instances. https://claude.ai/code/session_01AxQpFdDAmHAXuZcjgjjGiC --- scripts/latex_preprocessor.py | 72 ++++++++++++++++++++++++++++++++--- 1 file changed, 67 insertions(+), 5 deletions(-) diff --git a/scripts/latex_preprocessor.py b/scripts/latex_preprocessor.py index 731465a75..b41d26fd5 100644 --- a/scripts/latex_preprocessor.py +++ b/scripts/latex_preprocessor.py @@ -113,12 +113,74 @@ def replace_si_unit(m: re.Match) -> str: return text +def _find_brace_content(text: str, start: int) -> tuple[str, int] | None: + """Find the content of a brace group starting at *start*, handling arbitrary nesting. + + *start* must point to the opening ``{``. Returns ``(content, end)`` + where *end* is the index just past the closing ``}``, or ``None`` if + the braces are unbalanced. + """ + if start >= len(text) or text[start] != "{": + return None + depth = 1 + i = start + 1 + while i < len(text) and depth > 0: + if text[i] == "{": + depth += 1 + elif text[i] == "}": + depth -= 1 + i += 1 + if depth != 0: + return None + # content excludes the outer braces + return text[start + 1 : i - 1], i + + +_BRACKET_MACROS = { + "\\PB": ("\\left(", "\\right)"), + "\\RB": ("\\left[", "\\right]"), + "\\CB": ("\\left\\{", "\\right\\}"), +} + + +def _expand_all_bracket_macros(text: str) -> str: + r"""Expand all ``\PB``, ``\RB``, ``\CB`` macros in *text*, inside-out. + + When an outer macro wraps inner macros (e.g. ``\PB{a \PB{b}}``) the + inner content is recursively expanded first, so the final result + contains no bracket macros regardless of nesting depth. + """ + result: list[str] = [] + i = 0 + while i < len(text): + matched_macro = None + for macro in _BRACKET_MACROS: + if text[i:].startswith(macro + "{"): + matched_macro = macro + break + if matched_macro: + brace_start = i + len(matched_macro) + found = _find_brace_content(text, brace_start) + if found: + content, end = found + # Recursively expand any bracket macros inside the content + content = _expand_all_bracket_macros(content) + left, right = _BRACKET_MACROS[matched_macro] + result.append(f"{left} {content} {right}") + i = end + continue + result.append(text[i]) + i += 1 + return "".join(result) + + def expand_bracket_macros(text: str) -> str: - r"""Expand \PB{}, \RB{}, \CB{} bracket macros to standard LaTeX.""" - text = re.sub(r"\\PB" + _BRACE_RE, r"\\left( \1 \\right)", text) - text = re.sub(r"\\RB" + _BRACE_RE, r"\\left[ \1 \\right]", text) - text = re.sub(r"\\CB" + _BRACE_RE, r"\\left\\{ \1 \\right\\}", text) - return text + r"""Expand \PB{}, \RB{}, \CB{} bracket macros to standard LaTeX. + + Uses stack-based brace matching with recursion to handle arbitrary + nesting depth (e.g. ``\PB{\frac{\dot{m}_{a}}{\dot{m}_{b}}}``). + """ + return _expand_all_bracket_macros(text) def convert_callout_env(text: str) -> str: