idfkit · samuelduchesne · Feb 24, 2026 · Feb 24, 2026 · Feb 24, 2026 · chatgpt-codex-connector
diff --git a/scripts/latex_preprocessor.py b/scripts/latex_preprocessor.py
@@ -113,12 +113,74 @@ def replace_si_unit(m: re.Match) -> str:
     return text
 
 
+def _find_brace_content(text: str, start: int) -> tuple[str, int] | None:
+    r"""Return the content of the brace group opening at *start*, with nesting.
+
+    *start* must index the opening ``{``.  Returns ``(content, end)`` where
+    *end* is the index just past the matching ``}``, or ``None`` when the
+    braces are unbalanced.  Backslash-escaped characters (``\{``, ``\}``)
+    are literal delimiters in LaTeX and never change the nesting depth.
+    """
+    if start >= len(text) or text[start] != "{":
+        return None
+    depth = 1
+    i = start + 1
+    while i < len(text) and depth > 0:
+        if text[i] == "\\":
+            i += 1  # skip the escaped character so \{ and \} are not counted
+        elif text[i] == "{":
+            depth += 1
+        elif text[i] == "}":
+            depth -= 1
+        i += 1
+    return (text[start + 1 : i - 1], i) if depth == 0 else None
+
+
+_BRACKET_MACROS = {  # macro name -> (opening delimiter, closing delimiter)
+    r"\PB": (r"\left(", r"\right)"),  # parentheses
+    r"\RB": (r"\left[", r"\right]"),  # square brackets
+    r"\CB": (r"\left\{", r"\right\}"),  # curly braces, escaped for LaTeX
+}
+
+
+def _expand_all_bracket_macros(text: str) -> str:
+    r"""Expand all ``\PB``, ``\RB``, ``\CB`` macros in *text*, inside-out.
+
+    Inner macros (e.g. the ``\PB{b}`` in ``\PB{a \PB{b}}``) are expanded
+    recursively before the outer one, so no balanced macro survives at any
+    depth; a macro with unbalanced braces is copied through unchanged.
+    """
+    result: list[str] = []
+    i = 0
+    while i < len(text):
+        matched_macro = None
+        for macro in _BRACKET_MACROS:
+            # startswith(..., i) tests in place instead of slicing text[i:] each step
+            if text.startswith(macro + "{", i):
+                matched_macro = macro
+                break
+        if matched_macro:
+            brace_start = i + len(matched_macro)
+            found = _find_brace_content(text, brace_start)
+            if found:
+                content, end = found
+                # Expand nested macros first so the wrapped content is already clean
+                content = _expand_all_bracket_macros(content)
+                left, right = _BRACKET_MACROS[matched_macro]
+                result.append(f"{left} {content} {right}")
+                i = end
+                continue
+        result.append(text[i])
+        i += 1
+    return "".join(result)
+
+
 def expand_bracket_macros(text: str) -> str:
-    r"""Expand \PB{}, \RB{}, \CB{} bracket macros to standard LaTeX."""
-    text = re.sub(r"\\PB" + _BRACE_RE, r"\\left( \1 \\right)", text)
-    text = re.sub(r"\\RB" + _BRACE_RE, r"\\left[ \1 \\right]", text)
-    text = re.sub(r"\\CB" + _BRACE_RE, r"\\left\\{ \1 \\right\\}", text)
-    return text
+    r"""Expand \PB{}, \RB{}, \CB{} bracket macros to standard LaTeX.
+
+    Uses depth-counting brace matching plus recursion into nested macros,
+    so any nesting depth works (e.g. ``\PB{\frac{\dot{m}_{a}}{\dot{m}_{b}}}``).
+    """
+    return _expand_all_bracket_macros(text)
 
 
 def convert_callout_env(text: str) -> str:

diff --git a/scripts/markdown_postprocessor.py b/scripts/markdown_postprocessor.py
@@ -156,23 +156,31 @@ def resolve(m: re.Match) -> str:
 
 
 def clean_equation_labels(text: str) -> str:
-    r"""Convert equation labels to MathJax \tag{} format."""
-
-    # Pattern: equation with \label inside $$ blocks
-    def add_tag(m: re.Match) -> str:
-        eq_body = m.group(1)
-        label_match = re.search(r'<a id="([^"]+)"></a>', eq_body)
-        if label_match:
-            label = label_match.group(1)
-            # Remove the anchor from inside the equation
-            eq_body = re.sub(r'<a id="[^"]+"></a>\s*', "", eq_body)
-            # Add \tag at the end of the equation
-            eq_body = eq_body.rstrip()
-            if not eq_body.endswith(r"\tag"):
-                eq_body += f" \\tag{{{label}}}"
-        return f"$$\n{eq_body}\n$$"
-
-    return re.sub(r"\$\$\n(.*?)\n\$\$", add_tag, text, flags=re.DOTALL)
+    r"""Remove ``\label{...}`` commands from display math blocks.
+
+    Labels appear in two forms depending on how Pandoc processes them:
+    1. ``<a id="label"></a>`` — from the Lua filter's RawInline/RawBlock handler
+    2. ``\label{label}`` — when Pandoc passes display math content verbatim
+
+    Only form 2 is stripped here: a raw ``\label`` inside the math would
+    only confuse MathJax.  The ``<a id="...">`` anchor that is emitted
+    *before* the ``$$`` block is deliberately left in place, because it is
+    the target that ``(#label)`` links point at.
+
+    Rather than trying to match full ``$$…$$`` blocks (which is fragile due
+    to adjacent inline-math creating false ``$$`` boundaries), we target
+    the specific patterns where labels appear:
+    - ``\label{…}$$`` — label immediately before a closing ``$$``
+    - ``$$\label{…}`` — label immediately after an opening ``$$``
+    """
+    # Strip \label{...} directly before a closing $$, plus whitespace before it
+    text = re.sub(r"\s*\\label\{[^}]+\}(\$\$)", r"\1", text)
+    # Strip \label{...} directly after an opening $$, plus whitespace after it
+    text = re.sub(r"(\$\$)\\label\{[^}]+\}\s*", r"\1", text)
+    # NOTE: <a id="..."></a> anchors are intentionally not matched.  Removing
+    # the anchor that precedes "\n\n$$" would delete the equation's link
+    # target and leave (#label) references dangling.
+    return text
 
 
 def fix_heading_dashes(text: str) -> str:
@@ -192,12 +200,31 @@ def replace_dashes(m: re.Match) -> str:
     return re.sub(r"^(#{1,6})\s+(.+)$", replace_dashes, text, flags=re.MULTILINE)
 
 
+def clean_pandoc_ref_attributes(text: str) -> str:
+    r"""Clean Pandoc's ``\ref{}`` output artifacts.
+
+    Pandoc converts ``\ref{label}`` to
+    ``[\[label\]](#label){reference-type="ref" reference="label"}``.
+    This function:
+    1. Strips the ``{reference-type=... reference=...}`` attribute span.
+    2. Unescapes the link text ``[\[label\]]`` → ``[label]`` (links only).
+    """
+    # Strip {reference-type="..." reference="..."} attribute spans
+    text = re.sub(r'\{reference-type="[^"]*"\s+reference="[^"]*"\}', "", text)
+    # Clean escaped brackets in link text: [\[label\]](...) → [label](...)
+    # The (?=\() lookahead limits this to real links (avoids false positives)
+    text = re.sub(r"\[\\\[([^\]]*?)\\\]\](?=\()", r"[\1]", text)
+    return text
+
+
 def clean_pandoc_artifacts(text: str) -> str:
     """Remove Pandoc artifacts from the converted markdown."""
     # Remove {.unnumbered} from headings
     text = re.sub(r"\s*\{\.unnumbered\}", "", text)
     # Remove {#sec:...} attributes that Pandoc adds
     text = re.sub(r"\s*\{#[^}]+\}", "", text)
+    # Clean Pandoc \ref{} output (attribute spans and escaped brackets)
+    text = clean_pandoc_ref_attributes(text)
     # Fix escaped underscores in non-math contexts
     # (be careful not to break underscores in math mode)
     # Only fix double-escaped underscores
@@ -213,6 +240,9 @@ def clean_empty_links(text: str) -> str:
     """Remove empty links and fix malformed link syntax."""
     # Remove [](empty) links
     text = re.sub(r"\[\]\([^)]*\)", "", text)
+    # Clean empty bracket artifacts in image alt text: ![caption []]( → ![caption](
+    # Uses .*? to handle alt text that itself contains brackets (e.g. equation refs)
+    text = re.sub(r"(!\[.*?)\s*\[\](\]\()", r"\1\2", text)
     return text