In [2]:
import re

# --- Formula Preserver (fixed: uses compiled patterns with flags) ---
PATTERNS = [
    re.compile(r"\$\$(.*?)\$\$", re.S),       # $$ ... $$
    re.compile(r"\\\[(.*?)\\\]", re.S),       # \[ ... \]
    re.compile(r"\\\((.*?)\\\)", re.S),       # \( ... \)
    re.compile(r"(?i)<math\b.*?</math>", re.S),  # MathML <math>...</math>, case-insensitive
]


def mask_formulas(text):
    """Replace every formula in *text* with an opaque placeholder token.

    The patterns in ``PATTERNS`` are applied one after another, each on the
    result of the previous one. Every match is swapped for a token of the
    form ``§F<n>§`` and the matched text (delimiters included) is recorded.

    Because the patterns run sequentially, a match of a later pattern may
    enclose tokens produced by an earlier pattern; the recorded text then
    contains those tokens.

    Args:
        text: The input string, possibly containing formulas.

    Returns:
        A ``(masked_text, tokens)`` tuple where ``tokens`` maps each
        placeholder to the exact text it replaced, in creation order.
    """
    tokens = {}
    # Token-shaped strings that are already part of the input. Issuing one of
    # these as a placeholder would make the literal occurrence
    # indistinguishable from a real token when the text is restored.
    reserved = set(re.findall(r"§F[0-9]+§", text))
    counter = 0

    def _replace(m):
        nonlocal counter
        token = f"§F{counter}§"
        while token in reserved:
            # Skip numbers whose token already appears verbatim in the input.
            counter += 1
            token = f"§F{counter}§"
        tokens[token] = m.group(0)
        counter += 1
        return token

    # Apply patterns one by one
    for pat in PATTERNS:
        text = pat.sub(_replace, text)
    return text, tokens

def unmask(text, tokens):
    """Put the original formulas back in place of their placeholder tokens.

    Args:
        text: A string containing placeholder tokens.
        tokens: Mapping of placeholder token to the text it stands for, in
            the order the tokens were created.

    Returns:
        *text* with every token replaced by its stored formula.
    """
    # Restore the most recently created token first. A later token's formula
    # may itself contain earlier tokens (one formula enclosing another), and
    # those only become visible in the text once the enclosing formula has
    # been put back.
    for token, formula in reversed(list(tokens.items())):
        text = text.replace(token, formula)
    return text

# --- Your cleaner (example) ---
def clean(text):
    """Normalize whitespace and dashes in *text*.

    Steps, in order: runs of spaces/tabs become one space, a single space
    on either side of a newline is dropped, three or more consecutive
    newlines become two, and every em dash becomes a hyphen.
    """
    # (regex, replacement) pairs; the order matters because each step
    # operates on the output of the previous one.
    substitutions = (
        (r"[ \t]+", " "),
        (r" ?\n ?", "\n"),
        (r"\n{3,}", "\n\n"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.replace("—", "-")

# --- Wrapper ---
def clean_preserving_formulas(text):
    """Run ``clean`` on *text* while leaving formulas untouched.

    Formulas are swapped for placeholder tokens by ``mask_formulas``, the
    remaining text is passed through ``clean``, and the formulas are then
    put back by ``unmask``.
    """
    placeholder_text, saved_formulas = mask_formulas(text)
    return unmask(clean(placeholder_text), saved_formulas)

# --- TEST CASE ---
if __name__ == "__main__":
    # Demo input: sloppy prose mixed with one formula of each supported
    # kind (inline, display, double-dollar, MathML).
    sample = r"""
This is a test   to see if it keeps formulas.

Inline math: \(a^2 + b^2 = c^2\)

Block math:
\[
E = mc^2
\]

Double-dollar:
$$
\nabla \cdot \vec{E} = \frac{\rho}{\varepsilon_0}
$$

MathML:
<math>
  <msup><mi>a</mi><mn>2</mn></msup>
  <mo>+</mo>
  <msup><mi>b</mi><mn>2</mn></msup>
  <mo>=</mo>
  <msup><mi>c</mi><mn>2</mn></msup>
</math>

And normal text    gets cleaned   — like this.
"""
    # Show the text before and after cleaning so the two can be compared.
    input_header = "=== INPUT ==="
    output_header = "\n=== OUTPUT ==="

    print(input_header)
    print(sample)

    print(output_header)
    result = clean_preserving_formulas(sample)
    print(result)


=== INPUT ===

This is a test   to see if it keeps formulas.

Inline math: \(a^2 + b^2 = c^2\)

Block math:
\[
E = mc^2
\]

Double-dollar:
$$
\nabla \cdot \vec{E} = \frac{\rho}{\varepsilon_0}
$$

MathML:
<math>
  <msup><mi>a</mi><mn>2</mn></msup>
  <mo>+</mo>
  <msup><mi>b</mi><mn>2</mn></msup>
  <mo>=</mo>
  <msup><mi>c</mi><mn>2</mn></msup>
</math>

And normal text    gets cleaned   — like this.


=== OUTPUT ===

This is a test to see if it keeps formulas.

Inline math: \(a^2 + b^2 = c^2\)

Block math:
\[
E = mc^2
\]

Double-dollar:
$$
\nabla \cdot \vec{E} = \frac{\rho}{\varepsilon_0}
$$

MathML:
<math>
  <msup><mi>a</mi><mn>2</mn></msup>
  <mo>+</mo>
  <msup><mi>b</mi><mn>2</mn></msup>
  <mo>=</mo>
  <msup><mi>c</mi><mn>2</mn></msup>
</math>

And normal text gets cleaned - like this.

