In [None]:
#| default_exp helper.latex.formatting

# helper.latex.formatting
> Functions for normalizing and cleaning LaTeX code strings.

This module focuses on "cosmetic" transformations—changing the string representation
of LaTeX code without altering its mathematical or semantic meaning. This is 
particularly useful for standardizing text generated by LLMs or user input before 
validation or compilation.

Key functionalities include:
- **Whitespace Normalization:** Removing unnecessary spaces around brackets, subscripts, 
  and superscripts (e.g., `_{ f}` -> `_{f}`).
- **Escape Character Fixes:** Correcting common errors such as a backslash followed by a 
  stray space (e.g., `\ to`) where the plain command (`\to`) is intended.
- **Obsidian Compatibility:** Ensuring LaTeX math delimiters (`$`, `$$`) are correctly 
  spaced for rendering in Markdown tools like Obsidian.md.

In [None]:
#| export
import re

from trouver.helper.regex import (
    latex_indices, replace_string_by_indices)

In [None]:
from fastcore.test import *

Sometimes, when autogenerating a LaTeX string through an ML model, some minor formatting eyesores occur, such as a curly bracket `{` or an underscore `_` followed by an unnecessary space. We provide some functions to fix such formatting.

In [None]:
#| export
def reduce_unnecessary_spaces(
        text: str,
        ) -> str:
    r"""
    Strip whitespace from `text` that is redundant when `text` is read
    as a LaTeX string, and return the result.

    Two passes are made, in this order:

    1. Whitespace immediately *after* any of the characters
       `{`, `_`, `^`, `\`, `(`, `)` is removed.
    2. Whitespace immediately *before* any of the characters
       `}`, `_`, `^`, `(`, `)` is removed.

    Whitespace in front of a backslash is left alone.
    """
    after_char = r'([{_^\\()])\s+'
    before_char = r'\s+([}_^()])'
    # Each pattern captures the anchoring character, so substituting the
    # captured group alone discards only the adjacent whitespace.
    for pattern in (after_char, before_char):
        text = re.sub(pattern, r'\1', text)
    return text

In [None]:

# Whitespace in front of a backslash `\` is kept: eliminating it might not
# be necessary or desirable.
_spacing_cases = [
    (r'something something \  operatorname', r'something something \operatorname'),
    (r'\operatorname{Res}  ^ G_ H (R)', r'\operatorname{Res}^G_H(R)'),
    (r'\operatorname{Res}^{ G}_{ H } (R)', r'\operatorname{Res}^{G}_{H}(R)'),
    (r'M_{ f}', r'M_{f}'),
    (r'h_{ p}', r'h_{p}'),
    (r'\zeta (s)', r'\zeta(s)'),
    (r'\mathcal{ H} _{ v}', r'\mathcal{H}_{v}'),
]
for _raw, _expected in _spacing_cases:
    test_eq(reduce_unnecessary_spaces(_raw), _expected)

#### Make fixes to summary

In [None]:
#| export
def fix_autogen_formatting(
        text: str
        ) -> str:
    r"""Fix some LaTeX formatting issues in an autogenerated text.

    The following steps are applied to `text`, in order:

    1. Every backslash followed by a space (`\ `) is replaced by a
       lone backslash.
    2. A space right after `{` and a space right before `}` are dropped.
    3. Whitespace just inside a `$ ... $` pair is trimmed, so that
       `$ x $` becomes `$x$`.
    4. `reduce_unnecessary_spaces` is applied.
    5. `_insert_newline_or_spaces_around_latex` is applied.

    The cleaned-up string is returned.
    """
    literal_fixes = (
        (r'\ ', '\\'),
        (r'{ ', r'{'),
        (r' }', r'}'),
    )
    for stray, fixed in literal_fixes:
        text = text.replace(stray, fixed)

    # Trim whitespace hugging the inside of `$ ... $` delimiters.
    text = re.sub(r'\$\s*([^\$]+?)\s*\$', r'$\1$', text)

    # TODO: if replacing r'\ ' by '\\' happens to make `\` stick to the
    # previous chunk of text (e.g. r'd\in\mathbb{Z}_{\geq 0}'), then give
    # it some space, e.g. r'd \in \mathbb{Z}_{\geq 0}'.
    return _insert_newline_or_spaces_around_latex(
        reduce_unnecessary_spaces(text))


def _insert_newline_or_spaces_around_latex(
        text:  str
        ) -> str:
    """
    Insert spaces or newlines around latex math mode strings inside `text`
    if necessary.

    - An inline math string (`$...$`) gets one space added on each side
      that is neither the start/end of `text` nor already a space.
    - A display math string (`$$...$$`) gets newlines added on each side
      so that it ends up separated from neighboring text by two
      newlines. No newlines are added on a side that is the start/end
      of `text`, or that already has two newlines.

    Returns the modified string.
    """
    # NOTE(review): assumes `latex_indices` yields `(start, end)` pairs with
    # `end` exclusive, in the form `replace_string_by_indices` expects --
    # confirm against `trouver.helper.regex`.
    math_mode_indices = latex_indices(text)
    replacements = []
    for start, end in math_mode_indices:
        math_mode = text[start:end]
        at_text_start = start == 0
        at_text_end = end == len(text)

        if not math_mode.startswith('$$'):  # starts with exactly one $
            lead = '' if at_text_start or text[start-1] == ' ' else ' '
            trail = '' if at_text_end or text[end] == ' ' else ' '
            replacements.append(f'{lead}{math_mode}{trail}')
            continue

        # Display math: count how many newlines are missing in front.
        if not at_text_start and text[start-1] != '\n':
            front_newline_count = 2
        elif start > 1 and text[start-2] != '\n':
            front_newline_count = 1
        else:
            front_newline_count = 0

        # ... and how many are missing behind. When `text[end]` is already
        # a newline, the character to inspect is the one *after* it,
        # `text[end+1]`, mirroring `text[start-2]` above. `text[end-1]` is
        # the closing `$` of the math string and can never be a newline.
        if not at_text_end and text[end] != '\n':
            back_newline_count = 2
        elif end < len(text) - 1 and text[end+1] != '\n':
            back_newline_count = 1
        else:
            back_newline_count = 0

        replacements.append(
            '\n' * front_newline_count
            + math_mode
            + '\n' * back_newline_count)

    text = replace_string_by_indices(text, math_mode_indices, replacements)
    # Two adjacent math strings are each padded independently, so the gap
    # between them can end up doubled; collapse it back.
    text = text.replace('$  $', '$ $')
    text = text.replace('$$\n\n\n\n$$', '$$\n\n$$')
    text = text.replace('$$\n\n\n$$', '$$\n\n$$')
    return text

In [None]:
#| hide
_padding_cases = [
    # Inline math: padded with single spaces where text touches it.
    ('$hi$', '$hi$'),
    ('$hi$asdf', '$hi$ asdf'),
    ('asdf$hi$', 'asdf $hi$'),
    ('asdf$hi$asdf', 'asdf $hi$ asdf'),
    # Display math: separated from surrounding text by two newlines.
    ('$$hi$$', '$$hi$$'),
    ('asdf$$hi$$', 'asdf\n\n$$hi$$'),
    ('$$hi$$asdf', '$$hi$$\n\nasdf'),
    ('asdf$$hi$$asdf', 'asdf\n\n$$hi$$\n\nasdf'),
    # Adjacent math strings.
    ('$hi$ $hi$', '$hi$ $hi$'),
    ('$hi$$hi$', '$hi$ $hi$'),
    # Disabled case:
    # ('$$hi$$ $$hi$$', '$$hi$$\n\n$$hi$$'),
    ('$$hi$$$$hi$$', '$$hi$$\n\n$$hi$$'),
]
for sample_text, _expected in _padding_cases:
    test_eq(_insert_newline_or_spaces_around_latex(sample_text), _expected)

Currently, the model is inclined to decode and format its summarizations in such a way that creates formatting issues either for LaTeX or `Obsidian.md`. For example, the model would output a str containing

- `\ <command_name>` instead of `\<command_name>`
- `{ ` when `{` is preferable
- `$ <latex_string> $` when `$<latex_string>$` is needed for `Obsidian.md`.

The `fix_autogen_formatting` function attempts to get around some of these issues.

In [None]:
# A backslash separated from its command name by a space is rejoined.
text = r'\ to'
sample_output = fix_autogen_formatting(text)
assert r'\to' in sample_output

# Several glitches at once: `\ <command>`, `{ `, and a space before the
# closing `$`.
text = r'$d\ in\ mathbb{ Z}_{\ geq 0} $'
sample_output = fix_autogen_formatting(text)
for _fragment in (r'\in', r'\mathbb{Z}', r'\geq 0'):
    assert _fragment in sample_output


In [None]:
# Whitespace just inside the `$ ... $` delimiters should be trimmed.
text = r'There are some extra spaces in this math mode string: $  5 + 7 = 12 $.'
sample_output = fix_autogen_formatting(text)
print(sample_output)
for _trimmed_edge in (r'$5', r'12$'):
    assert _trimmed_edge in sample_output

There are some extra spaces in this math mode string: $5 + 7 = 12$ .


In [None]:
# Demo: inline math gets padded with spaces and display math is moved
# onto its own paragraph.
# NOTE(review): `I_\G}` in the sample has an unbalanced brace -- presumably
# a verbatim sample of raw model output; confirm it is intentional.
text = r'the group of $G$-coinvariants of $A$. It is defined as $$A_{G} :=A / I_\G} A$$'
sample_output = fix_autogen_formatting(text)
print(sample_output)

the group of $G$ -coinvariants of $A$ . It is defined as 

$$A_{G} :=A / I_\G} A$$
