In [None]:
#| default_exp helper.latex

# helper.latex
> Helper functions for LaTeX functionality


In [None]:
#| export
import re

from trouver.helper.regex import latex_indices

In [None]:
from fastcore.test import *

## Validity of latex syntax

We require some functions to evaluate whether a latex math mode string is syntactically valid.

In [None]:
#| export
def _is_balanced_braces(s):
    """
    This is a helper function to `math_mode_string_is_syntactically_valid`.

    Note that curly braces (`{`, `}`) that have 
    """
    stack = []
    escaped = False
    
    for _, char in enumerate(s):
        if char == '\\':
            escaped = True
        elif char == '{' and not escaped:
            stack.append(char)
        elif char == '}' and not escaped:
            if not stack:
                return False
            stack.pop()
        else:
            escaped = False
    
    return len(stack) == 0


In [None]:
#| hide
# Strings whose unescaped braces pair up; escaped braces are not counted.
for balanced in ('{{}}', '{asdf_{}}', r'\{hi', r'\{{hi}'):
    assert _is_balanced_braces(balanced)

# Strings with an unescaped brace that has no partner.
for unbalanced in ('{hi', '{hi asdf}}', r'\{{hi\}'):
    assert not _is_balanced_braces(unbalanced)

In [None]:
#| export
def _detect_backslash_space_curly(
        text: str
        ) -> bool:
    """
    Return `True` if there is some backslash `\` followed
    by spaces and then followed by curly brackets `{`

    Note that the presence of such a combination of text
    will induce a syntax error in LaTeX math mode string.

    This is a helper function of `math_mode_string_is_syntactically_valid`
    """
    pattern = r'\\\s+[{}]'
    match = re.search(pattern, text)
    return bool(match)

In [None]:
#| hide
# A backslash, then whitespace, then either curly bracket is detected.
for flagged in (r'\ {', r'\ }'):
    assert _detect_backslash_space_curly(flagged)

# No whitespace between backslash and bracket, or no backslash at all.
for not_flagged in (r'\{', r'{'):
    assert not _detect_backslash_space_curly(not_flagged)

In [None]:
#| export
def math_mode_string_is_syntactically_valid(
        text: str,
        ) -> bool:
    """
    Return `True` if `text` passes every syntax check implemented here
    for a LaTeX math mode str.

    TeX has syntax rules beyond the ones checked, so a `True` result does
    not guarantee that `text` is free of syntax errors.

    `text` may or may not contain dollar signs `$`; either way the result
    can be `True` or `False`. The checks, in the order they are evaluated:

    - `latex_indices(text)` must not have more than one entry, i.e. `text`
      must not be detected as containing several math mode strings.
    - `text` must not contain a backslash followed by whitespace and then
      a curly bracket.
    - The unescaped curly braces of `text` must be balanced.
    """
    # Short-circuits in the same order as the checks listed above.
    violates_some_rule = (
        len(latex_indices(text)) > 1
        or _detect_backslash_space_curly(text)
        or not _is_balanced_braces(text)
    )
    return not violates_some_rule



The `math_mode_string_is_syntactically_valid` function experimentally assesses whether a given math mode LaTeX string is syntactically valid. In principle, this should mean that any LaTeX syntax error caused by the string is detected by the function.

TODO: consider detecting the following kinds of syntax errors as well:

Unclosed dollar sign:
`$x^2 + y^2`

Mismatched delimiters:
`$$x^2 + y^2$`

Missing closing parenthesis:
`\left(x^2 + y^2`

Missing argument for command:
`$\frac{a}$`

Double subscript:
`$x_1_2$`
Double superscript:
`$x^1^2$`

Unescaped % sign (starts a comment):
`$x = 50% of y$`

Mismatched environment:
`\begin{equation} x = y \end{align}`

Using `\!` (negative space) at the beginning of math mode:
`$\!x + y$`

The following lists some example outputs of the `math_mode_string_is_syntactically_valid` function along with explanations.

Unmatched curly braces are a common syntactical error:

In [None]:
# The final `}` has no matching `{`.
extra_closing_brace = r'\sqrt{x}}'
assert not math_mode_string_is_syntactically_valid(extra_closing_brace)

However, using `\{` or `\}` does not count towards curly bracket matching:

In [None]:
# The only brace here is escaped, so there is nothing left to match.
escaped_brace_only = r'\{hi'
assert math_mode_string_is_syntactically_valid(escaped_brace_only)

On the other hand, a backslash `\` followed by one or more spaces and then a curly bracket is itself treated as invalid syntax.

In [None]:
# A space sits between the backslash and the curly bracket.
backslash_space_brace = r'\ {hi'
assert not math_mode_string_is_syntactically_valid(backslash_space_brace)

`math_mode_string_is_syntactically_valid` will consider the validity of a string whether or not the string has math mode delimiters. 

In [None]:
# Raw strings keep the backslash literal without relying on `\o` being an
# unrecognized escape sequence, which newer Python versions warn about.
assert math_mode_string_is_syntactically_valid(r'\operatorname{Gal}')
assert math_mode_string_is_syntactically_valid(r'$\operatorname{Gal}$')

However, `math_mode_string_is_syntactically_valid` returns `False` if the string has dollar sign delimiters and more than one math mode string is detected in the string (see `latex_indices`).

TODO: example of multiple math mode strings in a single test
TODO: example of an unclosed dollar sign
TODO: example of mismatched delimiters


assert 

In [None]:
#| export
def math_mode_string_is_syntactically_clean(
        text: str,
        ) -> bool:
    r"""
    Return `True` if `text` is syntactically "clean" as a LaTeX math mode str.

    While the precise meaning of this may be subjective, here we will
    consider `text` to be clean, assuming that it is syntactically valid, if

    - It does not have double backslashes (`\\`)

    This function does not itself check whether `text` is syntactically
    valid.
    """
    # Return an explicit bool in both cases so that a clean string yields
    # `True` rather than `None`.
    return r'\\' not in text

## Tweak a latex string

Sometimes, when autogenerating a latex string through an ML model, some minor formatting eyesores occur, such as a curly bracket `{` or an underscore `_` followed by an unnecessary space. We provide some functions to fix such formatting.

In [None]:
#| export
def reduce_unnecessary_spaces(
        text: str
        ) -> str:
    """
    Return a copy of `text` with whitespace removed wherever it is
    unnecessary for the purposes of reading the string as a LaTeX string.

    Two passes are made, in this order:

    1. Whitespace directly after an opening curly bracket, an underscore,
       a caret, or a backslash is removed.
    2. Whitespace directly before a closing curly bracket, an underscore,
       or a caret is removed.

    Whitespace before a backslash or before an opening curly bracket is
    left in place.
    """
    trailing_trimmed = re.sub(r'([{_^\\])\s+', r'\1', text)
    return re.sub(r'\s+([}_^])', r'\1', trailing_trimmed)

In [None]:

# Each pair is (input, expected output).
# In the first pair the space before the backslash `\` is kept: it might not
# be necessary or desirable to eliminate it.
examples = [
    (r'something something \  operatorname', r'something something \operatorname'),
    (r'\operatorname{Res} ^ G_ H (R)', r'\operatorname{Res}^G_H (R)'),
    (r'\operatorname{Res}^{ G}_{ H } (R)', r'\operatorname{Res}^{G}_{H} (R)'),
]
for raw, expected in examples:
    test_eq(reduce_unnecessary_spaces(raw), expected)