In [None]:
#| default_exp helper.latex.processing

# helper.latex.processing
> Functions for processing and heuristically correcting syntactically-invalid LaTeX.



Whereas the formatting module applies deterministic, rule-based cleanup (e.g., managing whitespace), this module tackles deeper syntax errors. It uses validation functions such as `math_mode_string_is_syntactically_valid` (imported here from `trouver.helper.latex.macros_and_commands`) to identify invalid fragments and employs heuristic methods, such as finding the closest valid candidate string via Levenshtein distance, to attempt a correction.

It serves as a key post-processing step in machine learning workflows, acting as the corrective counterpart to the data-degrading functions in `trouver.latex.augment`.


In [None]:
#| export 
import re
from typing import Callable, Iterable, Union

from Levenshtein import distance

from trouver.helper import sublist_generator
from trouver.helper.latex.macros_and_commands import math_mode_string_is_syntactically_valid
from trouver.helper.regex import (
    latex_indices, replace_string_by_indices)


In [None]:
from fastcore.test import test_eq

## Tweak a latex string

## Correct syntax errors in autogenerated math mode strings

In [None]:
#| export
def _tokenize_latex_math(
        latex_string: str
        ) -> list[str]:
    """
    Tokenize `latex_string` by the following principles:

    1. A latex command/macro invoked (but not the inputs) is a token.
    2. the special characters ^ { } _ are tokens.
    3. groups of consecutive whitespaces are tokens.
    4. afterwards, all "words" (one or more consecutive non-whitespace non-special characters) are tokens.
    """
    # Define the regex pattern for tokenization
    pattern = r"""
        (\\[a-zA-Z]+)        # Match LaTeX commands (e.g., \alpha, \sum)
        | ([^\\\s^{}_]+)     # Match words (consecutive non-whitespace, non-special characters)
        | ([^\\\s])          # Match special characters (including ^, {, }, _, etc.)
        | (\s+)              # Match groups of consecutive whitespace
    """
    # Use re.findall to find all matches based on the pattern
    tokens = re.findall(pattern, latex_string, re.VERBOSE)
    # Extract the matched groups, filtering out empty strings
    token_list = [token for group in tokens for token in group if token]
    return token_list


In [None]:
#| hide
# Example usage
latex_string = r"\alpha + \beta^{2} - \gamma_{1} + 3 \times \text{some text}"
tokens = _tokenize_latex_math(latex_string)
expected_tokens = [
    '\\alpha', ' ', '+', ' ',
    '\\beta', '^', '{', '2', '}', ' ', '-', ' ',
    '\\gamma', '_', '{', '1', '}', ' ', '+', ' ',
    '3', ' ', '\\times', ' ',
    '\\text', '{', 'some', ' ', 'text', '}',
]
# The tokenization yields exactly the expected token sequence ...
test_eq(tokens, expected_tokens)
# ... and concatenating the tokens gives back the input string.
test_eq(''.join(tokens), latex_string)

In [None]:
#| export
def _list_of_candidates_from_math_mode_strings(
        main_content: str, # A text of LaTeX code. In practice, this should be the `main content` of an information note, cf. `summarize_notation`.
        syntax_validation: Callable[[str], bool] = math_mode_string_is_syntactically_valid # A test to tell whether a math mode string is syntactically valid.
        ) -> set[str]:
    """
    Return the set of syntactically valid substrings of the latex
    math mode strings found in `main_content`.

    Each math mode string is stripped of its `$` characters at both
    ends and tokenized with `_tokenize_latex_math`. Every sublist of
    tokens produced by `sublist_generator` is joined into a substring,
    and the substrings accepted by `syntax_validation` are collected
    with surrounding whitespace removed.

    None of the elements in the output have delimiters (`$`, `$$`).
    """
    candidates: set[str] = set()
    for start, end in latex_indices(main_content):
        # Drop the math mode delimiters before tokenizing.
        math_body = main_content[start:end].strip('$')
        tokens = _tokenize_latex_math(math_body)
        candidates.update(
            joined.strip()
            for joined in map(''.join, sublist_generator(tokens))
            if syntax_validation(joined))
    return candidates

In [None]:
#| hide
# Collect the candidate substrings of a single inline formula and display
# the resulting set for inspection.
output = _list_of_candidates_from_math_mode_strings(r'$H_{*}(X ; M)=H_{*}(S(X) \otimes M)$', math_mode_string_is_syntactically_valid)
output

{'',
 '(S(X)',
 '(S(X) \\otimes',
 '(S(X) \\otimes M)',
 '(X',
 '(X ;',
 '(X ; M)=H',
 '(X ; M)=H_{*}',
 '(X ; M)=H_{*}(S(X)',
 '(X ; M)=H_{*}(S(X) \\otimes',
 '(X ; M)=H_{*}(S(X) \\otimes M)',
 '*',
 ';',
 '; M)=H',
 '; M)=H_{*}',
 '; M)=H_{*}(S(X)',
 '; M)=H_{*}(S(X) \\otimes',
 '; M)=H_{*}(S(X) \\otimes M)',
 'H',
 'H_{*}',
 'H_{*}(X',
 'H_{*}(X ;',
 'H_{*}(X ; M)=H',
 'H_{*}(X ; M)=H_{*}',
 'H_{*}(X ; M)=H_{*}(S(X)',
 'H_{*}(X ; M)=H_{*}(S(X) \\otimes',
 'H_{*}(X ; M)=H_{*}(S(X) \\otimes M)',
 'M)',
 'M)=H',
 'M)=H_{*}',
 'M)=H_{*}(S(X)',
 'M)=H_{*}(S(X) \\otimes',
 'M)=H_{*}(S(X) \\otimes M)',
 '\\otimes',
 '\\otimes M)',
 '_{*}',
 '_{*}(S(X)',
 '_{*}(S(X) \\otimes',
 '_{*}(S(X) \\otimes M)',
 '_{*}(X',
 '_{*}(X ;',
 '_{*}(X ; M)=H',
 '_{*}(X ; M)=H_{*}',
 '_{*}(X ; M)=H_{*}(S(X)',
 '_{*}(X ; M)=H_{*}(S(X) \\otimes',
 '_{*}(X ; M)=H_{*}(S(X) \\otimes M)',
 '{*}',
 '{*}(S(X)',
 '{*}(S(X) \\otimes',
 '{*}(S(X) \\otimes M)',
 '{*}(X',
 '{*}(X ;',
 '{*}(X ; M)=H',
 '{*}(X ; M)=H_{*}',

In [None]:
#| hide
# Both the full `\operatorname{Gal}` invocation and its bare argument `Gal`
# are expected to show up among the candidates.
output = _list_of_candidates_from_math_mode_strings(
    r'$\operatorname{Gal}(L/K)$',
    math_mode_string_is_syntactically_valid)
assert r'\operatorname{Gal}' in output
assert r'Gal' in output

# NOTE(review): the result of this second call is not asserted on in this
# cell, so it only checks that the call runs without raising.
output = _list_of_candidates_from_math_mode_strings(
    r'$\operatorname{Gal}(L/K) \to G_\ell^\infty$',
    math_mode_string_is_syntactically_valid)

In [None]:
#| hide
# Display the candidates extracted from a text whose math mode strings look
# malformed (doubled backslashes, `\\left` without a matching `\\right`).
# NOTE(review): presumably an autogenerated sample -- confirm its origin.
_list_of_candidates_from_math_mode_strings(r'the signum of the complete factorization $\\text\\in S_n$ into disjoint cycles. It is defined by$$\\operatorname sgn(\\left )=(-1)n-t .$$')

{'',
 ')=(-1)n-t',
 ')=(-1)n-t .',
 '.',
 'S',
 'S_n',
 '\\in',
 '\\in S',
 '\\in S_n',
 '\\operatorname',
 '\\operatorname sgn(',
 '_n',
 'n',
 'sgn('}

In [None]:
#| export
def _find_closest_match(
        math_mode_text: str,
        replacement_candidates: Iterable[str]
        ) -> Union[str, None]:
    """This is a helper function to `correct_latex_syntax_error`."""
    if not replacement_candidates:
        return None
    # Calculate Levenshtein distance for each candidate
    distances = [(candidate, distance(math_mode_text, candidate)) for candidate in replacement_candidates]
    # Find the candidate with the minimum distance
    closest_match = min(distances, key=lambda x: x[1])
    return closest_match[0]

In [None]:
#| hide
test_eq(_find_closest_match('hi', ['hib', 'basdy']), 'hib')

In [None]:
#| export
def correct_latex_syntax_error(
        summary: str, # The autogenerated summary
        replacement_candidates: list[str], # A list of candidates to replace. This is expected to be an output of `_list_of_candidates_from_math_mode_strings`
        # min_length_to_replace_math_mode_string: int = 5, # The minimum length that a math mode string needs to be (exclusing delimiting dollar signs `$`, `$$`) in summary in order to be considered for replacement.
        syntax_validation: Callable[[str], bool] = math_mode_string_is_syntactically_valid # A test to tell whether a math mode string is syntactically  valid.
        ) -> str:
    """
    Return a modified version of `summary` in which each syntactically
    incorrect latex math mode string is replaced with the most closely
    resembling element of `replacement_candidates`.

    Math mode strings that pass `syntax_validation` are left untouched,
    and so is every math mode string if there is no usable (non-blank)
    candidate. A replaced math mode string keeps its original delimiter
    (`$$` if it started with `$$`, `$` otherwise).

    The resemblance is measured between the content of the math mode
    string (delimiters and surrounding whitespace removed) and the
    candidates, since the candidates are expected to carry no delimiters.

    TODO: consider the possibility that not all math mode str delimiters
    are formatted correctly.
    """
    # Materialized because the indices are iterated here and then handed
    # to `replace_string_by_indices` again.
    math_mode_indices = list(latex_indices(summary))
    # A blank candidate would turn e.g. `$...$` into `$$`, so it is never
    # a useful replacement.
    usable_candidates = [
        candidate for candidate in (replacement_candidates or [])
        if candidate.strip()]
    replacements = []
    for start, end in math_mode_indices:
        math_mode_text = summary[start:end]
        # NOTE(review): `syntax_validation` receives the string including
        # its delimiters, as before -- confirm the validator expects that.
        if syntax_validation(math_mode_text) or not usable_candidates:
            replacements.append(math_mode_text)
            continue
        delimiter = '$$' if math_mode_text.startswith('$$') else '$'
        # Compare like with like: drop the delimiters before matching.
        math_mode_body = (
            math_mode_text.removeprefix(delimiter).removesuffix(delimiter).strip())
        replacement = _find_closest_match(math_mode_body, usable_candidates)
        replacements.append(f'{delimiter}{replacement}{delimiter}')
    return replace_string_by_indices(summary, math_mode_indices, replacements)



In [None]:
# The display math string below is malformed: `I_\G}` instead of `I_{G}`.
sample_summary = r'the group of $G$-coinvariants of $A$. It is defined as $$A_{G} :=A / I_\G} A$$'
replacement_candidates = [
    'A',
    'A_',
    'A_{G}',
    'A_{G}:=A',
    'A_{G}:=A',
    'A_{G}:=A /',
    'A_{G}:=A / I_{G}',
    'A_{G}:=A / I_{G} A',
    'H_{0}(G, A)',
    'H_{0}(G, A) \\simeq',
    'H_{0}(G, A) \\simeq A',
    'H_{0}(G, A) \\simeq A_',
    'H_{0}(G, A) \\simeq A_{G}',
]
expected_summary = r'the group of $G$-coinvariants of $A$. It is defined as $$A_{G}:=A / I_{G} A$$'
corrected_summary = correct_latex_syntax_error(sample_summary, replacement_candidates)
# The malformed string is swapped for the closest candidate; the valid
# inline strings `$G$` and `$A$` stay as they are.
test_eq(corrected_summary, expected_summary)

In [None]:
#| hide
# A set of candidates without delimiters, written out literally for this cell.
replacement_candidates = {'',
 '(S(X)',
 '(S(X) \\otimes',
 '(S(X) \\otimes M)',
 '(X',
 '(X ;',
 '(X ; M)=H',
 '(X ; M)=H_{*}',
 '(X ; M)=H_{*}(S(X)',
 '(X ; M)=H_{*}(S(X) \\otimes',
 '(X ; M)=H_{*}(S(X) \\otimes M)',
 '*',
 ';',
 '; M)=H',
 '; M)=H_{*}',
 '; M)=H_{*}(S(X)',
 '; M)=H_{*}(S(X) \\otimes',
 '; M)=H_{*}(S(X) \\otimes M)',
 'H',
 'H_{*}',
 'H_{*}(X',
 'H_{*}(X ;',
 'H_{*}(X ; M)=H',
 'H_{*}(X ; M)=H_{*}',
 'H_{*}(X ; M)=H_{*}(S(X)',
 'H_{*}(X ; M)=H_{*}(S(X) \\otimes',
 'H_{*}(X ; M)=H_{*}(S(X) \\otimes M)',
 'M)',
 'M)=H',
 'M)=H_{*}',
 'M)=H_{*}(S(X)',
 'M)=H_{*}(S(X) \\otimes',
 'M)=H_{*}(S(X) \\otimes M)',
 '\\otimes',
 '\\otimes M)',
 '_{*}',
 '_{*}(S(X)',
 '_{*}(S(X) \\otimes',
 '_{*}(S(X) \\otimes M)',
 '_{*}(X',
 '_{*}(X ;',
 '_{*}(X ; M)=H',
 '_{*}(X ; M)=H_{*}',
 '_{*}(X ; M)=H_{*}(S(X)',
 '_{*}(X ; M)=H_{*}(S(X) \\otimes',
 '_{*}(X ; M)=H_{*}(S(X) \\otimes M)',
 '{*}',
 '{*}(S(X)',
 '{*}(S(X) \\otimes',
 '{*}(S(X) \\otimes M)',
 '{*}(X',
 '{*}(X ;',
 '{*}(X ; M)=H',
 '{*}(X ; M)=H_{*}',
 '{*}(X ; M)=H_{*}(S(X)',
 '{*}(X ; M)=H_{*}(S(X) \\otimes',
 '{*}(X ; M)=H_{*}(S(X) \\otimes M)'}

# The input below has no `$`/`$$` delimiters. Only the spans reported by
# `latex_indices` are ever replaced, so presumably none is found here and
# the text comes back unchanged (cf. the TODO in the function's docstring).
correct_latex_syntax_error('H_{* ; M)', replacement_candidates)

'H_{* ; M)'