In [None]:
#| default_exp helper.latex.macros_and_commands

# helper.latex.macros_and_commands
> LaTeX helper functions for identifying macros and commands (e.g. so that they can be replaced)

In [None]:
#| export

import re
from typing import Union

from pylatexenc.latexwalker import LatexCharsNode, LatexEnvironmentNode, LatexNode, LatexWalker, LatexMacroNode
import regex

from trouver.helper.latex.comments import remove_comments
from trouver.helper.regex import latex_indices
from trouver.helper.latex.core import (
    detect_unbalanced_environments, latex_math_mode_has_soft_syntax_oddities, _detect_backslash_space_curly, _does_not_end_with_script, _has_invalid_left_right_bracket, _has_double_script, _has_unescaped_dollar, _has_double_script_literal, _is_balanced_braces, _is_left_right_balanced)

In [None]:
from fastcore.test import *

# Extracting latex commands

In [None]:
#| export
def extract_latex_commands(
        latex_string: str
        ) -> list[str]:
    """
    Collect the names of all LaTeX commands that occur in `latex_string`.

    The string is parsed with `pylatexenc.latexwalker.LatexWalker` and the
    resulting node tree is walked by `extract_commands_from_nodes`, which
    descends into macro arguments, groups, and environments.

    **Parameters**
    - `latex_string` - str: The LaTeX code to parse.

    **Returns**
    - `list[str]`: The command names without the leading backslash
      (e.g. "frac", "sum", "begin"), in the order in which they are
      encountered; a command used several times is listed several times.
      If parsing raises, the error is printed and an empty list is returned.
    """
    walker = LatexWalker(latex_string)
    try:
        # `get_latex_nodes` returns a 3-tuple; only the node list is needed.
        top_level_nodes, _, _ = walker.get_latex_nodes()
    except Exception as e:
        # Best effort: report the problem and treat the string as command-free.
        print(f"Error parsing LaTeX: {e}")
        return []
    found: list[str] = []
    extract_commands_from_nodes(found, top_level_nodes)
    return found


def extract_commands_from_nodes(
        commands: list[str],
        nodes: list[LatexNode]
        ) -> None:
    """
    Append the command names found in `nodes` (and their descendants) to `commands`.

    Recursive helper for `extract_latex_commands`; `commands` is modified
    in place and nothing is returned.

    **Parameters**
    - `commands` - list[str]: The list that found command names are appended to.
    - `nodes` - list[LatexNode]: Nodes produced by `pylatexenc`.
    """
    for node in nodes:
        # Plain character runs never contain commands.
        if isinstance(node, LatexCharsNode):
            continue

        if isinstance(node, LatexMacroNode):
            commands.append(node.macroname)
            for arg in node.nodeargs:
                # Skip missing (falsy) arguments and plain-character arguments.
                if not arg or isinstance(arg, LatexCharsNode):
                    continue
                if hasattr(arg, 'nodelist'):
                    # e.g. a braced group: descend into its contents.
                    extract_commands_from_nodes(commands, arg.nodelist)
                elif isinstance(arg, LatexMacroNode):
                    # A bare macro used as an argument, e.g. `\text\in`.
                    commands.append(arg.macroname)
        elif isinstance(node, LatexEnvironmentNode):
            # The environment is a single node, so `begin`/`end` are
            # recovered from its verbatim source text.
            commands.extend(_detect_begin_and_end_environments(node.latex_verbatim()))

        # Any node that carries child nodes (groups, environments, math, ...)
        # is searched as well.
        child_nodes = getattr(node, 'nodelist', None)
        if child_nodes is not None:
            extract_commands_from_nodes(commands, child_nodes)

def _detect_begin_and_end_environments(
        latex_string: str
        ) -> list[str]:
    r"""
    Detects explicit `\begin` and `\end` commands in a verbatim environment string.
    
    This is a helper for `extract_commands_from_nodes`. Since `pylatexenc` treats
    environments as single nodes, this function manually checks the raw string
    representation of the environment to ensure the `begin` and `end` commands
    themselves are counted.
    
    **Parameters**
    - `latex_string` - str: The verbatim LaTeX string of an environment node.
    
    **Returns**
    - `list[str]`: A list containing "begin", "end", or both, depending on 
      their presence in the string.
    """
    # Regular expressions to match \begin and \end with optional spaces
    begin_pattern = r'\\\s*begin'
    end_pattern = r'\\\s*end'
    
    # Initialize an empty result list
    result = []
    
    # Check for \begin
    if re.search(begin_pattern, latex_string):
        result.append('begin')
    
    # Check for \end
    if re.search(end_pattern, latex_string):
        result.append('end')
    
    return result


In [None]:


# Each case pairs a LaTeX snippet with the command names that
# `extract_latex_commands` is expected to return for it, in document order.
extraction_cases = [
    # Example usage
    (r"\frac{a}{b}", ['frac']),
    (r"$\frac{a}{b}$", ['frac']),
    (r"\sqrt[n]{x}", ['sqrt']),
    (r"\binom{n}{k}", ['binom']),
    (r"x^2 + y^2", []),  # No commands, just variables
    (r"\overset{a}{b}", ['overset']),

    # Additional tests
    (r"\sum_{i=1}^{n} i", ['sum']),
    (r"\int_{0}^{\infty} e^{-x} dx", ['int', 'infty']),
    (r"\lim_{x \to 0} f(x)", ['lim', 'to']),
    (r"\prod_{i=1}^{n} i", ['prod']),
    (r"\text{Hello} + \frac{1}{2}", ['text', 'frac']),

    # Multiple commands in one string
    (r"\frac{a}{b} + \sqrt{c} + \binom{n}{k}", ['frac', 'sqrt', 'binom']),
    (r"\sum_{i=1}^{n} i + \int_{0}^{\infty} e^{-x} dx", ['sum', 'int', 'infty']),
    (r"\lim_{x \to 0} f(x) = \frac{1}{x}", ['lim', 'to', 'frac']),
    (r"\overset{a}{b} + \underset{c}{d}", ['overset', 'underset']),
    (r"\text{This is } \textbf{bold} + \textit{italic} + \frac{1}{2}", ['text', 'textbf', 'textit', 'frac']),

    # Complex expressions
    (r"\frac{\sum_{i=1}^{n} i}{n} = \frac{n(n+1)}{2}", ['frac', 'sum', 'frac']),
    (r"\int_{0}^{1} x^2 \, dx = \frac{1}{3}", ['int', ',', 'frac']),
    (r"\sqrt{\frac{a}{b}} + \binom{n}{k}", ['sqrt', 'frac', 'binom']),

    # Incorrect or unusual syntax
    (r"\frac{}}", ['frac']),
    (r"\frac{a}{b}{c}", ['frac']),  # Extra argument
    (r"\frac{a}{b + \frac{c}{d}}", ['frac', 'frac']),  # Nested command
    (r"\sum_{i=1}^{n} i + \int_{0}^{\infty} e^{-x} dx = \frac{1}{2}", ['sum', 'int', 'infty', 'frac']),

    # Comment
    (r"%hi", []),

    # Environment Node
    (r"\begin{align} \end{align}", ['begin', 'end']),
    (r"\begin{align}", ['begin']),
    # (r"\ begin{align} \end{align}", [' ']),

    # A macro used directly as the argument of another macro
    (r'\text\in', ['text', 'in']),
]

for latex_snippet, expected_commands in extraction_cases:
    test_eq(extract_latex_commands(latex_snippet), expected_commands)

In [None]:
#| export
# Command tuples for a selection of standard LaTeX commands, in the form that
# `regex_pattern_detecting_command` accepts:
#   (command name, number of parameters, default of the optional argument or `None`, display text)
# The display text (last entry) is `None` throughout: `regex_pattern_detecting_command`
# ignores that entry, and these tuples are only meant for detecting uses of the commands.
REGEX_PATTERN_DETECTIONS = [
    ('frac', 2, None, None),
    ('binom', 2, None, None),
    ('sqrt', 1, '2', None),
    ('overset', 2, None, None),
    ('underset', 2, None, None),
    ('stackrel', 2, None, None),
    ('dfrac', 2, None, None),
    ('cfrac', 2, None, None),
    ('sideset', 3, None, None),
    ('xrightarrow', 1, None, None),
    ('xleftarrow', 1, None, None),
    ('overline', 1, None, None),
    ('bar', 1, None, None),
    ('arccos', 1, None, None),
    ('arcsin', 1, None, None),
    ('arctan', 1, None, None),
    ('arg', 1, None, None),
    ('atop', 2, None, None),
    ('begin', 1, None, None),
    ('boldsymbol', 1, None, None),
    ('breve', 1, None, None),
    ('check', 1, None, None),
    ('cline', 1, None, None),
    ('cos', 1, None, None),
    ('cosh', 1, None, None),
    ('cot', 1, None, None),
    ('csc', 1, None, None),
    ('dddot', 1, None, None),
    ('ddot', 1, None, None),
    ('dot', 1, None, None),
    ('end', 1, None, None),
    ('exp', 1, None, None),
    ('gcd', 2, None, None),
    ('grave', 1, None, None),
    ('hat', 1, None, None),
    # ('int', '1', None, None),
    ('lcm', 2, None, None),
    # ('left', 1, None, None),
    ('lg', 1, None, None),
    ('lim', 1, None, None),
    ('liminf', 1, None, None),
    ('limsup', 1, None, None),
    ('ln', 1, None, None),
    ('log', 1, None, None),
    ('longdiv', 2, None, None),
    ('lvert', 1, None, None),
    ('mapsto', 1, None, None),
    ('mathbb', 1, None, None),
    ('mathbf', 1, None, None),
    ('mathcal', 1, None, None),
    ('mathfrak', 1, None, None),
    ('mathop', 1, None, None),
    ('mathrm', 1, None, None),
    ('mathscr', 1, None, None),
    ('max', 1, None, None),
    ('min', 1, None, None),
    ('multicolumn', 3, 'center', None),
    ('multirow', 3, None, None),
    ('not', 1, None, None),
    ('oint', 1, None, None),
    ('overbrace', 1, None, None),
    ('overleftarrow', 1, None, None),
    ('overleftrightarrow', 1, None, None),
    ('overrightarrow', 1, None, None),
    # ('prod', 1, None, None),
    # ('right', 1, None, None),
    ('rvert', 1, None, None),
    ('sec', 1, None, None),
    ('section', 1, None, None),
    ('sin', 1, None, None),
    ('sinh', 1, None, None),
    ('stackrel', 2, None, None),  # repeats the identical entry near the top of the list
    ('subsection', 2, None, None),
    ('substack', 2, None, None),
    ('subsubsection', 2, None, None),
    # ('sum', 1, None, None),
    ('sup', 1, None, None),
    ('tag', 1, None, None),
    ('tan', 1, None, None),
    ('tanh', 1, None, None),
    ('text', 1, None, None),
    ('textbf', 1, None, None),
    ('textrm', 1, None, None),
    ('tilde', 1, None, None),
    ('underbrace', 1, None, None),
    ('underline', 1, None, None),
    ('underset', 2, None, None),  # repeats the identical entry near the top of the list
    ('varliminf', 1, None, None),
    ('varlimsup', 1, None, None),
    ('vec', 1, None, None),
    ('widehat', 1, None, None),
    ('widetilde', 1, None, None),
    ('xrightarrow', 1, None, None),  # repeats the identical entry near the top of the list
]
# Re-key the list by command name. Repeated entries collapse into a single key
# (they carry identical tuples, so nothing is lost).
temp_dict = {}
for entry in REGEX_PATTERN_DETECTIONS:
    temp_dict[entry[0]] = entry
# From here on `REGEX_PATTERN_DETECTIONS` is a dict mapping a command name to its
# tuple; `temp_dict` remains bound and refers to the very same dict object.
REGEX_PATTERN_DETECTIONS = temp_dict



## Identify macros and commands (to replace)

The following functions were originally written for `latex.formatting`, but were moved here.

In [None]:
#| export
def _argument_detection(group_num: int) -> str:
    r"""
    Helper function to `regex_pattern_detecting_command`, and `_commands_from_def`

    This basically helps detect balanced curly braces for invocations of commands.
    """
    return r"\{((?>[^{}]+|\{(?1)\})*)\}".replace("1", str(group_num))

In [None]:
#| export
def custom_commands(
        preamble: str, # The preamble of a LaTeX document.
        ) -> list[tuple[str, int, Union[str, None], str]]: # Each tuple consists of 1. the name of the custom command 2. the number of parameters 3. The default argument if specified or `None` otherwise, and 4. the display text of the command.
    r"""
    Return the custom commands (and math operators) defined in `preamble`.

    Comments are stripped from `preamble` first, so commented-out definitions
    are ignored. The result lists the commands declared through
    `\newcommand`, `\renewcommand`, and `\DeclareMathOperator`, followed by
    those declared through `\def`.

    Assumes that each `\newcommand` has at most one default parameter
    (LaTeX itself does not allow more than one).
    """
    uncommented_preamble = remove_comments(preamble)
    return (
        _commands_from_newcommand_and_declaremathoperator(uncommented_preamble)
        + _commands_from_def(uncommented_preamble)
    )


def _commands_from_newcommand_and_declaremathoperator(
        preamble: str, # The preamble of a LaTeX document
        ) -> list[tuple[str, int, Union[str, None], str]]: # Each tuple consists of 1. the name of the custom command 2. the number of parameters 3. The default argument if specified or `None` otherwise, and 4. the display text of the command.
    r"""
    Get custom commands from invocations of `\newcommand`, `\renewcommand`,
    and `\DeclareMathOperator` (including their starred forms) in the preamble.

    Helper function to `custom_commands`.

    The command name may be written with braces (`\newcommand{\A}{...}`) or
    without (`\newcommand\A{...}`). When no parameter count is declared the
    count is reported as `0`. The default value of the optional argument may
    be any text not containing `]` (including the empty string); it is `None`
    when no default is declared. The definition body may contain nested,
    balanced braces and may span several lines.
    """
    newcommand_regex = regex.compile(
        # The declaring macro; a trailing `*` (e.g. `\newcommand*`) is accepted.
        r'(?<!%)\s*\\(?:(?:re)?newcommand|DeclareMathOperator)\*?\s*'
        # Group 1: name inside braces; group 2: name without braces.
        r'(?:\{\\\s*(\w+)\s*\}|\\\s*(\w+))\s*'
        # Group 3: parameter count; group 4: default of the optional argument.
        r'(?:\[(\d+)\]\s*(?:\[([^\]]*)\])?)?\s*'
        # Group 5: definition body with balanced braces (recursion into itself).
        r'\{((?>[^{}]+|\{(?5)\})*)\}',
        re.MULTILINE)

    commands = []
    for match in newcommand_regex.finditer(preamble):
        braced_name, bare_name, num_args, default_arg, definition = match.groups()
        # Exactly one of the two name groups participates in a match.
        name = braced_name if braced_name is not None else bare_name
        num_parameters = int(num_args) if num_args is not None else 0
        commands.append((name, num_parameters, default_arg, definition))
    return commands


def _commands_from_def(
        preamble: str
        ) -> list[tuple[str, int, Union[str, None], str]]: # Each tuple consists of 1. the name of the custom command 2. the number of parameters 3. The default argument if specified or `None` otherwise, and 4. the display text of the command.
    r"""
    Get custom commands from invocations of `\def` in the preamble.

    Helper function to `custom_commands`.

    Only definitions of the form `\def\name{...}` are recognized, i.e. the
    braced body must follow the command name directly (whitespace aside).
    Every command found is therefore reported with `0` parameters and no
    default argument. The body may contain nested, balanced braces.
    """
    def_regex = regex.compile(
        r'(?<!%)\s*\\def\s*'        # the `\def` keyword
        + r'\\\s*(\w+)\s*'          # group 1: the name of the command being defined
        + _argument_detection(2)    # group 2: the balanced-brace definition body
    )
    commands = []
    for match in def_regex.finditer(preamble):
        name, definition = match.groups()
        commands.append((name, 0, None, definition))
    return commands


In [None]:
#| hide
# The body keeps its inner pair of braces; only the outermost pair is stripped.
text = r"\def\calh{{\mathcal H}}"
expected_def_commands = [('calh', 0, None, r'{\mathcal H}')]
test_eq(_commands_from_def(text), expected_def_commands)

In [None]:
# A definition body spanning multiple lines (built once so that the preamble
# and the expected display text cannot drift apart).
mat_definition = (
    '\\left[\\begin{array}{cc}#1 & #2 \\\\\n'
    + ' ' * 41
    + '#3 & #4\\end{array}\\right]'
)

# Each case pairs a preamble with the command tuples `custom_commands`
# is expected to extract from it.
custom_command_cases = [
    # Basic
    (r'\newcommand{\con}{\mathcal{C}}',
     [('con', 0, None, r'\mathcal{C}')]),
    # With a parameter
    (r'\newcommand{\field}[1]{\mathbb{#1}}',
     [('field', 1, None, r'\mathbb{#1}')]),
    # With multiple parameters, the first of which has a default value of `2`
    (r'\newcommand{\plusbinomial}[3][2]{(#2 + #3)^#1}',
     [('plusbinomial', 3, '2', r'(#2 + #3)^#1')]),
    # The display text has backslashes `\` and curly braces `{}`
    (r'\newcommand{\beq}{\begin{displaymath}}',
     [('beq', 0, None, r'\begin{displaymath}')]),
    # Basic with spaces in the newcommand declaration
    (r'\newcommand {\con}  {\mathcal{C}}',
     [('con', 0, None, r'\mathcal{C}')]),
    # With a parameter and spaces in the newcommand declaration
    (r'\newcommand   {\field}   [1] {\mathbb{#1}}',
     [('field', 1, None, r'\mathbb{#1}')]),
    # With multiple parameters, a default value, and spaces in the newcommand declaration
    (r'\newcommand {\plusbinomial} [3] [2] {(#2 + #3)^#1}',
     [('plusbinomial', 3, '2', r'(#2 + #3)^#1')]),
    # With a comment `%`; commented out command declarations should not be detected.
    (r'% \newcommand{\con}{\mathcal{C}}',
     []),
    # Spanning multiple lines
    (r'\newcommand{\mat}[4]{' + mat_definition + '}',
     [('mat', 4, None, mat_definition)]),
    # Math operator
    (r'\DeclareMathOperator{\Hom}{Hom}',
     [('Hom', 0, None, 'Hom')]),
    (r'\DeclareMathOperator{\tConf}{\widetilde{Conf}}',
     [('tConf', 0, None, r'\widetilde{Conf}')]),
    # `\def` commands; only the parameterless form `\def\name{...}` is covered here.
    (r'\def\A{{\cO_{K}}}',
     [('A', 0, None, r'{\cO_{K}}')]),
    # newcommand and renewcommand don't require {} for the
    # command name, cf. https://arxiv.org/abs/1703.05365
    (r'\newcommand\A{{\mathbb A}}',
     [('A', 0, None, r'{\mathbb A}')]),
    # A test for https://arxiv.org/abs/0902.4637
    (r'\newcommand{\til}[1]{{\widetilde{#1}}}',
     [('til', 1, None, r'{\widetilde{#1}}')]),
]

for preamble_text, expected_commands in custom_command_cases:
    test_eq(custom_commands(preamble_text), expected_commands)




In [None]:
#| export
def regex_pattern_detecting_command(
        command_tuple: tuple[str, int, Union[None, str], str], # Consists of 1. the name of the custom command 2. the number of parameters 3. The default argument if specified or `None` otherwise, and 4. the display text of the command.
        ) -> regex.Pattern:
    r"""Return a `regex.Pattern` object (not a `re.Pattern` object) detecting
    invocations of the command described by `command_tuple`.

    Only the name, the number of parameters, and whether a default
    (optional) argument exists are used; the display text is ignored.

    Capture groups of the returned pattern:

    - Command with an optional argument: group 1 is the content of `[...]`
      (or `None` when the brackets are omitted) and groups 2, 3, ... are the
      contents of the remaining braced arguments.
    - Command with parameters but no optional argument: groups 1, 2, ... are
      the contents of the braced arguments.
    - Command without parameters: no capture groups.

    Assumes that the curly braces used to write the invocations of the commands
    are balanced and properly nested. Assumes that there are no two commands
    of the same name.
    """
    command_name, num_parameters, optional_arg, _ = command_tuple
    backslash_name = fr"\\{command_name}"
    if optional_arg is not None:
        # The first parameter is the optional one, written as `[...]`.
        optional_argument_detection = r"(?:\[(.*?)\])?"
        # Braced arguments start at group 2 because group 1 is the optional argument.
        trailing_arguments = [_argument_detection(i) for i in range(2, 1+num_parameters)]
        trailing_args_pattern = "\\s*".join(trailing_arguments)
        # Everything after the name may be absent here, so without a boundary
        # the pattern would also match the beginning of a longer command name
        # (e.g. `\R` inside `\Rightarrow`). A LaTeX command name ends at the
        # first non-letter, hence the letters-only lookahead; digits may
        # still follow directly, as in `\sqrt2`.
        pattern = (
            f"{backslash_name}(?![A-Za-z])"
            f"\\s*{optional_argument_detection}\\s*{trailing_args_pattern}")
    elif num_parameters > 0:
        arguments = [_argument_detection(i) for i in range(1, 1+num_parameters)]
        args_pattern = "\\s*".join(arguments)
        pattern = f"{backslash_name}\\s*{args_pattern}"
    else:
        # Match the command name exactly without letters immediately following
        # (but underscores following are okay).
        pattern = rf"{backslash_name}(?![^\W_])"
    return regex.compile(pattern)

    

In [None]:
def _first_invocation(command_tuple, text):
    """Return the first match in `text` of the pattern detecting `command_tuple` (or `None`)."""
    return regex_pattern_detecting_command(command_tuple).search(text)


def _assert_detected(command_tuple, text, expected_invocation):
    """Assert that the first detected invocation in `text` is `expected_invocation`; return the match."""
    match = _first_invocation(command_tuple, text)
    assert match is not None, f"no invocation detected in {text!r}"
    start, end = match.span()
    test_eq(text[start:end], expected_invocation)
    return match


# Basic: a command without parameters
text = r'The number of element of $\Sur(\operatorname{Cl} \mathcal{O}_L, A)$ is ...'
_assert_detected(('Sur', 0, None, r'\mathrm{Sur}'), text, r'\Sur')

# Nested invocations: the outermost invocation is detected as a whole.
frac_tuple = ('frac', 2, None, r'\mathrm{Sur}')
text = r'\frac{\frac{2}{5}}{7}'
_assert_detected(frac_tuple, text, text)

# Unbalanced braces: no invocation is detected at all.
test_is(_first_invocation(frac_tuple, r'\frac{error}{7'), None)

# The outer invocation is unbalanced, but the inner one is still well-formed.
_assert_detected(frac_tuple, r'\frac{\frac{2}{5}}{7', r'\frac{2}{5}')

# One parameter
text = r'\field{Q}'
_assert_detected(('field', 1, None, r'\mathbb{#1}'), text, text)

# Multiple parameters
mat_tuple = ('mat', 4, None, r'\left[\begin{array}{cc}#1 & #2 \\ #3 & #4\end{array}\right]')
text = r'\mat{{123}}{asdfasdf{}{}}{{{}}}{{asdf}{asdf}{}}' # This is a balanced str.
match = _assert_detected(mat_tuple, text, text)
test_eq(match.group(1), r'{123}')

# Multiple parameters, one of which is an optional parameter
plusbinomial_tuple = ('plusbinomial', 3, '2', r'(#2 + #3)^#1')
# When the optional argument is omitted
text = r'\plusbinomial{x}{y}'
_assert_detected(plusbinomial_tuple, text, text)
# When the optional argument is supplied
text = r'\plusbinomial[4]{x}{y}'
_assert_detected(plusbinomial_tuple, text, text)

# One parameter that is optional.
greet_tuple = ('greet', 1, 'world', r'Hello #1!')
# When the optional argument is omitted
text = r'\greet'
_assert_detected(greet_tuple, text, text)
# When the optional argument is supplied
text = r'\greet[govna]'
_assert_detected(greet_tuple, text, text)

# In the following example, `\del` is a command defined as `\delta`.
# Invocations of `\delta` must NOT be detected as invocations of `\del`.
command_tuple = (r'del', 0, None, r'\delta')
_assert_detected(command_tuple, r'\del should be detected.', r'\del')
assert _first_invocation(command_tuple, r'\delta should not be detected.') is None

# In the following example, the command takes one argument, but the argument
# is given without braces. This brace-based pattern does not detect such an
# invocation; see `regex_pattern_detecting_space_separated_command` for that.
command_tuple = ('til', 1, None, '{\\widetilde{#1}}')
test_is(_first_invocation(command_tuple, r'\til \calh_g'), None)


In [None]:
#| export
def regex_pattern_detecting_space_separated_command(
        command_tuple: tuple[str, int, Union[None, str], str]
        ) -> regex.Pattern:
    r"""
    Build a pattern detecting a command whose arguments may be written
    without braces and separated by whitespace (e.g. `\cmd {arg1} x`).

    **Parameters**
    - `command_tuple`: `(command_name, num_args, default_arg, command_definition)`
        - `command_name`: name of the command, without the backslash
        - `num_args`: total number of arguments, the optional one included
        - `default_arg`: default of the optional argument, or `None` if the
          command has no optional argument
        - `command_definition`: not used by this function

    **Returns**
    - `regex.Pattern`: the compiled pattern.

    **Capture groups**
    - A command with `num_args == 0` is delegated to
      `regex_pattern_detecting_command`; the pattern has no capture groups.
    - If `default_arg` is not `None`, group 1 is the text inside `[...]`, or
      `None` when the optional argument is omitted.
    - Every mandatory argument then contributes two consecutive groups, of
      which exactly one is set: first the content of a braced argument,
      second a single unbraced character.

    **Notes**
    - `\b` follows the command name, so a longer command name sharing the
      same prefix is not matched.
    - A braced argument runs up to the first `}`; nested braces are not
      supported.
    - An unbraced argument is exactly one non-whitespace character.
    - Whitespace between the command and its arguments is allowed.

    **Examples**
    1. 2 mandatory arguments: `\cmd {arg1} x`
       gives groups `('arg1', None, None, 'x')`
    2. 1 optional and 1 mandatory argument: `\cmd [opt] {arg}`
       gives groups `('opt', 'arg', None)`
    3. 1 optional and 2 mandatory arguments: `\cmd {a} b`
       gives groups `(None, 'a', None, None, 'b')`
    """
    command_name, num_args, default_arg, _ = command_tuple
    if num_args == 0:
        return regex_pattern_detecting_command(command_tuple)

    has_optional_argument = default_arg is not None
    # The optional argument, when the command has one, counts towards `num_args`.
    optional_part = r'\s*(?:\[([^\]]*)\])?' if has_optional_argument else ''
    mandatory_count = num_args - 1 if has_optional_argument else num_args

    # One mandatory argument: either `{...}` or a single non-space character.
    single_argument = r'(?:\{([^}]*)\}|(\S))'
    mandatory_part = r'\s*'.join([single_argument] * mandatory_count)

    return regex.compile(
        '\\\\' + command_name + '\\b' + optional_part + '\\s*' + mandatory_part)


In [None]:
# A parameterless command `\se` must not be detected inside the longer name `\section`.
command_tuple = ('se', 0, None, r'\section')
pattern1 = regex_pattern_detecting_space_separated_command(command_tuple)
se_match = pattern1.match(r'\section')
test_eq(se_match, None)

In [None]:
# Example 1: Command with 2 mandatory arguments
command_tuple1 = ('cmd', 2, None, r'#1 #2')
pattern1 = regex_pattern_detecting_space_separated_command(command_tuple1)

# Example 2: Command with 1 optional and 1 mandatory argument
command_tuple2 = ('cmd', 2, 'default', r'#1 #2')
pattern2 = regex_pattern_detecting_space_separated_command(command_tuple2)

# Example 3: Command with 1 optional and 2 mandatory arguments
command_tuple3 = ('cmd', 3, 'default', r'#1 #2 #3')
pattern3 = regex_pattern_detecting_space_separated_command(command_tuple3)

# (pattern, invocation, expected capture groups)
example_cases = [
    (pattern1, r'\cmd {arg1} x', ('arg1', None, None, 'x')),
    (pattern2, r'\cmd [opt] {arg}', ('opt', 'arg', None)),
    (pattern2, r'\cmd {arg}', (None, 'arg', None)),
    (pattern3, r'\cmd [opt] {a} b', ('opt', 'a', None, None, 'b')),
    (pattern3, r'\cmd {a} b', (None, 'a', None, None, 'b')),
    (pattern3, r'\cmd [opt] a b', ('opt', None, 'a', None, 'b')),
]

for case_pattern, invocation, expected_groups in example_cases:
    case_match = case_pattern.match(invocation)
    # Fail loudly instead of silently skipping the comparison when nothing matches.
    assert case_match is not None, f"Regex failed to match {invocation!r}"
    test_eq(case_match.groups(), expected_groups)

In [None]:
command_tuple = (r'example', 2, r'default', r'{#1 argument: \{#2\}}')
pattern = regex_pattern_detecting_space_separated_command(command_tuple)

# (invocation, expected capture groups). An unbraced argument is a single
# character, so only the `a` of `asdf` / `a s d f` is captured.
example_invocations = [
    (r'\example{hello}', (None, 'hello', None)),
    (r'\example[custom] asdf', ('custom', None, 'a')),
    (r'\example a s d f', (None, None, 'a')),
]

for invocation, expected_groups in example_invocations:
    found = pattern.match(invocation)
    if found:
        print(found.groups())
        test_eq(found.groups(), expected_groups)

(None, 'hello', None)
('custom', None, 'a')
(None, None, 'a')


In [None]:
def _space_separated_match(command_tuple, invocation):
    """Match `invocation` from its start against the space-separated pattern of `command_tuple`."""
    return regex_pattern_detecting_space_separated_command(command_tuple).match(invocation)


cmd_tuple = ('cmd', 2, None, r'#1 #2')
opt_tuple = ('opt', 2, 'default', r'#1 #2')
multi_tuple = ('multi', 3, None, r'#1 #2 #3')
mix_tuple = ('mix', 4, None, r'#1 #2 #3 #4')
complex_tuple = ('complex', 3, 'default value', r'#1 #2 #3')
implicit_tuple = ('implicit', 2, 'default', r'#1 #2')

# (command tuple, invocation, expected capture groups)
groups_cases = [
    # Test 1: Basic command with 2 arguments
    (cmd_tuple, r'\cmd {arg1} {arg2}', ('arg1', None, 'arg2', None)),
    (cmd_tuple, r'\cmd a b', (None, 'a', None, 'b')),
    (cmd_tuple, r'\cmd {a} b', ('a', None, None, 'b')),
    # Test 2: Command with optional argument
    (opt_tuple, r'\opt {custom}', (None, 'custom', None)),
    (opt_tuple, r'\opt [arg] {custom}', ('arg', 'custom', None)),
    (opt_tuple, r'\opt a', (None, None, 'a')),
    # Test 3: Command with multiple arguments
    (multi_tuple, r'\multi {a b} {c d} {e f}', ('a b', None, 'c d', None, 'e f', None)),
    (multi_tuple, r'\multi a b c', (None, 'a', None, 'b', None, 'c')),
    (multi_tuple, r'\multi {a} b {c}', ('a', None, None, 'b', 'c', None)),
    # Test 5: Command with mixed argument types
    (mix_tuple, r'\mix {arg1} a {arg3} b', ('arg1', None, None, 'a', 'arg3', None, None, 'b')),
    (mix_tuple, r'\mix a {arg2} c d', (None, 'a', 'arg2', None, None, 'c', None, 'd')),
    # Test 6: Command with more complex default argument
    (complex_tuple, r'\complex {a} {b}', (None, 'a', None, 'b', None)),
    (complex_tuple, r'\complex [custom] a b', ('custom', None, 'a', None, 'b')),
    (complex_tuple, r'\complex a b', (None, None, 'a', None, 'b')),
    # Test 9: Command whose optional argument may be left implicit
    (implicit_tuple, r'\implicit {arg}', (None, 'arg', None)),
    (implicit_tuple, r'\implicit [custom] {arg}', ('custom', 'arg', None)),
    (implicit_tuple, r'\implicit a b', (None, None, 'a')),
]

for command_tuple, invocation, expected_groups in groups_cases:
    m = _space_separated_match(command_tuple, invocation)
    # A failed match must fail the test rather than skip the comparison.
    assert m is not None, f"Regex failed to match {invocation!r}"
    test_eq(m.groups(), expected_groups)

# Test 4: Command with no arguments; only the command name itself is matched.
noarg_tuple = ('noarg', 0, None, r'')
for invocation in (r'\noarg', r'\noarg '):
    m = _space_separated_match(noarg_tuple, invocation)
    assert m is not None, f"Regex failed to match {invocation!r}"
    test_eq(m.group(), r'\noarg')

# Test 7: `match` anchors at the start of the string.
test_eq(_space_separated_match(cmd_tuple, r'not a command \cmd {arg1} {arg2}'), None)
# Test 8: a longer command name sharing the prefix is not an invocation.
test_eq(_space_separated_match(cmd_tuple, r'\cmdextra {arg1} {arg2}'), None)

## Validate latex commands

In [None]:
#| export
def detect_incorrect_latex_commands(
        latex_string: str,
        ) -> bool:
    """
    Return `True` if there is at least one syntactically
    incorrect use of a latex command detected in `latex_string`.

    This is a helper function to `math_mode_string_is_syntactically_valid`.

    Only commands that have an entry in `temp_dict` are checked; every
    other command reported by `extract_latex_commands` is ignored. For each
    checked command, every occurrence of a backslash followed by the command
    name must be the start of a match of the pattern that
    `regex_pattern_detecting_command` builds from the command's
    `temp_dict` entry.

    **Parameters**
    - `latex_string` - str: The LaTeX code to inspect.

    **Returns**
    - `bool`: `True` if some occurrence of a checked command does not match
      its pattern, `False` otherwise.
    """
    for command in set(extract_latex_commands(latex_string)):
        if command not in temp_dict:
            continue
        pattern = regex_pattern_detecting_command(temp_dict[command])
        # Locate each invocation of the command. The name is escaped so that
        # regex metacharacters in it are taken literally, and the negative
        # lookahead keeps a longer command sharing the same prefix (e.g.
        # `\textbf` while checking `\text`) from being mistaken for an
        # invocation of this command.
        invocation_pattern = rf"\\\s*{re.escape(command)}(?![a-zA-Z])"
        for invocation in re.finditer(invocation_pattern, latex_string):
            # The invocation is properly used only if the full command
            # pattern matches starting exactly at the invocation.
            trailing_substring = latex_string[invocation.start():]
            if not pattern.match(trailing_substring):
                return True
    return False

In [None]:
#| hide
    
# Tests for `detect_incorrect_latex_commands`
# Correct usage: every command receives all of its arguments
assert not detect_incorrect_latex_commands(r'\frac{a}{b}')
assert not detect_incorrect_latex_commands(r'\binom{n}{k}')
assert not detect_incorrect_latex_commands(r'\sqrt[n]{x}')
assert not detect_incorrect_latex_commands(r'\overset{a}{b}')
assert not detect_incorrect_latex_commands(r'\underset{a}{b}')
assert not detect_incorrect_latex_commands(r'\stackrel{a}{b}')
assert not detect_incorrect_latex_commands(r'\dfrac{a}{b}')
assert not detect_incorrect_latex_commands(r'\cfrac{1}{1+\cfrac{1}{x}}')
assert not detect_incorrect_latex_commands(r'\xleftarrow{text}')
assert not detect_incorrect_latex_commands(r'\xrightarrow{text}')
assert not detect_incorrect_latex_commands(r'\left( \right.')
assert not detect_incorrect_latex_commands(r'\overbrace{x+y+z}^{\text{sum}}')
assert not detect_incorrect_latex_commands(r'\underbrace{x+y+z}_{\text{sum}}')
assert not detect_incorrect_latex_commands(r'\overbrace{x+y+z}')
assert not detect_incorrect_latex_commands(r'\underbrace{x+y+z}')

# Incorrect usage (missing arguments)
assert detect_incorrect_latex_commands(r'\frac{a}')
assert detect_incorrect_latex_commands(r'\binom{n}')
assert detect_incorrect_latex_commands(r'\overset{a}')
assert detect_incorrect_latex_commands(r'\underset{a}')
assert detect_incorrect_latex_commands(r'\stackrel{a}')
assert detect_incorrect_latex_commands(r'\dfrac{a}')
assert detect_incorrect_latex_commands(r'\cfrac{1}')
assert detect_incorrect_latex_commands(r'\sideset{_1^2}')

# Extra brace groups after a complete invocation are technically okay
# and must not be flagged
assert not detect_incorrect_latex_commands(r'\frac{a}{b}{c}')
assert not detect_incorrect_latex_commands(r'\binom{n}{k}{m}')
assert not detect_incorrect_latex_commands(r'\overset{a}{b}{c}')

# Mixed correct and incorrect usage: one bad invocation is enough
assert detect_incorrect_latex_commands(r'\frac{a}{b} + \frac{c}')
assert detect_incorrect_latex_commands(r'\binom{n}{k} \cdot \binom{m}')

# `\text` directly followed by another command instead of an argument
assert detect_incorrect_latex_commands(r'\text\in')


In [None]:
#| export
def check_unescaped_dollar(txt: str) -> bool:
    """
    Return `True` if dollar signs are validly used in `txt`.

    `txt` passes when `_has_unescaped_dollar` reports no unescaped dollar
    sign. Otherwise it passes only when `latex_indices` finds exactly one
    math mode span and that span runs from index 0 to `len(txt)`.
    """
    if not _has_unescaped_dollar(txt):
        # Nothing to validate: no unescaped dollar sign is present.
        return True

    spans = latex_indices(txt)
    if len(spans) != 1:
        return False

    # The single math mode span must cover the entire string.
    span_start, span_end = spans[0][0], spans[0][1]
    if span_start != 0 or span_end != len(txt):
        return False
    return True


In [None]:
from fastcore.test import *

# --- Happy Paths (Should Return True) ---

# Case 1: String with no dollar signs
test_eq(check_unescaped_dollar(r"This is a standard text string."), True)

# Case 2: String with properly escaped dollar signs
test_eq(check_unescaped_dollar(r"The price is \$100."), True)

# Case 3: Multiple escaped dollar signs
test_eq(check_unescaped_dollar(r"Here is \$1 and there is \$2."), True)

# Case 4: Escaped dollar at the very beginning or end
test_eq(check_unescaped_dollar(r"\$Start and end\$"), True)


# --- Failure Paths (Should Return False) ---

# Case 5: Single unescaped dollar sign
test_eq(check_unescaped_dollar(r"This contains an unescaped $ symbol."), False)

# Case 6: Unescaped dollar at the start of the string
test_eq(check_unescaped_dollar(r"$100 is the price."), False)

# Case 7: Mixed escaped and unescaped dollars
# The function should catch the second, unescaped dollar
test_eq(check_unescaped_dollar(r"This is valid \$ but this is not $."), False)

# Case 8: Double dollars used for display math, preceded by plain text
# NOTE(review): the function does not flag every unescaped $; presumably this
# fails because the math mode span does not cover the whole string — confirm.
test_eq(check_unescaped_dollar(r"Display math $$ x^2 $$"), False)


In [None]:
#| export
def math_mode_string_is_syntactically_valid(
        text: str,
        ) -> bool:
    """
    Return `True` if no syntax problem is detected in `text`, viewed
    as a latex str.

    Leading and trailing whitespace is stripped before checking. The checks
    run in a fixed order and evaluation stops at the first one that fails.
    There may be TeX syntax rules beyond the scope of this function.

    Some caveats about dollar signs `$`:

    - `text` is allowed to have dollar signs and is also allowed to not
      have them; a string without dollar signs may still yield `True`.
    - A string with dollar signs may yield `False` if the entire string is
      not a singular math mode string or if the dollar signs are not used
      in a math-mode-valid way.
    """
    stripped = text.strip()

    # Validators return a truthy value when their check passes.
    validators = (
        check_unescaped_dollar,
        _does_not_end_with_script,
        _is_balanced_braces,
        _is_left_right_balanced,
    )
    for validator in validators:
        if not validator(stripped):
            return False

    # Detectors return a truthy value when they find a problem.
    detectors = (
        _detect_backslash_space_curly,
        _has_invalid_left_right_bracket,
        _has_double_script,
        _has_double_script_literal,
        detect_incorrect_latex_commands,
        detect_unbalanced_environments,  # bool([]) is False
    )
    for detector in detectors:
        if detector(stripped):
            return False

    # The string is valid only when every check passed.
    return True




In [None]:
# Dangling `^` right before the closing `$$`
assert not math_mode_string_is_syntactically_valid(r'$$n=p_1^{e_1} p_2^{e_2} \cdots p_k^$$')
# Mismatched or unbalanced dollar sign delimiters
assert not math_mode_string_is_syntactically_valid(r'$x^2 + y^2')
assert not math_mode_string_is_syntactically_valid(r'$$x^2 + y^2$')
assert not math_mode_string_is_syntactically_valid(r'$$x^2 + y^2$ $')
# The same content without and with delimiters
assert math_mode_string_is_syntactically_valid(r'hi')
assert math_mode_string_is_syntactically_valid(r'$hi$')
assert not math_mode_string_is_syntactically_valid(r'$hi$$')
assert math_mode_string_is_syntactically_valid(r'$\\dim ^ a$')
# Unescaped vs. escaped opening brace
assert not math_mode_string_is_syntactically_valid(r'{ hi')
assert math_mode_string_is_syntactically_valid(r'\{ hi')
assert math_mode_string_is_syntactically_valid(r'\ [')
# `\left` / `\right` usage
assert math_mode_string_is_syntactically_valid(r'\left( \right.')
assert not math_mode_string_is_syntactically_valid(r'\left \right.')
assert math_mode_string_is_syntactically_valid(r'$$\left|\sum_{i=0} \right|$$')
assert math_mode_string_is_syntactically_valid(r'$\\\$$')
# Environments must be closed
assert not math_mode_string_is_syntactically_valid(r'\begin{enumerate}')
assert math_mode_string_is_syntactically_valid(r'\begin{enumerate} asdf \end{enumerate}')
assert not math_mode_string_is_syntactically_valid(r'$$R=\sum_P\in X\operatorname length\left(\Omega__X / Y\right)_p\cdot P$$')
# I had previously thought the following to be syntactically incorrect, but it actually is syntactically correct.
# NOTE(review): the assertion below nevertheless expects this input to be
# flagged as incorrect, which contradicts the note above — confirm which
# behavior is intended.
assert detect_incorrect_latex_commands(r'\sideset{_1^2}{_3^4}\sum')

# Displayed as the cell output rather than asserted
math_mode_string_is_syntactically_valid(r'\text\in')

False

The `math_mode_string_is_syntactically_valid` function experimentally assesses whether a given math mode LaTeX string is syntactically valid. In principle, this means that any LaTeX syntax error caused by the string should be detected by the function.

TODO: consider handling the following cases:


Unescaped % sign (starts a comment):
`$x = 50% of y$`

Using `\!` (negative thin space) at the beginning of math mode:
`$\!x + y$`

The following lists some example outputs of the `math_mode_string_is_syntactically_valid` function along with explanations.

Unmatched curly braces are a common syntactical error:

In [None]:
# The final `}` has no matching opening brace.
assert not math_mode_string_is_syntactically_valid(r'\sqrt{x}}')

However, using `\{` or `\}` does not count towards curly bracket matching:

In [None]:
# The only brace in the input is the escaped `\{`.
assert math_mode_string_is_syntactically_valid(r'\{hi')

On the other hand, a backslash `\` followed by spaces ` ` and then by a curly bracket is itself invalid syntax.

In [None]:
# A backslash, then a space, then an unescaped `{`.
assert not math_mode_string_is_syntactically_valid(r'\ {hi')

`math_mode_string_is_syntactically_valid` will consider the validity of a string whether or not the string has math mode delimiters. 

In [None]:
# The same expression without and with `$` delimiters.
assert math_mode_string_is_syntactically_valid(r'\operatorname{Gal}')
assert math_mode_string_is_syntactically_valid(r'$\operatorname{Gal}$')

However, `math_mode_string_is_syntactically_valid` returns `False` if the string has dollar sign delimiters and more than one math mode string is detected in the string (use `latex_indices` to separate out math mode strings), or if the dollar sign delimiters are unbalanced:

In [None]:
# More than one math mode string is present.
assert not math_mode_string_is_syntactically_valid('$hi$ $bye$')
# The math mode delimiter `$` is unbalanced: it is opened but never closed.
assert not math_mode_string_is_syntactically_valid(r'$x^2 + y^2')
# The math mode delimiters `$$` and `$` are unbalanced: opened with `$$`, closed with `$`.
assert not math_mode_string_is_syntactically_valid(r'$$x^2 + y^2$')

In [None]:
#| export
# def math_mode_string_is_syntactically_clean(
#         text: str,
#         ) -> bool:
#     """
#     Return `True` if `text` is syntactically "clean" as a LaTeX math mode str.
    
#     While the precise meaning of this may be subjective, here we will
#     consider `text` to be clean, assuming that it is syntactically valid, if

#     - It does not have double backslashes
#     """
#     if r'\\' in text:
#         return False

### Detect syntax errors along with "soft" syntactical oddities

In [None]:
#| export
def math_mode_string_has_soft_or_hard_syntax_errors(text: str) -> bool:
    """
    Report whether `text` has a hard syntax error or a soft syntax oddity.

    Returns `True` when `math_mode_string_is_syntactically_valid` rejects
    `text`; otherwise returns whatever
    `latex_math_mode_has_soft_syntax_oddities` reports for `text`.
    """
    if not math_mode_string_is_syntactically_valid(text):
        return True
    return latex_math_mode_has_soft_syntax_oddities(text)

In [None]:

# --- Tests for math_mode_string_has_soft_or_hard_syntax_errors ---
# Checks for balanced curly braces {} AND soft syntax

def test_math_mode_string_has_soft_or_hard_syntax_errors_valid():
    """Expect False for inputs with neither hard nor soft syntax problems."""
    clean_inputs = (
        r"\mathcal{F}",
        r"\{ x \}",  # escaped braces only
        r"\frac{a}{b}",
    )
    for clean in clean_inputs:
        test_eq(math_mode_string_has_soft_or_hard_syntax_errors(clean), False)

def test_math_mode_string_has_soft_or_hard_syntax_errors_hard_errors():
    """Expect True for inputs whose curly braces are unbalanced."""
    unbalanced_inputs = (
        r"\mathcal{F",  # opening brace never closed
        r"}",           # closing brace never opened
        r"{a, b",       # opening brace never closed
    )
    for unbalanced in unbalanced_inputs:
        test_eq(math_mode_string_has_soft_or_hard_syntax_errors(unbalanced), True)

def test_math_mode_string_has_soft_or_hard_syntax_errors_soft_errors():
    """Expect True when curly braces balance but parens/brackets do not."""
    # Balanced braces {}, but an unclosed paren (
    unclosed_paren = r"\text{Func}(x"
    # Balanced braces {}, but an unclosed bracket [
    # Note: this might fail the hard check too, depending on implementation
    unclosed_bracket = r"x_{ [i }"

    for oddity in (unclosed_paren, unclosed_bracket):
        test_eq(math_mode_string_has_soft_or_hard_syntax_errors(oddity), True)

def test_math_mode_string_has_soft_or_hard_syntax_errors_mixed_errors():
    """Expect True for a braced macro followed by an unclosed parenthesis."""
    macro_with_unclosed_paren = r"\mathcal{F}(x"
    outcome = math_mode_string_has_soft_or_hard_syntax_errors(macro_with_unclosed_paren)
    test_eq(outcome, True)

# Run each test function; `test_eq` raises on the first mismatch.
test_math_mode_string_has_soft_or_hard_syntax_errors_valid()
test_math_mode_string_has_soft_or_hard_syntax_errors_hard_errors()
test_math_mode_string_has_soft_or_hard_syntax_errors_soft_errors()
test_math_mode_string_has_soft_or_hard_syntax_errors_mixed_errors()

NameError: name 'math_mode_string_is_syntactically_valid' is not defined