In [None]:
#| default_exp helper.latex.macros_and_commands

# helper.latex.macros_and_commands
> Functions for identifying LaTeX macros and custom commands (so that their invocations can be replaced)

In [None]:
#| export

import re
from typing import Union

import regex

from trouver.helper.latex.comments import remove_comments

In [None]:
from fastcore.test import *

## Identify macros and commands (to replace)

The following functions were originally written for `latex.formatting`, but were moved here.

In [None]:
#| export
def _argument_detection(group_num: int) -> str:
    r"""
    Helper function to `regex_pattern_detecting_command`, and `_commands_from_def`

    This basically helps detect balanced curly braces for invocations of commands.
    """
    return r"\{((?>[^{}]+|\{(?1)\})*)\}".replace("1", str(group_num))

In [None]:
#| export
def custom_commands(
        preamble: str, # The preamble of a LaTeX document.
        ) -> list[tuple[str, int, Union[str, None], str]]: # Each tuple consists of 1. the name of the custom command 2. the number of parameters 3. The default argument if specified or `None` otherwise, and 4. the display text of the command.
    r"""
    Return the custom commands (and math operators) defined in `preamble`.

    The result is a list with one tuple per definition found: first those
    introduced with `\newcommand`, `\renewcommand` or `\DeclareMathOperator`,
    then those introduced with `\def`.

    Assumes that each newcommand has at most one default parameter
    (newcommands with multiple default parameters are not valid in LaTeX).

    Comments are removed from `preamble` before searching, so commented-out
    definitions are ignored.
    """
    uncommented = remove_comments(preamble)
    return (_commands_from_newcommand_and_declaremathoperator(uncommented)
            + _commands_from_def(uncommented))


def _commands_from_newcommand_and_declaremathoperator(
        preamble: str, # The preamble of a LaTeX document
        ) -> list[tuple[str, int, Union[str, None], str]]: # Each tuple consists of 1. the name of the custom command 2. the number of parameters 3. The default argument if specified or `None` otherwise, and 4. the display text of the command.
    r"""
    Get custom commands from invocations of `\newcommand`, `\renewcommand`
    and `\DeclareMathOperator` (including their starred forms) in the preamble.

    Helper function to `custom_commands`.

    The command name may be written with braces (`\newcommand{\A}{...}`) or
    without (`\newcommand\A{...}`). The number of parameters is taken from the
    optional `[n]` and is `0` when absent. The default value of the first
    parameter is taken from a second optional `[...]`; it may be empty or
    contain any character other than `]`, and is `None` when absent. The
    display text is the content of the final balanced `{...}` group.
    """
    newcommand_regex = regex.compile(
        # Command keyword, optionally starred.
        r'(?<!%)\s*\\(?:(?:re)?newcommand|DeclareMathOperator)\*?\s*'
        # Group 1: name inside braces; group 2: name without braces.
        r'(?:\{\\\s*(\w+)\s*\}|\\\s*(\w+))\s*'
        # Group 3: number of parameters; group 4: default of the first parameter.
        r'(?:\[(\d+)\]\s*(?:\[([^\]]*)\])?)?\s*'
        # Group 5: the definition, with balanced nested braces.
        r'\{((?>[^{}]+|\{(?5)\})*)\}')

    commands = []
    for match in newcommand_regex.finditer(preamble):
        braced_name, bare_name, num_args, default_arg, definition = match.groups()
        # Exactly one of the two name groups participates in a match.
        name = braced_name if braced_name is not None else bare_name
        # No `[n]` means the command takes no parameters.
        num_parameters = int(num_args) if num_args is not None else 0
        commands.append((name, num_parameters, default_arg, definition))
    return commands


def _commands_from_def(
        preamble: str
        ) -> list[tuple[str, int, Union[str, None], str]]: # Each tuple consists of 1. the name of the custom command 2. the number of parameters 3. The default argument if specified or `None` otherwise, and 4. the display text of the command.
    r"""
    Get custom commands from invocations of `\def` in the preamble.

    Helper function to `custom_commands`.

    Recognizes `\def\name{...}` as well as definitions with undelimited
    parameters written directly after the name, e.g. `\def\name#1#2{...}`.
    The number of parameters is the number of `#n` markers. Definitions with
    delimited parameter text (e.g. `\def\name(#1){...}`) are not matched.
    `\def` has no notion of a default argument, so the third tuple entry is
    always `None`.
    """
    def_command_identifying = r'(?<!%)\s*\\def\s*'
    command_name_identifying = r'\\\s*(\w+)\s*'
    # Group 2: undelimited parameter text such as `#1#2`; it may be empty. No
    # whitespace is allowed inside or after it, because a space there would
    # act as a parameter delimiter in TeX.
    parameter_text = r'((?:#[1-9])*)'
    # Group 3: the replacement text, with balanced nested braces.
    command_def = _argument_detection(3)
    def_regex = regex.compile(
        f"{def_command_identifying}{command_name_identifying}"
        f"{parameter_text}{command_def}"
    )
    return [(match.group(1), match.group(2).count('#'), None, match.group(3))
            for match in def_regex.finditer(preamble)]


In [None]:
#| hide
# A `\def` whose replacement text is itself wrapped in braces: only the
# outermost pair of braces is stripped from the reported display text.
text = r"\def\calh{{\mathcal H}}"
test_eq(_commands_from_def(text), [('calh', 0, None, '{\\mathcal H}')])

In [None]:
# Basic
text_1 = r'\newcommand{\con}{\mathcal{C}}'
test_eq(custom_commands(text_1), [('con', 0, None, r'\mathcal{C}')])

# With a parameter
text_2 = r'\newcommand{\field}[1]{\mathbb{#1}}'
test_eq(custom_commands(text_2), [('field', 1, None, r'\mathbb{#1}')])

# With multiple parameters, the first of which has a default value of `2`
text_3 = r'\newcommand{\plusbinomial}[3][2]{(#2 + #3)^#1}'
test_eq(custom_commands(text_3), [('plusbinomial', 3, '2', r'(#2 + #3)^#1')])

# The display text has backslashes `\` and curly braces `{}`
text_4 = r'\newcommand{\beq}{\begin{displaymath}}'
test_eq(custom_commands(text_4), [('beq', 0, None, '\\begin{displaymath}')])


# Basic with spaces in the newcommand declaration
text_6 = r'\newcommand {\con}  {\mathcal{C}}'
test_eq(custom_commands(text_6), [('con', 0, None, r'\mathcal{C}')])

# With a parameter and spaces in the newcommand declaration
text_7 = r'\newcommand   {\field}   [1] {\mathbb{#1}}'
test_eq(custom_commands(text_7), [('field', 1, None, r'\mathbb{#1}')])

# With multiple parameters, a default value, and spaces in the newcommand declaration
text_8 = r'\newcommand {\plusbinomial} [3] [2] {(#2 + #3)^#1}'
test_eq(custom_commands(text_8), [('plusbinomial', 3, '2', r'(#2 + #3)^#1')])

# With a comment `%`; commented-out command declarations should not be detected.
text_9 = r'% \newcommand{\con}{\mathcal{C}}'
test_eq(custom_commands(text_9), [])


# Spanning multiple lines; the line break and indentation inside the
# definition are preserved verbatim in the display text.
text_10 = r'''\newcommand{\mat}[4]{\left[\begin{array}{cc}#1 & #2 \\
                                         #3 & #4\end{array}\right]}'''
test_eq(
    custom_commands(text_10),
    [('mat', 4, None,
             '\\left[\\begin{array}{cc}#1 & #2 \\\\\n                                         #3 & #4\\end{array}\\right]')])

# Math operator
text_11 = r'\DeclareMathOperator{\Hom}{Hom}'
test_eq(custom_commands(text_11), [('Hom', 0, None, 'Hom')])

# Math operator whose display text contains nested braces
text_12 = r'\DeclareMathOperator{\tConf}{\widetilde{Conf}}'
test_eq(custom_commands(text_12), [('tConf', 0, None, r'\widetilde{Conf}')])

# `\def` commands
# Only a parameterless `\def` is exercised here; the outer braces of the
# replacement text are stripped and the inner ones are kept.
text_13 = r'\def\A{{\cO_{K}}}'
test_eq(custom_commands(text_13), [('A', 0, None, r'{\cO_{K}}')])

# newcommand and renewcommand don't require {} for the
# command name, cf. https://arxiv.org/abs/1703.05365
text_14 = r'\newcommand\A{{\mathbb A}}'
test_eq(custom_commands(text_14), [('A', 0, None, r'{\mathbb A}')])

# A test for https://arxiv.org/abs/0902.4637
text_15 = r'\newcommand{\til}[1]{{\widetilde{#1}}}'
test_eq(custom_commands(text_15), [('til', 1, None, '{\\widetilde{#1}}')])




In [None]:
#| export
def regex_pattern_detecting_command(
        command_tuple: tuple[str, int, Union[None, str], str], # Consists of 1. the name of the custom command 2. the number of parameters 3. The default argument if specified or `None` otherwise, and 4. the display text of the command.
        ) -> regex.Pattern:
    r"""Return a `regex.Pattern` object (not a `re.Pattern` object) detecting
    invocations of the command with the specified number of parameters and
    optional argument. The display text in `command_tuple` is not used.

    Capture groups of the returned pattern:

    - If the command has a default (optional) argument, group 1 is the content
      of `[...]` when given (`None` otherwise) and groups 2, 3, ... are the
      contents of the remaining brace-delimited arguments.
    - Otherwise, groups 1, 2, ... are the contents of the brace-delimited
      arguments.
    - A command without parameters yields a pattern without capture groups.

    Assumes that the curly braces used to write the invocations of the commands
    are balanced and properly nested. Assumes that there are no two commands
    of the same name.
    """
    command_name, num_parameters, optional_arg, _ = command_tuple
    backslash_name = fr"\\{command_name}"
    # The command name must not be immediately followed by a letter or digit
    # (an underscore is okay); otherwise `\del` would be found inside `\delta`.
    name_end = r"(?![^\W_])"
    if optional_arg is not None:
        optional_argument_detection = r"(?:\[(.*?)\])?"
        # Group 1 belongs to the optional argument, so the brace-delimited
        # arguments start at group 2.
        trailing_arguments = [_argument_detection(i) for i in range(2, 1+num_parameters)]
        trailing_args_pattern = "\\s*".join(trailing_arguments)
        # When the optional parameter is the only one, nothing after the name is
        # required, so the guard is what keeps `\greeting` from being detected
        # as an invocation of `\greet`.
        pattern = (f"{backslash_name}{name_end}\\s*{optional_argument_detection}"
                   f"\\s*{trailing_args_pattern}")
    elif num_parameters > 0:
        # The mandatory `{` that follows already delimits the command name.
        arguments = [_argument_detection(i) for i in range(1, 1+num_parameters)]
        args_pattern = "\\s*".join(arguments)
        pattern = f"{backslash_name}\\s*{args_pattern}"
    else:
        pattern = f"{backslash_name}{name_end}"
    return regex.compile(pattern)

    

In [None]:
# Basic: a command without parameters is detected by its name alone.
pattern = regex_pattern_detecting_command(('Sur', 0, None, r'\mathrm{Sur}'))
text = r'The number of element of $\Sur(\operatorname{Cl} \mathcal{O}_L, A)$ is ...'
match = pattern.search(text)
start, end = match.span()
test_eq(text[start:end], r'\Sur')

# Nested invocations: the whole outer invocation is matched.
pattern = regex_pattern_detecting_command(('frac', 2, None, r'\mathrm{Sur}'))
text = r'\frac{\frac{2}{5}}{7}'
match = pattern.search(text)
start, end = match.span()
test_eq(text[start:end], text)

# Unbalanced braces: nothing is detected.
pattern = regex_pattern_detecting_command(('frac', 2, None, r'\mathrm{Sur}'))
text = r'\frac{error}{7'
match = pattern.search(text)
test_is(match, None)

# The outer invocation is unbalanced, but the inner balanced one is still found.
pattern = regex_pattern_detecting_command(('frac', 2, None, r'\mathrm{Sur}'))
text = r'\frac{\frac{2}{5}}{7'
match = pattern.search(text)
start, end = match.span()
test_eq(text[start:end], r'\frac{2}{5}')

# One parameter
pattern = regex_pattern_detecting_command(('field', 1, None, r'\mathbb{#1}'))
text = r'\field{Q}'
match = pattern.search(text)
start, end = match.span()
test_eq(text[start:end], text)

# Multiple parameters
pattern = regex_pattern_detecting_command(('mat', 4, None, r'\left[\begin{array}{cc}#1 & #2 \\ #3 & #4\end{array}\right]'))
text = r'\mat{{123}}{asdfasdf{}{}}{{{}}}{{asdf}{asdf}{}}' # This is a balanced str.
match = pattern.search(text)
start, end = match.span()
test_eq(text[start:end], text)
test_eq(match.group(1), r'{123}')

# Multiple parameters, the first of which is optional
pattern = regex_pattern_detecting_command(('plusbinomial', 3, '2', r'(#2 + #3)^#1'))
# When the optional argument is omitted (the default value applies)
text = r'\plusbinomial{x}{y}'
match = pattern.search(text)
start, end = match.span()
test_eq(text[start:end], text)

# When the optional argument is given explicitly
text = r'\plusbinomial[4]{x}{y}'
match = pattern.search(text)
start, end = match.span()
test_eq(text[start:end], text)

# One parameter that is optional.
pattern = regex_pattern_detecting_command(('greet', 1, 'world', r'Hello #1!'))
# When the optional argument is omitted (the default value applies)
text = r'\greet'
match = pattern.search(text)
start, end = match.span()
test_eq(text[start:end], text)

# When the optional argument is given explicitly
text = r'\greet[govna]'
match = pattern.search(text)
start, end = match.span()
test_eq(text[start:end], text)

# In the following example, `\del` is a command defined as `\delta`.
# An invocation of `\delta` should NOT be detected as an invocation of `\del`.
command_tuple = (r'del', 0, None, r'\delta')
pattern = regex_pattern_detecting_command(command_tuple)
text = r'\del should be detected.'
match = pattern.search(text)
start, end = match.span()
test_eq(text[start:end], r'\del')
text = r'\delta should not be detected.'
match = pattern.search(text)
assert match is None

# In the following example, the command takes one argument, but the argument
# is passed without curly braces. This brace-based pattern does not detect
# such an invocation.
command_tuple = ('til', 1, None, '{\\widetilde{#1}}')
pattern = regex_pattern_detecting_command(command_tuple)
text = r'\til \calh_g'
match = pattern.search(text)
test_is(match, None)


In [None]:
#| export
def regex_pattern_detecting_space_separated_command(
        command_tuple: tuple[str, int, Union[None, str], str]
        ) -> regex.Pattern:
    r"""
    Generate a regex pattern for detecting LaTeX-style commands with their arguments,
    where each mandatory argument is either a braced group or a single unbraced
    character.

    Parameters:
    command_tuple: A tuple containing (command_name, num_args, default_arg, command_definition)
        command_name: The name of the LaTeX command (without backslash)
        num_args: The number of arguments the command takes
        default_arg: The default value for an optional argument, or None if no optional argument
        command_definition: The LaTeX definition of the command (not used in this function)

    Returns:
    regex.Pattern: A compiled pattern (from the `regex` module, not `re`) for
        matching the command and its arguments.

    The regex pattern captures groups in the following manner:

    1. For commands with no arguments:
       - The pattern of `regex_pattern_detecting_command` is returned as is

    2. For commands with an optional argument (when default_arg is provided):
       - Group 1: The optional argument if provided (without brackets), else None
       - Subsequent groups: Mandatory arguments (see below)

    3. For mandatory arguments:
       - Odd-numbered groups (1, 3, 5, ... or 2, 4, 6, ... if there's an optional arg):
         Contents of braced arguments, or None if unbraced
       - Even-numbered groups (2, 4, 6, ... or 3, 5, 7, ... if there's an optional arg):
         Single-character unbraced arguments, or None if braced

    Notes:
    - The pattern uses '\b' after the command name to ensure it doesn't match partial commands
    - Braced arguments may contain balanced, nested braces, e.g. the first
      argument of `\cmd{\frac{a}{b}}{c}` is captured as `\frac{a}{b}`
    - Unbraced arguments are matched as single non-space characters
    - Whitespace between arguments is allowed and ignored in matching

    Examples:
    1. Command with 2 mandatory arguments:
       \cmd {arg1} x
       Groups: ('arg1', None, None, 'x')

    2. Command with 1 optional and 1 mandatory argument:
       \cmd [opt] {arg}
       Groups: ('opt', 'arg', None)

    3. Command with 1 optional and 2 mandatory arguments:
       \cmd {a} b
       Groups: (None, 'a', None, None, 'b')
    """
    command_name, num_args, default_arg, _ = command_tuple
    if num_args == 0:
        return regex_pattern_detecting_command(command_tuple)
    escaped_command = '\\\\' + command_name
    if default_arg is not None:
        # Pattern for optional argument; it occupies capture group 1.
        optional_arg = r'\s*(?:\[([^\]]*)\])?'
        num_args -= 1  # Reduce num_args by 1 as the first is now optional
        first_braced_group = 2
    else:
        optional_arg = ''
        first_braced_group = 1

    # Each mandatory argument contributes two groups: the content of a
    # balanced braced group, then a single unbraced character. The recursion
    # inside the balanced-brace fragment must reference the fragment's own
    # group number, hence one fragment per argument.
    arg_patterns = [
        f'(?:{_argument_detection(first_braced_group + 2 * i)}|(\\S))'
        for i in range(num_args)
    ]
    # Build the pattern for all mandatory arguments
    args_pattern = r'\s*'.join(arg_patterns)
    # Combine all patterns
    full_pattern = f'{escaped_command}\\b{optional_arg}\\s*{args_pattern}'
    return regex.compile(full_pattern)


In [None]:
# A parameterless command `\se` must not be detected at the start of `\section`.
command_tuple = ('se', 0, None, r'\section')
pattern1 = regex_pattern_detecting_space_separated_command(command_tuple)
test_eq(pattern1.match(r'\section'), None)

In [None]:
# Example 1: Command with 2 mandatory arguments
# Each mandatory argument yields two groups: (braced content, unbraced character).
command_tuple1 = ('cmd', 2, None, r'#1 #2')
pattern1 = regex_pattern_detecting_space_separated_command(command_tuple1)

test_eq(pattern1.match(r'\cmd {arg1} x').groups(), ('arg1', None, None, 'x'))

# Example 2: Command with 1 optional and 1 mandatory argument
# Group 1 is the optional argument, or `None` when it is omitted.
command_tuple2 = ('cmd', 2, 'default', r'#1 #2')
pattern2 = regex_pattern_detecting_space_separated_command(command_tuple2)

test_eq(pattern2.match(r'\cmd [opt] {arg}').groups(), ('opt', 'arg', None))
test_eq(pattern2.match(r'\cmd {arg}').groups(), (None, 'arg', None))

# Example 3: Command with 1 optional and 2 mandatory arguments
command_tuple3 = ('cmd', 3, 'default', r'#1 #2 #3')
pattern3 = regex_pattern_detecting_space_separated_command(command_tuple3)

test_eq(pattern3.match(r'\cmd [opt] {a} b').groups(), ('opt', 'a', None, None, 'b'))
test_eq(pattern3.match(r'\cmd {a} b').groups(), (None, 'a', None, None, 'b'))
test_eq(pattern3.match(r'\cmd [opt] a b').groups(), ('opt', None, 'a', None, 'b'))


In [None]:
# A command with one optional and one mandatory argument. Only one mandatory
# argument is captured, so any further text after it is left unmatched.
command_tuple = (r'example', 2, r'default', r'{#1 argument: \{#2\}}')
pattern = regex_pattern_detecting_space_separated_command(command_tuple)

# Test cases
test1 = pattern.match(r'\example{hello}')
test2 = pattern.match(r'\example[custom] asdf')
test3 = pattern.match(r'\example a s d f')

# Groups are (optional argument, braced mandatory argument, unbraced mandatory argument).
print(test1.groups())  # Expected: (None, 'hello', None)
print(test2.groups())  # Expected: ('custom', None, 'a')
print(test3.groups())  # Expected: (None, None, 'a')


(None, 'hello', None)
('custom', None, 'a')
(None, None, 'a')


In [None]:

# Test 1: Basic command with 2 arguments
command_tuple1 = ('cmd', 2, None, r'#1 #2')
pattern1 = regex_pattern_detecting_space_separated_command(command_tuple1)

test_eq(pattern1.match(r'\cmd {arg1} {arg2}').groups(), ('arg1', None, 'arg2', None))
test_eq(pattern1.match(r'\cmd a b').groups(), (None, 'a', None, 'b'))
test_eq(pattern1.match(r'\cmd {a} b').groups(), ('a', None, None, 'b'))

# Test 2: Command with optional argument
command_tuple2 = ('opt', 2, 'default', r'#1 #2')
pattern2 = regex_pattern_detecting_space_separated_command(command_tuple2)

test_eq(pattern2.match(r'\opt {custom}').groups(), (None, 'custom', None))
test_eq(pattern2.match(r'\opt [arg] {custom}').groups(), ('arg', 'custom', None))
test_eq(pattern2.match(r'\opt a').groups(), (None, None, 'a'))

# Test 3: Command with multiple arguments
command_tuple3 = ('multi', 3, None, r'#1 #2 #3')
pattern3 = regex_pattern_detecting_space_separated_command(command_tuple3)

test_eq(pattern3.match(r'\multi {a b} {c d} {e f}').groups(), ('a b', None, 'c d', None, 'e f', None))
test_eq(pattern3.match(r'\multi a b c').groups(), (None, 'a', None, 'b', None, 'c'))
test_eq(pattern3.match(r'\multi {a} b {c}').groups(), ('a', None, None, 'b', 'c', None))

# Test 4: Command with no arguments
# The trailing space is not part of the match.
command_tuple4 = ('noarg', 0, None, r'')
pattern4 = regex_pattern_detecting_space_separated_command(command_tuple4)

test_eq(pattern4.match(r'\noarg').group(), r'\noarg')
test_eq(pattern4.match(r'\noarg ').group(), r'\noarg')

# Test 5: Command with mixed argument types
command_tuple5 = ('mix', 4, None, r'#1 #2 #3 #4')
pattern5 = regex_pattern_detecting_space_separated_command(command_tuple5)

test_eq(pattern5.match(r'\mix {arg1} a {arg3} b').groups(), ('arg1', None, None, 'a', 'arg3', None, None, 'b'))
test_eq(pattern5.match(r'\mix a {arg2} c d').groups(), (None, 'a', 'arg2', None, None, 'c', None, 'd'))

# Test 6: Command with more complex default argument
# The default value itself is not part of the pattern; only its presence matters.
command_tuple6 = ('complex', 3, 'default value', r'#1 #2 #3')
pattern6 = regex_pattern_detecting_space_separated_command(command_tuple6)

test_eq(pattern6.match(r'\complex {a} {b}').groups(), (None, 'a', None, 'b', None))
test_eq(pattern6.match(r'\complex [custom] a b').groups(), ('custom', None, 'a', None, 'b'))
test_eq(pattern6.match(r'\complex a b').groups(), (None, None, 'a', None, 'b'))

# Test 7: `match` anchors at the start of the string, so a command that
# appears later in the string is not found
test_eq(pattern1.match(r'not a command \cmd {arg1} {arg2}'), None)

# Test 8: Ensure pattern doesn't match partial commands
test_eq(pattern1.match(r'\cmdextra {arg1} {arg2}'), None)

# Test 9: Command with optional argument used implicitly
# In the last case only one mandatory argument exists, so `b` is left unmatched.
command_tuple9 = ('implicit', 2, 'default', r'#1 #2')
pattern9 = regex_pattern_detecting_space_separated_command(command_tuple9)

test_eq(pattern9.match(r'\implicit {arg}').groups(), (None, 'arg', None))
test_eq(pattern9.match(r'\implicit [custom] {arg}').groups(), ('custom', 'arg', None))
test_eq(pattern9.match(r'\implicit a b').groups(), (None, None, 'a'))
