In [None]:
#| default_exp helper.definition_and_notation

# helper.definition_and_notation
> Functions that deal with definitions and notations

Note that the functions in this module will be deprecated --- the standard of trouver for marking definitions and notations is now via HTML tags, not by surrounding double asterisks `**`.

In [None]:
#| export
from trouver.helper.regex import find_regex_in_text

In [None]:
import operator
from fastcore.test import *

## Definitions and notations

I surround definitions and notations by double asterisks `**`. The `double_asterisk_indices` function finds strings surrounded by double asterisks, the `notation_asterisk_indices` function finds notation strings, and the `definition_asterisk_indices` function finds definition strings.

In [None]:
#| export
import re

def _get_bold_indices_with_braces(text):
    """
    Finds indices of text wrapped in double asterisks (**...**).
    Handles cases where ** might appear inside LaTeX curly braces {}.
    """
    indices = []
    i = 0
    n = len(text)
    
    while i < n:
        # Check for start of bold block
        if text[i:i+2] == '**':
            start = i
            i += 2
            brace_depth = 0
            found_end = False
            
            while i < n:
                # Track braces to ignore ** inside them (e.g. M^{**})
                if text[i] == '{':
                    brace_depth += 1
                elif text[i] == '}':
                    brace_depth = max(0, brace_depth - 1)
                
                # Check for end of bold block
                # Must be ** and we must not be inside curly braces
                if text[i:i+2] == '**' and brace_depth == 0:
                    end = i + 2
                    indices.append((start, end))
                    i += 2
                    found_end = True
                    break
                
                i += 1
            
            if not found_end:
                # If we reached end of string without closing **, stop
                break
        else:
            i += 1
            
    return indices

# def _is_notation(text, start, end):
#     """
#     Determines if a bolded section is notation.
#     Criteria: The content inside **...** starts and ends with $.
#     """
#     # Extract content inside **...**
#     content = text[start+2:end-2].strip()
#     return content.startswith('$') and content.endswith('$')

def _is_notation(text, start, end):
    """
    Determines if a bolded section is notation.
    Criteria: The content MUST be a SINGLE LaTeX math block.
    It must start with $ (or $$) and the MATCHING closing $ (or $$) 
    must be the very last characters of the string.
    """
    # Extract content inside **...**
    content = text[start+2:end-2].strip()
    
    # 1. Identify the delimiter ($ or $$)
    if content.startswith('$$'):
        delimiter = '$$'
        search_start_index = 2
    elif content.startswith('$'):
        delimiter = '$'
        search_start_index = 1
    else:
        # Does not start with math delimiter -> Not notation
        return False
        
    # 2. Find the CLOSING delimiter for the FIRST opening delimiter
    i = search_start_index
    while i < len(content):
        # Skip escaped characters (like \$)
        if content[i] == '\\':
            i += 2
            continue
            
        # Check if we found the delimiter
        if content[i:].startswith(delimiter):
            # We found the closing delimiter.
            # 3. CRITICAL CHECK: Is this the end of the string?
            # If there is text after this closing delimiter, it's a Definition (mixed text).
            if i + len(delimiter) == len(content):
                return True
            else:
                return False
        
        i += 1
            
    # If we finish the loop without finding a closing delimiter, it's malformed/text.
    return False



In [None]:
#| export
# def double_asterisk_indices(
#         text: str # the str in which to find the indices of double asterisk surrounded text.
#         ) -> list[tuple[int, int]]: # Each tuple is of the form `(start,end)`, where `text[start:end]` is a part in `text` with double asterisks, including the double asterisks.
#     # TODO: fix double asterisks in math mode.
#     """Return the indices in `str` of text surrounded by double asterisks.
    
#     Assumes there no LaTeX math mode string has double asterisks.

#     **See Also**
    
#     - `notation_asterisk_indices`
#     - `definition_asterisk_indices`
#     """
#     return find_regex_in_text(text, pattern=r'\*\*[^*]+\*\*')

def double_asterisk_indices(text: str) -> list[tuple[int, int]]:
    r"""
    Return `(start, end)` index pairs of text wrapped in double asterisks.

    For each pair, `text[start:end]` is the wrapped text including both `**`
    markers. The scan is aware of some LaTeX syntax:

    1. A `**` inside curly braces `{}` does not close a span
       (e.g. `**$M^{**}$**` is one span).
    2. A backslash escapes the following character (e.g. `\**` does not open
       a span at the escaped asterisk).

    An opening `**` without a closing `**` is skipped.

    **See Also**

    - `notation_asterisk_indices`
    - `definition_asterisk_indices`
    """
    spans = []
    length = len(text)
    pos = 0

    while pos < length:
        # Escaped character in the running text: step over the pair.
        if text[pos] == '\\':
            pos += 2
            continue

        # Anything that is not an opening marker is skipped.
        if not text.startswith('**', pos):
            pos += 1
            continue

        # `pos` is on an opening marker; search for its closing marker.
        close = -1
        depth = 0
        cursor = pos + 2
        while cursor < length:
            ch = text[cursor]
            if ch == '\\':
                # Escaped character inside the span.
                cursor += 2
                continue
            if ch == '{':
                depth += 1
            elif ch == '}':
                depth = max(0, depth - 1)
            elif ch == '*' and depth == 0 and text.startswith('**', cursor):
                # A closing marker only counts outside of curly braces.
                close = cursor + 2
                break
            cursor += 1

        if close == -1:
            # Never closed: move past the opening marker and keep scanning.
            pos += 2
        else:
            spans.append((pos, close))
            pos = close

    return spans




  """


In [None]:
# |hide
# Test double asterisks in math mode: the `**` inside the curly braces must not end the bold span.
text = r"**$M^{**}$**"  # The entire string is expected to be reported as a single span.
double_asterisk_indices(text)

[(0, 12)]

In [None]:
#| export
def notation_asterisk_indices(
        text: str # the str in which to find the indices of notations surrounded by double asterisks.
        ) -> list[tuple[int, int]]: # Each tuple is of the form `(start,end)`, where `text[start:end]` is a part in `text` with LaTeX math mode text with double asterisks, including the double asterisks.
    """Return the indices of notation text surrounded by double asterisks.

    A double-asterisk-surrounded text is treated as a notation when it is
    purely LaTeX math mode text, i.e. when `_is_notation` accepts it: the
    content must be a single math block delimited by one or two dollar signs
    on each side, with nothing after the closing delimiter.

    The bold spans themselves are located by `_get_bold_indices_with_braces`.
    """
    notation_spans = []
    for span in _get_bold_indices_with_braces(text):
        if _is_notation(text, *span):
            notation_spans.append(span)
    return notation_spans


def definition_asterisk_indices(
        text: str # The str in which to find the indices of the definitions surrounded by double asterisks.
        ) -> list[tuple[int, int]]: # Each tuple is of the form `(start,end)`, where `text[start:end]` is a substring in `text` surrounded by double asterisks, including the double asterisks.
    r"""Return the indices of definition text surrounded by double asterisks.

    A double-asterisk-surrounded text is treated as a definition whenever it
    is not purely LaTeX math mode text, i.e. whenever `_is_notation` rejects
    it. This includes text that merely starts and ends with math, such as
    `**$G$-group over a ring $A$**`.

    The bold spans themselves are located by `_get_bold_indices_with_braces`.
    """
    definition_spans = []
    for span in _get_bold_indices_with_braces(text):
        if _is_notation(text, *span):
            continue
        definition_spans.append(span)
    return definition_spans

#### Examples

In the following example, `scheme` and `structure sheaf` are definitions, whereas `$\mathcal{O}_X$` is a notation:

In [None]:
text = r'A **scheme** is a... the **structure sheaf** of a scheme $X$ is the sheaf **$\mathcal{O}_X$**.'
# `double_asterisk_indices` reports every bold span, in order of appearance.
listy = double_asterisk_indices(text)

start, end = listy[0]
test_eq(text[start:end], '**scheme**')

start, end = listy[1]
test_eq(text[start:end], '**structure sheaf**')

start, end = listy[2]
test_eq(text[start:end], r'**$\mathcal{O}_X$**')

# Only the pure math span counts as a notation.
listy = notation_asterisk_indices(text)
start, end = listy[0]
test_eq(text[start:end], r'**$\mathcal{O}_X$**')
test_eq(len(listy), 1)

# The two remaining spans are the definitions.
listy = definition_asterisk_indices(text)
print(listy)
test_eq(len(listy), 2)



[(2, 12), (25, 44)]


The following example has a definition which starts and ends with dollar sign `$` characters:

In [None]:
text = r'A **$G$-group over a ring $A$** is'
# The bold text starts and ends with `$`, but it is not a single math block,
# so it must not be reported as a notation.
listy = notation_asterisk_indices(text)
test_eq(len(listy), 0)

# It must instead be reported as exactly one definition. (A check of the
# form `len(listy) >= 0` would pass for any result, so the count and the
# matched text are verified explicitly.)
listy = definition_asterisk_indices(text)
test_eq(len(listy), 1)
start, end = listy[0]
test_eq(text[start:end], r'**$G$-group over a ring $A$**')

The following example tests `notation_asterisk_indices` for LaTeX str with single asterisks in them:

In [None]:
# Tests LaTeX str with single asterisks in them (`\pi_{*}` and `\pi^{*}`);
# a single `*` must not be mistaken for a bold marker.
text = (r'''The **direct image of a sheaf $F^{\prime}$ on $X_{E}^{\prime}$** '''
        r'''is defined to be **$\pi_{*} F^{\prime}=\pi_{p} F^{\prime}$** and the '''
        r'''**inverse image of a sheaf $F$ on $X_{E}$** is defined to be **$\pi^{*} F=a\left(\pi^{p} F\right)$**.''')
listy = notation_asterisk_indices(text)
# Two of the four bold spans are pure math, hence notations.
test_eq(len(listy), 2)
start, end = listy[0]
test_eq(text[start:end], r'**$\pi_{*} F^{\prime}=\pi_{p} F^{\prime}$**')

There are pure LaTeX strings which should be considered definitions, but for the purposes of the code here will be considered notations. For example, Hausdorff spaces in topology are also called $T_2$ spaces:

In [None]:
# `**$T_2$**` is conceptually a definition, but it is pure math and is
# therefore reported as a notation.
text = (r"A topological space $X$ is called **$T_2$** if for all $x,y \in X$, "
        r"there exist open neightborhoods $V$ and $W$ around $x$ and $y$ respectively "
        r"such that $V \cap W = 0$.")
listy = notation_asterisk_indices(text)
test_eq(text[listy[0][0]:listy[0][1]], r'**$T_2$**')

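Earlier implementations of the above functions did not work correctly if there were LaTeX strings with double asterisks `**` within them. The current implementation handles double asterisks that appear inside curly braces `{}`, as the following example checks: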

In [None]:
# Regression check: a `**` inside LaTeX curly braces must not end the bold span.
text = r'The double dual of $M$ is denoted by **$M^{**}$**.'
listy = definition_asterisk_indices(text)
# The only bold span is pure math, so no definition is reported.
test_eq(listy, [])

listy = notation_asterisk_indices(text)
# The whole span, including the inner `**`, is reported as a single notation.
test_eq(listy, [(37,49)])
text[37:49]

'**$M^{**}$**'

Another option, which avoids literal asterisks inside the LaTeX altogether, is to replace asterisks `*` with the LaTeX `\ast` command:

In [None]:
# Same notation as in the previous example, written with `\ast` instead of literal asterisks.
text = r'The double dual of $M$ is denoted by **$M^{\ast\ast}$**.'
listy = definition_asterisk_indices(text)
test_eq(len(listy), 0)

# The bold span is pure math, so it is reported as the single notation.
listy = notation_asterisk_indices(text)
test_eq(len(listy), 1)
test_eq(text[listy[0][0]:listy[0][1]], r'**$M^{\ast\ast}$**')

In [None]:
#| export
def defs_and_notats_separations(
        text: str 
        )-> list[tuple[int, int, bool]]:
    """Finds the spans in the text surrounded by double asterisks and
    categorizes whether each span is a definition or a notation.
    
    **Parameters**

    - text - str

    **Returns**

    - list[tuple[int, int, bool]]
        - Each tuple is of the form `(start, end, is_notation)`, where
        `text[start:end]` is the double-asterisk surrounded string,
        including the double asterisks, and `is_notation` is `True` if
        that string is a notation and `False` if it is a definition.
    """
    # Classify each span found by `double_asterisk_indices` directly.
    # Looking the span up in the output of `notation_asterisk_indices`
    # would depend on a second scan producing exactly the same
    # `(start, end)` pairs, and would cost a list search per span.
    return [(start, end, _is_notation(text, start, end))
            for start, end in double_asterisk_indices(text)]

In the following example, the first two double-asterisk-surrounded-strings are definitions, and the third is a notation:

In [None]:
text = r'A **scheme** is a... the **structure sheaf** of a scheme $X$ is the sheaf **$\mathcal{O}_X$**.'
listy = defs_and_notats_separations(text)
# The third entry of each tuple is the notation flag.
assert not listy[0][2]  # `**scheme**` is a definition
assert not listy[1][2]  # `**structure sheaf**` is a definition
assert listy[2][2]  # `**$\mathcal{O}_X$**` is a notation