In [None]:
#| default_exp helper.regex

# helper.regex
> Helper functions with regex capabilities

In [None]:
#| export
import re
from typing import Pattern, Sequence, Union

In [None]:
from fastcore.test import *
from trouver.helper.tests import _test_directory

In [None]:
#| export
def find_regex_in_text(
        text: str, # Text in which to find regex patter
        pattern: str | Pattern[str] # The regex pattern
        ) -> list[tuple[int, int]]: # Each tuple is of the form `(a,b)` where `text[a:b]` is the regex match.
    # TODO: rename into regex_indices_in_text
    # TODO: swap parameters.
    """Return ranges in `text` where `pattern` occurs.
    """
    matches = re.finditer(pattern, text)
    return [match.span() for match in matches]

The following example finds the occurrence of the Markdown footnote:

In [None]:
# `\[\^\d\]` matches a literal `[^`, followed by a single digit, followed by a literal `]`.
regex_pattern = r'\[\^\d\]'
text = '[^1]: asdf'

output = find_regex_in_text(text, regex_pattern)
test_eq(output, [(0,4)])

# Each returned pair can be used directly as slice bounds into `text`.
start, end = output[0]
test_eq(text[start:end], '[^1]')

If there are multiple matches for the regex pattern, then they are all included in the outputted list.

In [None]:
regex_pattern = r'\d+'  # Searches for one or more consecutive digits
text = '9000 is a big number. But you know what is bigger? 9001.'

output = find_regex_in_text(text, regex_pattern)
test_eq(len(output), 2)

# First match
start, end = output[0]
test_eq(text[start:end], '9000')

# Second match
start, end = output[1]
test_eq(text[start:end], '9001')

The following example detects YAML frontmatter text as used in Obsidian. This regex pattern is also used in `markdown.markdown.file.find_front_matter_meta_in_markdown_text`.


The regex pattern used is able to detect the frontmatter even when it is empty.

In [None]:
# `(?(1)\n|)` is a conditional: a newline is required before the closing `---`
# only when the optional group 1 took part in the match.
sample_regex = r'---\n([\S\s]*?)?(?(1)\n|)---'
# Empty frontmatter
sample_str = '---\n---'
sample_output = find_regex_in_text(sample_str, sample_regex)
assert sample_output == [(0,7)]

# Non-empty frontmatter
sample_str = '---\naliases: [this_is_an_aliases_for_the_Obsidian_note]\n---'
sample_output = find_regex_in_text(sample_str, sample_regex)
assert sample_output == [(0, len(sample_str))]  # The entire sample_str is detected.


Contrast the regex pattern above with the pattern `---\n[\S\s]*?\n---`, which does not detect empty YAML frontmatter text.

In [None]:
# Raw string: in a non-raw literal, `\S` is an invalid escape sequence.
sample_regex = r'---\n[\S\s]*?\n---'
sample_str = '---\n---'
sample_output = find_regex_in_text(sample_str, sample_regex)
# The pattern requires two newlines between the `---` lines, but empty frontmatter has only one.
assert not sample_output

In [None]:
#| export
def separate_indices_from_str(
        text: str,
        indices: list[tuple[int, int]] # The index pairs `(a, b)` of the substrings `text[a:b]` to separate out.
        ) -> list[str]: # The pieces of `text`, alternating between a piece lying outside of the ranges in `indices` and a piece specified by a range in `indices`.
    """Split `text` into consecutive pieces along the ranges given by `indices`.

    The pairs in `indices` are assumed to be sorted from first to last and
    to specify pairwise disjoint ranges.

    Joining the output with `''.join(output)` should give back `text`.
    """
    if not indices:
        return [text]
    pieces = []
    cursor = 0  # Where the not-yet-emitted remainder of `text` begins
    for pair in indices:
        pieces.append(text[cursor:pair[0]])   # The piece before this range
        pieces.append(text[pair[0]:pair[1]])  # The range itself
        cursor = pair[1]
    pieces.append(text[cursor:])
    return pieces

Here is a basic example of `separate_indices_from_str`:

In [None]:
text = 'hello asdf asdf'
# The output alternates between pieces outside of and inside of the given ranges,
# starting with the (here empty) piece before the first range.
sample_output = separate_indices_from_str(text, [(0,5), (10,11)])
print(sample_output)
# Joining the pieces recovers the original text.
test_eq(''.join(sample_output), text)


['', 'hello', ' asdf', ' ', 'asdf']


In [None]:
#| export
def replace_string_by_indices(
        string: str, # String in which to make replacemenets 
        replace_ranges: Sequence[Union[Sequence[int], int]], # A list of lists/tuples of int's or a single list/tuple of int's. Each 
        replace_with: Sequence[str] | str # The str(s) which will replace the substrings at `replace_ranges` in `string`. `replace_with` must be a str exactly when `replace_ranges` is a Sequence of a single Sequence of int.
        ) -> str:  # The str obtained by replacing the substrings at `replace_range` in `string` by the strs specified by `replace_with`.
    """Replace parts of ``string`` at the specified locations"

    Use this with `find_regex_in_text`.

    **Parameters**

    - ``string`` - `str`
    - ``replace_ranges`` - `Sequence[Sequence[int] | int]`
        - Either a list of lists/tuples of one or two int's. A list/tuple
        ``[a,b]`` or ``(a,b)`` means that ``string[a:b]`` is to be replaced.
        ``[a]`` or ``(a)`` means that ``string[a:]`` is to be replaced. The ranges should
        not overlap and should be arranged in chronological order.
    - ``replace_with`` - `Sequence[str] | str`
        - The str's which will replace the parts represented by 
        ``replace_ranges``. ``replace_ranges`` and ``replace_with`` must be
        both lists or both not lists. If they are lists, they must be of 
        the same length.

    **Returns**

    - str

    """
    if isinstance(replace_with, str):
        replace_ranges = [replace_ranges]
        replace_with = [replace_with]
    if len(replace_ranges) != len(replace_with):
        raise ValueError(
            'The lengths of `replace_ranges` and `replace_with` are different.')
    if len(replace_ranges) == 0:
        return string

    str_parts = _str_parts(string, replace_ranges, replace_with)
    return "".join(str_parts)


def _str_parts(string, replace_ranges, replace_with):
    """Divide `string` into parts divided outside of `replace_ranges`
    and with `replace_with` inserted."""
    str_parts = []
    for i, replace_string in enumerate(replace_ranges):
        replace_string = replace_with[i]
        if i > 0 and len(replace_ranges[i-1]) == 1:
            unreplaced_start_index = len(string)
        elif i > 0 and len(replace_ranges[i-1]) != 1:
            unreplaced_start_index = replace_ranges[i-1][1]
        else:
            unreplaced_start_index = 0
        unreplaced_end_index = replace_ranges[i][0]
        str_parts.append(string[unreplaced_start_index:unreplaced_end_index])
        str_parts.append(replace_string)
    # Add the last (unreplaced) part to str_parts.
    if len(replace_ranges[-1]) == 1:
        unreplaced_start_index = len(string)
    else:
        unreplaced_start_index = replace_ranges[-1][1]
    str_parts.append(string[unreplaced_start_index:])
    return str_parts

The following are basic examples of `replace_string_by_indices`:

In [None]:
# A single range with a single replacement str
test_eq(replace_string_by_indices('hello world', replace_ranges=(0,5), replace_with='hi'), 'hi world')
# Several ranges, each with its own replacement str (here, the substrings are deleted)
test_eq(replace_string_by_indices('hello somebody', replace_ranges=[(0,1), (6,10)], replace_with=['', '']), 'ello body')

If `replace_ranges` and `replace_with` are of different length, then a `ValueError` is raised:

In [None]:
# Two ranges but only one replacement str
with ExceptionExpected(ex=ValueError, regex="are different"):
    replace_string_by_indices('hello world', replace_ranges = [(0,5), (6,10)], replace_with = [''])

#### Finding LaTeX string

In [None]:
#| export
def latex_indices(
        text: str,
        ) -> list[tuple[int, int]]:
    """Return the index ranges of the LaTeX math mode strings in `text`.

    This may not work correctly if the text has a LaTeX
    formatting issue.

    **Parameters**

    - text - str

    **Returns**

    - list[tuple[int, int]]
        - Each tuple is of the form `(start, end)` where
        `text[start:end]` is a LaTeX string, including its leading and
        trailing dollar signs (`$` or `$$`).
    """
    # The `$$`-delimited alternative is listed first so that it is tried
    # before the single-`$` alternative. The lookbehinds keep a dollar sign
    # that is preceded by a backslash from being used as a delimiter.
    math_mode_regex = r"(?<!\\)\$\$.*?(?<!\\)\$\$|(?<!\\)\$.*?(?<!\\)\$"
    # `re.DOTALL` lets `.` match newlines, so a match may span several lines.
    compiled = re.compile(math_mode_regex, flags=re.DOTALL)
    return find_regex_in_text(text, compiled)


def inline_latex_indices(
        text: str,
        ) -> list[tuple[int, int]]:
    r"""Return the indices in `text` of the LaTeX math mode strings
    surrounded by `$$`.

    Despite the name of this function, only strings delimited by `$$` on
    both sides are matched; strings delimited by a single `$` are not.
    A dollar sign preceded by a backslash (`\$`) is not used as the start
    of a delimiter.

    This may not work correctly if the text has a LaTeX
    formatting issue or if any LaTeX string has a dollar sign `\$`.

    **Parameters**

    - text - str

    **Returns**

    - list[tuple[int, int]]
        - Each tuple is of the form `(start, end)` where
        `text[start:end]` is a LaTeX string, including its leading and
        trailing dollar signs (`$$`).
    """
    # NOTE: the docstring above is a raw string because it contains `\$`,
    # which is an invalid escape sequence in a non-raw string literal.
    # `re.DOTALL` lets `.` match newlines, so a match may span several lines.
    pattern = re.compile(r"(?<!\\)\$\$.*?(?<!\\)\$\$", re.DOTALL)
    return find_regex_in_text(text, pattern)

# def math_mode_str_in_text(
#         text: str # The str in which to find the latex math mode str.
#         ) -> list[tuple[int, int]]: # Each tuple is of the form `(start,end)`, where `text[start:end]` is a part in `text` with LaTeX math mode text.
#     """
#     Return the indices of the math mode text.
#     """
#     pattern = re.compile(r"(?<!\\)\$\$.*?(?<!\\)\$\$|(?<!\\)\$.*?(?<!\\)\$", re.DOTALL)
#     return find_regex_in_text(text, pattern)

Here are some basic uses of the `latex_indices` function:

In [None]:
# A math mode string delimited by `$$`
text = r'$$5 \neq 7$$ is a LaTeX equation.'
listy = latex_indices(text)
assert len(listy) == 1
start, end = listy[0]
test_eq(text[start:end], r'$$5 \neq 7$$')

# A math mode string delimited by a single `$`
text = r'$\mathcal{O}_X$ denotes the structure sheaf.'
listy = latex_indices(text)
assert len(listy) == 1
start, end = listy[0]
test_eq(text[start:end], r'$\mathcal{O}_X$')

# NOTE(review): because this is a raw string, each `\n` below is a literal
# backslash followed by `n`, not a newline - confirm whether a multi-line
# string was intended here.
text = r'$$\n5 \neq 7\n$$'
listy = latex_indices(text)
assert len(listy) == 1


If there is a dollar sign symbol `\$` *outside* of a LaTeX string, then the `latex_indices` function works as expected; the dollar signs are not considered to be part of any LaTeX string:

In [None]:
# The escaped dollar sign at the start of `text` is followed by an actual math mode string.
text = r'\$6.2.4 helo blah $15+6+21$'  # Avoid detecting \$ as latex start/end
listy = latex_indices(text)
# The first (index 0) match is the math mode string, not anything starting at the `\$`.
start, end = listy[0]
test_eq(text[start:end], r'$15+6+21$')

In the following example, the text has escaped dollar signs `\$` both outside and inside of a math mode string. Neither is treated as a math mode delimiter, so the entire string `$\$37$` is detected:

In [None]:
text = r'\$6.2.4 helo blah $\$37$ are needed for stuff.' 
listy = latex_indices(text)
start, end = listy[0]
# Only one math mode string is detected.
test_eq(len(listy), 1)
print(text[listy[0][0]:listy[0][1]])  # This should print the entire math mode string `$\$37$`; the escaped `\$` inside of it does not end the match.
test_eq(text[start:end], r'$\$37$')

$\$37$


In [None]:
#| export

In the following example, note that `\$S.10` is (correctly) not recognized as a LaTeX math mode string. Moreover, multi-line math mode strings are also recognized.

In [None]:
# Raw string: in a non-raw literal, `\$` is an invalid escape sequence.
# The value of the string is unchanged, so the indices below are the same.
text = r"""
\$S.10 We have some latex string $a$ $hi$

$$
asdf
$$
"""
latex_indices(text)

[(34, 37), (38, 42), (44, 54)]

In [None]:
# Slice `text` at each of the index pairs shown in the output above.
print(text[34:37])
print(text[38:42])
print(text[44:54])

$a$
$hi$
$$
asdf
$$


In [None]:
#| hide
sample_output = latex_indices(text)
expected_substrings = ['$a$', '$hi$', '$$\nasdf\n$$']
# Compare all of the matched substrings at once so that a missing or an
# extra match also fails the test.
test_eq([text[start:end] for start, end in sample_output], expected_substrings)


The `inline_latex_indices` function finds the indices only for LaTeX math mode strings which are surrounded by `$$`. Despite the name of the function, math mode strings surrounded by a single `$` are not included:

In [None]:
# Raw string: in a non-raw literal, `\$` is an invalid escape sequence.
# The value of the string is unchanged, so the indices below are the same.
text = r"""
\$S.10 We have some latex string $a$ $hi$

$$
asdf
$$
"""
inline_latex_indices(text)

[(44, 54)]

In [None]:
# Slice `text` at the index pair shown in the output above.
print(text[44:54])

$$
asdf
$$


In [None]:
#| hide

# Only the first match is checked here; `text` is the string defined in the example above.
sample_output = inline_latex_indices(text)
start, end = sample_output[0]
test_eq(text[start:end], '$$\nasdf\n$$')