# helper

> Helper functions 

In [None]:
#| default_exp helper

In [None]:
#| export
from __future__ import annotations
from collections import OrderedDict
from collections.abc import Iterable
import datetime
from datetime import timezone
import errno
import glob
from graphlib import TopologicalSorter
from itertools import product
import os
from os import PathLike
from pathlib import Path
import platform
import re
from typing import Callable, Optional, Pattern, Sequence, Union

from deprecated import deprecated
from natsort import natsorted

In [None]:
#| hide
import operator
from fastcore.test import *
from unittest import mock

## regex

In [None]:
#| export
def find_regex_in_text(
        text: str, # Text in which to find regex patter
        pattern: str | Pattern[str] # The regex pattern
        ) -> list[tuple[int]]: # Each tuple is of the form `(a,b)` where `text[a:b]` is the regex match.
    # TODO: rename into regex_indices_in_text
    # TODO: swap parameters.
    """Return ranges in `text` where `pattern` occurs.
    """
    matches = re.finditer(pattern, text)
    return [match.span() for match in matches]

The following example finds the occurrence of the Markdown footnote:

In [None]:
# A Markdown footnote marker: a caret and a single digit inside square brackets.
text = '[^1]: asdf'
regex_pattern = r'\[\^\d\]'
output = find_regex_in_text(text, regex_pattern)

# There is exactly one match and it spans the first four characters.
test_eq(output, [(0,4)])
start, end = output[0]
test_eq(text[start:end], '[^1]')

If there are multiple matches for the regex pattern, then they are all included in the outputted list.

In [None]:
# `\d+` searches for one or more consecutive digits; the text has two such runs.
text = '9000 is a big number. But you know what is bigger? 9001.'
regex_pattern = r'\d+'
output = find_regex_in_text(text, regex_pattern)

expected_matches = ['9000', '9001']
test_eq(len(output), len(expected_matches))
# The matches are listed in the order in which they occur in `text`.
for (start, end), expected in zip(output, expected_matches):
    test_eq(text[start:end], expected)

The following example detects YAML frontmatter text as used in Obsidian. This regex pattern is also used in `markdown.markdown.file.find_front_matter_meta_in_markdown_text`.


The regex pattern used is able to detect the frontmatter even when it is empty.

In [None]:
# `(?(1)\n|)` is a conditional: a newline is required before the closing `---`
# only if group 1 (the frontmatter body) took part in the match.
sample_regex = r'---\n([\S\s]*?)?(?(1)\n|)---'
for sample_str in (
        '---\n---',
        '---\naliases: [this_is_an_aliases_for_the_Obsidian_note]\n---'):
    sample_output = find_regex_in_text(sample_str, sample_regex)
    # Empty or not, the entire sample_str is detected.
    assert sample_output == [(0, len(sample_str))]


Contrast the regex pattern above with the pattern `---\n[\S\s]*?\n---`, which does not detect empty YAML frontmatter text.

In [None]:
# A raw string keeps `\S` as a regex escape rather than an invalid Python
# string escape; the regex engine itself interprets `\n` as a newline.
sample_regex = r'---\n[\S\s]*?\n---'
sample_str = '---\n---'
sample_output = find_regex_in_text(sample_str, sample_regex)
# This pattern needs two newlines, so empty frontmatter is not detected.
assert not sample_output

In [None]:
#| export
def replace_string_by_indices(
        string: str, # String in which to make replacemenets 
        replace_ranges: Sequence[Union[Sequence[int], int]], # A list of lists/tuples of int's or a single list/tuple of int's. Each 
        replace_with: Sequence[str] | str # The str(s) which will replace the substrings at `replace_ranges` in `string`. `replace_with` must be a str exactly when `replace_ranges` is a Sequence of a single Sequence of int.
        ) -> str:  # The str obtained by replacing the substrings at `replace_range` in `string` by the strs specified by `replace_with`.
    """Replace parts of ``string`` at the specified locations"

    Use this with `find_regex_in_text`.

    **Parameters**

    - ``string`` - `str`
    - ``replace_ranges`` - `Sequence[Sequence[int] | int]`
        - Either a list of lists/tuples of one or two int's. A list/tuple
        ``[a,b]`` or ``(a,b)`` means that ``string[a:b]`` is to be replaced.
        ``[a]`` or ``(a)`` means that ``string[a:]`` is to be replaced. The ranges should
        not overlap and should be arranged in chronological order.
    - ``replace_with`` - `Sequence[str] | str`
        - The str's which will replace the parts represented by 
        ``replace_ranges``. ``replace_ranges`` and ``replace_with`` must be
        both lists or both not lists. If they are lists, they must be of 
        the same length.

    **Returns**

    - str

    """
    if isinstance(replace_with, str):
        replace_ranges = [replace_ranges]
        replace_with = [replace_with]
    if len(replace_ranges) != len(replace_with):
        raise ValueError(
            'The lengths of `replace_ranges` and `replace_with` are different.')
    if len(replace_ranges) == 0:
        return string

    str_parts = _str_parts(string, replace_ranges, replace_with)
    return "".join(str_parts)


def _str_parts(string, replace_ranges, replace_with):
    """Divide `string` into parts divided outside of `replace_ranges`
    and with `replace_with` inserted."""
    str_parts = []
    for i, replace_string in enumerate(replace_ranges):
        replace_string = replace_with[i]
        if i > 0 and len(replace_ranges[i-1]) == 1:
            unreplaced_start_index = len(string)
        elif i > 0 and len(replace_ranges[i-1]) != 1:
            unreplaced_start_index = replace_ranges[i-1][1]
        else:
            unreplaced_start_index = 0
        unreplaced_end_index = replace_ranges[i][0]
        str_parts.append(string[unreplaced_start_index:unreplaced_end_index])
        str_parts.append(replace_string)
    # Add the last (unreplaced) part to str_parts.
    if len(replace_ranges[-1]) == 1:
        unreplaced_start_index = len(string)
    else:
        unreplaced_start_index = replace_ranges[-1][1]
    str_parts.append(string[unreplaced_start_index:])
    return str_parts

The following are basic examples:

In [None]:
# A single range paired with a single replacement str.
replaced = replace_string_by_indices(
    'hello world', replace_ranges=(0,5), replace_with='hi')
test_eq(replaced, 'hi world')

# Several ranges, each paired with its own replacement str.
replaced = replace_string_by_indices(
    'hello somebody', replace_ranges=[(0,1), (6,10)], replace_with=['', ''])
test_eq(replaced, 'ello body')

If `replace_ranges` and `replace_with` are of different length, then a `ValueError` is raised:

In [None]:
# Two ranges but only one replacement str.
mismatched_ranges = [(0,5), (6,10)]
with ExceptionExpected(ex=ValueError, regex="are different"):
    replace_string_by_indices(
        'hello world', replace_ranges=mismatched_ranges, replace_with=[''])

## Definitions and notations

I surround definitions and notations with double asterisks `**`. The `double_asterisk_indices` function finds strings surrounded by double asterisks, the `notation_asterisk_indices` function finds notation strings, and the `definition_asterisk_indices` function finds definition strings.

In [None]:
#| export
def double_asterisk_indices(
        text: str # the str in which to find the indices of double asterisk surrounded text.
        ) -> list[tuple[int, int]]: # Each tuple is of the form `(start,end)`, where `text[start:end]` is a part in `text` with double asterisks, including the double asterisks.
    # TODO: fix double asterisks in math mode.
    """Return the indices in `text` of text surrounded by double asterisks.
    
    Assumes that no LaTeX math mode string has double asterisks.

    **See Also**
    
    - `notation_asterisk_indices`
    - `definition_asterisk_indices`
    """
    # The pattern is a raw string so that `\*` is a regex escape and not an
    # invalid Python string escape. `[^*]+` requires at least one character
    # between the opening and the closing `**`, none of which is an asterisk.
    return find_regex_in_text(text, pattern=r'\*\*[^*]+\*\*')



In [None]:
# |hide
# Test double asterisks in math mode.
# Known limitation: the match ends at the first `**` inside the math mode str.
text = r"**$M^{**}$**"  # I would like this to get the entire string, but currently, this is not the case.
double_asterisk_indices(text)

[(0, 8)]

In [None]:
#| export
def notation_asterisk_indices(
        text: str # the str in which to find the indices of notations surrounded by double asterisks.
        ) -> list[tuple[int, int]]: # Each tuple is of the form `(start,end)`, where `text[start:end]` is a part in `text` with LaTeX math mode text with double asterisks, including the double asterisks.
    """Return the indices of notation text surrounded by double asterisks.
    
    A double-asterisk-surrounded-text is a notation almost always
    when it is purely LaTeX math mode text. 

    Assumes that no LaTeX math mode string has the dollar sign character
    within it.
    """
    # The first alternative matches display math (`**$$...$$**`) and the
    # second matches inline math (`**$...$**`). Only `$` is excluded from the
    # math mode text: a previous version also excluded `*`, but it was not
    # picking up notation LaTeX str containing asterisks, e.g. `**$\pi^*$**`,
    # `**$\pi_*$**`.
    return find_regex_in_text(
        text, pattern=r'\*\*\$\$[^$]+\$\$\*\*|\*\*\$[^$]+\$\*\*')


def definition_asterisk_indices(
        text: str # the str in which to find the indices of definitions surrounded by double asterisks.
        ) -> list[tuple[int, int]]: # Each tuple is of the form `(start,end)`, where `text[start:end]` is a definition in `text`, including the double asterisks.
    """Return the indices of definition text surrounded by double asterisks.
    
    A double-asterisk-surrounded-text is a definition almost always
    when it is not purely LaTeX math mode text.
    
    Assumes that no LaTeX math mode string has double asterisks and that no
    LaTeX math mode string has the dollar sign character within it.
    """
    # Definitions are the double-asterisk-surrounded parts which are not
    # notations. A set makes each membership test constant time.
    notations = set(notation_asterisk_indices(text))
    return [indices for indices in double_asterisk_indices(text)
            if indices not in notations]

#### Examples

In the following example, `scheme` and `structure sheaf` are definitions, whereas `$\mathcal{O}_X$` is a notation:

In [None]:
text = r'A **scheme** is a... the **structure sheaf** of a scheme $X$ is the sheaf **$\mathcal{O}_X$**.'
listy = double_asterisk_indices(text)

start, end = listy[0]
test_eq(text[start:end], '**scheme**')

start, end = listy[1]
test_eq(text[start:end], '**structure sheaf**')

# Raw strings are used for the expected values so that `\m` is not read as an
# (invalid) Python string escape.
start, end = listy[2]
test_eq(text[start:end], r'**$\mathcal{O}_X$**')

listy = notation_asterisk_indices(text)
start, end = listy[0]
test_eq(text[start:end], r'**$\mathcal{O}_X$**')
test_eq(len(listy), 1)

listy = definition_asterisk_indices(text)
print(listy)
test_eq(len(listy), 2)



[(2, 12), (25, 44)]


The following example has a definition which starts and ends with dollar sign `$` characters:

In [None]:
text = r'A **$G$-group over a ring $A$** is'
# The text between the double asterisks is not purely LaTeX math mode text,
# so it is not a notation.
listy = notation_asterisk_indices(text)
test_eq(len(listy), 0)

# It is detected as a definition instead. (A comparison of the form
# `len(listy) >= 0` would hold for every list and so would test nothing.)
listy = definition_asterisk_indices(text)
test_eq(len(listy), 1)
start, end = listy[0]
test_eq(text[start:end], r'**$G$-group over a ring $A$**')

The following example tests `notation_asterisk_indices` for LaTeX str with single asterisks in them:

In [None]:
# Two of the double-asterisk-surrounded parts below are purely LaTeX math mode
# text, and each of these has a single asterisk in it.
text = (r'''The **direct image of a sheaf $F^{\prime}$ on $X_{E}^{\prime}$** '''
        r'''is defined to be **$\pi_{*} F^{\prime}=\pi_{p} F^{\prime}$** and the '''
        r'''**inverse image of a sheaf $F$ on $X_{E}$** is defined to be **$\pi^{*} F=a\left(\pi^{p} F\right)$**.''')
listy = notation_asterisk_indices(text)
test_eq(len(listy), 2)

first_notation = listy[0]
start, end = first_notation
test_eq(text[start:end], r'**$\pi_{*} F^{\prime}=\pi_{p} F^{\prime}$**')

There are pure LaTeX strings which should be considered definitions, but for the purposes of the code here will be considered notations. For example, Hausdorff spaces in topology are also called $T_2$ spaces:

In [None]:
# `**$T_2$**` is purely LaTeX math mode text, so it is reported as a notation.
text = (r"A topological space $X$ is called **$T_2$** if for all $x,y \in X$, "
        r"there exist open neightborhoods $V$ and $W$ around $x$ and $y$ respectively "
        r"such that $V \cap W = 0$.")
listy = notation_asterisk_indices(text)
start, end = listy[0]
test_eq(text[start:end], r'**$T_2$**')

Unfortunately, the current implementations of the above functions do not work correctly if there is a LaTeX string with double asterisks `**` within it.

In [None]:
# TODO: If this is fixed, delete this.
text = r'The double dual of $M$ is denoted by **$M^{**}$**.'
listy = definition_asterisk_indices(text)
print(f"The `definition_asterisk_indices` function detects the substring {text[listy[0][0]:listy[0][1]]} as a definition. This is incorrect!")

listy = notation_asterisk_indices(text)
print(f"The `notation_asterisk_indices` function detects the substring {text[listy[0][0]:listy[0][1]]} as a notation. I guess this is correct.")

The `definition_asterisk_indices` function detects the substring **$M^{** as a definition. This is incorrect!
The `notation_asterisk_indices` function detects the substring **$M^{**}$** as a notation. I guess this is correct.


A workaround is to replace asterisks `*` with the LaTeX `\ast` command:

In [None]:
# With `\ast` in place of `*`, the math mode str no longer has asterisks in it.
text = r'The double dual of $M$ is denoted by **$M^{\ast\ast}$**.'
listy = definition_asterisk_indices(text)
test_eq(len(listy), 0)

listy = notation_asterisk_indices(text)
test_eq(len(listy), 1)
start, end = listy[0]
test_eq(text[start:end], r'**$M^{\ast\ast}$**')

In [None]:
#| export
def defs_and_notats_separations(
        text: str # The str in which to find definitions and notations surrounded by double asterisks.
        )-> list[tuple[int, int, bool]]: # Each tuple is of the form `(start, end, is_notation)`.
    """Find the indices in the text where double asterisks occur and
    categorize whether each occurrence is a definition or a notation.
    
    **Parameters**

    - text - str

    **Returns**

    - list[tuple[int, int, bool]]
        - Each tuple is of the form `(start, end, is_notation)`, where
        `text[start:end]` is the double-asterisk surrounded string,
        including the double asterisks, and `is_notation` is `True`
        exactly when `notation_asterisk_indices` also reports
        `(start, end)`.
    """
    # A set makes each membership test constant time.
    notations = set(notation_asterisk_indices(text))
    return [(start, end, (start, end) in notations)
            for start, end in double_asterisk_indices(text)]

In the following example, the first two double-asterisk-surrounded-strings are definitions, and the third is a notation:

In [None]:
text = r'A **scheme** is a... the **structure sheaf** of a scheme $X$ is the sheaf **$\mathcal{O}_X$**.'
listy = defs_and_notats_separations(text)
# The third entry of each tuple tells whether the part is a notation.
is_notation_flags = [entry[2] for entry in listy]
assert not is_notation_flags[0]
assert not is_notation_flags[1]
assert is_notation_flags[2]

## LaTeX string

In [None]:
#| export
def latex_indices(text: str) -> list[tuple[int, int]]:
    r"""Return the indices in the text containing LaTeX str.
    
    This may not work correctly if the text has a LaTeX
    formatting issue or if any LaTeX string has a dollar sign `\$`.
    
    **Parameters**

    - text - str

    **Returns**

    - list[tuple[int, int]]
        - Each tuple is of the form `(start, end)` where
        `text[start:end]` is a LaTeX string, including any leading and
        trailing dollar signs (`$` or `$$`).
    """
    # Group 1 is the opening delimiter: `$` or `$$`, not preceded by a
    # backslash. `\1` requires the closing delimiter to be the same as the
    # opening one. The closing delimiter is not checked for a preceding
    # backslash, which is why a `\$` inside a LaTeX string ends the match early.
    return find_regex_in_text(text, r'((?<!\\)\$\$?)[^\$]*\1')

Here are some basic uses:

In [None]:
text = r'$$5 \neq 7$$ is a LaTeX equation.'
listy = latex_indices(text)
assert len(listy) == 1
start, end = listy[0]
test_eq(text[start:end], r'$$5 \neq 7$$')

text = r'$\mathcal{O}_X$ denotes the structure sheaf.'
listy = latex_indices(text)
assert len(listy) == 1
start, end = listy[0]
test_eq(text[start:end], r'$\mathcal{O}_X$')

# A LaTeX string spanning several lines. This is not a raw string, so that
# `\n` is an actual newline; the backslash of `\neq` is escaped instead.
text = '$$\n5 \\neq 7\n$$'
listy = latex_indices(text)
test_eq(listy, [(0, len(text))])


If there is a dollar sign symbol `\$` *outside* of a LaTeX string, then the `latex_indices` function works as expected; the dollar signs are not considered to be part of any LaTeX string:

In [None]:
# The leading `\$` must not be detected as the start or end of a LaTeX string.
text = r'\$6.2.4 helo blah $15+6+21$'
listy = latex_indices(text)
first_match = listy[0]
start, end = first_match
test_eq(text[start:end], r'$15+6+21$')

However, the current implementation of the `latex_indices` function does not correctly detect LaTeX strings with dollar sign symbols `\$` in them:

In [None]:
# TODO: delete this example if the issue is fixed.
text = r'\$6.2.4 helo blah $\$37$ are needed for stuff.' 
listy = latex_indices(text)
start, end = listy[0]
assert len(listy) == 1
print(text[listy[0][0]:listy[0][1]])  # This should print `$\$`, which is at the start of `$\$37$`.
# assert text[start:end] == r'$\$37$'

$\$


## Numbers

In [None]:
#| export
def is_number(
        x: Union[float, int, complex, str, None]
        ) -> bool:
    """Return `True` if the input `x` represents a number.
    
    A `float`, `int`, or `complex` always represents a number. A str
    represents a number if it consists of an optional leading minus
    sign followed by digits with at most one decimal point among them,
    e.g. `"3.45"`, `"-7"`, `".5"`. There must be at least one digit, so
    `"."` and `"-"` do not represent numbers. `None` does not represent
    a number.

    This function is different from Python's built-in `str.isnumeric`
    and `str.isdigit` methods, which do not accept a minus sign or a
    decimal point.
    """
    if isinstance(x, (float, int, complex)):
        return True
    # For the case where the string is None
    if x is None:
        return False
    unsigned = x[1:] if x.startswith('-') else x
    # Remove (rather than substitute) at most one decimal point, so that a
    # str without any digit is rejected.
    return unsigned.replace(".", "", 1).isdigit()

In [None]:
# Numeric objects and str's which spell out a number.
for number_like in ("3.45", 1 + 5j, 5, 0.0):
    assert is_number(number_like)

# Str's which do not spell out a number, and `None`.
for non_number in ("3.43.55", "hie", "[^1]", None):
    assert not is_number(non_number)

## Files and folders

#### File existence

In [None]:
#| export
def existing_path(
        path: PathLike,  # A file or directory path. Either absolute or relative to `relative_to`.
        relative_to: Optional[PathLike] = None  # Path to the directory that `path` is relative to.  If `None`, then `path` is an absolute path.  Defaults to `None`
        ) -> Path: # The path formed by `relative_to` adjoined with `path`.
    r"""Return a path relative to a specified path as an absolute path
    that exists.

    **Raises**
    - `ValueError`
        - If `relative_to` is not `None` and is not an absolute path, or
        if `relative_to` is `None` and `path` is not an absolute path.
    - `FileNotFoundError`
        - If `relative_to` is not `None` but does not exist, or if
        `path` does not exist.
    
    **Notes**
    - This function may add the string `\\?\` in front, which identifies
    very long paths.
    """
    if relative_to is not None:
        if not os.path.isabs(relative_to):
            raise ValueError(
                'The parameter `relative_to` is expected to be an'
                f' absolute path, but it is not: {relative_to}')
        if not os.path.exists(relative_to):
            raise FileNotFoundError(
                errno.ENOENT, os.strerror(errno.ENOENT), relative_to)
        path = Path(relative_to) / path
    elif not os.path.isabs(path):
        raise ValueError(
            'The parameter `path` is expected to be an absolute path,'
            f' but it is not: {path}')
    # On Windows, retry with the long path prefix before giving up.
    if not os.path.exists(path) and platform.system() == 'Windows':
        path = f'\\\\?\\{path}'  # For long file names
    if not os.path.exists(path):
        raise FileNotFoundError(
            errno.ENOENT, os.strerror(errno.ENOENT), path)
    return Path(path)


@deprecated(reason="The function has been renamed to `existing_path`")
def file_existence_test(
        path: PathLike,  # A file or directory path. Either absolute or relative to `relative_to`.
        relative_to: Optional[PathLike] = None  # Path to the directory that `path` is relative to.  If `None`, then `path` is an absolute path.  Defaults to `None`
        ) -> Path: # The path formed by `relative_to` adjoined with `path`.
    """Return a path relative to a specified path as an absolute path
    that exists.

    This is the deprecated former name of `existing_path`; the
    parameters, the return value, and the raised exceptions are those of
    `existing_path`.
    """
    # Delegate rather than duplicate, so that the two names cannot drift apart.
    return existing_path(path, relative_to)

In the following example, we `mock` an existing absolute directory path and an existing path relative to it:

In [None]:
with (mock.patch('os.path.exists') as mock_path_exists,
      mock.patch('os.path.isabs') as mock_is_abs):
    # Pretend that every path exists and that every path is absolute.
    mock_path_exists.return_value = True
    mock_is_abs.return_value = True
    path_1 = existing_path('mock_existing_relative_path', 'mock_existing_absolute_path')
    test_eq(Path('mock_existing_absolute_path') / 'mock_existing_relative_path', path_1)


Some basic usage:

If the desired path is very long in Windows, then the prefix `\\?\` may be prepended so that Python can actually find the path, cf. https://stackoverflow.com/questions/36219317/pathname-too-long-to-open:

If the parameter `relative_to` is not `None` and not absolute, then a `ValueError` is raised:

If the parameter `relative_to` is `None` and the parameter `path` is not absolute, then a `ValueError` is raised:

If `relative_to` does not exist or if `path` does not exist, then a `FileNotFoundError` is raised: