# helper

> Helper functions 

In [None]:
#| default_exp helper

In [None]:
#| export
from __future__ import annotations
from collections import OrderedDict
from collections.abc import Iterable
import datetime
from datetime import timezone
import errno
import glob
from graphlib import TopologicalSorter
from itertools import product
import os
from os import PathLike
from pathlib import Path
from pathvalidate import sanitize_filename
import platform
import random
import re
from typing import Callable, Optional, Pattern, Sequence, Union

import bs4
from bs4 import BeautifulSoup
from deprecated import deprecated
from natsort import natsorted

In [None]:
import operator
from unittest import mock

from pathvalidate import validate_filename

from string import ascii_uppercase
from fastcore.test import *
from nbdev import show_doc

## Tests

In [None]:
#| export
def _test_directory() -> Path:
    """Returns the `nbs/_tests` directory of the `trouver` repository.
    
    Assumes that the current working directory is either the root of the
    repository or the `nbs` folder and the `nbs/_tests` folder exists in the
    repository but the root of the repository does not have a folder named
    `_tests`.
    """
    cwd = os.getcwd()
    cwd_name = os.path.basename(cwd)
    if cwd_name == 'nbs':
        return Path(cwd) / '_tests'
    else:
        return Path(cwd) / 'nbs'/ '_tests'

In [None]:
show_doc(_test_directory)

---

[source](https://github.com/hyunjongkimmath/trouver/blob/main/trouver/helper.py#L41){target="_blank" style="float:right; font-size:smaller"}

### _test_directory

>      _test_directory ()

Returns the `nbs/_tests` directory of the `trouver` repository.

Assumes that the current working directory is either the root of the
repository or the `nbs` folder and the `nbs/_tests` folder exists in the
repository but the root of the repository does not have a folder named
`_tests`.

Test files for `trouver` are contained in `nbs/_tests` directory within the repository. The hidden `_test_directory()` method returns this directory.

In [None]:
assert os.path.exists(_test_directory())

In [None]:
os.listdir(_test_directory())

['empty_model_vault',
 'latex_examples',
 'ml_examples',
 'test_vault_1',
 'test_vault_10',
 'test_vault_2',
 'test_vault_3',
 'test_vault_4',
 'test_vault_5',
 'test_vault_6',
 'test_vault_7',
 'test_vault_8',
 'test_vault_9']

## regex

In [None]:
#| export
def find_regex_in_text(
        text: str, # Text in which to find the regex pattern
        pattern: str | Pattern[str] # The regex pattern, given either as a str or as a compiled pattern
        ) -> list[tuple[int, int]]: # Each tuple is of the form `(a,b)` where `text[a:b]` is the regex match.
    """Return the `(start, end)` spans in `text` at which `pattern` matches.

    The matches are those produced by `re.finditer`, so they do not overlap
    and are listed in the order in which they are found in `text`.
    """
    # TODO: rename into regex_indices_in_text
    # TODO: swap parameters.
    spans = []
    for found in re.finditer(pattern, text):
        spans.append(found.span())
    return spans

The following example finds the occurrence of the Markdown footnote:

In [None]:
regex_pattern = r'\[\^\d\]'
text = '[^1]: asdf'

output = find_regex_in_text(text, regex_pattern)
test_eq(output, [(0,4)])

start, end = output[0]
test_eq(text[start:end], '[^1]')

If there are multiple matches for the regex pattern, then they are all included in the outputted list.

In [None]:
regex_pattern = r'\d+'  # Searches for one or more consecutive digits
text = '9000 is a big number. But you know what is bigger? 9001.'

output = find_regex_in_text(text, regex_pattern)
test_eq(len(output), 2)

start, end = output[0]
test_eq(text[start:end], '9000')

start, end = output[1]
test_eq(text[start:end], '9001')

The following example detects YAML frontmatter text as used in Obsidian. This regex pattern is also used in `markdown.markdown.file.find_front_matter_meta_in_markdown_text`.


The regex pattern used is able to detect the frontmatter even when it is empty.

In [None]:
sample_regex = r'---\n([\S\s]*?)?(?(1)\n|)---'
sample_str = '---\n---'
sample_output = find_regex_in_text(sample_str, sample_regex)
assert sample_output == [(0,7)]

sample_str = '---\naliases: [this_is_an_aliases_for_the_Obsidian_note]\n---'
sample_output = find_regex_in_text(sample_str, sample_regex)
assert sample_output == [(0, len(sample_str))]  # The entire sample_str is detected.


Contrast the regex pattern above with the pattern `---\n[\S\s]*?\n---`, which does not detect empty YAML frontmatter text.

In [None]:
# A raw string is used so that `\S` and `\s` reach the regex engine without
# relying on Python passing unknown escape sequences through (which emits a
# warning on recent Python versions). The regex is equivalent to the one above.
sample_regex = r'---\n[\S\s]*?\n---'
sample_str = '---\n---'
sample_output = find_regex_in_text(sample_str, sample_regex)
assert not sample_output

In [None]:
#| export
def separate_indices_from_str(
        text: str, # The str to divide
        indices: list[tuple[int, int]] # The indices for substrings in `text` to separate.
        ) -> list[str]: # Each str is a substring of `text`, either a substring of `text` specified by `indices`, or substrings in between the substrings specified by `indices`.
    """Cut `text` into consecutive pieces at the ranges given by `indices`.

    The output alternates between the text lying outside of the ranges and
    the text inside of a range, starting with the (possibly empty) text
    before the first range and ending with the (possibly empty) text after
    the last range. If `indices` is empty, then the output is `[text]`.

    Assumes that the pairs in `indices` are ordered from first to last and
    that the ranges that they specify are pairwise disjoint. Under that
    assumption, `''.join(output)` recovers `text`.
    """
    if not indices:
        return [text]
    pieces = []
    cursor = 0  # Where the not-yet-emitted portion of `text` begins.
    for pair in indices:
        pieces.append(text[cursor:pair[0]])   # text between ranges
        pieces.append(text[pair[0]:pair[1]])  # text of the range itself
        cursor = pair[1]
    pieces.append(text[cursor:])
    return pieces

Here is a basic example of `separate_indices_from_str`:

In [None]:
text = 'hello asdf asdf'
sample_output = separate_indices_from_str(text, [(0,5), (10,11)])
print(sample_output)
test_eq(''.join(sample_output), text)


In [None]:
#| export
def replace_string_by_indices(
        string: str, # String in which to make replacements
        replace_ranges: Sequence[Union[Sequence[int], int]], # Either a list of lists/tuples of one or two int's, or a single list/tuple of one or two int's.
        replace_with: Sequence[str] | str # The str(s) which will replace the substrings at `replace_ranges` in `string`. `replace_with` must be a str exactly when `replace_ranges` is a single Sequence of int.
        ) -> str:  # The str obtained by replacing the substrings at `replace_ranges` in `string` by the strs specified by `replace_with`.
    """Replace parts of ``string`` at the specified locations.

    Use this with `find_regex_in_text`.

    **Parameters**

    - ``string`` - `str`
    - ``replace_ranges`` - `Sequence[Sequence[int] | int]`
        - A range ``[a,b]`` or ``(a,b)`` means that ``string[a:b]`` is to be
        replaced. A one-element range ``[a]`` or ``(a,)`` means that
        ``string[a:]`` is to be replaced. The ranges should not overlap and
        should be arranged in increasing order.
    - ``replace_with`` - `Sequence[str] | str`
        - The str's which will replace the parts represented by
        ``replace_ranges``. If ``replace_with`` is a single str, then
        ``replace_ranges`` is interpreted as a single range. Otherwise,
        both must be sequences of the same length.

    **Returns**

    - str

    **Raises**

    - ValueError
        - If ``replace_ranges`` and ``replace_with`` have different lengths
        (after a single str/range pair has been wrapped into lists).
    """
    if isinstance(replace_with, str):
        # Normalize the "single range, single str" form into the list form.
        replace_ranges, replace_with = [replace_ranges], [replace_with]
    if len(replace_ranges) != len(replace_with):
        raise ValueError(
            'The lengths of `replace_ranges` and `replace_with` are different.')
    if not len(replace_ranges):
        # Nothing to replace.
        return string
    return "".join(_str_parts(string, replace_ranges, replace_with))


def _str_parts(string, replace_ranges, replace_with):
    """Divide `string` into parts divided outside of `replace_ranges`
    and with `replace_with` inserted."""
    str_parts = []
    for i, replace_string in enumerate(replace_ranges):
        replace_string = replace_with[i]
        if i > 0 and len(replace_ranges[i-1]) == 1:
            unreplaced_start_index = len(string)
        elif i > 0 and len(replace_ranges[i-1]) != 1:
            unreplaced_start_index = replace_ranges[i-1][1]
        else:
            unreplaced_start_index = 0
        unreplaced_end_index = replace_ranges[i][0]
        str_parts.append(string[unreplaced_start_index:unreplaced_end_index])
        str_parts.append(replace_string)
    # Add the last (unreplaced) part to str_parts.
    if len(replace_ranges[-1]) == 1:
        unreplaced_start_index = len(string)
    else:
        unreplaced_start_index = replace_ranges[-1][1]
    str_parts.append(string[unreplaced_start_index:])
    return str_parts

The following are basic examples of `replace_string_by_indices`:

In [None]:
test_eq(replace_string_by_indices('hello world', replace_ranges=(0,5), replace_with='hi'), 'hi world')
test_eq(replace_string_by_indices('hello somebody', replace_ranges=[(0,1), (6,10)], replace_with=['', '']), 'ello body')

If `replace_ranges` and `replace_with` are of different lengths, then a `ValueError` is raised:

In [None]:
with ExceptionExpected(ex=ValueError, regex="are different"):
    replace_string_by_indices('hello world', replace_ranges = [(0,5), (6,10)], replace_with = [''])

#### Finding LaTeX string

In [None]:
#| export
def latex_indices(
        text: str,
        ) -> list[tuple[int, int]]:
    r"""Return the indices in `text` of the LaTeX math mode strings.

    Both `$$...$$` and `$...$` strings are detected, and a math mode string
    may span multiple lines. At any given position, the `$$...$$` form is
    tried before the `$...$` form.

    A dollar sign that is immediately preceded by a backslash (`\$`) is
    never treated as a delimiter, whether it occurs inside or outside of
    math mode. Note that this also applies to a dollar sign following an
    escaped backslash (`\\$`). The output may be incorrect if `text` has a
    LaTeX formatting issue, e.g. unbalanced dollar signs.

    **Parameters**

    - text - str

    **Returns**

    - list[tuple[int, int]]
        - Each tuple is of the form `(start, end)` where
        `text[start:end]` is a LaTeX string, including the leading and
        trailing dollar signs (`$` or `$$`).
    """
    # `(?<!\\)` rejects dollar signs escaped by a backslash; `re.DOTALL`
    # lets `.` match newlines so that multi-line math mode is detected.
    pattern = re.compile(
        r"(?<!\\)\$\$.*?(?<!\\)\$\$|(?<!\\)\$.*?(?<!\\)\$", re.DOTALL)
    return find_regex_in_text(text, pattern)


def inline_latex_indices(
        text: str,
        ) -> list[tuple[int, int]]:
    r"""Return the indices in `text` of the LaTeX math mode strings that are
    surrounded by `$$`.

    Math mode strings surrounded by single dollar signs `$` are not
    reported. A math mode string may span multiple lines.

    A dollar sign that is immediately preceded by a backslash (`\$`) is
    never treated as a delimiter. Note that this also applies to a dollar
    sign following an escaped backslash (`\\$`). The output may be
    incorrect if `text` has a LaTeX formatting issue, e.g. unbalanced
    dollar signs.

    **Parameters**

    - text - str

    **Returns**

    - list[tuple[int, int]]
        - Each tuple is of the form `(start, end)` where
        `text[start:end]` is a LaTeX string, including the leading and
        trailing dollar signs (`$$`).
    """
    # `(?<!\\)` rejects dollar signs escaped by a backslash; `re.DOTALL`
    # lets `.` match newlines so that multi-line math mode is detected.
    pattern = re.compile(r"(?<!\\)\$\$.*?(?<!\\)\$\$", re.DOTALL)
    return find_regex_in_text(text, pattern)

# def math_mode_str_in_text(
#         text: str # The str in which to find the latex math mode str.
#         ) -> list[tuple[int, int]]: # Each tuple is of the form `(start,end)`, where `text[start:end]` is a part in `text` with LaTeX math mode text.
#     """
#     Return the indices of the math mode text.
#     """
#     pattern = re.compile(r"(?<!\\)\$\$.*?(?<!\\)\$\$|(?<!\\)\$.*?(?<!\\)\$", re.DOTALL)
#     return find_regex_in_text(text, pattern)

Here are some basic uses of the `latex_indices` function:

In [None]:
text = r'$$5 \neq 7$$ is a LaTeX equation.'
listy = latex_indices(text)
assert len(listy) == 1
start, end = listy[0]
test_eq(text[start:end], r'$$5 \neq 7$$')

text = r'$\mathcal{O}_X$ denotes the structure sheaf.'
listy = latex_indices(text)
assert len(listy) == 1
start, end = listy[0]
test_eq(text[start:end], r'$\mathcal{O}_X$')

text = r'$$\n5 \neq 7\n$$'
listy = latex_indices(text)
assert len(listy) == 1


If there is a dollar sign symbol `\$` *outside* of a LaTeX string, then the `latex_indices` function works as expected; the dollar signs are not considered to be part of any LaTeX string:

In [None]:
text = r'\$6.2.4 helo blah $15+6+21$'  # Avoid detecting \$ as latex start/end
listy = latex_indices(text)
start, end = listy[0]
test_eq(text[start:end], r'$15+6+21$')

In the following example, the text has two dollar sign symbols `\$`, one outside of a math mode string and one inside of a math mode string. Neither is treated as a math mode delimiter:

In [None]:
text = r'\$6.2.4 helo blah $\$37$ are needed for stuff.' 
listy = latex_indices(text)
start, end = listy[0]
test_eq(len(listy), 1)
print(text[listy[0][0]:listy[0][1]])  # This prints the entire math mode string `$\$37$`; the escaped `\$` inside of it is not treated as a delimiter.
test_eq(text[start:end], r'$\$37$')

$\$37$


In [None]:
#| export

In the following example, note that `\$S.10` is (correctly) not recognized as a LaTeX math mode string. Moreover, multi-line math mode strings are also recognized.

In [None]:
text = """
\$S.10 We have some latex string $a$ $hi$

$$
asdf
$$
"""
latex_indices(text)

[(34, 37), (38, 42), (44, 54)]

In [None]:
print(text[34:37])
print(text[38:42])
print(text[44:54])

$a$
$hi$
$$
asdf
$$


In [None]:
#| hide
sample_output = latex_indices(text)
expected_substrings = ['$a$', '$hi$', '$$\nasdf\n$$']
for i in range(3):
    expected_substring = expected_substrings[i]
    start, end = sample_output[i]
    test_eq(text[start:end], expected_substring)


The `inline_latex_indices` function finds the indices only for in-line LaTeX math mode strings (which are surrounded by `$$`)

In [None]:
text = """
\$S.10 We have some latex string $a$ $hi$

$$
asdf
$$
"""
inline_latex_indices(text)

[(44, 54)]

In [None]:
print(text[44:54])

$$
asdf
$$


In [None]:
#| hide

sample_output = inline_latex_indices(text)
start, end = sample_output[0]
test_eq(text[start:end], '$$\nasdf\n$$')

## HTML

#### Consolidating special characters that are changed with the `__str__` function of `bs4.element.Tags` objects

In [None]:
#| export
def html_tag_str(
        html_tag: bs4.element.Tag
        ) -> str:
    """
    Return `str(html_tag)` with the entities `&lt;`, `&gt;`, and `&amp;`
    turned back into the characters `<`, `>`, and `&`.

    This undoes the changes that `bs4` makes to these special characters
    when a tag is converted into a str.
    """
    rendered = str(html_tag)
    # `&amp;` is undone last so that a doubly escaped entity such as
    # `&amp;lt;` is unescaped only once (into `&lt;`).
    return (rendered
            .replace('&lt;', '<')
            .replace('&gt;', '>')
            .replace('&amp;', '&'))

When using the `__str__` function of `bs4.element.Tag` objects, special characters such as `<`, `>` and `&` change into `&lt;`, `&gt;` and `&amp;`, etc. The `html_tag_str` function makes it so that these characters are changed back.

In [None]:
soup = BeautifulSoup('', 'html.parser')
tag = soup.new_tag('span')
tag.string = '&hi<'
test_eq(html_tag_str(tag), '<span>&hi<</span>')

#### Handling less than `<` symbols in latex math mode strings

`BeautifulSoup`'s `html.parser` parses less than `<` symbols without a following space as the beginning of an HTML tag, even when the symbol `<` is used within a LaTeX math mode string. To get around this, we detect when this happens and add a space after these symbols.

In [None]:
#| export
def find_lt_symbols_without_space_in_math_mode(
        text: str
        ) -> list[int]: # The indices in `text` of the less than `<` symbols that are in math mode and are not followed by a space.
    """
    Return the indices in `text` of the less than `<` symbols which are
    within LaTeX math mode strings and which are not immediately followed
    by a space.

    The math mode strings are those found by `latex_indices`.
    """
    lt_not_before_space = re.compile(r'<(?! )')
    return [
        # Matches are located relative to the math mode string, so they are
        # shifted by where that string starts in `text`.
        latex_start + relative_ind
        for latex_start, latex_end in latex_indices(text)
        for relative_ind, _ in find_regex_in_text(
            text[latex_start:latex_end], lt_not_before_space)
    ]


In the following example, there are a few math mode strings with less than `<` symbols. Some of these symbols are followed by spaces and others are not.

In [None]:
text = r"""
here is a math mode $a<b$. Here is another $a< b$.
Here is an in-line one:

$$ asdf <cbba$$

Here is another:

$$
asdf < basdf
$$
"""
output = find_lt_symbols_without_space_in_math_mode(text)
print(output)
test_eq(len(output), 2)
test_eq(text[output[0] + 1], 'b')
test_eq(text[output[1] + 1], 'c')

[23, 85]


In [None]:
text_2 = r"""
<b>Now there is an HTML tag</b>. But it shouldn't be detected
because the tag is not within math mode text.
But this inequality is: $a <d$
"""
output = find_lt_symbols_without_space_in_math_mode(text_2)
print(output)
test_eq(len(output), 1)
test_eq(text_2[output[0] + 1], 'd')

[136]


In [None]:
#| export
def add_space_to_lt_symbols_without_space(
        text: str
        ) -> str:
    """Return `text` with a space added after each math mode less than `<`
    symbol that is not already followed by a space.

    Only the symbols found by `find_lt_symbols_without_space_in_math_mode`
    are modified; `<` symbols outside of math mode are left alone.
    """
    ranges = []
    replacements = []
    for lt_ind in find_lt_symbols_without_space_in_math_mode(text):
        # Each `<` is a one-character range which gets replaced by `< `.
        ranges.append((lt_ind, lt_ind + 1))
        replacements.append('< ')
    return replace_string_by_indices(
        text, replace_ranges=ranges, replace_with=replacements)
    

Let us again use `text` from the example for the `find_lt_symbols_without_space_in_math_mode` function:

In [None]:
print(add_space_to_lt_symbols_without_space(text))
assert not find_lt_symbols_without_space_in_math_mode(add_space_to_lt_symbols_without_space(text))


here is a math mode $a< b$. Here is another $a< b$.
Here is an in-line one:

$$ asdf < cbba$$

Here is another:

$$
asdf < basdf
$$



#### Removing HTML tags in a text and obtaining the data of the tags.

In [None]:
#| export
def remove_html_tags_in_text(
        text: str, # The text in which to remove the HTML tags.
        replace_with_attributes: Optional[Union[str, list[str]]] = None, # Attribute(s) within the HTML tags which should be used to replace the text of the tags. If `None`, then the texts are not replaced with the attributes. If multiple attributes are specified, then only one attribute is used to replace the text for each HTML tag (independently at random of other replacements). Each attribute's text has an equal chance of being selected for replacement. Repeats are ignored.
        definitely_replace: bool = False, # If `True` and if a given HTML tag has an attribute specified in `replace_with_attributes`, then the text for that tag will definitely be replaced by the text of one of the attributes. Otherwise, the original text and each attribute's text have an equal chance of being selected.
        seed: int = None # Random seed 
        ) -> tuple[str, list[tuple[bs4.element.Tag, int, int]]]: # The text `removed` without HTML tags and a list whose elements consist of the removed HTML tags and the starting and ending indices of the text corresponding to the removed tags within `removed`.
    """Remove the HTML tags in `text`.

    `text` is parsed with `bs4`'s `html.parser` and each tag at the top
    level of the parsed document is replaced by plain text: either the
    text of the tag or the value of one of its attributes, as governed by
    `replace_with_attributes` and `definitely_replace`.

    HTML tags are assumed to be not nested.

    Note that this function seeds Python's global `random` module with
    `seed`.
    """
    random.seed(seed)
    soup = BeautifulSoup(text, 'html.parser')
    attributes = _init_replace_with_attributes(replace_with_attributes)

    removed_tags = []
    cursor = 0  # Running position within the text that is being produced.
    for node in soup.contents:
        cursor = _process_content(
            soup, attributes, definitely_replace, node, cursor, removed_tags)
    return html_tag_str(soup), removed_tags


def _init_replace_with_attributes(
        replace_with_attributes: Optional[Union[str, list[str]]]
        ) -> set[str]:
    if replace_with_attributes is None:
        replace_with_attributes = []
    elif isinstance(replace_with_attributes, str):
        replace_with_attributes = [replace_with_attributes]
    return set(replace_with_attributes)


def _select_replacement_text(
        content: bs4.element.Tag,
        replace_with_attributes: set[str],
        definitely_replace: bool) -> str:
    if not replace_with_attributes:
        return content.string
    selection_pool = []    
    if not definitely_replace:
        selection_pool.append(content.string)
    for attribute, value in content.attrs.items():
        if attribute not in replace_with_attributes:
            continue
        selection_pool.append(value)
    return random.choice(selection_pool)


def _process_content(
        parsed_soup: BeautifulSoup,
        replace_with_attributes: set[str],
        definitely_replace: bool,
        content,
        position: int,
        replaced_contents: list) -> int:
    """Process one node of `parsed_soup` and return the updated position.

    If `content` is a tag, then it is replaced within `parsed_soup` by a
    plain string chosen by `_select_replacement_text`, and the tuple
    `(tag, start, end)` is appended to `replaced_contents`, where `start`
    is `position` and `end` is `position` plus the length of the
    replacement text. If `content` is not a tag, then nothing is modified.

    In either case, the output is `position` advanced by the length of the
    text which `content` contributes.
    """
    if isinstance(content, bs4.element.Tag):
        replacement_text = _select_replacement_text(
            content, replace_with_attributes, definitely_replace)
        # `replace_with` swaps the tag out of the soup and hands back the
        # tag that was removed.
        removed_tag = content.replace_with(
            parsed_soup.new_string(replacement_text))
        end_position = position + len(replacement_text)
        replaced_contents.append((removed_tag, position, end_position))
        return end_position
    return position + len(content)
    

The `remove_html_tags_in_text` function removes HTML tags, preserving the underlying text by default.

In [None]:
html = 'Let $K$ be a field. An <b definition="Abelian variety over a field">Abelian variety over $K$</b> is a variety that'
text_without_html_tags, removed_tags = remove_html_tags_in_text(html)
print(text_without_html_tags)

test_eq(text_without_html_tags, 'Let $K$ be a field. An Abelian variety over $K$ is a variety that')

Let $K$ be a field. An Abelian variety over $K$ is a variety that


In the following example, there is a less than `<` symbol, which is definitely not the opening of an HTML tag. The following verifies that the placeholder `&lt;` is not used to replace the less than symbol, which is what `bs4.BeautifulSoup`'s `html.parser` does.

In [None]:
text = 'Hello, this has a less than symbol: $a< b$'
text, html_tags = remove_html_tags_in_text(text)
assert not html_tags
assert '< ' in text
# Check the returned text (not the empty list of removed tags) for the
# `&lt;` placeholder.
assert '&lt;' not in text


The same applies to the greater than `>` symbol and to the `&` symbol:

In [None]:
text = 'Hello, this has a greater than symbol: $a>b$'
text, html_tags = remove_html_tags_in_text(text)
assert not html_tags
assert '>' in text
# Check the returned text (not the empty list of removed tags) for the
# `&gt;` placeholder.
assert '&gt;' not in text

text = r'Hello $$ f &= 3 \\ g &= 5'
text, html_tags = remove_html_tags_in_text(text)
assert not html_tags
assert '&' in text
assert '&amp;' not in text



The `remove_html_tags_in_text` function additionally returns a list with information about the tags that are removed. Each item in this list is a tuple `(tag, start, end)`, where `tag` is the tag that has been removed, and `start` and `end` are the indices within the string output `text_without_html_tags` of the function at which the text replacing the tag can be found.

In the example above (continued below), there is exactly one tag that is removed.

In [None]:
print(removed_tags)
removed_tag, start, end = removed_tags[0]
print(text_without_html_tags[start:end])

test_eq(text_without_html_tags[start:end], 'Abelian variety over $K$')

[(<b definition="Abelian variety over a field">Abelian variety over $K$</b>, 23, 47)]
Abelian variety over $K$


In [None]:
#| hide
html = '<span>Santa</span> want to eat some popcorn for once. He does not want <div>milk and cookies today</div>. \n And <a href="">this is a link.</a>'
text_without_html_tags, removed_tags = remove_html_tags_in_text(html)

test_eq(text_without_html_tags, 'Santa want to eat some popcorn for once. He does not want milk and cookies today. \n And this is a link.')
test_eq(len(removed_tags), 3)

removed_tag, start, end = removed_tags[0]
test_eq(text_without_html_tags[start:end], 'Santa')

removed_tag, start, end = removed_tags[1]
test_eq(text_without_html_tags[start:end], 'milk and cookies today')

removed_tag, start, end = removed_tags[2]
test_eq(text_without_html_tags[start:end], 'this is a link.')

The `remove_html_tags_in_text` function can be used to replace the underlying text of HTML tags with specified attribute values.

In the below example, the text has a tag which contains a `typo` attribute. Passing `'typo'` to the `replace_with_attributes` parameter and passing `True` to the `definitely_replace` parameter guarantees that the value of the `typo` attribute is used instead of the text of the tag.

In [None]:
html = r'The following tag fixes a typo and simultaneously keeps around the data of that typo: <span typo="$\operatorname{Gul}(K)$">$\operatorname{Gal}(K)$</span>'
text_without_html_tags, removed_tags = remove_html_tags_in_text(html, replace_with_attributes='typo', definitely_replace=True)
print(text_without_html_tags)

test_eq(text_without_html_tags, 'The following tag fixes a typo and simultaneously keeps around the data of that typo: $\\operatorname{Gul}(K)$')

removed_tag, start, end = removed_tags[0]
test_eq(text_without_html_tags[start:end], '$\\operatorname{Gul}(K)$')


The following tag fixes a typo and simultaneously keeps around the data of that typo: $\operatorname{Gul}(K)$


If the `definitely_replace` parameter is `False` (which it is by default), then the original text might be preserved or it might be replaced.

In [None]:
html = r'<span typo="$\operatorname{Gul}(K)$">$\operatorname{Gal}(K)$</span>'
possible_outputs = [
    r'$\operatorname{Gal}(K)$',
    r'$\operatorname{Gul}(K)$'
]
output, _ = remove_html_tags_in_text(html, replace_with_attributes='typo', definitely_replace=False)
assert output in possible_outputs


#### Adding HTML tag data

On the other hand, we may also need to add HTML tag data to a text.

In [None]:
#| export
def add_HTML_tag_data_to_raw_text(
        text: str, # The text onto which to add HTML tags. This is assumed to contain no HTML tags.
        tags_and_locations: list[tuple[bs4.element.Tag, int, int]] # Each tuple consists of the tag object to add as well as the start and end indices within `text` of the range that the tag is to replace. The ranges specified by the tuples are assumed to not overlap with one another.
        ) -> str: # The modification of `text` in which the tags are added at the specified locations; the ranges in `text` are replaced.
    """
    Replace the specified ranges in `text` with the specified HTML tags.

    The entries of `tags_and_locations` may be given in any order; they are
    sorted by starting index before the replacements are made. Each tag is
    rendered via `html_tag_str`.

    See the `add_HTML_tag_data_to_text` function for adding HTML
    tag data to text that may or may not already have HTML tags.
    """
    ranges = []
    rendered_tags = []
    # `replace_string_by_indices` expects the ranges in increasing order.
    for html_tag, start, end in sorted(
            tags_and_locations, key=lambda entry: entry[1]):
        ranges.append((start, end))
        rendered_tags.append(html_tag_str(html_tag))
    return replace_string_by_indices(text, ranges, rendered_tags)

In [None]:
text = "Now this will have an HTML tag. This will also have an HTML tag too!"
tags_and_locations = [
    (BeautifulSoup('<span some_attr="hi">this</span>', 'html.parser'), 4,8),
    (BeautifulSoup('<div some_attr="hi">This</div>', 'html.parser'), 32,36)
]
output = add_HTML_tag_data_to_raw_text(text, tags_and_locations)
print(output)
test_eq(output, 'Now <span some_attr="hi">this</span> will have an HTML tag. <div some_attr="hi">This</div> will also have an HTML tag too!')

Now <span some_attr="hi">this</span> will have an HTML tag. <div some_attr="hi">This</div> will also have an HTML tag too!


Now let us look at the same example, with the order in `tags_and_locations` reversed.

In [None]:
text = "Now this will have an HTML tag. This will also have an HTML tag too!"
tags_and_locations = [
    (BeautifulSoup('<div some_attr="hi">This</div>', 'html.parser'), 32,36),
    (BeautifulSoup('<span some_attr="hi">this</span>', 'html.parser'), 4,8)
]
output = add_HTML_tag_data_to_raw_text(text, tags_and_locations)
print(output)
test_eq(output, 'Now <span some_attr="hi">this</span> will have an HTML tag. <div some_attr="hi">This</div> will also have an HTML tag too!')

Now <span some_attr="hi">this</span> will have an HTML tag. <div some_attr="hi">This</div> will also have an HTML tag too!


## Definitions and notations

I surround definitions and notations by double asterisks `**`. The `double_asterisk_indices` method finds strings surrounded by double asterisks, the `notation_asterisk_indices` method finds notation str, and the `definition_asterisk_indices` method finds definition str.

In [None]:
#| export
def double_asterisk_indices(
        text: str # the str in which to find the indices of double asterisk surrounded text.
        ) -> list[tuple[int, int]]: # Each tuple is of the form `(start,end)`, where `text[start:end]` is a part in `text` with double asterisks, including the double asterisks.
    # TODO: fix double asterisks in math mode.
    """Return the indices in `text` of text surrounded by double asterisks.

    The text between the double asterisks must be nonempty and must not
    contain any asterisk. In particular, this assumes that no LaTeX math
    mode string has asterisks within it.

    **See Also**
    
    - `notation_asterisk_indices`
    - `definition_asterisk_indices`
    """
    # A raw string keeps `\*` as a regex escape rather than an (invalid)
    # Python string escape.
    return find_regex_in_text(text, pattern=r'\*\*[^*]+\*\*')



In [None]:
# |hide
# Test double asterisks in math mode.
text = r"**$M^{**}$**"  # I would like this to get the entire string, but currently, this is not the case.
double_asterisk_indices(text)

[(0, 8)]

In [None]:
#| export
def notation_asterisk_indices(
        text: str # the str in which to find the indices of notations surrounded by double asterisks.
        ) -> list[tuple[int, int]]: # Each tuple is of the form `(start,end)`, where `text[start:end]` is a part in `text` with LaTeX math mode text with double asterisks, including the double asterisks.
    """Return the indices of notation text surrounded by double asterisks.
    
    A double-asterisk-surrounded-text is a notation almost always
    when it is purely LaTeX math mode text, i.e. it is of the form
    `**$...$**` or `**$$...$$**`.

    Assumes that no LaTeX math mode string has the dollar sign character
    within it.
    """
    # The math mode text is matched by `[^$]+` rather than `[^*$]+`: the
    # latter was used previously, but it did not pick up notation LaTeX str
    # containing asterisks, e.g. `**$\pi^*$**`, `**$\pi_*$**`.
    # Raw strings keep `\*` and `\$` as regex escapes rather than (invalid)
    # Python string escapes.
    return find_regex_in_text(
        text, pattern=r'\*\*\$\$[^$]+\$\$\*\*|\*\*\$[^$]+\$\*\*')


def definition_asterisk_indices(
        text: str # The str in which to find the indices of the definitions surrounded by double asterisks.
        ) -> list[tuple[int, int]]: # Each tuple is of the form `(start,end)`, where `text[start:end]` is a substring in `text` surrounded by double asterisks, including the double asterisks.
    """Return the indices of definition text surrounded by double asterisks.

    The output consists of the ranges found by `double_asterisk_indices`
    which are not also found by `notation_asterisk_indices`; a
    double-asterisk-surrounded-text is a definition almost always when it
    is not purely LaTeX math mode text.

    Assumes that no LaTeX math mode string has double asterisks and that no
    LaTeX math mode string has the dollar sign character within it.
    """
    double_asterisk_spans = double_asterisk_indices(text)
    notation_spans = notation_asterisk_indices(text)
    definition_spans = []
    for span in double_asterisk_spans:
        if span in notation_spans:
            continue  # Purely math mode text, so a notation.
        definition_spans.append(span)
    return definition_spans

#### Examples

In the following example, `scheme` and `structure sheaf` are definitions, whereas `$\mathcal{O}_X$` is a notation:

In [None]:
text = r'A **scheme** is a... the **structure sheaf** of a scheme $X$ is the sheaf **$\mathcal{O}_X$**.'
listy = double_asterisk_indices(text)

start, end = listy[0]
test_eq(text[start:end], '**scheme**')

start, end = listy[1]
test_eq(text[start:end], '**structure sheaf**')

# Raw strings are used below so that `\m` is not read as an (invalid)
# Python string escape.
start, end = listy[2]
test_eq(text[start:end], r'**$\mathcal{O}_X$**')

listy = notation_asterisk_indices(text)
start, end = listy[0]
test_eq(text[start:end], r'**$\mathcal{O}_X$**')
test_eq(len(listy), 1)

listy = definition_asterisk_indices(text)
print(listy)
test_eq(len(listy), 2)



[(2, 12), (25, 44)]


The following example has a definition which starts and ends with dollar sign `$` characters:

In [None]:
text = r'A **$G$-group over a ring $A$** is'
listy = notation_asterisk_indices(text)
test_eq(len(listy), 0)

# The double-asterisk-surrounded text starts and ends with `$`, but it is
# not purely math mode text, so it is detected as a definition.
listy = definition_asterisk_indices(text)
test_eq(len(listy), 1)
start, end = listy[0]
test_eq(text[start:end], r'**$G$-group over a ring $A$**')

The following example tests `notation_asterisk_indices` for LaTeX str with single asterisks in them:

In [None]:
# Tests LaTeX str with asterisks in them:
text = (r'''The **direct image of a sheaf $F^{\prime}$ on $X_{E}^{\prime}$** '''
        r'''is defined to be **$\pi_{*} F^{\prime}=\pi_{p} F^{\prime}$** and the '''
        r'''**inverse image of a sheaf $F$ on $X_{E}$** is defined to be **$\pi^{*} F=a\left(\pi^{p} F\right)$**.''')
listy = notation_asterisk_indices(text)
test_eq(len(listy), 2)
start, end = listy[0]
test_eq(text[start:end], r'**$\pi_{*} F^{\prime}=\pi_{p} F^{\prime}$**')

There are pure LaTeX strings which should be considered definitions, but for the purposes of the code here will be considered notations. For example, Hausdorff spaces in topology are also called $T_2$ spaces:

In [None]:
text = (r"A topological space $X$ is called **$T_2$** if for all $x,y \in X$, "
        r"there exist open neightborhoods $V$ and $W$ around $x$ and $y$ respectively "
        r"such that $V \cap W = 0$.")
listy = notation_asterisk_indices(text)
test_eq(text[listy[0][0]:listy[0][1]], r'**$T_2$**')

Unfortunately, the current implementations of the above functions do not work correctly if there are LaTeX strings with double asterisks `**` within them.

In [None]:
# TODO: If this is fixed, delete this.
text = r'The double dual of $M$ is denoted by **$M^{**}$**.'
listy = definition_asterisk_indices(text)
print(f"The `definition_asterisk_indices` function detects the substring {text[listy[0][0]:listy[0][1]]} as a definition. This is incorrect!")

listy = notation_asterisk_indices(text)
print(f"The `notation_asterisk_indices` function detects the substring {text[listy[0][0]:listy[0][1]]} as a notation. I guess this is correct.")

The `definition_asterisk_indices` function detects the substring **$M^{** as a definition. This is incorrect!
The `notation_asterisk_indices` function detects the substring **$M^{**}$** as a notation. I guess this is correct.


A workaround is to replace asterisks `*` with the LaTeX `\ast` command:

In [None]:
text = r'The double dual of $M$ is denoted by **$M^{\ast\ast}$**.'
listy = definition_asterisk_indices(text)
test_eq(len(listy), 0)

listy = notation_asterisk_indices(text)
test_eq(len(listy), 1)
test_eq(text[listy[0][0]:listy[0][1]], r'**$M^{\ast\ast}$**')

In [None]:
#| export
def defs_and_notats_separations(
        text: str
        )-> list[tuple[int, int, bool]]:
    """Finds the double-asterisk-surrounded substrings of the text and
    categorizes each one as a definition or a notation.

    **Parameters**

    - text - str

    **Returns**

    - list[tuple[int, int, bool]]
        - One tuple per span reported by `double_asterisk_indices`, in
        the order that function reports them. Each tuple is of the form
        `(start, end, is_notation)`, where `text[start:end]` is the
        double-asterisk surrounded string, including the double
        asterisks, and `is_notation` is `True` exactly when
        `notation_asterisk_indices` reports the same `(start, end)` pair.
    """
    all_double_asterisks = double_asterisk_indices(text)
    notations = notation_asterisk_indices(text)
    # A span counts as a notation only if its exact (start, end) pair was
    # also found by the notation search; everything else is a definition.
    return [(start, end, (start, end) in notations)
            for start, end in all_double_asterisks]

In the following example, the first two double-asterisk-surrounded-strings are definitions, and the third is a notation:

In [None]:
text = r'A **scheme** is a... the **structure sheaf** of a scheme $X$ is the sheaf **$\mathcal{O}_X$**.'
listy = defs_and_notats_separations(text)
assert not listy[0][2]
assert not listy[1][2]
assert listy[2][2]

## Numbers

In [None]:
#| export
def is_number(
        x: Union[float, int, complex, str]
        ) -> bool:
    """Return `True` if the input `x` represents a number.

    Numeric objects (`int`, `float`, `complex`) always count as numbers.
    A string counts as a number when it consists of an optional leading
    minus sign followed by digits with at most one decimal point and at
    least one digit, e.g. `'3.45'`, `'-2'`, `'.5'`, or `'3.'`. Anything
    else, including `None`, `''`, `'-'`, and `'.'`, is not a number.

    This function is different from Python's built-in `str.isnumeric`
    and `str.isdigit` methods, which return `True` only when all
    characters of a string are digits (so they reject signs and decimal
    points).
    """
    if isinstance(x, (float, int, complex)):
        return True
    if not isinstance(x, str):
        # Covers `None` as well as any other non-string object.
        return False
    unsigned = x[1:] if x.startswith('-') else x
    # Drop at most one decimal point; what remains must be digits only.
    # The point is dropped rather than replaced by a digit so that
    # digit-free strings such as '.' and '-.' are rejected.
    return unsigned.replace('.', '', 1).isdigit()

In [None]:
assert is_number("3.45")
assert is_number(1 + 5j)
assert is_number(5)
assert is_number(0.0)
assert not is_number("3.43.55")
assert not is_number("hie")
assert not is_number("[^1]")
assert not is_number(None)

## Accented characters

## Files and folders

#### File existence

In [None]:
#| export
def existing_path(
        path: PathLike,  # A file or directory path. Either absolute, or relative to `relative_to` when that is given.
        relative_to: Optional[PathLike] = None  # Absolute path of the directory that `path` is relative to. If `None`, then `path` must itself be an absolute path.
        ) -> Path: # `relative_to` adjoined with `path`, or just `path` when `relative_to` is `None`.
    """Returns `path`, adjoined to `relative_to` if that is given, as a
    `Path` after checking that it exists.

    **Raises**

    - `FileNotFoundError`
        - If `relative_to` is not `None` but does not exist, or if the
        resulting path does not exist.
    - `ValueError`
        - If `relative_to` is not `None` and yet not an absolute path, or
        if `relative_to` is `None` and yet `path` is not an absolute path.

    **Notes**
    - On Windows, a path that is not found as given is retried with the
    string `'\\\\?\\'` in front, which identifies very long paths; the
    returned path then carries that prefix.
    """
    if relative_to is None:
        # Without a base directory, `path` has to be absolute on its own.
        if not os.path.isabs(path):
            raise ValueError(
                'The parameter `path` is expected to be an absolute path,'
                f' but it is not: {path}')
        candidate = path
    else:
        if not os.path.isabs(relative_to):
            raise ValueError(
                'The parameter `relative_to` is expected to be an'
                f' absolute path, but it is not: {relative_to}')
        if not os.path.exists(relative_to):
            raise FileNotFoundError(
                errno.ENOENT, os.strerror(errno.ENOENT), relative_to)
        candidate = Path(relative_to) / path

    if not os.path.exists(candidate) and platform.system() == 'Windows':
        # Retry with the extended-length prefix for long file names.
        candidate = f'\\\\?\\{str(candidate)}'
    if os.path.exists(candidate):
        return Path(candidate)
    raise FileNotFoundError(
        errno.ENOENT, os.strerror(errno.ENOENT), candidate)


@deprecated(reason="The function has been renamed to `existing_path`")
def file_existence_test(
        path: PathLike,  # A file or directory path. Either absolute or relative to `relative_to`.
        relative_to: Optional[PathLike] = None  # Path to the directory that `path` is relative to.  If `None`, then `path` is an absolute path.
        ) -> Path: # The path formed by `relative_to` adjoined with `path`.
    """
    **Deprecated. Use `existing_path` instead.**

    Returns a path relative to a specified path as an absolute path
    that exists. This is a thin alias kept for backward compatibility:
    it forwards its arguments to `existing_path` and returns that
    function's result unchanged.

    **Raises**
    - `FileNotFoundError`
        - If `relative_to` is not `None` but does not exist, or if
        the resulting path does not exist.
    - `ValueError`
        - If `relative_to` is not `None` and yet not an absolute path, or
        if `relative_to` is `None` and yet `path` is not an absolute path.
    """
    # Delegate instead of duplicating the checks, so the two entry points
    # cannot drift apart (the old copy had a typo in its error message).
    return existing_path(path, relative_to)

In the following example, the `existing_path` method returns an existing absolute path $p_2 \backslash p_1$ which is equivalent to a specified path $p_1$ relative to an existing absolute path $p_2$. Note that all paths and `os` methods are mocked:

In [None]:
with (mock.patch('os.path.exists') as mock_path_exists,
      mock.patch('os.path.isabs') as mock_is_abs):
    mock_path_exists.return_value = True
    mock_is_abs.return_value = True
    path_1 = existing_path('mock_existing_relative_path', 'mock_existing_absolute_path')
    test_eq(Path('mock_existing_absolute_path') / 'mock_existing_relative_path', path_1)


If the desired path is very long in Windows, then the prefix `\\?\` may be prepended to the absolute path so that Python can actually find the path, cf. https://stackoverflow.com/questions/36219317/pathname-too-long-to-open:

In [None]:
# TODO provide an example
with (mock.patch('os.path.exists') as mock_path_exists,
      mock.patch('os.path.isabs') as mock_is_abs):
  print('hi') 

hi


If the parameter `relative_to`, which is supposed to be an absolute path, is not `None` and not absolute, then a `ValueError` is raised:

In [None]:
with (ExceptionExpected(ex=ValueError, regex='absolute path'),
      mock.patch('os.path.exists') as mock_path_exists,
      mock.patch('os.path.isabs') as mock_is_abs):
    mock_is_abs.return_value = False
    path = 'mock_relative_path_that_is_not_None'
    relative_to = 'mock_non_absolute_path'
    existing_path(path, relative_to)


with (ExceptionExpected(ex=ValueError, regex='absolute path'),
      mock.patch('os.path.exists') as mock_path_exists,
      mock.patch('os.path.isabs') as mock_is_abs):
    mock_is_abs.return_value = False
    # It does not matter what `path` is - as long as `relative_to` is not
    # `None` and not absolute, the ValueError is raised before `path` is
    # looked at. Pass `None` to demonstrate exactly that.
    path = None
    relative_to = 'mock_non_absolute_path'
    existing_path(path, relative_to)

In [None]:
# |hide 
# I am verifying that a mock method's return value is constant if it is not reset 
with (mock.patch('os.path.isabs') as mock_is_abs):
    mock_is_abs.return_value = False
    print(mock_is_abs())
    print(mock_is_abs())
    print(mock_is_abs())

False
False
False


If the parameter `relative_to` is `None` and the parameter `path` is not absolute, then a `ValueError` is raised:

In [None]:
with (ExceptionExpected(ex=ValueError, regex='absolute path'),
      mock.patch('os.path.isabs') as mock_is_abs):
    mock_is_abs.return_value = False
    relative_to = None
    path = 'mock_non_absolute_path'
    existing_path(path, relative_to)

If `relative_to` does not exist or if `path` does not exist, then a `FileNotFoundError` is raised:

In [None]:
# In this example, both `relative_to` and `path` are specified, and `relative_to`
# is a non-existent path.
with (ExceptionExpected(ex=FileNotFoundError),
      mock.patch('os.path.exists') as mock_path_exists,
      mock.patch('os.path.isabs') as mock_is_abs):
    relative_to = 'mock_non_existent_absolute_path'
    path = 'mock_some_relative_path'
    # The mocks compare by value (`==`) rather than by identity (`is`), so
    # they do not depend on `existing_path` handing over the very same
    # string object. The joined `Path` never equals the `relative_to` str.
    def relative_to_does_not_exist(path_to_check):
        return path_to_check != relative_to
    def relative_to_is_absolute_path(path_to_check):
        return path_to_check == relative_to

    mock_path_exists.side_effect = relative_to_does_not_exist
    mock_is_abs.side_effect = relative_to_is_absolute_path
    existing_path(path, relative_to)


# In this example, both `relative_to` and `path` are specified, and `path`
# is a non-existent path, whereas `relative_to` exists.
with (ExceptionExpected(ex=FileNotFoundError),
      mock.patch('os.path.exists') as mock_path_exists,
      mock.patch('os.path.isabs') as mock_is_abs):
    relative_to = 'mock_existent_absolute_path'
    path = 'mock_non_existent_relative_path'
    def only_relative_to_exists(path_to_check):
        # only `relative_to` exists; all other paths of interest do not exist.
        return path_to_check == relative_to
    def relative_to_is_absolute_path(path_to_check):
        return path_to_check == relative_to

    mock_path_exists.side_effect = only_relative_to_exists
    mock_is_abs.side_effect = relative_to_is_absolute_path
    existing_path(path, relative_to)


#### Paths without extensions

In [None]:
#| export
def path_name_no_ext(
        path: PathLike # The path of the file or directory. This may be absolute or relative to any directory.
        ) -> str: # The name of the file or directory without the extension.
    """Return the final component of a path with its last extension
    removed.

    Only the last extension is dropped, so `'a.7z.zip'` becomes `'a.7z'`.
    The file or directory does not have to exist.
    """
    stem, _extension = os.path.splitext(os.path.basename(path))
    return stem

Basic usage:

In [None]:
path = Path('hypothetical_directory')
test_eq(path_name_no_ext(path / 'hypothetical_subdirectory'),  'hypothetical_subdirectory')
test_eq(path_name_no_ext(path / 'hypotehtical_subdirectory' / 'hypothetical_file.md'),  'hypothetical_file')

The path does not have to exist.

In [None]:
test_eq(path_name_no_ext(path / 'this_folder_does_not_exist'), 'this_folder_does_not_exist')

On paths to files with "multiple extensions", the function returns the file name without the last extension only.

In [None]:
test_eq(path_name_no_ext('archived_file_somewhere.7z.zip.tar'),  'archived_file_somewhere.7z.zip')

In [None]:
#| export
def path_no_ext(
    path: PathLike # The path of the file or directory. This may be absolute or relative to any directory.
    ) -> str: # The path of the file or directory without the extension. If `path` is a path to a directory, then the output should be essentially the same as `path`.
    """Return the path, as a str, with its last extension removed.

    Unlike `path_name_no_ext`, the leading directories are kept.
    The file or directory does not have to exist.
    """
    root, _extension = os.path.splitext(str(path))
    return root

Basic usage - the path does not have to exist:

In [None]:
assert path_no_ext('C:\\hi') == 'C:\\hi'
assert path_no_ext('greetings\\file.txt') == 'greetings\\file'

#### Read text from file

In [None]:
#| export
def text_from_file(
        path: PathLike, # The absolute path of the file.
        encoding: str = 'utf8' # The encoding of the file to be read. Defaults to `'utf8'`.
        ) -> str: # The entire text from a file
    """Return the entire text from a file.

    Assumes that the file can be decoded with the specified `encoding`.
    """
    # The `with` statement closes the file when the block is left, even if
    # `read` raises, so no explicit `close()` call is needed.
    with open(path, 'r', encoding=encoding) as file:
        return file.read()

The `text_from_file` method is a quick method to extract the text from a file.

In [None]:
# TODO: examples/tests

In [None]:
#| export
def files_of_format_sorted(
        directory: PathLike, # The directory in which to find the files
        extension: str = 'txt' # Extension of the files to find, without the leading dot. Defaults to 'txt'.
        ) -> list[str]: # The paths matching `directory/*.extension`, sorted via `natsort`.
    """Return a list of path str of files in the directory (but not subdirectories)
    sorted via `natsort`.

    The directory name is matched literally: glob metacharacters in it
    (`*`, `?`, `[`) are escaped, so a folder such as `notes [draft]` is
    searched rather than being interpreted as a pattern.
    """
    # Only the directory part is escaped; `extension` is inserted as given.
    escaped_directory = Path(glob.escape(str(Path(directory))))
    return natsorted(glob.glob(str(escaped_directory / f'*.{extension}')))

In the following example, we mock a folder with numbered files. `files_of_format_sorted` returns them in the "natural" order.

In [None]:
with (mock.patch('glob.glob') as mock_glob):
    mock_directory = Path('some_directory')
    # `glob_results` is not sorted in "natural" order.
    glob_results = [mock_directory / f'{i}.txt' for i in range (10,0, -1)]
    mock_glob.return_value = glob_results

    # mock to make sure that natsorted was called.
    with mock.patch(__name__ + '.natsorted') as mock_natsorted:
      mock_files = files_of_format_sorted(mock_directory)
      mock_natsorted.assert_called_with(mock_glob.return_value)

    # Now print out that the files are sorted in "natural" order.
    mock_files = files_of_format_sorted(mock_directory)
    print(mock_files)
    test_shuffled(glob_results, mock_files)

[Path('some_directory/1.txt'), Path('some_directory/2.txt'), Path('some_directory/3.txt'), Path('some_directory/4.txt'), Path('some_directory/5.txt'), Path('some_directory/6.txt'), Path('some_directory/7.txt'), Path('some_directory/8.txt'), Path('some_directory/9.txt'), Path('some_directory/10.txt')]


## Date and time

In [None]:
#| export
def current_time_formatted_to_minutes(
        ) -> str:
    """Return the current UTC time, truncated to minutes.

    **Returns**

    - str
        - The current time in UTC, formatted as `YYYY-MM-DDTHH:MM`
        (ISO 8601 without seconds or a UTC offset suffix).
    """
    now_utc = datetime.datetime.now(tz=timezone.utc)
    return now_utc.strftime('%Y-%m-%dT%H:%M')

The `current_time_formatted_to_minutes` function is a convenience function that returns a string formatting current UTC time:

In [None]:
time_str = current_time_formatted_to_minutes()
print(time_str)

2024-03-24T14:35


Use the `datetime.datetime.fromisoformat` function to convert the formatted string back to a `datetime.datetime` object:

In [None]:
dt = datetime.datetime.fromisoformat(time_str)

## Topological sort

In [None]:
# TODO: reformat the parameter specifications and add examples.

It is sometimes convenient to sort items topologically.

In [None]:
#| export
def containing_string_priority(str1: str, str2: str) -> int:
    """Compares two strings by substring containment.

    Returns -1 if `str2` occurs inside `str1` (this includes the case
    where the two strings are equal), 1 if `str1` occurs inside `str2`,
    and 0 if neither string contains the other.

    TODO make the string containment criterion looser, e.g. finite Galois etale cover
    "contains" finite etale cover.

    **Parameters**
    - str1 - str
    - str2 - str
    """
    if str2 in str1:
        return -1
    return 1 if str1 in str2 else 0


def default_str_comparison(str1: str, str2: str) -> int:
    """Three-way comparison using Python's built-in string ordering.

    Returns 1 if `str1 > str2`, -1 if `str1 < str2`, and 0 otherwise.

    **Parameters**
    - str1 - str
    - str2 - str
    """
    # Each comparison is a bool (0 or 1), so the difference is 1, -1 or 0.
    return (str1 > str2) - (str1 < str2)


def natsort_comparison(str1: str, str2: str) -> int:
    """Three-way comparison using the ordering of `natsorted`.

    Returns 0 if the strings are equal, -1 if `natsorted` places `str1`
    first among the two strings, and 1 if it places `str2` first.

    **Parameters**
    - str1 - str
    - str2 - str
    """
    if str1 == str2:
        return 0
    # `natsorted` returns a new sorted list and leaves its argument
    # untouched, so the returned list (not the input) must be inspected.
    first, _second = natsorted([str1, str2])
    return -1 if first == str1 else 1

In [None]:
#| export
def graph_for_topological_sort(
        items_to_sort: Iterable[str],
        key_order: Callable[[str, str], int]) -> dict[str, set[str]]:
    """Builds the graph that records, for each item, which other items
    it is "greater" than according to `key_order`.

    **Parameters**
    - items_to_sort - Iterable[str]
        - Any iterable, including a one-shot iterator; it is consumed
        exactly once.
    - key_order: Callable[[str, str], int]
        - Comparing str1 against str2 results in a positive number if
        str1 is "greater" than str2 (i.e. str1 is of a later priority)

    **Returns**
    - dict[str, set[str]]
        - A dict whose keys are the elements `k` of `items_to_sort` and
        whose values are sets of elements `k2` of `items_to_sort` such that
        `key_order(k, k2) > 0`. Every element appears as a key, even if
        its set is empty (e.g. when there is only one element).
    """
    # Materialize first: the items are iterated several times below, which
    # would silently yield nothing for an already-consumed iterator.
    items = list(items_to_sort)
    # Seed every item up front so that an item with no distinct partner to
    # compare against is still present in the graph.
    graph = {item: set() for item in items}
    for key_1, key_2 in product(items, repeat=2):
        if key_1 != key_2 and key_order(key_1, key_2) > 0:
            graph[key_1].add(key_2)
    return graph

In [None]:
#| export
def dict_with_keys_topologically_sorted(
        dict_to_sort: dict[str],
        key_order: Callable[[str, str], int],
        reverse: bool = False) -> OrderedDict[str]:
    """Returns an OrderedDict with the items of `dict_to_sort`, whose
    keys are sorted topologically by the specified key order.

    **Parameters**
    - dict_to_sort - dict[str]
        - The dict whose keys need to be ordered.
    - key_order
        - The comparison function on the keys of `dict_to_sort`; a
        positive result for `key_order(k, k2)` places `k` after `k2`.
        This parameter is required; one possible choice is
        ``containing_string_priority``.
    - reverse - bool
        - If `True`, the topological order is reversed.
        Defaults to `False`

    **Returns**
    - OrderedDict[str]
    """
    # `graph_for_topological_sort` maps each key to the keys that must
    # come before it, which is the form `TopologicalSorter` expects.
    predecessors = graph_for_topological_sort(dict_to_sort, key_order)
    ordered_keys = list(TopologicalSorter(predecessors).static_order())
    if reverse:
        ordered_keys.reverse()
    sorted_dict = OrderedDict()
    for key in ordered_keys:
        sorted_dict[key] = dict_to_sort[key]
    return sorted_dict


In [None]:
# TODO: test

## Alphabet
With Greek letters typed in LaTeX, I sometimes need to interpret them like their English equivalents, e.g. `\alpha` is like `A`, etc.

In [None]:
#| export
# Maps each uppercase Latin letter to the label of its alphabet group.
ALPHABET_TO_ALPHABET_GROUP_DICT = {
    'A': 'A-E', 'B': 'A-E', 'C': 'A-E', 'D': 'A-E', 'E': 'A-E',
    'F': 'F-J', 'G': 'F-J', 'H': 'F-J', 'I': 'F-J', 'J': 'F-J',
    'K': 'K-O', 'L': 'K-O', 'M': 'K-O', 'N': 'K-O', 'O': 'K-O',
    'P': 'P-T', 'Q': 'P-T', 'R': 'P-T', 'S': 'P-T', 'T': 'P-T',
    'U': 'U-Z', 'V': 'U-Z', 'W': 'U-Z', 'X': 'U-Z', 'Y': 'U-Z', 'Z': 'U-Z'}
# Placeholder that is left empty in this cell.
ALPHABET_OR_GREEK_TO_ALPHABET_DICT = {}
def alphabet_to_alphabet_group(character) -> Optional[str]:
    """
    Returns the alphabet group

    In my vaults, I often alphabetize things and also group
    the alphabet as follows:
    - A-E
    - F-J
    - K-O
    - P-T
    - U-Z

    **Parameters**
    - character - str
        - A single letter; lowercase letters are treated like their
        uppercase counterparts.

    **Returns**
    - str or `None`
        - Returns `None` if `character` is not an alphabet, including
        when it is `None` or not a str at all.
    """
    if not isinstance(character, str):
        # E.g. `None`, for a character with no known Latin counterpart.
        return None
    return ALPHABET_TO_ALPHABET_GROUP_DICT.get(character.upper())

def alphabet_or_latex_command_to_alphabet(character):
    """Placeholder for returning the alphabet that the character
    "corresponds to".

    Not implemented yet: `character` is ignored and `None` is returned
    for every input.
    """
    # TODO: implement the lookup.
    return None

def alphabet_or_latex_command_to_alphabet_group(character):
    """Returns the alphabet group of the alphabet that the character
    "corresponds to", or `None` if no such alphabet is known.

    The character is first translated with
    `alphabet_or_latex_command_to_alphabet`, and the result is then
    passed to `alphabet_to_alphabet_group`.
    """
    letter = alphabet_or_latex_command_to_alphabet(character)
    if letter is None:
        # No corresponding alphabet, hence no group; do not pass `None` on,
        # since the group lookup expects a str.
        return None
    return alphabet_to_alphabet_group(letter)

In [None]:
dicty = {}
for c in ascii_uppercase:
    dicty[c] = alphabet_to_alphabet_group(c)
print(dicty)

{'A': 'A-E', 'B': 'A-E', 'C': 'A-E', 'D': 'A-E', 'E': 'A-E', 'F': 'F-J', 'G': 'F-J', 'H': 'F-J', 'I': 'F-J', 'J': 'F-J', 'K': 'K-O', 'L': 'K-O', 'M': 'K-O', 'N': 'K-O', 'O': 'K-O', 'P': 'P-T', 'Q': 'P-T', 'R': 'P-T', 'S': 'P-T', 'T': 'P-T', 'U': 'U-Z', 'V': 'U-Z', 'W': 'U-Z', 'X': 'U-Z', 'Y': 'U-Z', 'Z': 'U-Z'}


## Getting a path-valid string from a string containing latex

In [None]:
#| export
# Latin letters interleaved with LaTeX Greek-letter commands, in the order
# in which they are to be ranked. Every entry is a separate list element;
# LaTeX commands are written as raw strings so that the backslash is kept
# literally.
CHARACTER_ORDERING_LIST =\
    ['A', 'a', r'\Alpha', r'\alpha', 'B', 'b', r'\Beta', r'\beta', 'C', 'c', r'\Gamma',
     r'\gamma', 'D', 'd', r'\Delta', r'\delta', 'E', 'e', r'\Epsilon', r'\epsilon',
     'F', 'f', 'G', 'g', 'H', 'h', r'\Eta', r'\eta', 'I', 'i', r'\Iota', r'\iota',
     'J', 'j', 'K', 'k', r'\Kappa', r'\kappa', 'L', 'l', r'\Lambda', r'\lambda', 'M',
     'm', r'\Mu', r'\mu', 'N', 'n', r'\Nu', r'\nu', 'O', 'o', r'\Omicron', r'\omicron',
     'P', 'p', r'\Pi', r'\pi', r'\Phi', r'\phi', r'\Psi', r'\psi', 'Q', 'q', 'R', 'r',
     r'\Rho', r'\rho', 'S', 's', r'\Sigma', r'\sigma', 'T', 't', r'\Theta', r'\theta',
     r'\Tau', r'\tau', 'U', 'u', r'\Upsilon', r'\upsilon', 'V', 'v', 'W', 'w', r'\Omega', r'\omega',
     'X', 'x', r'\Chi', r'\chi', 'Y', 'y', 'Z', 'z', r'\Zeta', r'\zeta', '*', r'\bullet']
# NOTE(review): the last three entries are raw strings with a doubled
# backslash (two backslash characters each), unlike the entries before
# them - confirm whether that is intended before relying on them.
DECORATING_CHARACTERS =\
    [r'\tilde', r'\hat', r'\overline', r'\bar', r'\mathscr', r'\mathcal',
     r'\mathfrak', r'\\operatorname', r'\\text', r'\\bf']
NONEFFECTIVE_CHARACTERS =\
    ['^', '_', '{', '}', '(', ')', '[', ']']

In [None]:
#| export
# Substrings that `latex_to_path_accepted_string` deletes outright. They are
# matched as plain substrings and removed in list order, so a longer name
# must be listed before any name it contains (e.g. 'longrightarrow' before
# 'rightarrow' before 'right', and 'longmapsto' before 'mapsto').
# By the time this list is applied, '*' has already been replaced via
# `TO_SUBSTITUTE`, so the '*' entry here finds nothing to remove.
TO_REMOVE = [
    '.', '$', ':', '?', '!', '#', '%', '&',
    '<', '>', '*', '?', '"', '@', '`', '|',  
    'mathscr', 'mathbf', 'mathrm', 'mathfrak', 'mathcal', 'mathbb', 'operatorname',
    'boldsymbol', 'bf',
    'text', 'begin', 'end', 'equation' , 'aligned', 'array', 'pmatrix', 'bmatrix',
    'quad', 'longrightarrow', 'rightarrow', 'left', 'right', 'longmapsto', 'mapsto',
    'stackrel']
# Characters that `latex_to_path_accepted_string` replaces with '_'. This
# list is applied first, so backslashes are already underscores when the
# other two collections are applied.
TO_UNDERSCORE = [' ', '-', '^', '(', ',', '/', '{', '}', '[', ']', '(', ')', '\\', '=',]
# Substrings that `latex_to_path_accepted_string` replaces with a word;
# applied after `TO_UNDERSCORE` and before `TO_REMOVE`.
TO_SUBSTITUTE = {
    '*': 'star',
    '+': 'plus',
    'leqslant': 'leq',
    'geqslant': 'geq',
    '\'': '_prime'
}

# TODO: make a universal latex to path string; it seems that latex.convert
# might do something different when naming files.

def latex_to_path_accepted_string(latex: str) -> str:
    """Convert a latex string to a path accepted string

    The conversion is purely textual and runs in this order:

    1. every entry of `TO_UNDERSCORE` is replaced with `'_'`;
    2. every key of `TO_SUBSTITUTE` is replaced with its value;
    3. every entry of `TO_REMOVE` is deleted;
    4. runs of underscores are collapsed into one, and one underscore
       at the start and at the end is dropped;
    5. the result is passed through `sanitize_filename`.
    """
    for character in TO_UNDERSCORE:
        latex = latex.replace(character, '_')
    for target, replacement in TO_SUBSTITUTE.items():
        latex = latex.replace(target, replacement)
    for fragment in TO_REMOVE:
        latex = latex.replace(fragment, '')
    # Tidy up the underscores that the steps above leave behind.
    for pattern, replacement in (('_+', '_'), ('^_', ''), ('_$', '')):
        latex = re.sub(pattern, replacement, latex)
    return sanitize_filename(latex)

In [None]:
#| hide
# Each sample is converted, printed, and checked to be a valid file name.
samples = [
    r'{ }^* \mathscr{R}_{\Lambda}',
    r'\\left(d_1, d_2\\right)',
    r'\left(\Delta_q(n)\right)_0 \leqslant n \leqslant q-1',
    r"A'",
    r"A = \prod",
    r"\begin{aligned}  & F_p: \quad \mathbf{G} \longrightarrow \mathbf{G} \\  & \left(\begin{array}{ll}  a & b \\  c & d  \end{array}\right) \longmapsto\left(\begin{array}{ll}  a^p & b^p \\  c^p & d^p  \end{array}\right) \\  &  \end{aligned}",
    r"\boldsymbol{\Delta}_{m, n}=[\Delta(n): L(m)]",
    r"$\mathbb{F}_q^{+} \stackrel{\chi+}{\longrightarrow} \mathscr{O}^{\times} \longrightarrow k^{\times}$",
    r"$\sqrt{\alpha_0(-1) q}=\sum_{z \in \mathbb{F}_q^{\times}} \alpha_0(z) \chi_{+}(z)$",
    r"$$\begin{equation} \label{escape rate} 	G_{F_t}(z,w) = \lim_{n\to\infty} \frac{1}{d^n} \log \| F_t^n(z,w) \|, \end{equation}$$",
]
for sample in samples:
    output = latex_to_path_accepted_string(sample)
    print(output)
    validate_filename(output)

star_R_Lambda
d_1_d_2
Delta_q_n_0_leq_n_leq_q_1
A_prime
A_prod
F_p_G_G_ll_a_b_c_d_ll_a_p_b_p_c_p_d_p
Delta_m_n_Delta_n_L_m
F_q_plus_chiplus_O_times_k_times
sqrt_alpha_0_1_q_sum_z_in_F_q_times_alpha_0_z_chi_plus_z
label_escape_rate_G_F_t_z_w_lim_n_to_infty_frac_1_d_n_log_F_t_n_z_w


The `latex_to_path_accepted_string` function "cleans" a latex str into a path-valid string for the purposes of making files.

In [None]:
sample_1 = r'\mathcal{O}_X'
output_1 = latex_to_path_accepted_string(sample_1)
print(output_1)
assert 'O' in output_1 and 'X' in output_1
validate_filename(output_1)

sample_2 = r'\operatorname{Gal}(L/K)'
output_2 = latex_to_path_accepted_string(sample_2)
print(output_2)
assert 'Gal' in output_2 and 'L' in output_2 and 'K' in output_2
validate_filename(output_2)

# Example found in https://arxiv.org/abs/1607.04471
sample_3 = r'\begin{equation} \label{escape rate} 	G_{F_t}(z,w) = \lim_{n\to\infty} \frac{1}{d^n} \log \| F_t^n(z,w) \|, \end{equation}'
output_3 = latex_to_path_accepted_string(sample_3)
print(output_3)
validate_filename(output_3)


O_X
Gal_L_K
label_escape_rate_G_F_t_z_w_lim_n_to_infty_frac_1_d_n_log_F_t_n_z_w
