In [None]:
#| default_exp helper.html

# helper.html
> Helper functions for dealing with HTML tags

In [None]:
#| export
import bs4
from bs4 import BeautifulSoup
import random
import re
from typing import Optional, Union


from trouver.helper.regex import latex_indices, find_regex_in_text, replace_string_by_indices


In [None]:
from fastcore.test import *

#### Consolidating special characters that are changed with the `__str__` function of `bs4.element.Tag` objects

In [None]:
#| export
def html_tag_str(
        html_tag: bs4.element.Tag
        ) -> str:
    """
    Render `html_tag` as a string in which the entities `&lt;`, `&gt;`,
    and `&amp;` are turned back into `<`, `>`, and `&`.

    `str(html_tag)` escapes these special characters; this function
    undoes that escaping on the whole rendered string.
    """
    rendered = str(html_tag)
    # `&amp;` is decoded last so that an escaped literal such as `&amp;lt;`
    # ends up as `&lt;` rather than being decoded a second time into `<`.
    for entity, literal in (('&lt;', '<'), ('&gt;', '>'), ('&amp;', '&')):
        rendered = rendered.replace(entity, literal)
    return rendered

When using the `__str__` function of `bs4.element.Tag` objects, special characters such as `<`, `>`, and `&` are rendered as `&lt;`, `&gt;`, and `&amp;`, respectively. The `html_tag_str` function changes these back to the original characters.

In [None]:
soup = BeautifulSoup('', 'html.parser')
tag = soup.new_tag('span')
tag.string = '&hi<'
test_eq(html_tag_str(tag), '<span>&hi<</span>')

#### Handling less than `<` symbols in latex math mode strings

`BeautifulSoup`'s `html.parser` parses less than `<` symbols without a following space as the beginning of an HTML tag, even when the symbol `<` is used within a LaTeX math mode string. To get around this, we detect when this happens and add a space after these symbols.

In [None]:
#| export
def find_lt_symbols_without_space_in_math_mode(
        text: str
        ) -> list[int]: # The indices in `text` of the math mode `<` symbols that are not followed by a space.
    """
    Locate every less than `<` symbol that lies inside a LaTeX math mode
    string of `text` and is not immediately followed by a space.
    """
    lt_without_space = re.compile(r'<(?! )')
    found_inds = []
    for math_start, math_end in latex_indices(text):
        math_str = text[math_start:math_end]
        # The search runs on `math_str` alone, so each match start is
        # shifted by `math_start` to become an index into `text`.
        for match_start, _ in find_regex_in_text(math_str, lt_without_space):
            found_inds.append(math_start + match_start)
    return found_inds


In the following example, there are a few math mode strings with less than `<` symbols. Some of these symbols are followed by spaces and others are not.

In [None]:
text = r"""
here is a math mode $a<b$. Here is another $a< b$.
Here is an in-line one:

$$ asdf <cbba$$

Here is another:

$$
asdf < basdf
$$
"""
output = find_lt_symbols_without_space_in_math_mode(text)
print(output)
test_eq(len(output), 2)
test_eq(text[output[0] + 1], 'b')
test_eq(text[output[1] + 1], 'c')

[23, 85]


In [None]:
text_2 = r"""
<b>Now there is an HTML tag</b>. But it shouldn't be detected
because the tag is not within math mode text.
But this inequality is: $a <d$
"""
output = find_lt_symbols_without_space_in_math_mode(text_2)
print(output)
test_eq(len(output), 1)
test_eq(text_2[output[0] + 1], 'd')

[136]


In [None]:
#| export
def add_space_to_lt_symbols_without_space(
        text: str
        ) -> str:
    """Insert a space after each math mode less than `<` symbol in
    `text` that is not already followed by a space.

    The symbols to fix are those reported by
    `find_lt_symbols_without_space_in_math_mode`.
    """
    lt_inds = find_lt_symbols_without_space_in_math_mode(text)
    # Each offending `<` is a one-character range that is swapped for `< `.
    ranges_to_replace = []
    replacements = []
    for lt_ind in lt_inds:
        ranges_to_replace.append((lt_ind, lt_ind + 1))
        replacements.append('< ')
    return replace_string_by_indices(
        text,
        replace_ranges=ranges_to_replace,
        replace_with=replacements)
    

Let us again use `text` from the example for the `find_lt_symbols_without_space_in_math_mode` function:

In [None]:
print(add_space_to_lt_symbols_without_space(text))
assert not find_lt_symbols_without_space_in_math_mode(add_space_to_lt_symbols_without_space(text))


here is a math mode $a< b$. Here is another $a< b$.
Here is an in-line one:

$$ asdf < cbba$$

Here is another:

$$
asdf < basdf
$$



#### Removing HTML tags in a text and obtaining the data of the tags.

In [None]:
markup = '<b>Hello</b>'
soup = BeautifulSoup(markup, 'html.parser')
tag = soup.b
new_str = soup.new_string(' World')
tag.append(new_str)

In [None]:
new_str

' World'

In [None]:
#| export
def remove_html_tags_in_text(
        text: str, # The text in which to remove the HTML tags.
        replace_with_attributes: Optional[Union[str, list[str]]] = None, # Attribute(s) within the HTML tags which should be used to replace the text of the tags. If `None`, then the texts are not replaced with the attributes. If multiple attributes are specified, then only one attribute is used to replace the text for each HTML tag (independently at random of other replacements). Each attribute's text has an equal chance of being selected for replacement. Repeats are ignored.
        definitely_replace: bool = False, # If `True` and if a given HTML tag has an attribute specified in `replace_with_attributes`, then the text for that tag will definitely be replaced by the text of one of the attributes. Otherwise, the original text and each attribute's text have an equal chance of being selected.
        seed: Optional[int] = None # Random seed, passed to `random.seed`.
        ) -> tuple[str, list[tuple[bs4.element.Tag, int, int]]]: # The text `removed` without HTML tags and a list whose elements consist of the removed HTML tags and the starting and ending indices of the text corresponding to the removed tags within `removed`.
    """Remove the HTML tags in `text`.

    Each top-level tag found by `html.parser` is replaced by plain text:
    either the tag's own text or the value of one of the attributes named
    in `replace_with_attributes`.

    HTML tags are assumed to be not nested.

    Note that every call reseeds the global `random` module with `seed`.
    """
    random.seed(seed)
    parsed_soup = BeautifulSoup(text, 'html.parser')
    replace_with_attributes = _init_replace_with_attributes(
        replace_with_attributes)

    position = 0
    replaced_contents = []
    # Walk a snapshot of the top-level nodes: `_process_content` swaps tags
    # out of `parsed_soup.contents` while this loop is running.
    for content in list(parsed_soup.contents):
        position = _process_content(
            parsed_soup, replace_with_attributes, definitely_replace, content,
            position, replaced_contents)
    # `html_tag_str` undoes the `&lt;`/`&gt;`/`&amp;` escaping of `str(...)`.
    text_to_return = html_tag_str(parsed_soup)
    return text_to_return, replaced_contents


def _init_replace_with_attributes(
        replace_with_attributes: Optional[Union[str, list[str]]]
        ) -> set[str]:
    """Normalize `replace_with_attributes` into a set of attribute names.

    `None` gives an empty set, a single string gives a one-element set,
    and a list gives the set of its elements (repeats are dropped).
    """
    if replace_with_attributes is None:
        return set()
    if isinstance(replace_with_attributes, str):
        # Wrap the name first: `set('typo')` would split it into characters.
        return {replace_with_attributes}
    return set(replace_with_attributes)


def _select_replacement_text(
        content: bs4.element.Tag,
        replace_with_attributes: set[str],
        definitely_replace: bool) -> str:
    """Choose the text that should stand in for the tag `content`.

    The candidates are the values of the attributes of `content` named in
    `replace_with_attributes`, plus the tag's own text unless
    `definitely_replace` is `True`. One candidate is picked uniformly at
    random with the global `random` module.

    The tag's own text is returned when `replace_with_attributes` is
    empty, and also when `definitely_replace` is `True` but the tag has
    none of the requested attributes.
    """
    original_text = content.string
    if original_text is None:
        # `.string` is `None` for empty tags and for tags with several
        # children; fall back to the concatenated text of the tag.
        original_text = content.get_text()
    if not replace_with_attributes:
        return original_text

    candidates = [] if definitely_replace else [original_text]
    for attribute, value in content.attrs.items():
        if attribute not in replace_with_attributes:
            continue
        if isinstance(value, list):
            # bs4 presents multi-valued attributes such as `class` as lists.
            value = ' '.join(value)
        candidates.append(value)

    if not candidates:
        # Nothing to replace with: keep the original text instead of
        # letting `random.choice` raise on an empty sequence.
        return original_text
    return random.choice(candidates)


def _process_content(
        parsed_soup: BeautifulSoup,
        replace_with_attributes: set[str],
        definitely_replace: bool,
        content,
        position: int,
        replaced_contents: list) -> int:
    """Handle one node of `parsed_soup` and return the position just past it.

    A node that is not a `bs4.element.Tag` is left in place and only
    advances `position` by its length. A tag is swapped in the tree for a
    plain string chosen by `_select_replacement_text`, and the tuple
    `(removed_tag, start, end)` is appended to `replaced_contents`.
    """
    if isinstance(content, bs4.element.Tag):
        new_text = _select_replacement_text(
            content, replace_with_attributes, definitely_replace)
        # The value returned by `replace_with` is what gets recorded as
        # the removed tag.
        removed_tag = content.replace_with(parsed_soup.new_string(new_text))
        end_position = position + len(new_text)
        replaced_contents.append((removed_tag, position, end_position))
        return end_position
    return position + len(content)
    

The `remove_html_tags_in_text` function removes HTML tags, preserving the underlying text by default.

In [None]:
html = 'Let $K$ be a field. An <b definition="Abelian variety over a field">Abelian variety over $K$</b> is a variety that'
text_without_html_tags, removed_tags = remove_html_tags_in_text(html)
print(text_without_html_tags)

test_eq(text_without_html_tags, 'Let $K$ be a field. An Abelian variety over $K$ is a variety that')

Let $K$ be a field. An Abelian variety over $K$ is a variety that


In [None]:
removed_tags[0][0].attrs

{'definition': 'Abelian variety over a field'}

In the following example, there is a less than `<` symbol, which is definitely not the opening of an HTML tag. The following verifies that the placeholder `&lt;` is not used to replace the less than symbol, which is what `bs4.BeautifulSoup`'s `html.parser` does.

In [None]:
text = 'Hello, this has a less than symbol: $a< b$'
text, html_tags = remove_html_tags_in_text(text)
assert not html_tags
assert '< ' in text
# Inspect the returned text (not the empty tag list) for the escaped form.
assert '&lt;' not in text


The same applies to the greater than `>` symbol and to the `&` symbol.

In [None]:
text = 'Hello, this has a greater than symbol: $a>b$'
text, html_tags = remove_html_tags_in_text(text)
assert not html_tags
assert '>' in text
# Inspect the returned text (not the empty tag list) for the escaped form.
assert '&gt;' not in text

text = r'Hello $$ f &= 3 \\ g &= 5'
text, html_tags = remove_html_tags_in_text(text)
assert not html_tags
assert '&' in text
assert '&amp;' not in text



The `remove_html_tags_in_text` function additionally returns a list with information about the tags that are removed. Each item in this list is a tuple `(tag, start, end)`, where `tag` is the tag that has been removed, and `start` and `end` are the indices within the string output `text_without_html_tags` of the function at which the text replacing the tag can be found.

In the example above (continued below), there is exactly one tag that is removed.

In [None]:
print(removed_tags)
removed_tag, start, end = removed_tags[0]
print(text_without_html_tags[start:end])

test_eq(text_without_html_tags[start:end], 'Abelian variety over $K$')

[(<b definition="Abelian variety over a field">Abelian variety over $K$</b>, 23, 47)]
Abelian variety over $K$


In [None]:
#| hide
html = '<span>Santa</span> want to eat some popcorn for once. He does not want <div>milk and cookies today</div>. \n And <a href="">this is a link.</a>'
text_without_html_tags, removed_tags = remove_html_tags_in_text(html)

test_eq(text_without_html_tags, 'Santa want to eat some popcorn for once. He does not want milk and cookies today. \n And this is a link.')
test_eq(len(removed_tags), 3)

removed_tag, start, end = removed_tags[0]
test_eq(text_without_html_tags[start:end], 'Santa')

removed_tag, start, end = removed_tags[1]
test_eq(text_without_html_tags[start:end], 'milk and cookies today')

removed_tag, start, end = removed_tags[2]
test_eq(text_without_html_tags[start:end], 'this is a link.')

The `remove_html_tags_in_text` function can be used to replace the underlying text of HTML tags with specified attribute values.

In the below example, the text has a tag which contains a `typo` attribute. Passing `'typo'` to the `replace_with_attributes` parameter and passing `True` to the `definitely_replace` parameter guarantees that the value of the `typo` attribute is used instead of the text of the tag.

In [None]:
html = r'The following tag fixes a typo and simultaneously keeps around the data of that typo: <span typo="$\operatorname{Gul}(K)$">$\operatorname{Gal}(K)$</span>'
text_without_html_tags, removed_tags = remove_html_tags_in_text(html, replace_with_attributes='typo', definitely_replace=True)
print(text_without_html_tags)

test_eq(text_without_html_tags, 'The following tag fixes a typo and simultaneously keeps around the data of that typo: $\\operatorname{Gul}(K)$')

removed_tag, start, end = removed_tags[0]
test_eq(text_without_html_tags[start:end], '$\\operatorname{Gul}(K)$')


The following tag fixes a typo and simultaneously keeps around the data of that typo: $\operatorname{Gul}(K)$


If the `definitely_replace` parameter is `False` (which it is by default), then the original text might be preserved or it might be replaced.

In [None]:
html = r'<span typo="$\operatorname{Gul}(K)$">$\operatorname{Gal}(K)$</span>'
possible_outputs = [
    r'$\operatorname{Gal}(K)$',
    r'$\operatorname{Gul}(K)$'
]
output, _ = remove_html_tags_in_text(html, replace_with_attributes='typo', definitely_replace=False)
assert output in possible_outputs


#### Adding HTML tag data

On the other hand, we may also need to add HTML tag data to a text.

In [None]:
#| export
def add_HTML_tag_data_to_raw_text(
        text: str, # The text onto which to add HTML tags. This is assumed to contain no HTML tags.
        tags_and_locations: list[tuple[bs4.element.Tag, int, int]] # Each tuple consists of the tag object to add as well as the start and end indices of the range within `text` that the tag replaces. The ranges specified by the tuples are assumed to not overlap with one another.
        ) -> str: # The modification of `text` in which the tags are added at the specified locations; the ranges in `text` are replaced.
    """
    Replace the specified ranges of `text` with the string forms of the
    specified HTML tags.

    See the `add_HTML_tag_data_to_text` function for adding HTML
    tag data to text that may or may not already have HTML tags.
    """
    # The tuples may come in any order; arrange them by starting index.
    ordered = sorted(tags_and_locations, key=lambda entry: entry[1])
    replace_ranges = []
    replace_with = []
    for html_tag, start, end in ordered:
        replace_ranges.append((start, end))
        replace_with.append(html_tag_str(html_tag))
    return replace_string_by_indices(text, replace_ranges, replace_with)

In [None]:
text = "Now this will have an HTML tag. This will also have an HTML tag too!"
tags_and_locations = [
    (BeautifulSoup('<span some_attr="hi">this</span>', 'html.parser'), 4,8),
    (BeautifulSoup('<div some_attr="hi">This</div>', 'html.parser'), 32,36)
]
output = add_HTML_tag_data_to_raw_text(text, tags_and_locations)
print(output)
test_eq(output, 'Now <span some_attr="hi">this</span> will have an HTML tag. <div some_attr="hi">This</div> will also have an HTML tag too!')

Now <span some_attr="hi">this</span> will have an HTML tag. <div some_attr="hi">This</div> will also have an HTML tag too!


Now let us look at the same example, with the order in `tags_and_locations` reversed.

In [None]:
text = "Now this will have an HTML tag. This will also have an HTML tag too!"
tags_and_locations = [
    (BeautifulSoup('<div some_attr="hi">This</div>', 'html.parser'), 32,36),
    (BeautifulSoup('<span some_attr="hi">this</span>', 'html.parser'), 4,8)
]
output = add_HTML_tag_data_to_raw_text(text, tags_and_locations)
print(output)
test_eq(output, 'Now <span some_attr="hi">this</span> will have an HTML tag. <div some_attr="hi">This</div> will also have an HTML tag too!')

Now <span some_attr="hi">this</span> will have an HTML tag. <div some_attr="hi">This</div> will also have an HTML tag too!
