# 03_html

> HTML modification: hyphenate the words in the text of a BeautifulSoup document

In [None]:
#| default_exp html

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from collections.abc import Callable, Set
from typing import Type
import bs4
import re

In [None]:
#| exporti
# Matches either a run of word characters or a run of non-word characters.
# Every character belongs to one of the two alternatives, so the pieces
# returned by findall() concatenate back to the original text.
word_or_punct_re = re.compile(r'\w+|[^\w]+')
# Matches a run of word characters; used with fullmatch() to tell word
# pieces apart from the non-word pieces in between.
word_re = re.compile(r'\w+')
# Default for hyphenate_soup's exclude_classes parameter: strings that are
# instances of any of these classes are left untouched.
default_exclude_classes = (
    bs4.element.PreformattedString,
    bs4.element.Stylesheet,
    bs4.element.Script,
    bs4.element.RubyTextString, # type: ignore
    bs4.element.RubyParenthesisString, # type: ignore
)

In [None]:
#| export
def hyphenate_soup(
    soup: bs4.BeautifulSoup,  # soup to be modified
    hyphenator: Callable[[str], str],  # hyphenator
    exclude_classes: tuple[Type[bs4.element.PageElement],...]=default_exclude_classes,  # do not modify inside these
) -> None:
    """Call hyphenator on words that appear in suitable elements of soup,
    and replace the contents of those elements. Suitable elements are those
    containing text whose class is not (a subclass of something) in
    exclude_classes.

    A word is a maximal run of `\\w` characters; everything between words
    (whitespace, punctuation) is copied through unchanged. The soup is
    modified in place and nothing is returned."""
    # find_all returns a list, so replacing nodes while looping is safe.
    for node in soup.find_all(string=True):
        if isinstance(node, exclude_classes):
            continue
        original = str(node)
        # Split into alternating word / non-word pieces and hyphenate only
        # the word pieces; joining the pieces restores the full text.
        hyphenated = ''.join(
            hyphenator(piece) if word_re.fullmatch(piece) else piece
            for piece in word_or_punct_re.findall(original)
        )
        # Leave the tree alone when the hyphenator changed nothing.
        if hyphenated != original:
            node.replace_with(hyphenated)


In [None]:
show_doc(hyphenate_soup)

---

[source](https://github.com/jkseppan/shyster/blob/main/shyster/html.py#L25){target="_blank" style="float:right; font-size:smaller"}

### hyphenate_soup

>      hyphenate_soup (soup:bs4.BeautifulSoup,
>                      hyphenator:collections.abc.Callable[[str],str], exclude_c
>                      lasses:tuple[typing.Type[bs4.element.PageElement],...]=(<
>                      class 'bs4.element.PreformattedString'>, <class
>                      'bs4.element.Stylesheet'>, <class 'bs4.element.Script'>,
>                      <class 'bs4.element.RubyTextString'>, <class
>                      'bs4.element.RubyParenthesisString'>))

Call hyphenator on words that appear in suitable elements of soup,
and replace the contents of those elements. Suitable elements are those
containing text whose class is not (a subclass of something) in
exclude_classes.

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| soup | BeautifulSoup |  | soup to be modified |
| hyphenator | Callable |  | hyphenator |
| exclude_classes | tuple | (<class 'bs4.element.PreformattedString'>, <class 'bs4.element.Stylesheet'>, <class 'bs4.element.Script'>, <class 'bs4.element.RubyTextString'>, <class 'bs4.element.RubyParenthesisString'>) | do not modify inside these |
| **Returns** | **None** |  |  |

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()