# 02_hyphenation

> Hyphenator

In [None]:
#| default_exp hyphenation

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| exporti
import re
import itertools as it
from collections.abc import Sequence, Mapping, Iterable
from shyster.file import read_patterns
from shyster.pattern import convert_patterns, convert_exceptions
from pathlib import Path

First a simple function to add hyphens at given positions:

In [None]:
#| exporti
def add_hyphens(
    s: str,  # word to hyphenate
    positions: Sequence[int],  # positions to insert hyphens (increasing order)
    hyphen: str='-'  # hyphen character
) -> str:  # word with hyphens
    """Insert `hyphen` into `s` before each index listed in `positions`.

    `positions` is consumed exactly once, so a generator works as well as a
    sequence. Cuts that fall at the very start or end of the word (or past
    its end) yield empty edge pieces; those are dropped so the result never
    gains a leading or trailing hyphen. Characters of `s` itself are never
    removed, whatever `hyphen` is (including multi-character strings).
    """
    bounds = [0, *positions, len(s)]
    pieces = [s[lo:hi] for lo, hi in zip(bounds, bounds[1:])]
    # Trim empty pieces at both ends instead of calling str.strip(hyphen):
    # strip() treats its argument as a set of characters and would also eat
    # letters of the word when `hyphen` is something like '&shy;'.
    first, last = 0, len(pieces)
    while first < last and not pieces[first]:
        first += 1
    while last > first and not pieces[last - 1]:
        last -= 1
    return hyphen.join(pieces[first:last])

In [None]:
show_doc(add_hyphens)

---

[source](https://github.com/jkseppan/shyster/blob/main/shyster/hyphenation.py#L12){target="_blank" style="float:right; font-size:smaller"}

### add_hyphens

>      add_hyphens (s:str, positions:collections.abc.Sequence[int],
>                   hyphen:str='-')

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| s | str |  | word to hyphenate |
| positions | Sequence |  | positions to insert hyphens (increasing order) |
| hyphen | str | - | hyphen character |
| **Returns** | **str** |  | **word with hyphens** |

In [None]:
# Sanity checks: no cut positions, one, several, and the empty word.
assert add_hyphens('saippuakauppias', ()) == 'saippuakauppias'
assert add_hyphens('saippuakauppias', (7,)) == 'saippua-kauppias'
assert add_hyphens('saippuakauppias', (4, 7, 11)) == 'saip-pua-kaup-pias'
assert add_hyphens('', ()) == ''

The following function implements the Liang hyphenation algorithm,
given the patterns and exceptions. For each possible hyphenation slot,
we take the maximum of all weights given by the patterns, and if
the maximum is odd, we insert a hyphen. TeX has parameters called
`\lefthyphenmin` and `\righthyphenmin`, with default values 2 and 3
(respectively), meaning that hyphens with only one letter to their left
or only one or two to their right are forbidden. The default patterns
produce such hyphens, so we must filter them out as well.

In [None]:
#| export
class hyphenator:
    """Hyphenates words"""
    __slots__ = ('regex', 'mapping', 'exceptions', 'hyphen', 'lefthyphenmin', 'righthyphenmin')
    regex: re.Pattern  # first return value from `pattern.convert_patterns`
    mapping: Mapping[str, tuple[int,...]]  # second return value from `pattern.convert_patterns`
    exceptions: Mapping[str, str]  # return value from `pattern.convert_exceptions`
    hyphen: str  # hyphen character
    lefthyphenmin: int  # at least this many characters before the first hyphen
    righthyphenmin: int  # at least this many characters after the last hyphen

    def __init__(
        self,
        initializer: (str   # filename of hyphen.tex, or an iterable of its lines, or None
                      | Path
                      | Iterable[str]
                      | None),
        hyphen: str='-',
        lefthyphenmin: int=2,
        righthyphenmin: int=3,
    ):
        if initializer is None:
            # the user will set these up explicitly
            self.regex = re.compile('')
            self.mapping = {}
            self.exceptions = {}
        else:
            f = None
            if isinstance(initializer, (str, Path)):
                f = open(initializer, 'rt')
                it = f.readlines()
            elif isinstance(initializer, Iterable):
                it = initializer
            else:
                raise TypeError(f"don't know how to use {type(initializer)}")
            pat, exc = read_patterns(it)
            if f:
                f.close()
            self.regex, self.mapping = convert_patterns(pat)
            self.exceptions = convert_exceptions(exc)
            
        self.hyphen = hyphen
        self.lefthyphenmin = lefthyphenmin
        self.righthyphenmin = righthyphenmin
    
    def __call__(self, word: str):
        return self.hyphenate(word)
        
    def hyphenate(self, word: str) -> str:
        if (result := self.exceptions.get(word)):
            return result.replace('-', self.hyphen)
        word = f'\x1f{word}\x1f'
        weights = bytearray(len(word))
        for match in self.regex.finditer(word):
            pos = match.span()[0]-1
            key = match.group(1)
            rule = self.mapping[key]
            for i, w in enumerate(rule):
                weights[pos+i] = max(weights[pos+i], w)
        positions = (i for (i,w) in enumerate(weights)
                     if w&1==1 and i>=self.lefthyphenmin and i<=len(word)-2-self.righthyphenmin)
        return add_hyphens(word[1:-1], positions, hyphen=self.hyphen)


In [None]:
#| hide
import nbdev; nbdev.nbdev_export()