# 01_pattern

> Hyphenation patterns

In [None]:
#| default_exp pattern

In [None]:
#| hide
from nbdev.showdoc import *
from fastcore.test import *

In [None]:
#| export
import string, re
import itertools as it
from collections.abc import Iterable, Mapping
import datrie  # type: ignore

TeX patterns look like `2a1ly4`: letters interleaved with digits. Each digit is
the weight of the slot it occupies. Slots lie between adjacent letters, and also
before the first letter and after the last letter:

| | | | | | | |
|-|-|-|-|-|-|-|
| |a| |l| |y| |
|2| |1| |0| |4|

A slot with no digit has weight zero.

In [None]:
#| exporti
def _cvt(
    pattern: str  # pattern as read from the TeX patterns file
) -> tuple[int, ...]:  # position i has the weight of the slot before character i
    """Convert a TeX pattern into its slot weights.

    Every non-digit character counts as a letter. A pattern with n letters
    always yields n+1 weights; slots without a digit get weight 0.
    """
    # Size the result by the number of letters, not by len(pattern): sizing by
    # len(pattern) comes up one slot short when the pattern has no digits.
    n_letters = sum(ch not in string.digits for ch in pattern)
    res = [0] * (n_letters + 1)
    pos = 0  # index of the slot a digit seen now would belong to
    for ch in pattern:
        if ch in string.digits:
            res[pos] = int(ch)
        else:
            # A letter closes the current slot and opens the next one.
            pos += 1
    return tuple(res)

In [None]:
# Render the API documentation for `_cvt` as this cell's output.
show_doc(_cvt)

---

[source](https://github.com/jkseppan/shyster/blob/main/shyster/pattern.py#L13){target="_blank" style="float:right; font-size:smaller"}

### _cvt

>      _cvt (pattern:str)

|    | **Type** | **Details** |
| -- | -------- | ----------- |
| pattern | str | pattern as read from the TeX patterns file |
| **Returns** | **tuple** | **position i has the weight of the slot before character i** |

The following function turns many patterns into one trie.

In [None]:
#| export
def convert_patterns(
    patterns: Iterable[str]  # TeX style patterns
) -> datrie.Trie:  # trie mapping matched substrings to weights
    """Build one trie from many patterns, keyed by each pattern with its digits removed."""
    digit_re = re.compile('[0-9]')

    def strip_digits(text: str) -> str:
        return digit_re.sub('', text)

    # Insert in order of the digit-free text.
    ordered = list(patterns)
    ordered.sort(key=strip_digits)

    # Trie alphabet: every non-digit character seen in the patterns, plus the
    # '\x1f' character that replaces '.' in the stored keys.
    symbols = {ch for pat in ordered for ch in pat}
    symbols.difference_update(string.digits)
    symbols.add('\x1F')

    trie = datrie.Trie(symbols)
    for raw in ordered:
        marked = raw.replace('.', '\x1f')
        trie[strip_digits(marked)] = _cvt(marked)
    return trie

In [None]:
# Render the API documentation for `convert_patterns` as this cell's output.
show_doc(convert_patterns)

---

[source](https://github.com/jkseppan/shyster/blob/main/shyster/pattern.py#L26){target="_blank" style="float:right; font-size:smaller"}

### convert_patterns

>      convert_patterns (patterns:collections.abc.Iterable[str])

|    | **Type** | **Details** |
| -- | -------- | ----------- |
| patterns | Iterable | TeX style patterns |
| **Returns** | **Trie** | **trie mapping matched substrings to weights** |

In [None]:
# Sample patterns; with digits removed, only three of them spell a prefix of 'puter'.
sample_patterns = '''4m1p pu2t 5pute put3er
l1g4 lgo3 igo 2ith 4hm
hy3ph he2n hena4 hen5at ina n2at itio 2io'''.split()
t = convert_patterns(sample_patterns)
expected_prefixes = [
    ('put', (0, 0, 2, 0)),
    ('pute', (5, 0, 0, 0, 0)),
    ('puter', (0, 0, 0, 3, 0, 0)),
]
test_eq(t.prefix_items('puter'), expected_prefixes)

TeX exceptions are simply words with hyphens where hyphenation should happen.

In [None]:
#| export
def convert_exceptions(
    exceptions: Iterable[str]  # words with '-' at each hyphenation point
) -> Mapping[str, tuple[str,...]]:  # mapping from word to word parts
    """Map each word, hyphens removed, to the tuple of parts between its hyphens."""
    table: dict[str, tuple[str, ...]] = {}
    for entry in exceptions:
        pieces = tuple(entry.split('-'))
        # Joining the pieces gives the entry with every hyphen removed.
        table[''.join(pieces)] = pieces
    return table

In [None]:
# Render the API documentation for `convert_exceptions` as this cell's output.
show_doc(convert_exceptions)

---

[source](https://github.com/jkseppan/shyster/blob/main/shyster/pattern.py#L38){target="_blank" style="float:right; font-size:smaller"}

### convert_exceptions

>      convert_exceptions (exceptions:collections.abc.Iterable[str])

|    | **Type** | **Details** |
| -- | -------- | ----------- |
| exceptions | Iterable |  |
| **Returns** | **Mapping** | **mapping from word to word parts** |

In [None]:
# A hyphenated word is split at its hyphen; a word without hyphens maps to a single part.
expected_exceptions = {
    'saippuakauppias': ('saippua', 'kauppias'),
    'xyzzy': ('xyzzy',),
}
assert convert_exceptions(['saippua-kauppias', 'xyzzy']) == expected_exceptions

In [None]:
#| hide
# Run nbdev's export step (presumably writes the `#| export` cells to the module named by `default_exp`).
import nbdev; nbdev.nbdev_export()