In [None]:
#| hide
%load_ext autoreload
%autoreload 2
from shyster.file import *
from shyster.pattern import *
from shyster.hyphenation import *
from shyster.html import *
from bs4 import BeautifulSoup

# shyster

> Add soft hyphens to HTML documents

The problem this package is
trying to solve is that while I can set `hyphens: auto;` in CSS,
many browsers do a poor job of hyphenating Finnish.  Even if they
have Finnish hyphenation patterns, they often fail to recognise
compound words, which should be hyphenated at compound boundaries
(saippua-kauppias, not saip-pua-kaup-pias).  One solution is to set
`hyphens: manual;` and add soft hyphens at acceptable hyphenation
spots.


## Install

```sh
pip install git+https://github.com/jkseppan/shyster.git
```

## How to use

In [None]:
# Read the Finnish TeX pattern file.  The context manager closes the file as
# soon as the lines are read, and the explicit encoding keeps letters such as
# "ä" and "ö" intact on platforms whose default encoding is not UTF-8.
# NOTE(review): assumes patterns/hyph-fi.tex is UTF-8 encoded -- confirm.
with open('patterns/hyph-fi.tex', encoding='utf-8') as pattern_file:
    pat, ex = read_patterns(pattern_file.readlines())

# Convert the raw patterns and exceptions into the form hyphenator() takes.
pat_re, pat_map = convert_patterns(pat)
ex = convert_exceptions(ex)
hyph = hyphenator(pat_re, pat_map, ex, righthyphenmin=2)

# Hyphenate each word of the sample sentence (commas stripped first); the
# resulting list is the cell's displayed value.
[hyph(word) for word in
 'Jukolan talo, eteläisessä Hämeessä, seisoo erään mäen pohjaisella rinteellä, liki Toukolan kylää'\
 .replace(',','').split()]

['Ju-ko-lan',
 'ta-lo',
 'ete-läi-ses-sä',
 'Hä-mees-sä',
 'sei-soo',
 'erään',
 'mäen',
 'poh-jai-sel-la',
 'rin-teel-lä',
 'li-ki',
 'Tou-ko-lan',
 'ky-lää']

In [None]:
# Sample document: besides the paragraph it contains a <title>, a <script>
# element and a style attribute.
html = """
<!doctype html><title>Seitsemän veljestä</title>
<script>var veljekset = 7;</script>
<body>
<p style="margin-top: 2em">Jukolan talo, eteläisessä Hämeessä, seisoo erään mäen pohjaisella
rinteellä, liki Toukolan kylää. Sen läheisin ympäristö on kivinen
tanner, mutta alempana alkaa pellot, joissa, ennenkuin talo oli häviöön
mennyt, aaltoili teräinen vilja.</p>
</body>
"""

# Build the hyphenator from the patterns loaded earlier, then parse the markup.
hyph = hyphenator(pat_re, pat_map, ex, righthyphenmin=2)
soup = BeautifulSoup(html, 'lxml')

# The return value of hyphenate_soup is not used; the hyphenated result is
# read back from the same soup object when it is printed.
hyphenate_soup(soup, hyph)
print(soup)

<class 'bs4.element.NavigableString'> Seitsemän veljestä
<class 'bs4.element.NavigableString'> 

<class 'bs4.element.NavigableString'> 

<class 'bs4.element.NavigableString'> 

<class 'bs4.element.NavigableString'> Jukolan talo, eteläisessä Hämeessä, seisoo erään mäen pohjaisella
rinteellä, liki Toukolan kylää. Sen läheisin ympäristö on kivinen
tanner, mutta alempana alkaa pellot, joissa, ennenkuin talo oli häviöön
mennyt, aaltoili teräinen vilja.
<class 'bs4.element.NavigableString'> 

<class 'bs4.element.NavigableString'> 

<!DOCTYPE html>
<html><head><title>Seit-se-män vel-jes-tä</title>
<script>var veljekset = 7;</script>
</head><body>
<p style="margin-top: 2em">Ju-ko-lan ta-lo, ete-läi-ses-sä Hä-mees-sä, sei-soo erään mäen poh-jai-sel-la
rin-teel-lä, li-ki Tou-ko-lan ky-lää. Sen lä-hei-sin ym-pä-ris-tö on ki-vi-nen
tan-ner, mut-ta alem-pa-na al-kaa pel-lot, jois-sa, en-nen-kuin ta-lo oli hä-vi-öön
men-nyt, aal-toi-li te-räi-nen vil-ja.</p>
</body>
</html>


In [None]:
import textwrap

# Read the English TeX pattern file.  The context manager closes the file as
# soon as the lines are read instead of leaving the handle open.
# NOTE(review): assumes patterns/hyphen.tex is ASCII/UTF-8 encoded -- confirm.
with open('patterns/hyphen.tex', encoding='utf-8') as pattern_file:
    pat, ex = read_patterns(pattern_file.readlines())

# Convert the raw patterns and exceptions into the form hyphenator() takes.
# No righthyphenmin is passed here, so the hyphenator's own default applies.
pat_re, pat_map = convert_patterns(pat)
ex = convert_exceptions(ex)
hyph = hyphenator(pat_re, pat_map, ex)

# Strip commas and full stops, hyphenate word by word, re-join with spaces and
# let textwrap break the text into lines; the list of lines is displayed.
textwrap.wrap(' '.join(hyph(word) for word in '''
It is a truth universally acknowledged, that a single man in possession of a good fortune,
must be in want of a wife.
'''.replace(',','').replace('.','').split()))

['It is a truth univer-sally ac-knowl-edged that a single man in posses-',
 'sion of a good for-tune must be in want of a wife']

## Copying

This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>.

The files in `patterns/` are distributed with this program as example input files.
The Finnish patterns are covered by the terms "Patterns may be freely distributed"
and the English ones by "Unlimited copying and redistribution of this file are permitted as long
as this file is not modified."