# Compare different syllabification/hyphenation approaches

In [1]:
%load_ext dotenv
%dotenv
import os

# WORKING_DIR is read from the environment (the %dotenv magic above may populate it from a .env file).
# The notebook switches into that directory; later cells pass base_dir to the project data loaders.
base_dir = os.getenv("WORKING_DIR")
if not base_dir:
    # os.getenv returns None for a missing variable, and os.chdir(None) would fail with an opaque TypeError.
    raise RuntimeError("Environment variable WORKING_DIR is not set; define it (e.g. in .env) before running this notebook.")
os.chdir(base_dir)

In [2]:
import numpy as np

In [3]:
from src.data_loader_and_saver import JSONDataLoaderAndSaver

# Loader/saver configured with base_dir and the "src/data" input directory; used below to load the token datasets.
data_loader_and_saver = JSONDataLoaderAndSaver(base_dir, input_data_dir="src/data")

In [4]:
# Orthographic tokens of the training set; the sample displayed further down shows a poems -> lines -> tokens nesting.
tokens = data_loader_and_saver.load_data("train_tokens_one_metre_line_all_metres_recognized")

train_tokens_one_metre_line_all_metres_recognized.json: loaded 41762 records.


In [5]:
# Show the record at index 1 to inspect the structure of the loaded tokens.
tokens[1]

[['tak', 'z', 'podkroví', 'se', 'všichni', 'vytratili'],
 ['v', 'té', 'doměnce', 'že', 'přece', 'usíná'],
 ['jsou', 'v', 'polích', 'daleko', 'kde', 'hoří', 'úpal', 'bílý'],
 ['a', 'plahočí', 'se', 'známá', 'krajina'],
 ['tam', 'rvou', 'se', 'se', 'zemí', 'a', 'temné', 'chleby', 'jedí'],
 ['tam', 'rvou', 'se', 'po', 'polích', 'a', 'tady', 'šalvěj', 'voní'],
 ['zde', 'černé', 'hodiny', 'jen', 'tik', 'a', 'tak', 'a', 'více', 'nepovědí'],
 ['a', 'po', 'zdi', 'vápenné', 'se', 'řídké', 'mouchy', 'honí'],
 ['a', 'strop', 'ten', 'rayon', 'známý', 'zoufalý'],
 ['jenž', 'oči', 'vysílí', 'a', 'tupá', 'kamna', 'rzivá'],
 ['a', 'okno', 'nesmělé', 'jež', 'zamlklé', 'nechali'],
 ['jímž', 'celé', 'měsíce', 'se', 'nikdo', 'nepodívá'],
 ['než', 'vrátí', 'se', 'snad', 'šalvěj', 'vyvane'],
 ['stín', 'trámů', 'večerní', 'boj', 'muší', 'ukryje'],
 ['on', 'v', 'okno', 'nesmělé', 'se', 'dívat', 'přestane'],
 ['a', 'po', 'klekání', 'teprve', 'se', 'pohnou', 'veřeje']]

In [6]:
# X-SAMPA transcriptions of the tokens; these are later syllabified to obtain the reference syllable counts.
sampa_tokens = data_loader_and_saver.load_data("train_sampa_tokens_one_metre_line_all_metres_recognized")
# Show the record at index 1 for comparison with tokens[1].
sampa_tokens[1]

train_sampa_tokens_one_metre_line_all_metres_recognized.json: loaded 41762 records.


[['tak', 's', 'potkrovi:', 'sE', 'fSIxJI', 'vItracIlI'],
 ['f', 'tE:', 'domJEnt_sE', 'ZE', 'pP\\Et_sE', 'usi:na:'],
 ['jso_u', 'f', 'poli:x', 'dalEko', 'gdE', 'h\\oP\\i:', 'u:pal', 'bi:li:'],
 ['a', 'plah\\ot_Si:', 'sE', 'zna:ma:', 'krajIna'],
 ['tam', 'rvo_u', 'sE', 'sE', 'zEmi:', 'a', 'tEmnE:', 'xlEbI', 'jEJ\\i:'],
 ['tam', 'rvo_u', 'sE', 'po', 'poli:x', 'a', 'tadI', 'SalvjEj', 'voJi:'],
 ['zdE',
  't_SErnE:',
  'h\\oJ\\InI',
  'jEn',
  'cIk',
  'a',
  'tak',
  'a',
  'vi:t_sE',
  'nEpovjEJ\\i:'],
 ['a', 'po', 'zJ\\I', 'va:pEnE:', 'sE', 'P\\i:tkE:', 'mo_uxI', 'h\\oJi:'],
 ['a', 'strop', 'tEn', 'rajon', 'zna:mi:', 'zo_ufali:'],
 ['jEnS', 'ot_SI', 'vIsi:li:', 'a', 'tupa:', 'kamna', 'rzIva:'],
 ['a', 'okno', 'nEsmJElE:', 'jES', 'zaml=klE:', 'nExalI'],
 ['ji:mS', 't_sElE:', 'mJEsi:t_sE', 'sE', 'JIgdo', 'nEpoJ\\i:va:'],
 ['nES', 'vra:ci:', 'sE', 'snat', 'SalvjEj', 'vIvanE'],
 ['sci:n', 'tra:mu:', 'vEt_SErJi:', 'boj', 'muSi:', 'ukrIjE'],
 ['on', 'f', 'okno', 'nEsmJElE:', 'sE', 'J\\i:vat', 

In [7]:
from typing import Callable


def get_syllable_cnts(tokens: list, syllabification_func: Callable) -> list:
    """
    Count the syllables of every token using the supplied syllabification function.

    The input is iterated as poems -> lines -> tokens. The poem/line nesting is kept and
    every line is replaced by a NumPy array with one syllable count per token.
    :param tokens: Poems, each a list of lines, each a list of tokens to syllabify
    :param syllabification_func: Function splitting one token into a sized collection of syllables
    :return: For every poem a list with one NumPy array per line holding the per-token syllable counts
    """
    syllable_cnts = []
    for poem in tokens:
        poem_cnts = []
        for line in poem:
            # The length of the syllabification result is the syllable count of the token.
            token_cnts = [len(syllabification_func(token)) for token in line]
            poem_cnts.append(np.array(token_cnts))
        syllable_cnts.append(poem_cnts)
    return syllable_cnts

In [8]:
from src.kveta.sampa_syllable_parser import SampaSyllableParser

sampa_parser = SampaSyllableParser()

# Syllable counts obtained from the X-SAMPA transcriptions; the metric cells below use them as the reference.
ref_syllable_cnts = get_syllable_cnts(sampa_tokens, sampa_parser.split_to_syllables)
# Show the counts for the record at index 1.
ref_syllable_cnts[1]

[array([1, 1, 3, 1, 2, 4]),
 array([1, 1, 3, 1, 2, 3]),
 array([1, 1, 2, 3, 1, 2, 2, 2]),
 array([1, 3, 1, 2, 3]),
 array([1, 1, 1, 1, 2, 1, 2, 2, 2]),
 array([1, 1, 1, 1, 2, 1, 2, 2, 2]),
 array([1, 2, 3, 1, 1, 1, 1, 1, 2, 4]),
 array([1, 1, 1, 3, 1, 2, 2, 2]),
 array([1, 1, 1, 2, 2, 3]),
 array([1, 2, 3, 1, 2, 2, 2]),
 array([1, 2, 3, 1, 3, 3]),
 array([1, 2, 3, 1, 2, 4]),
 array([1, 2, 1, 1, 2, 3]),
 array([1, 2, 3, 1, 2, 3]),
 array([1, 1, 2, 3, 1, 2, 3]),
 array([1, 1, 3, 3, 1, 2, 3])]

In [9]:
from src.syllabification.pat_data_loader import PatDataLoader
from src.syllabification.czech_tex_hyphenator import CzechTexHyphenator

# Load the hyphenation patterns from csskhyphen.pat (looked up under src/syllabification/resources)
# and build the TeX-pattern hyphenator from them.
pat_data_loader = PatDataLoader(base_dir, "src/syllabification/resources")
patterns = pat_data_loader.load_data("csskhyphen.pat")
tex_hyphenator = CzechTexHyphenator(patterns)

# Count the pieces the raw TeX patterns produce for every orthographic token.
tex_syllable_cnts = get_syllable_cnts(tokens, tex_hyphenator.hyphenate_word)
# Show the counts for the record at index 1.
tex_syllable_cnts[1]

[array([1, 1, 3, 1, 2, 5]),
 array([1, 1, 3, 1, 2, 2]),
 array([1, 1, 2, 3, 1, 1, 1, 1]),
 array([1, 3, 1, 2, 3]),
 array([1, 1, 1, 1, 1, 1, 2, 2, 1]),
 array([1, 1, 1, 1, 2, 1, 1, 2, 1]),
 array([1, 2, 3, 1, 1, 1, 1, 1, 1, 4]),
 array([1, 1, 1, 3, 1, 2, 2, 1]),
 array([1, 1, 1, 1, 2, 3]),
 array([1, 1, 3, 1, 1, 2, 2]),
 array([1, 1, 3, 1, 3, 3]),
 array([1, 1, 3, 1, 3, 4]),
 array([1, 2, 1, 1, 2, 3]),
 array([1, 2, 3, 1, 1, 2]),
 array([1, 1, 1, 3, 1, 2, 4]),
 array([1, 1, 3, 3, 1, 2, 3])]

In [11]:
from src.syllabification.tex_heuristics import tex_heuristics

# Same TeX hyphenation as before, but each result is passed through tex_heuristics before its pieces are counted.
tex_heuristics_syllable_cnts = get_syllable_cnts(tokens, lambda x: tex_heuristics(tex_hyphenator.hyphenate_word(x)))
# Show the counts for the record at index 1.
tex_heuristics_syllable_cnts[1]

[array([1, 1, 3, 1, 2, 4]),
 array([1, 1, 3, 1, 2, 2]),
 array([1, 1, 2, 3, 1, 1, 1, 1]),
 array([1, 3, 1, 2, 3]),
 array([1, 1, 1, 1, 1, 1, 2, 2, 1]),
 array([1, 1, 1, 1, 2, 1, 1, 2, 1]),
 array([1, 2, 3, 1, 1, 1, 1, 1, 1, 4]),
 array([1, 1, 1, 3, 1, 2, 2, 1]),
 array([1, 1, 1, 1, 2, 3]),
 array([1, 1, 3, 1, 1, 2, 2]),
 array([1, 1, 3, 1, 3, 3]),
 array([1, 1, 3, 1, 2, 4]),
 array([1, 2, 1, 1, 2, 3]),
 array([1, 2, 3, 1, 1, 2]),
 array([1, 1, 1, 3, 1, 2, 3]),
 array([1, 1, 3, 3, 1, 2, 3])]

In [16]:
import pyphen

# Pyphen hyphenator using the "cs_CZ" dictionary.
pyphen_dict = pyphen.Pyphen(lang="cs_CZ")

# inserted() returns the word with "-" at the hyphenation points (e.g. "ko-ní-ček" in the output further down);
# splitting on "-" yields the pieces that are counted.
# NOTE(review): a token that itself contains "-" would also be split there - confirm tokens are hyphen-free.
pyphen_syllable_cnts = get_syllable_cnts(tokens, lambda x: pyphen_dict.inserted(x).split("-"))
# Show the counts for the record at index 1.
pyphen_syllable_cnts[1]

[array([1, 1, 3, 1, 2, 4]),
 array([1, 1, 3, 1, 2, 2]),
 array([1, 1, 2, 3, 1, 2, 1, 2]),
 array([1, 3, 1, 2, 3]),
 array([1, 1, 1, 1, 2, 1, 2, 2, 2]),
 array([1, 1, 1, 1, 2, 1, 2, 2, 2]),
 array([1, 2, 3, 1, 1, 1, 1, 1, 2, 4]),
 array([1, 1, 1, 3, 1, 2, 2, 2]),
 array([1, 1, 1, 1, 2, 3]),
 array([1, 1, 3, 1, 2, 2, 2]),
 array([1, 2, 3, 1, 3, 3]),
 array([1, 2, 3, 1, 2, 4]),
 array([1, 2, 1, 1, 2, 3]),
 array([1, 2, 3, 1, 2, 2]),
 array([1, 1, 2, 3, 1, 2, 3]),
 array([1, 1, 3, 3, 1, 2, 3])]

In [12]:
def get_avg_token_syll_cnt_diff(syllabification_approach: str, ref_syllable_cnts: list, pred_syllable_cnts: list) -> float:
    """
    Compute and print the mean absolute per-token difference between referential and predicted syllable counts.

    Poems and lines of both inputs are paired positionally; the absolute differences of all tokens
    are pooled together and averaged.
    :param syllabification_approach: Syllabification approach name (used only in the printed message)
    :param ref_syllable_cnts: Referential syllable counts (poems -> lines -> per-token counts)
    :param pred_syllable_cnts: Predicted syllable counts in the same layout
    :return: Average absolute difference in number of syllables for one token
    """
    abs_diffs = []
    for ref_poem, pred_poem in zip(ref_syllable_cnts, pred_syllable_cnts):
        for ref_line, pred_line in zip(ref_poem, pred_poem):
            # Element-wise |ref - pred| for the tokens of this line, pooled across all lines.
            abs_diffs.extend(np.abs(np.subtract(ref_line, pred_line)))
    mean_abs_diff = np.average(abs_diffs)

    print(f"For {syllabification_approach} the average difference in number of syllables for one token is: {mean_abs_diff:.2f}")

    return mean_abs_diff

In [13]:
# Average per-token syllable-count difference of the raw TeX patterns against the X-SAMPA reference.
get_avg_token_syll_cnt_diff("Czech TEX hyphenation patterns", ref_syllable_cnts, tex_syllable_cnts)

For Czech TEX hyphenation patterns the average difference in number of syllables for one token is: 0.21


0.21010996421526468

In [14]:
# Average per-token syllable-count difference of TeX patterns plus heuristics against the X-SAMPA reference.
get_avg_token_syll_cnt_diff("Czech TEX hyphenation patterns with heuristics", ref_syllable_cnts, tex_heuristics_syllable_cnts)

For Czech TEX hyphenation patterns with heuristics the average difference in number of syllables for one token is: 0.15


0.14877193439904024

In [17]:
# Average per-token syllable-count difference of Pyphen against the X-SAMPA reference.
get_avg_token_syll_cnt_diff("Pyphen", ref_syllable_cnts, pyphen_syllable_cnts)

For Pyphen the average difference in number of syllables for one token is: 0.06


0.0577840970166776

In [18]:
def get_avg_percentage_not_ok_tokens_in_line(syllabification_approach: str, ref_syllable_cnts: list, pred_syllable_cnts: list) -> float:
    """
    Compute and print the average share of tokens per line whose syllable count differs from the reference.

    For every line the number of mismatching tokens is divided by the line length; these per-line
    shares are then averaged over all lines. The returned value is a fraction; only the printed
    message scales it to percent.
    :param syllabification_approach: Syllabification approach name (used only in the printed message)
    :param ref_syllable_cnts: Referential syllable counts (poems -> lines -> array of per-token counts)
    :param pred_syllable_cnts: Predicted syllable counts in the same layout
    :return: Average fraction of incorrectly syllabified tokens in one line
    """
    mismatch_fractions = []
    for ref_poem, pred_poem in zip(ref_syllable_cnts, pred_syllable_cnts):
        for ref_line, pred_line in zip(ref_poem, pred_poem):
            # Element-wise comparison; summing the boolean result counts the mismatching tokens.
            mismatched = sum(ref_line != pred_line)
            mismatch_fractions.append(mismatched / len(ref_line))
    mean_fraction = np.average(mismatch_fractions)

    print(f"For {syllabification_approach} on average {mean_fraction * 100:.2f} % of tokens in line have incorrect number of syllables")

    return mean_fraction

In [19]:
# Average per-line share of tokens with a wrong syllable count for the raw TeX patterns.
get_avg_percentage_not_ok_tokens_in_line("Czech TEX hyphenation patterns", ref_syllable_cnts, tex_syllable_cnts)

For Czech TEX hyphenation patterns on average 21.63 % of tokens in line have incorrect number of syllables


0.2163037549080216

In [20]:
# Average per-line share of tokens with a wrong syllable count for TeX patterns plus heuristics.
get_avg_percentage_not_ok_tokens_in_line("Czech TEX hyphenation patterns with heuristics", ref_syllable_cnts, tex_heuristics_syllable_cnts)

For Czech TEX hyphenation patterns with heuristics on average 15.31 % of tokens in line have incorrect number of syllables


0.1530683181373172

In [21]:
# Average per-line share of tokens with a wrong syllable count for Pyphen.
get_avg_percentage_not_ok_tokens_in_line("Pyphen", ref_syllable_cnts, pyphen_syllable_cnts)

For Pyphen on average 6.18 % of tokens in line have incorrect number of syllables


0.06178628648536689

In [22]:
# Print, for every token of the first poem, the output of each syllabification approach side by side.
for sampa_poem, poem in zip(sampa_tokens[:1], tokens[:1]):
    for sampa_line, line in zip(sampa_poem, poem):
        for sampa_token, token in zip(sampa_line, line):
            sampa_syllables = sampa_parser.split_to_syllables(sampa_token)
            tex_parts = tex_hyphenator.hyphenate_word(token)
            # The heuristics get their own fresh hyphenation result, as in the raw-pattern column.
            tex_heuristic_parts = tex_heuristics(tex_hyphenator.hyphenate_word(token))
            pyphen_split = pyphen_dict.inserted(token)
            report = f"Token: {token}, X-SAMPA syllables: {sampa_syllables}, TeX patterns: {tex_parts}, TeX heuristics: {tex_heuristic_parts} Pyphen: {pyphen_split}"
            print(report)

Token: můj, X-SAMPA syllables: ['mu:j'], TeX patterns: ['můj'], TeX heuristics: ['můj'] Pyphen: můj
Token: koníček, X-SAMPA syllables: ['ko', 'Ji:', 't_SEk'], TeX patterns: ['ko', 'ní', 'ček'], TeX heuristics: ['ko', 'ní', 'ček'] Pyphen: ko-ní-ček
Token: vraný, X-SAMPA syllables: ['vra', 'ni:'], TeX patterns: ['vra', 'ný'], TeX heuristics: ['vra', 'ný'] Pyphen: vra-ný
Token: jako, X-SAMPA syllables: ['ja', 'ko'], TeX patterns: ['jako'], TeX heuristics: ['jako'] Pyphen: ja-ko
Token: malovaný, X-SAMPA syllables: ['ma', 'lo', 'va', 'ni:'], TeX patterns: ['ma', 'lo', 'va', 'ný'], TeX heuristics: ['ma', 'lo', 'va', 'ný'] Pyphen: ma-lo-va-ný
Token: hopsa, X-SAMPA syllables: ['h\\o', 'psa'], TeX patterns: ['ho', 'p', 'sa'], TeX heuristics: ['hop', 'sa'] Pyphen: hopsa
Token: hejsa, X-SAMPA syllables: ['h\\E', 'jsa'], TeX patterns: ['hej', 'sa'], TeX heuristics: ['hej', 'sa'] Pyphen: hejsa
Token: hej, X-SAMPA syllables: ['h\\Ej'], TeX patterns: ['hej'], TeX heuristics: ['hej'] Pyphen: hej
Token