Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion scriptshifter/trans.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@

from importlib import import_module
from re import Pattern, compile
from unicodedata import normalize as precomp_normalize

from scriptshifter.exceptions import BREAK, CONT
from scriptshifter.tables import (
BOW, EOW, WORD_BOUNDARY, FEAT_CASEI, FEAT_R2S, FEAT_S2R, HOOK_PKG_PATH,
BOW, EOW, WORD_BOUNDARY, FEAT_R2S, FEAT_S2R, HOOK_PKG_PATH,
get_connection, get_lang_dcap, get_lang_general, get_lang_hooks,
get_lang_ignore, get_lang_map, get_lang_normalize)

Expand Down Expand Up @@ -342,6 +343,17 @@ def _normalize_src(ctx, norm_rules):
NOTE: this manipulates the protected source attribute so it may not
correspond to the originally provided source.
"""
# Normalize precomposed Unicode characters.
#
# In using diacritics, LC standards prefer the decomposed form (base
# character followed by combining diacritic) to the pre-composed form
# (single Unicode symbol for the letter with diacritic).
#
# Note: only safe for R2S.
if ctx.t_dir == FEAT_R2S:
logger.debug("Normalizing pre-composed symbols.")
ctx._src = precomp_normalize("NFD", ctx.src)

for nk, nv in norm_rules.items():
ctx._src = ctx.src.replace(nk, nv)

Expand Down