Skip to content

Commit

Permalink
[egs] Fixes to yomdle_zh RE encoding direction, etc. (#2791)
Browse files Browse the repository at this point in the history
  • Loading branch information
ChunChiehChang authored and danpovey committed Oct 19, 2018
1 parent f5e8171 commit 2668098
Show file tree
Hide file tree
Showing 4 changed files with 61 additions and 2 deletions.
58 changes: 58 additions & 0 deletions egs/wsj/s5/utils/lang/bpe/bidi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#!/usr/bin/env python3
# Copyright 2018 Chun-Chieh Chang

# This script is largely written by Stephen Rawls
# and uses the python package https://pypi.org/project/PyICU_BiDi/
# The code leaves right to left text alone and reverses left to right text.

import icu_bidi
import io
import sys
import unicodedata
# Set of every character whose Unicode bidirectional class is strongly
# right-to-left: R = strong right-to-left; AL = strong Arabic right-to-left.
# range() excludes its end point, so sys.maxunicode + 1 is needed to examine
# the whole code space, including the final code point.
rtl_set = set(
    chr(code_point)
    for code_point in range(sys.maxunicode + 1)
    if unicodedata.bidirectional(chr(code_point)) in ('R', 'AL')
)
def determine_text_direction(text):
    """Pick the paragraph direction to use for *text*.

    Returns icu_bidi.UBiDiLevel.UBIDI_RTL when at least one character of
    *text* is in rtl_set (bidirectional class R or AL); otherwise returns
    icu_bidi.UBiDiLevel.UBIDI_LTR.
    """
    # A single strongly right-to-left character is enough to decide RTL.
    if any(ch in rtl_set for ch in text):
        return icu_bidi.UBiDiLevel.UBIDI_RTL
    # No strongly right-to-left character was encountered.
    return icu_bidi.UBiDiLevel.UBIDI_LTR

def utf8_visual_to_logical(text):
    """Reorder *text* using an icu_bidi.Bidi configured for inverse reordering.

    The paragraph direction passed to the Bidi object comes from
    determine_text_direction(text). Returns whatever Bidi.get_reordered()
    produces for the mirroring + keep-base-combining write options.
    """
    paragraph_dir = determine_text_direction(text)

    reorderer = icu_bidi.Bidi()
    # NOTE(review): the assignments below are kept in their original order;
    # the underlying ICU setters may influence one another -- confirm before
    # rearranging.
    reorderer.inverse = True
    reorderer.reordering_mode = (
        icu_bidi.UBiDiReorderingMode.UBIDI_REORDER_INVERSE_LIKE_DIRECT)
    # Alternative option left disabled: UBiDiReorderingOption.UBIDI_OPTION_INSERT_MARKS
    reorderer.reordering_options = (
        icu_bidi.UBiDiReorderingOption.UBIDI_OPTION_DEFAULT)

    reorderer.set_para(text, paragraph_dir, None)

    write_flags = (0
                   | icu_bidi.UBidiWriteReorderedOpt.UBIDI_DO_MIRRORING
                   | icu_bidi.UBidiWriteReorderedOpt.UBIDI_KEEP_BASE_COMBINING)
    return reorderer.get_reordered(write_flags)

def utf8_logical_to_visual(text):
    """Reorder *text* using an icu_bidi.Bidi in its default reordering mode.

    The paragraph direction passed to the Bidi object comes from
    determine_text_direction(text). Returns whatever Bidi.get_reordered()
    produces for the mirroring + keep-base-combining write options.
    """
    paragraph_dir = determine_text_direction(text)

    reorderer = icu_bidi.Bidi()
    reorderer.reordering_mode = (
        icu_bidi.UBiDiReorderingMode.UBIDI_REORDER_DEFAULT)
    # Alternative option left disabled: UBiDiReorderingOption.UBIDI_OPTION_INSERT_MARKS
    reorderer.reordering_options = (
        icu_bidi.UBiDiReorderingOption.UBIDI_OPTION_DEFAULT)

    reorderer.set_para(text, paragraph_dir, None)

    write_flags = (0
                   | icu_bidi.UBidiWriteReorderedOpt.UBIDI_DO_MIRRORING
                   | icu_bidi.UBidiWriteReorderedOpt.UBIDI_KEEP_BASE_COMBINING)
    return reorderer.get_reordered(write_flags)


def main():
    """Filter stdin to stdout, one line at a time.

    Each input line is stripped of surrounding whitespace, passed through
    utf8_logical_to_visual(), reversed character by character, and written
    to stdout followed by a newline.
    """
    # Re-wrap the binary streams so text I/O is decoded/encoded as UTF-8.
    # The wrappers are assigned back to sys.stdin / sys.stdout (rather than
    # kept as locals) so they stay alive until interpreter shutdown.
    sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding="utf8")
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf8")
    for line in sys.stdin:
        visual = utf8_logical_to_visual(line.strip())
        sys.stdout.write(visual[::-1] + '\n')


# Run the filter only when executed as a script, so that importing this
# module does not consume stdin or replace the standard streams.
if __name__ == "__main__":
    main()
1 change: 1 addition & 0 deletions egs/yomdle_zh/v1/local/create_download.sh
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,4 @@ local/create_line_image_from_page_image.py \

echo "Downloading table for CangJie."
wget -P $download_dir/ $cangjie_url || exit 1;
sed -ie '1,8d' $download_dir/cj5-cc.txt
2 changes: 1 addition & 1 deletion egs/yomdle_zh/v1/local/train_lm_lr.sh
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ if [ $stage -le 0 ]; then

rm ${dir}/data/text/* 2>/dev/null || true

cat ${extra_lm} | local/bidi.py | utils/lang/bpe/prepend_words.py --encoding 'utf-8' | python3 utils/lang/bpe/apply_bpe.py -c $data_dir/train/bpe.out | sed 's/@@//g' > ${dir}/data/text/extra_lm.txt
cat ${extra_lm} | utils/lang/bpe/prepend_words.py | python3 utils/lang/bpe/apply_bpe.py -c $data_dir/train/bpe.out | sed 's/@@//g' > ${dir}/data/text/extra_lm.txt

# Note: the name 'dev' is treated specially by pocolm, it automatically
# becomes the dev set.
Expand Down
2 changes: 1 addition & 1 deletion egs/yomdle_zh/v1/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ fi
if [ $stage -le 6 ]; then
echo "$0: Aligning the training data using the e2e chain model..."
echo "Date: $(date)."
steps/nnet3/align.sh --nj $nj --cmd "$cmd" \
steps/nnet3/align.sh --nj $nj --cmd "$cmd" --use-gpu false \
--scale-opts '--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0' \
$data_dir/train_aug $data_dir/lang $exp_dir/chain/e2e_cnn_1a $exp_dir/chain/e2e_ali_train
fi
Expand Down

0 comments on commit 2668098

Please sign in to comment.