|
| 1 | +#!/bin/bash |
| 2 | +# Copyright 2018 Xiaohui Zhang |
| 3 | + |
| 4 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | +# you may not use this file except in compliance with the License. |
| 6 | +# You may obtain a copy of the License at |
| 7 | +# |
| 8 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | +# |
| 10 | +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| 11 | +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED |
| 12 | +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, |
| 13 | +# MERCHANTABLITY OR NON-INFRINGEMENT. |
| 14 | +# See the Apache 2 License for the specific language governing permissions and |
| 15 | +# limitations under the License. |
| 16 | + |
# This script extends the lexicon of an existing dict dir with extra lexical
# entries, writes the result to a new dict dir, and then builds a lang dir
# from it with utils/prepare_lang.sh (see the usage message below).

# Begin configuration section.
prep_lang_opts=  # options passed through to utils/prepare_lang.sh
stage=0          # stages greater than 0 skip building the extended dict dir
word_list=       # if a word list (mapping words from the srcdict to IDs) is
                 # provided, we'll make sure the IDs of these words are kept
                 # as before.
# End configuration section.

echo "$0 $*"  # Print the command line for logging

# Stop right away if the option parser cannot be loaded; otherwise options
# such as --stage would be left unparsed and counted as positional arguments.
. utils/parse_options.sh || exit 1
| 30 | + |
# Exactly seven positional arguments are required; anything else prints the
# usage text (one argument per output line) and exits with status 1.
if (( $# != 7 )); then
  printf '%s\n' \
    "usage: utils/prepare_extended_lang.sh <dict-src-dir> <oov-dict-entry> <extra-lexicon> " \
    "<phone-symbol-table> <extended-dict-dir> <tmp-dir> <extended-lang-dir>" \
    "e.g.: utils/prepare_extended_lang.sh data/local/dict '<SPOKEN_NOISE>' lexicon_extra.txt" \
    "data/lang/phones.txt data/local/dict_ext data/local/lang_ext data/lang_ext" \
    "The goal is to extend the lexicon from <dict-src-dir> with extra lexical entries from " \
    "<extra-lexicon>, putting the extended lexicon into <extended-dict-dir>, and then build" \
    "a valid lang dir <extended-lang-dir>. This is useful when we want to extend the vocab" \
    "in test time." \
    "<dict-src-dir> must be a valid dictionary dir and <oov-dict-entry> is the oov word " \
    "(see utils/prepare_lang.sh for details). A phone symbol table from a previsouly built " \
    "lang dir is required, for validating provided lexical entries." \
    "options: " \
    " --prep-lang-opts STRING # options to pass to utils/prepare_lang.sh" \
    " --word-list <filename> # default: \"\"; if not empty, re-order the " \
    " # words in the generated words.txt so that the" \
    " # words from the provided list have their ids" \
    " # kept unchanged."
  exit 1
fi
| 51 | + |
# Positional arguments (see the usage message for their meaning).
srcdict=$1
oov_word=$2
extra_lexicon=$3
phone_symbol_table=$4
extdict=$5  # extended dict dir
tmpdir=$6
extlang=$7  # extended lang dir

# Quoted so that directory names containing whitespace are not split into
# several operands; give up early if the directories cannot be created.
mkdir -p "$extlang" "$tmpdir" || exit 1

# Pick up the environment from path.sh when the current directory has one.
[ -f path.sh ] && . ./path.sh
| 63 | + |
# The source dict dir must pass validation before anything is derived from it.
if ! utils/validate_dict_dir.pl "$srcdict"; then
  echo "*Error validating directory $srcdict*"
  exit 1
fi

# Make sure both lexicon.txt and lexiconp.txt exist in the source dict dir,
# deriving whichever one is missing from the other.
if [[ ! -f "$srcdict/lexicon.txt" ]]; then
  # Drop the second column of lexiconp.txt.
  echo "**Creating $srcdict/lexicon.txt from $srcdict/lexiconp.txt"
  perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' < "$srcdict/lexiconp.txt" \
    > "$srcdict/lexicon.txt" || exit 1
fi

if [[ ! -f "$srcdict/lexiconp.txt" ]]; then
  # Insert "1.0<TAB>" after the first column of lexicon.txt.
  echo "**Creating $srcdict/lexiconp.txt from $srcdict/lexicon.txt"
  perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < "$srcdict/lexicon.txt" \
    > "$srcdict/lexiconp.txt" || exit 1
fi
| 77 | + |
# Checks if the phone sets match.

# Usage: validate_lexicon_phones <lexicon> <phone-symbol-table> <error-prefix> [<hint>]
# Reads the first column of <phone-symbol-table>, strips a trailing _B/_E/_I/_S
# from each symbol, and checks that fields 2..NF of every <lexicon> line are
# among those symbols.  On the first unknown phone it prints <error-prefix>
# followed by the phone, then <hint> on its own line if one was given, and
# returns non-zero.  Returns 0 when every phone is known.
validate_lexicon_phones() {
  local lexicon=$1 phone_table=$2 error_prefix=$3 hint=${4:-}
  # The lexicon is still fed through a pipe from cat, so an unreadable lexicon
  # behaves as before (cat reports it, awk sees an empty stream).
  cat "$lexicon" | awk -v f="$phone_table" -v msg="$error_prefix" -v hint="$hint" '
    BEGIN {
      while ((getline < f) > 0) { sub(/_[BEIS]$/, "", $1); phones[$1] = 1 }
    }
    {
      for (x = 2; x <= NF; ++x) {
        if (!($x in phones)) {
          print msg $x
          if (hint != "") print hint
          exit 1
        }
      }
    }'
}

echo "$(basename "$0"): Validating the source lexicon"
validate_lexicon_phones "$srcdict/lexicon.txt" "$phone_symbol_table" \
  "The source lexicon contains a phone not in the phones.txt: " \
  "You must provide a phones.txt from the lang built with the source lexicon." \
  || exit 1

echo "$(basename "$0"): Validating the extra lexicon"
validate_lexicon_phones "$extra_lexicon" "$phone_symbol_table" \
  "The extra lexicon contains a phone not in the phone symbol table: " \
  || exit 1
| 97 | + |
# Usage: format_extra_silprob_entries <overall-silprob> <lexicon-extra>
# Prints one lexiconp_silprob.txt line for every line of <lexicon-extra>
# (lines of the form "<word> <phone1> <phone2> ..."): the word, the integer 1,
# <overall-silprob> printed with one decimal, 1.00, 1.00, then the phones.
# NOTE(review): "%.1f" rounds <overall-silprob> to a single decimal; that is
# the pre-existing output format and is kept as is -- confirm it is intended.
format_extra_silprob_entries() {
  awk -v overall="$1" '{
    printf("%s %d %.1f %.2f %.2f", $1, 1, overall, 1.00, 1.00);
    # Print each phone through "%s": using the phone itself as the format
    # string would mangle any symbol that contains "%".
    for (n = 2; n <= NF; n++) printf(" %s", $n);
    printf("\n");
  }' "$2"
}

if [ "$stage" -le 0 ]; then
  # Generate the extended dict dir, starting from a fresh copy of the source
  # dict dir.
  echo "$(basename "$0"): Creating the extended lexicon $extdict/lexicon.txt"
  [ -d "$extdict" ] && rm -r "$extdict" 2>/dev/null
  cp -R "$srcdict" "$extdict" 2>/dev/null

  # Reformat the source lexicon: drop the second column of lexiconp.txt and
  # turn tabs into spaces so that whole lines can be compared below.
  perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' < "$srcdict/lexiconp.txt" \
    | awk '{ gsub(/\t/, " "); print }' \
    > "$tmpdir/lexicon.txt" || exit 1

  # Filter lexical entries which are already in the source lexicon
  awk '{ gsub(/\t/, " "); print }' "$extra_lexicon" | sort -u \
    | awk 'NR==FNR{a[$0]=1;next} {if (!($0 in a)) print $0 }' "$tmpdir/lexicon.txt" - \
    > "$extdict/lexicon_extra.txt" || exit 1

  # The remaining extra entries get the value 1 inserted as second column and
  # are merged with the source lexiconp.txt.
  echo "$(basename "$0"): Creating $extdict/lexiconp.txt from $srcdict/lexiconp.txt and $extdict/lexicon_extra.txt"
  perl -ape 's/(\S+\s+)(.+)/${1}1 $2/;' < "$extdict/lexicon_extra.txt" \
    | cat "$srcdict/lexiconp.txt" - | awk '{ gsub(/\t/, " "); print }' \
    | sort -u -k1,1 -k2g,2 -k3 > "$extdict/lexiconp.txt" || exit 1

  perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' < "$extdict/lexiconp.txt" \
    > "$extdict/lexicon.txt" || exit 1

  # Create lexiconp_silprob.txt, only when the source dict dir has one.
  if [ -f "$srcdict/lexiconp_silprob.txt" ]; then
    echo "$(basename "$0"): Creating $extdict/lexiconp_silprob.txt from $srcdict/lexiconp_silprob.txt"
    # Here we assume no acoustic evidence for the extra word-pron pairs.
    # So we assign silprob1 = overall_silprob, silprob2 = silprob3 = 1.00
    overall_silprob=$(awk '{if ($1=="overall") print $2}' "$srcdict/silprob.txt")
    format_extra_silprob_entries "$overall_silprob" "$extdict/lexicon_extra.txt" \
      | cat "$srcdict/lexiconp_silprob.txt" - \
      | sort -k1,1 -k2g,2 -k6 \
      > "$extdict/lexiconp_silprob.txt" || exit 1
  fi

  # Validate quietly first; only on failure run again to show the output.
  if ! utils/validate_dict_dir.pl "$extdict" >&/dev/null; then
    utils/validate_dict_dir.pl "$extdict" # show the output.
    echo "$(basename "$0"): Validation failed on the extended dict"
    exit 1
  fi
fi
| 142 | + |
# Usage: count_words_missing_from_vocab <word-list> <words.txt>
# Prints how many lines of <word-list> have a first field that does not occur
# as a first field in <words.txt>.
count_words_missing_from_vocab() {
  awk -v s="$2" '
    BEGIN { while ((getline < s) > 0) { vocab[$1] = 1 } }
    !($1 in vocab) { missing += 1 }
    END { print missing + 0 }' "$1"
}

# Usage: merge_word_ids <word-list> <words.txt>
# Prints <word-list> unchanged, followed by every word of <words.txt> that is
# not in the list.  The added words are numbered consecutively, starting at
# the number of lines in <word-list>.
merge_word_ids() {
  awk -v s="$1" '
    BEGIN {
      n = 0
      while ((getline < s) > 0) { vocab[$1] = 1; n += 1; print $0 }
    }
    { if (!($1 in vocab)) { print $1" "n; n += 1 } }' "$2"
}

if [ "$stage" -le 1 ]; then
  echo "$(basename "$0"): Preparing the extended lang dir."
  [ -d "$extlang" ] && rm -r "$extlang" 2>/dev/null
  # $prep_lang_opts is deliberately left unquoted: it may carry several
  # options that have to be split into separate words.
  utils/prepare_lang.sh $prep_lang_opts "$extdict" \
    "$oov_word" "$tmpdir" "$extlang" || exit 1

  # If a word list is provided, make sure the word-ids of these words are kept unchanged
  # in the extended word list.
  # The -n test matters: with an empty $word_list, a bare "[ -f ]" is true and
  # words.txt would be rewritten even though no list was given.
  if [ -n "$word_list" ] && [ -f "$word_list" ]; then
    # First, make sure there's no OOV in the provided word-list.
    if [ "$(count_words_missing_from_vocab "$word_list" "$extlang/words.txt")" -gt 0 ]; then
      echo "$(basename "$0"): The provided word list contains words out of the extended vocab."
      exit 1
    fi
    # NOTE(review): only words.txt is rewritten here; nothing else that
    # utils/prepare_lang.sh produced is regenerated in this script -- confirm
    # that callers account for the changed numbering.
    merge_word_ids "$word_list" "$extlang/words.txt" \
      > "$extlang/words.txt.$$" || exit 1
    mv "$extlang/words.txt.$$" "$extlang/words.txt"
  fi
fi
| 164 | + |
# Reaching this point means no step above bailed out: report success.
exit 0
0 commit comments