Commit 8ff7fd9

xiaohui-zhang authored and danpovey committed
[scripts] Added utils/prepare_extended_lang.sh (extending vocab of a lang-dir) and rnnlm/change_vocab.sh (changing vocab of an existing rnnlm-dir) (#2247)
1 parent 4fda1b0 commit 8ff7fd9

File tree

4 files changed: +397 -0 lines changed

utils/lang/add_unigrams_arpa.pl (93 additions, 0 deletions)

@@ -0,0 +1,93 @@
#!/usr/bin/env perl

# Copyright 2018 Xiaohui Zhang
# Apache 2.0.
#
use strict;
use warnings;
use Getopt::Long;

my $Usage = <<EOU;
This is a simple script to add unigrams to an ARPA LM file.
Usage: utils/lang/add_unigrams_arpa.pl [options] <oov-prob-file> <scale> <input-arpa >output-arpa
<oov-prob-file> contains a list of words and their probabilities, e.g. "jack 0.2". All probs will be
scaled by a positive scalar <scale> and then used as the unigram prob. of the added word.
The scale should approximately reflect the OOV rate of the language in question.
EOU

my @F;
my @OOVS;

if (@ARGV != 2) {
  die $Usage;
}

# Get parameters.
my $oov_prob_file = shift @ARGV;
my $scale = shift @ARGV;
my $arpa_in = shift @ARGV;
my $arpa_out = shift @ARGV;

# Open files.
open(F, "<$oov_prob_file") || die "$0: Failed to open $oov_prob_file\n";
while (<F>) { push @OOVS, $_; }
my $num_oovs = @OOVS;

$scale > 0.0 || die "Bad scale";
print STDERR "$0: Creating LM file with additional unigrams, using $oov_prob_file\n";

my %vocab;
my $unigram = 0;
my $num_unigrams = 0;
my @lines;

# Parse and record the header and unigrams in the ARPA LM.
while (<STDIN>) {
  if (m/^ngram 1=(\d+)/) { $num_unigrams = $1; }

  if (m/^\\2-grams:$/) { last; }
  if (m/^\\1-grams:$/) { $unigram = 1; push(@lines, $_); next; }
  if (m/^\\2-grams:$/) { $unigram = 0; }

  my @col = split(" ", $_);
  if ($unigram == 1) {
    if (@col > 0) {
      # Record in-vocab words into a map.
      my $word = $col[1];
      $vocab{$word} = 1;
      push(@lines, $_);
    } else {
      # Insert out-of-vocab words and their probs into the unigram list.
      foreach my $l (@OOVS) {
        my @A = split(" ", $l);
        @A == 2 || die "bad line in oov2prob: $l;";
        my $word = $A[0];
        my $prob = $A[1];
        if (exists($vocab{$word})) { next; }
        $num_unigrams++;
        my $log10prob = log($prob * $scale) / log(10.0);
        $vocab{$word} = 1;
        my $line = sprintf("%.6f\t$word\n", $log10prob);
        push(@lines, $line);
      }
    }
  } else { push(@lines, $_); }
}

# Print the header and unigrams, with the updated unigram count in the header.
foreach my $l (@lines) {
  if ($l =~ m/ngram 1=/) {
    print "ngram 1=$num_unigrams\n";
  } else {
    print $l;
  }
}

# Print the remaining sections.
print "\n\\2-grams:\n";
while (<STDIN>) {
  print;
}

close(F);
exit 0;
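
As an illustration (not part of the commit), the script follows the usage string above, reading the ARPA LM on stdin and writing the extended LM to stdout. The file names and the scale value below are hypothetical:

# Minimal invocation sketch: oov2prob contains lines like "jack 0.2",
# and 0.1 is a scale chosen to roughly match the expected OOV rate.
utils/lang/add_unigrams_arpa.pl oov2prob 0.1 < data/local/lm/lm.arpa > data/local/lm/lm_ext.arpa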
utils/lang/adjust_unk_arpa.pl (63 additions, 0 deletions)

@@ -0,0 +1,63 @@
#!/usr/bin/env perl

# Copyright 2018 Xiaohui Zhang
# Apache 2.0.
#
use strict;
use warnings;
use Getopt::Long;

my $Usage = <<EOU;
This is a simple script to set/scale the unigram prob of the OOV dict entry in an ARPA LM file.
Usage: utils/lang/adjust_unk_arpa.pl [options] <oov-dict-entry> <unk-scale> <input-arpa >output-arpa

Allowed options:
  --fixed-value (true|false) : If true, interpret the unk-scale as a fixed value to assign to
                               the unigram prob of the OOV dict entry, rather than using it to
                               scale the unigram prob.
EOU

my $fixed_value = "false";
GetOptions('fixed-value=s' => \$fixed_value);

($fixed_value eq "true" || $fixed_value eq "false") ||
  die "$0: Bad value for option --fixed-value\n";

if (@ARGV != 2) {
  die $Usage;
}

# Get parameters.
my $unk_word = shift @ARGV;
my $unk_scale = shift @ARGV;
my $arpa_in = shift @ARGV;
my $arpa_out = shift @ARGV;

$unk_scale > 0.0 || die "Bad unk_scale"; # this must be positive
if ($fixed_value eq "true") {
  print STDERR "$0: Setting the unigram prob of $unk_word in the LM file to $unk_scale.\n";
} else {
  print STDERR "$0: Scaling the unigram prob of $unk_word in the LM file by $unk_scale.\n";
}

my $unigram = 0; # whether we are visiting the unigram section or not.

# Change the unigram prob of the unk-word in the ARPA LM.
while (<STDIN>) {
  if (m/^\\1-grams:$/) { $unigram = 1; }
  if (m/^\\2-grams:$/) { $unigram = 0; }
  my @col = split(" ", $_);
  if ($unigram == 1 && @col > 1 && $col[1] eq $unk_word) {
    if ($fixed_value eq "true") {
      $col[0] = log($unk_scale) / log(10.0);
    } else {
      $col[0] += log($unk_scale) / log(10.0);
    }
    my $line = join("\t", @col);
    print "$line\n";
  } else {
    print;
  }
}

exit 0;
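
For illustration only, the two documented modes could be invoked as sketched below; the OOV dict entry '<unk>' and the file names are assumptions, not something the commit prescribes:

# Scale the existing unigram prob of the OOV entry by 0.5:
utils/lang/adjust_unk_arpa.pl '<unk>' 0.5 < lm.arpa > lm_scaled.arpa
# Or set it to a fixed probability of 0.01 instead of scaling:
utils/lang/adjust_unk_arpa.pl --fixed-value true '<unk>' 0.01 < lm.arpa > lm_fixed.arpa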
utils/prepare_extended_lang.sh (165 additions, 0 deletions)

@@ -0,0 +1,165 @@
#!/bin/bash
# Copyright 2018 Xiaohui Zhang

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This script extends the lexicon of an existing dict dir with extra lexical
# entries and then builds a valid lang dir from the extended dictionary.

# Begin configuration section.
prep_lang_opts=
stage=0
word_list=  # if a word list (mapping words from the srcdict to IDs) is provided,
            # we'll make sure the IDs of these words are kept as before.
# End configuration section.

echo "$0 $@"  # Print the command line for logging

. utils/parse_options.sh

if [ $# -ne 7 ]; then
  echo "usage: utils/prepare_extended_lang.sh <dict-src-dir> <oov-dict-entry> <extra-lexicon> "
  echo "<phone-symbol-table> <extended-dict-dir> <tmp-dir> <extended-lang-dir>"
  echo "e.g.: utils/prepare_extended_lang.sh data/local/dict '<SPOKEN_NOISE>' lexicon_extra.txt"
  echo "data/lang/phones.txt data/local/dict_ext data/local/lang_ext data/lang_ext"
  echo "The goal is to extend the lexicon from <dict-src-dir> with extra lexical entries from "
  echo "<extra-lexicon>, putting the extended lexicon into <extended-dict-dir>, and then build"
  echo "a valid lang dir <extended-lang-dir>. This is useful when we want to extend the vocab"
  echo "at test time."
  echo "<dict-src-dir> must be a valid dictionary dir and <oov-dict-entry> is the oov word "
  echo "(see utils/prepare_lang.sh for details). A phone symbol table from a previously built "
  echo "lang dir is required, for validating the provided lexical entries."
  echo "options: "
  echo "     --prep-lang-opts STRING        # options to pass to utils/prepare_lang.sh"
  echo "     --word-list <filename>         # default: \"\"; if not empty, re-order the "
  echo "                                    # words in the generated words.txt so that the"
  echo "                                    # words from the provided list have their ids"
  echo "                                    # kept unchanged."
  exit 1;
fi

srcdict=$1
oov_word=$2
extra_lexicon=$3
phone_symbol_table=$4
extdict=$5   # extended dict dir
tmpdir=$6
extlang=$7   # extended lang dir

mkdir -p $extlang $tmpdir

[ -f path.sh ] && . ./path.sh

! utils/validate_dict_dir.pl $srcdict && \
  echo "*Error validating directory $srcdict*" && exit 1;

if [[ ! -f $srcdict/lexicon.txt ]]; then
  echo "**Creating $srcdict/lexicon.txt from $srcdict/lexiconp.txt"
  perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' < $srcdict/lexiconp.txt \
    > $srcdict/lexicon.txt || exit 1;
fi

if [[ ! -f $srcdict/lexiconp.txt ]]; then
  echo "**Creating $srcdict/lexiconp.txt from $srcdict/lexicon.txt"
  perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdict/lexicon.txt > $srcdict/lexiconp.txt || exit 1;
fi

# Check that the phone sets match.
echo "$(basename $0): Validating the source lexicon"
cat $srcdict/lexicon.txt | awk -v f=$phone_symbol_table '
  BEGIN { while ((getline < f) > 0) { sub(/_[BEIS]$/, "", $1); phones[$1] = 1; }}
  { for (x = 2; x <= NF; ++x) {
      if (!($x in phones)) {
        print "The source lexicon contains a phone not in the phones.txt: "$x;
        print "You must provide a phones.txt from the lang built with the source lexicon.";
        exit 1;
      }
  }}' || exit 1;

echo "$(basename $0): Validating the extra lexicon"
cat $extra_lexicon | awk -v f=$phone_symbol_table '
  BEGIN { while ((getline < f) > 0) { sub(/_[BEIS]$/, "", $1); phones[$1] = 1; }}
  { for (x = 2; x <= NF; ++x) { if (!($x in phones)) {
      print "The extra lexicon contains a phone not in the phone symbol table: "$x; exit 1; }
    }
  }' || exit 1;

if [ $stage -le 0 ]; then
  # Generate the extended dict dir.
  echo "$(basename $0): Creating the extended lexicon $extdict/lexicon.txt"
  [ -d $extdict ] && rm -r $extdict 2>/dev/null
  cp -R $srcdict $extdict 2>/dev/null

  # Reformat the source lexicon.
  perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' <$srcdict/lexiconp.txt | awk '{ gsub(/\t/, " "); print }' \
    >$tmpdir/lexicon.txt || exit 1;

  # Filter out lexical entries which are already in the source lexicon.
  awk '{ gsub(/\t/, " "); print }' $extra_lexicon | sort -u | \
    awk 'NR==FNR{a[$0]=1;next} {if (!($0 in a)) print $0 }' $tmpdir/lexicon.txt - \
    > $extdict/lexicon_extra.txt || exit 1;

  echo "$(basename $0): Creating $extdict/lexiconp.txt from $srcdict/lexiconp.txt and $extdict/lexicon_extra.txt"
  perl -ape 's/(\S+\s+)(.+)/${1}1 $2/;' < $extdict/lexicon_extra.txt | \
    cat $srcdict/lexiconp.txt - | awk '{ gsub(/\t/, " "); print }' | \
    sort -u -k1,1 -k2g,2 -k3 > $extdict/lexiconp.txt || exit 1;

  perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' <$extdict/lexiconp.txt >$extdict/lexicon.txt || exit 1;

  # Create lexiconp_silprob.txt if the source dict has one.
  silprob=false
  [ -f $srcdict/lexiconp_silprob.txt ] && silprob=true
  if "$silprob"; then
    echo "$(basename $0): Creating $extdict/lexiconp_silprob.txt from $srcdict/lexiconp_silprob.txt"
    # Here we assume no acoustic evidence for the extra word-pron pairs,
    # so we assign silprob1 = overall_silprob, silprob2 = silprob3 = 1.00.
    overall_silprob=`awk '{if ($1=="overall") print $2}' $srcdict/silprob.txt`
    awk -v overall=$overall_silprob '{
      printf("%s %d %.1f %.2f %.2f",$1, 1, overall, 1.00, 1.00);
      for(n=2;n<=NF;n++) printf " "$n; printf("\n");
      }' $extdict/lexicon_extra.txt | cat $srcdict/lexiconp_silprob.txt - | \
      sort -k1,1 -k2g,2 -k6 \
      > $extdict/lexiconp_silprob.txt || exit 1;
  fi

  if ! utils/validate_dict_dir.pl $extdict >&/dev/null; then
    utils/validate_dict_dir.pl $extdict  # show the output.
    echo "$(basename $0): Validation failed on the extended dict"
    exit 1;
  fi
fi

if [ $stage -le 1 ]; then
  echo "$(basename $0): Preparing the extended lang dir."
  [ -d $extlang ] && rm -r $extlang 2>/dev/null
  utils/prepare_lang.sh $prep_lang_opts $extdict \
    $oov_word $tmpdir $extlang || exit 1;

  # If a word list is provided, make sure the word-ids of these words are kept unchanged
  # in the extended word list.
  if [ -n "$word_list" ] && [ -f "$word_list" ]; then
    # First, make sure there's no OOV in the provided word list.
    if [ `awk -v s=$extlang/words.txt 'BEGIN{ while((getline < s) > 0) { vocab[$1] = 1;}} \
      {if (!($1 in vocab)) print $0}' $word_list | wc -l` -gt 0 ]; then
      echo "$(basename $0): The provided word list contains words outside the extended vocab."
      exit 1;
    fi
    awk -v s=$word_list \
      'BEGIN{ while((getline < s) > 0) { vocab[$1] = 1; n+=1; print $0}} \
      { if (!($1 in vocab)) {print $1" "n; n+=1;}}' $extlang/words.txt > $extlang/words.txt.$$
    mv $extlang/words.txt.$$ $extlang/words.txt
  fi
fi

exit 0;
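
Putting the pieces together, a run following the e.g. line in the usage message might look like the sketch below; passing the original lang dir's words.txt as --word-list is an assumption for illustration, not something the commit prescribes:

utils/prepare_extended_lang.sh --word-list data/lang/words.txt \
  data/local/dict '<SPOKEN_NOISE>' lexicon_extra.txt data/lang/phones.txt \
  data/local/dict_ext data/local/lang_ext data/lang_ext

With --word-list given, the script first checks that every word in the list is still present in the extended vocab, then rewrites $extlang/words.txt so that those words keep their original integer IDs and any newly added words are appended after them.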
