
[scripts] Added utils/prepare_extended_lang.sh (extending vocab of a lang-dir) and rnnlm/change_vocab.sh (changing vocab of an existing rnnlm-dir) (#2247)
xiaohui-zhang authored and danpovey committed Apr 24, 2018
1 parent 4fda1b0 commit 8ff7fd9f84e89a652716956d8989e9205d7bf52f
utils/lang/add_unigrams_arpa.pl
@@ -0,0 +1,93 @@
#!/usr/bin/env perl
# Copyright 2018 Xiaohui Zhang
# Apache 2.0.
#
use strict;
use warnings;
use Getopt::Long;
my $Usage = <<EOU;
# This is a simple script to add unigrams to an ARPA lm file.
Usage: utils/lang/add_unigrams_arpa.pl [options] <oov-prob-file> <scale> <input-arpa >output-arpa
<oov-prob-file> contains a list of words and their probabilities, e.g. "jack 0.2". Each prob will be
scaled by a positive scalar <scale> and then used as the unigram prob of the added word.
The scale should approximately reflect the OOV rate of the language in question.
EOU
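# Example invocation (hypothetical file names), reading the ARPA LM from stdin
# and writing the extended LM to stdout:
#   utils/lang/add_unigrams_arpa.pl oov2prob 0.1 <lm.arpa >lm_ext.arpa
# With <scale>=0.1, a word listed in oov2prob with prob 0.2 would be added with
# unigram log10 prob log(0.2 * 0.1) / log(10) = log10(0.02), i.e. about -1.699.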
my @OOVS;
if (@ARGV != 2) {
die $Usage;
}
# Gets parameters.
my $oov_prob_file = shift @ARGV;
my $scale = shift @ARGV;
# The ARPA LM itself is read from stdin and written to stdout.
# Opens files.
open(F, "<$oov_prob_file") || die "$0: Failed to open $oov_prob_file\n";
while (<F>) { push @OOVS, $_; }
my $num_oovs = @OOVS;
$scale > 0.0 || die "Bad scale";
print STDERR "$0: Creating LM file with additional unigrams, using $oov_prob_file\n";
my %vocab;
my $unigram = 0;
my $num_unigrams = 0;
my @lines;
# Parse and record the head and unigrams in the ARPA LM.
while(<STDIN>) {
  if (m/^ngram 1=(\d+)/) { $num_unigrams = $1; }
  if (m/^\\2-grams:$/) { last; }
  if (m/^\\1-grams:$/) { $unigram = 1; push(@lines, $_); next; }
  my @col = split(" ", $_);
  if ( $unigram == 1 ) {
    if ( @col > 0 ) {
      # Record in-vocab words into a map.
      my $word = $col[1];
      $vocab{$word} = 1;
      push(@lines, $_);
    } else {
      # We have reached the blank line ending the unigram section. Insert
      # out-of-vocab words and their probs into the unigram list.
      foreach my $l (@OOVS) {
        my @A = split(" ", $l);
        @A == 2 || die "$0: bad line in $oov_prob_file: $l";
        my $word = $A[0];
        my $prob = $A[1];
        if (exists($vocab{$word})) { next; }
        $num_unigrams ++;
        my $log10prob = (log($prob * $scale) / log(10.0));
        $vocab{$word} = 1;
        my $line = sprintf("%.6f\t$word\n", $log10prob);
        push(@lines, $line);
      }
    }
  } else { push(@lines, $_); }
}
# Print the head and unigrams, with the updated number of unigrams in the head.
foreach my $l (@lines) {
  if ($l =~ m/ngram 1=/) {
    print "ngram 1=$num_unigrams\n";
  } else {
    print $l;
  }
}
# Print the remaining part of the LM (2-grams and higher orders).
print "\n\\2-grams:\n";
while(<STDIN>) {
  print;
}
close(F);
exit 0;
utils/lang/adjust_unk_arpa.pl
@@ -0,0 +1,63 @@
#!/usr/bin/env perl
# Copyright 2018 Xiaohui Zhang
# Apache 2.0.
#
use strict;
use warnings;
use Getopt::Long;
my $Usage = <<EOU;
# This is a simple script to set/scale the unigram prob of the OOV dict entry in an ARPA lm file.
Usage: utils/lang/adjust_unk_arpa.pl [options] <oov-dict-entry> <unk-scale> <input-arpa >output-arpa
Allowed options:
--fixed-value (true|false) : If true, interpret <unk-scale> as a fixed value to assign to the
                             unigram prob of the OOV dict entry, rather than as a factor by
                             which to scale the existing unigram prob.
EOU
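# Example invocations (hypothetical file names):
#   # scale the unigram prob of '<unk>' by 0.5:
#   utils/lang/adjust_unk_arpa.pl '<unk>' 0.5 <lm.arpa >lm_adj.arpa
#   # set the unigram prob of '<unk>' to a fixed value of 0.01:
#   utils/lang/adjust_unk_arpa.pl --fixed-value true '<unk>' 0.01 <lm.arpa >lm_adj.arpa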
my $fixed_value = "false";
GetOptions('fixed-value=s' => \$fixed_value);
($fixed_value eq "true" || $fixed_value eq "false") ||
die "$0: Bad value for option --fixed-value\n";
if (@ARGV != 2) {
die $Usage;
}
# Gets parameters.
my $unk_word = shift @ARGV;
my $unk_scale = shift @ARGV;
# The ARPA LM itself is read from stdin and written to stdout.
$unk_scale > 0.0 || die "Bad unk_scale"; # this must be positive
if ( $fixed_value eq "true" ) {
print STDERR "$0: Setting the unigram prob of $unk_word in LM file as $unk_scale.\n";
} else {
print STDERR "$0: Scaling the unigram prob of $unk_word in LM file by $unk_scale.\n";
}
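# For example, with the default scaling behavior and <unk-scale>=0.5, an existing
# unigram log10 prob of -2.0 becomes -2.0 + log10(0.5), i.e. about -2.301; with
# --fixed-value true and <unk-scale>=0.01 it is set to log10(0.01) = -2.0.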
my $unigram = 0; # whether we are inside the unigram section or not.
# Change the unigram prob of the unk-word in the ARPA LM.
while(<STDIN>) {
  if (m/^\\1-grams:$/) { $unigram = 1; }
  if (m/^\\2-grams:$/) { $unigram = 0; }
  my @col = split(" ", $_);
  if ( $unigram == 1 && @col > 1 && $col[1] eq $unk_word ) {
    if ( $fixed_value eq "true" ) {
      $col[0] = (log($unk_scale) / log(10.0));
    } else {
      $col[0] += (log($unk_scale) / log(10.0));
    }
    my $line = join("\t", @col);
    print "$line\n";
  } else {
    print;
  }
}
exit 0;
utils/prepare_extended_lang.sh
@@ -0,0 +1,165 @@
#!/bin/bash
# Copyright 2018 Xiaohui Zhang
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This script extends the lexicon in <dict-src-dir> with extra lexical entries
# and builds a lang dir with the extended vocabulary in <extended-lang-dir>;
# see the usage message below for details.
# Begin configuration section.
prep_lang_opts=
stage=0
word_list= # if a word list (mapping words from the srcdict to IDs) is provided,
# we'll make sure the IDs of these words are kept as before.
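# The word list is expected to look like the words.txt of an existing lang dir,
# i.e. one "<word> <id>" pair per line, e.g. (hypothetical excerpt):
#   <eps> 0
#   <SPOKEN_NOISE> 1
#   abandon 2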
# End configuration section.
echo "$0 $@" # Print the command line for logging
. utils/parse_options.sh
if [ $# -ne 7 ]; then
echo "usage: utils/prepare_extended_lang.sh <dict-src-dir> <oov-dict-entry> <extra-lexicon> "
echo "<phone-symbol-table> <extended-dict-dir> <tmp-dir> <extended-lang-dir>"
echo "e.g.: utils/prepare_extended_lang.sh data/local/dict '<SPOKEN_NOISE>' lexicon_extra.txt"
echo "data/lang/phones.txt data/local/dict_ext data/local/lang_ext data/lang_ext"
echo "The goal is to extend the lexicon from <dict-src-dir> with extra lexical entries from "
echo "<extra-lexicon>, putting the extended lexicon into <extended-dict-dir>, and then build"
echo "a valid lang dir <extended-lang-dir>. This is useful when we want to extend the vocab"
echo "in test time."
echo "<dict-src-dir> must be a valid dictionary dir and <oov-dict-entry> is the oov word "
echo "(see utils/prepare_lang.sh for details). A phone symbol table from a previsouly built "
echo "lang dir is required, for validating provided lexical entries."
echo "options: "
echo " --prep-lang-opts STRING # options to pass to utils/prepare_lang.sh"
echo " --word-list <filename> # default: \"\"; if not empty, re-order the "
echo " # words in the generated words.txt so that the"
echo " # words from the provided list have their ids"
echo " # kept unchanged."
exit 1;
fi
srcdict=$1
oov_word=$2
extra_lexicon=$3
phone_symbol_table=$4
extdict=$5 # extended dict dir
tmpdir=$6
extlang=$7 # extended lang dir
mkdir -p $extlang $tmpdir
[ -f path.sh ] && . ./path.sh
! utils/validate_dict_dir.pl $srcdict && \
echo "*Error validating directory $srcdict*" && exit 1;
if [[ ! -f $srcdict/lexicon.txt ]]; then
echo "**Creating $dir/lexicon.txt from $dir/lexiconp.txt"
perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' < $srcdict/lexiconp.txt \
> $srcdict/lexicon.txt || exit 1;
fi
if [[ ! -f $srcdict/lexiconp.txt ]]; then
echo "**Creating $srcdict/lexiconp.txt from $srcdict/lexicon.txt"
perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdict/lexicon.txt > $srcdict/lexiconp.txt || exit 1;
fi
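# For illustration (hypothetical entry): a lexiconp.txt line "hello 1.0 hh ah l ow"
# corresponds to the lexicon.txt line "hello hh ah l ow"; the perl one-liners above
# just drop or insert the pronunciation-probability field.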
# Checks if the phone sets match.
echo "$(basename $0): Validating the source lexicon"
cat $srcdict/lexicon.txt | awk -v f=$phone_symbol_table '
BEGIN { while ((getline < f) > 0) { sub(/_[BEIS]$/, "", $1); phones[$1] = 1; }}
{ for (x = 2; x <= NF; ++x) {
if (!($x in phones)) {
print "The source lexicon contains a phone not in the phones.txt: "$x;
print "You must provide a phones.txt from the lang built with the source lexicon.";
exit 1;
}
}}' || exit 1;
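# Note: the sub() above strips word-position suffixes, so a phones.txt entry such as
# "AH_B 5" (hypothetical) lets the base phone "AH" used in the lexicon pass validation.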
echo "$(basename $0): Validating the extra lexicon"
cat $extra_lexicon | awk -v f=$phone_symbol_table '
BEGIN { while ((getline < f) > 0) { sub(/_[BEIS]$/, "", $1); phones[$1] = 1; }}
{ for (x = 2; x <= NF; ++x) { if (!($x in phones)) {
print "The extra lexicon contains a phone not in the phone symbol table: "$x; exit 1; }
}
}' || exit 1;
if [ $stage -le 0 ]; then
# Generate the extended dict dir.
echo "$(basename $0): Creating the extended lexicon $extdict/lexicon.txt"
[ -d $extdict ] && rm -r $extdict 2>/dev/null
cp -R $srcdict $extdict 2>/dev/null
# Reformat the source lexicon
perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' <$srcdict/lexiconp.txt | awk '{ gsub(/\t/, " "); print }' \
>$tmpdir/lexicon.txt || exit 1;
# Filter out lexical entries which are already in the source lexicon.
awk '{ gsub(/\t/, " "); print }' $extra_lexicon | sort -u | \
awk 'NR==FNR{a[$0]=1;next} {if (!($0 in a)) print $0 }' $tmpdir/lexicon.txt - \
> $extdict/lexicon_extra.txt || exit 1;
echo "$(basename $0): Creating $extdict/lexiconp.txt from $srcdict/lexiconp.txt and $extdict/lexicon_extra.txt"
perl -ape 's/(\S+\s+)(.+)/${1}1 $2/;' < $extdict/lexicon_extra.txt | \
cat $srcdict/lexiconp.txt - | awk '{ gsub(/\t/, " "); print }' | \
sort -u -k1,1 -k2g,2 -k3 > $extdict/lexiconp.txt || exit 1;
perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' <$extdict/lexiconp.txt >$extdict/lexicon.txt || exit 1;
# Create lexiconp_silprob.txt if the source dict has silprobs.
silprob=false
[ -f $srcdict/lexiconp_silprob.txt ] && silprob=true
if "$silprob"; then
echo "$(basename $0): Creating $extdict/lexiconp_silprob.txt from $srcdict/lexiconp_silprob.txt"
# Here we assume no acoustic evidence for the extra word-pron pairs.
# So we assign silprob1 = overall_silprob, silprob2 = silprob3 = 1.00
overall_silprob=`awk '{if ($1=="overall") print $2}' $srcdict/silprob.txt`
awk -v overall=$overall_silprob '{
printf("%s %d %.1f %.2f %.2f",$1, 1, overall, 1.00, 1.00);
for(n=2;n<=NF;n++) printf " "$n; printf("\n");
}' $extdict/lexicon_extra.txt | cat $srcdict/lexiconp_silprob.txt - | \
sort -k1,1 -k2g,2 -k6 \
> $extdict/lexiconp_silprob.txt || exit 1;
fi
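# For illustration (hypothetical entry): with overall silprob 0.8, an extra word
# "zork" with pron "z ao r k" would get the lexiconp_silprob.txt line
#   "zork 1 0.8 1.00 1.00 z ao r k"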
if ! utils/validate_dict_dir.pl $extdict >&/dev/null; then
utils/validate_dict_dir.pl $extdict # show the output.
echo "$(basename $0): Validation failed on the extended dict"
exit 1;
fi
fi
if [ $stage -le 1 ]; then
echo "$(basename $0): Preparing the extended lang dir."
[ -d $extlang ] && rm -r $extlang 2>/dev/null
utils/prepare_lang.sh $prep_lang_opts $extdict \
$oov_word $tmpdir $extlang || exit 1;
# If a word list is provided, make sure the word-ids of these words are kept unchanged
# in the extended word list.
if [ -f $word_list ]; then
# First, make sure there's no OOV in the provided word-list.
if [ `awk -v s=$extlang/words.txt 'BEGIN{ while((getline < s) > 0) { vocab[$1] = 1;}} \
{if (!($1 in vocab)) print $0}' $word_list | wc -l` -gt 0 ]; then
echo "$(basename $0): The provided word list contains words out of the extended vocab."
exit 1;
fi
awk -v s=$word_list \
'BEGIN{ while((getline < s) > 0) { vocab[$1] = 1; n+=1; print $0}} \
{ if (!($1 in vocab)) {print $1" "n; n+=1;}}' $extlang/words.txt > $extlang/words.txt.$$
mv $extlang/words.txt.$$ $extlang/words.txt
fi
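# For illustration: if the provided word list uses consecutive ids 0..n-1 (as in a
# standard words.txt), those words keep their original ids and every remaining word
# in the extended vocab is appended with a fresh id starting from n.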
fi
exit 0;