rnnlm adaptation by the cache model on swbd #2659

Open · wants to merge 6 commits into master
58 changes: 58 additions & 0 deletions egs/swbd/s5c/get_ppl.sh
@@ -0,0 +1,58 @@

#!/bin/bash

. ./path.sh
. ./cmd.sh

# variables for lattice rescoring
run_rescore=false
ac_model_dir=exp/chain/tdnn_lstm_1e_sp
decode_dir_suffix=rnnlm_adaptation_dan_formula_max_2
ngram_order=4 # approximate the lattice-rescoring by limiting the max-ngram-order
              # if it's set, it merges histories in the lattice if they share
              # the same ngram history, and this prevents the lattice from
              # exploding exponentially.
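# (Illustrative: with ngram_order=4, two lattice paths whose last three
# words agree are treated as having the same RNNLM history state, so the
# lattice does not expand into one state per distinct full prefix.)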

src_dir=exp/chain/tdnn_lstm_1e_sp/decode_eval2000_fsh_sw1_fg
stage=2

. parse_options.sh

if [ $stage -le 1 ]; then
  # cat data/train/text | cut -d " " -f2- | sed "s= =\n=g" | grep . | sort | uniq -c | awk '{print $2, $1}' > train.unigram
  # cat data/train/text | cut -d " " -f2- | sym2int.pl --map-oov "<unk>" data/lang/words.txt | sed "s= =\n=g" | grep . | sort | uniq -c | awk '{print $2, $1}' | sort -k1n > train.unigram
  # cat data/train/text | sym2int.pl -f 2- --map-oov "<unk>" data/lang/words.txt > train.txt
  cat data/eval2000/text | cut -d " " -f2- | tr A-Z a-z > test.raw
  cat data/eval2000/text | cut -d " " -f1 > test.head
  paste test.head test.raw | sym2int.pl -f 2- --map-oov "<unk>" data/lang/words.txt > test.txt
  # lattice-arc-post --acoustic-scale=0.1 "ark:gunzip -c $src_dir/lat.*.gz|" post.txt
  # cat post.txt | sed 's=_= =g' | awk '{print $1"_"$2,$6,$7}' | awk '{a[$1][$3]+=$2}END{for(i in a) for(j in a[i]) print i, j, a[i][j]}' > maps
fi

dir=exp/rnnlm_lstm_1c

word_embedding="rnnlm-get-word-embedding $dir/word_feats.txt $dir/feat_embedding.final.mat -|"

rnnlm-nbest-probs-adjust $(cat $dir/special_symbol_opts.txt) $dir/final.raw "$word_embedding" test.txt data/eval2000/utt2spk train.unigram

# Stop here unless lattice rescoring was requested.
$run_rescore || exit 0

if [ $stage -le 2 ]; then
  echo "Perform lattice-rescoring on $ac_model_dir"
  LM=fsh_sw1_tg
  for decode_set in eval2000; do
    decode_dir=${ac_model_dir}/decode_${decode_set}_$LM

    # Lattice rescoring
    rnnlm/lmrescore_rnnlm_lat_adapt.sh \
      --cmd "$decode_cmd --mem 4G -l hostname='[bc]*'" \
      --weight 0.5 --max-ngram-order $ngram_order \
      data/lang_$LM $dir \
      data/${decode_set}_hires ${decode_dir} \
      ${decode_dir}_${decode_dir_suffix} data/eval2000/utt2spk train.unigram
  done
fi
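
# Example invocation (a sketch: assumes train.unigram already exists, e.g.
# via the commented-out commands in stage 1, and that the first-pass
# lattices are in --src-dir):
#   ./get_ppl.sh --stage 1 \
#     --src-dir exp/chain/tdnn_lstm_1e_sp/decode_eval2000_fsh_sw1_fg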
6 changes: 2 additions & 4 deletions egs/swbd/s5c/local/score_sclite.sh
@@ -45,7 +45,6 @@ if [ -f $dir/../frame_subsampling_factor ]; then
fi

name=`basename $data`; # e.g. eval2000

mkdir -p $dir/scoring/log

if [ $stage -le 0 ]; then
@@ -92,7 +91,7 @@ if [ $stage -le 2 ]; then
done
fi

# For eval2000 score the subsets
# For eval2000 and rt03 score the subsets
case "$name" in
eval2000*)
# Score only the swbd part...
@@ -114,8 +113,7 @@ case "$name" in
done
fi
;;
rt03* )

rt03*)
# Score only the swbd part...
if [ $stage -le 3 ]; then
for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
127 changes: 127 additions & 0 deletions egs/swbd/s5c/rnnlm_adapt_by_cache.sh
@@ -0,0 +1,127 @@
#!/bin/bash

. ./cmd.sh
. ./path.sh


cmd=queue.pl
stage=0
ngram_order=4 # approximate the lattice-rescoring by limiting the max-ngram-order
              # if it's set, it merges histories in the lattice if they share
              # the same ngram history, and this prevents the lattice from
              # exploding exponentially.
ppl_from_transcription=false
ppl_from_lattice=true
run_rescore=true
two_spks_mode=true
one_best_mode=false
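# two_spks_mode: when true, the cache unigram is presumably estimated from
#   both sides of a conversation rather than the current speaker only.
# one_best_mode: when true, cache counts are presumably collected from the
#   1-best hypothesis rather than from lattice posteriors.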

lm_weight=0.8
correction_weight=0.75

. ./utils/parse_options.sh

wordlist=data/lang/words.txt
ac_model_dir=exp/chain/tdnn_lstm_1e_sp
rnnlm_dir=exp/rnnlm_lstm_1c
text_dir=data/rnnlm_cache_adapt
mkdir -p $text_dir

if [ $stage -le 0 ]; then
  for dataset in eval2000 rt03; do
    data_dir=$text_dir/$dataset
    mkdir -p $data_dir
    cat data/train/text | cut -d " " -f2- > $data_dir/train.txt
    cat $data_dir/train.txt $wordlist | sym2int.pl --map-oov "<unk>" $wordlist | \
      sed "s= =\n=g" | grep . | sort | uniq -c | sort -k1nr | awk '{print $2, $1}' > $data_dir/train.unigram
    cat data/$dataset/text | cut -d " " -f2- | tr A-Z a-z > $data_dir/$dataset.raw
    cat data/$dataset/text | cut -d " " -f1 > $data_dir/$dataset.head
    paste $data_dir/$dataset.head $data_dir/$dataset.raw | \
      sym2int.pl -f 2- --map-oov "<unk>" $wordlist > $data_dir/$dataset.txt
  done
fi
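# At this point $data_dir/train.unigram holds "<word-id> <count>" pairs in
# descending count order, and $data_dir/$dataset.txt holds the test
# transcripts with words mapped to integer ids (OOVs to <unk>).  Feeding
# $wordlist into the counts alongside train.txt gives every vocabulary entry
# at least one count, so the background unigram has full coverage.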

# Compute perplexity with the RNNLM adapted by a unigram cache model
# estimated from the transcription.
if [ $stage -le 1 ] && $ppl_from_transcription; then
  word_embedding="rnnlm-get-word-embedding $rnnlm_dir/word_feats.txt $rnnlm_dir/feat_embedding.final.mat -|"
  for dataset in eval2000 rt03; do
    data_dir=$text_dir/$dataset
    echo "Compute PPL from the adjusted RNNLM on $dataset..."
    rnnlm-nbest-probs-adjust --correction-weight=$correction_weight \
      --two-speaker-mode=$two_spks_mode \
      $(cat $rnnlm_dir/special_symbol_opts.txt) \
      $rnnlm_dir/final.raw "$word_embedding" $data_dir/$dataset.txt \
      data/$dataset/utt2spk $data_dir/train.unigram
  done
fi

# Compute perplexity with the RNNLM adapted by a unigram cache model
# estimated from first-pass decoding lattices.
if [ $stage -le 2 ] && $ppl_from_lattice; then
  word_embedding="rnnlm-get-word-embedding $rnnlm_dir/word_feats.txt $rnnlm_dir/feat_embedding.final.mat -|"
  LM=fsh_sw1_fg
  for dataset in eval2000 rt03; do
    data_dir=$text_dir/$dataset
    decode_dir=${ac_model_dir}/decode_${dataset}_$LM
    ppl_name=ppl_cache
    log_dir=$data_dir/$ppl_name/log/cw$correction_weight
    mkdir -p $log_dir
    nj=`cat $decode_dir/num_jobs` || exit 1;

    echo "Compute PPL from the adjusted RNNLM by lattice posteriors on $dataset..."
    $cmd JOB=1:$nj $log_dir/perplexity.JOB.log \
      rnnlm-nbest-probs-adjust-lattice --correction-weight=$correction_weight \
      --lm-scale=$lm_weight \
      --two-speaker-mode=$two_spks_mode \
      --one-best-mode=$one_best_mode \
      $(cat $rnnlm_dir/special_symbol_opts.txt) \
      $rnnlm_dir/final.raw "$word_embedding" $data_dir/$dataset.txt \
      data/$dataset/utt2spk $data_dir/train.unigram \
      "ark:gunzip -c $decode_dir/lat.JOB.gz|"

    # Accumulate the total log-probability over all jobs, then turn it into
    # a perplexity.  Note: wc -w on $dataset.txt also counts the utterance id
    # of each line, which stands in for the per-sentence eos
    # (end-of-sentence) symbol.
    [ -f $log_dir/ppls.log ] && rm $log_dir/ppls.log
    word_count=`cat $data_dir/$dataset.txt | wc -w`
    for i in `seq 1 $nj`; do
      grep 'Log' $log_dir/perplexity.$i.log | \
        awk '{n += $NF}; END{print n}' >> $log_dir/ppls.log
    done
    awk '{n += $1}; END{print n}' $log_dir/ppls.log > $log_dir/ppls_sum.log
    logprobs=`cat $log_dir/ppls_sum.log`
    echo "scale=3;$logprobs/$word_count" | bc > $log_dir/entropy.log
    ppl=`awk '{printf("%.1f",exp(-$1))}' $log_dir/entropy.log`
    echo "PPL by lattice posteriors on $dataset is $ppl" > $log_dir/ppl
    echo "PPL by lattice posteriors on $dataset is $ppl"
  done
fi
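# A worked instance of the arithmetic above (illustrative numbers only):
# if the summed log-probability over a test set counted as 50000 words is
# -200000, then entropy.log holds -200000/50000 = -4.000, and the reported
# perplexity is exp(4.0) ≈ 54.6.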

if [ $stage -le 3 ] && $run_rescore; then
  LM=fsh_sw1_fg
  decode_out_dir=exp/chain/cache
  mkdir -p $decode_out_dir
  for decode_set in eval2000 rt03; do
    echo "Perform pruned lattice-rescoring on $ac_model_dir on dataset $decode_set"
    decode_dir=${ac_model_dir}/decode_${decode_set}_$LM
    decode_out=$decode_out_dir/decode_${decode_set}_${LM}_lmw${lm_weight}_cw${correction_weight}_pruned
    mkdir -p $decode_out
    cp $decode_dir/../final.mdl $decode_out_dir/

    rnnlm/lmrescore_rnnlm_lat_pruned_cache_adapt.sh \
      --cmd "$decode_cmd --mem 4G -l hostname='[bc]*'" \
      --weight $lm_weight \
      --correction-weight $correction_weight \
      --max-ngram-order $ngram_order \
      --two-speaker-mode $two_spks_mode \
      --one-best-mode $one_best_mode \
      data/lang_$LM $rnnlm_dir \
      data/${decode_set}_hires ${decode_dir} \
      $decode_out data/${decode_set}/utt2spk \
      $text_dir/$decode_set/train.unigram
  done
fi
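
# Example sweep over the cache correction weight (a sketch; option names map
# to the variables above via utils/parse_options.sh):
#   for cw in 0.25 0.5 0.75; do
#     ./rnnlm_adapt_by_cache.sh --stage 2 --correction-weight $cw \
#       --lm-weight 0.8 --run-rescore false
#   done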
117 changes: 117 additions & 0 deletions scripts/rnnlm/lmrescore_rnnlm_lat_adapt.sh
@@ -0,0 +1,117 @@
#!/bin/bash

# Copyright 2015 Guoguo Chen
# 2017 Hainan Xu
# Apache 2.0

# This script rescores lattices with a Kaldi RNNLM adapted by a unigram
# cache model.

# Begin configuration section.
cmd=run.pl
skip_scoring=false
max_ngram_order=4
N=10
weight=1.0 # Interpolation weight for RNNLM.
normalize=false
# End configuration section.

echo "$0 $@" # Print the command line for logging

. ./utils/parse_options.sh

if [ $# != 7 ]; then
  echo "$0: expected 7 arguments but got $#."
  echo "Does language model rescoring of lattices (remove old LM, add new LM)"
  echo "with Kaldi RNNLM adapted by a unigram cache model."
  echo ""
  echo "Usage: $0 [options] <old-lang-dir> <rnnlm-dir> \\"
  echo "          <data-dir> <input-decode-dir> <output-decode-dir> \\"
  echo "          <utt2spk-file> <background-unigram-file>"
  echo " e.g.: $0 data/lang_tg exp/rnnlm_lstm_1c/ data/eval2000_hires \\"
  echo "          exp/chain/tdnn_lstm_1e/decode_eval2000_tg \\"
  echo "          exp/chain/tdnn_lstm_1e/decode_eval2000_tg_rnnlm \\"
  echo "          data/eval2000/utt2spk \\"
  echo "          data/rnnlm_cache_adapt/eval2000/train.unigram"
  echo "options: [--cmd (run.pl|queue.pl [queue opts])]"
  exit 1;
fi

[ -f path.sh ] && . ./path.sh;

oldlang=$1
rnnlm_dir=$2
data=$3
indir=$4
outdir=$5
utt2convo=$6
unigram=$7

oldlm=$oldlang/G.fst
if [ -f $oldlang/G.carpa ]; then
  oldlm=$oldlang/G.carpa
elif [ ! -f $oldlm ]; then
  echo "$0: expecting either $oldlang/G.fst or $oldlang/G.carpa to exist" && \
    exit 1;
fi

[ ! -f $oldlm ] && echo "$0: Missing file $oldlm" && exit 1;
[ ! -f $rnnlm_dir/final.raw ] && echo "$0: Missing file $rnnlm_dir/final.raw" && exit 1;
[ ! -f $rnnlm_dir/feat_embedding.final.mat ] && [ ! -f $rnnlm_dir/word_embedding.final.mat ] && echo "$0: Missing word embedding file" && exit 1;

[ ! -f $oldlang/words.txt ] && \
  echo "$0: Missing file $oldlang/words.txt" && exit 1;
! ls $indir/lat.*.gz >/dev/null && \
  echo "$0: No lattices in input directory $indir" && exit 1;
awk -v n=$0 -v w=$weight 'BEGIN {if (w < 0 || w > 1) {
  print n": Interpolation weight should be in the range of [0, 1]"; exit 1;}}' \
  || exit 1;

oldlm_command="fstproject --project_output=true $oldlm |"

special_symbol_opts=$(cat $rnnlm_dir/special_symbol_opts.txt)

word_embedding=
if [ -f $rnnlm_dir/word_embedding.final.mat ]; then
  word_embedding=$rnnlm_dir/word_embedding.final.mat
else
  word_embedding="'rnnlm-get-word-embedding $rnnlm_dir/word_feats.txt $rnnlm_dir/feat_embedding.final.mat -|'"
fi
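# If only feat_embedding.final.mat exists, the full word embedding matrix is
# produced on the fly by rnnlm-get-word-embedding from the word features in
# word_feats.txt; otherwise the precomputed word_embedding.final.mat is used
# directly.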

normalize_opt=
if $normalize; then
  normalize_opt="--normalize-probs=true"
fi

mkdir -p $outdir/log
nj=$(cat $indir/num_jobs) || exit 1;
cp $indir/num_jobs $outdir

oldlm_weight=$(perl -e "print -1.0 * $weight;")
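# Rescoring with a scale of -$weight first removes that fraction of the old
# LM score from the lattice; adding the RNNLM score with scale $weight then
# yields a linear interpolation: (1 - weight) * old LM + weight * RNNLM.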
if [ "$oldlm" == "$oldlang/G.fst" ]; then
$cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \
lattice-lmrescore --lm-scale=$oldlm_weight \
"ark:gunzip -c $indir/lat.JOB.gz|" "$oldlm_command" ark:- \| lattice-determinize ark:- "ark,t:|gzip -c>$outdir/lat.tmp.JOB.gz" '&&' \
lattice-lmrescore-kaldi-rnnlm-adaptation --lm-scale=$weight $special_symbol_opts \
--max-ngram-order=$max_ngram_order $normalize_opt \
$word_embedding "$rnnlm_dir/final.raw" "ark,t:gunzip -c $outdir/lat.tmp.JOB.gz|" \
"ark,t:|gzip -c>$outdir/lat.JOB.gz" $utt2convo $unigram || exit 1;
rm $outdir/lat.tmp.*.gz
else
$cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \
lattice-lmrescore-const-arpa --lm-scale=$oldlm_weight \
"ark:gunzip -c $indir/lat.JOB.gz|" "$oldlm" ark:- \| \
lattice-lmrescore-kaldi-rnnlm-adaptation --lm-scale=$weight $special_symbol_opts \
--max-ngram-order=$max_ngram_order $normalize_opt \
$word_embedding "$rnnlm_dir/final.raw" ark:- \
"ark,t:|gzip -c>$outdir/lat.JOB.gz" $utt2convo $unigram || exit 1;
fi

if ! $skip_scoring ; then
  err_msg="Not scoring because local/score.sh does not exist or is not executable."
  [ ! -x local/score.sh ] && echo "$err_msg" && exit 1;
  local/score.sh --cmd "$cmd" $data $oldlang $outdir
else
  echo "Not scoring because it was not requested."
fi

exit 0;