rnnlm adaptation by the cache model on swbd #2659

Open · wants to merge 6 commits into master
58 changes: 58 additions & 0 deletions egs/swbd/s5c/get_ppl.sh
@@ -0,0 +1,58 @@

#!/bin/bash

. ./path.sh
. ./cmd.sh

# variables for lattice rescoring
run_rescore=false
ac_model_dir=exp/chain/tdnn_lstm_1e_sp
decode_dir_suffix=rnnlm_adaptation_dan_formula_max_2
ngram_order=4 # approximate the lattice-rescoring by limiting the max-ngram-order
              # if it's set, it merges histories in the lattice if they share
              # the same ngram history, and this prevents the lattice from
              # exploding exponentially.
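# (Illustrative: with ngram_order=4, two lattice paths whose last three
# words agree are treated as having the same RNNLM history state, so the
# lattice does not expand into one state per distinct full prefix.)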

src_dir=exp/chain/tdnn_lstm_1e_sp/decode_eval2000_fsh_sw1_fg
stage=2

. parse_options.sh

if [ $stage -le 1 ]; then
  # cat data/train/text | cut -d " " -f2- | sed "s= =\n=g" | grep . | sort | uniq -c | awk '{print $2, $1}' > train.unigram
  # cat data/train/text | cut -d " " -f2- | sym2int.pl --map-oov "<unk>" data/lang/words.txt | sed "s= =\n=g" | grep . | sort | uniq -c | awk '{print $2, $1}' | sort -k1n > train.unigram
  # cat data/train/text | sym2int.pl -f 2- --map-oov "<unk>" data/lang/words.txt > train.txt
  cat data/eval2000/text | cut -d " " -f2- | tr A-Z a-z > test.raw
  cat data/eval2000/text | cut -d " " -f1 > test.head
  paste test.head test.raw | sym2int.pl -f 2- --map-oov "<unk>" data/lang/words.txt > test.txt
  # lattice-arc-post --acoustic-scale=0.1 "ark:gunzip -c $src_dir/lat.*.gz|" post.txt
  # cat post.txt | sed 's=_= =g' | awk '{print $1"_"$2,$6,$7}' | awk '{a[$1][$3]+=$2}END{for(i in a) for(j in a[i]) print i, j, a[i][j]}' > maps
fi

dir=exp/rnnlm_lstm_1c

word_embedding="rnnlm-get-word-embedding $dir/word_feats.txt $dir/feat_embedding.final.mat -|"

rnnlm-nbest-probs-adjust $(cat $dir/special_symbol_opts.txt) $dir/final.raw "$word_embedding" test.txt data/eval2000/utt2spk train.unigram

# Stop here unless lattice rescoring was requested.
$run_rescore || exit 0

if [ $stage -le 2 ]; then
  echo "Perform lattice-rescoring on $ac_model_dir"
  LM=fsh_sw1_tg
  for decode_set in eval2000; do
    decode_dir=${ac_model_dir}/decode_${decode_set}_$LM

    # Lattice rescoring
    rnnlm/lmrescore_rnnlm_lat_adapt.sh \
      --cmd "$decode_cmd --mem 4G -l hostname='[bc]*'" \
      --weight 0.5 --max-ngram-order $ngram_order \
      data/lang_$LM $dir \
      data/${decode_set}_hires ${decode_dir} \
      ${decode_dir}_${decode_dir_suffix} data/eval2000/utt2spk train.unigram
  done
fi
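
# Example invocation (a sketch: assumes train.unigram already exists, e.g.
# via the commented-out commands in stage 1, and that the first-pass
# lattices are in --src-dir):
#   ./get_ppl.sh --stage 1 \
#     --src-dir exp/chain/tdnn_lstm_1e_sp/decode_eval2000_fsh_sw1_fg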
6 changes: 2 additions & 4 deletions egs/swbd/s5c/local/score_sclite.sh
@@ -45,7 +45,6 @@ if [ -f $dir/../frame_subsampling_factor ]; then
fi

name=`basename $data`; # e.g. eval2000

mkdir -p $dir/scoring/log

if [ $stage -le 0 ]; then
@@ -92,7 +91,7 @@ if [ $stage -le 2 ]; then
done
fi

# For eval2000 score the subsets
# For eval2000 and rt03 score the subsets
case "$name" in
eval2000*)
# Score only the swbd part...
@@ -114,8 +113,7 @@ case "$name" in
done
fi
;;
rt03* )

rt03*)
# Score only the swbd part...
if [ $stage -le 3 ]; then
for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
127 changes: 127 additions & 0 deletions egs/swbd/s5c/rnnlm_adapt_by_cache.sh
@@ -0,0 +1,127 @@
#!/bin/bash

. ./cmd.sh
. ./path.sh


cmd=queue.pl
stage=0
ngram_order=4 # approximate the lattice-rescoring by limiting the max-ngram-order
              # if it's set, it merges histories in the lattice if they share
              # the same ngram history, and this prevents the lattice from
              # exploding exponentially.
ppl_from_transcription=false
ppl_from_lattice=true
run_rescore=true
two_spks_mode=true
one_best_mode=false
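# two_spks_mode: when true, the cache unigram is presumably estimated from
#   both sides of a conversation rather than the current speaker only.
# one_best_mode: when true, cache counts are presumably collected from the
#   1-best hypothesis rather than from lattice posteriors.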

lm_weight=0.8
correction_weight=0.75

. ./utils/parse_options.sh

wordlist=data/lang/words.txt
ac_model_dir=exp/chain/tdnn_lstm_1e_sp
rnnlm_dir=exp/rnnlm_lstm_1c
text_dir=data/rnnlm_cache_adapt
mkdir -p $text_dir

if [ $stage -le 0 ]; then
  for dataset in eval2000 rt03; do
    data_dir=$text_dir/$dataset
    mkdir -p $data_dir
    cat data/train/text | cut -d " " -f2- > $data_dir/train.txt
    cat $data_dir/train.txt $wordlist | sym2int.pl --map-oov "<unk>" $wordlist | \
      sed "s= =\n=g" | grep . | sort | uniq -c | sort -k1nr | awk '{print $2, $1}' > $data_dir/train.unigram
    cat data/$dataset/text | cut -d " " -f2- | tr A-Z a-z > $data_dir/$dataset.raw
    cat data/$dataset/text | cut -d " " -f1 > $data_dir/$dataset.head
    paste $data_dir/$dataset.head $data_dir/$dataset.raw | \
      sym2int.pl -f 2- --map-oov "<unk>" $wordlist > $data_dir/$dataset.txt
  done
fi
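# At this point $data_dir/train.unigram holds "<word-id> <count>" pairs in
# descending count order, and $data_dir/$dataset.txt holds the test
# transcripts with words mapped to integer ids (OOVs to <unk>).  Feeding
# $wordlist into the counts alongside train.txt gives every vocabulary entry
# at least one count, so the background unigram has full coverage.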

# Compute perplexity with the RNNLM adapted by a unigram cache model
# estimated from the transcription.
if [ $stage -le 1 ] && $ppl_from_transcription; then
  word_embedding="rnnlm-get-word-embedding $rnnlm_dir/word_feats.txt $rnnlm_dir/feat_embedding.final.mat -|"
  for dataset in eval2000 rt03; do
    data_dir=$text_dir/$dataset
    echo "Compute PPL from the adjusted RNNLM on $dataset..."
    rnnlm-nbest-probs-adjust --correction-weight=$correction_weight \
      --two-speaker-mode=$two_spks_mode \
      $(cat $rnnlm_dir/special_symbol_opts.txt) \
      $rnnlm_dir/final.raw "$word_embedding" $data_dir/$dataset.txt \
      data/$dataset/utt2spk $data_dir/train.unigram
  done
fi

# Compute perplexity with the RNNLM adapted by a unigram cache model
# estimated from first-pass decoding lattices.
if [ $stage -le 2 ] && $ppl_from_lattice; then
  word_embedding="rnnlm-get-word-embedding $rnnlm_dir/word_feats.txt $rnnlm_dir/feat_embedding.final.mat -|"
  LM=fsh_sw1_fg
  for dataset in eval2000 rt03; do
    data_dir=$text_dir/$dataset
    decode_dir=${ac_model_dir}/decode_${dataset}_$LM
    ppl_name=ppl_cache
    log_dir=$data_dir/$ppl_name/log/cw$correction_weight
    mkdir -p $log_dir
    nj=`cat $decode_dir/num_jobs` || exit 1;

    echo "Compute PPL from the adjusted RNNLM by lattice posteriors on $dataset..."
    $cmd JOB=1:$nj $log_dir/perplexity.JOB.log \
      rnnlm-nbest-probs-adjust-lattice --correction-weight=$correction_weight \
      --lm-scale=$lm_weight \
      --two-speaker-mode=$two_spks_mode \
      --one-best-mode=$one_best_mode \
      $(cat $rnnlm_dir/special_symbol_opts.txt) \
      $rnnlm_dir/final.raw "$word_embedding" $data_dir/$dataset.txt \
      data/$dataset/utt2spk $data_dir/train.unigram \
      "ark:gunzip -c $decode_dir/lat.JOB.gz|"

    # Accumulate the total log-probability over all jobs, then turn it into
    # a perplexity.  Note: wc -w on $dataset.txt also counts the utterance id
    # of each line, which stands in for the per-sentence eos
    # (end-of-sentence) symbol.
    [ -f $log_dir/ppls.log ] && rm $log_dir/ppls.log
    word_count=`cat $data_dir/$dataset.txt | wc -w`
    for i in `seq 1 $nj`; do
      grep 'Log' $log_dir/perplexity.$i.log | \
        awk '{n += $NF}; END{print n}' >> $log_dir/ppls.log
    done
    awk '{n += $1}; END{print n}' $log_dir/ppls.log > $log_dir/ppls_sum.log
    logprobs=`cat $log_dir/ppls_sum.log`
    echo "scale=3;$logprobs/$word_count" | bc > $log_dir/entropy.log
    ppl=`awk '{printf("%.1f",exp(-$1))}' $log_dir/entropy.log`
    echo "PPL by lattice posteriors on $dataset is $ppl" > $log_dir/ppl
    echo "PPL by lattice posteriors on $dataset is $ppl"
  done
fi
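# A worked instance of the arithmetic above (illustrative numbers only):
# if the summed log-probability over a test set counted as 50000 words is
# -200000, then entropy.log holds -200000/50000 = -4.000, and the reported
# perplexity is exp(4.0) ≈ 54.6.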

if [ $stage -le 3 ] && $run_rescore; then
  LM=fsh_sw1_fg
  decode_out_dir=exp/chain/cache
  mkdir -p $decode_out_dir
  for decode_set in eval2000 rt03; do
    echo "Perform pruned lattice-rescoring on $ac_model_dir on dataset $decode_set"
    decode_dir=${ac_model_dir}/decode_${decode_set}_$LM
    decode_out=$decode_out_dir/decode_${decode_set}_${LM}_lmw${lm_weight}_cw${correction_weight}_pruned
    mkdir -p $decode_out
    cp $decode_dir/../final.mdl $decode_out_dir/

    rnnlm/lmrescore_rnnlm_lat_pruned_cache_adapt.sh \
      --cmd "$decode_cmd --mem 4G -l hostname='[bc]*'" \
      --weight $lm_weight \
      --correction-weight $correction_weight \
      --max-ngram-order $ngram_order \
      --two-speaker-mode $two_spks_mode \
      --one-best-mode $one_best_mode \
      data/lang_$LM $rnnlm_dir \
      data/${decode_set}_hires ${decode_dir} \
      $decode_out data/${decode_set}/utt2spk \
      $text_dir/$decode_set/train.unigram
  done
fi
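
# Example sweep over the cache correction weight (a sketch; option names map
# to the variables above via utils/parse_options.sh):
#   for cw in 0.25 0.5 0.75; do
#     ./rnnlm_adapt_by_cache.sh --stage 2 --correction-weight $cw \
#       --lm-weight 0.8 --run-rescore false
#   done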
117 changes: 117 additions & 0 deletions scripts/rnnlm/lmrescore_rnnlm_lat_adapt.sh
@@ -0,0 +1,117 @@
#!/bin/bash

# Copyright 2015 Guoguo Chen
# 2017 Hainan Xu
# Apache 2.0

# This script rescores lattices with a Kaldi RNNLM adapted by a unigram
# cache model.

# Begin configuration section.
cmd=run.pl
skip_scoring=false
max_ngram_order=4
N=10
weight=1.0 # Interpolation weight for RNNLM.
normalize=false
# End configuration section.

echo "$0 $@" # Print the command line for logging

. ./utils/parse_options.sh

if [ $# != 7 ]; then
  echo "$0: expected 7 arguments but got $#."
  echo "Does language model rescoring of lattices (remove old LM, add new LM)"
  echo "with Kaldi RNNLM adapted by a unigram cache model."
  echo ""
  echo "Usage: $0 [options] <old-lang-dir> <rnnlm-dir> \\"
  echo "          <data-dir> <input-decode-dir> <output-decode-dir> \\"
  echo "          <utt2spk-file> <background-unigram-file>"
  echo " e.g.: $0 data/lang_tg exp/rnnlm_lstm_1c/ data/eval2000_hires \\"
  echo "          exp/chain/tdnn_lstm_1e/decode_eval2000_tg \\"
  echo "          exp/chain/tdnn_lstm_1e/decode_eval2000_tg_rnnlm \\"
  echo "          data/eval2000/utt2spk \\"
  echo "          data/rnnlm_cache_adapt/eval2000/train.unigram"
  echo "options: [--cmd (run.pl|queue.pl [queue opts])]"
  exit 1;
fi

[ -f path.sh ] && . ./path.sh;

oldlang=$1
rnnlm_dir=$2
data=$3
indir=$4
outdir=$5
utt2convo=$6
unigram=$7

oldlm=$oldlang/G.fst
if [ -f $oldlang/G.carpa ]; then
  oldlm=$oldlang/G.carpa
elif [ ! -f $oldlm ]; then
  echo "$0: expecting either $oldlang/G.fst or $oldlang/G.carpa to exist" && \
    exit 1;
fi

[ ! -f $oldlm ] && echo "$0: Missing file $oldlm" && exit 1;
[ ! -f $rnnlm_dir/final.raw ] && echo "$0: Missing file $rnnlm_dir/final.raw" && exit 1;
[ ! -f $rnnlm_dir/feat_embedding.final.mat ] && [ ! -f $rnnlm_dir/word_embedding.final.mat ] && echo "$0: Missing word embedding file" && exit 1;

[ ! -f $oldlang/words.txt ] && \
  echo "$0: Missing file $oldlang/words.txt" && exit 1;
! ls $indir/lat.*.gz >/dev/null && \
  echo "$0: No lattices in input directory $indir" && exit 1;
awk -v n=$0 -v w=$weight 'BEGIN {if (w < 0 || w > 1) {
  print n": Interpolation weight should be in the range of [0, 1]"; exit 1;}}' \
  || exit 1;

oldlm_command="fstproject --project_output=true $oldlm |"

special_symbol_opts=$(cat $rnnlm_dir/special_symbol_opts.txt)

word_embedding=
if [ -f $rnnlm_dir/word_embedding.final.mat ]; then
  word_embedding=$rnnlm_dir/word_embedding.final.mat
else
  word_embedding="'rnnlm-get-word-embedding $rnnlm_dir/word_feats.txt $rnnlm_dir/feat_embedding.final.mat -|'"
fi
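# If only feat_embedding.final.mat exists, the full word embedding matrix is
# produced on the fly by rnnlm-get-word-embedding from the word features in
# word_feats.txt; otherwise the precomputed word_embedding.final.mat is used
# directly.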

normalize_opt=
if $normalize; then
  normalize_opt="--normalize-probs=true"
fi

mkdir -p $outdir/log
nj=$(cat $indir/num_jobs) || exit 1;
cp $indir/num_jobs $outdir

oldlm_weight=$(perl -e "print -1.0 * $weight;")
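# Rescoring with a scale of -$weight first removes that fraction of the old
# LM score from the lattice; adding the RNNLM score with scale $weight then
# yields a linear interpolation: (1 - weight) * old LM + weight * RNNLM.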
if [ "$oldlm" == "$oldlang/G.fst" ]; then
$cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \
lattice-lmrescore --lm-scale=$oldlm_weight \
"ark:gunzip -c $indir/lat.JOB.gz|" "$oldlm_command" ark:- \| lattice-determinize ark:- "ark,t:|gzip -c>$outdir/lat.tmp.JOB.gz" '&&' \
lattice-lmrescore-kaldi-rnnlm-adaptation --lm-scale=$weight $special_symbol_opts \
--max-ngram-order=$max_ngram_order $normalize_opt \
$word_embedding "$rnnlm_dir/final.raw" "ark,t:gunzip -c $outdir/lat.tmp.JOB.gz|" \
"ark,t:|gzip -c>$outdir/lat.JOB.gz" $utt2convo $unigram || exit 1;
rm $outdir/lat.tmp.*.gz
else
$cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \
lattice-lmrescore-const-arpa --lm-scale=$oldlm_weight \
"ark:gunzip -c $indir/lat.JOB.gz|" "$oldlm" ark:- \| \
lattice-lmrescore-kaldi-rnnlm-adaptation --lm-scale=$weight $special_symbol_opts \
--max-ngram-order=$max_ngram_order $normalize_opt \
$word_embedding "$rnnlm_dir/final.raw" ark:- \
"ark,t:|gzip -c>$outdir/lat.JOB.gz" $utt2convo $unigram || exit 1;
fi

if ! $skip_scoring ; then
  err_msg="Not scoring because local/score.sh does not exist or is not executable."
  [ ! -x local/score.sh ] && echo "$err_msg" && exit 1;
  local/score.sh --cmd "$cmd" $data $oldlang $outdir
else
  echo "Not scoring because it was not requested."
fi

exit 0;