WIP: cued-rnnlm n-best and lattice rescoring #971

Open · wants to merge 17 commits into base: master
36 changes: 27 additions & 9 deletions egs/ami/s5/cmd.sh
@@ -26,13 +26,31 @@ export cuda_cmd="queue.pl --gpu 1 --mem 20G"
#export highmem_cmd="queue.pl -P inf_hcrc_cstr_nst -l h_rt=05:00:00 -pe memory-2G 4"
#export scoring_cmd="queue.pl -P inf_hcrc_cstr_nst -l h_rt=00:20:00"

if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then
# BUT cluster:
queue="all.q@@blade,all.q@@speech"
gpu_queue="long.q@@gpu"
storage="matylda5"
export train_cmd="queue.pl -q $queue -l ram_free=1.5G,mem_free=1.5G,${storage}=1"
export decode_cmd="queue.pl -q $queue -l ram_free=2.5G,mem_free=2.5G,${storage}=0.5"
export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1"
fi
#<<<<<<< HEAD
# JSALT2015 workshop, cluster AWS-EC2, (setup from Vijay)
export train_cmd="queue.pl -l arch=*64* --mem 1G"
export decode_cmd="queue.pl -l arch=*64* --mem 2G"
export highmem_cmd="queue.pl -l arch=*64* --mem 4G"
export scoring_cmd="queue.pl -l arch=*64*"
export cuda_cmd="queue.pl --gpu 1 -l mem_free=20G,ram_free=20G"
export cuda_mem_cmd="queue.pl --gpu 1 -l mem_free=42G,ram_free=42G"
export cntk_decode_cmd="queue.pl -l arch=*64* --mem 1G -pe smp 2"

# To run locally, use:
#export train_cmd=run.pl
#export decode_cmd=run.pl
#export highmem_cmd=run.pl
#export cuda_cmd=run.pl

#=======
#>>>>>>> 6c7c0170812a1f7dfb5c09c078787e79ee72333a
#if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then
# # BUT cluster:
# queue="all.q@@blade,all.q@@speech"
# gpu_queue="long.q@@gpu"
# storage="matylda5"
# export train_cmd="queue.pl -q $queue -l ram_free=1.5G,mem_free=1.5G,${storage}=1"
# export decode_cmd="queue.pl -q $queue -l ram_free=2.5G,mem_free=2.5G,${storage}=0.5"
# export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1"
#fi

13 changes: 10 additions & 3 deletions egs/ami/s5/local/nnet3/run_ivector_common.sh
@@ -9,6 +9,7 @@ num_threads_ubm=32
nj=10
use_ihm_ali=false
use_sat_alignments=true
nj=30

. cmd.sh
. ./path.sh
@@ -109,9 +110,15 @@ if [ $stage -le 7 ]; then
fi


if [ $stage -le 8 ]; then
rm -f exp/$mic/nnet3/.error 2>/dev/null
ivectordir=exp/$mic/nnet3/ivectors_train_sp_hires
#<<<<<<< HEAD
if [ $stage -le 6 ]; then
if [ -f exp/$mic/nnet3/.error ]; then rm exp/$mic/nnet3/.error 2>/dev/null; fi
ivectordir=exp/$mic/nnet3/ivectors_${train_set}_hires
#=======
#if [ $stage -le 8 ]; then
# rm -f exp/$mic/nnet3/.error 2>/dev/null
# ivectordir=exp/$mic/nnet3/ivectors_train_sp_hires
#>>>>>>> 6c7c0170812a1f7dfb5c09c078787e79ee72333a
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $ivectordir/storage ]; then
utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/ami-$mic-$(date +'%m_%d_%H_%M')/s5/$ivectordir/storage $ivectordir/storage
fi
39 changes: 39 additions & 0 deletions egs/ami/s5/local/run_cued_rnnlm.sh
@@ -0,0 +1,39 @@
#!/bin/bash

mic=sdm1
crit=vr
n=50
ngram_order=4
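# (A note on the defaults above: mic selects the AMI recording condition, n is the size of
#  the n-best lists, and ngram_order caps the word history used in lattice rescoring.
#  crit is the CUED-RNNLM training criterion; 'vr' presumably stands for variance
#  regularisation, with 'ce' (cross entropy) as the usual alternative.)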

. ./utils/parse_options.sh
. ./cmd.sh
. ./path.sh

set -e

local/train_cued_rnnlms.sh --crit $crit --train-text data/$mic/train/text data/$mic/cued_rnn_$crit

final_lm=ami_fsh.o3g.kn
LM=$final_lm.pr1-7

for decode_set in dev eval; do
dir=exp/$mic/nnet3/tdnn_sp/
decode_dir=${dir}/decode_${decode_set}

# N-best rescoring
steps/rnnlmrescore.sh \
--rnnlm-ver cuedrnnlm \
--N $n --cmd "$decode_cmd --mem 16G" --inv-acwt 10 0.5 \
data/lang_$LM data/$mic/cued_rnn_$crit \
data/$mic/$decode_set ${decode_dir} \
${decode_dir}.rnnlm.$crit.cued.$n-best

# Lattice rescoring
steps/lmrescore_rnnlm_lat.sh \
--cmd "$decode_cmd --mem 16G" \
--rnnlm-ver cuedrnnlm --weight 0.5 --max-ngram-order $ngram_order \
data/lang_$LM data/$mic/cued_rnn_$crit \
data/$mic/${decode_set}_hires ${decode_dir} \
${decode_dir}.rnnlm.$crit.cued.lat.${ngram_order}gram
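  # In both calls above, the bare 0.5 (and --weight 0.5) is the RNNLM interpolation weight,
  # i.e. the original n-gram LM scores and the CUED-RNNLM scores are mixed equally.
  # --max-ngram-order limits how much word history is used to distinguish RNNLM states
  # during lattice rescoring, which keeps the expanded lattice to a manageable size.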

done
38 changes: 38 additions & 0 deletions egs/ami/s5/local/run_rnnlm.sh
@@ -0,0 +1,38 @@
#!/bin/bash

mic=sdm1

. ./utils/parse_options.sh
. ./cmd.sh
. ./path.sh

local/train_rnnlms.sh --train-text data/$mic/train/text data/$mic/mik_rnn

final_lm=ami_fsh.o3g.kn
LM=$final_lm.pr1-7

graph_dir=exp/$mic/tri4a/graph_${LM}


for decode_set in eval dev; do
dir=exp/$mic/nnet3/tdnn_sp/
decode_dir=${dir}/decode_${decode_set}

# N-best rescoring with Tomas Mikolov's version.
( steps/rnnlmrescore.sh \
--rnnlm-ver rnnlm-0.3e \
--N 50 --cmd "$decode_cmd --mem 16G" --inv-acwt 10 0.5 \
data/lang_$LM data/$mic/mik_rnn \
data/$mic/$decode_set ${decode_dir} \
${decode_dir}.rnnlm.mik.50-best || exit 1 ) &

# Lattice rescoring with Tomas Mikolov's version.
( steps/lmrescore_rnnlm_lat.sh \
--weight 0.5 --cmd "$decode_cmd --mem 16G" --max-ngram-order 5 \
data/lang_$LM data/$mic/mik_rnn \
data/$mic/$decode_set ${decode_dir} \
${decode_dir}.rnnlm.mik.lat || exit 1;) &
done
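# Both rescoring passes for each decode set are launched as background subshells; the
# 'wait' below blocks until all of them have finished.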

wait

114 changes: 114 additions & 0 deletions egs/ami/s5/local/train_cued_rnnlms.sh
@@ -0,0 +1,114 @@
#!/bin/bash

train_text=
nwords=10000
hidden=200
cachesize=20
crit=ce

rnnlm_ver=cuedrnnlm

bptt=5

#set -v
set -e

. path.sh
. cmd.sh

. utils/parse_options.sh

if [ $# != 1 ]; then
echo "Usage: $0 [options] <dest-dir>"
echo "For options, see top of script file"
exit 1;
fi

dir=$1
srcdir=data/local/dict

mkdir -p $dir

$KALDI_ROOT/tools/extras/check_for_rnnlm.sh "$rnnlm_ver" || exit 1
export PATH=$KALDI_ROOT/tools/$rnnlm_ver:$PATH

cat $srcdir/lexicon.txt | awk '{print $1}' | grep -v -w '!SIL' > $dir/wordlist.all

# Get training data with OOV words (w.r.t. our current vocab) replaced with <unk>.
cat $train_text | awk -v w=$dir/wordlist.all \
'BEGIN{while((getline<w)>0) v[$1]=1;}
{for (i=2;i<=NF;i++) if ($i in v) printf $i" ";else printf "<unk> ";print ""}'|sed 's/ $//g' \
| perl -e ' use List::Util qw(shuffle); @A=<>; print join("", shuffle(@A)); ' \
| gzip -c > $dir/all.gz
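# (The awk loop starts at i=2 because field 1 is the utterance-id, which is dropped here;
#  the Perl shuffle randomises the sentence order so that the head/tail split below gives
#  a random held-out set.)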

echo "Splitting data into train and validation sets."
heldout_sent=10000
gunzip -c $dir/all.gz | head -n $heldout_sent > $dir/valid.in # validation data
gunzip -c $dir/all.gz | tail -n +$heldout_sent > $dir/train.in # training data


# The rest will consist of a word-class represented by <RNN_UNK>, that
# maps (with probabilities) to a whole class of words.

# Get unigram counts from our training data, and use this to select word-list
# for RNNLM training; e.g. 10k most frequent words. Rest will go in a class
# that we (manually, at the shell level) assign probabilities for words that
# are in that class. Note: this word-list doesn't need to include </s>; this
# automatically gets added inside the rnnlm program.
# Note: by concatenating with $dir/wordlist.all, we are doing add-one
# smoothing of the counts.

# get rid of this design -
cat $dir/train.in $dir/wordlist.all | grep -v '</s>' | grep -v '<s>' | \
awk '{ for(x=1;x<=NF;x++) count[$x]++; } END{for(w in count){print count[w], w;}}' | \
sort -nr > $dir/unigram.counts

total_nwords=`wc -l $dir/unigram.counts | awk '{print $1}'`

#head -$nwords_input $dir/unigram.counts | awk '{print $2}' | tee $dir/wordlist.rnn.input | awk '{print NR-1, $1}' > $dir/wordlist.rnn.id.input
#head -$nwords_output $dir/unigram.counts | awk '{print $2}' | tee $dir/wordlist.rnn.output | awk '{print NR-1, $1}' > $dir/wordlist.rnn.id.output
head -$nwords $dir/unigram.counts | awk '{print $2}' | tee $dir/wordlist.rnn | awk '{print NR-1, $1}' > $dir/wordlist.rnn.id
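# wordlist.rnn holds the $nwords most frequent words; wordlist.rnn.id pairs each of them
# with a 0-based integer id and is passed below as both the input and the output
# shortlist (-inputwlist/-outputwlist) of the CUED-RNNLM.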

tail -n +$nwords $dir/unigram.counts > $dir/unk_class.counts

for type in train valid; do
mv $dir/$type.in $dir/$type
done

# Now randomize the order of the training data.
cat $dir/train | awk -v rand_seed=$rand_seed 'BEGIN{srand(rand_seed);} {printf("%f\t%s\n", rand(), $0);}' | \
sort | cut -f 2 > $dir/foo
mv $dir/foo $dir/train

# OK we'll train the RNNLM on this data.

echo "Training CUED-RNNLM on GPU"

layer_str=$[$nwords+2]:$hidden:$[$nwords+2]
bptt_delay=0

echo $layer_str > $dir/layer_string
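# layer_str describes the network topology as input:hidden:output layer sizes; the +2 on
# the input/output sizes presumably reserves nodes for the sentence-boundary and
# out-of-shortlist tokens. -fullvocsize passes the total vocabulary size, presumably so
# that out-of-shortlist probability mass can be spread over words outside the shortlist.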
$cuda_mem_cmd $dir/rnnlm.log \
steps/train_cued_rnnlm.sh -train -trainfile $dir/train \
-validfile $dir/valid -minibatch 64 -layers $layer_str \
-bptt $bptt -bptt-delay $bptt_delay -traincrit $crit -lrtune newbob \
-inputwlist $dir/wordlist.rnn.id -outputwlist $dir/wordlist.rnn.id \
-independent 1 -learnrate 1.0 \
-fullvocsize $total_nwords \
-writemodel $dir/rnnlm -randseed 1 -debug 2

touch $dir/unk.probs # dummy file, not used for cued-rnnlm

# make it like a Kaldi table format, with fake utterance-ids.
cat $dir/valid | awk '{ printf("uttid-%d ", NR); print; }' > $dir/valid.with_ids

utils/rnnlm_compute_scores.sh --rnnlm_ver $rnnlm_ver $dir $dir/tmp.valid $dir/valid.with_ids $dir/valid.scores

nw=`cat $dir/valid.with_ids | sed 's= =\n=g' | wc -l | awk '{print $1}'`
# Note: valid.with_ids includes one utterance-id per sentence, which accounts for the
# </s> at the end of each sentence, so this is the correct count to normalize by.
# We avoid "wc -w" here because it does not handle non-ASCII characters well.
p=`awk -v nw=$nw '{x=x+$2} END{print exp(x/nw);}' <$dir/valid.scores`
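# (Here perplexity = exp(total score / nw), which assumes the per-sentence values in
#  valid.scores are costs, i.e. negated log-probabilities in nats.)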
echo Perplexity is $p | tee $dir/perplexity.log

120 changes: 120 additions & 0 deletions egs/ami/s5/local/train_rnnlms.sh
@@ -0,0 +1,120 @@
#!/bin/bash

train_text=
nwords=10000
hidden=200

rand_seed=0
cmd=run.pl
class=120 # Num-classes... should be somewhat larger than sqrt of nwords.
direct=10 # Number of weights that are used for "direct" connections, in millions.
rnnlm_ver=rnnlm-0.3e # version of RNNLM to use
threads=1 # for RNNLM-HS
bptt=3 # length of BPTT unfolding in RNNLM
bptt_block=20 # length of BPTT unfolding in RNNLM
dict_suffix=
# End configuration section.

. path.sh
. cmd.sh

. utils/parse_options.sh

if [ $# != 1 ]; then
echo "Usage: $0 [options] <dest-dir>"
echo "For options, see top of script file"
exit 1;
fi

dir=$1
srcdir=data/local/dict

mkdir -p $dir

$KALDI_ROOT/tools/extras/check_for_rnnlm.sh "$rnnlm_ver" || exit 1
export PATH=$KALDI_ROOT/tools/$rnnlm_ver:$PATH

cat $srcdir/lexicon.txt | awk '{print $1}' | grep -v -w '!SIL' > $dir/wordlist.all

# Get training data with OOV words (w.r.t. our current vocab) replaced with <UNK>.
cat $train_text | awk -v w=$dir/wordlist.all \
'BEGIN{while((getline<w)>0) v[$1]=1;}
{for (i=2;i<=NF;i++) if ($i in v) printf $i" ";else printf "<UNK> ";print ""}'|sed 's/ $//g' \
| gzip -c > $dir/all.gz

echo "Splitting data into train and validation sets."
heldout_sent=10000
gunzip -c $dir/all.gz | head -n $heldout_sent > $dir/valid.in # validation data
gunzip -c $dir/all.gz | tail -n +$heldout_sent | \
perl -e ' use List::Util qw(shuffle); @A=<>; print join("", shuffle(@A)); ' \
> $dir/train.in # training data


# The rest will consist of a word-class represented by <RNN_UNK>, that
# maps (with probabilities) to a whole class of words.

# Get unigram counts from our training data, and use this to select word-list
# for RNNLM training; e.g. 10k most frequent words. Rest will go in a class
# that we (manually, at the shell level) assign probabilities for words that
# are in that class. Note: this word-list doesn't need to include </s>; this
# automatically gets added inside the rnnlm program.
# Note: by concatenating with $dir/wordlist.all, we are doing add-one
# smoothing of the counts.

cat $dir/train.in $dir/wordlist.all | grep -v '</s>' | grep -v '<s>' | \
awk '{ for(x=1;x<=NF;x++) count[$x]++; } END{for(w in count){print count[w], w;}}' | \
sort -nr > $dir/unigram.counts

head -$nwords $dir/unigram.counts | awk '{print $2}' | tee $dir/wordlist.rnn | awk '{print NR-1, $1}' > $dir/wordlist.rnn.id

tail -n +$nwords $dir/unigram.counts > $dir/unk_class.counts

tot=`awk '{x=x+$1} END{print x}' $dir/unk_class.counts`
awk -v tot=$tot '{print $2, ($1*1.0/tot);}' <$dir/unk_class.counts >$dir/unk.probs
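# unk.probs gives each out-of-shortlist word its relative frequency within the <RNN_UNK>
# class; these per-word probabilities are used when the RNNLM assigns mass to that class.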


for type in train valid; do
cat $dir/$type.in | awk -v w=$dir/wordlist.rnn \
'BEGIN{while((getline<w)>0) v[$1]=1;}
{for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf "<RNN_UNK> ";print ""}'|sed 's/ $//g' \
> $dir/$type
done
rm $dir/train.in # no longer needed-- and big.

# Now randomize the order of the training data.
cat $dir/train | awk -v rand_seed=$rand_seed 'BEGIN{srand(rand_seed);} {printf("%f\t%s\n", rand(), $0);}' | \
sort | cut -f 2 > $dir/foo
mv $dir/foo $dir/train

# OK we'll train the RNNLM on this data.

# todo: change 100 to 320.
# using 100 classes as square root of 10k.
echo "Training RNNLM (note: this uses a lot of memory! Run it on a big machine.)"
#time rnnlm -train $dir/train -valid $dir/valid -rnnlm $dir/100.rnnlm \
# -hidden 100 -rand-seed 1 -debug 2 -class 100 -bptt 2 -bptt-block 20 \
# -direct-order 4 -direct 1000 -binary >& $dir/rnnlm1.log &

$cmd $dir/rnnlm.log \
$KALDI_ROOT/tools/$rnnlm_ver/rnnlm -threads $threads -independent -train $dir/train -valid $dir/valid \
-rnnlm $dir/rnnlm -hidden $hidden -rand-seed 1 -debug 2 -class $class -bptt $bptt -bptt-block $bptt_block \
-direct-order 4 -direct $direct -binary
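# Options above: -class factorises the output layer into $class frequency-based classes,
# -direct/-direct-order add hash-based maximum-entropy n-gram features (size in millions
# of weights, as noted in the config section), and -independent resets the hidden state at
# sentence boundaries so each sentence is scored independently.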

# make it like a Kaldi table format, with fake utterance-ids.
cat $dir/valid.in | awk '{ printf("uttid-%d ", NR); print; }' > $dir/valid.with_ids

utils/rnnlm_compute_scores.sh --rnnlm_ver $rnnlm_ver $dir $dir/tmp.valid $dir/valid.with_ids \
$dir/valid.scores
nw=`wc -w < $dir/valid.with_ids` # Note: valid.with_ids includes one utterance-id per sentence,
# which accounts for the </s> at the end of each sentence, so this is the correct count to
# normalize by.
p=`awk -v nw=$nw '{x=x+$2} END{print exp(x/nw);}' <$dir/valid.scores`
echo Perplexity is $p | tee $dir/perplexity.log

rm $dir/train $dir/all.gz

# This is a better setup, but takes a long time to train:
#echo "Training RNNLM (note: this uses a lot of memory! Run it on a big machine.)"
#time rnnlm -train $dir/train -valid $dir/valid -rnnlm $dir/320.rnnlm \
# -hidden 320 -rand-seed 1 -debug 2 -class 300 -bptt 2 -bptt-block 20 \
# -direct-order 4 -direct 2000 -binary
3 changes: 2 additions & 1 deletion egs/wsj/s5/local/wsj_train_rnnlms.sh
@@ -18,7 +18,8 @@ nwords=10000 # This is how many words we're putting in the vocab of the RNNLM.
hidden=30
class=200 # Num-classes... should be somewhat larger than sqrt of nwords.
direct=1000 # Number of weights that are used for "direct" connections, in millions.
rnnlm_ver=rnnlm-0.3e # version of RNNLM to use
rnnlm_ver=cuedrnnlm # version of RNNLM to use
rnnlm_ver=rnnlm-0.3e
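# (The second assignment takes effect, so rnnlm-0.3e remains the default; setting
#  rnnlm_ver=cuedrnnlm instead selects the CUED toolkit.)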
threads=1 # for RNNLM-HS
bptt=2 # length of BPTT unfolding in RNNLM
bptt_block=20 # length of BPTT unfolding in RNNLM