diff --git a/egs/tedlium/s5_r2/local/chain/run_tdnn_d.sh b/egs/tedlium/s5_r2/local/chain/run_tdnn_d.sh deleted file mode 100755 index 9e795316352..00000000000 --- a/egs/tedlium/s5_r2/local/chain/run_tdnn_d.sh +++ /dev/null @@ -1,198 +0,0 @@ -#!/bin/bash - -# by default, with cleanup: -# local/chain/run_tdnn.sh - -# without cleanup: -# local/chain/run_tdnn.sh --train-set train --gmm tri3 --nnet3-affix "" & - -# note, if you have already run the corresponding non-chain nnet3 system -# (local/nnet3/run_tdnn.sh), you may want to run with --stage 14. - -set -e -o pipefail - -# First the options that are passed through to run_ivector_common.sh -# (some of which are also used in this script directly). -stage=0 -nj=30 -decode_nj=30 -min_seg_len=1.55 -train_set=train_cleaned -gmm=tri3_cleaned # the gmm for the target data -num_threads_ubm=32 -nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned - -# The rest are configs specific to this script. Most of the parameters -# are just hardcoded at this level, in the commands below. -train_stage=-10 -tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. -tdnn_affix=d #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. -common_egs_dir=exp/chain_cleaned/tdnn_sp_bi/egs # you can set this to use previously dumped egs. - -# End configuration section. -echo "$0 $@" # Print the command line for logging - -. cmd.sh -. ./path.sh -. ./utils/parse_options.sh - - -if ! cuda-compiled; then - cat <data/lang_chain/topo - fi -fi - -if [ $stage -le 15 ]; then - # Get the alignments as lattices (gives the chain training more freedom). - # use the same num-jobs as the alignments - steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ - data/lang $gmm_dir $lat_dir - rm $lat_dir/fsts.*.gz # save space -fi - -if [ $stage -le 16 ]; then - # Build a tree using our new topology. We know we have alignments for the - # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use - # those. - if [ -f $tree_dir/final.mdl ]; then - echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." - exit 1; - fi - steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ - --context-opts "--context-width=2 --central-position=1" \ - --leftmost-questions-truncate -1 \ - --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir -fi - -if [ $stage -le 17 ]; then - mkdir -p $dir - - echo "$0: creating neural net configs"; - - steps/nnet3/tdnn/make_configs.py \ - --self-repair-scale-nonlinearity 0.00001 \ - --feat-dir data/${train_set}_sp_hires_comb \ - --ivector-dir $train_ivector_dir \ - --tree-dir $tree_dir \ - --relu-dim 550 \ - --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ - --use-presoftmax-prior-scale false \ - --xent-regularize 0.1 \ - --xent-separate-forward-affine true \ - --include-log-softmax false \ - --final-layer-normalize-target 1.0 \ - $dir/configs || exit 1; -fi - -if [ $stage -le 18 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage - fi - - steps/nnet3/chain/train.py --stage $train_stage \ - --cmd "$decode_cmd" \ - --feat.online-ivector-dir $train_ivector_dir \ - --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ - --chain.xent-regularize 0.1 \ - --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.00005 \ - --chain.apply-deriv-weights false \ - --chain.lm-opts="--num-extra-lm-states=2000" \ - --egs.dir "$common_egs_dir" \ - --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width 150 \ - --trainer.num-chunk-per-minibatch 128 \ - --trainer.frames-per-iter 1500000 \ - --trainer.num-epochs 4 \ - --trainer.optimization.num-jobs-initial 2 \ - --trainer.optimization.num-jobs-final 12 \ - --trainer.optimization.initial-effective-lrate 0.001 \ - --trainer.optimization.final-effective-lrate 0.0001 \ - --trainer.max-param-change 2.0 \ - --cleanup.remove-egs true \ - --feat-dir $train_data_dir \ - --tree-dir $tree_dir \ - --lat-dir $lat_dir \ - --dir $dir -fi - - - -if [ $stage -le 19 ]; then - # Note: it might appear that this data/lang_chain directory is mismatched, and it is as - # far as the 'topo' is concerned, but this script doesn't read the 'topo' from - # the lang directory. - utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang $dir $dir/graph -fi - -if [ $stage -le 20 ]; then - rm $dir/.error 2>/dev/null || true - for dset in dev test; do - ( - steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ - --acwt 1.0 --post-decode-acwt 10.0 \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ - --scoring-opts "--min-lmwt 5 " \ - $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ - data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 - ) || touch $dir/.error & - done - wait - if [ -f $dir/.error ]; then - echo "$0: something went wrong in decoding" - exit 1 - fi -fi -exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e.sh index 6704f9d299e..e56946c1b54 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e.sh @@ -259,14 +259,14 @@ fi if [ $stage -le 18 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage fi steps/nnet3/chain/train.py --stage $train_stage \ --cmd "$decode_cmd" \ --feat.online-ivector-dir $train_ivector_dir \ --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ - --chain.xent-regularize 0.1 \ + --chain.xent-regularize $xent_regularize \ --chain.leaky-hmm-coefficient 0.1 \ --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ diff --git a/egs/tedlium/s5_r2/local/nnet3/compare_wer.sh b/egs/tedlium/s5_r2/local/nnet3/compare_wer.sh index 3e14a4efc55..da0bb728e69 100755 --- a/egs/tedlium/s5_r2/local/nnet3/compare_wer.sh +++ b/egs/tedlium/s5_r2/local/nnet3/compare_wer.sh @@ -1,12 +1,20 @@ #!/bin/bash # this script is used for comparing decoding results between systems. -# e.g. 
local/nnet3/compare_wer_general.sh exp/nnet3_cleaned/tdnn_{c,d}_sp +# e.g. local/nnet3/compare_wer.sh exp/nnet3_cleaned/tdnn_{c,d}_sp # For use with discriminatively trained systems you specify the epochs after a colon: # for instance, # local/nnet3/compare_wer.sh exp/nnet3_cleaned/tdnn_c_sp exp/nnet3_cleaned/tdnn_c_sp_smbr:{1,2,3} +if [ $# == 0 ]; then + echo "Usage: $0: [--looped] [--online] [ ... ]" + echo "e.g.: $0 exp/nnet3_cleaned/tdnn_{b,c}_sp" + echo "or (with epoch numbers for discriminative training):" + echo "$0 exp/nnet3_cleaned/tdnn_b_sp_disc:{1,2,3}" + exit 1 +fi + echo "# $0 $*" include_looped=false @@ -14,6 +22,11 @@ if [ "$1" == "--looped" ]; then include_looped=true shift fi +include_online=false +if [ "$1" == "--online" ]; then + include_online=true + shift +fi @@ -71,6 +84,16 @@ for n in 0 1 2 3; do done echo fi + if $include_online; then + echo -n "# [online:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(dev${epoch_infix} dev${epoch_infix}_rescore test${epoch_infix} test${epoch_infix}_rescore) + wer=$(grep Sum ${dirname}_online/decode_${decode_names[$n]}/score*/*ys | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi done diff --git a/egs/tedlium/s5_r2/local/nnet3/run_ivector_common.sh b/egs/tedlium/s5_r2/local/nnet3/run_ivector_common.sh index b4f2dd3e3b4..16093616b05 100755 --- a/egs/tedlium/s5_r2/local/nnet3/run_ivector_common.sh +++ b/egs/tedlium/s5_r2/local/nnet3/run_ivector_common.sh @@ -21,9 +21,9 @@ num_threads_ubm=32 nnet3_affix=_cleaned # affix for exp/nnet3 directory to put iVector stuff in, so it # becomes exp/nnet3_cleaned or whatever. -. cmd.sh +. ./cmd.sh . ./path.sh -. ./utils/parse_options.sh +. utils/parse_options.sh gmm_dir=exp/${gmm} diff --git a/egs/tedlium/s5_r2/local/nnet3/run_tdnn.sh b/egs/tedlium/s5_r2/local/nnet3/run_tdnn.sh new file mode 120000 index 00000000000..61f8f499182 --- /dev/null +++ b/egs/tedlium/s5_r2/local/nnet3/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1b.sh \ No newline at end of file diff --git a/egs/tedlium/s5_r2/local/nnet3/run_tdnn_lstm_lfr.sh b/egs/tedlium/s5_r2/local/nnet3/run_tdnn_lstm_lfr.sh new file mode 120000 index 00000000000..8e03c924bc1 --- /dev/null +++ b/egs/tedlium/s5_r2/local/nnet3/run_tdnn_lstm_lfr.sh @@ -0,0 +1 @@ +tuning/run_tdnn_lstm_lfr_1a.sh \ No newline at end of file diff --git a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_1b.sh b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_1b.sh index 379c8040a27..f6e4fb71b75 100755 --- a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_1b.sh +++ b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_1b.sh @@ -1,5 +1,8 @@ #!/bin/bash + +# 1b is as 1a but uses xconfigs. + # This is the standard "tdnn" system, built in nnet3; this script # is the version that's meant to run with data-cleanup, that doesn't # support parallel alignments. diff --git a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_1c.sh b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_1c.sh new file mode 100755 index 00000000000..35789342ffb --- /dev/null +++ b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_1c.sh @@ -0,0 +1,186 @@ +#!/bin/bash + +# 1c is as 1b but using more 'chain-like' splicing and slightly +# smaller dim. Not better; maybe slightly worse. + +# note: the num-params is almost the same. 
+# steps/info/nnet3_dir_info.pl exp/nnet3_cleaned/tdnn1{b,c}_sp +# exp/nnet3_cleaned/tdnn1b_sp: num-iters=240 nj=2..12 num-params=10.3M dim=40+100->4187 combine=-0.95->-0.95 loglike:train/valid[159,239,combined]=(-1.01,-0.95,-0.94/-1.18,-1.16,-1.15) accuracy:train/valid[159,239,combined]=(0.71,0.72,0.72/0.67,0.68,0.68) +# exp/nnet3_cleaned/tdnn1c_sp: num-iters=240 nj=2..12 num-params=10.1M dim=40+100->4187 combine=-1.16->-1.15 loglike:train/valid[159,239,combined]=(-1.22,-1.16,-1.15/-1.41,-1.38,-1.38) accuracy:train/valid[159,239,combined]=(0.66,0.67,0.68/0.62,0.63,0.63) + +# local/nnet3/compare_wer.sh exp/nnet3_cleaned/tdnn1{b,c}_sp +# System tdnn1b_sp tdnn1c_sp +# WER on dev(orig) 11.7 11.9 +# WER on dev(rescored) 10.9 11.1 +# WER on test(orig) 11.7 11.8 +# WER on test(rescored) 11.0 11.2 +# Final train prob -0.9416 -1.1505 +# Final valid prob -1.1496 -1.3805 +# Final train acc 0.7241 0.6756 +# Final valid acc 0.6788 0.6255 + +# This is the standard "tdnn" system, built in nnet3; this script +# is the version that's meant to run with data-cleanup, that doesn't +# support parallel alignments. + + +# steps/info/nnet3_dir_info.pl exp/nnet3_cleaned/tdnn1b_sp +# exp/nnet3_cleaned/tdnn1b_sp: num-iters=240 nj=2..12 num-params=10.3M dim=40+100->4187 combine=-0.95->-0.95 loglike:train/valid[159,239,combined]=(-1.01,-0.95,-0.94/-1.18,-1.16,-1.15) accuracy:train/valid[159,239,combined]=(0.71,0.72,0.72/0.67,0.68,0.68) + +# local/nnet3/compare_wer.sh exp/nnet3_cleaned/tdnn1a_sp exp/nnet3_cleaned/tdnn1b_sp +# System tdnn1a_sp tdnn1b_sp +# WER on dev(orig) 11.9 11.7 +# WER on dev(rescored) 11.2 10.9 +# WER on test(orig) 11.6 11.7 +# WER on test(rescored) 11.0 11.0 +# Final train prob -0.9255 -0.9416 +# Final valid prob -1.1842 -1.1496 +# Final train acc 0.7245 0.7241 +# Final valid acc 0.6771 0.6788 + + +# by default, with cleanup: +# local/nnet3/run_tdnn.sh + +# without cleanup: +# local/nnet3/run_tdnn.sh --train-set train --gmm tri3 --nnet3-affix "" & + + +set -e -o pipefail -u + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +train_set=train_cleaned +gmm=tri3_cleaned # this is the source gmm-dir for the data-type of interest; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for exp dirs, e.g. _cleaned +tdnn_affix=1c #affix for TDNN directory e.g. "a" or "b", in case we change the configuration. + +# Options which are not passed through to run_ivector_common.sh +train_stage=-10 +remove_egs=true +srand=0 +reporting_email=dpovey@gmail.com +# set common_egs_dir to use previously dumped egs. +common_egs_dir= + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=750 + relu-renorm-layer name=tdnn2 dim=750 input=Append(-1,0,1) + relu-renorm-layer name=tdnn3 dim=750 input=Append(-1,0,1) + relu-renorm-layer name=tdnn4 dim=750 input=Append(-3,0,3) + relu-renorm-layer name=tdnn5 dim=750 input=Append(-6,-3,0) + output-layer name=output dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=3 \ + --trainer.samples-per-iter=400000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=12 \ + --trainer.optimization.initial-effective-lrate=0.0015 \ + --trainer.optimization.final-effective-lrate=0.00015 \ + --trainer.optimization.minibatch-size=256,128 \ + --egs.dir="$common_egs_dir" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --feat-dir=$train_data_dir \ + --ali-dir=$ali_dir \ + --lang=data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # note: for TDNNs, looped decoding gives exactly the same results + # as regular decoding, so there is no point in testing it separately. + # We use regular decoding because it supports multi-threaded (we just + # didn't create the binary for that, for looped decoding, so far). + rm $dir/.error || true 2>/dev/null + for dset in dev test; do + ( + steps/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + ${graph_dir} data/${dset}_hires ${dir}/decode_${dset} || exit 1 + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + + +exit 0; diff --git a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lfr_1a.sh b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lfr_1a.sh new file mode 100755 index 00000000000..666c2f1bb31 --- /dev/null +++ b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lfr_1a.sh @@ -0,0 +1,200 @@ +#!/bin/bash + + +# run_tdnn_lfr_1a.sh is similar in configuration to run_tdnn_1c.sh, but it's a +# low-frame-rate system (see egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1c.sh +# for an example of such a system). 
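# A rough sketch of what "low-frame-rate" means in practice in this script
# (assuming the factor of 3 used below): the network emits outputs at 1/3 of
# the input frame rate, which is why later stages write
# "echo 3 >$dir/frame_subsampling_factor", build the graph with
# --self-loop-scale 0.333, and decode with --acwt 0.333 --post-decode-acwt 3.0
# so that the lattice scores stay on the usual scale.  A quick way to check
# whether an existing model directory is an LFR one (the path below is only
# an example):
#
#   cat exp/nnet3_cleaned/tdnn_lfr1a_sp/frame_subsampling_factor   # expect "3"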
+ + +# by default, with cleanup: +# local/nnet3/run_tdnn_lfr.sh + +# without cleanup: +# local/nnet3/run_tdnn_lfr.sh --train-set train --gmm tri3 --nnet3-affix "" & + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +train_set=train_cleaned +gmm=tri3_cleaned # this is the source gmm-dir for the data-type of interest; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for exp dirs, e.g. _cleaned +tdnn_affix=1a #affix for TDNN directory e.g. "a" or "b", in case we change the configuration. + +# Options which are not passed through to run_ivector_common.sh +train_stage=-10 +remove_egs=true +srand=0 +reporting_email=dpovey@gmail.com +# set common_egs_dir to use previously dumped egs. +common_egs_dir= + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 13 ]; then + # Build a tree using our new topology and a reduced sampling rate. + # We use 4000 leaves, which is a little less than the number used + # in the baseline GMM system (5k) in this setup, since generally + # LFR systems do best with somewhat fewer leaves. + # + # To get the stats to build the tree this script only uses every third frame, + # but it dumps converted alignments that essentially have 3 different + # frame-shifted versions of the alignment interpolated together; these can be + # used without modification in getting labels for training. + steps/nnet3/chain/build_tree.sh \ + --repeat-frames true --frame-subsampling-factor 3 \ + --cmd "$train_cmd" 4000 data/${train_set}_sp_comb \ + $lang $ali_dir $treedir +fi + +if [ $stage -le 14 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=750 + relu-renorm-layer name=tdnn2 dim=750 input=Append(-1,0,1) + relu-renorm-layer name=tdnn3 dim=750 input=Append(-1,0,1) + relu-renorm-layer name=tdnn4 dim=750 input=Append(-3,0,3) + relu-renorm-layer name=tdnn5 dim=750 input=Append(-6,-3,0) + output-layer name=output dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + + +if [ $stage -le 15 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=3 \ + --trainer.samples-per-iter=400000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=12 \ + --trainer.optimization.initial-effective-lrate=0.0015 \ + --trainer.optimization.final-effective-lrate=0.00015 \ + --trainer.optimization.minibatch-size=256,128 \ + --egs.dir="$common_egs_dir" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --feat-dir=$train_data_dir \ + --ali-dir=$treedir \ + --lang=$lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; + echo 3 >$dir/frame_subsampling_factor +fi + +if [ $stage -le 16 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/lang/check_phones_compatible.sh data/lang/phones.txt $lang/phones.txt + utils/mkgraph.sh --self-loop-scale 0.333 data/lang $dir $dir/graph +fi + + +if [ $stage -le 17 ]; then + # note: for TDNNs, looped decoding gives exactly the same results + # as regular decoding, so there is no point in testing it separately. + # We use regular decoding because it supports multi-threaded (we just + # didn't create the binary for that, for looped decoding, so far). 
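  # The loop below decodes the dev and test sets in parallel subshells; a
  # failure in either one creates $dir/.error, which is checked after 'wait'.
  # The same pattern in isolation looks roughly like this ('run_one_decode'
  # is just a placeholder for the actual decoding command):
  #
  #   rm -f ./.error
  #   for dset in dev test; do
  #     ( run_one_decode $dset || exit 1 ) || touch ./.error &
  #   done
  #   wait
  #   if [ -f ./.error ]; then echo "$0: decoding failed"; exit 1; fi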
+ rm $dir/.error || true 2>/dev/null + for dset in dev test; do + ( + steps/nnet3/decode.sh --acwt 0.333 --post-decode-acwt 3.0 --nj $decode_nj \ + --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + $dir/graph data/${dset}_hires ${dir}/decode_${dset} || exit 1 + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + + +exit 0; diff --git a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1a.sh b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1a.sh index f1502dd2761..28c45836cf7 100755 --- a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1a.sh +++ b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1a.sh @@ -9,15 +9,16 @@ # System tdnn_lstm1a_sp tdnn_lstm1b_sp # WER on dev(orig) 11.0 11.0 # [looped:] 11.0 11.1 -# WER on dev(rescored) 10.3 10.3 +# WER on dev(rescored) 10.4 10.3 # [looped:] 10.3 10.5 -# WER on test(orig) 10.8 10.6 +# WER on test(orig) 10.7 10.6 # [looped:] 10.7 10.7 # WER on test(rescored) 10.1 9.9 # [looped:] 10.0 10.0 -# Final train prob -0.68810.7954-0.68970.7946 -# Final valid prob -0.77960.7611-0.79890.7582 - +# Final train prob -0.6881 -0.6897 +# Final valid prob -0.7796 -0.7989 +# Final train acc 0.7954 0.7946 +# Final valid acc 0.7611 0.7582 # by default, with cleanup: # local/nnet3/run_tdnn_lstm.sh @@ -53,19 +54,11 @@ label_delay=5 chunk_width=40,30,20 chunk_left_context=40 chunk_right_context=0 -# decode chunk-size options (for non-looped decoding) -extra_left_context=50 -extra_right_context=0 # training options srand=0 remove_egs=true -#decode options -extra_left_context= -extra_right_context= -frames_per_chunk= - . ./cmd.sh . ./path.sh . 
./utils/parse_options.sh @@ -91,8 +84,7 @@ local/nnet3/run_ivector_common.sh --stage $stage \ gmm_dir=exp/${gmm} graph_dir=$gmm_dir/graph ali_dir=exp/${gmm}_ali_${train_set}_sp_comb -dir=exp/nnet3${nnet3_affix}/tdnn_lstm${affix} -dir=${dir}_sp +dir=exp/nnet3${nnet3_affix}/tdnn_lstm${affix}_sp train_data_dir=data/${train_set}_sp_hires_comb train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires_comb @@ -175,15 +167,14 @@ if [ $stage -le 13 ]; then fi if [ $stage -le 14 ]; then - [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; - [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; - [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) rm $dir/.error 2>/dev/null || true for dset in dev test; do ( steps/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ - --extra-left-context $extra_left_context \ - --extra-right-context $extra_right_context \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --frames-per-chunk $frames_per_chunk \ --extra-left-context-initial 0 --extra-right-context-final 0 \ --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ ${graph_dir} data/${dset}_hires ${dir}/decode_${dset} || exit 1 diff --git a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1c.sh b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1c.sh index 1d3b12f2697..bc9a717419d 100755 --- a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1c.sh +++ b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1c.sh @@ -8,6 +8,8 @@ # local/chain/tuning/run_tdnn_lstm_1e.sh, but a non-chain nnet3 system, and # with 1.5 times larger hidden dimensions. +# exp/nnet3_cleaned/tdnn_lstm1c_sp: num-iters=246 nj=3..15 num-params=18.7M dim=40+100->4187 combine=-0.67->-0.66 loglike:train/valid[163,245,combined]=(-0.71,-0.63,-0.60/-0.92,-0.88,-0.85) accuracy:train/valid[163,245,combined]=(0.77,0.79,0.80/0.74,0.75,0.75) + # local/nnet3/compare_wer.sh --looped exp/nnet3_cleaned/tdnn_lstm1a_sp exp/nnet3_cleaned/tdnn_lstm1b_sp exp/nnet3_cleaned/tdnn_lstm1c_sp # System tdnn_lstm1a_sp tdnn_lstm1b_sp tdnn_lstm1c_sp # WER on dev(orig) 11.0 11.0 11.0 diff --git a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_lfr_1a.sh b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_lfr_1a.sh new file mode 100755 index 00000000000..3e8509bf4ac --- /dev/null +++ b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_lfr_1a.sh @@ -0,0 +1,310 @@ +#!/bin/bash + + +# run_tdnn_lstm_lfr_1a.sh is like run_tdnn_lstm_1a.sh, but +# it's a low-frame-rate system. (however, using num-jobs-final=10, +# not 15, which was very high). + + +# Generally the WER is the same or slightly better than before. 
+ +# local/nnet3/compare_wer.sh --looped --online exp/nnet3_cleaned/tdnn_lstm1c_sp exp/nnet3_cleaned/tdnn_lstm_lfr1a_sp 2>/dev/null +# local/nnet3/compare_wer.sh --looped --online exp/nnet3_cleaned/tdnn_lstm1c_sp exp/nnet3_cleaned/tdnn_lstm_lfr1a_sp +# System tdnn_lstm1c_sp tdnn_lstm_lfr1a_sp +# WER on dev(orig) 11.0 10.9 +# [looped:] 10.9 10.9 +# [online:] 10.8 +# WER on dev(rescored) 10.4 10.3 +# [looped:] 10.3 10.3 +# [online:] 10.3 +# WER on test(orig) 10.8 10.7 +# [looped:] 10.7 10.7 +# [online:] 10.7 +# WER on test(rescored) 10.1 10.2 +# [looped:] 10.1 10.1 +# [online:] 10.2 +# Final train prob -0.5998 -0.5437 +# Final valid prob -0.8542 -0.7286 +# Final train acc 0.7988 0.8343 +# Final valid acc 0.7521 0.7888 + + +# by default, with cleanup: +# local/nnet3/run_tdnn_lstm_lfr.sh + +# without cleanup: +# local/nnet3/run_tdnn_lstm_lfr.sh --train-set train --gmm tri3 --nnet3-affix "" & + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +train_set=train_cleaned +gmm=tri3_cleaned # this is the source gmm-dir for the data-type of interest; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for exp dirs, e.g. _cleaned + +# Options which are not passed through to run_ivector_common.sh +affix=1a +common_egs_dir= +reporting_email= + +# LSTM options +train_stage=-10 +label_delay=5 + +# training chunk-options +chunk_width=40,30,20 +chunk_left_context=40 +chunk_right_context=0 +# decode chunk-size options (for non-looped decoding) +extra_left_context=50 +extra_right_context=0 + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 13 ]; then + # Build a tree using our new topology and a reduced sampling rate. + # We use 4000 leaves, which is a little less than the number used + # in the baseline GMM system (5k) in this setup, since generally + # LFR systems do best with somewhat fewer leaves. + # + # To get the stats to build the tree this script only uses every third frame, + # but it dumps converted alignments that essentially have 3 different + # frame-shifted versions of the alignment interpolated together; these can be + # used without modification in getting labels for training. 
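  # Put differently (a sketch, assuming the factor of 3 above): the tree
  # statistics are accumulated from every third frame, but the converted
  # ali.*.gz files written to the tree directory still carry one label per
  # original frame, which is why train_rnn.py below can be given
  # --ali-dir=$treedir without any extra conversion step.  One way to eyeball
  # a converted alignment (job number 1 is just an example):
  #
  #   copy-int-vector "ark:gunzip -c $treedir/ali.1.gz |" ark,t:- | head -n 1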
+ steps/nnet3/chain/build_tree.sh \ + --repeat-frames true --frame-subsampling-factor 3 \ + --cmd "$train_cmd" 4000 data/${train_set}_sp_comb \ + $lang $ali_dir $treedir +fi + + +if [ $stage -le 14 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=768 + relu-renorm-layer name=tdnn2 dim=768 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=768 recurrent-projection-dim=192 non-recurrent-projection-dim=192 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn3 dim=768 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=768 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=768 recurrent-projection-dim=192 non-recurrent-projection-dim=192 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn5 dim=768 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=768 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=768 recurrent-projection-dim=192 non-recurrent-projection-dim=192 decay-time=20 delay=-3 + + output-layer name=output input=lstm3 output-delay=$label_delay dim=$num_targets max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 15 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_rnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=6 \ + --trainer.deriv-truncate-margin=10 \ + --trainer.samples-per-iter=20000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=10 \ + --trainer.optimization.initial-effective-lrate=0.0003 \ + --trainer.optimization.final-effective-lrate=0.00003 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.rnn.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.5 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --feat-dir=$train_data_dir \ + --ali-dir=$treedir \ + --lang=$lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; + echo 3 >$dir/frame_subsampling_factor +fi + +if [ $stage -le 16 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). 
So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/lang/check_phones_compatible.sh data/lang/phones.txt $lang/phones.txt + utils/mkgraph.sh --self-loop-scale 0.333 data/lang $dir $dir/graph +fi + +if [ $stage -le 17 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh \ + --acwt 0.333 --post-decode-acwt 3.0 --nj $decode_nj \ + --cmd "$decode_cmd" --num-threads 4 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --frames-per-chunk $frames_per_chunk \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + $dir/graph data/${dset}_hires ${dir}/decode_${dset} || exit 1 + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + + +if [ $stage -le 18 ]; then + # 'looped' decoding. + # note: you should NOT do this decoding step for setups that have bidirectional + # recurrence, like BLSTMs-- it doesn't make sense and will give bad results. + # we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh \ + --acwt 0.333 --post-decode-acwt 3.0 \ + --nj $decode_nj --cmd "$decode_cmd" \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + +if $test_online_decoding && [ $stage -le 19 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3${nnet3_affix}/extractor ${dir} ${dir}_online + + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + # note: we just give it "$dset" as it only uses the wav.scp, the + # feature type does not matter. 
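      # The WERs written to ${dir}_online/decode_${dset} by the commands below
      # are what the new --online option of local/nnet3/compare_wer.sh (added
      # elsewhere in this patch) picks up; the comparison quoted at the top of
      # this script, for example, comes from:
      #
      #   local/nnet3/compare_wer.sh --looped --online exp/nnet3_cleaned/tdnn_lstm1c_sp exp/nnet3_cleaned/tdnn_lstm_lfr1a_sp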
+ + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 0.333 --post-decode-acwt 3.0 \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset} ${dir}_online/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}_online/decode_${dset} ${dir}_online/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + + +exit 0; diff --git a/egs/wsj/s5/RESULTS b/egs/wsj/s5/RESULTS index acff4f9d7fe..e6732d21074 100644 --- a/egs/wsj/s5/RESULTS +++ b/egs/wsj/s5/RESULTS @@ -1,8 +1,15 @@ #!/bin/bash -# this RESULTS file was obtained by Haihua Xu in July 2013. - -for x in exp/*/decode*; do [ -d $x ] && [[ $x =~ "$1" ]] && grep WER $x/wer_* | utils/best_wer.sh; done +# this RESULTS file was obtained by Dan Povey in Feb 2017, after +# a rewrite of the run.sh file. +# To see results from the scripts local/nnet3/ and local/chain/, +# look at the top of those files, we don't put those in the +# RESULTS file. + +for dir in exp/*; do + steps/info/gmm_dir_info.pl $dir + for x in $dir/decode*dev93* $dir/decode*eval92*; do [ -d $x ] && [[ $x =~ "$1" ]] && grep WER $x/wer_* | utils/best_wer.sh; done +done exit 0 # Use caution when comparing these results with other published results. @@ -13,107 +20,76 @@ exit 0 # in which we only test on utterances that are in either a 5k or 20k subset # of the vocabulary. -# The following results are updated with LDA+MLLT to use 7, not 9 frames of context, -# and also increased the learning rate for the "indirect" fMMI. - # monophone, deltas, trained on the 2k shortest utterances from the si84 data. -%WER 35.39 [ 2914 / 8234, 284 ins, 467 del, 2163 sub ] exp/mono0a/decode_tgpr_dev93/wer_10 -%WER 25.78 [ 1455 / 5643, 142 ins, 184 del, 1129 sub ] exp/mono0a/decode_tgpr_eval92/wer_9 +exp/mono0a: nj=10 align prob=-95.82 over 2.36h [retry=0.4%, fail=0.0%] states=132 gauss=973 +%WER 34.33 [ 2827 / 8234, 266 ins, 457 del, 2104 sub ] exp/mono0a/decode_nosp_tgpr_dev93/wer_10_0.0 +%WER 25.13 [ 1418 / 5643, 138 ins, 192 del, 1088 sub ] exp/mono0a/decode_nosp_tgpr_eval92/wer_10_0.0 + + # first triphone build. Built on half of SI-84. -%WER 20.00 [ 1647 / 8234, 257 ins, 197 del, 1193 sub ] exp/tri1/decode_tgpr_dev93/wer_17 -%WER 13.04 [ 736 / 5643, 137 ins, 61 del, 538 sub ] exp/tri1/decode_tgpr_eval92/wer_14 +exp/tri1: nj=10 align prob=-93.75 over 7.38h [retry=0.4%, fail=0.0%] states=1567 gauss=10025 tree-impr=5.06 +%WER 19.40 [ 1597 / 8234, 247 ins, 199 del, 1151 sub ] exp/tri1/decode_nosp_tgpr_dev93/wer_14_0.5 +%WER 12.76 [ 720 / 5643, 110 ins, 89 del, 521 sub ] exp/tri1/decode_nosp_tgpr_eval92/wer_14_1.0 -# the same, rescored with full trigram model [not pruned.] Note: the tg{1,2,3,4} are +# the above, rescored with full trigram model [not pruned.] Note: the tg{1,2,3,4} are # different rescoring methods. They all give about the same results. Note: 3 and 4 give # the "correct" LM scores. -%WER 18.87 [ 1554 / 8234, 295 ins, 136 del, 1123 sub ] exp/tri1/decode_tgpr_dev93_tg1/wer_14 -%WER 18.87 [ 1554 / 8234, 295 ins, 136 del, 1123 sub ] exp/tri1/decode_tgpr_dev93_tg2/wer_14 -%WER 18.75 [ 1544 / 8234, 266 ins, 152 del, 1126 sub ] exp/tri1/decode_tgpr_dev93_tg3/wer_15 -%WER 18.76 [ 1545 / 8234, 266 ins, 152 del, 1127 sub ] exp/tri1/decode_tgpr_dev93_tg4/wer_15 - -# tri2a is delta+delta-delta features. 
-%WER 17.93 [ 1476 / 8234, 256 ins, 161 del, 1059 sub ] exp/tri2a/decode_tgpr_dev93/wer_16 -%WER 12.42 [ 701 / 5643, 132 ins, 64 del, 505 sub ] exp/tri2a/decode_tgpr_eval92/wer_15 -# just demonstrates how to do decoding constrained by lattices. -%WER 16.76 [ 1380 / 8234, 275 ins, 132 del, 973 sub ] exp/tri2a/decode_tgpr_dev93_fromlats/wer_16 - -# This is an LDA+MLLT system. -%WER 16.43 [ 1353 / 8234, 241 ins, 162 del, 950 sub ] exp/tri2b/decode_tgpr_dev93/wer_16 -%WER 10.69 [ 603 / 5643, 154 ins, 47 del, 402 sub ] exp/tri2b/decode_tgpr_eval92/wer_14 - -# rescoring the lattices with trigram. -%WER 15.29 [ 1252 / 8191, 219 ins, 153 del, 880 sub ] [PARTIAL] exp/tri2b/decode_tgpr_dev93_tg/wer_18 -# using the "biglm" decoding method to avoid the lattice rescoring step [not faster though.] -%WER 15.31 [ 1261 / 8234, 227 ins, 158 del, 876 sub ] exp/tri2b/decode_tgpr_dev93_tg_biglm/wer_18 -# using a Minimum Bayes Risk decoding method on top of the _tg lattices. -%WER 15.15 [ 1241 / 8191, 221 ins, 155 del, 865 sub ] [PARTIAL] exp/tri2b/decode_tgpr_dev93_tg_mbr/wer_18 - -# fMMI, default learning rate (0.001) - -%WER 15.19 [ 1251 / 8234, 213 ins, 148 del, 890 sub ] exp/tri2b_fmmi_b0.1/decode_tgpr_dev93_it3/wer_15 -%WER 15.14 [ 1247 / 8234, 228 ins, 138 del, 881 sub ] exp/tri2b_fmmi_b0.1/decode_tgpr_dev93_it4/wer_14 -%WER 15.06 [ 1240 / 8234, 211 ins, 152 del, 877 sub ] exp/tri2b_fmmi_b0.1/decode_tgpr_dev93_it5/wer_15 -%WER 15.01 [ 1236 / 8234, 206 ins, 154 del, 876 sub ] exp/tri2b_fmmi_b0.1/decode_tgpr_dev93_it6/wer_15 -%WER 14.99 [ 1234 / 8234, 210 ins, 159 del, 865 sub ] exp/tri2b_fmmi_b0.1/decode_tgpr_dev93_it7/wer_15 -%WER 15.23 [ 1254 / 8234, 200 ins, 184 del, 870 sub ] exp/tri2b_fmmi_b0.1/decode_tgpr_dev93_it8/wer_16 - -%WER 15.55 [ 1280 / 8234, 234 ins, 151 del, 895 sub ] exp/tri2b_fmmi_b0.1_lr0.005/decode_tgpr_dev93_it3/wer_15 -%WER 15.63 [ 1287 / 8234, 242 ins, 150 del, 895 sub ] exp/tri2b_fmmi_b0.1_lr0.005/decode_tgpr_dev93_it4/wer_15 -%WER 15.30 [ 1260 / 8234, 224 ins, 143 del, 893 sub ] exp/tri2b_fmmi_b0.1_lr0.005/decode_tgpr_dev93_it5/wer_15 -%WER 15.34 [ 1263 / 8234, 216 ins, 156 del, 891 sub ] exp/tri2b_fmmi_b0.1_lr0.005/decode_tgpr_dev93_it6/wer_16 -%WER 15.34 [ 1263 / 8234, 242 ins, 139 del, 882 sub ] exp/tri2b_fmmi_b0.1_lr0.005/decode_tgpr_dev93_it7/wer_14 -%WER 15.30 [ 1260 / 8234, 245 ins, 134 del, 881 sub ] exp/tri2b_fmmi_b0.1_lr0.005/decode_tgpr_dev93_it8/wer_13 - -%WER 15.21 [ 1252 / 8234, 218 ins, 148 del, 886 sub ] exp/tri2b_fmmi_indirect_b0.1/decode_tgpr_dev93_it3/wer_15 -%WER 15.16 [ 1248 / 8234, 205 ins, 159 del, 884 sub ] exp/tri2b_fmmi_indirect_b0.1/decode_tgpr_dev93_it4/wer_16 -%WER 15.22 [ 1253 / 8234, 229 ins, 147 del, 877 sub ] exp/tri2b_fmmi_indirect_b0.1/decode_tgpr_dev93_it5/wer_15 -%WER 14.90 [ 1227 / 8234, 203 ins, 150 del, 874 sub ] exp/tri2b_fmmi_indirect_b0.1/decode_tgpr_dev93_it6/wer_15 -%WER 14.95 [ 1231 / 8234, 202 ins, 152 del, 877 sub ] exp/tri2b_fmmi_indirect_b0.1/decode_tgpr_dev93_it7/wer_15 -%WER 15.18 [ 1250 / 8234, 184 ins, 172 del, 894 sub ] exp/tri2b_fmmi_indirect_b0.1/decode_tgpr_dev93_it8/wer_16 - -%WER 15.70 [ 1293 / 8234, 218 ins, 163 del, 912 sub ] exp/tri2b_mmi/decode_tgpr_dev93_it3/wer_16 -%WER 15.61 [ 1285 / 8234, 217 ins, 163 del, 905 sub ] exp/tri2b_mmi/decode_tgpr_dev93_it4/wer_16 -%WER 10.46 [ 590 / 5643, 125 ins, 51 del, 414 sub ] exp/tri2b_mmi/decode_tgpr_eval92_it3/wer_15 -%WER 10.40 [ 587 / 5643, 124 ins, 52 del, 411 sub ] exp/tri2b_mmi/decode_tgpr_eval92_it4/wer_16 - -%WER 15.56 [ 1281 / 8234, 224 ins, 152 del, 905 sub ] 
exp/tri2b_mmi_b0.1/decode_tgpr_dev93_it3/wer_15 -%WER 15.44 [ 1271 / 8234, 220 ins, 165 del, 886 sub ] exp/tri2b_mmi_b0.1/decode_tgpr_dev93_it4/wer_16 -%WER 10.33 [ 583 / 5643, 125 ins, 51 del, 407 sub ] exp/tri2b_mmi_b0.1/decode_tgpr_eval92_it3/wer_15 -%WER 10.33 [ 583 / 5643, 125 ins, 47 del, 411 sub ] exp/tri2b_mmi_b0.1/decode_tgpr_eval92_it4/wer_15 - -%WER 11.43 [ 941 / 8234, 113 ins, 144 del, 684 sub ] exp/tri3b/decode_bd_tgpr_dev93/wer_19 -%WER 16.09 [ 1325 / 8234, 193 ins, 185 del, 947 sub ] exp/tri3b/decode_bd_tgpr_dev93.si/wer_16 -%WER 6.79 [ 383 / 5643, 51 ins, 49 del, 283 sub ] exp/tri3b/decode_bd_tgpr_eval92/wer_18 -%WER 10.61 [ 599 / 5643, 91 ins, 74 del, 434 sub ] exp/tri3b/decode_bd_tgpr_eval92.si/wer_15 -%WER 5.74 [ 324 / 5643, 46 ins, 41 del, 237 sub ] exp/tri3b/decode_bd_tgpr_eval92_fg/wer_19 -%WER 5.90 [ 333 / 5643, 46 ins, 39 del, 248 sub ] exp/tri3b/decode_bd_tgpr_eval92_tg/wer_18 - -%WER 14.17 [ 1167 / 8234, 222 ins, 123 del, 822 sub ] exp/tri3b/decode_tgpr_dev93/wer_17 -%WER 19.37 [ 1595 / 8234, 315 ins, 153 del, 1127 sub ] exp/tri3b/decode_tgpr_dev93.si/wer_15 - -%WER 12.98 [ 1069 / 8234, 209 ins, 116 del, 744 sub ] exp/tri3b/decode_tgpr_dev93_tg/wer_19 -%WER 9.30 [ 525 / 5643, 120 ins, 37 del, 368 sub ] exp/tri3b/decode_tgpr_eval92/wer_18 -%WER 12.95 [ 731 / 5643, 167 ins, 46 del, 518 sub ] exp/tri3b/decode_tgpr_eval92.si/wer_14 -%WER 8.54 [ 482 / 5643, 113 ins, 29 del, 340 sub ] exp/tri3b/decode_tgpr_eval92_tg/wer_17 - -%WER 12.12 [ 998 / 8234, 209 ins, 88 del, 701 sub ] exp/tri4a/decode_tgpr_dev93/wer_17 -%WER 15.98 [ 1316 / 8234, 275 ins, 119 del, 922 sub ] exp/tri4a/decode_tgpr_dev93.si/wer_15 -%WER 7.83 [ 442 / 5643, 107 ins, 23 del, 312 sub ] exp/tri4a/decode_tgpr_eval92/wer_16 -%WER 10.90 [ 615 / 5643, 148 ins, 30 del, 437 sub ] exp/tri4a/decode_tgpr_eval92.si/wer_13 - -%WER 9.15 [ 753 / 8234, 90 ins, 113 del, 550 sub ] exp/tri4b/decode_bd_pp_tgpr_dev93/wer_16 -%WER 12.64 [ 1041 / 8234, 137 ins, 145 del, 759 sub ] exp/tri4b/decode_bd_pp_tgpr_dev93.si/wer_16 -%WER 5.74 [ 324 / 5643, 47 ins, 35 del, 242 sub ] exp/tri4b/decode_bd_pp_tgpr_eval92/wer_19 -%WER 7.92 [ 447 / 5643, 64 ins, 46 del, 337 sub ] exp/tri4b/decode_bd_pp_tgpr_eval92.si/wer_15 -%WER 9.38 [ 772 / 8234, 90 ins, 118 del, 564 sub ] exp/tri4b/decode_bd_tgpr_dev93/wer_18 -%WER 13.07 [ 1076 / 8234, 148 ins, 143 del, 785 sub ] exp/tri4b/decode_bd_tgpr_dev93.si/wer_17 -%WER 6.03 [ 340 / 5643, 66 ins, 26 del, 248 sub ] exp/tri4b/decode_bd_tgpr_eval92/wer_13 -%WER 8.19 [ 462 / 5643, 74 ins, 42 del, 346 sub ] exp/tri4b/decode_bd_tgpr_eval92.si/wer_15 -%WER 12.16 [ 1001 / 8234, 197 ins, 98 del, 706 sub ] exp/tri4b/decode_tgpr_dev93/wer_17 -%WER 15.47 [ 1274 / 8234, 235 ins, 120 del, 919 sub ] exp/tri4b/decode_tgpr_dev93.si/wer_17 -%WER 8.08 [ 456 / 5643, 125 ins, 16 del, 315 sub ] exp/tri4b/decode_tgpr_eval92/wer_13 -%WER 10.49 [ 592 / 5643, 147 ins, 27 del, 418 sub ] exp/tri4b/decode_tgpr_eval92.si/wer_12 +%WER 18.23 [ 1501 / 8234, 245 ins, 181 del, 1075 sub ] exp/tri1/decode_nosp_tgpr_dev93_tg1/wer_15_0.5 +%WER 18.23 [ 1501 / 8234, 245 ins, 181 del, 1075 sub ] exp/tri1/decode_nosp_tgpr_dev93_tg2/wer_15_0.5 +%WER 18.16 [ 1495 / 8234, 268 ins, 153 del, 1074 sub ] exp/tri1/decode_nosp_tgpr_dev93_tg3/wer_16_0.0 +%WER 18.18 [ 1497 / 8234, 268 ins, 154 del, 1075 sub ] exp/tri1/decode_nosp_tgpr_dev93_tg4/wer_16_0.0 + + +# tri2b is an LDA+MLLT system trained on SI-84 +exp/tri2b: nj=10 align prob=-47.22 over 15.10h [retry=0.7%, fail=0.0%] states=2005 gauss=15036 tree-impr=5.45 lda-sum=26.20 
mllt:impr,logdet=1.34,1.97 +%WER 16.37 [ 1348 / 8234, 241 ins, 157 del, 950 sub ] exp/tri2b/decode_nosp_tgpr_dev93/wer_17_0.0 +%WER 10.53 [ 594 / 5643, 110 ins, 60 del, 424 sub ] exp/tri2b/decode_nosp_tgpr_eval92/wer_17_0.5 + + +# tri3b is an LDA+MLLT+SAT system trained on all of SI-284 +exp/tri3b: nj=10 align prob=-44.30 over 81.23h [retry=0.8%, fail=0.1%] states=3362 gauss=40061 fmllr-impr=3.70 over 59.77h tree-impr=7.86 + +%WER 15.56 [ 1281 / 8234, 220 ins, 140 del, 921 sub ] exp/tri3b/decode_nosp_tgpr_dev93.si/wer_17_0.5 +%WER 12.82 [ 1056 / 8234, 135 ins, 147 del, 774 sub ] exp/tri3b/decode_nosp_bd_tgpr_dev93.si/wer_15_0.0 +%WER 9.24 [ 761 / 8234, 89 ins, 109 del, 563 sub ] exp/tri3b/decode_nosp_bd_tgpr_dev93/wer_16_0.0 +%WER 11.53 [ 949 / 8234, 179 ins, 94 del, 676 sub ] exp/tri3b/decode_nosp_tgpr_dev93/wer_15_0.5 +%WER 10.94 [ 901 / 8234, 181 ins, 82 del, 638 sub ] exp/tri3b/decode_nosp_tg_dev93/wer_14_0.5 +%WER 8.16 [ 672 / 8234, 94 ins, 94 del, 484 sub ] exp/tri3b/decode_nosp_bd_tgpr_dev93_fg/wer_17_0.0 + +%WER 10.95 [ 618 / 5643, 148 ins, 36 del, 434 sub ] exp/tri3b/decode_nosp_tgpr_eval92.si/wer_14_0.0 +%WER 8.19 [ 462 / 5643, 77 ins, 51 del, 334 sub ] exp/tri3b/decode_nosp_bd_tgpr_eval92.si/wer_16_0.0 +%WER 5.55 [ 313 / 5643, 35 ins, 45 del, 233 sub ] exp/tri3b/decode_nosp_bd_tgpr_eval92/wer_17_1.0 +%WER 4.89 [ 276 / 5643, 47 ins, 28 del, 201 sub ] exp/tri3b/decode_nosp_bd_tgpr_eval92_fg/wer_15_0.5 +%WER 7.53 [ 425 / 5643, 112 ins, 20 del, 293 sub ] exp/tri3b/decode_nosp_tg_eval92/wer_17_0.0 +%WER 8.15 [ 460 / 5643, 113 ins, 30 del, 317 sub ] exp/tri3b/decode_nosp_tgpr_eval92/wer_14_1.0 + + +# tri4b is an LDA+MLLT+SAT system after estimating pronunciation probabilities +# and word-and-pronunciation-dependent silence probabilities. + +exp/tri4b: nj=10 align prob=-44.46 over 81.23h [retry=0.6%, fail=0.1%] states=3413 gauss=40059 fmllr-impr=0.17 over 60.20h tree-impr=8.70 + +%WER 15.16 [ 1248 / 8234, 253 ins, 96 del, 899 sub ] exp/tri4b/decode_tgpr_dev93.si/wer_17_0.0 +%WER 12.62 [ 1039 / 8234, 141 ins, 124 del, 774 sub ] exp/tri4b/decode_bd_tgpr_dev93.si/wer_17_0.0 +%WER 9.01 [ 742 / 8234, 106 ins, 97 del, 539 sub ] exp/tri4b/decode_bd_tgpr_dev93/wer_16_0.0 +%WER 8.25 [ 679 / 8234, 94 ins, 100 del, 485 sub ] exp/tri4b/decode_bd_tgpr_dev93_fg/wer_17_0.5 +%WER 10.92 [ 899 / 8234, 186 ins, 92 del, 621 sub ] exp/tri4b/decode_tg_dev93/wer_17_0.5 +%WER 11.44 [ 942 / 8234, 203 ins, 87 del, 652 sub ] exp/tri4b/decode_tgpr_dev93/wer_14_0.5 + +%WER 10.93 [ 617 / 5643, 147 ins, 33 del, 437 sub ] exp/tri4b/decode_tgpr_eval92.si/wer_14_1.0 +%WER 8.74 [ 493 / 5643, 104 ins, 34 del, 355 sub ] exp/tri4b/decode_bd_tgpr_eval92.si/wer_15_0.0 +%WER 5.69 [ 321 / 5643, 50 ins, 34 del, 237 sub ] exp/tri4b/decode_bd_tgpr_eval92/wer_17_0.5 +%WER 4.71 [ 266 / 5643, 40 ins, 27 del, 199 sub ] exp/tri4b/decode_bd_tgpr_eval92_fg/wer_17_1.0 +%WER 7.39 [ 417 / 5643, 107 ins, 24 del, 286 sub ] exp/tri4b/decode_tg_eval92/wer_16_1.0 +%WER 7.90 [ 446 / 5643, 111 ins, 27 del, 308 sub ] exp/tri4b/decode_tgpr_eval92/wer_15_1.0 + + +###################################### +## Results below this point were mostly obtained in 2013 by Hainan Xu, +## They are from parts of the script that are now not run by default in the run.sh. +## you can look in the git history to figure out when these results were added. 
+ %WER 7.99 [ 658 / 8234, 72 ins, 95 del, 491 sub ] exp/tri4b_fmmi_a/decode_bd_tgpr_dev93_it8/wer_12 %WER 11.15 [ 918 / 8234, 180 ins, 81 del, 657 sub ] exp/tri4b_fmmi_a/decode_tgpr_dev93_it3/wer_15 %WER 11.23 [ 925 / 8234, 201 ins, 77 del, 647 sub ] exp/tri4b_fmmi_a/decode_tgpr_dev93_it4/wer_12 @@ -166,7 +142,7 @@ exit 0 # not updated -# DNN on fMLLR features (Karel's setup, [7.8.2015]). +# DNN on fMLLR features (Karel's setup, [7.8.2015]). # frame cross-entropy training %WER 6.05 [ 498 / 8234, 59 ins, 67 del, 372 sub ] exp/dnn5b_pretrain-dbn_dnn/decode_bd_tgpr_dev93/wer_11_0.0 %WER 3.69 [ 208 / 5643, 19 ins, 19 del, 170 sub ] exp/dnn5b_pretrain-dbn_dnn/decode_bd_tgpr_eval92/wer_11_1.0 @@ -298,7 +274,7 @@ for x in exp/nnet3/nnet_tdnn_a/decode_*; do grep WER $x/wer_* | utils/best_wer.s # bidirectional LSTM # ----------------------- -# local/nnet3/run_lstm.sh --affix bidirectional \ +# local/nnet3/run_lstm.sh --affix bidirectional \ # --lstm-delay " [-1,1] [-2,2] [-3,3] " \ # --label-delay 0 \ # --cell-dim 640 \ diff --git a/egs/wsj/s5/local/chain/compare_wer.sh b/egs/wsj/s5/local/chain/compare_wer.sh new file mode 100755 index 00000000000..edfefad547f --- /dev/null +++ b/egs/wsj/s5/local/chain/compare_wer.sh @@ -0,0 +1,137 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/tdnn_{c,d}_sp +# For use with discriminatively trained systems you specify the epochs after a colon: +# for instance, +# local/chain/compare_wer.sh exp/chain/tdnn_c_sp exp/chain/tdnn_c_sp_smbr:{1,2,3} + + +if [ $# == 0 ]; then + echo "Usage: $0: [--looped] [--online] [ ... ]" + echo "e.g.: $0 exp/chain/tdnn_{b,c}_sp" + echo "or (with epoch numbers for discriminative training):" + echo "$0 exp/chain/tdnn_b_sp_disc:{1,2,3}" + exit 1 +fi + +echo "# $0 $*" + +include_looped=false +if [ "$1" == "--looped" ]; then + include_looped=true + shift +fi +include_online=false +if [ "$1" == "--online" ]; then + include_online=true + shift +fi + + +used_epochs=false + +# this function set_names is used to separate the epoch-related parts of the name +# [for discriminative training] and the regular parts of the name. 
+# If called with a colon-free directory name, like: +# set_names exp/chain/tdnn_lstm1e_sp_bi_smbr +# it will set dir=exp/chain/tdnn_lstm1e_sp_bi_smbr and epoch_infix="" +# If called with something like: +# set_names exp/chain/tdnn_d_sp_smbr:3 +# it will set dir=exp/chain/tdnn_d_sp_smbr and epoch_infix="_epoch3" + + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) + epoch=$(echo $1 | cut -s -d: -f2) + if [ -z $epoch ]; then + epoch_infix="" + else + used_epochs=true + epoch_infix=_epoch${epoch} + fi +} + + + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +strings=( + "#WER dev93 (tgpr) " + "#WER dev93 (tg) " + "#WER dev93 (big-dict,tgpr) " + "#WER dev93 (big-dict,fg) " + "#WER eval92 (tgpr) " + "#WER eval92 (tg) " + "#WER eval92 (big-dict,tgpr)" + "#WER eval92 (big-dict,fg) ") + +for n in 0 1 2 3 4 5 6 7; do + echo -n "${strings[$n]}" + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(tgpr_dev93 tg_dev93 bd_tgpr_dev93 bd_tgpr_dev93_fg tgpr_eval92 tg_eval92 bd_tgpr_eval92 bd_tgpr_eval92_fg) + + wer=$(cat $dirname/decode_${decode_names[$n]}/scoring_kaldi/best_wer | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat $dirname/decode_looped_${decode_names[$n]}/scoring_kaldi/best_wer | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi + if $include_online; then + echo -n "# [online:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat ${dirname}_online/decode_${decode_names[$n]}/scoring_kaldi/best_wer | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi +done + + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. 
+fi + + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo diff --git a/egs/wsj/s5/local/chain/run_tdnn.sh b/egs/wsj/s5/local/chain/run_tdnn.sh new file mode 120000 index 00000000000..34499362831 --- /dev/null +++ b/egs/wsj/s5/local/chain/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1a.sh \ No newline at end of file diff --git a/egs/wsj/s5/local/chain/run_tdnn_lstm.sh b/egs/wsj/s5/local/chain/run_tdnn_lstm.sh new file mode 120000 index 00000000000..8e647598556 --- /dev/null +++ b/egs/wsj/s5/local/chain/run_tdnn_lstm.sh @@ -0,0 +1 @@ +tuning/run_tdnn_lstm_1a.sh \ No newline at end of file diff --git a/egs/wsj/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/wsj/s5/local/chain/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..d874eb0986a --- /dev/null +++ b/egs/wsj/s5/local/chain/tuning/run_tdnn_1a.sh @@ -0,0 +1,361 @@ +#!/bin/bash + + +# This was modified from run_tdnn_lstm_1a.sh, making similar +# changes as the diff from run_tdnn_lstm_1a.sh->run_tdnn_1c.sh +# in egs/tedlium/s5_r2/local/nnet3/tuning, +# specifically: +# changing chunk_left_context to zero, shrink from 0.99->1 +# (since it's not applicable to ReLUs), and removing +# the deriv-truncate-margin option since it's only applicable +# to recurrent setups; removing label-delay. +# adding pre-final layers (I experimented with this, +# it did seem helpful); using 3M not 1.5M frames per iter to keep the +# time per job reasonable; and fewer final jobs (5 not 10). + + +# steps/info/chain_dir_info.pl exp/chain/tdnn1a_sp +# exp/chain/tdnn1a_sp: num-iters=102 nj=2..5 num-params=7.6M dim=40+100->2889 combine=-0.052->-0.051 xent:train/valid[67,101,final]=(-0.881,-0.824,-0.822/-0.953,-0.922,-0.921) logprob:train/valid[67,101,final]=(-0.048,-0.042,-0.041/-0.064,-0.064,-0.063) + +# The following table compares (nnet3 TDNN, chain TDNN+LSTM, this experiment == chain TDNN). +# This is better than the nnet3 TDNN, but the difference with the chain TDNN+LSTM +# is inconsistent. 
+ +# local/chain/compare_wer.sh --online exp/nnet3/tdnn1a_sp exp/chain/tdnn_lstm1a_sp exp/chain/tdnn1a_sp +# System tdnn1a_sp tdnn_lstm1a_sp tdnn1a_sp +#WER dev93 (tgpr) 9.18 7.48 7.87 +# [online:] 7.49 8.02 +#WER dev93 (tg) 8.59 7.41 7.61 +# [online:] 7.40 7.70 +#WER dev93 (big-dict,tgpr) 6.45 5.64 5.71 +# [online:] 5.70 5.60 +#WER dev93 (big-dict,fg) 5.83 5.40 5.10 +# [online:] 5.19 5.21 +#WER eval92 (tgpr) 6.15 5.67 5.23 +# [online:] 5.60 5.44 +#WER eval92 (tg) 5.55 5.46 4.87 +# [online:] 5.53 4.87 +#WER eval92 (big-dict,tgpr) 3.58 3.69 3.24 +# [online:] 3.63 3.31 +#WER eval92 (big-dict,fg) 2.98 3.28 2.71 +# [online:] 3.31 2.92 +# Final train prob -0.0341 -0.0414 +# Final valid prob -0.0506 -0.0634 +# Final train prob (xent) -0.5643 -0.8216 +# Final valid prob (xent) -0.6648 -0.9208 + + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +train_set=train_si284 +test_sets="test_dev93 test_eval92" +gmm=tri4b # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=1d #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# LSTM/chain options +train_stage=-10 +xent_regularize=0.1 + +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 15 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + relu-renorm-layer name=tdnn3 dim=512 input=Append(-1,0,1) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-6,-3,0) + + ## adding the layers for chain branch + relu-renorm-layer name=prefinal-chain dim=512 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-renorm-layer name=prefinal-xent input=tdnn6 dim=512 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=256,128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 17 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/lang/check_phones_compatible.sh \ + data/lang_test_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgpr \ + $tree_dir $tree_dir/graph_tgpr || exit 1; + + utils/lang/check_phones_compatible.sh \ + data/lang_test_bd_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_bd_tgpr \ + $tree_dir $tree_dir/graph_bd_tgpr || exit 1; +fi + +if [ $stage -le 18 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l 2889 combine=-0.047->-0.045 xent:train/valid[79,119,final]=(-0.684,-0.569,-0.564/-0.742,-0.668,-0.665) logprob:train/valid[79,119,final]=(-0.045,-0.035,-0.034/-0.058,-0.051,-0.051) + +# The following compares: +# (nnet3 TDNN+LSTM, chain TDNN, this experiment == chain TDNN+LSTM) +# system. +# This is consistently better than the nnet3 TDNN+LSTM, but the +# difference with the chain TDNN is inconsistent. 
+ +# local/chain/compare_wer.sh --online exp/nnet3/tdnn_lstm1a_sp exp/chain/tdnn1a_sp exp/chain/tdnn_lstm1a_sp +# System tdnn_lstm1a_sp tdnn1a_sp tdnn_lstm1a_sp +#WER dev93 (tgpr) 8.54 7.87 7.48 +# [online:] 8.57 8.02 7.49 +#WER dev93 (tg) 8.25 7.61 7.41 +# [online:] 8.34 7.70 7.40 +#WER dev93 (big-dict,tgpr) 6.24 5.71 5.64 +# [online:] 6.40 5.60 5.70 +#WER dev93 (big-dict,fg) 5.70 5.10 5.40 +# [online:] 5.77 5.21 5.19 +#WER eval92 (tgpr) 6.52 5.23 5.67 +# [online:] 6.56 5.44 5.60 +#WER eval92 (tg) 6.13 4.87 5.46 +# [online:] 6.24 4.87 5.53 +#WER eval92 (big-dict,tgpr) 3.88 3.24 3.69 +# [online:] 3.88 3.31 3.63 +#WER eval92 (big-dict,fg) 3.38 2.71 3.28 +# [online:] 3.53 2.92 3.31 +# Final train prob -0.0414 -0.0341 +# Final valid prob -0.0634 -0.0506 +# Final train prob (xent) -0.8216 -0.5643 +# Final valid prob (xent) -0.9208 -0.6648 + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +train_set=train_si284 +test_sets="test_dev93 test_eval92" +gmm=tri4b # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# LSTM/chain options +train_stage=-10 +label_delay=5 +xent_regularize=0.1 + +# training chunk-options +chunk_width=140,100,160 +chunk_left_context=40 +chunk_right_context=0 + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 15 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=20 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.deriv-truncate-margin=10 \ + --trainer.frames-per-iter=1500000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=10 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=0.99 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 17 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/lang/check_phones_compatible.sh \ + data/lang_test_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgpr \ + $tree_dir $tree_dir/graph_tgpr || exit 1; + + utils/lang/check_phones_compatible.sh \ + data/lang_test_bd_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_bd_tgpr \ + $tree_dir $tree_dir/graph_bd_tgpr || exit 1; +fi + +if [ $stage -le 18 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l [ ... ]" + echo "e.g.: $0 exp/nnet3/tdnn_{b,c}_sp" + echo "or (with epoch numbers for discriminative training):" + echo "$0 exp/nnet3/tdnn_b_sp_disc:{1,2,3}" + exit 1 +fi + +echo "# $0 $*" + +include_looped=false +if [ "$1" == "--looped" ]; then + include_looped=true + shift +fi +include_online=false +if [ "$1" == "--online" ]; then + include_online=true + shift +fi + + +used_epochs=false + +# this function set_names is used to separate the epoch-related parts of the name +# [for discriminative training] and the regular parts of the name. 
+# If called with a colon-free directory name, like: +# set_names exp/chain/tdnn_lstm1e_sp_bi_smbr +# it will set dir=exp/chain/tdnn_lstm1e_sp_bi_smbr and epoch_infix="" +# If called with something like: +# set_names exp/chain/tdnn_d_sp_smbr:3 +# it will set dir=exp/chain/tdnn_d_sp_smbr and epoch_infix="_epoch3" + + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) + epoch=$(echo $1 | cut -s -d: -f2) + if [ -z $epoch ]; then + epoch_infix="" + else + used_epochs=true + epoch_infix=_epoch${epoch} + fi +} + + + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +strings=( + "#WER dev93 (tgpr) " + "#WER dev93 (tg) " + "#WER dev93 (big-dict,tgpr) " + "#WER dev93 (big-dict,fg) " + "#WER eval92 (tgpr) " + "#WER eval92 (tg) " + "#WER eval92 (big-dict,tgpr)" + "#WER eval92 (big-dict,fg) ") + +for n in 0 1 2 3 4 5 6 7; do + echo -n "${strings[$n]}" + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(tgpr_dev93 tg_dev93 bd_tgpr_dev93 bd_tgpr_dev93_fg tgpr_eval92 tg_eval92 bd_tgpr_eval92 bd_tgpr_eval92_fg) + + wer=$(cat $dirname/decode_${decode_names[$n]}/scoring_kaldi/best_wer | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat $dirname/decode_looped_${decode_names[$n]}/scoring_kaldi/best_wer | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi + if $include_online; then + echo -n "# [online:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat ${dirname}_online/decode_${decode_names[$n]}/scoring_kaldi/best_wer | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi +done + + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.{final,combined}.log 2>/dev/null | grep log-like | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.{final,combined}.log 2>/dev/null | grep log-like | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train acc " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.{final,combined}.log 2>/dev/null | grep accuracy | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid acc " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.{final,combined}.log 2>/dev/null | grep accuracy | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo diff --git a/egs/wsj/s5/local/nnet3/run_ivector_common.sh b/egs/wsj/s5/local/nnet3/run_ivector_common.sh index 8d4cff326b3..e30988b7bf6 100755 --- a/egs/wsj/s5/local/nnet3/run_ivector_common.sh +++ b/egs/wsj/s5/local/nnet3/run_ivector_common.sh @@ -1,83 +1,215 @@ #!/bin/bash -# this script is called from scripts like run_ms.sh; it does the common stages -# of the build, such as feature extraction. -# This is actually the same as local/online/run_nnet2_common.sh, except -# for the directory names. +set -e -o pipefail -. 
cmd.sh -mfccdir=mfcc +# This script is called from scripts like local/nnet3/run_tdnn.sh and +# local/chain/run_tdnn.sh (and may eventually be called by more scripts). It +# contains the common feature preparation and iVector-related parts of the +# script. See those scripts for examples of usage. -stage=1 -. cmd.sh +stage=0 +nj=30 +train_set=train_si284 # you might set this to e.g. train. +test_sets="test_dev93 test_eval92" +gmm=tri4b # This specifies a GMM-dir from the features of the type you're training the system on; + # it should contain alignments for 'train_set'. + +num_threads_ubm=32 +nnet3_affix= # affix for exp/nnet3 directory to put iVector stuff in (e.g. + # in the tedlium recip it's _cleaned). + +. ./cmd.sh . ./path.sh -. ./utils/parse_options.sh +. utils/parse_options.sh + + +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_ali_${train_set}_sp + +for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do + if [ ! -f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + + + +if [ $stage -le 2 ] && [ -f data/${train_set}_sp_hires/feats.scp ]; then + echo "$0: data/${train_set}_sp_hires/feats.scp already exists." + echo " ... Please either remove it, or rerun this script with stage > 2." + exit 1 +fi if [ $stage -le 1 ]; then - for datadir in train_si284 test_eval93 test_dev93 test_eval92; do - utils/copy_data_dir.sh data/$datadir data/${datadir}_hires - steps/make_mfcc.sh --nj 40 --mfcc-config conf/mfcc_hires.conf \ - --cmd "$train_cmd" data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1; - steps/compute_cmvn_stats.sh data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1; - done - utils/subset_data_dir.sh --first data/train_si284_hires 7138 data/train_si84_hires || exit 1 + echo "$0: preparing directory for speed-perturbed data" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp fi if [ $stage -le 2 ]; then - # We need to build a small system just because we need the LDA+MLLT transform - # to train the diag-UBM on top of. We align the si84 data for this purpose. + echo "$0: creating high-resolution MFCC features" + + # this shows how you can split across multiple file-systems. we'll split the + # MFCC dir across multiple locations. You might want to be careful here, if you + # have multiple copies of Kaldi checked out and run the same recipe, not to let + # them overwrite each other. + mfccdir=data/${train_set}_sp_hires/data + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in ${train_set}_sp ${test_sets}; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. + utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires - steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \ - data/train_si84 data/lang exp/tri4b exp/nnet3/tri4b_ali_si84 + for datadir in ${train_set}_sp ${test_sets}; do + steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires + steps/compute_cmvn_stats.sh data/${datadir}_hires + utils/fix_data_dir.sh data/${datadir}_hires + done fi if [ $stage -le 3 ]; then - # Train a small system just for its LDA+MLLT transform. 
We use --num-iters 13 - # because after we get the transform (12th iter is the last), any further - # training is pointless. - steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \ - --realign-iters "" \ - --splice-opts "--left-context=3 --right-context=3" \ - 5000 10000 data/train_si84_hires data/lang \ - exp/nnet3/tri4b_ali_si84 exp/nnet3/tri5b + echo "$0: selecting segments of hires training data that were also present in the" + echo " ... original training data." + + # note, these data-dirs are temporary; we put them in a sub-directory + # of the place where we'll make the alignments. + temp_data_root=exp/nnet3${nnet3_affix}/tri5 + mkdir -p $temp_data_root + + utils/data/subset_data_dir.sh --utt-list data/${train_set}/feats.scp \ + data/${train_set}_sp_hires $temp_data_root/${train_set}_hires + + # note: essentially all the original segments should be in the hires data. + n1=$(wc -l /dev/null - for data in test_eval92 test_dev93 test_eval93; do - steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 8 \ - data/${data}_hires exp/nnet3/extractor exp/nnet3/ivectors_${data} || touch exp/nnet3/.error & - done - wait - [ -f exp/nnet3/.error ] && echo "$0: error extracting iVectors." && exit 1; + echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)" + utils/data/perturb_data_dir_speed_3way.sh \ + data/${train_set} data/${train_set}_sp +fi + +if [ $stage -le 8 ]; then + echo "$0: making MFCC features for low-resolution speed-perturbed data (needed for alignments)" + steps/make_mfcc.sh --nj $nj \ + --cmd "$train_cmd" data/${train_set}_sp + steps/compute_cmvn_stats.sh data/${train_set}_sp + echo "$0: fixing input data-dir to remove nonexistent features, in case some " + echo ".. speed-perturbed segments were too short." + utils/fix_data_dir.sh data/${train_set}_sp fi +if [ $stage -le 9 ]; then + if [ -f $ali_dir/ali.1.gz ]; then + echo "$0: alignments in $ali_dir appear to already exist. Please either remove them " + echo " ... or use a later --stage option." + exit 1 + fi + echo "$0: aligning with the perturbed low-resolution data" + steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set}_sp data/lang $gmm_dir $ali_dir +fi + + exit 0; diff --git a/egs/wsj/s5/local/nnet3/run_lstm.sh b/egs/wsj/s5/local/nnet3/run_lstm.sh index 2454fb5be63..d9af546b49b 100755 --- a/egs/wsj/s5/local/nnet3/run_lstm.sh +++ b/egs/wsj/s5/local/nnet3/run_lstm.sh @@ -1,5 +1,7 @@ #!/bin/bash +# This script is deprecated, see run_tdnn_lstm.sh + # this is a basic lstm script # LSTM script runs for more epochs than the TDNN script # and each epoch takes twice the time @@ -125,4 +127,3 @@ if [ $stage -le 9 ]; then fi exit 0; - diff --git a/egs/wsj/s5/local/nnet3/run_lstm_discriminative.sh b/egs/wsj/s5/local/nnet3/run_lstm_discriminative.sh index 124b04949a0..311ee14d16a 100755 --- a/egs/wsj/s5/local/nnet3/run_lstm_discriminative.sh +++ b/egs/wsj/s5/local/nnet3/run_lstm_discriminative.sh @@ -1,5 +1,8 @@ #!/bin/bash + +# This script is deprecated. + set -o pipefail set -e # this is run_discriminative.sh diff --git a/egs/wsj/s5/local/nnet3/run_tdnn.sh b/egs/wsj/s5/local/nnet3/run_tdnn.sh deleted file mode 100755 index 337c5656de4..00000000000 --- a/egs/wsj/s5/local/nnet3/run_tdnn.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/bin/bash - -# this is the standard "tdnn" system, built in nnet3; it's what we use to -# call multi-splice. - -. cmd.sh - - -# At this script level we don't support not running on GPU, as it would be painfully slow. 
-# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, -# --num-threads 16 and --minibatch-size 128. - -stage=0 -train_stage=-10 -dir=exp/nnet3/nnet_tdnn_a -. cmd.sh -. ./path.sh -. ./utils/parse_options.sh - - -if ! cuda-compiled; then - cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=650 + relu-renorm-layer name=tdnn2 dim=650 input=Append(-1,0,1) + relu-renorm-layer name=tdnn3 dim=650 input=Append(-1,0,1) + relu-renorm-layer name=tdnn4 dim=650 input=Append(-3,0,3) + relu-renorm-layer name=tdnn5 dim=650 input=Append(-6,-3,0) + output-layer name=output dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=3 \ + --trainer.samples-per-iter=400000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=10 \ + --trainer.optimization.initial-effective-lrate=0.0015 \ + --trainer.optimization.final-effective-lrate=0.00015 \ + --trainer.optimization.minibatch-size=256,128 \ + --egs.dir="$common_egs_dir" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --feat-dir=$train_data_dir \ + --ali-dir=$ali_dir \ + --lang=data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # note: for TDNNs, looped decoding gives exactly the same results + # as regular decoding, so there is no point in testing it separately. + # We use regular decoding because it supports multi-threaded (we just + # didn't create the binary for that, for looped decoding, so far). + rm $dir/.error || true 2>/dev/null + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nj=$(wc -l b}.sh +# There seems to be no consistent difference. + +# run_tdnn_1a.sh is the standard "tdnn" system, built in nnet3 with xconfigs. + +# local/nnet3/compare_wer.sh exp/nnet3/tdnn1a_sp exp/nnet3/tdnn1b_sp +# System tdnn1a_sp tdnn1b_sp +#WER dev93 (tgpr) 9.18 9.12 +#WER dev93 (tg) 8.59 8.51 +#WER dev93 (big-dict,tgpr) 6.45 6.19 +#WER dev93 (big-dict,fg) 5.83 5.78 +#WER eval92 (tgpr) 6.15 6.33 +#WER eval92 (tg) 5.55 5.74 +#WER eval92 (big-dict,tgpr) 3.58 3.62 +#WER eval92 (big-dict,fg) 2.98 3.10 +# Final train prob -0.7200 -0.6035 +# Final valid prob -0.8834 -0.7578 +# Final train acc 0.7762 0.8015 +# Final valid acc 0.7301 0.7607 + + +set -e -o pipefail -u + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). 
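# (A note on the convention used below: any variable assigned before the
# ". ./utils/parse_options.sh" line can be overridden from the command line as
# --option-name value, with hyphens mapped to underscores.  Purely as an
# illustration, and assuming this tuning script is
# local/nnet3/tuning/run_tdnn_1b.sh as the surrounding diff suggests, one could
# rerun just the network training and decoding while reusing previously dumped
# egs with something like:
#   local/nnet3/tuning/run_tdnn_1b.sh --stage 13 \
#     --common-egs-dir exp/nnet3/tdnn1b_sp/egs --remove-egs false
# where stage 13 is the train_dnn.py stage of this script, and the egs path is
# only an example of what --common-egs-dir might point to.)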
+stage=0 +nj=30 + +train_set=train_si284 +test_sets="test_dev93 test_eval92" +gmm=tri4b # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +tdnn_affix=1b #affix for TDNN directory e.g. "1a" or "1b", in case we change the configuration. + +# Options which are not passed through to run_ivector_common.sh +train_stage=-10 +remove_egs=true +srand=0 +reporting_email= +# set common_egs_dir to use previously dumped egs. +common_egs_dir= + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=750 + relu-renorm-layer name=tdnn2 dim=750 input=Append(-1,2) + relu-renorm-layer name=tdnn3 dim=750 input=Append(-3,3) + relu-renorm-layer name=tdnn4 dim=750 input=Append(-7,2) + relu-renorm-layer name=tdnn5 dim=750 input=Append(-3,3) + relu-renorm-layer name=tdnn6 dim=750 + output-layer name=output dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=3 \ + --trainer.samples-per-iter=400000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=10 \ + --trainer.optimization.initial-effective-lrate=0.0015 \ + --trainer.optimization.final-effective-lrate=0.00015 \ + --trainer.optimization.minibatch-size=256,128 \ + --egs.dir="$common_egs_dir" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --feat-dir=$train_data_dir \ + --ali-dir=$ali_dir \ + --lang=data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # note: for TDNNs, looped decoding gives exactly the same results + # as regular decoding, so there is no point in testing it separately. + # We use regular decoding because it supports multi-threaded (we just + # didn't create the binary for that, for looped decoding, so far). 
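# (The block below follows a common Kaldi shell pattern: each test set is
# decoded in a backgrounded subshell, any failure touches a marker file, and
# the marker is checked after "wait".  Written out in isolation, with
# decode_one_set standing in for the steps/nnet3/decode.sh call used here,
# the pattern is:
#   rm $dir/.error 2>/dev/null || true   # clear a stale marker, ignoring "no such file"
#   for data in $test_sets; do
#     ( decode_one_set $data ) || touch $dir/.error &
#   done
#   wait
#   if [ -f $dir/.error ]; then echo "$0: something went wrong in decoding"; exit 1; fi
# Note the redirection belongs on the rm: in the variant
# "rm $dir/.error || true 2>/dev/null" the 2>/dev/null applies to "true", so a
# missing marker file still prints a harmless warning.)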
+ rm $dir/.error || true 2>/dev/null + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nj=$(wc -l 3413 combine=-0.55->-0.54 loglike:train/valid[67,101,combined]=(-0.63,-0.55,-0.55/-0.71,-0.63,-0.63) accuracy:train/valid[67,101,combined]=(0.80,0.82,0.82/0.76,0.78,0.78) + + + +# local/nnet3/compare_wer.sh --looped --online exp/nnet3/tdnn1a_sp exp/nnet3/tdnn_lstm1a_sp 2>/dev/null +# local/nnet3/compare_wer.sh --looped --online exp/nnet3/tdnn1a_sp exp/nnet3/tdnn_lstm1a_sp +# System tdnn1a_sp tdnn_lstm1a_sp +#WER dev93 (tgpr) 9.18 8.54 +# [looped:] 8.54 +# [online:] 8.57 +#WER dev93 (tg) 8.59 8.25 +# [looped:] 8.21 +# [online:] 8.34 +#WER dev93 (big-dict,tgpr) 6.45 6.24 +# [looped:] 6.28 +# [online:] 6.40 +#WER dev93 (big-dict,fg) 5.83 5.70 +# [looped:] 5.70 +# [online:] 5.77 +#WER eval92 (tgpr) 6.15 6.52 +# [looped:] 6.45 +# [online:] 6.56 +#WER eval92 (tg) 5.55 6.13 +# [looped:] 6.08 +# [online:] 6.24 +#WER eval92 (big-dict,tgpr) 3.58 3.88 +# [looped:] 3.93 +# [online:] 3.88 +#WER eval92 (big-dict,fg) 2.98 3.38 +# [looped:] 3.47 +# [online:] 3.53 +# Final train prob -0.7200 -0.5492 +# Final valid prob -0.8834 -0.6343 +# Final train acc 0.7762 0.8154 +# Final valid acc 0.7301 0.7849 + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +train_set=train_si284 +test_sets="test_dev93 test_eval92" +gmm=tri4b # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# LSTM options +train_stage=-10 +label_delay=5 + +# training chunk-options +chunk_width=40,30,20 +chunk_left_context=40 +chunk_right_context=0 + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=520 + relu-renorm-layer name=tdnn2 dim=520 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=520 recurrent-projection-dim=130 non-recurrent-projection-dim=130 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn3 dim=520 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=520 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=520 recurrent-projection-dim=130 non-recurrent-projection-dim=130 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn5 dim=520 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=520 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=520 recurrent-projection-dim=130 non-recurrent-projection-dim=130 decay-time=20 delay=-3 + + output-layer name=output input=lstm3 output-delay=$label_delay dim=$num_targets max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_rnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=6 \ + --trainer.deriv-truncate-margin=10 \ + --trainer.samples-per-iter=20000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=10 \ + --trainer.optimization.initial-effective-lrate=0.0003 \ + --trainer.optimization.final-effective-lrate=0.00003 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.rnn.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.5 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --feat-dir=$train_data_dir \ + --ali-dir=$ali_dir \ + --lang=$lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 14 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + data_affix=$(echo $data | sed s/test_//) + nj=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nj=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nj=$(wc -l 3205 combine=-0.43->-0.42 loglike:train/valid[89,135,combined]=(-0.51,-0.39,-0.38/-0.59,-0.51,-0.51) accuracy:train/valid[89,135,combined]=(0.85,0.88,0.88/0.82,0.84,0.84) + + +# It seems to be a little worse the 
regular-frame-rate system. + +# local/nnet3/compare_wer.sh --looped exp/nnet3/tdnn_lstm1a_sp exp/nnet3/tdnn_lstm_lfr1a_sp +# System tdnn_lstm1a_sp tdnn_lstm_lfr1a_sp +#WER dev93 (tgpr) 8.54 9.02 +# [looped:] 8.54 8.99 +#WER dev93 (tg) 8.25 8.60 +# [looped:] 8.21 8.54 +#WER dev93 (big-dict,tgpr) 6.24 6.85 +# [looped:] 6.28 6.81 +#WER dev93 (big-dict,fg) 5.70 6.33 +# [looped:] 5.70 6.33 +#WER eval92 (tgpr) 6.52 6.52 +# [looped:] 6.45 6.42 +#WER eval92 (tg) 6.13 6.01 +# [looped:] 6.08 5.92 +#WER eval92 (big-dict,tgpr) 3.88 4.22 +# [looped:] 3.93 4.20 +#WER eval92 (big-dict,fg) 3.38 3.76 +# [looped:] 3.47 3.79 +# Final train prob -0.5492 -0.3100 +# Final valid prob -0.6343 -0.4646 +# Final train acc 0.8154 0.9051 +# Final valid acc 0.7849 0.8615 + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +train_set=train_si284 +test_sets="test_dev93 test_eval92" +gmm=tri4b # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# LSTM options +train_stage=-10 +label_delay=5 + +# training chunk-options +chunk_width=40,30,20 +chunk_left_context=40 +chunk_right_context=0 + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 13 ]; then + # Build a tree using our new topology and a reduced sampling rate. + # We use 4000 leaves, which is a little less than the number used + # in the baseline GMM system (5k) in this setup, since generally + # LFR systems do best with somewhat fewer leaves. + # + # To get the stats to build the tree this script only uses every third frame, + # but it dumps converted alignments that essentially have 3 different + # frame-shifted versions of the alignment interpolated together; these can be + # used without modification in getting labels for training. 
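# (Concretely, a frame-subsampling factor of 3 means the network is evaluated
# at one third of the input frame rate, so an utterance with T input frames
# produces roughly T/3 network outputs; this is the same factor that stage 15
# below records with "echo 3 >$dir/frame_subsampling_factor".  As an optional,
# purely illustrative sanity check after this stage, the leaf count actually
# obtained can be read back with the same tool the config-generation stage
# uses:
#   tree-info $treedir/tree | grep num-pdfs   # expect a value near the 4000 leaves requested
# )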
+ steps/nnet3/chain/build_tree.sh \ + --repeat-frames true --frame-subsampling-factor 3 \ + --cmd "$train_cmd" 4000 data/${train_set}_sp \ + $lang $ali_dir $treedir +fi + + +if [ $stage -le 14 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=520 + relu-renorm-layer name=tdnn2 dim=520 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=520 recurrent-projection-dim=130 non-recurrent-projection-dim=130 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn3 dim=520 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=520 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=520 recurrent-projection-dim=130 non-recurrent-projection-dim=130 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn5 dim=520 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=520 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=520 recurrent-projection-dim=130 non-recurrent-projection-dim=130 decay-time=20 delay=-3 + + output-layer name=output input=lstm3 output-delay=$label_delay dim=$num_targets max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 15 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_rnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=6 \ + --trainer.deriv-truncate-margin=10 \ + --trainer.samples-per-iter=10000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=10 \ + --trainer.optimization.initial-effective-lrate=0.0003 \ + --trainer.optimization.final-effective-lrate=0.00003 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.rnn.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.5 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --feat-dir=$train_data_dir \ + --ali-dir=$treedir \ + --lang=$lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; + echo 3 >$dir/frame_subsampling_factor +fi + +if [ $stage -le 16 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). 
So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/lang/check_phones_compatible.sh \ + data/lang_test_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh --self-loop-scale 0.333 data/lang_test_tgpr \ + $dir $dir/graph_tgpr || exit 1; + + utils/lang/check_phones_compatible.sh \ + data/lang_test_bd_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh --self-loop-scale 0.333 data/lang_test_bd_tgpr \ + $dir $dir/graph_bd_tgpr || exit 1; +fi + +if [ $stage -le 17 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l data/$y/utt2spk; cp data/$y/utt2spk data/$y/spk2utt; - steps/compute_cmvn_stats.sh data/$y exp/make_mfcc/$y $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/$y exp/make_mfcc/$y $mfccdir || exit 1; done @@ -33,7 +33,7 @@ steps/decode_fmllr.sh --nj 8 --cmd "$decode_cmd" \ # get the fMLLR basis. steps/get_fmllr_basis.sh --cmd "$train_cmd" \ - data/train_si84 data/lang${lang_suffix} exp/tri3b + data/train_si284 data/lang${lang_suffix} exp/tri3b # decoding tri3b with basis fMLLR steps/decode_basis_fmllr.sh --nj 10 --cmd "$decode_cmd" \ @@ -50,5 +50,3 @@ steps/decode_basis_fmllr.sh --nj 10 --cmd "$decode_cmd" \ steps/decode_basis_fmllr.sh --nj 8 --cmd "$decode_cmd" \ exp/tri3b/graph${lang_suffix}_tgpr data/test_eval92_utt \ exp/tri3b/decode${lang_suffix}_tgpr_eval92_basis_utt || exit 1; - - diff --git a/egs/wsj/s5/local/run_mmi_tri2b.sh b/egs/wsj/s5/local/run_mmi_tri2b.sh deleted file mode 100755 index d7ddbfbaf62..00000000000 --- a/egs/wsj/s5/local/run_mmi_tri2b.sh +++ /dev/null @@ -1,73 +0,0 @@ -#!/bin/bash - -lang_suffix= - -echo "$0 $@" # Print the command line for logging -. utils/parse_options.sh || exit 1; - -. ./cmd.sh - -# Train and test MMI (and boosted MMI) on tri2b system. -steps/make_denlats.sh --sub-split 20 --nj 10 --cmd "$train_cmd" \ - data/train_si84 data/lang${lang_suffix} \ - exp/tri2b exp/tri2b_denlats_si84 || exit 1; - -# train the basic MMI system. -steps/train_mmi.sh --cmd "$train_cmd" \ - data/train_si84 data/lang${lang_suffix} exp/tri2b_ali_si84 \ - exp/tri2b_denlats_si84 exp/tri2b_mmi || exit 1; -for iter in 3 4; do - steps/decode_si.sh --nj 10 --cmd "$decode_cmd" --iter $iter \ - exp/tri2b/graph${lang_suffix}_tgpr data/test_dev93 \ - exp/tri2b_mmi/decode${lang_suffix}_tgpr_dev93_it$iter & - steps/decode_si.sh --nj 8 --cmd "$decode_cmd" --iter $iter \ - exp/tri2b/graph${lang_suffix}_tgpr data/test_eval92 \ - exp/tri2b_mmi/decode${lang_suffix}_tgpr_eval92_it$iter & -done - -# MMI with 0.1 boosting factor. 
-steps/train_mmi.sh --cmd "$train_cmd" --boost 0.1 \ - data/train_si84 data/lang${lang_suffix} exp/tri2b_ali_si84 \ - exp/tri2b_denlats_si84 exp/tri2b_mmi_b0.1 || exit 1; - -for iter in 3 4; do - steps/decode_si.sh --nj 10 --cmd "$decode_cmd" --iter $iter \ - exp/tri2b/graph${lang_suffix}_tgpr data/test_dev93 \ - exp/tri2b_mmi_b0.1/decode${lang_suffix}_tgpr_dev93_it$iter & - steps/decode_si.sh --nj 8 --cmd "$decode_cmd" --iter $iter \ - exp/tri2b/graph${lang_suffix}_tgpr data/test_eval92 \ - exp/tri2b_mmi_b0.1/decode${lang_suffix}_tgpr_eval92_it$iter & -done - - -# Train a UBM with 400 components, for fMMI. -steps/train_diag_ubm.sh --silence-weight 0.5 --nj 10 --cmd "$train_cmd" \ - 400 data/train_si84 data/lang${lang_suffix} exp/tri2b_ali_si84 exp/dubm2b - -steps/train_mmi_fmmi.sh --boost 0.1 --cmd "$train_cmd" \ - data/train_si84 data/lang${lang_suffix} exp/tri2b_ali_si84 \ - exp/dubm2b exp/tri2b_denlats_si84 exp/tri2b_fmmi_b0.1 - -for iter in `seq 3 8`; do - steps/decode_fmmi.sh --nj 10 --cmd "$decode_cmd" --iter $iter \ - exp/tri2b/graph${lang_suffix}_tgpr data/test_dev93 \ - exp/tri2b_fmmi_b0.1/decode${lang_suffix}_tgpr_dev93_it$iter & -done - -steps/train_mmi_fmmi.sh --learning-rate 0.005 --boost 0.1 --cmd "$train_cmd" \ - data/train_si84 data/lang${lang_suffix} exp/tri2b_ali_si84 \ - exp/dubm2b exp/tri2b_denlats_si84 exp/tri2b_fmmi_b0.1_lr0.005 || exit 1; -for iter in `seq 3 8`; do - steps/decode_fmmi.sh --nj 10 --cmd "$decode_cmd" --iter $iter \ - exp/tri2b/graph${lang_suffix}_tgpr data/test_dev93 \ - exp/tri2b_fmmi_b0.1_lr0.005/decode${lang_suffix}_tgpr_dev93_it$iter & -done - -steps/train_mmi_fmmi_indirect.sh --boost 0.1 --cmd "$train_cmd" \ - data/train_si84 data/lang${lang_suffix} exp/tri2b_ali_si84 \ - exp/dubm2b exp/tri2b_denlats_si84 exp/tri2b_fmmi_indirect_b0.1 -for iter in `seq 3 8`; do - steps/decode_fmmi.sh --nj 10 --cmd "$decode_cmd" --iter $iter \ - exp/tri2b/graph${lang_suffix}_tgpr data/test_dev93 \ - exp/tri2b_fmmi_indirect_b0.1/decode${lang_suffix}_tgpr_dev93_it$iter & -done diff --git a/egs/wsj/s5/run.sh b/egs/wsj/s5/run.sh index fb004117658..4d505f5da3a 100755 --- a/egs/wsj/s5/run.sh +++ b/egs/wsj/s5/run.sh @@ -1,7 +1,15 @@ #!/bin/bash +stage=0 +train=true # set to false to disable the training-related scripts + # note: you probably only want to set --train false if you + # are using at least --stage 1. +decode=true # set to false to disable the decoding-related scripts. + . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. +. utils/parse_options.sh # e.g. this parses the --stage option if supplied. + # This is a shell script, but it's recommended that you run the commands one by # one by copying and pasting into the shell. @@ -18,334 +26,313 @@ wsj0=/export/corpora5/LDC/LDC93S6B wsj1=/export/corpora5/LDC/LDC94S13B -local/wsj_data_prep.sh $wsj0/??-{?,??}.? $wsj1/??-{?,??}.? || exit 1; -# Sometimes, we have seen WSJ distributions that do not have subdirectories -# like '11-13.1', but instead have 'doc', 'si_et_05', etc. directly under the -# wsj0 or wsj1 directories. In such cases, try the following: -# -# corpus=/exports/work/inf_hcrc_cstr_general/corpora/wsj -# local/cstr_wsj_data_prep.sh $corpus -# rm data/local/dict/lexiconp.txt -# $corpus must contain a 'wsj0' and a 'wsj1' subdirectory for this to work. -# -# "nosp" refers to the dictionary before silence probabilities and pronunciation -# probabilities are added. 
-local/wsj_prepare_dict.sh --dict-suffix "_nosp" || exit 1; - -utils/prepare_lang.sh data/local/dict_nosp \ - "" data/local/lang_tmp_nosp data/lang_nosp || exit 1; - -local/wsj_format_data.sh --lang-suffix "_nosp" || exit 1; - - # We suggest to run the next three commands in the background, - # as they are not a precondition for the system building and - # most of the tests: these commands build a dictionary - # containing many of the OOVs in the WSJ LM training data, - # and an LM trained directly on that data (i.e. not just - # copying the arpa files from the disks from LDC). - # Caution: the commands below will only work if $decode_cmd - # is setup to use qsub. Else, just remove the --cmd option. - # NOTE: If you have a setup corresponding to the older cstr_wsj_data_prep.sh style, - # use local/cstr_wsj_extend_dict.sh --dict-suffix "_nosp" $corpus/wsj1/doc/ instead. +if [ $stage -le 0 ]; then + # data preparation. + local/wsj_data_prep.sh $wsj0/??-{?,??}.? $wsj1/??-{?,??}.? || exit 1; + + # Sometimes, we have seen WSJ distributions that do not have subdirectories + # like '11-13.1', but instead have 'doc', 'si_et_05', etc. directly under the + # wsj0 or wsj1 directories. In such cases, try the following: + # + # corpus=/exports/work/inf_hcrc_cstr_general/corpora/wsj + # local/cstr_wsj_data_prep.sh $corpus + # rm data/local/dict/lexiconp.txt + # $corpus must contain a 'wsj0' and a 'wsj1' subdirectory for this to work. + # + # "nosp" refers to the dictionary before silence probabilities and pronunciation + # probabilities are added. + local/wsj_prepare_dict.sh --dict-suffix "_nosp" || exit 1; + + utils/prepare_lang.sh data/local/dict_nosp \ + "" data/local/lang_tmp_nosp data/lang_nosp || exit 1; + + local/wsj_format_data.sh --lang-suffix "_nosp" || exit 1; + + # We suggest to run the next three commands in the background, + # as they are not a precondition for the system building and + # most of the tests: these commands build a dictionary + # containing many of the OOVs in the WSJ LM training data, + # and an LM trained directly on that data (i.e. not just + # copying the arpa files from the disks from LDC). + # Caution: the commands below will only work if $decode_cmd + # is setup to use qsub. Else, just remove the --cmd option. + # NOTE: If you have a setup corresponding to the older cstr_wsj_data_prep.sh style, + # use local/cstr_wsj_extend_dict.sh --dict-suffix "_nosp" $corpus/wsj1/doc/ instead. ( - local/wsj_extend_dict.sh --dict-suffix "_nosp" $wsj1/13-32.1 && \ - utils/prepare_lang.sh data/local/dict_nosp_larger \ - "" data/local/lang_tmp_nosp_larger data/lang_nosp_bd && \ - local/wsj_train_lms.sh --dict-suffix "_nosp" && - local/wsj_format_local_lms.sh --lang-suffix "_nosp" # && + local/wsj_extend_dict.sh --dict-suffix "_nosp" $wsj1/13-32.1 && \ + utils/prepare_lang.sh data/local/dict_nosp_larger \ + "" data/local/lang_tmp_nosp_larger data/lang_nosp_bd && \ + local/wsj_train_lms.sh --dict-suffix "_nosp" && + local/wsj_format_local_lms.sh --lang-suffix "_nosp" # && ) & -# Now make MFCC features. -# mfccdir should be some place with a largish disk where you -# want to store MFCC features. - -for x in test_eval92 test_eval93 test_dev93 train_si284; do - steps/make_mfcc.sh --cmd "$train_cmd" --nj 20 data/$x || exit 1; - steps/compute_cmvn_stats.sh data/$x || exit 1; -done - -utils/subset_data_dir.sh --first data/train_si284 7138 data/train_si84 || exit 1 - -# Now make subset with the shortest 2k utterances from si-84. 
-utils/subset_data_dir.sh --shortest data/train_si84 2000 data/train_si84_2kshort || exit 1; - -# Now make subset with half of the data from si-84. -utils/subset_data_dir.sh data/train_si84 3500 data/train_si84_half || exit 1; - - -# Note: the --boost-silence option should probably be omitted by default -# for normal setups. It doesn't always help. [it's to discourage non-silence -# models from modeling silence.] -steps/train_mono.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \ - data/train_si84_2kshort data/lang_nosp exp/mono0a || exit 1; - -( - utils/mkgraph.sh data/lang_nosp_test_tgpr \ - exp/mono0a exp/mono0a/graph_nosp_tgpr && \ - steps/decode.sh --nj 10 --cmd "$decode_cmd" exp/mono0a/graph_nosp_tgpr \ - data/test_dev93 exp/mono0a/decode_nosp_tgpr_dev93 && \ - steps/decode.sh --nj 8 --cmd "$decode_cmd" exp/mono0a/graph_nosp_tgpr \ - data/test_eval92 exp/mono0a/decode_nosp_tgpr_eval92 -) & - -steps/align_si.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \ - data/train_si84_half data/lang_nosp exp/mono0a exp/mono0a_ali || exit 1; - -steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" 2000 10000 \ - data/train_si84_half data/lang_nosp exp/mono0a_ali exp/tri1 || exit 1; - -while [ ! -f data/lang_nosp_test_tgpr/tmp/LG.fst ] || \ - [ -z data/lang_nosp_test_tgpr/tmp/LG.fst ]; do - sleep 20; -done -sleep 30; -# or the mono mkgraph.sh might be writing -# data/lang_test_tgpr/tmp/LG.fst which will cause this to fail. - -utils/mkgraph.sh data/lang_nosp_test_tgpr \ - exp/tri1 exp/tri1/graph_nosp_tgpr || exit 1; - -steps/decode.sh --nj 10 --cmd "$decode_cmd" exp/tri1/graph_nosp_tgpr \ - data/test_dev93 exp/tri1/decode_nosp_tgpr_dev93 || exit 1; -steps/decode.sh --nj 8 --cmd "$decode_cmd" exp/tri1/graph_nosp_tgpr \ - data/test_eval92 exp/tri1/decode_nosp_tgpr_eval92 || exit 1; - -# test various modes of LM rescoring (4 is the default one). -# This is just confirming they're equivalent. -for mode in 1 2 3 4; do - steps/lmrescore.sh --mode $mode --cmd "$decode_cmd" \ - data/lang_nosp_test_{tgpr,tg} data/test_dev93 \ - exp/tri1/decode_nosp_tgpr_dev93 \ - exp/tri1/decode_nosp_tgpr_dev93_tg$mode || exit 1; -done - - -## the following command demonstrates how to get lattices that are -## "word-aligned" (arcs coincide with words, with boundaries in the right -## place). -#sil_label=`grep '!SIL' data/lang_nosp_test_tgpr/words.txt | awk '{print $2}'` -#steps/word_align_lattices.sh --cmd "$train_cmd" --silence-label $sil_label \ -# data/lang_nosp_test_tgpr exp/tri1/decode_nosp_tgpr_dev93 \ -# exp/tri1/decode_nosp_tgpr_dev93_aligned || exit 1; - -steps/align_si.sh --nj 10 --cmd "$train_cmd" \ - data/train_si84 data/lang_nosp exp/tri1 exp/tri1_ali_si84 || exit 1; - -steps/train_lda_mllt.sh --cmd "$train_cmd" \ - --splice-opts "--left-context=3 --right-context=3" 2500 15000 \ - data/train_si84 data/lang_nosp exp/tri1_ali_si84 exp/tri2b || exit 1; - -utils/mkgraph.sh data/lang_nosp_test_tgpr \ - exp/tri2b exp/tri2b/graph_nosp_tgpr || exit 1; -steps/decode.sh --nj 10 --cmd "$decode_cmd" exp/tri2b/graph_nosp_tgpr \ - data/test_dev93 exp/tri2b/decode_nosp_tgpr_dev93 || exit 1; -steps/decode.sh --nj 8 --cmd "$decode_cmd" exp/tri2b/graph_nosp_tgpr \ - data/test_eval92 exp/tri2b/decode_nosp_tgpr_eval92 || exit 1; - -# At this point, you could run the example scripts that show how VTLN works. -# We haven't included this in the default recipes yet. 
-# local/run_vtln.sh --lang-suffix "_nosp" -# local/run_vtln2.sh --lang-suffix "_nosp" - -# Now, with dev93, compare lattice rescoring with biglm decoding, -# going from tgpr to tg. Note: results are not the same, even though they should -# be, and I believe this is due to the beams not being wide enough. The pruning -# seems to be a bit too narrow in the current scripts (got at least 0.7% absolute -# improvement from loosening beams from their current values). - -steps/decode_biglm.sh --nj 10 --cmd "$decode_cmd" \ - exp/tri2b/graph_nosp_tgpr data/lang_test_{tgpr,tg}/G.fst \ - data/test_dev93 exp/tri2b/decode_nosp_tgpr_dev93_tg_biglm - -# baseline via LM rescoring of lattices. -steps/lmrescore.sh --cmd "$decode_cmd" \ - data/lang_nosp_test_tgpr/ data/lang_nosp_test_tg/ \ - data/test_dev93 exp/tri2b/decode_nosp_tgpr_dev93 \ - exp/tri2b/decode_nosp_tgpr_dev93_tg || exit 1; - -# Trying Minimum Bayes Risk decoding (like Confusion Network decoding): -mkdir exp/tri2b/decode_nosp_tgpr_dev93_tg_mbr -cp exp/tri2b/decode_nosp_tgpr_dev93_tg/lat.*.gz \ - exp/tri2b/decode_nosp_tgpr_dev93_tg_mbr -local/score_mbr.sh --cmd "$decode_cmd" \ - data/test_dev93/ data/lang_nosp_test_tgpr/ \ - exp/tri2b/decode_nosp_tgpr_dev93_tg_mbr - -# This script trains a delta+delta-delta system. It's not really recommended or + # Now make MFCC features. + # mfccdir should be some place with a largish disk where you + # want to store MFCC features. + + for x in test_eval92 test_eval93 test_dev93 train_si284; do + steps/make_mfcc.sh --cmd "$train_cmd" --nj 20 data/$x || exit 1; + steps/compute_cmvn_stats.sh data/$x || exit 1; + done + + utils/subset_data_dir.sh --first data/train_si284 7138 data/train_si84 || exit 1 + + # Now make subset with the shortest 2k utterances from si-84. + utils/subset_data_dir.sh --shortest data/train_si84 2000 data/train_si84_2kshort || exit 1; + + # Now make subset with half of the data from si-84. + utils/subset_data_dir.sh data/train_si84 3500 data/train_si84_half || exit 1; +fi + + +if [ $stage -le 1 ]; then + # monophone + + + # Note: the --boost-silence option should probably be omitted by default + # for normal setups. It doesn't always help. [it's to discourage non-silence + # models from modeling silence.] 
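# (Since this version of run.sh takes --stage, --train and --decode options,
# parsed by utils/parse_options.sh near the top, individual parts can be
# re-run selectively.  For example, to redo only the decoding from the
# monophone stage onwards without retraining (values illustrative):
#   ./run.sh --stage 1 --train false --decode true
# )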
+ if $train; then + steps/train_mono.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \ + data/train_si84_2kshort data/lang_nosp exp/mono0a || exit 1; + fi + + if $decode; then + utils/mkgraph.sh data/lang_nosp_test_tgpr exp/mono0a exp/mono0a/graph_nosp_tgpr && \ + steps/decode.sh --nj 10 --cmd "$decode_cmd" exp/mono0a/graph_nosp_tgpr \ + data/test_dev93 exp/mono0a/decode_nosp_tgpr_dev93 && \ + steps/decode.sh --nj 8 --cmd "$decode_cmd" exp/mono0a/graph_nosp_tgpr \ + data/test_eval92 exp/mono0a/decode_nosp_tgpr_eval92 + fi +fi + +if [ $stage -le 2 ]; then + # tri1 + if $train; then + steps/align_si.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \ + data/train_si84_half data/lang_nosp exp/mono0a exp/mono0a_ali || exit 1; + + steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" 2000 10000 \ + data/train_si84_half data/lang_nosp exp/mono0a_ali exp/tri1 || exit 1; + fi + + if $decode; then + utils/mkgraph.sh data/lang_nosp_test_tgpr \ + exp/tri1 exp/tri1/graph_nosp_tgpr || exit 1; + + for data in dev93 eval92; do + nspk=$(wc -l " data/local/lang_tmp data/lang || exit 1; - -for lm_suffix in bg bg_5k tg tg_5k tgpr tgpr_5k; do - mkdir -p data/lang_test_${lm_suffix} - cp -r data/lang/* data/lang_test_${lm_suffix}/ || exit 1; - rm -rf data/lang_test_${lm_suffix}/tmp - cp data/lang_nosp_test_${lm_suffix}/G.* data/lang_test_${lm_suffix}/ -done - -# Silprob for larger lexicon. -utils/dict_dir_add_pronprobs.sh --max-normalize true \ - data/local/dict_nosp_larger \ - exp/tri4b/pron_counts_nowb.txt exp/tri4b/sil_counts_nowb.txt \ - exp/tri4b/pron_bigram_counts_nowb.txt data/local/dict_larger || exit 1 - -utils/prepare_lang.sh data/local/dict_larger \ - "" data/local/lang_tmp_larger data/lang_bd || exit 1; - -for lm_suffix in tgpr tgconst tg fgpr fgconst fg; do - mkdir -p data/lang_test_bd_${lm_suffix} - cp -r data/lang_bd/* data/lang_test_bd_${lm_suffix}/ || exit 1; - rm -rf data/lang_test_bd_${lm_suffix}/tmp - cp data/lang_nosp_test_bd_${lm_suffix}/G.* data/lang_test_bd_${lm_suffix}/ -done - -( - utils/mkgraph.sh data/lang_test_tgpr exp/tri4b exp/tri4b/graph_tgpr || exit 1; - steps/decode_fmllr.sh --nj 10 --cmd "$decode_cmd" \ - exp/tri4b/graph_tgpr data/test_dev93 exp/tri4b/decode_tgpr_dev93 || exit 1; - steps/decode_fmllr.sh --nj 8 --cmd "$decode_cmd" \ - exp/tri4b/graph_tgpr data/test_eval92 exp/tri4b/decode_tgpr_eval92 || exit 1; - - utils/mkgraph.sh data/lang_test_bd_tgpr \ - exp/tri4b exp/tri4b/graph_bd_tgpr || exit 1; - steps/decode_fmllr.sh --nj 10 --cmd "$decode_cmd" \ - exp/tri4b/graph_bd_tgpr data/test_dev93 \ - exp/tri4b/decode_bd_tgpr_dev93 || exit 1; - steps/decode_fmllr.sh --nj 8 --cmd "$decode_cmd" \ - exp/tri4b/graph_bd_tgpr data/test_eval92 \ - exp/tri4b/decode_bd_tgpr_eval92 || exit 1; -) & +if [ $stage -le 4 ]; then + # From 2b system, train 3b which is LDA + MLLT + SAT. + + # Align tri2b system with all the si284 data. + if $train; then + steps/align_si.sh --nj 10 --cmd "$train_cmd" \ + data/train_si284 data/lang_nosp exp/tri2b exp/tri2b_ali_si284 || exit 1; + + steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \ + data/train_si284 data/lang_nosp exp/tri2b_ali_si284 exp/tri3b || exit 1; + fi + + if $decode; then + utils/mkgraph.sh data/lang_nosp_test_tgpr \ + exp/tri3b exp/tri3b/graph_nosp_tgpr || exit 1; + + # the larger dictionary ("big-dict"/bd) + locally produced LM. 
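The per-test-set decode loops of the staged script are truncated above ("nspk=$(wc -l ..."); they all follow the same pattern, capping the number of decode jobs at the number of speakers in each test set. A minimal sketch for the tri3b system, assuming the usual Kaldi data layout; the big-dict graph built just below is decoded the same way:

for data in dev93 eval92; do
  nspk=$(wc -l <data/test_${data}/spk2utt)   # at most one decode job per speaker
  steps/decode_fmllr.sh --nj ${nspk} --cmd "$decode_cmd" \
    exp/tri3b/graph_nosp_tgpr data/test_${data} \
    exp/tri3b/decode_nosp_tgpr_${data} || exit 1
done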
+ utils/mkgraph.sh data/lang_nosp_test_bd_tgpr \ + exp/tri3b exp/tri3b/graph_nosp_bd_tgpr || exit 1; + + # At this point you could run the command below; this gets + # results that demonstrate the basis-fMLLR adaptation (adaptation + # on small amounts of adaptation data). + # local/run_basis_fmllr.sh --lang-suffix "_nosp" + + for data in dev93 eval92; do + nspk=$(wc -l " data/local/lang_tmp data/lang || exit 1; + + for lm_suffix in bg bg_5k tg tg_5k tgpr tgpr_5k; do + mkdir -p data/lang_test_${lm_suffix} + cp -r data/lang/* data/lang_test_${lm_suffix}/ || exit 1; + rm -rf data/lang_test_${lm_suffix}/tmp + cp data/lang_nosp_test_${lm_suffix}/G.* data/lang_test_${lm_suffix}/ + done + + # Silprob for larger ("bd") lexicon. + utils/dict_dir_add_pronprobs.sh --max-normalize true \ + data/local/dict_nosp_larger \ + exp/tri3b/pron_counts_nowb.txt exp/tri3b/sil_counts_nowb.txt \ + exp/tri3b/pron_bigram_counts_nowb.txt data/local/dict_larger || exit 1 + + utils/prepare_lang.sh data/local/dict_larger \ + "" data/local/lang_tmp_larger data/lang_bd || exit 1; + + for lm_suffix in tgpr tgconst tg fgpr fgconst fg; do + mkdir -p data/lang_test_bd_${lm_suffix} + cp -r data/lang_bd/* data/lang_test_bd_${lm_suffix}/ || exit 1; + rm -rf data/lang_test_bd_${lm_suffix}/tmp + cp data/lang_nosp_test_bd_${lm_suffix}/G.* data/lang_test_bd_${lm_suffix}/ + done +fi + + +if [ $stage -le 6 ]; then + # From 3b system, now using data/lang as the lang directory (we have now added + # pronunciation and silence probabilities), train another SAT system (tri4b). + + if $train; then + steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \ + data/train_si284 data/lang exp/tri3b exp/tri4b || exit 1; + fi + + if $decode; then + utils/mkgraph.sh data/lang_test_tgpr \ + exp/tri4b exp/tri4b/graph_tgpr || exit 1; + utils/mkgraph.sh data/lang_test_bd_tgpr \ + exp/tri4b exp/tri4b/graph_bd_tgpr || exit 1; + + for data in dev93 eval92; do + nspk=$(wc -l " - echo " e.g.: steps/mixup.sh 20000 data/train_si84 data/lang exp/tri3b exp/tri3b_20k" - echo "main options (for others, see top of script file)" - echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." - echo " --config # config containing options" - echo " --stage # stage to do partial re-run from." - exit 1; -fi - -numgauss=$1 -data=$2 -lang=$3 -srcdir=$4 -dir=$5 - -for f in $data/feats.scp $srcdir/final.mdl $srcdir/final.mat; do - [ ! -f $f ] && echo "mixup_lda_etc.sh: no such file $f" && exit 1; -done - -nj=`cat $srcdir/num_jobs` || exit 1; -sdata=$data/split$nj; - -splice_opts=`cat $srcdir/splice_opts 2>/dev/null` -cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` - -mkdir -p $dir/log -cp $srcdir/splice_opts $dir 2>/dev/null -cp $srcdir/cmvn_opts $dir 2>/dev/null -cp $srcdir/final.mat $dir -echo $nj > $dir/num_jobs -[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; - -utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1; -cp $lang/phones.txt $dir || exit 1; - -cp $srcdir/tree $dir - - -## Set up features. 
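In the deleted steps/mixup.sh below (as in other steps/ scripts), features are set up as a piped rspecifier string rather than materialized on disk. A minimal sketch of that idiom, assuming an LDA+MLLT system in exp/tri2b and data already split into data/train_si84/split10; feat-to-dim simply prints the dimension of the resulting features:

sdata=data/train_si84/split10
feats="ark,s,cs:apply-cmvn --utt2spk=ark:$sdata/1/utt2spk scp:$sdata/1/cmvn.scp scp:$sdata/1/feats.scp ark:- |"
feats="$feats splice-feats --left-context=3 --right-context=3 ark:- ark:- |"
feats="$feats transform-feats exp/tri2b/final.mat ark:- ark:- |"
feat-to-dim "$feats" -    # typically 40 after the LDA+MLLT transform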
-if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi -echo "$0: feature type is $feat_type" - -case $feat_type in - delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; - lda) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" - cp $srcdir/final.mat $dir - ;; - *) echo "Invalid feature type $feat_type" && exit 1; -esac -if [ -f $srcdir/trans.1 ]; then - echo Using transforms from $srcdir; - rm $dir/trans.* 2>/dev/null - ln.pl $srcdir/trans.* $dir # Link those transforms to current directory. - feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |" -else - feats="$sifeats" -fi -## Done setting up features. - -rm $dir/fsts.*.gz 2>/dev/null -ln.pl $srcdir/fsts.*.gz $dir # Link training-graph FSTs to current directory. - -## Mix up old model -if [ $stage -le 0 ]; then - echo Mixing up old model to $numgauss Gaussians -# Note: this script also works for mixing down. - $cmd $dir/log/mixup.log \ - gmm-mixup --mix-up=$numgauss --mix-down=$numgauss \ - $srcdir/final.mdl $srcdir/final.occs $dir/1.mdl || exit 1; -fi -## Done. - -cur_alidir=$srcdir # dir to find alignments. -[ -z "$realign_iters" ] && ln.pl $srcdir/ali.*.gz $dir; # link alignments, if - # we won't be generating them. - -x=1 -while [ $x -le $num_iters ]; do - echo "$0: iteration $x" - if echo $realign_iters | grep -w $x >/dev/null; then - if [ $stage -le $x ]; then - echo "$0: realigning data" - mdl="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $dir/$x.mdl - |" - $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \ - gmm-align-compiled $scale_opts --beam=10 --retry-beam=40 "$mdl" \ - "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \ - "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; - fi - cur_alidir=$dir - fi - if [ $stage -le $x ]; then - echo "$0: accumulating statistics" - $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \ - gmm-acc-stats-ali $dir/$x.mdl "$feats" \ - "ark,s,cs:gunzip -c $cur_alidir/ali.JOB.gz|" $dir/$x.JOB.acc || exit 1; - echo "$0: re-estimating model" - [ "`ls $dir/$x.*.acc | wc -w`" -ne $nj ] && echo "$0: wrong #accs" && exit 1; - $cmd $dir/log/update.$x.log \ - gmm-est --write-occs=$dir/$[$x+1].occs $dir/$x.mdl \ - "gmm-sum-accs - $dir/$x.*.acc |" $dir/$[$x+1].mdl || exit 1; - rm $dir/$x.mdl $dir/$x.*.acc - rm $dir/$x.occs 2>/dev/null - fi - x=$[$x+1] -done - -rm $dir/final.mdl $dir/final.occs 2>/dev/null -ln -s $x.mdl $dir/final.mdl -ln -s $x.occs $dir/final.occs - -if [ -f $dir/trans.1 ]; then - echo "$0: accumulating stats for alignment model." - $cmd JOB=1:$nj $dir/log/acc_alimdl.JOB.log \ - ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \ - gmm-acc-stats-twofeats $dir/$x.mdl "$feats" "$sifeats" \ - ark,s,cs:- $dir/$x.JOB.acc || exit 1; - [ "`ls $dir/$x.*.acc | wc -w`" -ne $nj ] && echo "$0: wrong #accs" && exit 1; - echo "$0: Re-estimating alignment model." 
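# Descriptive note on the step below (hedged): gmm-acc-stats-twofeats above takes the
# alignment posteriors piped in from ali-to-post, computes the Gaussian-level posteriors
# with the speaker-adapted features "$feats", but accumulates the statistics on the
# speaker-independent "$sifeats". gmm-est then re-estimates the model from those stats to
# produce the alignment model ($x.alimdl -> final.alimdl), which can align new speakers
# before any fMLLR transform exists; --remove-low-count-gaussians=false keeps the per-pdf
# Gaussian counts the same as in final.mdl.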
- $cmd $dir/log/est_alimdl.log \ - gmm-est --write-occs=$dir/final.occs --remove-low-count-gaussians=false $dir/$x.mdl \ - "gmm-sum-accs - $dir/$x.*.acc|" $dir/$x.alimdl || exit 1; - rm $dir/$x.*.acc - rm $dir/final.alimdl 2>/dev/null - ln -s $x.alimdl $dir/final.alimdl -fi - -utils/summarize_warnings.pl $dir/log - -echo Done diff --git a/egs/wsj/s5/utils/fix_data_dir.sh b/egs/wsj/s5/utils/fix_data_dir.sh index 0333d628544..bb8efd56ab8 100755 --- a/egs/wsj/s5/utils/fix_data_dir.sh +++ b/egs/wsj/s5/utils/fix_data_dir.sh @@ -22,12 +22,13 @@ mkdir -p $data/.backup [ ! -f $data/utt2spk ] && echo "$0: no such file $data/utt2spk" && exit 1; +set -e -o pipefail -u + tmpdir=$(mktemp -d /tmp/kaldi.XXXX); trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM export LC_ALL=C - function check_sorted { file=$1 sort -k1,1 -u <$file >$file.tmp @@ -54,8 +55,8 @@ function filter_file { cp $file_to_filter ${file_to_filter}.tmp utils/filter_scp.pl $filter ${file_to_filter}.tmp > $file_to_filter if ! cmp ${file_to_filter}.tmp $file_to_filter >&/dev/null; then - length1=`cat ${file_to_filter}.tmp | wc -l` - length2=`cat ${file_to_filter} | wc -l` + length1=$(cat ${file_to_filter}.tmp | wc -l) + length2=$(cat ${file_to_filter} | wc -l) if [ $length1 -ne $length2 ]; then echo "$0: filtered $file_to_filter from $length1 to $length2 lines based on filter $filter." fi @@ -77,7 +78,7 @@ function filter_recordings { exit 1; fi awk '{print $2}' < $data/segments | sort | uniq > $tmpdir/recordings - n1=`cat $tmpdir/recordings | wc -l` + n1=$(cat $tmpdir/recordings | wc -l) [ ! -s $tmpdir/recordings ] && \ echo "Empty list of recordings (bad file $data/segments)?" && exit 1; utils/filter_scp.pl $data/wav.scp $tmpdir/recordings > $tmpdir/recordings.tmp diff --git a/egs/wsj/s5/utils/mkgraph.sh b/egs/wsj/s5/utils/mkgraph.sh index 42204b85e7d..65ff3c3c79d 100755 --- a/egs/wsj/s5/utils/mkgraph.sh +++ b/egs/wsj/s5/utils/mkgraph.sh @@ -75,7 +75,7 @@ fi N=$(tree-info $tree | grep "context-width" | cut -d' ' -f2) || { echo "Error when getting context-width"; exit 1; } P=$(tree-info $tree | grep "central-position" | cut -d' ' -f2) || { echo "Error when getting central-position"; exit 1; } -[[ -f $2/frame_subsampling_factor && $loopscale != 1.0 ]] && \ +[[ -f $2/frame_subsampling_factor && "$loopscale" == "0.1" ]] && \ echo "$0: WARNING: chain models need '--self-loop-scale 1.0'"; mkdir -p $lang/tmp diff --git a/egs/wsj/s5/utils/validate_data_dir.sh b/egs/wsj/s5/utils/validate_data_dir.sh index 49c929207b9..58e51a75aef 100755 --- a/egs/wsj/s5/utils/validate_data_dir.sh +++ b/egs/wsj/s5/utils/validate_data_dir.sh @@ -132,7 +132,7 @@ if [ -f $data/wav.scp ]; then check_sorted_and_uniq $data/segments # We have a segments file -> interpret wav file as "recording-ids" not utterance-ids. ! cat $data/segments | \ - awk '{if (NF != 4 || ($4 <= $3 && $4 != -1)) { print "Bad line in segments file", $0; exit(1); }}' && \ + awk '{if (NF != 4 || $4 <= $3) { print "Bad line in segments file", $0; exit(1); }}' && \ echo "$0: badly formatted segments file" && exit 1; segments_len=`cat $data/segments | wc -l`
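The segments check in the hunk above was tightened: an end time of -1 is no longer accepted, so every segment must have a strictly positive duration. A minimal sketch of the check in isolation, with hypothetical utterance and recording ids (the second line is rejected because its end time equals its start time):

# segments format: <utterance-id> <recording-id> <segment-begin> <segment-end>
printf '%s\n' \
  "utt1 rec1 0.00 4.50" \
  "utt2 rec1 4.50 4.50" > /tmp/segments.demo
awk '{if (NF != 4 || $4 <= $3) { print "Bad line in segments file", $0; exit(1); }}' \
  /tmp/segments.demo || echo "validation failed (utt2 has end <= start)"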