diff --git a/egs/tedlium/s5_r2/local/chain/run_tdnn_d.sh b/egs/tedlium/s5_r2/local/chain/run_tdnn_d.sh deleted file mode 100755 index 9e795316352..00000000000 --- a/egs/tedlium/s5_r2/local/chain/run_tdnn_d.sh +++ /dev/null @@ -1,198 +0,0 @@ -#!/bin/bash - -# by default, with cleanup: -# local/chain/run_tdnn.sh - -# without cleanup: -# local/chain/run_tdnn.sh --train-set train --gmm tri3 --nnet3-affix "" & - -# note, if you have already run the corresponding non-chain nnet3 system -# (local/nnet3/run_tdnn.sh), you may want to run with --stage 14. - -set -e -o pipefail - -# First the options that are passed through to run_ivector_common.sh -# (some of which are also used in this script directly). -stage=0 -nj=30 -decode_nj=30 -min_seg_len=1.55 -train_set=train_cleaned -gmm=tri3_cleaned # the gmm for the target data -num_threads_ubm=32 -nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned - -# The rest are configs specific to this script. Most of the parameters -# are just hardcoded at this level, in the commands below. -train_stage=-10 -tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. -tdnn_affix=d #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. -common_egs_dir=exp/chain_cleaned/tdnn_sp_bi/egs # you can set this to use previously dumped egs. - -# End configuration section. -echo "$0 $@" # Print the command line for logging - -. cmd.sh -. ./path.sh -. ./utils/parse_options.sh - - -if ! cuda-compiled; then - cat <data/lang_chain/topo - fi -fi - -if [ $stage -le 15 ]; then - # Get the alignments as lattices (gives the chain training more freedom). - # use the same num-jobs as the alignments - steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ - data/lang $gmm_dir $lat_dir - rm $lat_dir/fsts.*.gz # save space -fi - -if [ $stage -le 16 ]; then - # Build a tree using our new topology. We know we have alignments for the - # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use - # those. - if [ -f $tree_dir/final.mdl ]; then - echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." - exit 1; - fi - steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ - --context-opts "--context-width=2 --central-position=1" \ - --leftmost-questions-truncate -1 \ - --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir -fi - -if [ $stage -le 17 ]; then - mkdir -p $dir - - echo "$0: creating neural net configs"; - - steps/nnet3/tdnn/make_configs.py \ - --self-repair-scale-nonlinearity 0.00001 \ - --feat-dir data/${train_set}_sp_hires_comb \ - --ivector-dir $train_ivector_dir \ - --tree-dir $tree_dir \ - --relu-dim 550 \ - --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ - --use-presoftmax-prior-scale false \ - --xent-regularize 0.1 \ - --xent-separate-forward-affine true \ - --include-log-softmax false \ - --final-layer-normalize-target 1.0 \ - $dir/configs || exit 1; -fi - -if [ $stage -le 18 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage - fi - - steps/nnet3/chain/train.py --stage $train_stage \ - --cmd "$decode_cmd" \ - --feat.online-ivector-dir $train_ivector_dir \ - --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ - --chain.xent-regularize 0.1 \ - --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.00005 \ - --chain.apply-deriv-weights false \ - --chain.lm-opts="--num-extra-lm-states=2000" \ - --egs.dir "$common_egs_dir" \ - --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width 150 \ - --trainer.num-chunk-per-minibatch 128 \ - --trainer.frames-per-iter 1500000 \ - --trainer.num-epochs 4 \ - --trainer.optimization.num-jobs-initial 2 \ - --trainer.optimization.num-jobs-final 12 \ - --trainer.optimization.initial-effective-lrate 0.001 \ - --trainer.optimization.final-effective-lrate 0.0001 \ - --trainer.max-param-change 2.0 \ - --cleanup.remove-egs true \ - --feat-dir $train_data_dir \ - --tree-dir $tree_dir \ - --lat-dir $lat_dir \ - --dir $dir -fi - - - -if [ $stage -le 19 ]; then - # Note: it might appear that this data/lang_chain directory is mismatched, and it is as - # far as the 'topo' is concerned, but this script doesn't read the 'topo' from - # the lang directory. - utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang $dir $dir/graph -fi - -if [ $stage -le 20 ]; then - rm $dir/.error 2>/dev/null || true - for dset in dev test; do - ( - steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ - --acwt 1.0 --post-decode-acwt 10.0 \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ - --scoring-opts "--min-lmwt 5 " \ - $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ - data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 - ) || touch $dir/.error & - done - wait - if [ -f $dir/.error ]; then - echo "$0: something went wrong in decoding" - exit 1 - fi -fi -exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e.sh index 6704f9d299e..e56946c1b54 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e.sh @@ -259,14 +259,14 @@ fi if [ $stage -le 18 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage fi steps/nnet3/chain/train.py --stage $train_stage \ --cmd "$decode_cmd" \ --feat.online-ivector-dir $train_ivector_dir \ --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ - --chain.xent-regularize 0.1 \ + --chain.xent-regularize $xent_regularize \ --chain.leaky-hmm-coefficient 0.1 \ --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ diff --git a/egs/tedlium/s5_r2/local/nnet3/compare_wer.sh b/egs/tedlium/s5_r2/local/nnet3/compare_wer.sh index 3e14a4efc55..da0bb728e69 100755 --- a/egs/tedlium/s5_r2/local/nnet3/compare_wer.sh +++ b/egs/tedlium/s5_r2/local/nnet3/compare_wer.sh @@ -1,12 +1,20 @@ #!/bin/bash # this script is used for comparing decoding results between systems. -# e.g. 
local/nnet3/compare_wer_general.sh exp/nnet3_cleaned/tdnn_{c,d}_sp +# e.g. local/nnet3/compare_wer.sh exp/nnet3_cleaned/tdnn_{c,d}_sp # For use with discriminatively trained systems you specify the epochs after a colon: # for instance, # local/nnet3/compare_wer.sh exp/nnet3_cleaned/tdnn_c_sp exp/nnet3_cleaned/tdnn_c_sp_smbr:{1,2,3} +if [ $# == 0 ]; then + echo "Usage: $0: [--looped] [--online] [ ... ]" + echo "e.g.: $0 exp/nnet3_cleaned/tdnn_{b,c}_sp" + echo "or (with epoch numbers for discriminative training):" + echo "$0 exp/nnet3_cleaned/tdnn_b_sp_disc:{1,2,3}" + exit 1 +fi + echo "# $0 $*" include_looped=false @@ -14,6 +22,11 @@ if [ "$1" == "--looped" ]; then include_looped=true shift fi +include_online=false +if [ "$1" == "--online" ]; then + include_online=true + shift +fi @@ -71,6 +84,16 @@ for n in 0 1 2 3; do done echo fi + if $include_online; then + echo -n "# [online:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(dev${epoch_infix} dev${epoch_infix}_rescore test${epoch_infix} test${epoch_infix}_rescore) + wer=$(grep Sum ${dirname}_online/decode_${decode_names[$n]}/score*/*ys | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi done diff --git a/egs/tedlium/s5_r2/local/nnet3/run_ivector_common.sh b/egs/tedlium/s5_r2/local/nnet3/run_ivector_common.sh index b4f2dd3e3b4..16093616b05 100755 --- a/egs/tedlium/s5_r2/local/nnet3/run_ivector_common.sh +++ b/egs/tedlium/s5_r2/local/nnet3/run_ivector_common.sh @@ -21,9 +21,9 @@ num_threads_ubm=32 nnet3_affix=_cleaned # affix for exp/nnet3 directory to put iVector stuff in, so it # becomes exp/nnet3_cleaned or whatever. -. cmd.sh +. ./cmd.sh . ./path.sh -. ./utils/parse_options.sh +. utils/parse_options.sh gmm_dir=exp/${gmm} diff --git a/egs/tedlium/s5_r2/local/nnet3/run_tdnn.sh b/egs/tedlium/s5_r2/local/nnet3/run_tdnn.sh new file mode 120000 index 00000000000..61f8f499182 --- /dev/null +++ b/egs/tedlium/s5_r2/local/nnet3/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1b.sh \ No newline at end of file diff --git a/egs/tedlium/s5_r2/local/nnet3/run_tdnn_lstm_lfr.sh b/egs/tedlium/s5_r2/local/nnet3/run_tdnn_lstm_lfr.sh new file mode 120000 index 00000000000..8e03c924bc1 --- /dev/null +++ b/egs/tedlium/s5_r2/local/nnet3/run_tdnn_lstm_lfr.sh @@ -0,0 +1 @@ +tuning/run_tdnn_lstm_lfr_1a.sh \ No newline at end of file diff --git a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_1b.sh b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_1b.sh index 379c8040a27..f6e4fb71b75 100755 --- a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_1b.sh +++ b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_1b.sh @@ -1,5 +1,8 @@ #!/bin/bash + +# 1b is as 1a but uses xconfigs. + # This is the standard "tdnn" system, built in nnet3; this script # is the version that's meant to run with data-cleanup, that doesn't # support parallel alignments. diff --git a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_1c.sh b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_1c.sh new file mode 100755 index 00000000000..35789342ffb --- /dev/null +++ b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_1c.sh @@ -0,0 +1,186 @@ +#!/bin/bash + +# 1c is as 1b but using more 'chain-like' splicing and slightly +# smaller dim. Not better; maybe slightly worse. + +# note: the num-params is almost the same. 
+# steps/info/nnet3_dir_info.pl exp/nnet3_cleaned/tdnn1{b,c}_sp +# exp/nnet3_cleaned/tdnn1b_sp: num-iters=240 nj=2..12 num-params=10.3M dim=40+100->4187 combine=-0.95->-0.95 loglike:train/valid[159,239,combined]=(-1.01,-0.95,-0.94/-1.18,-1.16,-1.15) accuracy:train/valid[159,239,combined]=(0.71,0.72,0.72/0.67,0.68,0.68) +# exp/nnet3_cleaned/tdnn1c_sp: num-iters=240 nj=2..12 num-params=10.1M dim=40+100->4187 combine=-1.16->-1.15 loglike:train/valid[159,239,combined]=(-1.22,-1.16,-1.15/-1.41,-1.38,-1.38) accuracy:train/valid[159,239,combined]=(0.66,0.67,0.68/0.62,0.63,0.63) + +# local/nnet3/compare_wer.sh exp/nnet3_cleaned/tdnn1{b,c}_sp +# System tdnn1b_sp tdnn1c_sp +# WER on dev(orig) 11.7 11.9 +# WER on dev(rescored) 10.9 11.1 +# WER on test(orig) 11.7 11.8 +# WER on test(rescored) 11.0 11.2 +# Final train prob -0.9416 -1.1505 +# Final valid prob -1.1496 -1.3805 +# Final train acc 0.7241 0.6756 +# Final valid acc 0.6788 0.6255 + +# This is the standard "tdnn" system, built in nnet3; this script +# is the version that's meant to run with data-cleanup, that doesn't +# support parallel alignments. + + +# steps/info/nnet3_dir_info.pl exp/nnet3_cleaned/tdnn1b_sp +# exp/nnet3_cleaned/tdnn1b_sp: num-iters=240 nj=2..12 num-params=10.3M dim=40+100->4187 combine=-0.95->-0.95 loglike:train/valid[159,239,combined]=(-1.01,-0.95,-0.94/-1.18,-1.16,-1.15) accuracy:train/valid[159,239,combined]=(0.71,0.72,0.72/0.67,0.68,0.68) + +# local/nnet3/compare_wer.sh exp/nnet3_cleaned/tdnn1a_sp exp/nnet3_cleaned/tdnn1b_sp +# System tdnn1a_sp tdnn1b_sp +# WER on dev(orig) 11.9 11.7 +# WER on dev(rescored) 11.2 10.9 +# WER on test(orig) 11.6 11.7 +# WER on test(rescored) 11.0 11.0 +# Final train prob -0.9255 -0.9416 +# Final valid prob -1.1842 -1.1496 +# Final train acc 0.7245 0.7241 +# Final valid acc 0.6771 0.6788 + + +# by default, with cleanup: +# local/nnet3/run_tdnn.sh + +# without cleanup: +# local/nnet3/run_tdnn.sh --train-set train --gmm tri3 --nnet3-affix "" & + + +set -e -o pipefail -u + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +train_set=train_cleaned +gmm=tri3_cleaned # this is the source gmm-dir for the data-type of interest; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for exp dirs, e.g. _cleaned +tdnn_affix=1c #affix for TDNN directory e.g. "a" or "b", in case we change the configuration. + +# Options which are not passed through to run_ivector_common.sh +train_stage=-10 +remove_egs=true +srand=0 +reporting_email=dpovey@gmail.com +# set common_egs_dir to use previously dumped egs. +common_egs_dir= + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=750 + relu-renorm-layer name=tdnn2 dim=750 input=Append(-1,0,1) + relu-renorm-layer name=tdnn3 dim=750 input=Append(-1,0,1) + relu-renorm-layer name=tdnn4 dim=750 input=Append(-3,0,3) + relu-renorm-layer name=tdnn5 dim=750 input=Append(-6,-3,0) + output-layer name=output dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=3 \ + --trainer.samples-per-iter=400000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=12 \ + --trainer.optimization.initial-effective-lrate=0.0015 \ + --trainer.optimization.final-effective-lrate=0.00015 \ + --trainer.optimization.minibatch-size=256,128 \ + --egs.dir="$common_egs_dir" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --feat-dir=$train_data_dir \ + --ali-dir=$ali_dir \ + --lang=data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # note: for TDNNs, looped decoding gives exactly the same results + # as regular decoding, so there is no point in testing it separately. + # We use regular decoding because it supports multi-threaded (we just + # didn't create the binary for that, for looped decoding, so far). + rm $dir/.error || true 2>/dev/null + for dset in dev test; do + ( + steps/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + ${graph_dir} data/${dset}_hires ${dir}/decode_${dset} || exit 1 + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + + +exit 0; diff --git a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lfr_1a.sh b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lfr_1a.sh new file mode 100755 index 00000000000..666c2f1bb31 --- /dev/null +++ b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lfr_1a.sh @@ -0,0 +1,200 @@ +#!/bin/bash + + +# run_tdnn_lfr_1a.sh is similar in configuration to run_tdnn_1c.sh, but it's a +# low-frame-rate system (see egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1c.sh +# for an example of such a system). 
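# A rough sketch of what "low-frame-rate" means in practice in this script
# (assuming the factor of 3 used below): the network emits outputs at 1/3 of
# the input frame rate, which is why later stages write
# "echo 3 >$dir/frame_subsampling_factor", build the graph with
# --self-loop-scale 0.333, and decode with --acwt 0.333 --post-decode-acwt 3.0
# so that the lattice scores stay on the usual scale.  A quick way to check
# whether an existing model directory is an LFR one (the path below is only
# an example):
#
#   cat exp/nnet3_cleaned/tdnn_lfr1a_sp/frame_subsampling_factor   # expect "3"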
+ + +# by default, with cleanup: +# local/nnet3/run_tdnn_lfr.sh + +# without cleanup: +# local/nnet3/run_tdnn_lfr.sh --train-set train --gmm tri3 --nnet3-affix "" & + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +train_set=train_cleaned +gmm=tri3_cleaned # this is the source gmm-dir for the data-type of interest; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for exp dirs, e.g. _cleaned +tdnn_affix=1a #affix for TDNN directory e.g. "a" or "b", in case we change the configuration. + +# Options which are not passed through to run_ivector_common.sh +train_stage=-10 +remove_egs=true +srand=0 +reporting_email=dpovey@gmail.com +# set common_egs_dir to use previously dumped egs. +common_egs_dir= + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 13 ]; then + # Build a tree using our new topology and a reduced sampling rate. + # We use 4000 leaves, which is a little less than the number used + # in the baseline GMM system (5k) in this setup, since generally + # LFR systems do best with somewhat fewer leaves. + # + # To get the stats to build the tree this script only uses every third frame, + # but it dumps converted alignments that essentially have 3 different + # frame-shifted versions of the alignment interpolated together; these can be + # used without modification in getting labels for training. + steps/nnet3/chain/build_tree.sh \ + --repeat-frames true --frame-subsampling-factor 3 \ + --cmd "$train_cmd" 4000 data/${train_set}_sp_comb \ + $lang $ali_dir $treedir +fi + +if [ $stage -le 14 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=750 + relu-renorm-layer name=tdnn2 dim=750 input=Append(-1,0,1) + relu-renorm-layer name=tdnn3 dim=750 input=Append(-1,0,1) + relu-renorm-layer name=tdnn4 dim=750 input=Append(-3,0,3) + relu-renorm-layer name=tdnn5 dim=750 input=Append(-6,-3,0) + output-layer name=output dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + + +if [ $stage -le 15 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=3 \ + --trainer.samples-per-iter=400000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=12 \ + --trainer.optimization.initial-effective-lrate=0.0015 \ + --trainer.optimization.final-effective-lrate=0.00015 \ + --trainer.optimization.minibatch-size=256,128 \ + --egs.dir="$common_egs_dir" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --feat-dir=$train_data_dir \ + --ali-dir=$treedir \ + --lang=$lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; + echo 3 >$dir/frame_subsampling_factor +fi + +if [ $stage -le 16 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/lang/check_phones_compatible.sh data/lang/phones.txt $lang/phones.txt + utils/mkgraph.sh --self-loop-scale 0.333 data/lang $dir $dir/graph +fi + + +if [ $stage -le 17 ]; then + # note: for TDNNs, looped decoding gives exactly the same results + # as regular decoding, so there is no point in testing it separately. + # We use regular decoding because it supports multi-threaded (we just + # didn't create the binary for that, for looped decoding, so far). 
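  # The loop below decodes the dev and test sets in parallel subshells; a
  # failure in either one creates $dir/.error, which is checked after 'wait'.
  # The same pattern in isolation looks roughly like this ('run_one_decode'
  # is just a placeholder for the actual decoding command):
  #
  #   rm -f ./.error
  #   for dset in dev test; do
  #     ( run_one_decode $dset || exit 1 ) || touch ./.error &
  #   done
  #   wait
  #   if [ -f ./.error ]; then echo "$0: decoding failed"; exit 1; fi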
+ rm $dir/.error || true 2>/dev/null + for dset in dev test; do + ( + steps/nnet3/decode.sh --acwt 0.333 --post-decode-acwt 3.0 --nj $decode_nj \ + --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + $dir/graph data/${dset}_hires ${dir}/decode_${dset} || exit 1 + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + + +exit 0; diff --git a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1a.sh b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1a.sh index f1502dd2761..28c45836cf7 100755 --- a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1a.sh +++ b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1a.sh @@ -9,15 +9,16 @@ # System tdnn_lstm1a_sp tdnn_lstm1b_sp # WER on dev(orig) 11.0 11.0 # [looped:] 11.0 11.1 -# WER on dev(rescored) 10.3 10.3 +# WER on dev(rescored) 10.4 10.3 # [looped:] 10.3 10.5 -# WER on test(orig) 10.8 10.6 +# WER on test(orig) 10.7 10.6 # [looped:] 10.7 10.7 # WER on test(rescored) 10.1 9.9 # [looped:] 10.0 10.0 -# Final train prob -0.68810.7954-0.68970.7946 -# Final valid prob -0.77960.7611-0.79890.7582 - +# Final train prob -0.6881 -0.6897 +# Final valid prob -0.7796 -0.7989 +# Final train acc 0.7954 0.7946 +# Final valid acc 0.7611 0.7582 # by default, with cleanup: # local/nnet3/run_tdnn_lstm.sh @@ -53,19 +54,11 @@ label_delay=5 chunk_width=40,30,20 chunk_left_context=40 chunk_right_context=0 -# decode chunk-size options (for non-looped decoding) -extra_left_context=50 -extra_right_context=0 # training options srand=0 remove_egs=true -#decode options -extra_left_context= -extra_right_context= -frames_per_chunk= - . ./cmd.sh . ./path.sh . 
./utils/parse_options.sh @@ -91,8 +84,7 @@ local/nnet3/run_ivector_common.sh --stage $stage \ gmm_dir=exp/${gmm} graph_dir=$gmm_dir/graph ali_dir=exp/${gmm}_ali_${train_set}_sp_comb -dir=exp/nnet3${nnet3_affix}/tdnn_lstm${affix} -dir=${dir}_sp +dir=exp/nnet3${nnet3_affix}/tdnn_lstm${affix}_sp train_data_dir=data/${train_set}_sp_hires_comb train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires_comb @@ -175,15 +167,14 @@ if [ $stage -le 13 ]; then fi if [ $stage -le 14 ]; then - [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; - [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; - [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) rm $dir/.error 2>/dev/null || true for dset in dev test; do ( steps/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ - --extra-left-context $extra_left_context \ - --extra-right-context $extra_right_context \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --frames-per-chunk $frames_per_chunk \ --extra-left-context-initial 0 --extra-right-context-final 0 \ --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ ${graph_dir} data/${dset}_hires ${dir}/decode_${dset} || exit 1 diff --git a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1c.sh b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1c.sh index 1d3b12f2697..bc9a717419d 100755 --- a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1c.sh +++ b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1c.sh @@ -8,6 +8,8 @@ # local/chain/tuning/run_tdnn_lstm_1e.sh, but a non-chain nnet3 system, and # with 1.5 times larger hidden dimensions. +# exp/nnet3_cleaned/tdnn_lstm1c_sp: num-iters=246 nj=3..15 num-params=18.7M dim=40+100->4187 combine=-0.67->-0.66 loglike:train/valid[163,245,combined]=(-0.71,-0.63,-0.60/-0.92,-0.88,-0.85) accuracy:train/valid[163,245,combined]=(0.77,0.79,0.80/0.74,0.75,0.75) + # local/nnet3/compare_wer.sh --looped exp/nnet3_cleaned/tdnn_lstm1a_sp exp/nnet3_cleaned/tdnn_lstm1b_sp exp/nnet3_cleaned/tdnn_lstm1c_sp # System tdnn_lstm1a_sp tdnn_lstm1b_sp tdnn_lstm1c_sp # WER on dev(orig) 11.0 11.0 11.0 diff --git a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_lfr_1a.sh b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_lfr_1a.sh new file mode 100755 index 00000000000..3e8509bf4ac --- /dev/null +++ b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_lfr_1a.sh @@ -0,0 +1,310 @@ +#!/bin/bash + + +# run_tdnn_lstm_lfr_1a.sh is like run_tdnn_lstm_1a.sh, but +# it's a low-frame-rate system. (however, using num-jobs-final=10, +# not 15, which was very high). + + +# Generally the WER is the same or slightly better than before. 
+ +# local/nnet3/compare_wer.sh --looped --online exp/nnet3_cleaned/tdnn_lstm1c_sp exp/nnet3_cleaned/tdnn_lstm_lfr1a_sp 2>/dev/null +# local/nnet3/compare_wer.sh --looped --online exp/nnet3_cleaned/tdnn_lstm1c_sp exp/nnet3_cleaned/tdnn_lstm_lfr1a_sp +# System tdnn_lstm1c_sp tdnn_lstm_lfr1a_sp +# WER on dev(orig) 11.0 10.9 +# [looped:] 10.9 10.9 +# [online:] 10.8 +# WER on dev(rescored) 10.4 10.3 +# [looped:] 10.3 10.3 +# [online:] 10.3 +# WER on test(orig) 10.8 10.7 +# [looped:] 10.7 10.7 +# [online:] 10.7 +# WER on test(rescored) 10.1 10.2 +# [looped:] 10.1 10.1 +# [online:] 10.2 +# Final train prob -0.5998 -0.5437 +# Final valid prob -0.8542 -0.7286 +# Final train acc 0.7988 0.8343 +# Final valid acc 0.7521 0.7888 + + +# by default, with cleanup: +# local/nnet3/run_tdnn_lstm_lfr.sh + +# without cleanup: +# local/nnet3/run_tdnn_lstm_lfr.sh --train-set train --gmm tri3 --nnet3-affix "" & + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +train_set=train_cleaned +gmm=tri3_cleaned # this is the source gmm-dir for the data-type of interest; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for exp dirs, e.g. _cleaned + +# Options which are not passed through to run_ivector_common.sh +affix=1a +common_egs_dir= +reporting_email= + +# LSTM options +train_stage=-10 +label_delay=5 + +# training chunk-options +chunk_width=40,30,20 +chunk_left_context=40 +chunk_right_context=0 +# decode chunk-size options (for non-looped decoding) +extra_left_context=50 +extra_right_context=0 + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 13 ]; then + # Build a tree using our new topology and a reduced sampling rate. + # We use 4000 leaves, which is a little less than the number used + # in the baseline GMM system (5k) in this setup, since generally + # LFR systems do best with somewhat fewer leaves. + # + # To get the stats to build the tree this script only uses every third frame, + # but it dumps converted alignments that essentially have 3 different + # frame-shifted versions of the alignment interpolated together; these can be + # used without modification in getting labels for training. 
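  # Put differently (a sketch, assuming the factor of 3 above): the tree
  # statistics are accumulated from every third frame, but the converted
  # ali.*.gz files written to the tree directory still carry one label per
  # original frame, which is why train_rnn.py below can be given
  # --ali-dir=$treedir without any extra conversion step.  One way to eyeball
  # a converted alignment (job number 1 is just an example):
  #
  #   copy-int-vector "ark:gunzip -c $treedir/ali.1.gz |" ark,t:- | head -n 1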
+ steps/nnet3/chain/build_tree.sh \ + --repeat-frames true --frame-subsampling-factor 3 \ + --cmd "$train_cmd" 4000 data/${train_set}_sp_comb \ + $lang $ali_dir $treedir +fi + + +if [ $stage -le 14 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=768 + relu-renorm-layer name=tdnn2 dim=768 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=768 recurrent-projection-dim=192 non-recurrent-projection-dim=192 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn3 dim=768 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=768 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=768 recurrent-projection-dim=192 non-recurrent-projection-dim=192 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn5 dim=768 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=768 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=768 recurrent-projection-dim=192 non-recurrent-projection-dim=192 decay-time=20 delay=-3 + + output-layer name=output input=lstm3 output-delay=$label_delay dim=$num_targets max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 15 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_rnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=6 \ + --trainer.deriv-truncate-margin=10 \ + --trainer.samples-per-iter=20000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=10 \ + --trainer.optimization.initial-effective-lrate=0.0003 \ + --trainer.optimization.final-effective-lrate=0.00003 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.rnn.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.5 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --feat-dir=$train_data_dir \ + --ali-dir=$treedir \ + --lang=$lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; + echo 3 >$dir/frame_subsampling_factor +fi + +if [ $stage -le 16 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). 
So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/lang/check_phones_compatible.sh data/lang/phones.txt $lang/phones.txt + utils/mkgraph.sh --self-loop-scale 0.333 data/lang $dir $dir/graph +fi + +if [ $stage -le 17 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh \ + --acwt 0.333 --post-decode-acwt 3.0 --nj $decode_nj \ + --cmd "$decode_cmd" --num-threads 4 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --frames-per-chunk $frames_per_chunk \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + $dir/graph data/${dset}_hires ${dir}/decode_${dset} || exit 1 + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + + +if [ $stage -le 18 ]; then + # 'looped' decoding. + # note: you should NOT do this decoding step for setups that have bidirectional + # recurrence, like BLSTMs-- it doesn't make sense and will give bad results. + # we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh \ + --acwt 0.333 --post-decode-acwt 3.0 \ + --nj $decode_nj --cmd "$decode_cmd" \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + +if $test_online_decoding && [ $stage -le 19 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3${nnet3_affix}/extractor ${dir} ${dir}_online + + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + # note: we just give it "$dset" as it only uses the wav.scp, the + # feature type does not matter. 
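      # The WERs written to ${dir}_online/decode_${dset} by the commands below
      # are what the new --online option of local/nnet3/compare_wer.sh (added
      # elsewhere in this patch) picks up; the comparison quoted at the top of
      # this script, for example, comes from:
      #
      #   local/nnet3/compare_wer.sh --looped --online exp/nnet3_cleaned/tdnn_lstm1c_sp exp/nnet3_cleaned/tdnn_lstm_lfr1a_sp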
+ + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 0.333 --post-decode-acwt 3.0 \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset} ${dir}_online/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}_online/decode_${dset} ${dir}_online/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + + +exit 0; diff --git a/egs/wsj/s5/RESULTS b/egs/wsj/s5/RESULTS index acff4f9d7fe..e6732d21074 100644 --- a/egs/wsj/s5/RESULTS +++ b/egs/wsj/s5/RESULTS @@ -1,8 +1,15 @@ #!/bin/bash -# this RESULTS file was obtained by Haihua Xu in July 2013. - -for x in exp/*/decode*; do [ -d $x ] && [[ $x =~ "$1" ]] && grep WER $x/wer_* | utils/best_wer.sh; done +# this RESULTS file was obtained by Dan Povey in Feb 2017, after +# a rewrite of the run.sh file. +# To see results from the scripts local/nnet3/ and local/chain/, +# look at the top of those files, we don't put those in the +# RESULTS file. + +for dir in exp/*; do + steps/info/gmm_dir_info.pl $dir + for x in $dir/decode*dev93* $dir/decode*eval92*; do [ -d $x ] && [[ $x =~ "$1" ]] && grep WER $x/wer_* | utils/best_wer.sh; done +done exit 0 # Use caution when comparing these results with other published results. @@ -13,107 +20,76 @@ exit 0 # in which we only test on utterances that are in either a 5k or 20k subset # of the vocabulary. -# The following results are updated with LDA+MLLT to use 7, not 9 frames of context, -# and also increased the learning rate for the "indirect" fMMI. - # monophone, deltas, trained on the 2k shortest utterances from the si84 data. -%WER 35.39 [ 2914 / 8234, 284 ins, 467 del, 2163 sub ] exp/mono0a/decode_tgpr_dev93/wer_10 -%WER 25.78 [ 1455 / 5643, 142 ins, 184 del, 1129 sub ] exp/mono0a/decode_tgpr_eval92/wer_9 +exp/mono0a: nj=10 align prob=-95.82 over 2.36h [retry=0.4%, fail=0.0%] states=132 gauss=973 +%WER 34.33 [ 2827 / 8234, 266 ins, 457 del, 2104 sub ] exp/mono0a/decode_nosp_tgpr_dev93/wer_10_0.0 +%WER 25.13 [ 1418 / 5643, 138 ins, 192 del, 1088 sub ] exp/mono0a/decode_nosp_tgpr_eval92/wer_10_0.0 + + # first triphone build. Built on half of SI-84. -%WER 20.00 [ 1647 / 8234, 257 ins, 197 del, 1193 sub ] exp/tri1/decode_tgpr_dev93/wer_17 -%WER 13.04 [ 736 / 5643, 137 ins, 61 del, 538 sub ] exp/tri1/decode_tgpr_eval92/wer_14 +exp/tri1: nj=10 align prob=-93.75 over 7.38h [retry=0.4%, fail=0.0%] states=1567 gauss=10025 tree-impr=5.06 +%WER 19.40 [ 1597 / 8234, 247 ins, 199 del, 1151 sub ] exp/tri1/decode_nosp_tgpr_dev93/wer_14_0.5 +%WER 12.76 [ 720 / 5643, 110 ins, 89 del, 521 sub ] exp/tri1/decode_nosp_tgpr_eval92/wer_14_1.0 -# the same, rescored with full trigram model [not pruned.] Note: the tg{1,2,3,4} are +# the above, rescored with full trigram model [not pruned.] Note: the tg{1,2,3,4} are # different rescoring methods. They all give about the same results. Note: 3 and 4 give # the "correct" LM scores. -%WER 18.87 [ 1554 / 8234, 295 ins, 136 del, 1123 sub ] exp/tri1/decode_tgpr_dev93_tg1/wer_14 -%WER 18.87 [ 1554 / 8234, 295 ins, 136 del, 1123 sub ] exp/tri1/decode_tgpr_dev93_tg2/wer_14 -%WER 18.75 [ 1544 / 8234, 266 ins, 152 del, 1126 sub ] exp/tri1/decode_tgpr_dev93_tg3/wer_15 -%WER 18.76 [ 1545 / 8234, 266 ins, 152 del, 1127 sub ] exp/tri1/decode_tgpr_dev93_tg4/wer_15 - -# tri2a is delta+delta-delta features. 
-%WER 17.93 [ 1476 / 8234, 256 ins, 161 del, 1059 sub ] exp/tri2a/decode_tgpr_dev93/wer_16 -%WER 12.42 [ 701 / 5643, 132 ins, 64 del, 505 sub ] exp/tri2a/decode_tgpr_eval92/wer_15 -# just demonstrates how to do decoding constrained by lattices. -%WER 16.76 [ 1380 / 8234, 275 ins, 132 del, 973 sub ] exp/tri2a/decode_tgpr_dev93_fromlats/wer_16 - -# This is an LDA+MLLT system. -%WER 16.43 [ 1353 / 8234, 241 ins, 162 del, 950 sub ] exp/tri2b/decode_tgpr_dev93/wer_16 -%WER 10.69 [ 603 / 5643, 154 ins, 47 del, 402 sub ] exp/tri2b/decode_tgpr_eval92/wer_14 - -# rescoring the lattices with trigram. -%WER 15.29 [ 1252 / 8191, 219 ins, 153 del, 880 sub ] [PARTIAL] exp/tri2b/decode_tgpr_dev93_tg/wer_18 -# using the "biglm" decoding method to avoid the lattice rescoring step [not faster though.] -%WER 15.31 [ 1261 / 8234, 227 ins, 158 del, 876 sub ] exp/tri2b/decode_tgpr_dev93_tg_biglm/wer_18 -# using a Minimum Bayes Risk decoding method on top of the _tg lattices. -%WER 15.15 [ 1241 / 8191, 221 ins, 155 del, 865 sub ] [PARTIAL] exp/tri2b/decode_tgpr_dev93_tg_mbr/wer_18 - -# fMMI, default learning rate (0.001) - -%WER 15.19 [ 1251 / 8234, 213 ins, 148 del, 890 sub ] exp/tri2b_fmmi_b0.1/decode_tgpr_dev93_it3/wer_15 -%WER 15.14 [ 1247 / 8234, 228 ins, 138 del, 881 sub ] exp/tri2b_fmmi_b0.1/decode_tgpr_dev93_it4/wer_14 -%WER 15.06 [ 1240 / 8234, 211 ins, 152 del, 877 sub ] exp/tri2b_fmmi_b0.1/decode_tgpr_dev93_it5/wer_15 -%WER 15.01 [ 1236 / 8234, 206 ins, 154 del, 876 sub ] exp/tri2b_fmmi_b0.1/decode_tgpr_dev93_it6/wer_15 -%WER 14.99 [ 1234 / 8234, 210 ins, 159 del, 865 sub ] exp/tri2b_fmmi_b0.1/decode_tgpr_dev93_it7/wer_15 -%WER 15.23 [ 1254 / 8234, 200 ins, 184 del, 870 sub ] exp/tri2b_fmmi_b0.1/decode_tgpr_dev93_it8/wer_16 - -%WER 15.55 [ 1280 / 8234, 234 ins, 151 del, 895 sub ] exp/tri2b_fmmi_b0.1_lr0.005/decode_tgpr_dev93_it3/wer_15 -%WER 15.63 [ 1287 / 8234, 242 ins, 150 del, 895 sub ] exp/tri2b_fmmi_b0.1_lr0.005/decode_tgpr_dev93_it4/wer_15 -%WER 15.30 [ 1260 / 8234, 224 ins, 143 del, 893 sub ] exp/tri2b_fmmi_b0.1_lr0.005/decode_tgpr_dev93_it5/wer_15 -%WER 15.34 [ 1263 / 8234, 216 ins, 156 del, 891 sub ] exp/tri2b_fmmi_b0.1_lr0.005/decode_tgpr_dev93_it6/wer_16 -%WER 15.34 [ 1263 / 8234, 242 ins, 139 del, 882 sub ] exp/tri2b_fmmi_b0.1_lr0.005/decode_tgpr_dev93_it7/wer_14 -%WER 15.30 [ 1260 / 8234, 245 ins, 134 del, 881 sub ] exp/tri2b_fmmi_b0.1_lr0.005/decode_tgpr_dev93_it8/wer_13 - -%WER 15.21 [ 1252 / 8234, 218 ins, 148 del, 886 sub ] exp/tri2b_fmmi_indirect_b0.1/decode_tgpr_dev93_it3/wer_15 -%WER 15.16 [ 1248 / 8234, 205 ins, 159 del, 884 sub ] exp/tri2b_fmmi_indirect_b0.1/decode_tgpr_dev93_it4/wer_16 -%WER 15.22 [ 1253 / 8234, 229 ins, 147 del, 877 sub ] exp/tri2b_fmmi_indirect_b0.1/decode_tgpr_dev93_it5/wer_15 -%WER 14.90 [ 1227 / 8234, 203 ins, 150 del, 874 sub ] exp/tri2b_fmmi_indirect_b0.1/decode_tgpr_dev93_it6/wer_15 -%WER 14.95 [ 1231 / 8234, 202 ins, 152 del, 877 sub ] exp/tri2b_fmmi_indirect_b0.1/decode_tgpr_dev93_it7/wer_15 -%WER 15.18 [ 1250 / 8234, 184 ins, 172 del, 894 sub ] exp/tri2b_fmmi_indirect_b0.1/decode_tgpr_dev93_it8/wer_16 - -%WER 15.70 [ 1293 / 8234, 218 ins, 163 del, 912 sub ] exp/tri2b_mmi/decode_tgpr_dev93_it3/wer_16 -%WER 15.61 [ 1285 / 8234, 217 ins, 163 del, 905 sub ] exp/tri2b_mmi/decode_tgpr_dev93_it4/wer_16 -%WER 10.46 [ 590 / 5643, 125 ins, 51 del, 414 sub ] exp/tri2b_mmi/decode_tgpr_eval92_it3/wer_15 -%WER 10.40 [ 587 / 5643, 124 ins, 52 del, 411 sub ] exp/tri2b_mmi/decode_tgpr_eval92_it4/wer_16 - -%WER 15.56 [ 1281 / 8234, 224 ins, 152 del, 905 sub ] 
exp/tri2b_mmi_b0.1/decode_tgpr_dev93_it3/wer_15 -%WER 15.44 [ 1271 / 8234, 220 ins, 165 del, 886 sub ] exp/tri2b_mmi_b0.1/decode_tgpr_dev93_it4/wer_16 -%WER 10.33 [ 583 / 5643, 125 ins, 51 del, 407 sub ] exp/tri2b_mmi_b0.1/decode_tgpr_eval92_it3/wer_15 -%WER 10.33 [ 583 / 5643, 125 ins, 47 del, 411 sub ] exp/tri2b_mmi_b0.1/decode_tgpr_eval92_it4/wer_15 - -%WER 11.43 [ 941 / 8234, 113 ins, 144 del, 684 sub ] exp/tri3b/decode_bd_tgpr_dev93/wer_19 -%WER 16.09 [ 1325 / 8234, 193 ins, 185 del, 947 sub ] exp/tri3b/decode_bd_tgpr_dev93.si/wer_16 -%WER 6.79 [ 383 / 5643, 51 ins, 49 del, 283 sub ] exp/tri3b/decode_bd_tgpr_eval92/wer_18 -%WER 10.61 [ 599 / 5643, 91 ins, 74 del, 434 sub ] exp/tri3b/decode_bd_tgpr_eval92.si/wer_15 -%WER 5.74 [ 324 / 5643, 46 ins, 41 del, 237 sub ] exp/tri3b/decode_bd_tgpr_eval92_fg/wer_19 -%WER 5.90 [ 333 / 5643, 46 ins, 39 del, 248 sub ] exp/tri3b/decode_bd_tgpr_eval92_tg/wer_18 - -%WER 14.17 [ 1167 / 8234, 222 ins, 123 del, 822 sub ] exp/tri3b/decode_tgpr_dev93/wer_17 -%WER 19.37 [ 1595 / 8234, 315 ins, 153 del, 1127 sub ] exp/tri3b/decode_tgpr_dev93.si/wer_15 - -%WER 12.98 [ 1069 / 8234, 209 ins, 116 del, 744 sub ] exp/tri3b/decode_tgpr_dev93_tg/wer_19 -%WER 9.30 [ 525 / 5643, 120 ins, 37 del, 368 sub ] exp/tri3b/decode_tgpr_eval92/wer_18 -%WER 12.95 [ 731 / 5643, 167 ins, 46 del, 518 sub ] exp/tri3b/decode_tgpr_eval92.si/wer_14 -%WER 8.54 [ 482 / 5643, 113 ins, 29 del, 340 sub ] exp/tri3b/decode_tgpr_eval92_tg/wer_17 - -%WER 12.12 [ 998 / 8234, 209 ins, 88 del, 701 sub ] exp/tri4a/decode_tgpr_dev93/wer_17 -%WER 15.98 [ 1316 / 8234, 275 ins, 119 del, 922 sub ] exp/tri4a/decode_tgpr_dev93.si/wer_15 -%WER 7.83 [ 442 / 5643, 107 ins, 23 del, 312 sub ] exp/tri4a/decode_tgpr_eval92/wer_16 -%WER 10.90 [ 615 / 5643, 148 ins, 30 del, 437 sub ] exp/tri4a/decode_tgpr_eval92.si/wer_13 - -%WER 9.15 [ 753 / 8234, 90 ins, 113 del, 550 sub ] exp/tri4b/decode_bd_pp_tgpr_dev93/wer_16 -%WER 12.64 [ 1041 / 8234, 137 ins, 145 del, 759 sub ] exp/tri4b/decode_bd_pp_tgpr_dev93.si/wer_16 -%WER 5.74 [ 324 / 5643, 47 ins, 35 del, 242 sub ] exp/tri4b/decode_bd_pp_tgpr_eval92/wer_19 -%WER 7.92 [ 447 / 5643, 64 ins, 46 del, 337 sub ] exp/tri4b/decode_bd_pp_tgpr_eval92.si/wer_15 -%WER 9.38 [ 772 / 8234, 90 ins, 118 del, 564 sub ] exp/tri4b/decode_bd_tgpr_dev93/wer_18 -%WER 13.07 [ 1076 / 8234, 148 ins, 143 del, 785 sub ] exp/tri4b/decode_bd_tgpr_dev93.si/wer_17 -%WER 6.03 [ 340 / 5643, 66 ins, 26 del, 248 sub ] exp/tri4b/decode_bd_tgpr_eval92/wer_13 -%WER 8.19 [ 462 / 5643, 74 ins, 42 del, 346 sub ] exp/tri4b/decode_bd_tgpr_eval92.si/wer_15 -%WER 12.16 [ 1001 / 8234, 197 ins, 98 del, 706 sub ] exp/tri4b/decode_tgpr_dev93/wer_17 -%WER 15.47 [ 1274 / 8234, 235 ins, 120 del, 919 sub ] exp/tri4b/decode_tgpr_dev93.si/wer_17 -%WER 8.08 [ 456 / 5643, 125 ins, 16 del, 315 sub ] exp/tri4b/decode_tgpr_eval92/wer_13 -%WER 10.49 [ 592 / 5643, 147 ins, 27 del, 418 sub ] exp/tri4b/decode_tgpr_eval92.si/wer_12 +%WER 18.23 [ 1501 / 8234, 245 ins, 181 del, 1075 sub ] exp/tri1/decode_nosp_tgpr_dev93_tg1/wer_15_0.5 +%WER 18.23 [ 1501 / 8234, 245 ins, 181 del, 1075 sub ] exp/tri1/decode_nosp_tgpr_dev93_tg2/wer_15_0.5 +%WER 18.16 [ 1495 / 8234, 268 ins, 153 del, 1074 sub ] exp/tri1/decode_nosp_tgpr_dev93_tg3/wer_16_0.0 +%WER 18.18 [ 1497 / 8234, 268 ins, 154 del, 1075 sub ] exp/tri1/decode_nosp_tgpr_dev93_tg4/wer_16_0.0 + + +# tri2b is an LDA+MLLT system trained on SI-84 +exp/tri2b: nj=10 align prob=-47.22 over 15.10h [retry=0.7%, fail=0.0%] states=2005 gauss=15036 tree-impr=5.45 lda-sum=26.20 
mllt:impr,logdet=1.34,1.97 +%WER 16.37 [ 1348 / 8234, 241 ins, 157 del, 950 sub ] exp/tri2b/decode_nosp_tgpr_dev93/wer_17_0.0 +%WER 10.53 [ 594 / 5643, 110 ins, 60 del, 424 sub ] exp/tri2b/decode_nosp_tgpr_eval92/wer_17_0.5 + + +# tri3b is an LDA+MLLT+SAT system trained on all of SI-284 +exp/tri3b: nj=10 align prob=-44.30 over 81.23h [retry=0.8%, fail=0.1%] states=3362 gauss=40061 fmllr-impr=3.70 over 59.77h tree-impr=7.86 + +%WER 15.56 [ 1281 / 8234, 220 ins, 140 del, 921 sub ] exp/tri3b/decode_nosp_tgpr_dev93.si/wer_17_0.5 +%WER 12.82 [ 1056 / 8234, 135 ins, 147 del, 774 sub ] exp/tri3b/decode_nosp_bd_tgpr_dev93.si/wer_15_0.0 +%WER 9.24 [ 761 / 8234, 89 ins, 109 del, 563 sub ] exp/tri3b/decode_nosp_bd_tgpr_dev93/wer_16_0.0 +%WER 11.53 [ 949 / 8234, 179 ins, 94 del, 676 sub ] exp/tri3b/decode_nosp_tgpr_dev93/wer_15_0.5 +%WER 10.94 [ 901 / 8234, 181 ins, 82 del, 638 sub ] exp/tri3b/decode_nosp_tg_dev93/wer_14_0.5 +%WER 8.16 [ 672 / 8234, 94 ins, 94 del, 484 sub ] exp/tri3b/decode_nosp_bd_tgpr_dev93_fg/wer_17_0.0 + +%WER 10.95 [ 618 / 5643, 148 ins, 36 del, 434 sub ] exp/tri3b/decode_nosp_tgpr_eval92.si/wer_14_0.0 +%WER 8.19 [ 462 / 5643, 77 ins, 51 del, 334 sub ] exp/tri3b/decode_nosp_bd_tgpr_eval92.si/wer_16_0.0 +%WER 5.55 [ 313 / 5643, 35 ins, 45 del, 233 sub ] exp/tri3b/decode_nosp_bd_tgpr_eval92/wer_17_1.0 +%WER 4.89 [ 276 / 5643, 47 ins, 28 del, 201 sub ] exp/tri3b/decode_nosp_bd_tgpr_eval92_fg/wer_15_0.5 +%WER 7.53 [ 425 / 5643, 112 ins, 20 del, 293 sub ] exp/tri3b/decode_nosp_tg_eval92/wer_17_0.0 +%WER 8.15 [ 460 / 5643, 113 ins, 30 del, 317 sub ] exp/tri3b/decode_nosp_tgpr_eval92/wer_14_1.0 + + +# tri4b is an LDA+MLLT+SAT system after estimating pronunciation probabilities +# and word-and-pronunciation-dependent silence probabilities. + +exp/tri4b: nj=10 align prob=-44.46 over 81.23h [retry=0.6%, fail=0.1%] states=3413 gauss=40059 fmllr-impr=0.17 over 60.20h tree-impr=8.70 + +%WER 15.16 [ 1248 / 8234, 253 ins, 96 del, 899 sub ] exp/tri4b/decode_tgpr_dev93.si/wer_17_0.0 +%WER 12.62 [ 1039 / 8234, 141 ins, 124 del, 774 sub ] exp/tri4b/decode_bd_tgpr_dev93.si/wer_17_0.0 +%WER 9.01 [ 742 / 8234, 106 ins, 97 del, 539 sub ] exp/tri4b/decode_bd_tgpr_dev93/wer_16_0.0 +%WER 8.25 [ 679 / 8234, 94 ins, 100 del, 485 sub ] exp/tri4b/decode_bd_tgpr_dev93_fg/wer_17_0.5 +%WER 10.92 [ 899 / 8234, 186 ins, 92 del, 621 sub ] exp/tri4b/decode_tg_dev93/wer_17_0.5 +%WER 11.44 [ 942 / 8234, 203 ins, 87 del, 652 sub ] exp/tri4b/decode_tgpr_dev93/wer_14_0.5 + +%WER 10.93 [ 617 / 5643, 147 ins, 33 del, 437 sub ] exp/tri4b/decode_tgpr_eval92.si/wer_14_1.0 +%WER 8.74 [ 493 / 5643, 104 ins, 34 del, 355 sub ] exp/tri4b/decode_bd_tgpr_eval92.si/wer_15_0.0 +%WER 5.69 [ 321 / 5643, 50 ins, 34 del, 237 sub ] exp/tri4b/decode_bd_tgpr_eval92/wer_17_0.5 +%WER 4.71 [ 266 / 5643, 40 ins, 27 del, 199 sub ] exp/tri4b/decode_bd_tgpr_eval92_fg/wer_17_1.0 +%WER 7.39 [ 417 / 5643, 107 ins, 24 del, 286 sub ] exp/tri4b/decode_tg_eval92/wer_16_1.0 +%WER 7.90 [ 446 / 5643, 111 ins, 27 del, 308 sub ] exp/tri4b/decode_tgpr_eval92/wer_15_1.0 + + +###################################### +## Results below this point were mostly obtained in 2013 by Hainan Xu, +## They are from parts of the script that are now not run by default in the run.sh. +## you can look in the git history to figure out when these results were added. 
+ %WER 7.99 [ 658 / 8234, 72 ins, 95 del, 491 sub ] exp/tri4b_fmmi_a/decode_bd_tgpr_dev93_it8/wer_12 %WER 11.15 [ 918 / 8234, 180 ins, 81 del, 657 sub ] exp/tri4b_fmmi_a/decode_tgpr_dev93_it3/wer_15 %WER 11.23 [ 925 / 8234, 201 ins, 77 del, 647 sub ] exp/tri4b_fmmi_a/decode_tgpr_dev93_it4/wer_12 @@ -166,7 +142,7 @@ exit 0 # not updated -# DNN on fMLLR features (Karel's setup, [7.8.2015]). +# DNN on fMLLR features (Karel's setup, [7.8.2015]). # frame cross-entropy training %WER 6.05 [ 498 / 8234, 59 ins, 67 del, 372 sub ] exp/dnn5b_pretrain-dbn_dnn/decode_bd_tgpr_dev93/wer_11_0.0 %WER 3.69 [ 208 / 5643, 19 ins, 19 del, 170 sub ] exp/dnn5b_pretrain-dbn_dnn/decode_bd_tgpr_eval92/wer_11_1.0 @@ -298,7 +274,7 @@ for x in exp/nnet3/nnet_tdnn_a/decode_*; do grep WER $x/wer_* | utils/best_wer.s # bidirectional LSTM # ----------------------- -# local/nnet3/run_lstm.sh --affix bidirectional \ +# local/nnet3/run_lstm.sh --affix bidirectional \ # --lstm-delay " [-1,1] [-2,2] [-3,3] " \ # --label-delay 0 \ # --cell-dim 640 \ diff --git a/egs/wsj/s5/local/chain/compare_wer.sh b/egs/wsj/s5/local/chain/compare_wer.sh new file mode 100755 index 00000000000..edfefad547f --- /dev/null +++ b/egs/wsj/s5/local/chain/compare_wer.sh @@ -0,0 +1,137 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/tdnn_{c,d}_sp +# For use with discriminatively trained systems you specify the epochs after a colon: +# for instance, +# local/chain/compare_wer.sh exp/chain/tdnn_c_sp exp/chain/tdnn_c_sp_smbr:{1,2,3} + + +if [ $# == 0 ]; then + echo "Usage: $0: [--looped] [--online] [ ... ]" + echo "e.g.: $0 exp/chain/tdnn_{b,c}_sp" + echo "or (with epoch numbers for discriminative training):" + echo "$0 exp/chain/tdnn_b_sp_disc:{1,2,3}" + exit 1 +fi + +echo "# $0 $*" + +include_looped=false +if [ "$1" == "--looped" ]; then + include_looped=true + shift +fi +include_online=false +if [ "$1" == "--online" ]; then + include_online=true + shift +fi + + +used_epochs=false + +# this function set_names is used to separate the epoch-related parts of the name +# [for discriminative training] and the regular parts of the name. 
+# If called with a colon-free directory name, like: +# set_names exp/chain/tdnn_lstm1e_sp_bi_smbr +# it will set dir=exp/chain/tdnn_lstm1e_sp_bi_smbr and epoch_infix="" +# If called with something like: +# set_names exp/chain/tdnn_d_sp_smbr:3 +# it will set dir=exp/chain/tdnn_d_sp_smbr and epoch_infix="_epoch3" + + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) + epoch=$(echo $1 | cut -s -d: -f2) + if [ -z $epoch ]; then + epoch_infix="" + else + used_epochs=true + epoch_infix=_epoch${epoch} + fi +} + + + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +strings=( + "#WER dev93 (tgpr) " + "#WER dev93 (tg) " + "#WER dev93 (big-dict,tgpr) " + "#WER dev93 (big-dict,fg) " + "#WER eval92 (tgpr) " + "#WER eval92 (tg) " + "#WER eval92 (big-dict,tgpr)" + "#WER eval92 (big-dict,fg) ") + +for n in 0 1 2 3 4 5 6 7; do + echo -n "${strings[$n]}" + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(tgpr_dev93 tg_dev93 bd_tgpr_dev93 bd_tgpr_dev93_fg tgpr_eval92 tg_eval92 bd_tgpr_eval92 bd_tgpr_eval92_fg) + + wer=$(cat $dirname/decode_${decode_names[$n]}/scoring_kaldi/best_wer | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat $dirname/decode_looped_${decode_names[$n]}/scoring_kaldi/best_wer | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi + if $include_online; then + echo -n "# [online:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat ${dirname}_online/decode_${decode_names[$n]}/scoring_kaldi/best_wer | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi +done + + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. 
+fi + + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo diff --git a/egs/wsj/s5/local/chain/run_tdnn.sh b/egs/wsj/s5/local/chain/run_tdnn.sh new file mode 120000 index 00000000000..34499362831 --- /dev/null +++ b/egs/wsj/s5/local/chain/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1a.sh \ No newline at end of file diff --git a/egs/wsj/s5/local/chain/run_tdnn_lstm.sh b/egs/wsj/s5/local/chain/run_tdnn_lstm.sh new file mode 120000 index 00000000000..8e647598556 --- /dev/null +++ b/egs/wsj/s5/local/chain/run_tdnn_lstm.sh @@ -0,0 +1 @@ +tuning/run_tdnn_lstm_1a.sh \ No newline at end of file diff --git a/egs/wsj/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/wsj/s5/local/chain/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..d874eb0986a --- /dev/null +++ b/egs/wsj/s5/local/chain/tuning/run_tdnn_1a.sh @@ -0,0 +1,361 @@ +#!/bin/bash + + +# This was modified from run_tdnn_lstm_1a.sh, making similar +# changes as the diff from run_tdnn_lstm_1a.sh->run_tdnn_1c.sh +# in egs/tedlium/s5_r2/local/nnet3/tuning, +# specifically: +# changing chunk_left_context to zero, shrink from 0.99->1 +# (since it's not applicable to ReLUs), and removing +# the deriv-truncate-margin option since it's only applicable +# to recurrent setups; removing label-delay. +# adding pre-final layers (I experimented with this, +# it did seem helpful); using 3M not 1.5M frames per iter to keep the +# time per job reasonable; and fewer final jobs (5 not 10). + + +# steps/info/chain_dir_info.pl exp/chain/tdnn1a_sp +# exp/chain/tdnn1a_sp: num-iters=102 nj=2..5 num-params=7.6M dim=40+100->2889 combine=-0.052->-0.051 xent:train/valid[67,101,final]=(-0.881,-0.824,-0.822/-0.953,-0.922,-0.921) logprob:train/valid[67,101,final]=(-0.048,-0.042,-0.041/-0.064,-0.064,-0.063) + +# The following table compares (nnet3 TDNN, chain TDNN+LSTM, this experiment == chain TDNN). +# This is better than the nnet3 TDNN, but the difference with the chain TDNN+LSTM +# is inconsistent. 
+ +# local/chain/compare_wer.sh --online exp/nnet3/tdnn1a_sp exp/chain/tdnn_lstm1a_sp exp/chain/tdnn1a_sp +# System tdnn1a_sp tdnn_lstm1a_sp tdnn1a_sp +#WER dev93 (tgpr) 9.18 7.48 7.87 +# [online:] 7.49 8.02 +#WER dev93 (tg) 8.59 7.41 7.61 +# [online:] 7.40 7.70 +#WER dev93 (big-dict,tgpr) 6.45 5.64 5.71 +# [online:] 5.70 5.60 +#WER dev93 (big-dict,fg) 5.83 5.40 5.10 +# [online:] 5.19 5.21 +#WER eval92 (tgpr) 6.15 5.67 5.23 +# [online:] 5.60 5.44 +#WER eval92 (tg) 5.55 5.46 4.87 +# [online:] 5.53 4.87 +#WER eval92 (big-dict,tgpr) 3.58 3.69 3.24 +# [online:] 3.63 3.31 +#WER eval92 (big-dict,fg) 2.98 3.28 2.71 +# [online:] 3.31 2.92 +# Final train prob -0.0341 -0.0414 +# Final valid prob -0.0506 -0.0634 +# Final train prob (xent) -0.5643 -0.8216 +# Final valid prob (xent) -0.6648 -0.9208 + + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +train_set=train_si284 +test_sets="test_dev93 test_eval92" +gmm=tri4b # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=1d #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# LSTM/chain options +train_stage=-10 +xent_regularize=0.1 + +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 15 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + relu-renorm-layer name=tdnn3 dim=512 input=Append(-1,0,1) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-6,-3,0) + + ## adding the layers for chain branch + relu-renorm-layer name=prefinal-chain dim=512 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-renorm-layer name=prefinal-xent input=tdnn6 dim=512 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=256,128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 17 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/lang/check_phones_compatible.sh \ + data/lang_test_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgpr \ + $tree_dir $tree_dir/graph_tgpr || exit 1; + + utils/lang/check_phones_compatible.sh \ + data/lang_test_bd_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_bd_tgpr \ + $tree_dir $tree_dir/graph_bd_tgpr || exit 1; +fi + +if [ $stage -le 18 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l 2889 combine=-0.047->-0.045 xent:train/valid[79,119,final]=(-0.684,-0.569,-0.564/-0.742,-0.668,-0.665) logprob:train/valid[79,119,final]=(-0.045,-0.035,-0.034/-0.058,-0.051,-0.051) + +# The following compares: +# (nnet3 TDNN+LSTM, chain TDNN, this experiment == chain TDNN+LSTM) +# system. +# This is consistently better than the nnet3 TDNN+LSTM, but the +# difference with the chain TDNN is inconsistent. 
+ +# local/chain/compare_wer.sh --online exp/nnet3/tdnn_lstm1a_sp exp/chain/tdnn1a_sp exp/chain/tdnn_lstm1a_sp +# System tdnn_lstm1a_sp tdnn1a_sp tdnn_lstm1a_sp +#WER dev93 (tgpr) 8.54 7.87 7.48 +# [online:] 8.57 8.02 7.49 +#WER dev93 (tg) 8.25 7.61 7.41 +# [online:] 8.34 7.70 7.40 +#WER dev93 (big-dict,tgpr) 6.24 5.71 5.64 +# [online:] 6.40 5.60 5.70 +#WER dev93 (big-dict,fg) 5.70 5.10 5.40 +# [online:] 5.77 5.21 5.19 +#WER eval92 (tgpr) 6.52 5.23 5.67 +# [online:] 6.56 5.44 5.60 +#WER eval92 (tg) 6.13 4.87 5.46 +# [online:] 6.24 4.87 5.53 +#WER eval92 (big-dict,tgpr) 3.88 3.24 3.69 +# [online:] 3.88 3.31 3.63 +#WER eval92 (big-dict,fg) 3.38 2.71 3.28 +# [online:] 3.53 2.92 3.31 +# Final train prob -0.0414 -0.0341 +# Final valid prob -0.0634 -0.0506 +# Final train prob (xent) -0.8216 -0.5643 +# Final valid prob (xent) -0.9208 -0.6648 + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +train_set=train_si284 +test_sets="test_dev93 test_eval92" +gmm=tri4b # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# LSTM/chain options +train_stage=-10 +label_delay=5 +xent_regularize=0.1 + +# training chunk-options +chunk_width=140,100,160 +chunk_left_context=40 +chunk_right_context=0 + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 15 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=20 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.deriv-truncate-margin=10 \ + --trainer.frames-per-iter=1500000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=10 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=0.99 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 17 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/lang/check_phones_compatible.sh \ + data/lang_test_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgpr \ + $tree_dir $tree_dir/graph_tgpr || exit 1; + + utils/lang/check_phones_compatible.sh \ + data/lang_test_bd_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_bd_tgpr \ + $tree_dir $tree_dir/graph_bd_tgpr || exit 1; +fi + +if [ $stage -le 18 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l [ ... ]" + echo "e.g.: $0 exp/nnet3/tdnn_{b,c}_sp" + echo "or (with epoch numbers for discriminative training):" + echo "$0 exp/nnet3/tdnn_b_sp_disc:{1,2,3}" + exit 1 +fi + +echo "# $0 $*" + +include_looped=false +if [ "$1" == "--looped" ]; then + include_looped=true + shift +fi +include_online=false +if [ "$1" == "--online" ]; then + include_online=true + shift +fi + + +used_epochs=false + +# this function set_names is used to separate the epoch-related parts of the name +# [for discriminative training] and the regular parts of the name. 
+# If called with a colon-free directory name, like: +# set_names exp/chain/tdnn_lstm1e_sp_bi_smbr +# it will set dir=exp/chain/tdnn_lstm1e_sp_bi_smbr and epoch_infix="" +# If called with something like: +# set_names exp/chain/tdnn_d_sp_smbr:3 +# it will set dir=exp/chain/tdnn_d_sp_smbr and epoch_infix="_epoch3" + + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) + epoch=$(echo $1 | cut -s -d: -f2) + if [ -z $epoch ]; then + epoch_infix="" + else + used_epochs=true + epoch_infix=_epoch${epoch} + fi +} + + + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +strings=( + "#WER dev93 (tgpr) " + "#WER dev93 (tg) " + "#WER dev93 (big-dict,tgpr) " + "#WER dev93 (big-dict,fg) " + "#WER eval92 (tgpr) " + "#WER eval92 (tg) " + "#WER eval92 (big-dict,tgpr)" + "#WER eval92 (big-dict,fg) ") + +for n in 0 1 2 3 4 5 6 7; do + echo -n "${strings[$n]}" + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(tgpr_dev93 tg_dev93 bd_tgpr_dev93 bd_tgpr_dev93_fg tgpr_eval92 tg_eval92 bd_tgpr_eval92 bd_tgpr_eval92_fg) + + wer=$(cat $dirname/decode_${decode_names[$n]}/scoring_kaldi/best_wer | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat $dirname/decode_looped_${decode_names[$n]}/scoring_kaldi/best_wer | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi + if $include_online; then + echo -n "# [online:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat ${dirname}_online/decode_${decode_names[$n]}/scoring_kaldi/best_wer | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi +done + + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.{final,combined}.log 2>/dev/null | grep log-like | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.{final,combined}.log 2>/dev/null | grep log-like | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train acc " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.{final,combined}.log 2>/dev/null | grep accuracy | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid acc " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.{final,combined}.log 2>/dev/null | grep accuracy | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo diff --git a/egs/wsj/s5/local/nnet3/run_ivector_common.sh b/egs/wsj/s5/local/nnet3/run_ivector_common.sh index 8d4cff326b3..e30988b7bf6 100755 --- a/egs/wsj/s5/local/nnet3/run_ivector_common.sh +++ b/egs/wsj/s5/local/nnet3/run_ivector_common.sh @@ -1,83 +1,215 @@ #!/bin/bash -# this script is called from scripts like run_ms.sh; it does the common stages -# of the build, such as feature extraction. -# This is actually the same as local/online/run_nnet2_common.sh, except -# for the directory names. +set -e -o pipefail -. 
cmd.sh -mfccdir=mfcc +# This script is called from scripts like local/nnet3/run_tdnn.sh and +# local/chain/run_tdnn.sh (and may eventually be called by more scripts). It +# contains the common feature preparation and iVector-related parts of the +# script. See those scripts for examples of usage. -stage=1 -. cmd.sh +stage=0 +nj=30 +train_set=train_si284 # you might set this to e.g. train. +test_sets="test_dev93 test_eval92" +gmm=tri4b # This specifies a GMM-dir from the features of the type you're training the system on; + # it should contain alignments for 'train_set'. + +num_threads_ubm=32 +nnet3_affix= # affix for exp/nnet3 directory to put iVector stuff in (e.g. + # in the tedlium recip it's _cleaned). + +. ./cmd.sh . ./path.sh -. ./utils/parse_options.sh +. utils/parse_options.sh + + +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_ali_${train_set}_sp + +for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do + if [ ! -f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + + + +if [ $stage -le 2 ] && [ -f data/${train_set}_sp_hires/feats.scp ]; then + echo "$0: data/${train_set}_sp_hires/feats.scp already exists." + echo " ... Please either remove it, or rerun this script with stage > 2." + exit 1 +fi if [ $stage -le 1 ]; then - for datadir in train_si284 test_eval93 test_dev93 test_eval92; do - utils/copy_data_dir.sh data/$datadir data/${datadir}_hires - steps/make_mfcc.sh --nj 40 --mfcc-config conf/mfcc_hires.conf \ - --cmd "$train_cmd" data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1; - steps/compute_cmvn_stats.sh data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1; - done - utils/subset_data_dir.sh --first data/train_si284_hires 7138 data/train_si84_hires || exit 1 + echo "$0: preparing directory for speed-perturbed data" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp fi if [ $stage -le 2 ]; then - # We need to build a small system just because we need the LDA+MLLT transform - # to train the diag-UBM on top of. We align the si84 data for this purpose. + echo "$0: creating high-resolution MFCC features" + + # this shows how you can split across multiple file-systems. we'll split the + # MFCC dir across multiple locations. You might want to be careful here, if you + # have multiple copies of Kaldi checked out and run the same recipe, not to let + # them overwrite each other. + mfccdir=data/${train_set}_sp_hires/data + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in ${train_set}_sp ${test_sets}; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. + utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires - steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \ - data/train_si84 data/lang exp/tri4b exp/nnet3/tri4b_ali_si84 + for datadir in ${train_set}_sp ${test_sets}; do + steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires + steps/compute_cmvn_stats.sh data/${datadir}_hires + utils/fix_data_dir.sh data/${datadir}_hires + done fi if [ $stage -le 3 ]; then - # Train a small system just for its LDA+MLLT transform. 
We use --num-iters 13 - # because after we get the transform (12th iter is the last), any further - # training is pointless. - steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \ - --realign-iters "" \ - --splice-opts "--left-context=3 --right-context=3" \ - 5000 10000 data/train_si84_hires data/lang \ - exp/nnet3/tri4b_ali_si84 exp/nnet3/tri5b + echo "$0: selecting segments of hires training data that were also present in the" + echo " ... original training data." + + # note, these data-dirs are temporary; we put them in a sub-directory + # of the place where we'll make the alignments. + temp_data_root=exp/nnet3${nnet3_affix}/tri5 + mkdir -p $temp_data_root + + utils/data/subset_data_dir.sh --utt-list data/${train_set}/feats.scp \ + data/${train_set}_sp_hires $temp_data_root/${train_set}_hires + + # note: essentially all the original segments should be in the hires data. + n1=$(wc -l /dev/null - for data in test_eval92 test_dev93 test_eval93; do - steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 8 \ - data/${data}_hires exp/nnet3/extractor exp/nnet3/ivectors_${data} || touch exp/nnet3/.error & - done - wait - [ -f exp/nnet3/.error ] && echo "$0: error extracting iVectors." && exit 1; + echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)" + utils/data/perturb_data_dir_speed_3way.sh \ + data/${train_set} data/${train_set}_sp +fi + +if [ $stage -le 8 ]; then + echo "$0: making MFCC features for low-resolution speed-perturbed data (needed for alignments)" + steps/make_mfcc.sh --nj $nj \ + --cmd "$train_cmd" data/${train_set}_sp + steps/compute_cmvn_stats.sh data/${train_set}_sp + echo "$0: fixing input data-dir to remove nonexistent features, in case some " + echo ".. speed-perturbed segments were too short." + utils/fix_data_dir.sh data/${train_set}_sp fi +if [ $stage -le 9 ]; then + if [ -f $ali_dir/ali.1.gz ]; then + echo "$0: alignments in $ali_dir appear to already exist. Please either remove them " + echo " ... or use a later --stage option." + exit 1 + fi + echo "$0: aligning with the perturbed low-resolution data" + steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set}_sp data/lang $gmm_dir $ali_dir +fi + + exit 0; diff --git a/egs/wsj/s5/local/nnet3/run_lstm.sh b/egs/wsj/s5/local/nnet3/run_lstm.sh index 2454fb5be63..d9af546b49b 100755 --- a/egs/wsj/s5/local/nnet3/run_lstm.sh +++ b/egs/wsj/s5/local/nnet3/run_lstm.sh @@ -1,5 +1,7 @@ #!/bin/bash +# This script is deprecated, see run_tdnn_lstm.sh + # this is a basic lstm script # LSTM script runs for more epochs than the TDNN script # and each epoch takes twice the time @@ -125,4 +127,3 @@ if [ $stage -le 9 ]; then fi exit 0; - diff --git a/egs/wsj/s5/local/nnet3/run_lstm_discriminative.sh b/egs/wsj/s5/local/nnet3/run_lstm_discriminative.sh index 124b04949a0..311ee14d16a 100755 --- a/egs/wsj/s5/local/nnet3/run_lstm_discriminative.sh +++ b/egs/wsj/s5/local/nnet3/run_lstm_discriminative.sh @@ -1,5 +1,8 @@ #!/bin/bash + +# This script is deprecated. + set -o pipefail set -e # this is run_discriminative.sh diff --git a/egs/wsj/s5/local/nnet3/run_tdnn.sh b/egs/wsj/s5/local/nnet3/run_tdnn.sh deleted file mode 100755 index 337c5656de4..00000000000 --- a/egs/wsj/s5/local/nnet3/run_tdnn.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/bin/bash - -# this is the standard "tdnn" system, built in nnet3; it's what we use to -# call multi-splice. - -. cmd.sh - - -# At this script level we don't support not running on GPU, as it would be painfully slow. 
-# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, -# --num-threads 16 and --minibatch-size 128. - -stage=0 -train_stage=-10 -dir=exp/nnet3/nnet_tdnn_a -. cmd.sh -. ./path.sh -. ./utils/parse_options.sh - - -if ! cuda-compiled; then - cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=650 + relu-renorm-layer name=tdnn2 dim=650 input=Append(-1,0,1) + relu-renorm-layer name=tdnn3 dim=650 input=Append(-1,0,1) + relu-renorm-layer name=tdnn4 dim=650 input=Append(-3,0,3) + relu-renorm-layer name=tdnn5 dim=650 input=Append(-6,-3,0) + output-layer name=output dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=3 \ + --trainer.samples-per-iter=400000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=10 \ + --trainer.optimization.initial-effective-lrate=0.0015 \ + --trainer.optimization.final-effective-lrate=0.00015 \ + --trainer.optimization.minibatch-size=256,128 \ + --egs.dir="$common_egs_dir" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --feat-dir=$train_data_dir \ + --ali-dir=$ali_dir \ + --lang=data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # note: for TDNNs, looped decoding gives exactly the same results + # as regular decoding, so there is no point in testing it separately. + # We use regular decoding because it supports multi-threaded (we just + # didn't create the binary for that, for looped decoding, so far). + rm $dir/.error || true 2>/dev/null + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nj=$(wc -l b}.sh +# There seems to be no consistent difference. + +# run_tdnn_1a.sh is the standard "tdnn" system, built in nnet3 with xconfigs. + +# local/nnet3/compare_wer.sh exp/nnet3/tdnn1a_sp exp/nnet3/tdnn1b_sp +# System tdnn1a_sp tdnn1b_sp +#WER dev93 (tgpr) 9.18 9.12 +#WER dev93 (tg) 8.59 8.51 +#WER dev93 (big-dict,tgpr) 6.45 6.19 +#WER dev93 (big-dict,fg) 5.83 5.78 +#WER eval92 (tgpr) 6.15 6.33 +#WER eval92 (tg) 5.55 5.74 +#WER eval92 (big-dict,tgpr) 3.58 3.62 +#WER eval92 (big-dict,fg) 2.98 3.10 +# Final train prob -0.7200 -0.6035 +# Final valid prob -0.8834 -0.7578 +# Final train acc 0.7762 0.8015 +# Final valid acc 0.7301 0.7607 + + +set -e -o pipefail -u + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). 
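# (A note on the convention used below: any variable assigned before the
# ". ./utils/parse_options.sh" line can be overridden from the command line as
# --option-name value, with hyphens mapped to underscores.  Purely as an
# illustration, and assuming this tuning script is
# local/nnet3/tuning/run_tdnn_1b.sh as the surrounding diff suggests, one could
# rerun just the network training and decoding while reusing previously dumped
# egs with something like:
#   local/nnet3/tuning/run_tdnn_1b.sh --stage 13 \
#     --common-egs-dir exp/nnet3/tdnn1b_sp/egs --remove-egs false
# where stage 13 is the train_dnn.py stage of this script, and the egs path is
# only an example of what --common-egs-dir might point to.)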
+stage=0 +nj=30 + +train_set=train_si284 +test_sets="test_dev93 test_eval92" +gmm=tri4b # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +tdnn_affix=1b #affix for TDNN directory e.g. "1a" or "1b", in case we change the configuration. + +# Options which are not passed through to run_ivector_common.sh +train_stage=-10 +remove_egs=true +srand=0 +reporting_email= +# set common_egs_dir to use previously dumped egs. +common_egs_dir= + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=750 + relu-renorm-layer name=tdnn2 dim=750 input=Append(-1,2) + relu-renorm-layer name=tdnn3 dim=750 input=Append(-3,3) + relu-renorm-layer name=tdnn4 dim=750 input=Append(-7,2) + relu-renorm-layer name=tdnn5 dim=750 input=Append(-3,3) + relu-renorm-layer name=tdnn6 dim=750 + output-layer name=output dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=3 \ + --trainer.samples-per-iter=400000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=10 \ + --trainer.optimization.initial-effective-lrate=0.0015 \ + --trainer.optimization.final-effective-lrate=0.00015 \ + --trainer.optimization.minibatch-size=256,128 \ + --egs.dir="$common_egs_dir" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --feat-dir=$train_data_dir \ + --ali-dir=$ali_dir \ + --lang=data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # note: for TDNNs, looped decoding gives exactly the same results + # as regular decoding, so there is no point in testing it separately. + # We use regular decoding because it supports multi-threaded (we just + # didn't create the binary for that, for looped decoding, so far). 
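# (The block below follows a common Kaldi shell pattern: each test set is
# decoded in a backgrounded subshell, any failure touches a marker file, and
# the marker is checked after "wait".  Written out in isolation, with
# decode_one_set standing in for the steps/nnet3/decode.sh call used here,
# the pattern is:
#   rm $dir/.error 2>/dev/null || true   # clear a stale marker, ignoring "no such file"
#   for data in $test_sets; do
#     ( decode_one_set $data ) || touch $dir/.error &
#   done
#   wait
#   if [ -f $dir/.error ]; then echo "$0: something went wrong in decoding"; exit 1; fi
# Note the redirection belongs on the rm: in the variant
# "rm $dir/.error || true 2>/dev/null" the 2>/dev/null applies to "true", so a
# missing marker file still prints a harmless warning.)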
+ rm $dir/.error || true 2>/dev/null + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nj=$(wc -l 3413 combine=-0.55->-0.54 loglike:train/valid[67,101,combined]=(-0.63,-0.55,-0.55/-0.71,-0.63,-0.63) accuracy:train/valid[67,101,combined]=(0.80,0.82,0.82/0.76,0.78,0.78) + + + +# local/nnet3/compare_wer.sh --looped --online exp/nnet3/tdnn1a_sp exp/nnet3/tdnn_lstm1a_sp 2>/dev/null +# local/nnet3/compare_wer.sh --looped --online exp/nnet3/tdnn1a_sp exp/nnet3/tdnn_lstm1a_sp +# System tdnn1a_sp tdnn_lstm1a_sp +#WER dev93 (tgpr) 9.18 8.54 +# [looped:] 8.54 +# [online:] 8.57 +#WER dev93 (tg) 8.59 8.25 +# [looped:] 8.21 +# [online:] 8.34 +#WER dev93 (big-dict,tgpr) 6.45 6.24 +# [looped:] 6.28 +# [online:] 6.40 +#WER dev93 (big-dict,fg) 5.83 5.70 +# [looped:] 5.70 +# [online:] 5.77 +#WER eval92 (tgpr) 6.15 6.52 +# [looped:] 6.45 +# [online:] 6.56 +#WER eval92 (tg) 5.55 6.13 +# [looped:] 6.08 +# [online:] 6.24 +#WER eval92 (big-dict,tgpr) 3.58 3.88 +# [looped:] 3.93 +# [online:] 3.88 +#WER eval92 (big-dict,fg) 2.98 3.38 +# [looped:] 3.47 +# [online:] 3.53 +# Final train prob -0.7200 -0.5492 +# Final valid prob -0.8834 -0.6343 +# Final train acc 0.7762 0.8154 +# Final valid acc 0.7301 0.7849 + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +train_set=train_si284 +test_sets="test_dev93 test_eval92" +gmm=tri4b # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# LSTM options +train_stage=-10 +label_delay=5 + +# training chunk-options +chunk_width=40,30,20 +chunk_left_context=40 +chunk_right_context=0 + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=520 + relu-renorm-layer name=tdnn2 dim=520 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=520 recurrent-projection-dim=130 non-recurrent-projection-dim=130 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn3 dim=520 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=520 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=520 recurrent-projection-dim=130 non-recurrent-projection-dim=130 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn5 dim=520 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=520 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=520 recurrent-projection-dim=130 non-recurrent-projection-dim=130 decay-time=20 delay=-3 + + output-layer name=output input=lstm3 output-delay=$label_delay dim=$num_targets max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_rnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=6 \ + --trainer.deriv-truncate-margin=10 \ + --trainer.samples-per-iter=20000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=10 \ + --trainer.optimization.initial-effective-lrate=0.0003 \ + --trainer.optimization.final-effective-lrate=0.00003 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.rnn.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.5 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --feat-dir=$train_data_dir \ + --ali-dir=$ali_dir \ + --lang=$lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 14 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + data_affix=$(echo $data | sed s/test_//) + nj=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nj=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nj=$(wc -l 3205 combine=-0.43->-0.42 loglike:train/valid[89,135,combined]=(-0.51,-0.39,-0.38/-0.59,-0.51,-0.51) accuracy:train/valid[89,135,combined]=(0.85,0.88,0.88/0.82,0.84,0.84) + + +# It seems to be a little worse the 
regular-frame-rate system. + +# local/nnet3/compare_wer.sh --looped exp/nnet3/tdnn_lstm1a_sp exp/nnet3/tdnn_lstm_lfr1a_sp +# System tdnn_lstm1a_sp tdnn_lstm_lfr1a_sp +#WER dev93 (tgpr) 8.54 9.02 +# [looped:] 8.54 8.99 +#WER dev93 (tg) 8.25 8.60 +# [looped:] 8.21 8.54 +#WER dev93 (big-dict,tgpr) 6.24 6.85 +# [looped:] 6.28 6.81 +#WER dev93 (big-dict,fg) 5.70 6.33 +# [looped:] 5.70 6.33 +#WER eval92 (tgpr) 6.52 6.52 +# [looped:] 6.45 6.42 +#WER eval92 (tg) 6.13 6.01 +# [looped:] 6.08 5.92 +#WER eval92 (big-dict,tgpr) 3.88 4.22 +# [looped:] 3.93 4.20 +#WER eval92 (big-dict,fg) 3.38 3.76 +# [looped:] 3.47 3.79 +# Final train prob -0.5492 -0.3100 +# Final valid prob -0.6343 -0.4646 +# Final train acc 0.8154 0.9051 +# Final valid acc 0.7849 0.8615 + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +train_set=train_si284 +test_sets="test_dev93 test_eval92" +gmm=tri4b # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# LSTM options +train_stage=-10 +label_delay=5 + +# training chunk-options +chunk_width=40,30,20 +chunk_left_context=40 +chunk_right_context=0 + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 13 ]; then + # Build a tree using our new topology and a reduced sampling rate. + # We use 4000 leaves, which is a little less than the number used + # in the baseline GMM system (5k) in this setup, since generally + # LFR systems do best with somewhat fewer leaves. + # + # To get the stats to build the tree this script only uses every third frame, + # but it dumps converted alignments that essentially have 3 different + # frame-shifted versions of the alignment interpolated together; these can be + # used without modification in getting labels for training. 
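# (Concretely, a frame-subsampling factor of 3 means the network is evaluated
# at one third of the input frame rate, so an utterance with T input frames
# produces roughly T/3 network outputs; this is the same factor that stage 15
# below records with "echo 3 >$dir/frame_subsampling_factor".  As an optional,
# purely illustrative sanity check after this stage, the leaf count actually
# obtained can be read back with the same tool the config-generation stage
# uses:
#   tree-info $treedir/tree | grep num-pdfs   # expect a value near the 4000 leaves requested
# )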
+ steps/nnet3/chain/build_tree.sh \ + --repeat-frames true --frame-subsampling-factor 3 \ + --cmd "$train_cmd" 4000 data/${train_set}_sp \ + $lang $ali_dir $treedir +fi + + +if [ $stage -le 14 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=520 + relu-renorm-layer name=tdnn2 dim=520 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=520 recurrent-projection-dim=130 non-recurrent-projection-dim=130 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn3 dim=520 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=520 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=520 recurrent-projection-dim=130 non-recurrent-projection-dim=130 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn5 dim=520 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=520 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=520 recurrent-projection-dim=130 non-recurrent-projection-dim=130 decay-time=20 delay=-3 + + output-layer name=output input=lstm3 output-delay=$label_delay dim=$num_targets max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 15 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_rnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=6 \ + --trainer.deriv-truncate-margin=10 \ + --trainer.samples-per-iter=10000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=10 \ + --trainer.optimization.initial-effective-lrate=0.0003 \ + --trainer.optimization.final-effective-lrate=0.00003 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.rnn.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.5 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --feat-dir=$train_data_dir \ + --ali-dir=$treedir \ + --lang=$lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; + echo 3 >$dir/frame_subsampling_factor +fi + +if [ $stage -le 16 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). 
So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/lang/check_phones_compatible.sh \ + data/lang_test_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh --self-loop-scale 0.333 data/lang_test_tgpr \ + $dir $dir/graph_tgpr || exit 1; + + utils/lang/check_phones_compatible.sh \ + data/lang_test_bd_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh --self-loop-scale 0.333 data/lang_test_bd_tgpr \ + $dir $dir/graph_bd_tgpr || exit 1; +fi + +if [ $stage -le 17 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l data/$y/utt2spk; cp data/$y/utt2spk data/$y/spk2utt; - steps/compute_cmvn_stats.sh data/$y exp/make_mfcc/$y $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/$y exp/make_mfcc/$y $mfccdir || exit 1; done @@ -33,7 +33,7 @@ steps/decode_fmllr.sh --nj 8 --cmd "$decode_cmd" \ # get the fMLLR basis. steps/get_fmllr_basis.sh --cmd "$train_cmd" \ - data/train_si84 data/lang${lang_suffix} exp/tri3b + data/train_si284 data/lang${lang_suffix} exp/tri3b # decoding tri3b with basis fMLLR steps/decode_basis_fmllr.sh --nj 10 --cmd "$decode_cmd" \ @@ -50,5 +50,3 @@ steps/decode_basis_fmllr.sh --nj 10 --cmd "$decode_cmd" \ steps/decode_basis_fmllr.sh --nj 8 --cmd "$decode_cmd" \ exp/tri3b/graph${lang_suffix}_tgpr data/test_eval92_utt \ exp/tri3b/decode${lang_suffix}_tgpr_eval92_basis_utt || exit 1; - - diff --git a/egs/wsj/s5/local/run_mmi_tri2b.sh b/egs/wsj/s5/local/run_mmi_tri2b.sh deleted file mode 100755 index d7ddbfbaf62..00000000000 --- a/egs/wsj/s5/local/run_mmi_tri2b.sh +++ /dev/null @@ -1,73 +0,0 @@ -#!/bin/bash - -lang_suffix= - -echo "$0 $@" # Print the command line for logging -. utils/parse_options.sh || exit 1; - -. ./cmd.sh - -# Train and test MMI (and boosted MMI) on tri2b system. -steps/make_denlats.sh --sub-split 20 --nj 10 --cmd "$train_cmd" \ - data/train_si84 data/lang${lang_suffix} \ - exp/tri2b exp/tri2b_denlats_si84 || exit 1; - -# train the basic MMI system. -steps/train_mmi.sh --cmd "$train_cmd" \ - data/train_si84 data/lang${lang_suffix} exp/tri2b_ali_si84 \ - exp/tri2b_denlats_si84 exp/tri2b_mmi || exit 1; -for iter in 3 4; do - steps/decode_si.sh --nj 10 --cmd "$decode_cmd" --iter $iter \ - exp/tri2b/graph${lang_suffix}_tgpr data/test_dev93 \ - exp/tri2b_mmi/decode${lang_suffix}_tgpr_dev93_it$iter & - steps/decode_si.sh --nj 8 --cmd "$decode_cmd" --iter $iter \ - exp/tri2b/graph${lang_suffix}_tgpr data/test_eval92 \ - exp/tri2b_mmi/decode${lang_suffix}_tgpr_eval92_it$iter & -done - -# MMI with 0.1 boosting factor. 
-steps/train_mmi.sh --cmd "$train_cmd" --boost 0.1 \ - data/train_si84 data/lang${lang_suffix} exp/tri2b_ali_si84 \ - exp/tri2b_denlats_si84 exp/tri2b_mmi_b0.1 || exit 1; - -for iter in 3 4; do - steps/decode_si.sh --nj 10 --cmd "$decode_cmd" --iter $iter \ - exp/tri2b/graph${lang_suffix}_tgpr data/test_dev93 \ - exp/tri2b_mmi_b0.1/decode${lang_suffix}_tgpr_dev93_it$iter & - steps/decode_si.sh --nj 8 --cmd "$decode_cmd" --iter $iter \ - exp/tri2b/graph${lang_suffix}_tgpr data/test_eval92 \ - exp/tri2b_mmi_b0.1/decode${lang_suffix}_tgpr_eval92_it$iter & -done - - -# Train a UBM with 400 components, for fMMI. -steps/train_diag_ubm.sh --silence-weight 0.5 --nj 10 --cmd "$train_cmd" \ - 400 data/train_si84 data/lang${lang_suffix} exp/tri2b_ali_si84 exp/dubm2b - -steps/train_mmi_fmmi.sh --boost 0.1 --cmd "$train_cmd" \ - data/train_si84 data/lang${lang_suffix} exp/tri2b_ali_si84 \ - exp/dubm2b exp/tri2b_denlats_si84 exp/tri2b_fmmi_b0.1 - -for iter in `seq 3 8`; do - steps/decode_fmmi.sh --nj 10 --cmd "$decode_cmd" --iter $iter \ - exp/tri2b/graph${lang_suffix}_tgpr data/test_dev93 \ - exp/tri2b_fmmi_b0.1/decode${lang_suffix}_tgpr_dev93_it$iter & -done - -steps/train_mmi_fmmi.sh --learning-rate 0.005 --boost 0.1 --cmd "$train_cmd" \ - data/train_si84 data/lang${lang_suffix} exp/tri2b_ali_si84 \ - exp/dubm2b exp/tri2b_denlats_si84 exp/tri2b_fmmi_b0.1_lr0.005 || exit 1; -for iter in `seq 3 8`; do - steps/decode_fmmi.sh --nj 10 --cmd "$decode_cmd" --iter $iter \ - exp/tri2b/graph${lang_suffix}_tgpr data/test_dev93 \ - exp/tri2b_fmmi_b0.1_lr0.005/decode${lang_suffix}_tgpr_dev93_it$iter & -done - -steps/train_mmi_fmmi_indirect.sh --boost 0.1 --cmd "$train_cmd" \ - data/train_si84 data/lang${lang_suffix} exp/tri2b_ali_si84 \ - exp/dubm2b exp/tri2b_denlats_si84 exp/tri2b_fmmi_indirect_b0.1 -for iter in `seq 3 8`; do - steps/decode_fmmi.sh --nj 10 --cmd "$decode_cmd" --iter $iter \ - exp/tri2b/graph${lang_suffix}_tgpr data/test_dev93 \ - exp/tri2b_fmmi_indirect_b0.1/decode${lang_suffix}_tgpr_dev93_it$iter & -done diff --git a/egs/wsj/s5/run.sh b/egs/wsj/s5/run.sh index fb004117658..4d505f5da3a 100755 --- a/egs/wsj/s5/run.sh +++ b/egs/wsj/s5/run.sh @@ -1,7 +1,15 @@ #!/bin/bash +stage=0 +train=true # set to false to disable the training-related scripts + # note: you probably only want to set --train false if you + # are using at least --stage 1. +decode=true # set to false to disable the decoding-related scripts. + . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. +. utils/parse_options.sh # e.g. this parses the --stage option if supplied. + # This is a shell script, but it's recommended that you run the commands one by # one by copying and pasting into the shell. @@ -18,334 +26,313 @@ wsj0=/export/corpora5/LDC/LDC93S6B wsj1=/export/corpora5/LDC/LDC94S13B -local/wsj_data_prep.sh $wsj0/??-{?,??}.? $wsj1/??-{?,??}.? || exit 1; -# Sometimes, we have seen WSJ distributions that do not have subdirectories -# like '11-13.1', but instead have 'doc', 'si_et_05', etc. directly under the -# wsj0 or wsj1 directories. In such cases, try the following: -# -# corpus=/exports/work/inf_hcrc_cstr_general/corpora/wsj -# local/cstr_wsj_data_prep.sh $corpus -# rm data/local/dict/lexiconp.txt -# $corpus must contain a 'wsj0' and a 'wsj1' subdirectory for this to work. -# -# "nosp" refers to the dictionary before silence probabilities and pronunciation -# probabilities are added. 
-local/wsj_prepare_dict.sh --dict-suffix "_nosp" || exit 1; - -utils/prepare_lang.sh data/local/dict_nosp \ - "" data/local/lang_tmp_nosp data/lang_nosp || exit 1; - -local/wsj_format_data.sh --lang-suffix "_nosp" || exit 1; - - # We suggest to run the next three commands in the background, - # as they are not a precondition for the system building and - # most of the tests: these commands build a dictionary - # containing many of the OOVs in the WSJ LM training data, - # and an LM trained directly on that data (i.e. not just - # copying the arpa files from the disks from LDC). - # Caution: the commands below will only work if $decode_cmd - # is setup to use qsub. Else, just remove the --cmd option. - # NOTE: If you have a setup corresponding to the older cstr_wsj_data_prep.sh style, - # use local/cstr_wsj_extend_dict.sh --dict-suffix "_nosp" $corpus/wsj1/doc/ instead. +if [ $stage -le 0 ]; then + # data preparation. + local/wsj_data_prep.sh $wsj0/??-{?,??}.? $wsj1/??-{?,??}.? || exit 1; + + # Sometimes, we have seen WSJ distributions that do not have subdirectories + # like '11-13.1', but instead have 'doc', 'si_et_05', etc. directly under the + # wsj0 or wsj1 directories. In such cases, try the following: + # + # corpus=/exports/work/inf_hcrc_cstr_general/corpora/wsj + # local/cstr_wsj_data_prep.sh $corpus + # rm data/local/dict/lexiconp.txt + # $corpus must contain a 'wsj0' and a 'wsj1' subdirectory for this to work. + # + # "nosp" refers to the dictionary before silence probabilities and pronunciation + # probabilities are added. + local/wsj_prepare_dict.sh --dict-suffix "_nosp" || exit 1; + + utils/prepare_lang.sh data/local/dict_nosp \ + "" data/local/lang_tmp_nosp data/lang_nosp || exit 1; + + local/wsj_format_data.sh --lang-suffix "_nosp" || exit 1; + + # We suggest to run the next three commands in the background, + # as they are not a precondition for the system building and + # most of the tests: these commands build a dictionary + # containing many of the OOVs in the WSJ LM training data, + # and an LM trained directly on that data (i.e. not just + # copying the arpa files from the disks from LDC). + # Caution: the commands below will only work if $decode_cmd + # is setup to use qsub. Else, just remove the --cmd option. + # NOTE: If you have a setup corresponding to the older cstr_wsj_data_prep.sh style, + # use local/cstr_wsj_extend_dict.sh --dict-suffix "_nosp" $corpus/wsj1/doc/ instead. ( - local/wsj_extend_dict.sh --dict-suffix "_nosp" $wsj1/13-32.1 && \ - utils/prepare_lang.sh data/local/dict_nosp_larger \ - "" data/local/lang_tmp_nosp_larger data/lang_nosp_bd && \ - local/wsj_train_lms.sh --dict-suffix "_nosp" && - local/wsj_format_local_lms.sh --lang-suffix "_nosp" # && + local/wsj_extend_dict.sh --dict-suffix "_nosp" $wsj1/13-32.1 && \ + utils/prepare_lang.sh data/local/dict_nosp_larger \ + "" data/local/lang_tmp_nosp_larger data/lang_nosp_bd && \ + local/wsj_train_lms.sh --dict-suffix "_nosp" && + local/wsj_format_local_lms.sh --lang-suffix "_nosp" # && ) & -# Now make MFCC features. -# mfccdir should be some place with a largish disk where you -# want to store MFCC features. - -for x in test_eval92 test_eval93 test_dev93 train_si284; do - steps/make_mfcc.sh --cmd "$train_cmd" --nj 20 data/$x || exit 1; - steps/compute_cmvn_stats.sh data/$x || exit 1; -done - -utils/subset_data_dir.sh --first data/train_si284 7138 data/train_si84 || exit 1 - -# Now make subset with the shortest 2k utterances from si-84. 
-utils/subset_data_dir.sh --shortest data/train_si84 2000 data/train_si84_2kshort || exit 1; - -# Now make subset with half of the data from si-84. -utils/subset_data_dir.sh data/train_si84 3500 data/train_si84_half || exit 1; - - -# Note: the --boost-silence option should probably be omitted by default -# for normal setups. It doesn't always help. [it's to discourage non-silence -# models from modeling silence.] -steps/train_mono.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \ - data/train_si84_2kshort data/lang_nosp exp/mono0a || exit 1; - -( - utils/mkgraph.sh data/lang_nosp_test_tgpr \ - exp/mono0a exp/mono0a/graph_nosp_tgpr && \ - steps/decode.sh --nj 10 --cmd "$decode_cmd" exp/mono0a/graph_nosp_tgpr \ - data/test_dev93 exp/mono0a/decode_nosp_tgpr_dev93 && \ - steps/decode.sh --nj 8 --cmd "$decode_cmd" exp/mono0a/graph_nosp_tgpr \ - data/test_eval92 exp/mono0a/decode_nosp_tgpr_eval92 -) & - -steps/align_si.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \ - data/train_si84_half data/lang_nosp exp/mono0a exp/mono0a_ali || exit 1; - -steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" 2000 10000 \ - data/train_si84_half data/lang_nosp exp/mono0a_ali exp/tri1 || exit 1; - -while [ ! -f data/lang_nosp_test_tgpr/tmp/LG.fst ] || \ - [ -z data/lang_nosp_test_tgpr/tmp/LG.fst ]; do - sleep 20; -done -sleep 30; -# or the mono mkgraph.sh might be writing -# data/lang_test_tgpr/tmp/LG.fst which will cause this to fail. - -utils/mkgraph.sh data/lang_nosp_test_tgpr \ - exp/tri1 exp/tri1/graph_nosp_tgpr || exit 1; - -steps/decode.sh --nj 10 --cmd "$decode_cmd" exp/tri1/graph_nosp_tgpr \ - data/test_dev93 exp/tri1/decode_nosp_tgpr_dev93 || exit 1; -steps/decode.sh --nj 8 --cmd "$decode_cmd" exp/tri1/graph_nosp_tgpr \ - data/test_eval92 exp/tri1/decode_nosp_tgpr_eval92 || exit 1; - -# test various modes of LM rescoring (4 is the default one). -# This is just confirming they're equivalent. -for mode in 1 2 3 4; do - steps/lmrescore.sh --mode $mode --cmd "$decode_cmd" \ - data/lang_nosp_test_{tgpr,tg} data/test_dev93 \ - exp/tri1/decode_nosp_tgpr_dev93 \ - exp/tri1/decode_nosp_tgpr_dev93_tg$mode || exit 1; -done - - -## the following command demonstrates how to get lattices that are -## "word-aligned" (arcs coincide with words, with boundaries in the right -## place). -#sil_label=`grep '!SIL' data/lang_nosp_test_tgpr/words.txt | awk '{print $2}'` -#steps/word_align_lattices.sh --cmd "$train_cmd" --silence-label $sil_label \ -# data/lang_nosp_test_tgpr exp/tri1/decode_nosp_tgpr_dev93 \ -# exp/tri1/decode_nosp_tgpr_dev93_aligned || exit 1; - -steps/align_si.sh --nj 10 --cmd "$train_cmd" \ - data/train_si84 data/lang_nosp exp/tri1 exp/tri1_ali_si84 || exit 1; - -steps/train_lda_mllt.sh --cmd "$train_cmd" \ - --splice-opts "--left-context=3 --right-context=3" 2500 15000 \ - data/train_si84 data/lang_nosp exp/tri1_ali_si84 exp/tri2b || exit 1; - -utils/mkgraph.sh data/lang_nosp_test_tgpr \ - exp/tri2b exp/tri2b/graph_nosp_tgpr || exit 1; -steps/decode.sh --nj 10 --cmd "$decode_cmd" exp/tri2b/graph_nosp_tgpr \ - data/test_dev93 exp/tri2b/decode_nosp_tgpr_dev93 || exit 1; -steps/decode.sh --nj 8 --cmd "$decode_cmd" exp/tri2b/graph_nosp_tgpr \ - data/test_eval92 exp/tri2b/decode_nosp_tgpr_eval92 || exit 1; - -# At this point, you could run the example scripts that show how VTLN works. -# We haven't included this in the default recipes yet. 
-# local/run_vtln.sh --lang-suffix "_nosp" -# local/run_vtln2.sh --lang-suffix "_nosp" - -# Now, with dev93, compare lattice rescoring with biglm decoding, -# going from tgpr to tg. Note: results are not the same, even though they should -# be, and I believe this is due to the beams not being wide enough. The pruning -# seems to be a bit too narrow in the current scripts (got at least 0.7% absolute -# improvement from loosening beams from their current values). - -steps/decode_biglm.sh --nj 10 --cmd "$decode_cmd" \ - exp/tri2b/graph_nosp_tgpr data/lang_test_{tgpr,tg}/G.fst \ - data/test_dev93 exp/tri2b/decode_nosp_tgpr_dev93_tg_biglm - -# baseline via LM rescoring of lattices. -steps/lmrescore.sh --cmd "$decode_cmd" \ - data/lang_nosp_test_tgpr/ data/lang_nosp_test_tg/ \ - data/test_dev93 exp/tri2b/decode_nosp_tgpr_dev93 \ - exp/tri2b/decode_nosp_tgpr_dev93_tg || exit 1; - -# Trying Minimum Bayes Risk decoding (like Confusion Network decoding): -mkdir exp/tri2b/decode_nosp_tgpr_dev93_tg_mbr -cp exp/tri2b/decode_nosp_tgpr_dev93_tg/lat.*.gz \ - exp/tri2b/decode_nosp_tgpr_dev93_tg_mbr -local/score_mbr.sh --cmd "$decode_cmd" \ - data/test_dev93/ data/lang_nosp_test_tgpr/ \ - exp/tri2b/decode_nosp_tgpr_dev93_tg_mbr - -# This script trains a delta+delta-delta system. It's not really recommended or + # Now make MFCC features. + # mfccdir should be some place with a largish disk where you + # want to store MFCC features. + + for x in test_eval92 test_eval93 test_dev93 train_si284; do + steps/make_mfcc.sh --cmd "$train_cmd" --nj 20 data/$x || exit 1; + steps/compute_cmvn_stats.sh data/$x || exit 1; + done + + utils/subset_data_dir.sh --first data/train_si284 7138 data/train_si84 || exit 1 + + # Now make subset with the shortest 2k utterances from si-84. + utils/subset_data_dir.sh --shortest data/train_si84 2000 data/train_si84_2kshort || exit 1; + + # Now make subset with half of the data from si-84. + utils/subset_data_dir.sh data/train_si84 3500 data/train_si84_half || exit 1; +fi + + +if [ $stage -le 1 ]; then + # monophone + + + # Note: the --boost-silence option should probably be omitted by default + # for normal setups. It doesn't always help. [it's to discourage non-silence + # models from modeling silence.] 
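# (Since this version of run.sh takes --stage, --train and --decode options,
# parsed by utils/parse_options.sh near the top, individual parts can be
# re-run selectively.  For example, to redo only the decoding from the
# monophone stage onwards without retraining (values illustrative):
#   ./run.sh --stage 1 --train false --decode true
# )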
+ if $train; then + steps/train_mono.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \ + data/train_si84_2kshort data/lang_nosp exp/mono0a || exit 1; + fi + + if $decode; then + utils/mkgraph.sh data/lang_nosp_test_tgpr exp/mono0a exp/mono0a/graph_nosp_tgpr && \ + steps/decode.sh --nj 10 --cmd "$decode_cmd" exp/mono0a/graph_nosp_tgpr \ + data/test_dev93 exp/mono0a/decode_nosp_tgpr_dev93 && \ + steps/decode.sh --nj 8 --cmd "$decode_cmd" exp/mono0a/graph_nosp_tgpr \ + data/test_eval92 exp/mono0a/decode_nosp_tgpr_eval92 + fi +fi + +if [ $stage -le 2 ]; then + # tri1 + if $train; then + steps/align_si.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \ + data/train_si84_half data/lang_nosp exp/mono0a exp/mono0a_ali || exit 1; + + steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" 2000 10000 \ + data/train_si84_half data/lang_nosp exp/mono0a_ali exp/tri1 || exit 1; + fi + + if $decode; then + utils/mkgraph.sh data/lang_nosp_test_tgpr \ + exp/tri1 exp/tri1/graph_nosp_tgpr || exit 1; + + for data in dev93 eval92; do + nspk=$(wc -l " data/local/lang_tmp data/lang || exit 1; - -for lm_suffix in bg bg_5k tg tg_5k tgpr tgpr_5k; do - mkdir -p data/lang_test_${lm_suffix} - cp -r data/lang/* data/lang_test_${lm_suffix}/ || exit 1; - rm -rf data/lang_test_${lm_suffix}/tmp - cp data/lang_nosp_test_${lm_suffix}/G.* data/lang_test_${lm_suffix}/ -done - -# Silprob for larger lexicon. -utils/dict_dir_add_pronprobs.sh --max-normalize true \ - data/local/dict_nosp_larger \ - exp/tri4b/pron_counts_nowb.txt exp/tri4b/sil_counts_nowb.txt \ - exp/tri4b/pron_bigram_counts_nowb.txt data/local/dict_larger || exit 1 - -utils/prepare_lang.sh data/local/dict_larger \ - "" data/local/lang_tmp_larger data/lang_bd || exit 1; - -for lm_suffix in tgpr tgconst tg fgpr fgconst fg; do - mkdir -p data/lang_test_bd_${lm_suffix} - cp -r data/lang_bd/* data/lang_test_bd_${lm_suffix}/ || exit 1; - rm -rf data/lang_test_bd_${lm_suffix}/tmp - cp data/lang_nosp_test_bd_${lm_suffix}/G.* data/lang_test_bd_${lm_suffix}/ -done - -( - utils/mkgraph.sh data/lang_test_tgpr exp/tri4b exp/tri4b/graph_tgpr || exit 1; - steps/decode_fmllr.sh --nj 10 --cmd "$decode_cmd" \ - exp/tri4b/graph_tgpr data/test_dev93 exp/tri4b/decode_tgpr_dev93 || exit 1; - steps/decode_fmllr.sh --nj 8 --cmd "$decode_cmd" \ - exp/tri4b/graph_tgpr data/test_eval92 exp/tri4b/decode_tgpr_eval92 || exit 1; - - utils/mkgraph.sh data/lang_test_bd_tgpr \ - exp/tri4b exp/tri4b/graph_bd_tgpr || exit 1; - steps/decode_fmllr.sh --nj 10 --cmd "$decode_cmd" \ - exp/tri4b/graph_bd_tgpr data/test_dev93 \ - exp/tri4b/decode_bd_tgpr_dev93 || exit 1; - steps/decode_fmllr.sh --nj 8 --cmd "$decode_cmd" \ - exp/tri4b/graph_bd_tgpr data/test_eval92 \ - exp/tri4b/decode_bd_tgpr_eval92 || exit 1; -) & +if [ $stage -le 4 ]; then + # From 2b system, train 3b which is LDA + MLLT + SAT. + + # Align tri2b system with all the si284 data. + if $train; then + steps/align_si.sh --nj 10 --cmd "$train_cmd" \ + data/train_si284 data/lang_nosp exp/tri2b exp/tri2b_ali_si284 || exit 1; + + steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \ + data/train_si284 data/lang_nosp exp/tri2b_ali_si284 exp/tri3b || exit 1; + fi + + if $decode; then + utils/mkgraph.sh data/lang_nosp_test_tgpr \ + exp/tri3b exp/tri3b/graph_nosp_tgpr || exit 1; + + # the larger dictionary ("big-dict"/bd) + locally produced LM. 
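The per-test-set decode loops of the staged script are truncated above ("nspk=$(wc -l ..."); they all follow the same pattern, capping the number of decode jobs at the number of speakers in each test set. A minimal sketch for the tri3b system, assuming the usual Kaldi data layout; the big-dict graph built just below is decoded the same way:

for data in dev93 eval92; do
  nspk=$(wc -l <data/test_${data}/spk2utt)   # at most one decode job per speaker
  steps/decode_fmllr.sh --nj ${nspk} --cmd "$decode_cmd" \
    exp/tri3b/graph_nosp_tgpr data/test_${data} \
    exp/tri3b/decode_nosp_tgpr_${data} || exit 1
done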
+ utils/mkgraph.sh data/lang_nosp_test_bd_tgpr \ + exp/tri3b exp/tri3b/graph_nosp_bd_tgpr || exit 1; + + # At this point you could run the command below; this gets + # results that demonstrate the basis-fMLLR adaptation (adaptation + # on small amounts of adaptation data). + # local/run_basis_fmllr.sh --lang-suffix "_nosp" + + for data in dev93 eval92; do + nspk=$(wc -l " data/local/lang_tmp data/lang || exit 1; + + for lm_suffix in bg bg_5k tg tg_5k tgpr tgpr_5k; do + mkdir -p data/lang_test_${lm_suffix} + cp -r data/lang/* data/lang_test_${lm_suffix}/ || exit 1; + rm -rf data/lang_test_${lm_suffix}/tmp + cp data/lang_nosp_test_${lm_suffix}/G.* data/lang_test_${lm_suffix}/ + done + + # Silprob for larger ("bd") lexicon. + utils/dict_dir_add_pronprobs.sh --max-normalize true \ + data/local/dict_nosp_larger \ + exp/tri3b/pron_counts_nowb.txt exp/tri3b/sil_counts_nowb.txt \ + exp/tri3b/pron_bigram_counts_nowb.txt data/local/dict_larger || exit 1 + + utils/prepare_lang.sh data/local/dict_larger \ + "" data/local/lang_tmp_larger data/lang_bd || exit 1; + + for lm_suffix in tgpr tgconst tg fgpr fgconst fg; do + mkdir -p data/lang_test_bd_${lm_suffix} + cp -r data/lang_bd/* data/lang_test_bd_${lm_suffix}/ || exit 1; + rm -rf data/lang_test_bd_${lm_suffix}/tmp + cp data/lang_nosp_test_bd_${lm_suffix}/G.* data/lang_test_bd_${lm_suffix}/ + done +fi + + +if [ $stage -le 6 ]; then + # From 3b system, now using data/lang as the lang directory (we have now added + # pronunciation and silence probabilities), train another SAT system (tri4b). + + if $train; then + steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \ + data/train_si284 data/lang exp/tri3b exp/tri4b || exit 1; + fi + + if $decode; then + utils/mkgraph.sh data/lang_test_tgpr \ + exp/tri4b exp/tri4b/graph_tgpr || exit 1; + utils/mkgraph.sh data/lang_test_bd_tgpr \ + exp/tri4b exp/tri4b/graph_bd_tgpr || exit 1; + + for data in dev93 eval92; do + nspk=$(wc -l " - echo " e.g.: steps/mixup.sh 20000 data/train_si84 data/lang exp/tri3b exp/tri3b_20k" - echo "main options (for others, see top of script file)" - echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." - echo " --config # config containing options" - echo " --stage # stage to do partial re-run from." - exit 1; -fi - -numgauss=$1 -data=$2 -lang=$3 -srcdir=$4 -dir=$5 - -for f in $data/feats.scp $srcdir/final.mdl $srcdir/final.mat; do - [ ! -f $f ] && echo "mixup_lda_etc.sh: no such file $f" && exit 1; -done - -nj=`cat $srcdir/num_jobs` || exit 1; -sdata=$data/split$nj; - -splice_opts=`cat $srcdir/splice_opts 2>/dev/null` -cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` - -mkdir -p $dir/log -cp $srcdir/splice_opts $dir 2>/dev/null -cp $srcdir/cmvn_opts $dir 2>/dev/null -cp $srcdir/final.mat $dir -echo $nj > $dir/num_jobs -[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; - -utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1; -cp $lang/phones.txt $dir || exit 1; - -cp $srcdir/tree $dir - - -## Set up features. 
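In the deleted steps/mixup.sh below (as in other steps/ scripts), features are set up as a piped rspecifier string rather than materialized on disk. A minimal sketch of that idiom, assuming an LDA+MLLT system in exp/tri2b and data already split into data/train_si84/split10; feat-to-dim simply prints the dimension of the resulting features:

sdata=data/train_si84/split10
feats="ark,s,cs:apply-cmvn --utt2spk=ark:$sdata/1/utt2spk scp:$sdata/1/cmvn.scp scp:$sdata/1/feats.scp ark:- |"
feats="$feats splice-feats --left-context=3 --right-context=3 ark:- ark:- |"
feats="$feats transform-feats exp/tri2b/final.mat ark:- ark:- |"
feat-to-dim "$feats" -    # typically 40 after the LDA+MLLT transform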
-if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi -echo "$0: feature type is $feat_type" - -case $feat_type in - delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; - lda) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" - cp $srcdir/final.mat $dir - ;; - *) echo "Invalid feature type $feat_type" && exit 1; -esac -if [ -f $srcdir/trans.1 ]; then - echo Using transforms from $srcdir; - rm $dir/trans.* 2>/dev/null - ln.pl $srcdir/trans.* $dir # Link those transforms to current directory. - feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |" -else - feats="$sifeats" -fi -## Done setting up features. - -rm $dir/fsts.*.gz 2>/dev/null -ln.pl $srcdir/fsts.*.gz $dir # Link training-graph FSTs to current directory. - -## Mix up old model -if [ $stage -le 0 ]; then - echo Mixing up old model to $numgauss Gaussians -# Note: this script also works for mixing down. - $cmd $dir/log/mixup.log \ - gmm-mixup --mix-up=$numgauss --mix-down=$numgauss \ - $srcdir/final.mdl $srcdir/final.occs $dir/1.mdl || exit 1; -fi -## Done. - -cur_alidir=$srcdir # dir to find alignments. -[ -z "$realign_iters" ] && ln.pl $srcdir/ali.*.gz $dir; # link alignments, if - # we won't be generating them. - -x=1 -while [ $x -le $num_iters ]; do - echo "$0: iteration $x" - if echo $realign_iters | grep -w $x >/dev/null; then - if [ $stage -le $x ]; then - echo "$0: realigning data" - mdl="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $dir/$x.mdl - |" - $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \ - gmm-align-compiled $scale_opts --beam=10 --retry-beam=40 "$mdl" \ - "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \ - "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; - fi - cur_alidir=$dir - fi - if [ $stage -le $x ]; then - echo "$0: accumulating statistics" - $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \ - gmm-acc-stats-ali $dir/$x.mdl "$feats" \ - "ark,s,cs:gunzip -c $cur_alidir/ali.JOB.gz|" $dir/$x.JOB.acc || exit 1; - echo "$0: re-estimating model" - [ "`ls $dir/$x.*.acc | wc -w`" -ne $nj ] && echo "$0: wrong #accs" && exit 1; - $cmd $dir/log/update.$x.log \ - gmm-est --write-occs=$dir/$[$x+1].occs $dir/$x.mdl \ - "gmm-sum-accs - $dir/$x.*.acc |" $dir/$[$x+1].mdl || exit 1; - rm $dir/$x.mdl $dir/$x.*.acc - rm $dir/$x.occs 2>/dev/null - fi - x=$[$x+1] -done - -rm $dir/final.mdl $dir/final.occs 2>/dev/null -ln -s $x.mdl $dir/final.mdl -ln -s $x.occs $dir/final.occs - -if [ -f $dir/trans.1 ]; then - echo "$0: accumulating stats for alignment model." - $cmd JOB=1:$nj $dir/log/acc_alimdl.JOB.log \ - ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \ - gmm-acc-stats-twofeats $dir/$x.mdl "$feats" "$sifeats" \ - ark,s,cs:- $dir/$x.JOB.acc || exit 1; - [ "`ls $dir/$x.*.acc | wc -w`" -ne $nj ] && echo "$0: wrong #accs" && exit 1; - echo "$0: Re-estimating alignment model." 
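# Descriptive note on the step below (hedged): gmm-acc-stats-twofeats above takes the
# alignment posteriors piped in from ali-to-post, computes the Gaussian-level posteriors
# with the speaker-adapted features "$feats", but accumulates the statistics on the
# speaker-independent "$sifeats". gmm-est then re-estimates the model from those stats to
# produce the alignment model ($x.alimdl -> final.alimdl), which can align new speakers
# before any fMLLR transform exists; --remove-low-count-gaussians=false keeps the per-pdf
# Gaussian counts the same as in final.mdl.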
- $cmd $dir/log/est_alimdl.log \ - gmm-est --write-occs=$dir/final.occs --remove-low-count-gaussians=false $dir/$x.mdl \ - "gmm-sum-accs - $dir/$x.*.acc|" $dir/$x.alimdl || exit 1; - rm $dir/$x.*.acc - rm $dir/final.alimdl 2>/dev/null - ln -s $x.alimdl $dir/final.alimdl -fi - -utils/summarize_warnings.pl $dir/log - -echo Done diff --git a/egs/wsj/s5/utils/fix_data_dir.sh b/egs/wsj/s5/utils/fix_data_dir.sh index 0333d628544..bb8efd56ab8 100755 --- a/egs/wsj/s5/utils/fix_data_dir.sh +++ b/egs/wsj/s5/utils/fix_data_dir.sh @@ -22,12 +22,13 @@ mkdir -p $data/.backup [ ! -f $data/utt2spk ] && echo "$0: no such file $data/utt2spk" && exit 1; +set -e -o pipefail -u + tmpdir=$(mktemp -d /tmp/kaldi.XXXX); trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM export LC_ALL=C - function check_sorted { file=$1 sort -k1,1 -u <$file >$file.tmp @@ -54,8 +55,8 @@ function filter_file { cp $file_to_filter ${file_to_filter}.tmp utils/filter_scp.pl $filter ${file_to_filter}.tmp > $file_to_filter if ! cmp ${file_to_filter}.tmp $file_to_filter >&/dev/null; then - length1=`cat ${file_to_filter}.tmp | wc -l` - length2=`cat ${file_to_filter} | wc -l` + length1=$(cat ${file_to_filter}.tmp | wc -l) + length2=$(cat ${file_to_filter} | wc -l) if [ $length1 -ne $length2 ]; then echo "$0: filtered $file_to_filter from $length1 to $length2 lines based on filter $filter." fi @@ -77,7 +78,7 @@ function filter_recordings { exit 1; fi awk '{print $2}' < $data/segments | sort | uniq > $tmpdir/recordings - n1=`cat $tmpdir/recordings | wc -l` + n1=$(cat $tmpdir/recordings | wc -l) [ ! -s $tmpdir/recordings ] && \ echo "Empty list of recordings (bad file $data/segments)?" && exit 1; utils/filter_scp.pl $data/wav.scp $tmpdir/recordings > $tmpdir/recordings.tmp diff --git a/egs/wsj/s5/utils/mkgraph.sh b/egs/wsj/s5/utils/mkgraph.sh index 42204b85e7d..65ff3c3c79d 100755 --- a/egs/wsj/s5/utils/mkgraph.sh +++ b/egs/wsj/s5/utils/mkgraph.sh @@ -75,7 +75,7 @@ fi N=$(tree-info $tree | grep "context-width" | cut -d' ' -f2) || { echo "Error when getting context-width"; exit 1; } P=$(tree-info $tree | grep "central-position" | cut -d' ' -f2) || { echo "Error when getting central-position"; exit 1; } -[[ -f $2/frame_subsampling_factor && $loopscale != 1.0 ]] && \ +[[ -f $2/frame_subsampling_factor && "$loopscale" == "0.1" ]] && \ echo "$0: WARNING: chain models need '--self-loop-scale 1.0'"; mkdir -p $lang/tmp diff --git a/egs/wsj/s5/utils/validate_data_dir.sh b/egs/wsj/s5/utils/validate_data_dir.sh index 49c929207b9..58e51a75aef 100755 --- a/egs/wsj/s5/utils/validate_data_dir.sh +++ b/egs/wsj/s5/utils/validate_data_dir.sh @@ -132,7 +132,7 @@ if [ -f $data/wav.scp ]; then check_sorted_and_uniq $data/segments # We have a segments file -> interpret wav file as "recording-ids" not utterance-ids. ! cat $data/segments | \ - awk '{if (NF != 4 || ($4 <= $3 && $4 != -1)) { print "Bad line in segments file", $0; exit(1); }}' && \ + awk '{if (NF != 4 || $4 <= $3) { print "Bad line in segments file", $0; exit(1); }}' && \ echo "$0: badly formatted segments file" && exit 1; segments_len=`cat $data/segments | wc -l`
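The segments check in the hunk above was tightened: an end time of -1 is no longer accepted, so every segment must have a strictly positive duration. A minimal sketch of the check in isolation, with hypothetical utterance and recording ids (the second line is rejected because its end time equals its start time):

# segments format: <utterance-id> <recording-id> <segment-begin> <segment-end>
printf '%s\n' \
  "utt1 rec1 0.00 4.50" \
  "utt2 rec1 4.50 4.50" > /tmp/segments.demo
awk '{if (NF != 4 || $4 <= $3) { print "Bad line in segments file", $0; exit(1); }}' \
  /tmp/segments.demo || echo "validation failed (utt2 has end <= start)"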