Commit
[scripts,egs] Replace LDA layer with delta and delta-delta features (#3490)

WER is about the same but this is simpler to implement and more standard.
Showing 8 changed files with 992 additions and 4 deletions.
@@ -1 +1 @@
-tuning/run_tdnn_1i.sh
+tuning/run_tdnn_1j.sh
egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1j.sh (new file: 301 additions, 0 deletions)
@@ -0,0 +1,301 @@
#!/bin/bash

# 1j is as 1i but replaces the LDA layer at the input of the
# network with delta and delta-delta features.
# Below, 1j2 is just a different rerun of 1j with different
# --affix option, to give some idea of the run-to-run variation.
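# (i.e. a rerun along the lines of: local/chain/tuning/run_tdnn_1j.sh --affix 1j2;
#  the exact invocation is illustrative.)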

# local/chain/compare_wer.sh --online exp/chain/tdnn1i_sp exp/chain/tdnn1j_sp exp/chain/tdnn1j2_sp
# System                    tdnn1i_sp  tdnn1j_sp  tdnn1j2_sp
#WER dev_clean_2 (tgsmall)      10.73      10.78       10.79
#             [online:]         10.81      10.80       10.80
#WER dev_clean_2 (tglarge)       7.54       7.43        7.54
#             [online:]          7.59       7.37        7.55
# Final train prob             -0.0621    -0.0620     -0.0615
# Final valid prob             -0.0787    -0.0787     -0.0785
# Final train prob (xent)      -1.4603    -1.4363     -1.4438
# Final valid prob (xent)      -1.5847    -1.5629     -1.5712
# Num-params                   5210944    5210944     5210944

# steps/info/chain_dir_info.pl exp/chain/tdnn1j_sp
# exp/chain/tdnn1j_sp: num-iters=34 nj=2..5 num-params=5.2M dim=40+100->2336 combine=-0.049->-0.046 (over 4) xent:train/valid[21,33,final]=(-1.40,-1.18,-1.17/-1.56,-1.38,-1.37) logprob:train/valid[21,33,final]=(-0.064,-0.052,-0.047/-0.089,-0.085,-0.078)
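# (roughly: dim=40+100->2336 is the 40-dim MFCC input plus the 100-dim ivector mapping to
#  2336 output pdfs from the tree; 'xent' and 'logprob' are the cross-entropy and chain
#  objectives on train/valid examples at iterations 21, 33 and 'final'.)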

# Set -e here so that we catch if any executable fails immediately
set -euo pipefail

# First the options that are passed through to run_ivector_common.sh
# (some of which are also used in this script directly).
stage=0
decode_nj=10
train_set=train_clean_5
test_sets=dev_clean_2
gmm=tri3b
nnet3_affix=

# The rest are configs specific to this script. Most of the parameters
# are just hardcoded at this level, in the commands below.
affix=1j   # affix for the TDNN directory name
tree_affix=
train_stage=-10
get_egs_stage=-10
decode_iter=

# training options
# training chunk-options
chunk_width=140,100,160
common_egs_dir=
xent_regularize=0.1

# training options
srand=0
remove_egs=true
reporting_email=

# decode options
test_online_decoding=true  # if true, it will run the last decoding stage.


# End configuration section.
echo "$0 $@"  # Print the command line for logging

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

if ! cuda-compiled; then
  cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF
fi

# The iVector-extraction and feature-dumping parts are the same as the standard
# nnet3 setup, and you can skip them by setting "--stage 11" if you have already
# run those things.
local/nnet3/run_ivector_common.sh --stage $stage \
  --train-set $train_set \
  --gmm $gmm \
  --nnet3-affix "$nnet3_affix" || exit 1;

# Problem: We have removed the "train_" prefix of our training set in
# the alignment directory names! Bad!
gmm_dir=exp/$gmm
ali_dir=exp/${gmm}_ali_${train_set}_sp
tree_dir=exp/chain${nnet3_affix}/tree_sp${tree_affix:+_$tree_affix}
lang=data/lang_chain
lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_lats
dir=exp/chain${nnet3_affix}/tdnn${affix}_sp
train_data_dir=data/${train_set}_sp_hires
lores_train_data_dir=data/${train_set}_sp
train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires

for f in $gmm_dir/final.mdl $train_data_dir/feats.scp $train_ivector_dir/ivector_online.scp \
    $lores_train_data_dir/feats.scp $ali_dir/ali.1.gz; do
  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1
done

if [ $stage -le 10 ]; then
  echo "$0: creating lang directory $lang with chain-type topology"
  # Create a version of the lang/ directory that has one state per phone in the
  # topo file. [note, it really has two states.. the first one is only repeated
  # once, the second one has zero or more repeats.]
  if [ -d $lang ]; then
    if [ $lang/L.fst -nt data/lang/L.fst ]; then
      echo "$0: $lang already exists, not overwriting it; continuing"
    else
      echo "$0: $lang already exists and seems to be older than data/lang..."
      echo " ... not sure what to do. Exiting."
      exit 1;
    fi
  else
    cp -r data/lang $lang
    silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
    nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
    # Use our special topology... note that later on we may have to tune this
    # topology.
    steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
  fi
fi

if [ $stage -le 11 ]; then
  # Get the alignments as lattices (gives the chain training more freedom).
  # use the same num-jobs as the alignments
  steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \
    data/lang $gmm_dir $lat_dir
  rm $lat_dir/fsts.*.gz # save space
fi

if [ $stage -le 12 ]; then
  # Build a tree using our new topology. We know we have alignments for the
  # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use
  # those. The num-leaves is always somewhat less than the num-leaves from
  # the GMM baseline.
  if [ -f $tree_dir/final.mdl ]; then
    echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it."
    exit 1;
  fi
  steps/nnet3/chain/build_tree.sh \
    --frame-subsampling-factor 3 \
    --context-opts "--context-width=2 --central-position=1" \
    --cmd "$train_cmd" 3500 ${lores_train_data_dir} \
    $lang $ali_dir $tree_dir
fi


if [ $stage -le 13 ]; then
  mkdir -p $dir
  echo "$0: creating neural net configs using the xconfig parser";

  num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}')
  learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)
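  # (with the default xent_regularize=0.1 this gives learning_rate_factor=5.0, which
  # boosts the learning rate of the xent output layer, roughly compensating for its
  # objective being down-weighted by xent_regularize.)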

  tdnn_opts="l2-regularize=0.03"
  tdnnf_opts="l2-regularize=0.03 bypass-scale=0.66"
  linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0"
  prefinal_opts="l2-regularize=0.03"
  output_opts="l2-regularize=0.015"

  mkdir -p $dir/configs
  cat <<EOF > $dir/configs/network.xconfig
  input dim=100 name=ivector
  input dim=40 name=input
  # this takes the MFCCs and generates filterbank coefficients. The MFCCs
  # are more compressible so we prefer to dump the MFCCs to disk rather
  # than filterbanks.
  idct-layer name=idct input=input dim=40 cepstral-lifter=22 affine-transform-file=$dir/configs/idct.mat
  batchnorm-component name=batchnorm0 input=idct
  spec-augment-layer name=spec-augment freq-max-proportion=0.5 time-zeroed-proportion=0.2 time-mask-max-frames=20
  delta-layer name=delta input=spec-augment
  no-op-component name=input2 input=Append(delta, Scale(0.4, ReplaceIndex(ivector, t, 0)))
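  # (the delta-layer appends delta and delta-delta features, so with a 40-dim input its
  # output should be 3 x 40 = 120 dims; appending the scaled 100-dim ivector makes input2
  # 220-dimensional.)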
  # the first splicing is moved before the lda layer, so no splicing here
  relu-batchnorm-layer name=tdnn1 $tdnn_opts dim=768 input=input2
  tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1
  tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1
  tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1
  tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0
  tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3
  tdnnf-layer name=tdnnf7 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3
  tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3
  tdnnf-layer name=tdnnf9 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3
  tdnnf-layer name=tdnnf10 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3
  tdnnf-layer name=tdnnf11 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3
  tdnnf-layer name=tdnnf12 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3
  tdnnf-layer name=tdnnf13 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3
  linear-component name=prefinal-l dim=192 $linear_opts
  ## adding the layers for chain branch
  prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768
  output-layer name=output include-log-softmax=false dim=$num_targets $output_opts
  # adding the layers for xent branch
  prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768
  output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts
EOF
  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
fi


if [ $stage -le 14 ]; then
  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
    utils/create_split_dir.pl \
      /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage
  fi

  steps/nnet3/chain/train.py --stage=$train_stage \
    --cmd="$decode_cmd" \
    --feat.online-ivector-dir=$train_ivector_dir \
    --feat.cmvn-opts="--norm-means=false --norm-vars=false" \
    --chain.xent-regularize $xent_regularize \
    --chain.leaky-hmm-coefficient=0.1 \
    --chain.l2-regularize=0.0 \
    --chain.apply-deriv-weights=false \
    --chain.lm-opts="--num-extra-lm-states=2000" \
    --trainer.add-option="--optimization.memory-compression-level=2" \
    --trainer.srand=$srand \
    --trainer.max-param-change=2.0 \
    --trainer.num-epochs=20 \
    --trainer.frames-per-iter=3000000 \
    --trainer.optimization.num-jobs-initial=2 \
    --trainer.optimization.num-jobs-final=5 \
    --trainer.optimization.initial-effective-lrate=0.002 \
    --trainer.optimization.final-effective-lrate=0.0002 \
    --trainer.num-chunk-per-minibatch=128,64 \
    --egs.chunk-width=$chunk_width \
    --egs.dir="$common_egs_dir" \
    --egs.opts="--frames-overlap-per-eg 0" \
    --cleanup.remove-egs=$remove_egs \
    --use-gpu=true \
    --reporting.email="$reporting_email" \
    --feat-dir=$train_data_dir \
    --tree-dir=$tree_dir \
    --lat-dir=$lat_dir \
    --dir=$dir || exit 1;
fi

if [ $stage -le 15 ]; then
  # Note: it's not important to give mkgraph.sh the lang directory with the
  # matched topology (since it gets the topology file from the model).
  utils/mkgraph.sh \
    --self-loop-scale 1.0 data/lang_test_tgsmall \
    $tree_dir $tree_dir/graph_tgsmall || exit 1;
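  # (--self-loop-scale 1.0 here, like the acwt 1.0 / post-decode-acwt 10.0 in the decoding
  # stages below, is the usual setting for 'chain' models.)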
fi

if [ $stage -le 16 ]; then
  frames_per_chunk=$(echo $chunk_width | cut -d, -f1)
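  # (cut -d, -f1 takes the first value of chunk_width, i.e. 140 with the default 140,100,160)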
  rm $dir/.error 2>/dev/null || true

  for data in $test_sets; do
    (
      nspk=$(wc -l <data/${data}_hires/spk2utt)
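      # spk2utt has one line per speaker, so this runs one decoding job per speaker.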
      steps/nnet3/decode.sh \
        --acwt 1.0 --post-decode-acwt 10.0 \
        --frames-per-chunk $frames_per_chunk \
        --nj $nspk --cmd "$decode_cmd" --num-threads 4 \
        --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${data}_hires \
        $tree_dir/graph_tgsmall data/${data}_hires ${dir}/decode_tgsmall_${data} || exit 1
      steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
        data/lang_test_{tgsmall,tglarge} \
        data/${data}_hires ${dir}/decode_{tgsmall,tglarge}_${data} || exit 1
    ) || touch $dir/.error &
  done
  wait
  [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1
fi

# Not testing the 'looped' decoding separately, because for
# TDNN systems it would give exactly the same results as the
# normal decoding.

if $test_online_decoding && [ $stage -le 17 ]; then
  # note: if the features change (e.g. you add pitch features), you will have to
  # change the options of the following command line.
  steps/online/nnet3/prepare_online_decoding.sh \
    --mfcc-config conf/mfcc_hires.conf \
    $lang exp/nnet3${nnet3_affix}/extractor ${dir} ${dir}_online

  rm $dir/.error 2>/dev/null || true

  for data in $test_sets; do
    (
      nspk=$(wc -l <data/${data}_hires/spk2utt)
      # note: we just give it "data/${data}" as it only uses the wav.scp, the
      # feature type does not matter.
      steps/online/nnet3/decode.sh \
        --acwt 1.0 --post-decode-acwt 10.0 \
        --nj $nspk --cmd "$decode_cmd" \
        $tree_dir/graph_tgsmall data/${data} ${dir}_online/decode_tgsmall_${data} || exit 1
      steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
        data/lang_test_{tgsmall,tglarge} \
        data/${data}_hires ${dir}_online/decode_{tgsmall,tglarge}_${data} || exit 1
    ) || touch $dir/.error &
  done
  wait
  [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1
fi


exit 0;
@@ -1 +1 @@
-tuning/run_tdnn_7q.sh
+tuning/run_tdnn_7r.sh