From 246b9825340fdd5968bd81cd9d514fcf4d799064 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Thu, 3 Jan 2019 18:17:12 -0500 Subject: [PATCH 01/18] adding extra corpus for LM, minor cleaning --- .../v1/local/chain/tuning/run_e2e_cnn_1a.sh | 50 ++++--------------- egs/madcat_ar/v1/local/train_lm.sh | 18 ++++--- egs/madcat_ar/v1/run_end2end.sh | 41 ++++++++------- 3 files changed, 45 insertions(+), 64 deletions(-) diff --git a/egs/madcat_ar/v1/local/chain/tuning/run_e2e_cnn_1a.sh b/egs/madcat_ar/v1/local/chain/tuning/run_e2e_cnn_1a.sh index 2891e50da9e..dc6ed4b7c2b 100755 --- a/egs/madcat_ar/v1/local/chain/tuning/run_e2e_cnn_1a.sh +++ b/egs/madcat_ar/v1/local/chain/tuning/run_e2e_cnn_1a.sh @@ -3,24 +3,24 @@ # This script does end2end chain training (i.e. from scratch) -# local/chain/compare_wer.sh exp/chain/e2e_cnn_1a +# ./local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/ # System e2e_cnn_1a -# WER 7.81 -# CER 2.05 -# Final train prob -0.0812 -# Final valid prob -0.0708 +# WER 5.73 +# WER (rescored) 5.67 +# CER 1.45 +# CER (rescored) 1.42 +# Final train prob -0.0934 +# Final valid prob -0.0746 # Final train prob (xent) # Final valid prob (xent) # Parameters 2.94M # steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a/ -# exp/chain/e2e_cnn_1a/: num-iters=98 nj=6..16 num-params=2.9M dim=40->330 combine=-0.073->-0.073 (over 2) logprob:train/valid[64,97,final]=(-0.084,-0.080,-0.081/-0.073,-0.070,-0.071) - +# exp/chain/e2e_cnn_1a/: num-iters=98 nj=6..16 num-params=2.9M dim=40->330 combine=-0.071->-0.070 (over 5) logprob:train/valid[64,97,final]=(-0.089,-0.084,-0.093/-0.075,-0.073,-0.075) set -e # configs for 'chain' stage=0 -nj=70 train_stage=-10 get_egs_stage=-10 affix=1a @@ -31,9 +31,6 @@ minibatch_size=150=128,64/300=128,64/600=64,32/1200=32,16 common_egs_dir= cmvn_opts="--norm-means=false --norm-vars=false" train_set=train -lang_decode=data/lang -lang_rescore=data/lang_rescore_6g - # End configuration section. echo "$0 $@" # Print the command line for logging @@ -67,7 +64,7 @@ if [ $stage -le 0 ]; then fi if [ $stage -le 1 ]; then - steps/nnet3/chain/e2e/prepare_e2e.sh --nj $nj --cmd "$cmd" \ + steps/nnet3/chain/e2e/prepare_e2e.sh --nj 30 --cmd "$cmd" \ --shared-phones true \ --type mono \ data/$train_set $lang $treedir @@ -107,9 +104,6 @@ EOF fi if [ $stage -le 3 ]; then - # no need to store the egs in a shared storage because we always - # remove them. Anyway, it takes only 5 minutes to generate them. - steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ --cmd "$cmd" \ --feat.cmvn-opts "$cmvn_opts" \ @@ -138,29 +132,3 @@ if [ $stage -le 3 ]; then --tree-dir $treedir \ --dir $dir || exit 1; fi - -if [ $stage -le 4 ]; then - # The reason we are using data/lang here, instead of $lang, is just to - # emphasize that it's not actually important to give mkgraph.sh the - # lang directory with the matched topology (since it gets the - # topology file from the model). So you could give it a different - # lang directory, one that contained a wordlist and LM of your choice, - # as long as phones.txt was compatible. - - utils/mkgraph.sh \ - --self-loop-scale 1.0 $lang_decode \ - $dir $dir/graph || exit 1; -fi - -if [ $stage -le 5 ]; then - frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --nj $nj --cmd "$cmd" \ - $dir/graph data/test $dir/decode_test || exit 1; - - steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ - data/test $dir/decode_test{,_rescored} || exit 1 -fi - -echo "Done. Date: $(date). 
Results:" -local/chain/compare_wer.sh $dir diff --git a/egs/madcat_ar/v1/local/train_lm.sh b/egs/madcat_ar/v1/local/train_lm.sh index b7fc0b09a46..903b288a834 100755 --- a/egs/madcat_ar/v1/local/train_lm.sh +++ b/egs/madcat_ar/v1/local/train_lm.sh @@ -64,6 +64,12 @@ if [ $stage -le 0 ]; then # we can later fold the dev data into this. cat data/train/text | cut -d " " -f 2- > ${dir}/data/text/train.txt + if [ -d "data/local/gigawordcorpus/arb_gw_5/data" ]; then + cat data/local/gigawordcorpus/arb_gw_5/data/nhr_arb_combined.txt | \ + utils/lang/bpe/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + | sed 's/@@//g' > ${dir}/data/text/corpus_text.txt + fi + # for reporting perplexities, we'll use the "real" dev set. # (the validation data is used as ${dir}/data/text/dev.txt to work # out interpolation weights.) @@ -72,7 +78,7 @@ if [ $stage -le 0 ]; then cut -d " " -f 2- < data/test/text > ${dir}/data/real_dev_set.txt # get the wordlist from MADCAT text - cat ${dir}/data/text/train.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + cat ${dir}/data/text/{train,corpus_text}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count cat ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist fi @@ -83,7 +89,7 @@ if [ $stage -le 1 ]; then # Note: if you have more than one order, use a certain amount of words as the # vocab and want to restrict max memory for 'sort', echo "$0: training the unpruned LM" - min_counts='train=1' + min_counts='corpus_text=2 train=1' wordlist=${dir}/data/wordlist lm_name="`basename ${wordlist}`_${order}" @@ -103,8 +109,8 @@ fi if [ $stage -le 2 ]; then echo "$0: pruning the LM (to larger size)" - # Using 1 million n-grams for a big LM for rescoring purposes. - size=1000000 + # Using 20 million n-grams for a big LM for rescoring purposes. + size=20000000 prune_lm_dir.py --target-num-ngrams=$size --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' @@ -114,9 +120,9 @@ fi if [ $stage -le 3 ]; then echo "$0: pruning the LM (to smaller size)" - # Using 500k n-grams for a smaller LM for graph building. Prune from the + # Using 10 million n-grams for a smaller LM for graph building. Prune from the # bigger-pruned LM, it'll be faster. - size=500000 + size=10000000 prune_lm_dir.py --target-num-ngrams=$size ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity' diff --git a/egs/madcat_ar/v1/run_end2end.sh b/egs/madcat_ar/v1/run_end2end.sh index bb2b4f86db1..62f4eeb7c71 100755 --- a/egs/madcat_ar/v1/run_end2end.sh +++ b/egs/madcat_ar/v1/run_end2end.sh @@ -36,6 +36,8 @@ if [ $stage -le 0 ]; then echo "Exiting with status 1 to avoid data corruption" exit 1; fi + + echo "$0: preparing data...$(date)" local/prepare_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \ --download_dir2 $download_dir2 --download_dir3 $download_dir3 \ --use_extra_corpus_text $use_extra_corpus_text @@ -64,7 +66,7 @@ if [ $stage -le 1 ]; then image/get_image2num_frames.py data/train image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train - for set in test train; do + for set in test dev train; do echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $set. 
$(date)" local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 data/$set steps/compute_cmvn_stats.sh data/$set || exit 1; @@ -99,28 +101,33 @@ if [ $stage -le 2 ]; then fi if [ $stage -le 3 ]; then + echo "$0: Calling the flat-start chain recipe... $(date)." + local/chain/run_e2e_cnn.sh +fi + +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g +decode_e2e=true +if [ $stage -le 4 ]; then echo "$0: Estimating a language model for decoding..." local/train_lm.sh utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_big.arpa.gz \ - data/local/dict/lexicon.txt data/lang + data/local/dict/lexicon.txt $lang_decode utils/build_const_arpa_lm.sh data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \ - data/lang data/lang_rescore_6g + data/lang $lang_rescore fi -if [ $stage -le 4 ]; then - echo "$0: Calling the flat-start chain recipe... $(date)." - local/chain/run_e2e_cnn.sh --nj $nj -fi +if [ $stage -le 5 ] && $decode_e2e; then + echo "$0: $(date) stage 5: decoding end2end setup..." + utils/mkgraph.sh --self-loop-scale 1.0 $lang_decode \ + exp/chain/e2e_cnn_1a/ exp/chain/e2e_cnn_1a/graph || exit 1; -if [ $stage -le 5 ]; then - echo "$0: Aligning the training data using the e2e chain model...$(date)." - steps/nnet3/align.sh --nj $nj --cmd "$cmd" \ - --use-gpu false \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ - data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train -fi + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 --nj $nj --cmd "$cmd" \ + exp/chain/e2e_cnn_1a/graph data/test exp/chain/e2e_cnn_1a/decode_test || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test exp/chain/e2e_cnn_1a/decode_test{,_rescored} || exit 1 -if [ $stage -le 6 ]; then - echo "$0: Building a tree and training a regular chain model using the e2e alignments...$(date)" - local/chain/run_cnn_e2eali.sh --nj $nj + echo "$0: Done. Date: $(date). 
Results:" + local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/ fi From ff212a05a82c168cb40e8e3a60b356acda50d44c Mon Sep 17 00:00:00 2001 From: aarora8 Date: Tue, 8 Jan 2019 23:23:43 -0500 Subject: [PATCH 02/18] making BPE and word-based setups similar --- egs/iam/v1/local/augment_data.sh | 34 +++ egs/iam/v1/local/chain/run_cnn.sh | 1 + egs/iam/v1/local/chain/run_cnn_chainali.sh | 1 + egs/iam/v1/local/chain/run_cnn_e2eali.sh | 1 + egs/iam/v1/local/chain/run_cnn_e2eali_1b.sh | 2 +- egs/iam/v1/local/chain/run_e2e_cnn.sh | 1 + .../v1/local/chain/{ => tuning}/run_cnn_1a.sh | 0 .../chain/{ => tuning}/run_cnn_chainali_1a.sh | 0 .../chain/{ => tuning}/run_cnn_chainali_1b.sh | 0 .../chain/{ => tuning}/run_cnn_chainali_1c.sh | 0 .../chain/{ => tuning}/run_cnn_chainali_1d.sh | 0 .../chain/{ => tuning}/run_cnn_e2eali_1a.sh | 0 .../local/chain/tuning/run_cnn_e2eali_1b.sh | 244 +++++++++++++++++ .../local/chain/tuning/run_cnn_e2eali_1c.sh | 251 ++++++++++++++++++ .../run_e2e_cnn_1a.sh} | 81 +++--- egs/iam/v1/local/extract_features.sh | 48 ++++ egs/iam/v1/local/gen_topo.py | 92 +++++++ egs/iam/v1/local/make_features.py | 193 ++++++++++++-- egs/iam/v1/local/prepare_data.sh | 5 +- egs/iam/v1/run_end2end.sh | 53 ++-- 20 files changed, 911 insertions(+), 96 deletions(-) create mode 100755 egs/iam/v1/local/augment_data.sh create mode 120000 egs/iam/v1/local/chain/run_cnn.sh create mode 120000 egs/iam/v1/local/chain/run_cnn_chainali.sh create mode 120000 egs/iam/v1/local/chain/run_cnn_e2eali.sh create mode 120000 egs/iam/v1/local/chain/run_e2e_cnn.sh rename egs/iam/v1/local/chain/{ => tuning}/run_cnn_1a.sh (100%) rename egs/iam/v1/local/chain/{ => tuning}/run_cnn_chainali_1a.sh (100%) rename egs/iam/v1/local/chain/{ => tuning}/run_cnn_chainali_1b.sh (100%) rename egs/iam/v1/local/chain/{ => tuning}/run_cnn_chainali_1c.sh (100%) rename egs/iam/v1/local/chain/{ => tuning}/run_cnn_chainali_1d.sh (100%) rename egs/iam/v1/local/chain/{ => tuning}/run_cnn_e2eali_1a.sh (100%) create mode 100755 egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1b.sh create mode 100755 egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1c.sh rename egs/iam/v1/local/chain/{run_flatstart_cnn1a.sh => tuning/run_e2e_cnn_1a.sh} (68%) create mode 100755 egs/iam/v1/local/extract_features.sh create mode 100755 egs/iam/v1/local/gen_topo.py diff --git a/egs/iam/v1/local/augment_data.sh b/egs/iam/v1/local/augment_data.sh new file mode 100755 index 00000000000..31e4a8217ca --- /dev/null +++ b/egs/iam/v1/local/augment_data.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# Copyright 2018 Hossein Hadian +# 2018 Ashish Arora + +# Apache 2.0 +# This script performs data augmentation. + +nj=4 +cmd=run.pl +feat_dim=40 +echo "$0 $@" + +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh || exit 1; + +srcdir=$1 +outdir=$2 +datadir=$3 +aug_set=aug1 +mkdir -p $datadir/augmentations +echo "copying $srcdir to $datadir/augmentations/$aug_set, allowed length, creating feats.scp" + +for set in $aug_set; do + image/copy_data_dir.sh --spk-prefix $set- --utt-prefix $set- \ + $srcdir $datadir/augmentations/$set + cat $srcdir/allowed_lengths.txt > $datadir/augmentations/$set/allowed_lengths.txt + local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim $feat_dim \ + --fliplr false --augment true $datadir/augmentations/$set +done + +echo " combine original data and data from different augmentations" +utils/combine_data.sh --extra-files images.scp $outdir $srcdir $datadir/augmentations/$aug_set +cat $srcdir/allowed_lengths.txt > $outdir/allowed_lengths.txt diff --git a/egs/iam/v1/local/chain/run_cnn.sh b/egs/iam/v1/local/chain/run_cnn.sh new file mode 120000 index 00000000000..df6f0a468c1 --- /dev/null +++ b/egs/iam/v1/local/chain/run_cnn.sh @@ -0,0 +1 @@ +tuning/run_cnn_1a.sh \ No newline at end of file diff --git a/egs/iam/v1/local/chain/run_cnn_chainali.sh b/egs/iam/v1/local/chain/run_cnn_chainali.sh new file mode 120000 index 00000000000..41b712609c2 --- /dev/null +++ b/egs/iam/v1/local/chain/run_cnn_chainali.sh @@ -0,0 +1 @@ +tuning/run_cnn_chainali_1d.sh \ No newline at end of file diff --git a/egs/iam/v1/local/chain/run_cnn_e2eali.sh b/egs/iam/v1/local/chain/run_cnn_e2eali.sh new file mode 120000 index 00000000000..ad51803ab0e --- /dev/null +++ b/egs/iam/v1/local/chain/run_cnn_e2eali.sh @@ -0,0 +1 @@ +tuning/run_cnn_e2eali_1c.sh \ No newline at end of file diff --git a/egs/iam/v1/local/chain/run_cnn_e2eali_1b.sh b/egs/iam/v1/local/chain/run_cnn_e2eali_1b.sh index 6d8cca876bf..1e3aed66c6b 100755 --- a/egs/iam/v1/local/chain/run_cnn_e2eali_1b.sh +++ b/egs/iam/v1/local/chain/run_cnn_e2eali_1b.sh @@ -23,7 +23,7 @@ nj=30 train_set=train nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. affix=_1b #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. 
-e2echain_model_dir=exp/chain/e2e_cnn_1a +e2echain_model_dir=exp/chain/e2e_cnn_1b common_egs_dir= reporting_email= diff --git a/egs/iam/v1/local/chain/run_e2e_cnn.sh b/egs/iam/v1/local/chain/run_e2e_cnn.sh new file mode 120000 index 00000000000..d26ba0182ce --- /dev/null +++ b/egs/iam/v1/local/chain/run_e2e_cnn.sh @@ -0,0 +1 @@ +tuning/run_e2e_cnn_1a.sh \ No newline at end of file diff --git a/egs/iam/v1/local/chain/run_cnn_1a.sh b/egs/iam/v1/local/chain/tuning/run_cnn_1a.sh similarity index 100% rename from egs/iam/v1/local/chain/run_cnn_1a.sh rename to egs/iam/v1/local/chain/tuning/run_cnn_1a.sh diff --git a/egs/iam/v1/local/chain/run_cnn_chainali_1a.sh b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1a.sh similarity index 100% rename from egs/iam/v1/local/chain/run_cnn_chainali_1a.sh rename to egs/iam/v1/local/chain/tuning/run_cnn_chainali_1a.sh diff --git a/egs/iam/v1/local/chain/run_cnn_chainali_1b.sh b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1b.sh similarity index 100% rename from egs/iam/v1/local/chain/run_cnn_chainali_1b.sh rename to egs/iam/v1/local/chain/tuning/run_cnn_chainali_1b.sh diff --git a/egs/iam/v1/local/chain/run_cnn_chainali_1c.sh b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1c.sh similarity index 100% rename from egs/iam/v1/local/chain/run_cnn_chainali_1c.sh rename to egs/iam/v1/local/chain/tuning/run_cnn_chainali_1c.sh diff --git a/egs/iam/v1/local/chain/run_cnn_chainali_1d.sh b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1d.sh similarity index 100% rename from egs/iam/v1/local/chain/run_cnn_chainali_1d.sh rename to egs/iam/v1/local/chain/tuning/run_cnn_chainali_1d.sh diff --git a/egs/iam/v1/local/chain/run_cnn_e2eali_1a.sh b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1a.sh similarity index 100% rename from egs/iam/v1/local/chain/run_cnn_e2eali_1a.sh rename to egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1a.sh diff --git a/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1b.sh b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1b.sh new file mode 100755 index 00000000000..6d8cca876bf --- /dev/null +++ b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1b.sh @@ -0,0 +1,244 @@ +#!/bin/bash + +# e2eali_1b is the same as e2eali_1a but uses unconstrained egs + +# local/chain/compare_wer.sh /home/hhadian/kaldi-rnnlm/egs/iam/v1/exp/chain/cnn_e2eali_1a exp/chain/cnn_e2eali_1b +# System cnn_e2eali_1a cnn_e2eali_1b +# WER 12.79 12.23 +# CER 5.73 5.48 +# Final train prob -0.0556 -0.0367 +# Final valid prob -0.0795 -0.0592 +# Final train prob (xent) -0.9178 -0.8382 +# Final valid prob (xent) -1.0604 -0.9853 +# Parameters 3.95M 3.95M + +# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1b +# exp/chain/cnn_e2eali_1b: num-iters=21 nj=2..4 num-params=4.0M dim=40->360 combine=-0.038->-0.038 (over 1) xent:train/valid[13,20,final]=(-1.34,-0.967,-0.838/-1.40,-1.07,-0.985) logprob:train/valid[13,20,final]=(-0.075,-0.054,-0.037/-0.083,-0.072,-0.059) + +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1b #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +e2echain_model_dir=exp/chain/e2e_cnn_1a +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. 
+chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=450 +# training options +srand=0 +remove_egs=true +lang_test=lang_unk +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + cnn_opts="l2-regularize=0.075" + tdnn_opts="l2-regularize=0.075" + output_opts="l2-regularize=0.1" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. 
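# Illustrative sketch (not part of this patch; paths assumed for illustration):
# if you do point mkgraph.sh at a different lang directory, a quick way to
# confirm its phone inventory matches the one the model was trained with is to
# compare the phones.txt files directly, e.g.:
#   diff data/lang/phones.txt data/$lang_test/phones.txt \
#     && echo "phones.txt compatible, safe to build the graph"
# Kaldi also provides utils/lang/check_phones_compatible.sh for the same check.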
+ + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi diff --git a/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1c.sh b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1c.sh new file mode 100755 index 00000000000..cf90193f7eb --- /dev/null +++ b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1c.sh @@ -0,0 +1,251 @@ +#!/bin/bash + +# e2eali_1c is the same as e2eali_1b but has more CNN layers, different filter size +# smaller lm-opts, minibatch, frams-per-iter, less epochs and more initial/finaljobs. + +# local/chain/compare_wer.sh exp/chain/cnn_e2eali_1c +# System cnn_e2eali_1c (dict_50k) cnn_e2eali_1c (dict_500k) +# WER 12.20 9.62 +# CER 5.29 4.33 +# Final train prob -0.0494 -0.0494 +# Final valid prob -0.0644 -0.0644 +# Final train prob (xent) -0.4852 -0.4852 +# Final valid prob (xent) -0.5437 -0.5437 +# Parameters 4.33M 4.33M + +# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1c +# exp/chain/cnn_e2eali_1c: num-iters=30 nj=3..5 num-params=4.3M dim=40->376 combine=-0.052->-0.052 (over 1) xent:train/valid[19,29,final]=(-0.715,-0.508,-0.485/-0.717,-0.562,-0.544) logprob:train/valid[19,29,final]=(-0.089,-0.054,-0.049/-0.100,-0.070,-0.064) + + +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +decode_val=true +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1c #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +e2echain_model_dir=exp/chain/e2e_cnn_1b +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=550 +# training options +srand=0 +remove_egs=true +lang_decode=data/lang_test +if $decode_val; then maybe_val=val; else maybe_val= ; fi +dropout_schedule='0,0@0.20,0.2@0.50,0' +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + cnn_opts="l2-regularize=0.03 dropout-proportion=0.0" + tdnn_opts="l2-regularize=0.03" + output_opts="l2-regularize=0.04" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-dropout-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-dropout-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common3 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + relu-batchnorm-dropout-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=true \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=5 \ + --trainer.frames-per-iter=1500000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=32,16 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + done +fi + + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh b/egs/iam/v1/local/chain/tuning/run_e2e_cnn_1a.sh similarity index 68% rename from egs/iam/v1/local/chain/run_flatstart_cnn1a.sh rename to egs/iam/v1/local/chain/tuning/run_e2e_cnn_1a.sh index 56c897137f4..4836d76fa6e 100755 --- a/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh +++ b/egs/iam/v1/local/chain/tuning/run_e2e_cnn_1a.sh @@ -2,20 +2,20 @@ # Copyright 2017 Hossein Hadian # This script does end2end chain training (i.e. 
from scratch) - -# local/chain/compare_wer.sh exp/chain/cnn_1a exp/chain/cnn_chainali_1c exp/chain/e2e_cnn_1a -# System cnn_1a cnn_chainali_1c e2e_cnn_1a -# WER 18.52 12.72 13.87 -# CER 10.07 5.99 6.54 -# Final train prob -0.0077 -0.0291 -0.0371 -# Final valid prob -0.0970 -0.0359 -0.0636 -# Final train prob (xent) -0.5484 -0.9781 -# Final valid prob (xent) -0.9643 -1.1544 -# Parameters 4.36M 3.96M 9.13M +# ./local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/ +# System e2e_cnn_1a +# WER 13.59 +# WER (rescored) 13.27 +# CER 6.92 +# CER (rescored) 6.71 +# Final train prob 0.0345 +# Final valid prob 0.0269 +# Final train prob (xent) +# Final valid prob (xent) +# Parameters 9.52M # steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a -# exp/chain/e2e_cnn_1a: num-iters=21 nj=2..4 num-params=9.1M dim=40->12640 combine=-0.033->-0.033 (over 1) logprob:train/valid[13,20,final]=(-0.058,-0.042,-0.035/-0.070,-0.064,-0.059) - +# exp/chain/e2e_cnn_1b: num-iters=42 nj=2..4 num-params=9.5M dim=40->12640 combine=0.041->0.041 (over 2) logprob:train/valid[27,41,final]=(0.032,0.035,0.035/0.025,0.026,0.027) set -e # configs for 'chain' @@ -23,20 +23,16 @@ stage=0 train_stage=-10 get_egs_stage=-10 affix=1a +nj=30 # training options tdnn_dim=450 -num_epochs=4 -num_jobs_initial=2 -num_jobs_final=4 minibatch_size=150=100,64/300=50,32/600=25,16/1200=16,8 common_egs_dir= -l2_regularize=0.00005 -frames_per_iter=1000000 -cmvn_opts="--norm-means=true --norm-vars=true" train_set=train -lang_test=lang_unk - +decode_val=true +lang_decode=data/lang_test +if $decode_val; then maybe_val=val; else maybe_val= ; fi # End configuration section. echo "$0 $@" # Print the command line for logging @@ -85,13 +81,8 @@ fi if [ $stage -le 2 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') - - cnn_opts="l2-regularize=0.075" - tdnn_opts="l2-regularize=0.075" - output_opts="l2-regularize=0.1" - common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" - common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" - common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + common1="height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="height-offsets=-2,-1,0,1,2 num-filters-out=70" mkdir -p $dir/configs cat < $dir/configs/network.xconfig input dim=40 name=input @@ -99,14 +90,11 @@ if [ $stage -le 2 ]; then conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 - conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 - conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 - conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 - conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 - relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts - relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts - relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts - + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 
height-subsample-out=2 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn4 input=Append(-4,0,4) dim=$tdnn_dim ## adding the layers for chain branch relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $output_opts output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts @@ -121,9 +109,9 @@ if [ $stage -le 3 ]; then steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ --cmd "$cmd" \ - --feat.cmvn-opts "$cmvn_opts" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize $l2_regularize \ + --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --egs.dir "$common_egs_dir" \ --egs.stage $get_egs_stage \ @@ -131,11 +119,11 @@ if [ $stage -le 3 ]; then --chain.frame-subsampling-factor 4 \ --chain.alignment-subsampling-factor 4 \ --trainer.num-chunk-per-minibatch $minibatch_size \ - --trainer.frames-per-iter $frames_per_iter \ - --trainer.num-epochs $num_epochs \ + --trainer.frames-per-iter 1000000 \ + --trainer.num-epochs 4 \ --trainer.optimization.momentum 0 \ - --trainer.optimization.num-jobs-initial $num_jobs_initial \ - --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 4 \ --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.shrink-value 1.0 \ @@ -155,15 +143,16 @@ if [ $stage -le 4 ]; then # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + --self-loop-scale 1.0 $lang_decode \ $dir $dir/graph || exit 1; fi if [ $stage -le 5 ]; then - frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --nj 30 --cmd "$cmd" \ - $dir/graph data/test $dir/decode_test || exit 1; + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + done fi echo "Done. Date: $(date). Results:" diff --git a/egs/iam/v1/local/extract_features.sh b/egs/iam/v1/local/extract_features.sh new file mode 100755 index 00000000000..1741ad3f9b2 --- /dev/null +++ b/egs/iam/v1/local/extract_features.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +# Copyright 2017 Yiwen Shao +# 2018 Ashish Arora + +# Apache 2.0 +# This script runs the make features script in parallel. + +nj=4 +cmd=run.pl +feat_dim=40 +augment=false +fliplr=false +echo "$0 $@" + +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh || exit 1; + +data=$1 +featdir=$data/data +scp=$data/images.scp +logdir=$data/log + +mkdir -p $logdir +mkdir -p $featdir + +# make $featdir an absolute pathname +featdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $featdir ${PWD}` + +for n in $(seq $nj); do + split_scps="$split_scps $logdir/images.$n.scp" +done + +# split images.scp +utils/split_scp.pl $scp $split_scps || exit 1; + +$cmd JOB=1:$nj $logdir/extract_features.JOB.log \ + local/make_features.py $logdir/images.JOB.scp \ + --allowed_len_file_path $data/allowed_lengths.txt \ + --feat-dim $feat_dim --fliplr $fliplr --augment $augment \| \ + copy-feats --compress=true --compression-method=7 \ + ark:- ark,scp:$featdir/images.JOB.ark,$featdir/images.JOB.scp + +## aggregates the output scp's to get feats.scp +for n in $(seq $nj); do + cat $featdir/images.$n.scp || exit 1; +done > $data/feats.scp || exit 1 diff --git a/egs/iam/v1/local/gen_topo.py b/egs/iam/v1/local/gen_topo.py new file mode 100755 index 00000000000..540bfbcf270 --- /dev/null +++ b/egs/iam/v1/local/gen_topo.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python + +# Copyright 2017 (author: Chun-Chieh Chang) + +# Generate a topology file. This allows control of the number of states in the +# non-silence HMMs, and in the silence HMMs. This is a modified version of +# 'utils/gen_topo.pl'. The difference is that this creates two topologies for +# the non-silence HMMs. The number of states for punctuations is different than +# the number of states for other characters. + +from __future__ import print_function +import argparse +import string + +parser = argparse.ArgumentParser(description="Usage: steps/nnet3/chain/gen_topo.py " + " " + "e.g.: steps/nnet3/chain/gen_topo.pl 4:5:6:7:8:9:10 1:2:3\n", + epilog="See egs/swbd/s5c/local/chain/train_tdnn_a.sh for example of usage."); +parser.add_argument("num_nonsil_states", type=int, help="number of states for nonsilence phones"); +parser.add_argument("num_sil_states", type=int, help="number of states for silence phones"); +parser.add_argument("num_punctuation_states", type=int, help="number of states for punctuation"); +parser.add_argument("nonsilence_phones", type=str, + help="List of non-silence phones as integers, separated by colons, e.g. 4:5:6:7:8:9"); +parser.add_argument("silence_phones", type=str, + help="List of silence phones as integers, separated by colons, e.g. 
1:2:3"); +parser.add_argument("phone_list", type=str, help="file containing all phones and their corresponding number."); + +args = parser.parse_args() + +silence_phones = [ int(x) for x in args.silence_phones.split(":") ] +nonsilence_phones = [ int(x) for x in args.nonsilence_phones.split(":") ] +all_phones = silence_phones + nonsilence_phones + +punctuation_phones = [] +exclude = set("!(),.?;:'-\"") +with open(args.phone_list) as f: + for line in f: + line = line.strip() + phone = line.split(' ')[0] + if len(phone) == 1 and phone in exclude: + punctuation_phones.append(int(line.split(' ')[1])) +# For nonsilence phones that are not punctuations +print("") +print("") +print("") +print(" ".join([str(x) for x in nonsilence_phones if x not in punctuation_phones])) +print("") +for x in range(0, args.num_nonsil_states): + xp1 = x + 1 + print(" " + str(x) + " " + str(x) + " " + str(x) + " 0.75 " + str(xp1) + " 0.25 ") +print(" " + str(args.num_nonsil_states) + " ") +print("") + +# For nonsilence phones that ar punctuations +print("") +print("") +print(" ".join([str(x) for x in nonsilence_phones if x in punctuation_phones])) +print("") +for x in range(0, args.num_punctuation_states): + xp1 = x + 1 + print(" " + str(x) + " " + str(x) + " " + str(x) + " 0.75 " + str(xp1) + " 0.25 ") +print(" " + str(args.num_punctuation_states) + " ") +print("") + +# For silence phones +print("") +print("") +print(" ".join([str(x) for x in silence_phones])) +print("") +if(args.num_sil_states > 1): + transp = 1.0 / (args.num_sil_states - 1) + + state_str = " 0 0 " + for x in range(0, (args.num_sil_states - 1)): + state_str = state_str + " " + str(x) + " " + str(transp) + " " + state_str = state_str + "" + print(state_str) + + for x in range(1, (args.num_sil_states - 1)): + state_str = " " + str(x) + " " + str(x) + " " + for y in range(1, args.num_sil_states): + state_str = state_str + " " + str(y) + " " + str(transp) + " " + state_str = state_str + "" + print(state_str) + second_last = args.num_sil_states - 1 + print(" " + str(second_last) + " " + str(second_last) + " " + str(second_last) + " 0.75 " + str(args.num_sil_states) + " 0.25 ") + print(" " + str(args.num_sil_states) + " ") +else: + print(" 0 0 0 0.75 1 0.25 ") + print(" " + str(args.num_sil_states) + " ") +print("") +print("") diff --git a/egs/iam/v1/local/make_features.py b/egs/iam/v1/local/make_features.py index 84e012daedb..3ce501732cf 100755 --- a/egs/iam/v1/local/make_features.py +++ b/egs/iam/v1/local/make_features.py @@ -2,6 +2,7 @@ # Copyright 2017 Chun Chieh Chang # 2017 Ashish Arora +# 2017 Yiwen Shao # 2018 Hossein Hadian """ This script converts images to Kaldi-format feature matrices. The input to @@ -14,20 +15,27 @@ to enforce the images to have the specified length in that file by padding white pixels (the --padding option will be ignored in this case). This relates to end2end chain training. - eg. 
local/make_features.py data/train --feat-dim 40 """ - +import random import argparse import os import sys +import scipy.io as sio import numpy as np from scipy import misc +from scipy.ndimage.interpolation import affine_transform +import math +from signal import signal, SIGPIPE, SIG_DFL +signal(SIGPIPE, SIG_DFL) parser = argparse.ArgumentParser(description="""Converts images (in 'dir'/images.scp) to features and writes them to standard output in text format.""") -parser.add_argument('dir', type=str, - help='Source data directory (containing images.scp)') +parser.add_argument('images_scp_path', type=str, + help='Path of images.scp file') +parser.add_argument('--allowed_len_file_path', type=str, default=None, + help='If supplied, each images will be padded to reach the ' + 'target length (this overrides --padding).') parser.add_argument('--out-ark', type=str, default='-', help='Where to write the output feature file') parser.add_argument('--feat-dim', type=int, default=40, @@ -35,8 +43,10 @@ parser.add_argument('--padding', type=int, default=5, help='Number of white pixels to pad on the left' 'and right side of the image.') - - +parser.add_argument('--fliplr', type=lambda x: (str(x).lower()=='true'), default=False, + help="Flip the image left-right for right to left languages") +parser.add_argument("--augment", type=lambda x: (str(x).lower()=='true'), default=False, + help="performs image augmentation") args = parser.parse_args() @@ -56,18 +66,12 @@ def write_kaldi_matrix(file_handle, matrix, key): file_handle.write("\n") file_handle.write(" ]\n") -def get_scaled_image(im, allowed_lengths = None): - scale_size = args.feat_dim - sx = im.shape[1] - sy = im.shape[0] - scale = (1.0 * scale_size) / sy - nx = int(scale_size) - ny = int(scale * sx) - im = misc.imresize(im, (nx, ny)) + +def horizontal_pad(im, allowed_lengths = None): if allowed_lengths is None: left_padding = right_padding = args.padding else: # Find an allowed length for the image - imlen = im.shape[1] + imlen = im.shape[1] # width allowed_len = 0 for l in allowed_lengths: if l > imlen: @@ -77,28 +81,153 @@ def get_scaled_image(im, allowed_lengths = None): # No allowed length was found for the image (the image is too long) return None padding = allowed_len - imlen - left_padding = padding // 2 + left_padding = int(padding // 2) right_padding = padding - left_padding - dim_y = im.shape[0] + dim_y = im.shape[0] # height im_pad = np.concatenate((255 * np.ones((dim_y, left_padding), dtype=int), im), axis=1) im_pad1 = np.concatenate((im_pad, 255 * np.ones((dim_y, right_padding), dtype=int)), axis=1) return im_pad1 -### main ### -data_list_path = os.path.join(args.dir, 'images.scp') +def get_scaled_image_aug(im, mode='normal'): + scale_size = args.feat_dim + sx = im.shape[1] + sy = im.shape[0] + scale = (1.0 * scale_size) / sy + nx = int(scale_size) + ny = int(scale * sx) + scale_size = random.randint(10, 30) + scale = (1.0 * scale_size) / sy + down_nx = int(scale_size) + down_ny = int(scale * sx) + if mode == 'normal': + im = misc.imresize(im, (nx, ny)) + return im + else: + im_scaled_down = misc.imresize(im, (down_nx, down_ny)) + im_scaled_up = misc.imresize(im_scaled_down, (nx, ny)) + return im_scaled_up + return im + +def contrast_normalization(im, low_pct, high_pct): + element_number = im.size + rows = im.shape[0] + cols = im.shape[1] + im_contrast = np.zeros(shape=im.shape) + low_index = int(low_pct * element_number) + high_index = int(high_pct * element_number) + sorted_im = np.sort(im, axis=None) + low_thred = 
sorted_im[low_index] + high_thred = sorted_im[high_index] + for i in range(rows): + for j in range(cols): + if im[i, j] > high_thred: + im_contrast[i, j] = 255 # lightest to white + elif im[i, j] < low_thred: + im_contrast[i, j] = 0 # darkest to black + else: + # linear normalization + im_contrast[i, j] = (im[i, j] - low_thred) * \ + 255 / (high_thred - low_thred) + return im_contrast + + +def geometric_moment(frame, p, q): + m = 0 + for i in range(frame.shape[1]): + for j in range(frame.shape[0]): + m += (i ** p) * (j ** q) * frame[i][i] + return m + + +def central_moment(frame, p, q): + u = 0 + x_bar = geometric_moment(frame, 1, 0) / \ + geometric_moment(frame, 0, 0) # m10/m00 + y_bar = geometric_moment(frame, 0, 1) / \ + geometric_moment(frame, 0, 0) # m01/m00 + for i in range(frame.shape[1]): + for j in range(frame.shape[0]): + u += ((i - x_bar)**p) * ((j - y_bar)**q) * frame[i][j] + return u + + +def height_normalization(frame, w, h): + frame_normalized = np.zeros(shape=(h, w)) + alpha = 4 + x_bar = geometric_moment(frame, 1, 0) / \ + geometric_moment(frame, 0, 0) # m10/m00 + y_bar = geometric_moment(frame, 0, 1) / \ + geometric_moment(frame, 0, 0) # m01/m00 + sigma_x = (alpha * ((central_moment(frame, 2, 0) / + geometric_moment(frame, 0, 0)) ** .5)) # alpha * sqrt(u20/m00) + sigma_y = (alpha * ((central_moment(frame, 0, 2) / + geometric_moment(frame, 0, 0)) ** .5)) # alpha * sqrt(u02/m00) + for x in range(w): + for y in range(h): + i = int((x / w - 0.5) * sigma_x + x_bar) + j = int((y / h - 0.5) * sigma_y + y_bar) + frame_normalized[x][y] = frame[i][j] + return frame_normalized + +def find_slant_project(im): + rows = im.shape[0] + cols = im.shape[1] + std_max = 0 + alpha_max = 0 + col_disp = np.zeros(90, int) + proj = np.zeros(shape=(90, cols + 2 * rows), dtype=int) + for r in range(rows): + for alpha in range(-45, 45, 1): + col_disp[alpha] = int(r * math.tan(alpha / 180.0 * math.pi)) + for c in range(cols): + if im[r, c] < 100: + for alpha in range(-45, 45, 1): + proj[alpha + 45, c + col_disp[alpha] + rows] += 1 + for alpha in range(-45, 45, 1): + proj_histogram, bin_array = np.histogram(proj[alpha + 45, :], bins=10) + proj_std = np.std(proj_histogram) + if proj_std > std_max: + std_max = proj_std + alpha_max = alpha + proj_std = np.std(proj, axis=1) + return -alpha_max + + +def horizontal_shear(im, degree): + rad = degree / 180.0 * math.pi + padding_x = int(abs(np.tan(rad)) * im.shape[0]) + padding_y = im.shape[0] + if rad > 0: + im_pad = np.concatenate( + (255 * np.ones((padding_y, padding_x), dtype=int), im), axis=1) + elif rad < 0: + im_pad = np.concatenate( + (im, 255 * np.ones((padding_y, padding_x), dtype=int)), axis=1) + else: + im_pad = im + shear_matrix = np.array([[1, 0], + [np.tan(rad), 1]]) + sheared_im = affine_transform(im_pad, shear_matrix, cval=255.0) + return sheared_im + + +### main ### +random.seed(1) +data_list_path = args.images_scp_path if args.out_ark == '-': out_fh = sys.stdout else: - out_fh = open(args.out_ark,'wb') + out_fh = open(args.out_ark,'w') allowed_lengths = None -if os.path.isfile(os.path.join(args.dir, 'allowed_lengths.txt')): +allowed_len_handle = args.allowed_len_file_path +if os.path.isfile(allowed_len_handle): print("Found 'allowed_lengths.txt' file...", file=sys.stderr) allowed_lengths = [] - with open(os.path.join(args.dir,'allowed_lengths.txt')) as f: + with open(allowed_len_handle) as f: for line in f: allowed_lengths.append(int(line.strip())) print("Read {} allowed lengths and will apply them to the " @@ -106,6 +235,7 @@ def 
get_scaled_image(im, allowed_lengths = None): num_fail = 0 num_ok = 0 +aug_setting = ['normal', 'scaled'] with open(data_list_path) as f: for line in f: line = line.strip() @@ -113,15 +243,24 @@ def get_scaled_image(im, allowed_lengths = None): image_id = line_vect[0] image_path = line_vect[1] im = misc.imread(image_path) - im_scaled = get_scaled_image(im, allowed_lengths) - - if im_scaled is None: + if args.fliplr: + im = np.fliplr(im) + if args.augment: + im_aug = get_scaled_image_aug(im, aug_setting[0]) + im_contrast = contrast_normalization(im_aug, 0.05, 0.2) + slant_degree = find_slant_project(im_contrast) + im_sheared = horizontal_shear(im_contrast, slant_degree) + im_aug = im_sheared + else: + im_aug = get_scaled_image_aug(im, aug_setting[0]) + im_horizontal_padded = horizontal_pad(im_aug, allowed_lengths) + if im_horizontal_padded is None: num_fail += 1 continue - data = np.transpose(im_scaled, (1, 0)) + data = np.transpose(im_horizontal_padded, (1, 0)) data = np.divide(data, 255.0) num_ok += 1 write_kaldi_matrix(out_fh, data, image_id) -print('Generated features for {} images. Failed for {} (iamge too ' +print('Generated features for {} images. Failed for {} (image too ' 'long).'.format(num_ok, num_fail), file=sys.stderr) diff --git a/egs/iam/v1/local/prepare_data.sh b/egs/iam/v1/local/prepare_data.sh index 73d711c73f0..9c01ac90f28 100755 --- a/egs/iam/v1/local/prepare_data.sh +++ b/egs/iam/v1/local/prepare_data.sh @@ -165,6 +165,7 @@ if [ $stage -le 0 ]; then local/process_data.py data/local data/test --dataset test || exit 1 local/process_data.py data/local data/val --dataset validation || exit 1 - utils/utt2spk_to_spk2utt.pl data/train/utt2spk > data/train/spk2utt - utils/utt2spk_to_spk2utt.pl data/test/utt2spk > data/test/spk2utt + image/fix_data_dir.sh data/train + image/fix_data_dir.sh data/test + image/fix_data_dir.sh data/val fi diff --git a/egs/iam/v1/run_end2end.sh b/egs/iam/v1/run_end2end.sh index 6df93e739f4..22f5c8f11af 100755 --- a/egs/iam/v1/run_end2end.sh +++ b/egs/iam/v1/run_end2end.sh @@ -16,7 +16,7 @@ iam_database=/export/corpora5/handwriting_ocr/IAM # This corpus is of written NZ English that can be purchased here: # "https://www.victoria.ac.nz/lals/resources/corpora-default" wellington_database=/export/corpora5/Wellington/WWC/ - +train_set=train_aug . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. . ./path.sh @@ -32,45 +32,58 @@ if [ $stage -le 0 ]; then --wellington-dir "$wellington_database" \ --username "$username" --password "$password" fi -mkdir -p data/{train,test}/data +mkdir -p data/{train,test,val}/data if [ $stage -le 1 ]; then - image/get_image2num_frames.py data/train # This will be needed for the next command + echo "$(date) stage 1: getting allowed image widths for e2e training..." + image/get_image2num_frames.py --feat-dim 40 data/train # This will be needed for the next command # The next command creates a "allowed_lengths.txt" file in data/train # which will be used by local/make_features.py to enforce the images to # have allowed lengths. The allowed lengths will be spaced by 10% difference in length. image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train - echo "$0: Preparing the test and train feature files..." 
- for dataset in train test; do - local/make_features.py data/$dataset --feat-dim 40 | \ - copy-feats --compress=true --compression-method=7 \ - ark:- ark,scp:data/$dataset/data/images.ark,data/$dataset/feats.scp - steps/compute_cmvn_stats.sh data/$dataset + echo "$(date) Extracting features, creating feats.scp file" + local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/train + steps/compute_cmvn_stats.sh data/train || exit 1; + for set in val test; do + local/extract_features.sh --nj $nj --cmd "$cmd" --augment true \ + --feat-dim 40 data/${set} + steps/compute_cmvn_stats.sh data/${set} || exit 1; done utils/fix_data_dir.sh data/train fi if [ $stage -le 2 ]; then + for set in train; do + echo "$(date) stage 2: Performing augmentation, it will double training data" + local/augment_data.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/${set} data/${set}_aug data + steps/compute_cmvn_stats.sh data/${set}_aug || exit 1; + done +fi + +if [ $stage -le 3 ]; then echo "$0: Estimating a language model for decoding..." # We do this stage before dict preparation because prepare_dict.sh # generates the lexicon from pocolm's wordlist local/train_lm.sh --vocab-size 50k fi -if [ $stage -le 3 ]; then +if [ $stage -le 4 ]; then echo "$0: Preparing dictionary and lang..." - # This is for training. Use a large vocab size, e.g. 500k to include all the # training words: local/prepare_dict.sh --vocab-size 500k --dir data/local/dict - utils/prepare_lang.sh --sil-prob 0.95 \ + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.95 \ data/local/dict "" data/lang/temp data/lang + silphonelist=`cat data/lang/phones/silence.csl` + nonsilphonelist=`cat data/lang/phones/nonsilence.csl` + local/gen_topo.py 8 4 4 $nonsilphonelist $silphonelist data/lang/phones.txt >data/lang/topo + # This is for decoding. We use a 50k lexicon to be consistent with the papers # reporting WERs on IAM. local/prepare_dict.sh --vocab-size 50k --dir data/local/dict_50k - utils/prepare_lang.sh --sil-prob 0.95 data/local/dict_50k \ - "" data/lang_test/temp data/lang_test + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.95 \ + data/local/dict_50k "" data/lang_test/temp data/lang_test utils/format_lm.sh data/lang_test data/local/local_lm/data/arpa/3gram_big.arpa.gz \ data/local/dict_50k/lexicon.txt data/lang_test @@ -82,20 +95,20 @@ if [ $stage -le 3 ]; then cp data/lang_test/G.fst data/lang_unk/G.fst fi -if [ $stage -le 4 ]; then +if [ $stage -le 5 ]; then echo "$0: Calling the flat-start chain recipe..." - local/chain/run_flatstart_cnn1a.sh + local/chain/run_e2e_cnn_1b.sh --train_set $train_set fi -if [ $stage -le 5 ]; then +if [ $stage -le 6 ]; then echo "$0: Aligning the training data using the e2e chain model..." steps/nnet3/align.sh --nj 50 --cmd "$cmd" \ --use-gpu false \ --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ - data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train + data/$train_set data/lang exp/chain/e2e_cnn_1b exp/chain/e2e_ali_train fi -if [ $stage -le 6 ]; then +if [ $stage -le 7 ]; then echo "$0: Building a tree and training a regular chain model using the e2e alignments..." 
- local/chain/run_cnn_e2eali_1a.sh + local/chain/run_cnn_e2eali_1d.sh --train_set $train_set fi From 3c3185b9e36f790ff9e49951e88732b8053f28da Mon Sep 17 00:00:00 2001 From: aarora8 Date: Wed, 9 Jan 2019 00:06:26 -0500 Subject: [PATCH 03/18] adding changes in run.sh --- egs/iam/v1/run.sh | 60 ++++++++++++++++++++++++++------------- egs/iam/v1/run_end2end.sh | 16 ++++------- 2 files changed, 47 insertions(+), 29 deletions(-) diff --git a/egs/iam/v1/run.sh b/egs/iam/v1/run.sh index b943870f530..7907f018c04 100755 --- a/egs/iam/v1/run.sh +++ b/egs/iam/v1/run.sh @@ -20,6 +20,7 @@ iam_database=/export/corpora5/handwriting_ocr/IAM # This corpus is of written NZ English that can be purchased here: # "https://www.victoria.ac.nz/lals/resources/corpora-default" wellington_database=/export/corpora5/Wellington/WWC/ +train_set=train_aug . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. @@ -30,33 +31,54 @@ wellington_database=/export/corpora5/Wellington/WWC/ ./local/check_tools.sh if [ $stage -le 0 ]; then + if [ -f data/train/text ] && ! $overwrite; then + echo "$0: Not processing, probably script have run from wrong stage" + echo "Exiting with status 1 to avoid data corruption" + exit 1; + fi + echo "$0: Preparing data..." local/prepare_data.sh --download-dir "$iam_database" \ --wellington-dir "$wellington_database" \ --username "$username" --password "$password" fi -mkdir -p data/{train,test}/data +mkdir -p data/{train,test,val}/data if [ $stage -le 1 ]; then - echo "$0: Preparing the test and train feature files..." - for dataset in train test; do - local/make_features.py data/$dataset --feat-dim 40 | \ - copy-feats --compress=true --compression-method=7 \ - ark:- ark,scp:data/$dataset/data/images.ark,data/$dataset/feats.scp - steps/compute_cmvn_stats.sh data/$dataset + echo "$0: $(date) stage 1: getting allowed image widths for e2e training..." + image/get_image2num_frames.py --feat-dim 40 data/train # This will be needed for the next command + # The next command creates a "allowed_lengths.txt" file in data/train + # which will be used by local/make_features.py to enforce the images to + # have allowed lengths. The allowed lengths will be spaced by 10% difference in length. + image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train + echo "$0: $(date) Extracting features, creating feats.scp file" + local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/train + steps/compute_cmvn_stats.sh data/train || exit 1; + for set in val test; do + local/extract_features.sh --nj $nj --cmd "$cmd" --augment true \ + --feat-dim 40 data/${set} + steps/compute_cmvn_stats.sh data/${set} || exit 1; done + utils/fix_data_dir.sh data/train fi if [ $stage -le 2 ]; then + for set in train; do + echo "$0: $(date) stage 2: Performing augmentation, it will double training data" + local/augment_data.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/${set} data/${set}_aug data + steps/compute_cmvn_stats.sh data/${set}_aug || exit 1; + done +fi + +if [ $stage -le 3 ]; then echo "$0: Estimating a language model for decoding..." # We do this stage before dict preparation because prepare_dict.sh # generates the lexicon from pocolm's wordlist local/train_lm.sh --vocab-size 50k fi -if [ $stage -le 3 ]; then +if [ $stage -le 4 ]; then echo "$0: Preparing dictionary and lang..." - # This is for training. Use a large vocab size, e.g. 
500k to include all the # training words: local/prepare_dict.sh --vocab-size 500k --dir data/local/dict # this is for training @@ -81,7 +103,7 @@ if [ $stage -le 3 ]; then fi if [ $stage -le 4 ]; then - steps/train_mono.sh --nj $nj --cmd $cmd --totgauss 10000 data/train \ + steps/train_mono.sh --nj $nj --cmd $cmd --totgauss 10000 data/$train_set \ data/lang exp/mono fi @@ -93,10 +115,10 @@ if [ $stage -le 5 ] && $decode_gmm; then fi if [ $stage -le 6 ]; then - steps/align_si.sh --nj $nj --cmd $cmd data/train data/lang \ + steps/align_si.sh --nj $nj --cmd $cmd data/$train_set data/lang \ exp/mono exp/mono_ali - steps/train_deltas.sh --cmd $cmd 500 20000 data/train data/lang \ + steps/train_deltas.sh --cmd $cmd 500 20000 data/$train_set data/lang \ exp/mono_ali exp/tri fi @@ -108,12 +130,12 @@ if [ $stage -le 7 ] && $decode_gmm; then fi if [ $stage -le 8 ]; then - steps/align_si.sh --nj $nj --cmd $cmd data/train data/lang \ + steps/align_si.sh --nj $nj --cmd $cmd data/$train_set data/lang \ exp/tri exp/tri_ali steps/train_lda_mllt.sh --cmd $cmd \ --splice-opts "--left-context=3 --right-context=3" 500 20000 \ - data/train data/lang exp/tri_ali exp/tri2 + data/$train_set data/lang exp/tri_ali exp/tri2 fi if [ $stage -le 9 ] && $decode_gmm; then @@ -125,10 +147,10 @@ fi if [ $stage -le 10 ]; then steps/align_fmllr.sh --nj $nj --cmd $cmd --use-graphs true \ - data/train data/lang exp/tri2 exp/tri2_ali + data/$train_set data/lang exp/tri2 exp/tri2_ali steps/train_sat.sh --cmd $cmd 500 20000 \ - data/train data/lang exp/tri2_ali exp/tri3 + data/$train_set data/lang exp/tri2_ali exp/tri3 fi if [ $stage -le 11 ] && $decode_gmm; then @@ -140,13 +162,13 @@ fi if [ $stage -le 12 ]; then steps/align_fmllr.sh --nj $nj --cmd $cmd --use-graphs true \ - data/train data/lang exp/tri3 exp/tri3_ali + data/$train_set data/lang exp/tri3 exp/tri3_ali fi if [ $stage -le 13 ]; then - local/chain/run_cnn_1a.sh --lang-test lang_unk + local/chain/run_cnn.sh --lang-test lang_unk --train_set $train_set fi if [ $stage -le 14 ]; then - local/chain/run_cnn_chainali_1c.sh --chain-model-dir exp/chain/cnn_1a --stage 2 + local/chain/run_cnn_chainali.sh --chain-model-dir exp/chain/cnn_1a --stage 2 --train_set $train_set fi diff --git a/egs/iam/v1/run_end2end.sh b/egs/iam/v1/run_end2end.sh index 22f5c8f11af..58461c740b6 100755 --- a/egs/iam/v1/run_end2end.sh +++ b/egs/iam/v1/run_end2end.sh @@ -22,10 +22,7 @@ train_set=train_aug . ./path.sh . ./utils/parse_options.sh # e.g. this parses the above options # if supplied. - - ./local/check_tools.sh - if [ $stage -le 0 ]; then echo "$0: Preparing data..." local/prepare_data.sh --download-dir "$iam_database" \ @@ -35,13 +32,13 @@ fi mkdir -p data/{train,test,val}/data if [ $stage -le 1 ]; then - echo "$(date) stage 1: getting allowed image widths for e2e training..." + echo "$0: $(date) stage 1: getting allowed image widths for e2e training..." image/get_image2num_frames.py --feat-dim 40 data/train # This will be needed for the next command # The next command creates a "allowed_lengths.txt" file in data/train # which will be used by local/make_features.py to enforce the images to # have allowed lengths. The allowed lengths will be spaced by 10% difference in length. 
image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train - echo "$(date) Extracting features, creating feats.scp file" + echo "$0: $(date) Extracting features, creating feats.scp file" local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/train steps/compute_cmvn_stats.sh data/train || exit 1; for set in val test; do @@ -54,7 +51,7 @@ fi if [ $stage -le 2 ]; then for set in train; do - echo "$(date) stage 2: Performing augmentation, it will double training data" + echo "$0: $(date) stage 2: Performing augmentation, it will double training data" local/augment_data.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/${set} data/${set}_aug data steps/compute_cmvn_stats.sh data/${set}_aug || exit 1; done @@ -74,7 +71,6 @@ if [ $stage -le 4 ]; then local/prepare_dict.sh --vocab-size 500k --dir data/local/dict utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.95 \ data/local/dict "" data/lang/temp data/lang - silphonelist=`cat data/lang/phones/silence.csl` nonsilphonelist=`cat data/lang/phones/nonsilence.csl` local/gen_topo.py 8 4 4 $nonsilphonelist $silphonelist data/lang/phones.txt >data/lang/topo @@ -97,7 +93,7 @@ fi if [ $stage -le 5 ]; then echo "$0: Calling the flat-start chain recipe..." - local/chain/run_e2e_cnn_1b.sh --train_set $train_set + local/chain/run_e2e_cnn.sh --train_set $train_set fi if [ $stage -le 6 ]; then @@ -105,10 +101,10 @@ if [ $stage -le 6 ]; then steps/nnet3/align.sh --nj 50 --cmd "$cmd" \ --use-gpu false \ --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ - data/$train_set data/lang exp/chain/e2e_cnn_1b exp/chain/e2e_ali_train + data/$train_set data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train fi if [ $stage -le 7 ]; then echo "$0: Building a tree and training a regular chain model using the e2e alignments..." - local/chain/run_cnn_e2eali_1d.sh --train_set $train_set + local/chain/run_cnn_e2eali.sh --train_set $train_set fi From 0862c616eaf9ca584ec9877fff1c6ea23b3ca721 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Wed, 9 Jan 2019 00:53:40 -0500 Subject: [PATCH 04/18] updating result in 1b --- egs/iam/v1/local/chain/run_cnn_e2eali_1b.sh | 244 ------------------ .../local/chain/tuning/run_cnn_e2eali_1b.sh | 27 +- 2 files changed, 11 insertions(+), 260 deletions(-) delete mode 100755 egs/iam/v1/local/chain/run_cnn_e2eali_1b.sh diff --git a/egs/iam/v1/local/chain/run_cnn_e2eali_1b.sh b/egs/iam/v1/local/chain/run_cnn_e2eali_1b.sh deleted file mode 100755 index 1e3aed66c6b..00000000000 --- a/egs/iam/v1/local/chain/run_cnn_e2eali_1b.sh +++ /dev/null @@ -1,244 +0,0 @@ -#!/bin/bash - -# e2eali_1b is the same as e2eali_1a but uses unconstrained egs - -# local/chain/compare_wer.sh /home/hhadian/kaldi-rnnlm/egs/iam/v1/exp/chain/cnn_e2eali_1a exp/chain/cnn_e2eali_1b -# System cnn_e2eali_1a cnn_e2eali_1b -# WER 12.79 12.23 -# CER 5.73 5.48 -# Final train prob -0.0556 -0.0367 -# Final valid prob -0.0795 -0.0592 -# Final train prob (xent) -0.9178 -0.8382 -# Final valid prob (xent) -1.0604 -0.9853 -# Parameters 3.95M 3.95M - -# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1b -# exp/chain/cnn_e2eali_1b: num-iters=21 nj=2..4 num-params=4.0M dim=40->360 combine=-0.038->-0.038 (over 1) xent:train/valid[13,20,final]=(-1.34,-0.967,-0.838/-1.40,-1.07,-0.985) logprob:train/valid[13,20,final]=(-0.075,-0.054,-0.037/-0.083,-0.072,-0.059) - -set -e -o pipefail - -stage=0 - -nj=30 -train_set=train -nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. 
-affix=_1b #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. -e2echain_model_dir=exp/chain/e2e_cnn_1b -common_egs_dir= -reporting_email= - -# chain options -train_stage=-10 -xent_regularize=0.1 -frame_subsampling_factor=4 -# training chunk-options -chunk_width=340,300,200,100 -num_leaves=500 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 -tdnn_dim=450 -# training options -srand=0 -remove_egs=true -lang_test=lang_unk -# End configuration section. -echo "$0 $@" # Print the command line for logging - - -. ./cmd.sh -. ./path.sh -. ./utils/parse_options.sh - - -if ! cuda-compiled; then - cat <$lang/topo - fi -fi - -if [ $stage -le 2 ]; then - # Get the alignments as lattices (gives the chain training more freedom). - # use the same num-jobs as the alignments - steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ - --acoustic-scale 1.0 \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ - ${train_data_dir} data/lang $e2echain_model_dir $lat_dir - echo "" >$lat_dir/splice_opts -fi - -if [ $stage -le 3 ]; then - # Build a tree using our new topology. We know we have alignments for the - # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use - # those. The num-leaves is always somewhat less than the num-leaves from - # the GMM baseline. - if [ -f $tree_dir/final.mdl ]; then - echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." - exit 1; - fi - - steps/nnet3/chain/build_tree.sh \ - --frame-subsampling-factor $frame_subsampling_factor \ - --alignment-subsampling-factor 1 \ - --context-opts "--context-width=2 --central-position=1" \ - --cmd "$cmd" $num_leaves ${train_data_dir} \ - $lang $ali_dir $tree_dir -fi - - -if [ $stage -le 4 ]; then - mkdir -p $dir - echo "$0: creating neural net configs using the xconfig parser"; - - num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) - cnn_opts="l2-regularize=0.075" - tdnn_opts="l2-regularize=0.075" - output_opts="l2-regularize=0.1" - common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" - common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" - common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" - mkdir -p $dir/configs - cat < $dir/configs/network.xconfig - input dim=40 name=input - - conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 - conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 - conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 - conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 - conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 - conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 - conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 - relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts - relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts - relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts - - ## adding the layers for chain branch - relu-batchnorm-layer 
name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts - output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts - - # adding the layers for xent branch - # This block prints the configs for a separate output that will be - # trained with a cross-entropy objective in the 'chain' mod?els... this - # has the effect of regularizing the hidden parts of the model. we use - # 0.5 / args.xent_regularize as the learning rate factor- the factor of - # 0.5 / args.xent_regularize is suitable as it means the xent - # final-layer learns at a rate independent of the regularization - # constant; and the 0.5 was tuned so as to make the relative progress - # similar in the xent and regular final layers. - relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts - output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts -EOF - steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ -fi - - -if [ $stage -le 5 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage - fi - - steps/nnet3/chain/train.py --stage=$train_stage \ - --cmd="$cmd" \ - --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ - --chain.xent-regularize $xent_regularize \ - --chain.leaky-hmm-coefficient=0.1 \ - --chain.l2-regularize=0.00005 \ - --chain.apply-deriv-weights=false \ - --chain.lm-opts="--num-extra-lm-states=500" \ - --chain.frame-subsampling-factor=$frame_subsampling_factor \ - --chain.alignment-subsampling-factor=1 \ - --chain.left-tolerance 3 \ - --chain.right-tolerance 3 \ - --trainer.srand=$srand \ - --trainer.max-param-change=2.0 \ - --trainer.num-epochs=4 \ - --trainer.frames-per-iter=1000000 \ - --trainer.optimization.num-jobs-initial=2 \ - --trainer.optimization.num-jobs-final=4 \ - --trainer.optimization.initial-effective-lrate=0.001 \ - --trainer.optimization.final-effective-lrate=0.0001 \ - --trainer.optimization.shrink-value=1.0 \ - --trainer.num-chunk-per-minibatch=64,32 \ - --trainer.optimization.momentum=0.0 \ - --egs.chunk-width=$chunk_width \ - --egs.chunk-left-context=$chunk_left_context \ - --egs.chunk-right-context=$chunk_right_context \ - --egs.chunk-left-context-initial=0 \ - --egs.chunk-right-context-final=0 \ - --egs.dir="$common_egs_dir" \ - --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ - --cleanup.remove-egs=$remove_egs \ - --use-gpu=true \ - --reporting.email="$reporting_email" \ - --feat-dir=$train_data_dir \ - --tree-dir=$tree_dir \ - --lat-dir=$lat_dir \ - --dir=$dir || exit 1; -fi - -if [ $stage -le 6 ]; then - # The reason we are using data/lang here, instead of $lang, is just to - # emphasize that it's not actually important to give mkgraph.sh the - # lang directory with the matched topology (since it gets the - # topology file from the model). So you could give it a different - # lang directory, one that contained a wordlist and LM of your choice, - # as long as phones.txt was compatible. 
- - utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ - $dir $dir/graph || exit 1; -fi - -if [ $stage -le 7 ]; then - frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ - --frames-per-chunk $frames_per_chunk \ - --nj $nj --cmd "$cmd" \ - $dir/graph data/test $dir/decode_test || exit 1; -fi diff --git a/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1b.sh b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1b.sh index 6d8cca876bf..0a19e0fb20d 100755 --- a/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1b.sh +++ b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1b.sh @@ -1,29 +1,27 @@ #!/bin/bash # e2eali_1b is the same as e2eali_1a but uses unconstrained egs - -# local/chain/compare_wer.sh /home/hhadian/kaldi-rnnlm/egs/iam/v1/exp/chain/cnn_e2eali_1a exp/chain/cnn_e2eali_1b -# System cnn_e2eali_1a cnn_e2eali_1b -# WER 12.79 12.23 -# CER 5.73 5.48 -# Final train prob -0.0556 -0.0367 -# Final valid prob -0.0795 -0.0592 -# Final train prob (xent) -0.9178 -0.8382 -# Final valid prob (xent) -1.0604 -0.9853 -# Parameters 3.95M 3.95M +# local/chain/compare_wer.sh exp/chain/cnn_e2eali_1b +# System cnn_e2eali_1b (dict_50k) cnn_e2eali_1b (dict_500k) +# WER 11.41 10.25 +# CER 4.87 4.60 +# Final train prob -0.0384 -0.0384 +# Final valid prob -0.0444 -0.0444 +# Final train prob (xent) -0.8084 -0.8084 +# Final valid prob (xent) -0.8470 -0.8470 +# Parameters 3.97M 3.97M # steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1b -# exp/chain/cnn_e2eali_1b: num-iters=21 nj=2..4 num-params=4.0M dim=40->360 combine=-0.038->-0.038 (over 1) xent:train/valid[13,20,final]=(-1.34,-0.967,-0.838/-1.40,-1.07,-0.985) logprob:train/valid[13,20,final]=(-0.075,-0.054,-0.037/-0.083,-0.072,-0.059) +# exp/chain/cnn_e2eali_1b: num-iters=42 nj=2..4 num-params=4.0M dim=40->376 combine=-0.039->-0.039 (over 1) xent:train/valid[27,41,final]=(-1.28,-0.846,-0.808/-1.27,-0.871,-0.847) logprob:train/valid[27,41,final]=(-0.064,-0.043,-0.038/-0.065,-0.051,-0.044) set -e -o pipefail - stage=0 nj=30 train_set=train nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. affix=_1b #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. 
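
The xent-branch comment in these chain scripts (learning_rate_factor = 0.5 / xent_regularize) is easy to gloss over. Numerically the factor just cancels the weight placed on the cross-entropy objective, so the xent output layer trains at the same effective rate no matter what xent_regularize is set to; a tiny illustration:

for xent_regularize in (0.05, 0.1, 0.2):
    learning_rate_factor = 0.5 / xent_regularize
    # the xent objective is weighted by xent_regularize, so the product below
    # is what effectively sets how fast the xent output layer learns
    print(xent_regularize, learning_rate_factor, xent_regularize * learning_rate_factor)
# the last column is always 0.5, independent of the regularization constant
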
-e2echain_model_dir=exp/chain/e2e_cnn_1a +e2echain_model_dir=exp/chain/e2e_cnn_1b common_egs_dir= reporting_email= @@ -141,7 +139,6 @@ if [ $stage -le 4 ]; then mkdir -p $dir/configs cat < $dir/configs/network.xconfig input dim=40 name=input - conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 @@ -152,11 +149,9 @@ if [ $stage -le 4 ]; then relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts - ## adding the layers for chain branch relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts - # adding the layers for xent branch # This block prints the configs for a separate output that will be # trained with a cross-entropy objective in the 'chain' mod?els... this From 214bd1232de03714051ec396d1e9a2a06860b44b Mon Sep 17 00:00:00 2001 From: aarora8 Date: Wed, 9 Jan 2019 01:01:13 -0500 Subject: [PATCH 05/18] updating results --- .../v1/local/chain/tuning/run_cnn_e2eali_1b.sh | 2 +- egs/iam/v1/local/chain/tuning/run_e2e_cnn_1a.sh | 15 +++++++-------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1b.sh b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1b.sh index 0a19e0fb20d..dc830319f69 100755 --- a/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1b.sh +++ b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1b.sh @@ -21,7 +21,7 @@ nj=30 train_set=train nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. affix=_1b #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. -e2echain_model_dir=exp/chain/e2e_cnn_1b +e2echain_model_dir=exp/chain/e2e_cnn_1a common_egs_dir= reporting_email= diff --git a/egs/iam/v1/local/chain/tuning/run_e2e_cnn_1a.sh b/egs/iam/v1/local/chain/tuning/run_e2e_cnn_1a.sh index 4836d76fa6e..cc27c8e55c4 100755 --- a/egs/iam/v1/local/chain/tuning/run_e2e_cnn_1a.sh +++ b/egs/iam/v1/local/chain/tuning/run_e2e_cnn_1a.sh @@ -2,20 +2,19 @@ # Copyright 2017 Hossein Hadian # This script does end2end chain training (i.e. 
from scratch) -# ./local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/ +# local/chain/compare_wer.sh exp/chain/e2e_cnn_1a # System e2e_cnn_1a -# WER 13.59 -# WER (rescored) 13.27 -# CER 6.92 -# CER (rescored) 6.71 -# Final train prob 0.0345 -# Final valid prob 0.0269 +# WER 15.24 +# CER 7.27 +# Final train prob -0.0209 +# Final valid prob -0.0417 # Final train prob (xent) # Final valid prob (xent) # Parameters 9.52M # steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a -# exp/chain/e2e_cnn_1b: num-iters=42 nj=2..4 num-params=9.5M dim=40->12640 combine=0.041->0.041 (over 2) logprob:train/valid[27,41,final]=(0.032,0.035,0.035/0.025,0.026,0.027) +# exp/chain/e2e_cnn_1a: num-iters=42 nj=2..4 num-params=9.5M dim=40->12640 combine=-0.021->-0.021 (over 1) logprob:train/valid[27,41,final]=(-0.025,-0.021,-0.021/-0.044,-0.043,-0.042) + set -e # configs for 'chain' From f0b87b302265492267e1a184a889ec0cef946b51 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Wed, 9 Jan 2019 17:18:36 -0500 Subject: [PATCH 06/18] adding tuning directory --- egs/madcat_zh/v1/local/chain/run_cnn.sh | 1 + egs/madcat_zh/v1/local/chain/run_cnn_chainali.sh | 1 + egs/madcat_zh/v1/local/chain/run_e2e_cnn.sh | 1 + egs/madcat_zh/v1/local/chain/{ => tuning}/run_cnn_1a.sh | 0 egs/madcat_zh/v1/local/chain/{ => tuning}/run_cnn_chainali_1a.sh | 0 egs/madcat_zh/v1/local/chain/{ => tuning}/run_cnn_chainali_1b.sh | 0 .../chain/{run_flatstart_cnn1a.sh => tuning/run_e2e_cnn_1a.sh} | 0 7 files changed, 3 insertions(+) create mode 120000 egs/madcat_zh/v1/local/chain/run_cnn.sh create mode 120000 egs/madcat_zh/v1/local/chain/run_cnn_chainali.sh create mode 120000 egs/madcat_zh/v1/local/chain/run_e2e_cnn.sh rename egs/madcat_zh/v1/local/chain/{ => tuning}/run_cnn_1a.sh (100%) rename egs/madcat_zh/v1/local/chain/{ => tuning}/run_cnn_chainali_1a.sh (100%) rename egs/madcat_zh/v1/local/chain/{ => tuning}/run_cnn_chainali_1b.sh (100%) rename egs/madcat_zh/v1/local/chain/{run_flatstart_cnn1a.sh => tuning/run_e2e_cnn_1a.sh} (100%) diff --git a/egs/madcat_zh/v1/local/chain/run_cnn.sh b/egs/madcat_zh/v1/local/chain/run_cnn.sh new file mode 120000 index 00000000000..df6f0a468c1 --- /dev/null +++ b/egs/madcat_zh/v1/local/chain/run_cnn.sh @@ -0,0 +1 @@ +tuning/run_cnn_1a.sh \ No newline at end of file diff --git a/egs/madcat_zh/v1/local/chain/run_cnn_chainali.sh b/egs/madcat_zh/v1/local/chain/run_cnn_chainali.sh new file mode 120000 index 00000000000..86568421fe1 --- /dev/null +++ b/egs/madcat_zh/v1/local/chain/run_cnn_chainali.sh @@ -0,0 +1 @@ +tuning/run_cnn_chainali_1b.sh \ No newline at end of file diff --git a/egs/madcat_zh/v1/local/chain/run_e2e_cnn.sh b/egs/madcat_zh/v1/local/chain/run_e2e_cnn.sh new file mode 120000 index 00000000000..d26ba0182ce --- /dev/null +++ b/egs/madcat_zh/v1/local/chain/run_e2e_cnn.sh @@ -0,0 +1 @@ +tuning/run_e2e_cnn_1a.sh \ No newline at end of file diff --git a/egs/madcat_zh/v1/local/chain/run_cnn_1a.sh b/egs/madcat_zh/v1/local/chain/tuning/run_cnn_1a.sh similarity index 100% rename from egs/madcat_zh/v1/local/chain/run_cnn_1a.sh rename to egs/madcat_zh/v1/local/chain/tuning/run_cnn_1a.sh diff --git a/egs/madcat_zh/v1/local/chain/run_cnn_chainali_1a.sh b/egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1a.sh similarity index 100% rename from egs/madcat_zh/v1/local/chain/run_cnn_chainali_1a.sh rename to egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1a.sh diff --git a/egs/madcat_zh/v1/local/chain/run_cnn_chainali_1b.sh b/egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1b.sh similarity index 100% rename from 
egs/madcat_zh/v1/local/chain/run_cnn_chainali_1b.sh rename to egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1b.sh diff --git a/egs/madcat_zh/v1/local/chain/run_flatstart_cnn1a.sh b/egs/madcat_zh/v1/local/chain/tuning/run_e2e_cnn_1a.sh similarity index 100% rename from egs/madcat_zh/v1/local/chain/run_flatstart_cnn1a.sh rename to egs/madcat_zh/v1/local/chain/tuning/run_e2e_cnn_1a.sh From ff073765ea630ab00b5ca8d1785008337305df86 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Wed, 9 Jan 2019 20:02:59 -0500 Subject: [PATCH 07/18] making it similar to madcat chinese --- egs/madcat_zh/v1/local/download_data.sh | 38 ------ egs/madcat_zh/v1/local/extract_features.sh | 12 +- egs/madcat_zh/v1/local/make_features.py | 138 --------------------- egs/madcat_zh/v1/local/prepare_data.sh | 53 ++++---- egs/madcat_zh/v1/run_end2end.sh | 31 ++++- 5 files changed, 58 insertions(+), 214 deletions(-) delete mode 100755 egs/madcat_zh/v1/local/download_data.sh delete mode 100755 egs/madcat_zh/v1/local/make_features.py diff --git a/egs/madcat_zh/v1/local/download_data.sh b/egs/madcat_zh/v1/local/download_data.sh deleted file mode 100755 index 6b4055f7205..00000000000 --- a/egs/madcat_zh/v1/local/download_data.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash - -# Copyright 2018 Ashish Arora -# Apache 2.0 - -# This script downloads data splits for MADCAT Chinese dataset. -# It also check if madcat chinese data is present or not. - -download_dir1=/export/corpora/LDC/LDC2014T13/data -train_split_url=http://www.openslr.org/resources/50/madcat.train.raw.lineid -test_split_url=http://www.openslr.org/resources/50/madcat.test.raw.lineid -dev_split_url=http://www.openslr.org/resources/50/madcat.dev.raw.lineid -data_split_dir=data/download/datasplits - -. ./cmd.sh -. ./path.sh -. ./utils/parse_options.sh || exit 1; - -if [ -d $data_split_dir ]; then - echo "$0: Not downloading the data splits as it is already there." -else - if [ ! -f $data_split_dir/madcat.train.raw.lineid ]; then - mkdir -p $data_split_dir - echo "$0: Downloading the data splits..." - wget -P $data_split_dir $train_split_url || exit 1; - wget -P $data_split_dir $test_split_url || exit 1; - wget -P $data_split_dir $dev_split_url || exit 1; - fi - echo "$0: Done downloading the data splits" -fi - -if [ -d $download_dir1 ]; then - echo "$0: madcat chinese data directory is present." -else - if [ ! -f $download_dir1/madcat/*.madcat.xml ]; then - echo "$0: please download madcat data..." - fi -fi diff --git a/egs/madcat_zh/v1/local/extract_features.sh b/egs/madcat_zh/v1/local/extract_features.sh index 0660ae4b412..9fe588f31b8 100755 --- a/egs/madcat_zh/v1/local/extract_features.sh +++ b/egs/madcat_zh/v1/local/extract_features.sh @@ -1,10 +1,16 @@ #!/bin/bash + # Copyright 2017 Yiwen Shao # 2018 Ashish Arora +# Apache 2.0 +# This script runs the make features script in parallel. + nj=4 cmd=run.pl feat_dim=40 +augment='no_aug' +verticle_shift=0 echo "$0 $@" . ./cmd.sh @@ -29,11 +35,11 @@ done # split images.scp utils/split_scp.pl $scp $split_scps || exit 1; -echo "$0: Preparing the test and train feature files..." 
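
extract_features.sh above parallelises feature extraction by splitting images.scp into images.1.scp through images.N.scp with utils/split_scp.pl and then running one make_features.py | copy-feats pipeline per piece under $cmd. split_scp.pl balances by line count (and can also respect utt2spk groupings), so the following is only the basic idea:

def split_scp(lines, num_jobs):
    # Split scp lines into num_jobs contiguous, near-equal pieces, which is
    # the basic idea behind utils/split_scp.pl (the real script handles
    # speaker grouping and many corner cases).
    n = len(lines)
    bounds = [round(i * n / num_jobs) for i in range(num_jobs + 1)]
    return [lines[bounds[i]:bounds[i + 1]] for i in range(num_jobs)]

chunks = split_scp(['utt%d path%d.png' % (i, i) for i in range(10)], 4)
print([len(c) for c in chunks])   # [2, 3, 3, 2]
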
$cmd JOB=1:$nj $logdir/extract_features.JOB.log \ - local/make_features.py $logdir/images.JOB.scp \ + image/ocr/make_features.py $logdir/images.JOB.scp \ --allowed_len_file_path $data/allowed_lengths.txt \ - --feat-dim $feat_dim \| \ + --feat-dim $feat_dim --augment_type $augment \ + --vertical-shift $verticle_shift \| \ copy-feats --compress=true --compression-method=7 \ ark:- ark,scp:$featdir/images.JOB.ark,$featdir/images.JOB.scp diff --git a/egs/madcat_zh/v1/local/make_features.py b/egs/madcat_zh/v1/local/make_features.py deleted file mode 100755 index a21276d32c2..00000000000 --- a/egs/madcat_zh/v1/local/make_features.py +++ /dev/null @@ -1,138 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2017 Chun Chieh Chang -# 2017 Ashish Arora -# 2018 Hossein Hadian - -""" This script converts images to Kaldi-format feature matrices. The input to - this script is the path to a data directory, e.g. "data/train". This script - reads the images listed in images.scp and writes them to standard output - (by default) as Kaldi-formatted matrices (in text form). It also scales the - images so they have the same height (via --feat-dim). It can optionally pad - the images (on left/right sides) with white pixels. - If an 'image2num_frames' file is found in the data dir, it will be used - to enforce the images to have the specified length in that file by padding - white pixels (the --padding option will be ignored in this case). This relates - to end2end chain training. - - eg. local/make_features.py data/train --feat-dim 40 -""" - -import argparse -import os -import sys -import numpy as np -from scipy import misc - -parser = argparse.ArgumentParser(description="""Converts images (in 'dir'/images.scp) to features and - writes them to standard output in text format.""") -parser.add_argument('images_scp_path', type=str, - help='Path of images.scp file') -parser.add_argument('--allowed_len_file_path', type=str, default=None, - help='If supplied, each images will be padded to reach the ' - 'target length (this overrides --padding).') -parser.add_argument('--out-ark', type=str, default='-', - help='Where to write the output feature file') -parser.add_argument('--feat-dim', type=int, default=40, - help='Size to scale the height of all images') -parser.add_argument('--padding', type=int, default=5, - help='Number of white pixels to pad on the left' - 'and right side of the image.') - - -args = parser.parse_args() - - -def write_kaldi_matrix(file_handle, matrix, key): - file_handle.write(key + " [ ") - num_rows = len(matrix) - if num_rows == 0: - raise Exception("Matrix is empty") - num_cols = len(matrix[0]) - - for row_index in range(len(matrix)): - if num_cols != len(matrix[row_index]): - raise Exception("All the rows of a matrix are expected to " - "have the same length") - file_handle.write(" ".join(map(lambda x: str(x), matrix[row_index]))) - if row_index != num_rows - 1: - file_handle.write("\n") - file_handle.write(" ]\n") - - -def get_scaled_image(im): - scale_size = args.feat_dim - sx = im.shape[1] # width - sy = im.shape[0] # height - scale = (1.0 * scale_size) / sy - nx = int(scale_size) - ny = int(scale * sx) - im = misc.imresize(im, (nx, ny)) - return im - - -def horizontal_pad(im, allowed_lengths = None): - if allowed_lengths is None: - left_padding = right_padding = args.padding - else: # Find an allowed length for the image - imlen = im.shape[1] # width - allowed_len = 0 - for l in allowed_lengths: - if l > imlen: - allowed_len = l - break - if allowed_len == 0: - # No allowed length was found for 
the image (the image is too long) - return None - padding = allowed_len - imlen - left_padding = int(padding // 2) - right_padding = padding - left_padding - dim_y = im.shape[0] # height - im_pad = np.concatenate((255 * np.ones((dim_y, left_padding), - dtype=int), im), axis=1) - im_pad1 = np.concatenate((im_pad, 255 * np.ones((dim_y, right_padding), - dtype=int)), axis=1) - return im_pad1 - - -### main ### - -data_list_path = args.images_scp_path - -if args.out_ark == '-': - out_fh = sys.stdout -else: - out_fh = open(args.out_ark,'wb') - -allowed_lengths = None -allowed_len_handle = args.allowed_len_file_path -if os.path.isfile(allowed_len_handle): - print("Found 'allowed_lengths.txt' file...", file=sys.stderr) - allowed_lengths = [] - with open(allowed_len_handle) as f: - for line in f: - allowed_lengths.append(int(line.strip())) - print("Read {} allowed lengths and will apply them to the " - "features.".format(len(allowed_lengths)), file=sys.stderr) - -num_fail = 0 -num_ok = 0 -with open(data_list_path) as f: - for line in f: - line = line.strip() - line_vect = line.split(' ') - image_id = line_vect[0] - image_path = line_vect[1] - im = misc.imread(image_path) - im_scaled = get_scaled_image(im) - im_horizontal_padded = horizontal_pad(im_scaled, allowed_lengths) - if im_horizontal_padded is None: - num_fail += 1 - continue - data = np.transpose(im_horizontal_padded, (1, 0)) - data = np.divide(data, 255.0) - num_ok += 1 - write_kaldi_matrix(out_fh, data, image_id) - -print('Generated features for {} images. Failed for {} (image too ' - 'long).'.format(num_ok, num_fail), file=sys.stderr) diff --git a/egs/madcat_zh/v1/local/prepare_data.sh b/egs/madcat_zh/v1/local/prepare_data.sh index c1accfb5e6c..6b4055f7205 100755 --- a/egs/madcat_zh/v1/local/prepare_data.sh +++ b/egs/madcat_zh/v1/local/prepare_data.sh @@ -1,43 +1,38 @@ #!/bin/bash -# Copyright 2017 Chun Chieh Chang -# 2017 Ashish Arora -# 2017 Hossein Hadian +# Copyright 2018 Ashish Arora # Apache 2.0 -# This script downloads the Madcat Chinese handwriting database and prepares the training -# and test data (i.e text, images.scp, utt2spk and spk2utt) by calling process_data.py. -# It also downloads the LOB and Brown text corpora. It downloads the database files -# only if they do not already exist in download directory. +# This script downloads data splits for MADCAT Chinese dataset. +# It also check if madcat chinese data is present or not. -# Eg. local/prepare_data.sh -# Eg. text file: 000_a01-000u-00 A MOVE to stop Mr. Gaitskell from -# utt2spk file: 000_a01-000u-00 000 -# images.scp file: 000_a01-000u-00 data/local/lines/a01/a01-000u/a01-000u-00.png -# spk2utt file: 000 000_a01-000u-00 000_a01-000u-01 000_a01-000u-02 000_a01-000u-03 - -stage=0 -download_dir=/export/corpora/LDC/LDC2014T13 +download_dir1=/export/corpora/LDC/LDC2014T13/data +train_split_url=http://www.openslr.org/resources/50/madcat.train.raw.lineid +test_split_url=http://www.openslr.org/resources/50/madcat.test.raw.lineid +dev_split_url=http://www.openslr.org/resources/50/madcat.dev.raw.lineid data_split_dir=data/download/datasplits . ./cmd.sh . ./path.sh . ./utils/parse_options.sh || exit 1; -if [[ ! -d $download_dir ]]; then - echo "$0: Warning: Couldn't find $download_dir." - echo "" +if [ -d $data_split_dir ]; then + echo "$0: Not downloading the data splits as it is already there." +else + if [ ! -f $data_split_dir/madcat.train.raw.lineid ]; then + mkdir -p $data_split_dir + echo "$0: Downloading the data splits..." 
+ wget -P $data_split_dir $train_split_url || exit 1; + wget -P $data_split_dir $test_split_url || exit 1; + wget -P $data_split_dir $dev_split_url || exit 1; + fi + echo "$0: Done downloading the data splits" fi -mkdir -p data/{train,test,dev}/lines -if [ $stage -le 1 ]; then - local/process_data.py $download_dir $data_split_dir/madcat.train.raw.lineid data/train || exit 1 - local/process_data.py $download_dir $data_split_dir/madcat.test.raw.lineid data/test || exit 1 - local/process_data.py $download_dir $data_split_dir/madcat.dev.raw.lineid data/dev || exit 1 - - for dataset in train test dev; do - echo "$0: Fixing data directory for dataset: $dataset" - echo "Date: $(date)." - image/fix_data_dir.sh data/$dataset - done +if [ -d $download_dir1 ]; then + echo "$0: madcat chinese data directory is present." +else + if [ ! -f $download_dir1/madcat/*.madcat.xml ]; then + echo "$0: please download madcat data..." + fi fi diff --git a/egs/madcat_zh/v1/run_end2end.sh b/egs/madcat_zh/v1/run_end2end.sh index 7f759e54b57..fbfda3e6c76 100755 --- a/egs/madcat_zh/v1/run_end2end.sh +++ b/egs/madcat_zh/v1/run_end2end.sh @@ -12,18 +12,35 @@ password= # in "local/prepare_data.sh" to download the database: madcat_database=/export/corpora/LDC/LDC2014T13 data_split_dir=data/download/datasplits +overwrite=false +corpus_dir=/export/corpora5/handwriting_ocr/corpus_data/zh/ . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. . ./path.sh . ./utils/parse_options.sh # e.g. this parses the above options # if supplied. +./local/check_tools.sh -#./local/check_tools.sh +# Start from stage=-1 for using extra corpus text +if [ $stage -le -1 ]; then + echo "$(date): getting corpus text for language modelling..." + mkdir -p data/local/text/cleaned + cat $corpus_dir/* > data/local/text/zh.txt + head -20000 data/local/text/zh.txt > data/local/text/cleaned/val.txt + tail -n +20000 data/local/text/zh.txt > data/local/text/cleaned/corpus.txt +fi if [ $stage -le 0 ]; then - local/download_data.sh --download-dir1 $madcat_database/data --data-split-dir $data_split_dir + if [ -f data/train/text ] && ! $overwrite; then + echo "$0: Not processing, probably script have run from wrong stage" + echo "Exiting with status 1 to avoid data corruption" + exit 1; + fi + + echo "$0: Preparing data..." + local/prepare_data.sh --download-dir1 $madcat_database/data --data-split-dir $data_split_dir for dataset in train test dev; do local/extract_lines.sh --nj $nj --cmd $cmd \ @@ -32,18 +49,20 @@ if [ $stage -le 0 ]; then data/${dataset}/lines done - echo "$0: Preparing data..." - local/prepare_data.sh --download-dir "$madcat_database" + echo "$0: Processing data..." + for set in dev train test; do + local/process_data.py $madcat_database $data_split_dir/madcat.$set.raw.lineid data/$set + image/fix_data_dir.sh data/$set + done + fi mkdir -p data/{train,test}/data if [ $stage -le 1 ]; then image/get_image2num_frames.py --feat-dim 80 data/train # This will be needed for the next command - # The next command creates a "allowed_lengths.txt" file in data/train # which will be used by local/make_features.py to enforce the images to # have allowed lengths. The allowed lengths will be spaced by 10% difference in length. - image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train echo "$0: Preparing the test and train feature files..." 
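
image/get_image2num_frames.py, called in stage 1 above, records how many feature frames each line image will produce once its height is rescaled to --feat-dim (80 in this recipe); get_allowed_lengths.py then builds the length ladder from those counts. Assuming the frame count is simply the rescaled width, as in the get_scaled_image helper removed earlier in this patch, a minimal sketch is:

def num_frames(height, width, feat_dim=80):
    # When the image height is rescaled to feat_dim pixels, the width scales
    # by the same factor and every remaining pixel column becomes one frame.
    return int(width * feat_dim / height)

print(num_frames(120, 900))   # 600 frames for a 120x900 line image
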
for dataset in train test; do From b86beb2d878532bc1565ac9e5c9968a25ba10965 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Wed, 9 Jan 2019 23:43:14 -0500 Subject: [PATCH 08/18] updating parameters --- .../v1/local/chain/tuning/run_cnn_1a.sh | 46 ++++++----------- .../local/chain/tuning/run_cnn_chainali_1a.sh | 41 ++++----------- .../local/chain/tuning/run_cnn_chainali_1b.sh | 50 ++++++------------- .../v1/local/chain/tuning/run_e2e_cnn_1a.sh | 43 ++++++---------- egs/madcat_zh/v1/local/prepare_data.sh | 16 ++++-- 5 files changed, 69 insertions(+), 127 deletions(-) diff --git a/egs/madcat_zh/v1/local/chain/tuning/run_cnn_1a.sh b/egs/madcat_zh/v1/local/chain/tuning/run_cnn_1a.sh index 43d083099a3..d17b3e3c9c5 100755 --- a/egs/madcat_zh/v1/local/chain/tuning/run_cnn_1a.sh +++ b/egs/madcat_zh/v1/local/chain/tuning/run_cnn_1a.sh @@ -20,7 +20,7 @@ set -e -o pipefail stage=0 nj=50 -train_set=train_60 +train_set=train gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it # should have alignments for the specified training data. nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. @@ -32,28 +32,16 @@ reporting_email= # chain options train_stage=-10 xent_regularize=0.1 -frame_subsampling_factor=4 -alignment_subsampling_factor=1 -# training chunk-options chunk_width=340,300,200,100 num_leaves=500 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 tdnn_dim=450 -# training options -srand=0 -remove_egs=false -lang_test=lang_test # End configuration section. echo "$0 $@" # Print the command line for logging - . ./cmd.sh . ./path.sh . ./utils/parse_options.sh - if ! cuda-compiled; then cat < $dir/configs/network.xconfig input dim=40 name=input - conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 @@ -181,12 +169,12 @@ if [ $stage -le 5 ]; then --chain.l2-regularize=0.00005 \ --chain.apply-deriv-weights=false \ --chain.lm-opts="--num-extra-lm-states=500" \ - --chain.frame-subsampling-factor=$frame_subsampling_factor \ - --chain.alignment-subsampling-factor=$alignment_subsampling_factor \ - --trainer.srand=$srand \ + --chain.frame-subsampling-factor=4 \ + --chain.alignment-subsampling-factor=1 \ + --trainer.srand=0 \ --trainer.max-param-change=2.0 \ - --trainer.num-epochs=4 \ - --trainer.frames-per-iter=1000000 \ + --trainer.num-epochs=2 \ + --trainer.frames-per-iter=2000000 \ --trainer.optimization.num-jobs-initial=8 \ --trainer.optimization.num-jobs-final=16 \ --trainer.optimization.initial-effective-lrate=0.001 \ @@ -195,13 +183,9 @@ if [ $stage -le 5 ]; then --trainer.num-chunk-per-minibatch=64,32 \ --trainer.optimization.momentum=0.0 \ --egs.chunk-width=$chunk_width \ - --egs.chunk-left-context=$chunk_left_context \ - --egs.chunk-right-context=$chunk_right_context \ - --egs.chunk-left-context-initial=0 \ - --egs.chunk-right-context-final=0 \ --egs.dir="$common_egs_dir" \ --egs.opts="--frames-overlap-per-eg 0" \ - --cleanup.remove-egs=$remove_egs \ + --cleanup.remove-egs=false \ --use-gpu=true \ --reporting.email="$reporting_email" \ --feat-dir=$train_data_dir \ @@ -217,20 +201,15 @@ if [ $stage -le 6 ]; then # topology file from the model). 
So you could give it a different # lang directory, one that contained a wordlist and LM of your choice, # as long as phones.txt was compatible. - utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + --self-loop-scale 1.0 data/lang_test \ $dir $dir/graph || exit 1; fi if [ $stage -le 7 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ --frames-per-chunk $frames_per_chunk \ --nj $nj --cmd "$cmd" \ - $dir/graph data/test_60 $dir/decode_test || exit 1; + $dir/graph data/test $dir/decode_test || exit 1; fi diff --git a/egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1b.sh b/egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1b.sh index bdad37f0c1d..5a3b85422f6 100755 --- a/egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1b.sh +++ b/egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1b.sh @@ -17,9 +17,8 @@ set -e -o pipefail stage=0 - nj=30 -train_set=train_60 +train_set=train gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it # should have alignments for the specified training data. nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. @@ -28,32 +27,20 @@ ali=tri3_ali chain_model_dir=exp/chain${nnet3_affix}/cnn${affix} common_egs_dir= reporting_email= - # chain options train_stage=-10 xent_regularize=0.1 -frame_subsampling_factor=4 -alignment_subsampling_factor=1 # training chunk-options chunk_width=340,300,200,100 num_leaves=500 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 tdnn_dim=450 -# training options -srand=0 -remove_egs=false -lang_test=lang_test # End configuration section. echo "$0 $@" # Print the command line for logging - . ./cmd.sh . ./path.sh . ./utils/parse_options.sh - if ! 
cuda-compiled; then cat < $dir/configs/network.xconfig input dim=80 name=input - conv-relu-batchnorm-layer name=cnn1 height-in=80 height-out=80 time-offsets=-3,-2,-1,0,1,2,3 $common1 conv-relu-batchnorm-layer name=cnn2 height-in=80 height-out=40 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 conv-relu-batchnorm-layer name=cnn3 height-in=40 height-out=40 time-offsets=-4,-2,0,2,4 $common2 @@ -97,13 +85,12 @@ if [ $stage -le 2 ]; then conv-relu-batchnorm-layer name=cnn6 height-in=20 height-out=20 time-offsets=-1,0,1 $common3 conv-relu-batchnorm-layer name=cnn7 height-in=20 height-out=20 time-offsets=-1,0,1 $common3 conv-relu-batchnorm-layer name=cnn8 height-in=20 height-out=10 time-offsets=-1,0,1 $common3 height-subsample-out=2 - relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $opts_2 - relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $opts_2 - relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $opts_2 - + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim ## adding the layers for chain branch - relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $opts_2 - output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $opts_3 + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs @@ -115,9 +102,9 @@ if [ $stage -le 3 ]; then steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ --cmd "$cmd" \ - --feat.cmvn-opts "$cmvn_opts" \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize $l2_regularize \ + --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --egs.dir "$common_egs_dir" \ --egs.stage $get_egs_stage \ @@ -125,11 +112,11 @@ if [ $stage -le 3 ]; then --chain.frame-subsampling-factor 4 \ --chain.alignment-subsampling-factor 4 \ --trainer.num-chunk-per-minibatch $minibatch_size \ - --trainer.frames-per-iter $frames_per_iter \ - --trainer.num-epochs $num_epochs \ + --trainer.frames-per-iter 2000000 \ + --trainer.num-epochs 2 \ --trainer.optimization.momentum 0 \ - --trainer.optimization.num-jobs-initial $num_jobs_initial \ - --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.num-jobs-initial 6 \ + --trainer.optimization.num-jobs-final 12 \ --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.shrink-value 1.0 \ @@ -149,7 +136,7 @@ if [ $stage -le 4 ]; then # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + --self-loop-scale 1.0 data/lang_test \ $dir $dir/graph || exit 1; fi diff --git a/egs/madcat_zh/v1/local/prepare_data.sh b/egs/madcat_zh/v1/local/prepare_data.sh index 6b4055f7205..ba35b90b173 100755 --- a/egs/madcat_zh/v1/local/prepare_data.sh +++ b/egs/madcat_zh/v1/local/prepare_data.sh @@ -1,10 +1,20 @@ #!/bin/bash -# Copyright 2018 Ashish Arora +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora +# 2017 Hossein Hadian # Apache 2.0 -# This script downloads data splits for MADCAT Chinese dataset. -# It also check if madcat chinese data is present or not. 
+# This script downloads the Madcat Chinese handwriting database and prepares the training +# and test data (i.e text, images.scp, utt2spk and spk2utt) by calling process_data.py. +# It also downloads the LOB and Brown text corpora. It downloads the database files +# only if they do not already exist in download directory. + +# Eg. local/prepare_data.sh +# Eg. text file: 000_a01-000u-00 A MOVE to stop Mr. Gaitskell from +# utt2spk file: 000_a01-000u-00 000 +# images.scp file: 000_a01-000u-00 data/local/lines/a01/a01-000u/a01-000u-00.png +# spk2utt file: 000 000_a01-000u-00 000_a01-000u-01 000_a01-000u-02 000_a01-000u-03 download_dir1=/export/corpora/LDC/LDC2014T13/data train_split_url=http://www.openslr.org/resources/50/madcat.train.raw.lineid From 360710c5f92b044ea4bad9bebde2367503da6947 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Wed, 9 Jan 2019 23:50:57 -0500 Subject: [PATCH 09/18] updating run.sh --- egs/madcat_zh/v1/run.sh | 61 ++++++++++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 19 deletions(-) diff --git a/egs/madcat_zh/v1/run.sh b/egs/madcat_zh/v1/run.sh index f591dcccb35..62bfa0ab05e 100755 --- a/egs/madcat_zh/v1/run.sh +++ b/egs/madcat_zh/v1/run.sh @@ -16,16 +16,36 @@ decode_gmm=true # The datasplits can be found on http://www.openslr.org/51/ madcat_database=/export/corpora/LDC/LDC2014T13 data_split_dir=data/download/datasplits +overwrite=false +corpus_dir=/export/corpora5/handwriting_ocr/corpus_data/zh/ . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. . ./path.sh . ./utils/parse_options.sh # e.g. this parses the above options # if supplied. +./local/check_tools.sh + +# Start from stage=-1 for using extra corpus text +if [ $stage -le -1 ]; then + echo "$(date): getting corpus text for language modelling..." + mkdir -p data/local/text/cleaned + cat $corpus_dir/* > data/local/text/zh.txt + head -20000 data/local/text/zh.txt > data/local/text/cleaned/val.txt + tail -n +20000 data/local/text/zh.txt > data/local/text/cleaned/corpus.txt +fi mkdir -p data/{train,test,dev}/lines if [ $stage -le 0 ]; then - local/download_data.sh --download-dir1 $madcat_database/data --data-split-dir $data_split_dir + + if [ -f data/train/text ] && ! $overwrite; then + echo "$0: Not processing, probably script have run from wrong stage" + echo "Exiting with status 1 to avoid data corruption" + exit 1; + fi + + echo "$0: Preparing data..." + local/prepare_data.sh --download-dir1 $madcat_database/data --data-split-dir $data_split_dir for dataset in train test dev; do local/extract_lines.sh --nj $nj --cmd $cmd \ @@ -34,20 +54,23 @@ if [ $stage -le 0 ]; then data/${dataset}/lines done - echo "$0: Preparing data..." - local/prepare_data.sh --download-dir $madcat_database + echo "$0: Processing data..." 
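
One detail worth flagging in the stage -1 corpus preparation added to run.sh above (and the same block in run_end2end.sh): head -20000 keeps lines 1 through 20000 for val.txt, while tail -n +20000 starts corpus.txt at line 20000, so the two files share one line; tail -n +20001 would make the split disjoint. A small Python equivalent of the intended split, with illustrative file names:

# Split a large corpus file into a held-out set and LM training text.
# Paths are illustrative; the recipe writes data/local/text/cleaned/{val,corpus}.txt.
heldout_lines = 20000
with open('zh.txt', encoding='utf-8') as src, \
     open('val.txt', 'w', encoding='utf-8') as val, \
     open('corpus.txt', 'w', encoding='utf-8') as corpus:
    for lineno, line in enumerate(src, start=1):
        # first 20000 lines are held out for perplexity tuning, the rest is LM text
        (val if lineno <= heldout_lines else corpus).write(line)
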
+ for set in dev train test; do + local/process_data.py $madcat_database $data_split_dir/madcat.$set.raw.lineid data/$set + image/fix_data_dir.sh data/$set + done fi # This script uses feat-dim of 60 while the end2end version uses a feat-dim of 80 -mkdir -p data/{train_60,test_60,dev_60}/data +mkdir -p data/{train,test,dev}/data if [ $stage -le 1 ]; then for dataset in train test dev; do for prepared in utt2spk text images.scp spk2utt; do - cp data/$dataset/$prepared data/${dataset}_60/$prepared + cp data/$dataset/$prepared data/${dataset}/$prepared done - local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 60 data/${dataset}_60 - steps/compute_cmvn_stats.sh data/${dataset}_60 + local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 60 data/${dataset} + steps/compute_cmvn_stats.sh data/${dataset} done fi @@ -67,56 +90,56 @@ if [ $stage -le 3 ]; then fi if [ $stage -le 4 ]; then - steps/train_mono.sh --nj $nj --cmd $cmd --totgauss 10000 data/train_60 \ + steps/train_mono.sh --nj $nj --cmd $cmd --totgauss 10000 data/train \ data/lang exp/mono fi if [ $stage -le 5 ] && $decode_gmm; then utils/mkgraph.sh --mono data/lang_test exp/mono exp/mono/graph - steps/decode.sh --nj $nj --cmd $cmd exp/mono/graph data/test_60 \ + steps/decode.sh --nj $nj --cmd $cmd exp/mono/graph data/test \ exp/mono/decode_test fi if [ $stage -le 6 ]; then - steps/align_si.sh --nj $nj --cmd $cmd data/train_60 data/lang \ + steps/align_si.sh --nj $nj --cmd $cmd data/train data/lang \ exp/mono exp/mono_ali steps/train_deltas.sh --cmd $cmd --context-opts "--context-width=2 --central-position=1" \ - 50000 20000 data/train_60 data/lang \ + 50000 20000 data/train data/lang \ exp/mono_ali exp/tri fi if [ $stage -le 7 ] && $decode_gmm; then utils/mkgraph.sh data/lang_test exp/tri exp/tri/graph - steps/decode.sh --nj $nj --cmd $cmd exp/tri/graph data/test_60 \ + steps/decode.sh --nj $nj --cmd $cmd exp/tri/graph data/test \ exp/tri/decode_test fi if [ $stage -le 8 ]; then - steps/align_si.sh --nj $nj --cmd $cmd data/train_60 data/lang \ + steps/align_si.sh --nj $nj --cmd $cmd data/train data/lang \ exp/tri exp/tri_ali steps/train_lda_mllt.sh --cmd $cmd \ --splice-opts "--left-context=3 --right-context=3" \ --context-opts "--context-width=2 --central-position=1" 50000 20000 \ - data/train_60 data/lang exp/tri_ali exp/tri2 + data/train data/lang exp/tri_ali exp/tri2 fi if [ $stage -le 9 ] && $decode_gmm; then utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph steps/decode.sh --nj $nj --cmd $cmd exp/tri2/graph \ - data/test_60 exp/tri2/decode_test + data/test exp/tri2/decode_test fi if [ $stage -le 10 ]; then steps/align_fmllr.sh --nj $nj --cmd $cmd --use-graphs true \ - data/train_60 data/lang exp/tri2 exp/tri2_ali + data/train data/lang exp/tri2 exp/tri2_ali steps/train_sat.sh --cmd $cmd --context-opts "--context-width=2 --central-position=1" \ - 50000 20000 data/train_60 data/lang \ + 50000 20000 data/train data/lang \ exp/tri2_ali exp/tri3 fi @@ -124,12 +147,12 @@ if [ $stage -le 11 ] && $decode_gmm; then utils/mkgraph.sh data/lang_test exp/tri3 exp/tri3/graph steps/decode_fmllr.sh --nj $nj --cmd $cmd exp/tri3/graph \ - data/test_60 exp/tri3/decode_test + data/test exp/tri3/decode_test fi if [ $stage -le 12 ]; then steps/align_fmllr.sh --nj $nj --cmd $cmd --use-graphs true \ - data/train_60 data/lang exp/tri3 exp/tri3_ali + data/train data/lang exp/tri3 exp/tri3_ali fi if [ $stage -le 13 ]; then From 051f3fea4dd5a1967fcd6475015ad28f5a17f2b0 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Thu, 10 Jan 2019 00:03:45 -0500 
Subject: [PATCH 10/18] updating run.sh --- egs/madcat_zh/v1/run.sh | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/egs/madcat_zh/v1/run.sh b/egs/madcat_zh/v1/run.sh index 62bfa0ab05e..b3ef370c830 100755 --- a/egs/madcat_zh/v1/run.sh +++ b/egs/madcat_zh/v1/run.sh @@ -61,16 +61,11 @@ if [ $stage -le 0 ]; then done fi -# This script uses feat-dim of 60 while the end2end version uses a feat-dim of 80 mkdir -p data/{train,test,dev}/data if [ $stage -le 1 ]; then for dataset in train test dev; do - for prepared in utt2spk text images.scp spk2utt; do - cp data/$dataset/$prepared data/${dataset}/$prepared - done - - local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 60 data/${dataset} - steps/compute_cmvn_stats.sh data/${dataset} + local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 60 data/$dataset + steps/compute_cmvn_stats.sh data/$dataset done fi From 0d492b141fbdcd71217019adb2ec4f6c62937005 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Thu, 10 Jan 2019 01:01:02 -0500 Subject: [PATCH 11/18] adding aachen splits --- egs/iam/v1/local/chain/compare_wer.sh | 46 ++++++++++++ egs/iam/v1/local/prepare_data.sh | 36 +++++++--- egs/iam/v1/local/process_aachen_splits.py | 88 +++++++++++++++++++++++ egs/iam/v1/run.sh | 12 +++- egs/iam/v1/run_end2end.sh | 12 +++- 5 files changed, 183 insertions(+), 11 deletions(-) create mode 100755 egs/iam/v1/local/process_aachen_splits.py diff --git a/egs/iam/v1/local/chain/compare_wer.sh b/egs/iam/v1/local/chain/compare_wer.sh index ad90710b13f..2ce14e13694 100755 --- a/egs/iam/v1/local/chain/compare_wer.sh +++ b/egs/iam/v1/local/chain/compare_wer.sh @@ -27,6 +27,14 @@ for x in $*; do done echo +echo -n "# WER (rescored) " +for x in $*; do + wer="--" + [ -d $x/decode_test_rescored ] && wer=$(cat $x/decode_test_rescored/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + echo -n "# CER " for x in $*; do cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}') @@ -34,6 +42,44 @@ for x in $*; do done echo +echo -n "# CER (rescored) " +for x in $*; do + cer="--" + [ -d $x/decode_test_rescored ] && cer=$(cat $x/decode_test_rescored/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +echo -n "# WER val " +for x in $*; do + wer=$(cat $x/decode_val/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# WER (rescored) val " +for x in $*; do + wer="--" + [ -d $x/decode_val_rescored ] && wer=$(cat $x/decode_val_rescored/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# CER val " +for x in $*; do + cer=$(cat $x/decode_val/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +echo -n "# CER (rescored) val " +for x in $*; do + cer="--" + [ -d $x/decode_val_rescored ] && cer=$(cat $x/decode_val_rescored/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + if $used_epochs; then exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. 
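
The compare_wer.sh additions above read scoring_kaldi/best_wer and best_cer and print the second whitespace-separated field (the figure after %WER or %CER), falling back to "--" when a rescored or val decode directory is missing. An equivalent helper in Python, assuming the usual exp/chain/<model>/decode_<set>[_rescored] layout:

import os

def best_metric(exp_dir, decode_dir, metric='wer'):
    # Read the best_wer/best_cer file and return its second field, the number
    # printed after %WER or %CER; return '--' when the decode directory is
    # absent (for example when no rescoring was run).
    path = os.path.join(exp_dir, decode_dir, 'scoring_kaldi', 'best_' + metric)
    if not os.path.isfile(path):
        return '--'
    with open(path) as f:
        return f.read().split()[1]

print(best_metric('exp/chain/cnn_e2eali_1b', 'decode_test'))
print(best_metric('exp/chain/cnn_e2eali_1b', 'decode_val_rescored', 'cer'))
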
fi diff --git a/egs/iam/v1/local/prepare_data.sh b/egs/iam/v1/local/prepare_data.sh index 9c01ac90f28..dc07f07e318 100755 --- a/egs/iam/v1/local/prepare_data.sh +++ b/egs/iam/v1/local/prepare_data.sh @@ -18,6 +18,7 @@ stage=0 download_dir=data/download +process_aachen_split=false wellington_dir= username= password= # username and password for downloading the IAM database @@ -53,6 +54,8 @@ ascii_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/ascii/ascii.tgz brown_corpus_url=http://www.sls.hawaii.edu/bley-vroman/brown.txt lob_corpus_url=http://ota.ox.ac.uk/text/0167.zip wellington_corpus_loc=/export/corpora5/Wellington/WWC/ +aachen_split_url=http://www.openslr.org/resources/56/splits.zip +aachen_splits=data/local/aachensplits mkdir -p $download_dir data/local # download and extact images and transcription @@ -144,6 +147,18 @@ else echo "$0: Wellington Corpus not included because wellington_dir not provided" fi +if [ -d $aachen_splits ]; then + echo "$0: Not downloading the Aachen splits as it is already there." +else + if [ ! -f $aachen_splits/splits.zip ]; then + echo "$0: Downloading Aachen splits ..." + mkdir -p $aachen_splits + wget -P $aachen_splits/ $aachen_split_url || exit 1; + fi + unzip $aachen_splits/splits.zip -d $aachen_splits || exit 1; + echo "$0: Done downloading and extracting Aachen splits" +fi + mkdir -p data/{train,test,val} file_name=largeWriterIndependentTextLineRecognitionTask @@ -160,12 +175,17 @@ cat $train_old > $train_new cat $test_old > $test_new cat $val1_old $val2_old > $val_new -if [ $stage -le 0 ]; then - local/process_data.py data/local data/train --dataset train || exit 1 - local/process_data.py data/local data/test --dataset test || exit 1 - local/process_data.py data/local data/val --dataset validation || exit 1 - - image/fix_data_dir.sh data/train - image/fix_data_dir.sh data/test - image/fix_data_dir.sh data/val +if $process_aachen_split; then + local/process_aachen_splits.py data/local $aachen_splits/splits data/train --dataset train || exit 1 + local/process_aachen_splits.py data/local $aachen_splits/splits data/test --dataset test || exit 1 + local/process_aachen_splits.py data/local $aachen_splits/splits data/val --dataset validation || exit 1 +else + local/process_data.py data/local data/train --dataset train || exit 1 + local/process_data.py data/local data/test --dataset test || exit 1 + local/process_data.py data/local data/val --dataset validation || exit 1 fi + +image/fix_data_dir.sh data/train +image/fix_data_dir.sh data/test +image/fix_data_dir.sh data/val + diff --git a/egs/iam/v1/local/process_aachen_splits.py b/egs/iam/v1/local/process_aachen_splits.py new file mode 100755 index 00000000000..cb6a6d4f0d8 --- /dev/null +++ b/egs/iam/v1/local/process_aachen_splits.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora + +""" This script reads the extracted IAM database files and creates + the following files (for the data subset selected via --dataset): + text, utt2spk, images.scp. + + Eg. local/process_aachen_splits.py data/local data/train data --dataset train + Eg. text file: 000_a01-000u-00 A MOVE to stop Mr. 
Gaitskell from + utt2spk file: 000_a01-000u-00 000 + images.scp file: 000_a01-000u-00 data/local/lines/a01/a01-000u/a01-000u-00.png +""" + +import argparse +import os +import sys +import xml.dom.minidom as minidom + +parser = argparse.ArgumentParser(description="""Creates text, utt2spk + and images.scp files.""") +parser.add_argument('database_path', type=str, + help='Path to the downloaded (and extracted) IAM data') +parser.add_argument('split_path', type=str, + help='location of the train/test/val set') +parser.add_argument('out_dir', type=str, + help='location to write output files.') +parser.add_argument('--dataset', type=str, default='train', + choices=['train', 'test','validation'], + help='Subset of data to process.') +args = parser.parse_args() + +text_file = os.path.join(args.out_dir + '/', 'text') +text_fh = open(text_file, 'w') + +utt2spk_file = os.path.join(args.out_dir + '/', 'utt2spk') +utt2spk_fh = open(utt2spk_file, 'w') + +image_file = os.path.join(args.out_dir + '/', 'images.scp') +image_fh = open(image_file, 'w') + +dataset_path = os.path.join(args.split_path, + args.dataset + '.uttlist') + +text_file_path = os.path.join(args.database_path, + 'ascii','lines.txt') +text_dict = {} +def process_text_file_for_word_model(): + with open (text_file_path, 'rt') as in_file: + for line in in_file: + if line[0]=='#': + continue + line = line.strip() + utt_id = line.split(' ')[0] + text_vect = line.split(' ')[8:] + text = "".join(text_vect) + text = text.replace("|", " ") + text_dict[utt_id] = text + + +### main ### + +print("Processing '{}' data...".format(args.dataset)) +process_text_file_for_word_model() + +with open(dataset_path) as f: + for line in f: + line = line.strip() + line_vect = line.split('-') + xml_file = line_vect[0] + '-' + line_vect[1] + xml_path = os.path.join(args.database_path, 'xml', xml_file + '.xml') + doc = minidom.parse(xml_path) + form_elements = doc.getElementsByTagName('form')[0] + writer_id = form_elements.getAttribute('writer-id') + outerfolder = form_elements.getAttribute('id')[0:3] + innerfolder = form_elements.getAttribute('id') + lines_path = os.path.join(args.database_path, 'lines', + outerfolder, innerfolder) + for file in os.listdir(lines_path): + if file.endswith(".png"): + image_file_path = os.path.join(lines_path, file) + base_name = os.path.splitext(os.path.basename(image_file_path))[0] + text = text_dict[base_name] + utt_id = writer_id + '_' + base_name + text_fh.write(utt_id + ' ' + text + '\n') + utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') + image_fh.write(utt_id + ' ' + image_file_path + '\n') diff --git a/egs/iam/v1/run.sh b/egs/iam/v1/run.sh index 7907f018c04..049223aba07 100755 --- a/egs/iam/v1/run.sh +++ b/egs/iam/v1/run.sh @@ -21,6 +21,8 @@ iam_database=/export/corpora5/handwriting_ocr/IAM # "https://www.victoria.ac.nz/lals/resources/corpora-default" wellington_database=/export/corpora5/Wellington/WWC/ train_set=train_aug +process_aachen_split=false +overwrite=false . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. @@ -40,7 +42,8 @@ if [ $stage -le 0 ]; then echo "$0: Preparing data..." 
local/prepare_data.sh --download-dir "$iam_database" \
 --wellington-dir "$wellington_database" \
- --username "$username" --password "$password"
+ --username "$username" --password "$password" \
+ --process_aachen_split $process_aachen_split
 fi
 
 mkdir -p data/{train,test,val}/data
@@ -84,7 +87,9 @@ if [ $stage -le 4 ]; then
 local/prepare_dict.sh --vocab-size 500k --dir data/local/dict # this is for training
 utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.95 \
 data/local/dict "<unk>" data/lang/temp data/lang
-
+ silphonelist=`cat data/lang/phones/silence.csl`
+ nonsilphonelist=`cat data/lang/phones/nonsilence.csl`
+ local/gen_topo.py 8 4 4 $nonsilphonelist $silphonelist data/lang/phones.txt >data/lang/topo
 # This is for decoding. We use a 50k lexicon to be consistent with the papers
 # reporting WERs on IAM:
 local/prepare_dict.sh --vocab-size 50k --dir data/local/dict_50k # this is for decoding
@@ -99,6 +104,9 @@ if [ $stage -le 4 ]; then
 utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 \
 --unk-fst exp/unk_lang_model/unk_fst.txt \
 data/local/dict_50k "<unk>" data/lang_unk/temp data/lang_unk
+ silphonelist=`cat data/lang/phones/silence.csl`
+ nonsilphonelist=`cat data/lang/phones/nonsilence.csl`
+ local/gen_topo.py 8 4 4 $nonsilphonelist $silphonelist data/lang/phones.txt >data/lang_unk/topo
 cp data/lang_test/G.fst data/lang_unk/G.fst
 fi
diff --git a/egs/iam/v1/run_end2end.sh b/egs/iam/v1/run_end2end.sh
index 58461c740b6..fa64ea944a1 100755
--- a/egs/iam/v1/run_end2end.sh
+++ b/egs/iam/v1/run_end2end.sh
@@ -6,6 +6,8 @@ stage=0
 nj=20
 username=
 password=
+process_aachen_split=false
+overwrite=false
 # iam_database points to the database path on the JHU grid. If you have not
 # already downloaded the database you can set it to a local directory
 # like "data/download" and follow the instructions
@@ -24,10 +26,18 @@ train_set=train_aug
 # if supplied.
 ./local/check_tools.sh
 if [ $stage -le 0 ]; then
+
+  if [ -f data/train/text ] && ! $overwrite; then
+    echo "$0: Not processing; the script has probably been run from the wrong stage"
+    echo "Exiting with status 1 to avoid data corruption"
+    exit 1;
+  fi
+
 echo "$0: Preparing data..."
local/prepare_data.sh --download-dir "$iam_database" \ --wellington-dir "$wellington_database" \ - --username "$username" --password "$password" + --username "$username" --password "$password" \ + --process_aachen_split $process_aachen_split fi mkdir -p data/{train,test,val}/data From 3775cfe46d6482330c99abe77260c02547490a30 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Thu, 10 Jan 2019 01:13:26 -0500 Subject: [PATCH 12/18] minor change --- egs/madcat_zh/v1/local/chain/tuning/run_e2e_cnn_1a.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/egs/madcat_zh/v1/local/chain/tuning/run_e2e_cnn_1a.sh b/egs/madcat_zh/v1/local/chain/tuning/run_e2e_cnn_1a.sh index 0f6de7ba563..9b5106a892d 100755 --- a/egs/madcat_zh/v1/local/chain/tuning/run_e2e_cnn_1a.sh +++ b/egs/madcat_zh/v1/local/chain/tuning/run_e2e_cnn_1a.sh @@ -56,7 +56,7 @@ if [ $stage -le 0 ]; then fi if [ $stage -le 1 ]; then - steps/nnet3/chain/e2e/prepare_e2e.sh --nj 30 --cmd "$cmd" \ + steps/nnet3/chain/e2e/prepare_e2e.sh --nj 70 --cmd "$cmd" \ --shared-phones true \ --type mono \ data/$train_set $lang $treedir @@ -116,7 +116,7 @@ if [ $stage -le 3 ]; then --trainer.num-epochs 2 \ --trainer.optimization.momentum 0 \ --trainer.optimization.num-jobs-initial 6 \ - --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.num-jobs-final 16 \ --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.shrink-value 1.0 \ @@ -143,7 +143,7 @@ fi if [ $stage -le 5 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --nj 30 --cmd "$cmd" \ + --nj 70 --cmd "$cmd" \ $dir/graph data/test $dir/decode_test || exit 1; fi From 6f77b74d297a1907ea3559a4ac55fe3f9f1b411e Mon Sep 17 00:00:00 2001 From: aarora8 Date: Thu, 10 Jan 2019 01:23:04 -0500 Subject: [PATCH 13/18] adding val decoding --- .../local/chain/tuning/run_cnn_chainali_1a.sh | 21 +++++++++++-------- .../local/chain/tuning/run_cnn_chainali_1b.sh | 19 ++++++++++------- .../local/chain/tuning/run_cnn_chainali_1c.sh | 19 ++++++++++------- 3 files changed, 34 insertions(+), 25 deletions(-) diff --git a/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1a.sh b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1a.sh index ee3a1a3d92c..82304351533 100755 --- a/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1a.sh +++ b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1a.sh @@ -8,6 +8,7 @@ stage=0 nj=30 train_set=train +decode_val=true gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it # should have alignments for the specified training data. nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. @@ -32,7 +33,8 @@ tdnn_dim=450 # training options srand=0 remove_egs=false -lang_test=lang_test +lang_test=lang_unk +if $decode_val; then maybe_val=val; else maybe_val= ; fi # End configuration section. 
echo "$0 $@" # Print the command line for logging @@ -219,12 +221,13 @@ fi if [ $stage -le 7 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ - --frames-per-chunk $frames_per_chunk \ - --nj $nj --cmd "$cmd" \ - $dir/graph data/test $dir/decode_test || exit 1; + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + done fi + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1b.sh b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1b.sh index c6876fbafcb..c7446d7283e 100755 --- a/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1b.sh +++ b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1b.sh @@ -22,6 +22,7 @@ stage=0 nj=30 train_set=train +decode_val=true gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it # should have alignments for the specified training data. nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. @@ -47,6 +48,7 @@ tdnn_dim=450 srand=0 remove_egs=false lang_test=lang_unk +if $decode_val; then maybe_val=val; else maybe_val= ; fi # End configuration section. echo "$0 $@" # Print the command line for logging @@ -235,12 +237,13 @@ fi if [ $stage -le 7 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ - --frames-per-chunk $frames_per_chunk \ - --nj $nj --cmd "$cmd" \ - $dir/graph data/test $dir/decode_test || exit 1; + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + done fi + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1c.sh b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1c.sh index 54c52d913de..cff2dd7862f 100755 --- a/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1c.sh +++ b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1c.sh @@ -20,6 +20,7 @@ stage=0 nj=30 train_set=train +decode_val=true gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it # should have alignments for the specified training data. nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. @@ -44,6 +45,7 @@ tdnn_dim=450 srand=0 remove_egs=false lang_test=lang_unk +if $decode_val; then maybe_val=val; else maybe_val= ; fi # End configuration section. 
echo "$0 $@" # Print the command line for logging @@ -236,12 +238,13 @@ fi if [ $stage -le 7 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ - --frames-per-chunk $frames_per_chunk \ - --nj $nj --cmd "$cmd" \ - $dir/graph data/test $dir/decode_test || exit 1; + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + done fi + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir From 51b2043ca3472201ceef1e1b05322d8052b851b8 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Thu, 10 Jan 2019 01:25:30 -0500 Subject: [PATCH 14/18] minor change --- egs/iam/v1/run.sh | 2 +- egs/iam/v1/run_end2end.sh | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/egs/iam/v1/run.sh b/egs/iam/v1/run.sh index 049223aba07..85811b6cb3d 100755 --- a/egs/iam/v1/run.sh +++ b/egs/iam/v1/run.sh @@ -106,7 +106,7 @@ if [ $stage -le 4 ]; then data/local/dict_50k "" data/lang_unk/temp data/lang_unk silphonelist=`cat data/lang/phones/silence.csl` nonsilphonelist=`cat data/lang/phones/nonsilence.csl` - local/gen_topo.py 8 4 4 $nonsilphonelist $silphonelist data/lang/phones.txt >data/lang_unk/topo + local/gen_topo.py 8 4 4 $nonsilphonelist $silphonelist data/lang_unk/phones.txt >data/lang_unk/topo cp data/lang_test/G.fst data/lang_unk/G.fst fi diff --git a/egs/iam/v1/run_end2end.sh b/egs/iam/v1/run_end2end.sh index fa64ea944a1..0a8b014715f 100755 --- a/egs/iam/v1/run_end2end.sh +++ b/egs/iam/v1/run_end2end.sh @@ -98,6 +98,10 @@ if [ $stage -le 4 ]; then data/local/dict_50k exp/unk_lang_model utils/prepare_lang.sh --unk-fst exp/unk_lang_model/unk_fst.txt \ data/local/dict_50k "" data/lang_unk/temp data/lang_unk + + silphonelist=`cat data/lang/phones/silence.csl` + nonsilphonelist=`cat data/lang/phones/nonsilence.csl` + local/gen_topo.py 8 4 4 $nonsilphonelist $silphonelist data/lang_unk/phones.txt >data/lang_unk/topo cp data/lang_test/G.fst data/lang_unk/G.fst fi From 38b85c6ef362b40c3246f164967fd68a767b3872 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Thu, 10 Jan 2019 01:26:50 -0500 Subject: [PATCH 15/18] minor change --- egs/iam/v1/local/chain/compare_wer.sh | 32 --------------------------- 1 file changed, 32 deletions(-) diff --git a/egs/iam/v1/local/chain/compare_wer.sh b/egs/iam/v1/local/chain/compare_wer.sh index 2ce14e13694..4a2cc29481c 100755 --- a/egs/iam/v1/local/chain/compare_wer.sh +++ b/egs/iam/v1/local/chain/compare_wer.sh @@ -27,14 +27,6 @@ for x in $*; do done echo -echo -n "# WER (rescored) " -for x in $*; do - wer="--" - [ -d $x/decode_test_rescored ] && wer=$(cat $x/decode_test_rescored/scoring_kaldi/best_wer | awk '{print $2}') - printf "% 10s" $wer -done -echo - echo -n "# CER " for x in $*; do cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}') @@ -42,14 +34,6 @@ for x in $*; do done echo -echo -n "# CER (rescored) " -for x in $*; do - cer="--" - [ -d $x/decode_test_rescored ] && cer=$(cat $x/decode_test_rescored/scoring_kaldi/best_cer | awk '{print $2}') - printf "% 10s" $cer -done -echo - echo -n "# WER val " for x in $*; do wer=$(cat $x/decode_val/scoring_kaldi/best_wer | awk '{print $2}') @@ -57,14 +41,6 @@ for x in $*; do done echo -echo -n "# WER 
(rescored) val " -for x in $*; do - wer="--" - [ -d $x/decode_val_rescored ] && wer=$(cat $x/decode_val_rescored/scoring_kaldi/best_wer | awk '{print $2}') - printf "% 10s" $wer -done -echo - echo -n "# CER val " for x in $*; do cer=$(cat $x/decode_val/scoring_kaldi/best_cer | awk '{print $2}') @@ -72,14 +48,6 @@ for x in $*; do done echo -echo -n "# CER (rescored) val " -for x in $*; do - cer="--" - [ -d $x/decode_val_rescored ] && cer=$(cat $x/decode_val_rescored/scoring_kaldi/best_cer | awk '{print $2}') - printf "% 10s" $cer -done -echo - if $used_epochs; then exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. fi From a04aa19e40ab8134d53789da08d281252c04300a Mon Sep 17 00:00:00 2001 From: aarora8 Date: Thu, 10 Jan 2019 17:05:08 -0500 Subject: [PATCH 16/18] adding result --- .../v1/local/chain/tuning/run_e2e_cnn_1a.sh | 41 +++++-------------- egs/madcat_zh/v1/run_end2end.sh | 22 ++++++++-- 2 files changed, 28 insertions(+), 35 deletions(-) diff --git a/egs/madcat_zh/v1/local/chain/tuning/run_e2e_cnn_1a.sh b/egs/madcat_zh/v1/local/chain/tuning/run_e2e_cnn_1a.sh index 9b5106a892d..ffc9a4c8a14 100755 --- a/egs/madcat_zh/v1/local/chain/tuning/run_e2e_cnn_1a.sh +++ b/egs/madcat_zh/v1/local/chain/tuning/run_e2e_cnn_1a.sh @@ -1,16 +1,18 @@ #!/bin/bash # Copyright 2017 Hossein Hadian -# local/chain/compare_wer.sh exp/chain/cnn_1a/ exp/chain/cnn_chainali_1b/ exp/chain/e2e_cnn_1a/ -# System cnn_1a cnn_chainali_1b e2e_cnn_1a -# WER 13.51 6.76 10.55 -# Final train prob -0.0291 -0.0138 -0.0702 -# Final valid prob -0.0712 -0.0171 -0.0578 -# Final train prob (xent) -0.3847 -0.4169 -# Final valid prob (xent) -0.4962 -0.5040 +# local/chain/compare_wer.sh exp/chain/e2e_cnn_1a +# System e2e_cnn_1a +# WER 10.41 +# Final train prob -0.0536 +# Final valid prob -0.0489 +# Final train prob (xent) +# Final valid prob (xent) -set -e +# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a/ +# exp/chain/e2e_cnn_1a/: num-iters=63 nj=6..12 num-params=6.1M dim=80->5760 combine=-0.048->-0.048 (over 5) logprob:train/valid[41,62,final]=(-0.062,-0.065,-0.054/-0.058,-0.062,-0.049) +set -e # configs for 'chain' stage=0 train_stage=-10 @@ -126,26 +128,3 @@ if [ $stage -le 3 ]; then --tree-dir $treedir \ --dir $dir || exit 1; fi - -if [ $stage -le 4 ]; then - # The reason we are using data/lang here, instead of $lang, is just to - # emphasize that it's not actually important to give mkgraph.sh the - # lang directory with the matched topology (since it gets the - # topology file from the model). So you could give it a different - # lang directory, one that contained a wordlist and LM of your choice, - # as long as phones.txt was compatible. - - utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test \ - $dir $dir/graph || exit 1; -fi - -if [ $stage -le 5 ]; then - frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --nj 70 --cmd "$cmd" \ - $dir/graph data/test $dir/decode_test || exit 1; -fi - -echo "Done. Date: $(date). Results:" -local/chain/compare_wer.sh $dir diff --git a/egs/madcat_zh/v1/run_end2end.sh b/egs/madcat_zh/v1/run_end2end.sh index fbfda3e6c76..7e0fc1e25d1 100755 --- a/egs/madcat_zh/v1/run_end2end.sh +++ b/egs/madcat_zh/v1/run_end2end.sh @@ -81,13 +81,27 @@ if [ $stage -le 2 ]; then fi if [ $stage -le 3 ]; then + echo "$0: calling the flat-start chain recipe..." 
+ local/chain/run_e2e_cnn.sh +fi + +lang_decode=data/lang_test +decode_e2e=true +if [ $stage -le 4 ]; then echo "$0: Estimating a language model for decoding..." local/train_lm.sh utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_unpruned.arpa.gz \ - data/local/dict/lexicon.txt data/lang_test + data/local/dict/lexicon.txt $lang_decode fi -if [ $stage -le 4 ]; then - echo "$0: calling the flat-start chain recipe..." - local/chain/run_flatstart_cnn1a.sh +if [ $stage -le 5 ] && $decode_e2e; then + echo "$0: $(date) stage 5: decoding end2end setup..." + utils/mkgraph.sh --self-loop-scale 1.0 $lang_decode \ + exp/chain/e2e_cnn_1a/ exp/chain/e2e_cnn_1a/graph || exit 1; + + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 --nj $nj --cmd "$cmd" \ + exp/chain/e2e_cnn_1a/graph data/test exp/chain/e2e_cnn_1a/decode_test || exit 1; + + echo "$0: Done. Date: $(date). Results:" + local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/ fi From 0589950959eb6eabeb21b7de92f8e226dcc44463 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Thu, 10 Jan 2019 18:04:25 -0500 Subject: [PATCH 17/18] updating results --- .../v1/local/chain/tuning/run_e2e_cnn_1a.sh | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/egs/madcat_ar/v1/local/chain/tuning/run_e2e_cnn_1a.sh b/egs/madcat_ar/v1/local/chain/tuning/run_e2e_cnn_1a.sh index dc6ed4b7c2b..3caf8ae4494 100755 --- a/egs/madcat_ar/v1/local/chain/tuning/run_e2e_cnn_1a.sh +++ b/egs/madcat_ar/v1/local/chain/tuning/run_e2e_cnn_1a.sh @@ -2,18 +2,17 @@ # Copyright 2017 Hossein Hadian # This script does end2end chain training (i.e. from scratch) - # ./local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/ -# System e2e_cnn_1a -# WER 5.73 -# WER (rescored) 5.67 -# CER 1.45 -# CER (rescored) 1.42 -# Final train prob -0.0934 -# Final valid prob -0.0746 +# System e2e_cnn_1a e2e_cnn_1a (with extra corpus text) +# WER 9.47 5.73 +# WER (rescored) 8.05 5.67 +# CER 2.45 1.45 +# CER (rescored) 2.10 1.42 +# Final train prob -0.0934 -0.0934 +# Final valid prob -0.0746 -0.0746 # Final train prob (xent) # Final valid prob (xent) -# Parameters 2.94M +# Parameters 2.94M 2.94M # steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a/ # exp/chain/e2e_cnn_1a/: num-iters=98 nj=6..16 num-params=2.9M dim=40->330 combine=-0.071->-0.070 (over 5) logprob:train/valid[64,97,final]=(-0.089,-0.084,-0.093/-0.075,-0.073,-0.075) From 3be4e4135a6d6b2581e4a7c34003442486eb863d Mon Sep 17 00:00:00 2001 From: aarora8 Date: Wed, 16 Jan 2019 00:35:05 -0500 Subject: [PATCH 18/18] bug fixes, updating results --- egs/iam/v1/local/chain/tuning/run_cnn_1a.sh | 80 ++++++++---------- .../local/chain/tuning/run_cnn_chainali_1a.sh | 57 ++++++------- .../local/chain/tuning/run_cnn_chainali_1b.sh | 68 +++++---------- .../local/chain/tuning/run_cnn_chainali_1c.sh | 61 +++++--------- .../local/chain/tuning/run_cnn_chainali_1d.sh | 84 ++++++++----------- .../local/chain/tuning/run_cnn_e2eali_1a.sh | 74 +++++++--------- .../local/chain/tuning/run_cnn_e2eali_1b.sh | 78 +++++++---------- .../local/chain/tuning/run_cnn_e2eali_1c.sh | 67 +++++---------- .../v1/local/chain/tuning/run_e2e_cnn_1a.sh | 28 +++---- egs/iam/v1/local/gen_topo.py | 2 +- egs/iam/v1/local/train_lm.sh | 9 +- .../v1/local/chain/tuning/run_cnn_1a.sh | 15 +--- .../local/chain/tuning/run_cnn_chainali_1a.sh | 16 +--- egs/madcat_ar/v1/local/process_data.py | 2 +- egs/madcat_ar/v1/run.sh | 17 ++-- 15 files changed, 250 insertions(+), 408 deletions(-) diff --git a/egs/iam/v1/local/chain/tuning/run_cnn_1a.sh 
b/egs/iam/v1/local/chain/tuning/run_cnn_1a.sh index 41a76920e37..1253bbe5aa3 100755 --- a/egs/iam/v1/local/chain/tuning/run_cnn_1a.sh +++ b/egs/iam/v1/local/chain/tuning/run_cnn_1a.sh @@ -4,23 +4,23 @@ # 2017 Chun Chieh Chang # 2017 Ashish Arora -# steps/info/chain_dir_info.pl exp/chain/cnn_1a/ -# exp/chain/cnn_1a/: num-iters=21 nj=2..4 num-params=4.4M dim=40->364 combine=-0.021->-0.015 xent:train/valid[13,20,final]=(-1.05,-0.701,-0.591/-1.30,-1.08,-1.00) logprob:train/valid[13,20,final]=(-0.061,-0.034,-0.030/-0.107,-0.101,-0.098) - # local/chain/compare_wer.sh exp/chain/cnn_1a/ -# System cnn_1a -# WER 18.52 -# CER 10.07 -# Final train prob -0.0077 -# Final valid prob -0.0970 -# Final train prob (xent) -0.5484 -# Final valid prob (xent) -0.9643 -# Parameters 4.36M +# System cnn_1a(dict_50k) cnn_1a(dict_50k + unk model) +# WER 16.88 15.18 +# CER 8.52 7.58 +# WER val 16.17 13.53 +# CER val 7.15 5.89 +# Final train prob -0.0299 +# Final valid prob -0.0574 +# Final train prob (xent) -0.3912 +# Final valid prob (xent) -0.6439 +# Parameters 4.36M -set -e -o pipefail +# steps/info/chain_dir_info.pl exp/chain/cnn_1a/ +# exp/chain/cnn_1a/: num-iters=42 nj=2..4 num-params=4.4M dim=40->368 combine=-0.029->-0.029 (over 2) xent:train/valid[27,41,final]=(-0.522,-0.394,-0.391/-0.695,-0.644,-0.644) logprob:train/valid[27,41,final]=(-0.035,-0.030,-0.030/-0.056,-0.057,-0.057) +set -e -o pipefail stage=0 - nj=30 train_set=train gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it @@ -34,28 +34,21 @@ reporting_email= # chain options train_stage=-10 xent_regularize=0.1 -frame_subsampling_factor=4 -alignment_subsampling_factor=1 # training chunk-options chunk_width=340,300,200,100 num_leaves=500 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 tdnn_dim=450 # training options -srand=0 -remove_egs=false -lang_test=lang_unk +lang_decode=lang_unk +decode_val=true +if $decode_val; then maybe_val=val; else maybe_val= ; fi # End configuration section. echo "$0 $@" # Print the command line for logging - . ./cmd.sh . ./path.sh . ./utils/parse_options.sh - if ! cuda-compiled; then cat <368 combine=-0.020->-0.020 (over 2) xent:train/valid[27,41,final]=(-0.534,-0.425,-0.424/-0.659,-0.612,-0.612) logprob:train/valid[27,41,final]=(-0.026,-0.022,-0.022/-0.017,-0.016,-0.016) set -e -o pipefail stage=0 - nj=30 train_set=train decode_val=true @@ -21,29 +33,18 @@ reporting_email= # chain options train_stage=-10 xent_regularize=0.1 -frame_subsampling_factor=4 -alignment_subsampling_factor=1 -# training chunk-options chunk_width=340,300,200,100 num_leaves=500 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 tdnn_dim=450 -# training options -srand=0 -remove_egs=false -lang_test=lang_unk +lang_decode=lang_unk if $decode_val; then maybe_val=val; else maybe_val= ; fi # End configuration section. echo "$0 $@" # Print the command line for logging - . ./cmd.sh . ./path.sh . ./utils/parse_options.sh - if ! 
cuda-compiled; then cat < $dir/configs/network.xconfig input dim=40 name=input - conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 @@ -162,7 +160,6 @@ EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ fi - if [ $stage -le 5 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then utils/create_split_dir.pl \ @@ -177,9 +174,9 @@ if [ $stage -le 5 ]; then --chain.l2-regularize=0.00005 \ --chain.apply-deriv-weights=false \ --chain.lm-opts="--num-extra-lm-states=500" \ - --chain.frame-subsampling-factor=$frame_subsampling_factor \ - --chain.alignment-subsampling-factor=$alignment_subsampling_factor \ - --trainer.srand=$srand \ + --chain.frame-subsampling-factor=4 \ + --chain.alignment-subsampling-factor=1 \ + --trainer.srand=0 \ --trainer.max-param-change=2.0 \ --trainer.num-epochs=4 \ --trainer.frames-per-iter=1000000 \ @@ -189,15 +186,10 @@ if [ $stage -le 5 ]; then --trainer.optimization.final-effective-lrate=0.0001 \ --trainer.optimization.shrink-value=1.0 \ --trainer.num-chunk-per-minibatch=64,32 \ - --trainer.optimization.momentum=0.0 \ --egs.chunk-width=$chunk_width \ - --egs.chunk-left-context=$chunk_left_context \ - --egs.chunk-right-context=$chunk_right_context \ - --egs.chunk-left-context-initial=0 \ - --egs.chunk-right-context-final=0 \ --egs.dir="$common_egs_dir" \ --egs.opts="--frames-overlap-per-eg 0" \ - --cleanup.remove-egs=$remove_egs \ + --cleanup.remove-egs=false \ --use-gpu=true \ --reporting.email="$reporting_email" \ --feat-dir=$train_data_dir \ @@ -213,9 +205,8 @@ if [ $stage -le 6 ]; then # topology file from the model). So you could give it a different # lang directory, one that contained a wordlist and LM of your choice, # as long as phones.txt was compatible. - utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + --self-loop-scale 1.0 data/$lang_decode \ $dir $dir/graph || exit 1; fi @@ -229,5 +220,5 @@ if [ $stage -le 7 ]; then done fi -echo "Done. Date: $(date). Results:" +echo "$0 Done. Date: $(date). Results:" local/chain/compare_wer.sh $dir diff --git a/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1b.sh b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1b.sh index c7446d7283e..f5dbb93e7b7 100755 --- a/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1b.sh +++ b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1b.sh @@ -1,25 +1,23 @@ #!/bin/bash # chainali_1b is as chainali_1a except it has 3 more cnn layers and 1 less tdnn layer. 
- -# local/chain/compare_wer.sh exp/chain/cnn_1a/ exp/chain/cnn_chainali_1b/ -# System cnn_1a cnn_chainali_1b -# WER 18.52 14.38 -# CER 10.07 7.14 -# Final train prob -0.0077 -0.0113 -# Final valid prob -0.0970 -0.0400 -# Final train prob (xent) -0.5484 -0.6043 -# Final valid prob (xent) -0.9643 -0.9030 -# Parameters 4.36M 3.96M +# local/chain/compare_wer.sh exp/chain/cnn_chainali_1b +# System cnn_chainali_1b(dict_50k) cnn_chainali_1b(dict_50k + unk_model) +# WER 15.09 13.29 +# CER 7.13 6.08 +# WER val 14.80 11.98 +# CER val 6.16 4.87 +# Final train prob -0.0225 +# Final valid prob -0.0132 +# Final train prob (xent) -0.4466 +# Final valid prob (xent) -0.6048 +# Parameters 3.96M # steps/info/chain_dir_info.pl exp/chain/chainali_cnn_1b/ -# exp/chain/chainali_cnn_1b/: num-iters=21 nj=2..4 num-params=4.0M dim=40->364 combine=-0.009->-0.005 xent:train/valid[13,20,final]=(-1.47,-0.728,-0.623/-1.69,-1.02,-0.940) logprob:train/valid[13,20,final]=(-0.068,-0.030,-0.011/-0.086,-0.056,-0.038) - +# exp/chain/cnn_chainali_1b: num-iters=42 nj=2..4 num-params=4.0M dim=40->368 combine=-0.019->-0.019 (over 2) xent:train/valid[27,41,final]=(-0.545,-0.448,-0.447/-0.645,-0.605,-0.605) logprob:train/valid[27,41,final]=(-0.026,-0.023,-0.023/-0.014,-0.013,-0.013) set -e -o pipefail - stage=0 - nj=30 train_set=train decode_val=true @@ -32,32 +30,20 @@ chain_model_dir=exp/chain${nnet3_affix}/cnn_1a common_egs_dir= reporting_email= -# chain options train_stage=-10 xent_regularize=0.1 -frame_subsampling_factor=4 -alignment_subsampling_factor=1 -# training chunk-options chunk_width=340,300,200,100 num_leaves=500 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 tdnn_dim=450 -# training options -srand=0 -remove_egs=false -lang_test=lang_unk +lang_decode=lang_unk if $decode_val; then maybe_val=val; else maybe_val= ; fi # End configuration section. echo "$0 $@" # Print the command line for logging - . ./cmd.sh . ./path.sh . ./utils/parse_options.sh - if ! cuda-compiled; then cat < $dir/configs/network.xconfig input dim=40 name=input - conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 @@ -162,7 +145,6 @@ if [ $stage -le 4 ]; then ## adding the layers for chain branch relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 - # adding the layers for xent branch # This block prints the configs for a separate output that will be # trained with a cross-entropy objective in the 'chain' mod?els... 
this @@ -193,9 +175,9 @@ if [ $stage -le 5 ]; then --chain.l2-regularize=0.00005 \ --chain.apply-deriv-weights=false \ --chain.lm-opts="--num-extra-lm-states=500" \ - --chain.frame-subsampling-factor=$frame_subsampling_factor \ - --chain.alignment-subsampling-factor=$alignment_subsampling_factor \ - --trainer.srand=$srand \ + --chain.frame-subsampling-factor=4 \ + --chain.alignment-subsampling-factor=1 \ + --trainer.srand=0 \ --trainer.max-param-change=2.0 \ --trainer.num-epochs=4 \ --trainer.frames-per-iter=1000000 \ @@ -205,15 +187,10 @@ if [ $stage -le 5 ]; then --trainer.optimization.final-effective-lrate=0.0001 \ --trainer.optimization.shrink-value=1.0 \ --trainer.num-chunk-per-minibatch=64,32 \ - --trainer.optimization.momentum=0.0 \ --egs.chunk-width=$chunk_width \ - --egs.chunk-left-context=$chunk_left_context \ - --egs.chunk-right-context=$chunk_right_context \ - --egs.chunk-left-context-initial=0 \ - --egs.chunk-right-context-final=0 \ --egs.dir="$common_egs_dir" \ --egs.opts="--frames-overlap-per-eg 0" \ - --cleanup.remove-egs=$remove_egs \ + --cleanup.remove-egs=false \ --use-gpu=true \ --reporting.email="$reporting_email" \ --feat-dir=$train_data_dir \ @@ -229,9 +206,8 @@ if [ $stage -le 6 ]; then # topology file from the model). So you could give it a different # lang directory, one that contained a wordlist and LM of your choice, # as long as phones.txt was compatible. - utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + --self-loop-scale 1.0 data/$lang_decode \ $dir $dir/graph || exit 1; fi @@ -245,5 +221,5 @@ if [ $stage -le 7 ]; then done fi -echo "Done. Date: $(date). Results:" +echo "$0 Done. Date: $(date). Results:" local/chain/compare_wer.sh $dir diff --git a/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1c.sh b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1c.sh index cff2dd7862f..1dd83c5078f 100755 --- a/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1c.sh +++ b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1c.sh @@ -1,23 +1,22 @@ #!/bin/bash # chainali_1c is as chainali_1b except it uses l2-regularize -# local/chain/compare_wer.sh exp/chain/cnn_chainali_1b exp/chain/cnn_chainali_1c -# System cnn_chainali_1b cnn_chainali_1c -# WER 14.38 12.72 -# CER 7.14 5.99 -# Final train prob -0.0113 -0.0291 -# Final valid prob -0.0400 -0.0359 -# Final train prob (xent) -0.6043 -0.9781 -# Final valid prob (xent) -0.9030 -1.1544 -# Parameters 3.96M 3.96M +# local/chain/compare_wer.sh exp/chain/cnn_chainali_1c +# System cnn_chainali_1c (dict_50k) cnn_chainali_1c(dict_50k + unk_model) +# WER 12.95 11.07 +# CER 6.04 4.91 +# WER val 12.75 9.78 +# CER val 5.15 3.74 +# Final train prob -0.0217 +# Final valid prob -0.0060 +# Final train prob (xent) -0.8303 +# Final valid prob (xent) -0.8665 +# Parameters 3.96M # steps/info/chain_dir_info.pl exp/chain/cnn_chainali_1c -# exp/chain/cnn_chainali_1c: num-iters=21 nj=2..4 num-params=4.0M dim=40->369 combine=-0.007->-0.007 (over 1) xent:train/valid[13,20,final]=(-1.44,-1.05,-0.997/-1.53,-1.19,-1.15) logprob:train/valid[13,20,final]=(-0.056,-0.020,-0.012/-0.056,-0.025,-0.020) - +# exp/chain/cnn_chainali_1c/: num-iters=42 nj=2..4 num-params=4.0M dim=40->368 combine=-0.018->-0.018 (over 1) xent:train/valid[27,41,final]=(-1.22,-0.847,-0.830/-1.19,-0.880,-0.867) logprob:train/valid[27,41,final]=(-0.045,-0.025,-0.022/-0.026,-0.010,-0.006) set -e -o pipefail - stage=0 - nj=30 train_set=train decode_val=true @@ -30,31 +29,20 @@ chain_model_dir=exp/chain${nnet3_affix}/cnn_1a common_egs_dir= reporting_email= -# chain options 
train_stage=-10 xent_regularize=0.1 -frame_subsampling_factor=4 -# training chunk-options chunk_width=340,300,200,100 num_leaves=500 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 tdnn_dim=450 -# training options -srand=0 -remove_egs=false -lang_test=lang_unk +lang_decode=lang_unk if $decode_val; then maybe_val=val; else maybe_val= ; fi # End configuration section. echo "$0 $@" # Print the command line for logging - . ./cmd.sh . ./path.sh . ./utils/parse_options.sh - if ! cuda-compiled; then cat <376 combine=-0.002->-0.002 (over 1) xent:train/valid[13,20,final]=(-1.66,-1.01,-0.865/-1.72,-1.12,-1.01) logprob:train/valid[13,20,final]=(-0.058,-0.019,-0.004/-0.055,-0.027,-0.013) - +# exp/chain/cnn_chainali_1d/: num-iters=42 nj=2..4 num-params=4.0M dim=40->368 combine=-0.018->-0.018 (over 1) xent:train/valid[27,41,final]=(-1.22,-0.847,-0.830/-1.19,-0.880,-0.867) logprob:train/valid[27,41,final]=(-0.045,-0.025,-0.022/-0.026,-0.010,-0.006) set -e -o pipefail stage=0 - nj=30 train_set=train gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it # should have alignments for the specified training data. nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. -affix=_1c_uc #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +affix=_1d #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. ali=tri3_ali -chain_model_dir=exp/chain${nnet3_affix}/cnn_1a_uc +chain_model_dir=exp/chain${nnet3_affix}/cnn_1a common_egs_dir= reporting_email= # chain options train_stage=-10 xent_regularize=0.1 -frame_subsampling_factor=4 # training chunk-options chunk_width=340,300,200,100 num_leaves=500 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 tdnn_dim=450 -# training options -srand=0 -remove_egs=false -lang_test=lang_unk +lang_decode=lang_unk +decode_val=true +if $decode_val; then maybe_val=val; else maybe_val= ; fi + # End configuration section. echo "$0 $@" # Print the command line for logging - . ./cmd.sh . ./path.sh . ./utils/parse_options.sh - if ! 
cuda-compiled; then cat < $dir/configs/network.xconfig input dim=40 name=input - conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 @@ -157,7 +147,6 @@ if [ $stage -le 4 ]; then relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts - ## adding the layers for chain branch relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts @@ -192,11 +181,11 @@ if [ $stage -le 5 ]; then --chain.l2-regularize=0.00005 \ --chain.apply-deriv-weights=false \ --chain.lm-opts="--num-extra-lm-states=500" \ - --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.frame-subsampling-factor=4 \ --chain.alignment-subsampling-factor=1 \ --chain.left-tolerance 3 \ --chain.right-tolerance 3 \ - --trainer.srand=$srand \ + --trainer.srand=0 \ --trainer.max-param-change=2.0 \ --trainer.num-epochs=4 \ --trainer.frames-per-iter=1000000 \ @@ -206,15 +195,10 @@ if [ $stage -le 5 ]; then --trainer.optimization.final-effective-lrate=0.0001 \ --trainer.optimization.shrink-value=1.0 \ --trainer.num-chunk-per-minibatch=64,32 \ - --trainer.optimization.momentum=0.0 \ --egs.chunk-width=$chunk_width \ - --egs.chunk-left-context=$chunk_left_context \ - --egs.chunk-right-context=$chunk_right_context \ - --egs.chunk-left-context-initial=0 \ - --egs.chunk-right-context-final=0 \ --egs.dir="$common_egs_dir" \ --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ - --cleanup.remove-egs=$remove_egs \ + --cleanup.remove-egs=false \ --use-gpu=true \ --reporting.email="$reporting_email" \ --feat-dir=$train_data_dir \ @@ -230,20 +214,20 @@ if [ $stage -le 6 ]; then # topology file from the model). So you could give it a different # lang directory, one that contained a wordlist and LM of your choice, # as long as phones.txt was compatible. - utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + --self-loop-scale 1.0 data/$lang_decode \ $dir $dir/graph || exit 1; fi if [ $stage -le 7 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ - --frames-per-chunk $frames_per_chunk \ - --nj $nj --cmd "$cmd" \ - $dir/graph data/test $dir/decode_test || exit 1; + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + done fi + +echo "$0 Done. Date: $(date). 
Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1a.sh b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1a.sh index ba28f681708..f95f6a90ca1 100755 --- a/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1a.sh +++ b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1a.sh @@ -1,27 +1,26 @@ #!/bin/bash -# e2eali_1a is the same as chainali_1c but uses the e2e chain model to get the -# lattice alignments and to build a tree - -# local/chain/compare_wer.sh exp/chain/e2e_cnn_1a exp/chain/cnn_chainali_1c exp/chain/cnn_e2eali_1a -# System e2e_cnn_1a cnn_chainali_1c cnn_e2eali_1a -# WER 13.87 12.72 12.70 -# CER 6.54 5.99 5.75 -# Final train prob -0.0371 -0.0291 -0.0557 -# Final valid prob -0.0636 -0.0359 -0.0770 -# Final train prob (xent) -0.9781 -0.8847 -# Final valid prob (xent) -1.1544 -1.0370 -# Parameters 9.13M 3.96M 3.95M +# local/chain/compare_wer.sh exp/chain/cnn_e2eali_1a +# System cnn_e2eali_1a_(dict_50k) cnn_e2eali_1a_(dict_50k + unk model) +# WER 13.30 11.94 +# CER 5.95 5.15 +# WER val 12.85 10.71 +# CER val 5.09 4.03 +# Final train prob -0.0562 +# Final valid prob -0.0634 +# Final train prob (xent) -0.8196 +# Final valid prob (xent) -0.8816 +# Parameters 3.96M # steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1a -# exp/chain/cnn_e2eali_1a: num-iters=21 nj=2..4 num-params=4.0M dim=40->360 combine=-0.056->-0.056 (over 1) xent:train/valid[13,20,final]=(-1.47,-0.978,-0.918/-1.54,-1.10,-1.06) logprob:train/valid[13,20,final]=(-0.106,-0.065,-0.056/-0.113,-0.086,-0.079) +# exp/chain/cnn_e2eali_1a: num-iters=42 nj=2..4 num-params=4.0M dim=40->368 combine=-0.058->-0.058 (over 1) xent:train/valid[27,41,final]=(-2.67,-0.841,-0.820/-2.71,-0.892,-0.882) logprob:train/valid[27,41,final]=(-0.240,-0.060,-0.056/-0.245,-0.068,-0.063) set -e -o pipefail stage=0 - nj=30 train_set=train +decode_val=true nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. e2echain_model_dir=exp/chain/e2e_cnn_1a @@ -32,26 +31,19 @@ reporting_email= train_stage=-10 xent_regularize=0.1 frame_subsampling_factor=4 -# training chunk-options chunk_width=340,300,200,100 num_leaves=500 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 tdnn_dim=450 -# training options -srand=0 remove_egs=true -lang_test=lang_unk +lang_decode=lang_unk +if $decode_val; then maybe_val=val; else maybe_val= ; fi # End configuration section. echo "$0 $@" # Print the command line for logging - . ./cmd.sh . ./path.sh . ./utils/parse_options.sh - if ! cuda-compiled; then cat <376 combine=-0.039->-0.039 (over 1) xent:train/valid[27,41,final]=(-1.28,-0.846,-0.808/-1.27,-0.871,-0.847) logprob:train/valid[27,41,final]=(-0.064,-0.043,-0.038/-0.065,-0.051,-0.044) +# exp/chain/cnn_e2eali_1b: num-iters=42 nj=2..4 num-params=4.0M dim=40->368 combine=-0.039->-0.039 (over 2) xent:train/valid[27,41,final]=(-1.19,-0.805,-0.786/-1.19,-0.846,-0.829) logprob:train/valid[27,41,final]=(-0.060,-0.041,-0.038/-0.062,-0.048,-0.044) set -e -o pipefail stage=0 - nj=30 train_set=train +decode_val=true nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. affix=_1b #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. 
e2echain_model_dir=exp/chain/e2e_cnn_1a @@ -28,27 +30,17 @@ reporting_email= # chain options train_stage=-10 xent_regularize=0.1 -frame_subsampling_factor=4 -# training chunk-options chunk_width=340,300,200,100 num_leaves=500 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 tdnn_dim=450 -# training options -srand=0 -remove_egs=true -lang_test=lang_unk +lang_decode=lang_unk +if $decode_val; then maybe_val=val; else maybe_val= ; fi # End configuration section. echo "$0 $@" # Print the command line for logging - . ./cmd.sh . ./path.sh . ./utils/parse_options.sh - - if ! cuda-compiled; then cat <$lat_dir/splice_opts fi @@ -114,20 +105,17 @@ if [ $stage -le 3 ]; then echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." exit 1; fi - steps/nnet3/chain/build_tree.sh \ - --frame-subsampling-factor $frame_subsampling_factor \ + --frame-subsampling-factor 4 \ --alignment-subsampling-factor 1 \ --context-opts "--context-width=2 --central-position=1" \ - --cmd "$cmd" $num_leaves ${train_data_dir} \ + --cmd "$cmd" $num_leaves $train_data_dir \ $lang $ali_dir $tree_dir fi - if [ $stage -le 4 ]; then mkdir -p $dir echo "$0: creating neural net configs using the xconfig parser"; - num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) cnn_opts="l2-regularize=0.075" @@ -167,7 +155,6 @@ EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ fi - if [ $stage -le 5 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then utils/create_split_dir.pl \ @@ -182,11 +169,11 @@ if [ $stage -le 5 ]; then --chain.l2-regularize=0.00005 \ --chain.apply-deriv-weights=false \ --chain.lm-opts="--num-extra-lm-states=500" \ - --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.frame-subsampling-factor=4 \ --chain.alignment-subsampling-factor=1 \ --chain.left-tolerance 3 \ --chain.right-tolerance 3 \ - --trainer.srand=$srand \ + --trainer.srand=0 \ --trainer.max-param-change=2.0 \ --trainer.num-epochs=4 \ --trainer.frames-per-iter=1000000 \ @@ -196,15 +183,10 @@ if [ $stage -le 5 ]; then --trainer.optimization.final-effective-lrate=0.0001 \ --trainer.optimization.shrink-value=1.0 \ --trainer.num-chunk-per-minibatch=64,32 \ - --trainer.optimization.momentum=0.0 \ --egs.chunk-width=$chunk_width \ - --egs.chunk-left-context=$chunk_left_context \ - --egs.chunk-right-context=$chunk_right_context \ - --egs.chunk-left-context-initial=0 \ - --egs.chunk-right-context-final=0 \ --egs.dir="$common_egs_dir" \ --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ - --cleanup.remove-egs=$remove_egs \ + --cleanup.remove-egs=true \ --use-gpu=true \ --reporting.email="$reporting_email" \ --feat-dir=$train_data_dir \ @@ -220,20 +202,20 @@ if [ $stage -le 6 ]; then # topology file from the model). So you could give it a different # lang directory, one that contained a wordlist and LM of your choice, # as long as phones.txt was compatible. 
- utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + --self-loop-scale 1.0 data/$lang_decode \ $dir $dir/graph || exit 1; fi if [ $stage -le 7 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ - --frames-per-chunk $frames_per_chunk \ - --nj $nj --cmd "$cmd" \ - $dir/graph data/test $dir/decode_test || exit 1; + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + done fi + +echo "$0 Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1c.sh b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1c.sh index cf90193f7eb..047d673db17 100755 --- a/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1c.sh +++ b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1c.sh @@ -2,60 +2,47 @@ # e2eali_1c is the same as e2eali_1b but has more CNN layers, different filter size # smaller lm-opts, minibatch, frams-per-iter, less epochs and more initial/finaljobs. - # local/chain/compare_wer.sh exp/chain/cnn_e2eali_1c -# System cnn_e2eali_1c (dict_50k) cnn_e2eali_1c (dict_500k) -# WER 12.20 9.62 -# CER 5.29 4.33 -# Final train prob -0.0494 -0.0494 -# Final valid prob -0.0644 -0.0644 -# Final train prob (xent) -0.4852 -0.4852 -# Final valid prob (xent) -0.5437 -0.5437 -# Parameters 4.33M 4.33M +# System cnn_e2eali_1c (dict_50k) cnn_e2eali_1c(dict_50k + unk_model) +# WER 12.10 9.90 +# CER 5.23 4.16 +# WER val 12.15 9.60 +# CER val 4.78 3.56 +# Final train prob -0.0470 +# Final valid prob -0.0657 +# Final train prob (xent) -0.4713 +# Final valid prob (xent) -0.5437 +# Parameters 4.32M # steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1c -# exp/chain/cnn_e2eali_1c: num-iters=30 nj=3..5 num-params=4.3M dim=40->376 combine=-0.052->-0.052 (over 1) xent:train/valid[19,29,final]=(-0.715,-0.508,-0.485/-0.717,-0.562,-0.544) logprob:train/valid[19,29,final]=(-0.089,-0.054,-0.049/-0.100,-0.070,-0.064) - - +# exp/chain/cnn_e2eali_1c: num-iters=30 nj=3..5 num-params=4.3M dim=40->368 combine=-0.051->-0.051 (over 1) xent:train/valid[19,29,final]=(-0.722,-0.500,-0.471/-0.748,-0.568,-0.544) logprob:train/valid[19,29,final]=(-0.090,-0.053,-0.047/-0.106,-0.071,-0.066) set -e -o pipefail stage=0 - nj=30 train_set=train decode_val=true nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. affix=_1c #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. -e2echain_model_dir=exp/chain/e2e_cnn_1b +e2echain_model_dir=exp/chain/e2e_cnn_1a common_egs_dir= reporting_email= # chain options train_stage=-10 xent_regularize=0.1 -frame_subsampling_factor=4 -# training chunk-options chunk_width=340,300,200,100 num_leaves=500 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 tdnn_dim=550 -# training options -srand=0 -remove_egs=true -lang_decode=data/lang_test +lang_decode=data/lang_unk if $decode_val; then maybe_val=val; else maybe_val= ; fi dropout_schedule='0,0@0.20,0.2@0.50,0' # End configuration section. echo "$0 $@" # Print the command line for logging - . ./cmd.sh . ./path.sh . ./utils/parse_options.sh - - if ! 
cuda-compiled; then cat <$lat_dir/splice_opts fi @@ -121,20 +107,17 @@ if [ $stage -le 3 ]; then echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." exit 1; fi - steps/nnet3/chain/build_tree.sh \ - --frame-subsampling-factor $frame_subsampling_factor \ + --frame-subsampling-factor 4 \ --alignment-subsampling-factor 1 \ --context-opts "--context-width=2 --central-position=1" \ - --cmd "$cmd" $num_leaves ${train_data_dir} \ + --cmd "$cmd" $num_leaves $train_data_dir \ $lang $ali_dir $tree_dir fi - if [ $stage -le 4 ]; then mkdir -p $dir echo "$0: creating neural net configs using the xconfig parser"; - num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) cnn_opts="l2-regularize=0.03 dropout-proportion=0.0" @@ -146,7 +129,6 @@ if [ $stage -le 4 ]; then mkdir -p $dir/configs cat < $dir/configs/network.xconfig input dim=40 name=input - conv-relu-batchnorm-dropout-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 conv-relu-batchnorm-dropout-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 conv-relu-batchnorm-dropout-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 @@ -160,7 +142,6 @@ if [ $stage -le 4 ]; then ## adding the layers for chain branch relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts - # adding the layers for xent branch # This block prints the configs for a separate output that will be # trained with a cross-entropy objective in the 'chain' mod?els... this @@ -176,7 +157,6 @@ EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ fi - if [ $stage -le 5 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then utils/create_split_dir.pl \ @@ -191,11 +171,11 @@ if [ $stage -le 5 ]; then --chain.l2-regularize=0.00005 \ --chain.apply-deriv-weights=true \ --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ - --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.frame-subsampling-factor=4 \ --chain.alignment-subsampling-factor=1 \ --chain.left-tolerance 3 \ --chain.right-tolerance 3 \ - --trainer.srand=$srand \ + --trainer.srand=0 \ --trainer.max-param-change=2.0 \ --trainer.num-epochs=5 \ --trainer.frames-per-iter=1500000 \ @@ -206,15 +186,10 @@ if [ $stage -le 5 ]; then --trainer.optimization.final-effective-lrate=0.0001 \ --trainer.optimization.shrink-value=1.0 \ --trainer.num-chunk-per-minibatch=32,16 \ - --trainer.optimization.momentum=0.0 \ --egs.chunk-width=$chunk_width \ - --egs.chunk-left-context=$chunk_left_context \ - --egs.chunk-right-context=$chunk_right_context \ - --egs.chunk-left-context-initial=0 \ - --egs.chunk-right-context-final=0 \ --egs.dir="$common_egs_dir" \ --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ - --cleanup.remove-egs=$remove_egs \ + --cleanup.remove-egs=true \ --use-gpu=true \ --reporting.email="$reporting_email" \ --feat-dir=$train_data_dir \ @@ -230,7 +205,6 @@ if [ $stage -le 6 ]; then # topology file from the model). So you could give it a different # lang directory, one that contained a wordlist and LM of your choice, # as long as phones.txt was compatible. 
- utils/mkgraph.sh \ --self-loop-scale 1.0 $lang_decode \ $dir $dir/graph || exit 1; @@ -246,6 +220,5 @@ if [ $stage -le 7 ]; then done fi - -echo "Done. Date: $(date). Results:" +echo "$0 Done. Date: $(date). Results:" local/chain/compare_wer.sh $dir diff --git a/egs/iam/v1/local/chain/tuning/run_e2e_cnn_1a.sh b/egs/iam/v1/local/chain/tuning/run_e2e_cnn_1a.sh index cc27c8e55c4..462ad0522de 100755 --- a/egs/iam/v1/local/chain/tuning/run_e2e_cnn_1a.sh +++ b/egs/iam/v1/local/chain/tuning/run_e2e_cnn_1a.sh @@ -2,22 +2,22 @@ # Copyright 2017 Hossein Hadian # This script does end2end chain training (i.e. from scratch) -# local/chain/compare_wer.sh exp/chain/e2e_cnn_1a -# System e2e_cnn_1a -# WER 15.24 -# CER 7.27 -# Final train prob -0.0209 -# Final valid prob -0.0417 +# local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/ +# System e2e_cnn_1a (dict_50k) e2e_cnn_1a (dict_50k + unk_model) +# WER 15.21 14.41 +# CER 7.43 6.82 +# WER val 14.84 13.51 +# CER val 6.41 5.60 +# Final train prob -0.0206 +# Final valid prob -0.0393 # Final train prob (xent) # Final valid prob (xent) # Parameters 9.52M # steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a -# exp/chain/e2e_cnn_1a: num-iters=42 nj=2..4 num-params=9.5M dim=40->12640 combine=-0.021->-0.021 (over 1) logprob:train/valid[27,41,final]=(-0.025,-0.021,-0.021/-0.044,-0.043,-0.042) +# exp/chain/e2e_cnn_1a: num-iters=42 nj=2..4 num-params=9.5M dim=40->12640 combine=-0.020->-0.020 (over 1) logprob:train/valid[27,41,final]=(-0.025,-0.021,-0.021/-0.044,-0.040,-0.039) set -e - -# configs for 'chain' stage=0 train_stage=-10 get_egs_stage=-10 @@ -30,7 +30,7 @@ minibatch_size=150=100,64/300=50,32/600=25,16/1200=16,8 common_egs_dir= train_set=train decode_val=true -lang_decode=data/lang_test +lang_decode=data/lang_unk if $decode_val; then maybe_val=val; else maybe_val= ; fi # End configuration section. echo "$0 $@" # Print the command line for logging @@ -85,7 +85,6 @@ if [ $stage -le 2 ]; then mkdir -p $dir/configs cat < $dir/configs/network.xconfig input dim=40 name=input - conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 @@ -98,14 +97,12 @@ if [ $stage -le 2 ]; then relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $output_opts output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts EOF - steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs fi if [ $stage -le 3 ]; then # no need to store the egs in a shared storage because we always # remove them. Anyway, it takes only 5 minutes to generate them. - steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ --cmd "$cmd" \ --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ @@ -128,7 +125,7 @@ if [ $stage -le 3 ]; then --trainer.optimization.shrink-value 1.0 \ --trainer.max-param-change 2.0 \ --cleanup.remove-egs true \ - --feat-dir data/${train_set} \ + --feat-dir data/$train_set \ --tree-dir $treedir \ --dir $dir || exit 1; fi @@ -140,7 +137,6 @@ if [ $stage -le 4 ]; then # topology file from the model). So you could give it a different # lang directory, one that contained a wordlist and LM of your choice, # as long as phones.txt was compatible. 
- utils/mkgraph.sh \ --self-loop-scale 1.0 $lang_decode \ $dir $dir/graph || exit 1; @@ -154,5 +150,5 @@ if [ $stage -le 5 ]; then done fi -echo "Done. Date: $(date). Results:" +echo "$0 Done. Date: $(date). Results:" local/chain/compare_wer.sh $dir diff --git a/egs/iam/v1/local/gen_topo.py b/egs/iam/v1/local/gen_topo.py index 540bfbcf270..6fae276d542 100755 --- a/egs/iam/v1/local/gen_topo.py +++ b/egs/iam/v1/local/gen_topo.py @@ -36,7 +36,7 @@ with open(args.phone_list) as f: for line in f: line = line.strip() - phone = line.split(' ')[0] + phone = line.split('_')[0] if len(phone) == 1 and phone in exclude: punctuation_phones.append(int(line.split(' ')[1])) # For nonsilence phones that are not punctuations diff --git a/egs/iam/v1/local/train_lm.sh b/egs/iam/v1/local/train_lm.sh index a15fbea2af3..911f54c5439 100755 --- a/egs/iam/v1/local/train_lm.sh +++ b/egs/iam/v1/local/train_lm.sh @@ -58,9 +58,12 @@ if [ $stage -le 0 ]; then rm ${dir}/data/text/* 2>/dev/null || true # Using LOB and brown corpus. - cat data/local/lobcorpus/0167/download/LOB_COCOA/lob.txt | \ - local/remove_test_utterances_from_lob.py data/test/text data/val/text \ - > ${dir}/data/text/lob.txt + if [ ! -f data/local/lob-train-only.txt ]; then + cat data/local/lobcorpus/0167/download/LOB_COCOA/lob.txt | \ + local/remove_test_utterances_from_lob.py data/test/text.old data/val/text.old \ + > data/local/lob-train-only.txt + fi + cat data/local/lob-train-only.txt > ${dir}/data/text/lob.txt cat data/local/browncorpus/brown.txt > ${dir}/data/text/brown.txt if [ -d "data/local/wellingtoncorpus" ]; then cat data/local/wellingtoncorpus/Wellington_annotation_removed.txt > ${dir}/data/text/wellington.txt diff --git a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_1a.sh b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_1a.sh index eb140e900e1..d449805be1d 100755 --- a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_1a.sh +++ b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_1a.sh @@ -24,22 +24,17 @@ xent_regularize=0.1 # training chunk-options chunk_width=340,300,200,100 num_leaves=500 -# we don't need extra left/right context for TDNN systems. tdnn_dim=450 -# training options -srand=0 remove_egs=false lang_decode=data/lang lang_rescore=data/lang_rescore_6g # End configuration section. echo "$0 $@" # Print the command line for logging - . ./cmd.sh . ./path.sh . ./utils/parse_options.sh - if ! cuda-compiled; then cat <