From 817ce77d40c976bfa92cc8af962b75d954c0396c Mon Sep 17 00:00:00 2001 From: Ondrej Platek Date: Tue, 18 Jun 2013 10:56:26 +0000 Subject: [PATCH] Merge branch 'master' into sandbox-oplatek Conflicts: .gitignore INSTALL README.txt egs/babel/s5/local/generate_proxy_keywords.sh egs/wsj/s5/steps/train_nnet_cpu.sh egs/wsj/s5/utils/nnet-cpu/make_nnet_config_preconditioned.pl src/Makefile src/configure src/lat/Makefile src/makefiles/cygwin.mk src/makefiles/darwin_10_5.mk src/makefiles/darwin_10_6.mk src/makefiles/darwin_10_7.mk src/makefiles/darwin_10_8.mk src/makefiles/linux_atlas.mk src/makefiles/linux_atlas_64bit.mk src/makefiles/linux_clapack.mk src/makefiles/linux_openblas.mk src/nnet-cpu/mixup-nnet.cc src/nnet-cpu/nnet-component-test.cc src/nnet-cpu/nnet-component.cc src/nnet-cpu/nnet-component.h src/nnet-cpu/nnet-nnet.cc src/nnet-cpu/nnet-nnet.h src/nnet-cpu/nnet-update-parallel.cc src/nnet-cpu/nnet-update-parallel.h src/nnet-cpubin/nnet-train-parallel.cc src/nnet/nnet-pdf-prior.h src/nnetbin/nnet-forward.cc tools/Makefile tools/extras/install_portaudio.sh git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/oplatek@2520 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8 --- INSTALL | 2 +- INSTALL.md | 168 ++++++ README.md | 25 + README.txt | 3 +- egs/babel/s5/local/annotatedKwlist2KWs.pl | 124 +++++ egs/babel/s5/local/buildEditDistanceFst.pl | 127 +++++ egs/babel/s5/local/count2logprob.pl | 94 ++++ egs/babel/s5/local/subsetATWV.pl | 120 +++++ egs/kaldi-vystadial-recipe/.gitingore | 2 + egs/kaldi-vystadial-recipe/README.md | 54 ++ egs/kaldi-vystadial-recipe/s5/.gitignore | 7 + egs/kaldi-vystadial-recipe/s5/cmd.sh | 13 + .../s5/conf/decode.config | 3 + egs/kaldi-vystadial-recipe/s5/conf/mfcc.conf | 7 + .../s5/conf/train_conf.sh | 20 + .../s5/decode/decode-lattice.sh | 112 ++++ .../s5/decode/decode-online.sh | 97 ++++ .../s5/local/make_trans.py | 50 ++ .../s5/local/results.py | 164 ++++++ .../s5/local/save_check_conf.sh | 55 ++ egs/kaldi-vystadial-recipe/s5/local/score.sh | 53 ++ .../s5/local/vystadial_data_prep.sh | 83 +++ .../s5/local/vystadial_format_data.sh | 76 +++ .../s5/local/vystadial_prepare_dict.sh | 91 ++++ egs/kaldi-vystadial-recipe/s5/logs/README | 1 + egs/kaldi-vystadial-recipe/s5/path.sh | 16 + egs/kaldi-vystadial-recipe/s5/run.sh | 201 +++++++ .../s5/steps/align_fmllr.sh | 147 +++++ .../s5/steps/align_sgmm.sh | 193 +++++++ .../s5/steps/align_sgmm2.sh | 193 +++++++ .../s5/steps/align_si.sh | 89 ++++ .../s5/steps/compute_cmvn_stats.sh | 65 +++ egs/kaldi-vystadial-recipe/s5/steps/decode.sh | 97 ++++ .../s5/steps/decode_basis_fmllr.sh | 206 +++++++ .../s5/steps/decode_biglm.sh | 84 +++ .../s5/steps/decode_combine.sh | 59 +++ .../s5/steps/decode_fmllr.sh | 198 +++++++ .../s5/steps/decode_fmmi.sh | 95 ++++ .../s5/steps/decode_fromlats.sh | 90 ++++ .../s5/steps/decode_nnet.sh | 125 +++++ .../s5/steps/decode_sgmm.sh | 254 +++++++++ .../s5/steps/decode_sgmm2.sh | 190 +++++++ .../s5/steps/decode_sgmm2_rescore.sh | 107 ++++ .../s5/steps/decode_sgmm2_rescore_project.sh | 172 ++++++ .../s5/steps/decode_sgmm_rescore.sh | 107 ++++ .../s5/steps/decode_si.sh | 97 ++++ .../s5/steps/get_fmllr_basis.sh | 95 ++++ .../s5/steps/lmrescore.sh | 117 ++++ .../s5/steps/make_bn_feats.sh | 141 +++++ .../s5/steps/make_denlats.sh | 139 +++++ .../s5/steps/make_denlats_sgmm.sh | 157 ++++++ .../s5/steps/make_denlats_sgmm2.sh | 157 ++++++ .../s5/steps/make_fbank.sh | 111 ++++ .../s5/steps/make_mfcc.sh | 111 ++++ .../s5/steps/make_plp.sh | 111 ++++ egs/kaldi-vystadial-recipe/s5/steps/mixup.sh | 146 +++++ 
.../s5/steps/rnnlmrescore.sh | 176 ++++++ .../s5/steps/train_deltas.sh | 142 +++++ .../s5/steps/train_diag_ubm.sh | 125 +++++ .../s5/steps/train_lda_mllt.sh | 191 +++++++ .../s5/steps/train_mmi.sh | 144 +++++ .../s5/steps/train_mmi_fmmi.sh | 221 ++++++++ .../s5/steps/train_mmi_fmmi_indirect.sh | 244 +++++++++ .../s5/steps/train_mmi_sgmm.sh | 153 ++++++ .../s5/steps/train_mmi_sgmm2.sh | 152 ++++++ .../s5/steps/train_mono.sh | 135 +++++ .../s5/steps/train_mpe.sh | 158 ++++++ .../s5/steps/train_nnet.sh | 284 ++++++++++ .../s5/steps/train_quick.sh | 191 +++++++ .../s5/steps/train_sat.sh | 238 +++++++++ .../s5/steps/train_sgmm.sh | 273 ++++++++++ .../s5/steps/train_sgmm2.sh | 292 ++++++++++ .../s5/steps/train_ubm.sh | 128 +++++ .../s5/steps/word_align_lattices.sh | 48 ++ .../s5/utils/add_disambig.pl | 58 ++ .../s5/utils/add_lex_disambig.pl | 101 ++++ .../s5/utils/apply_map.pl | 54 ++ .../s5/utils/best_wer.sh | 25 + .../s5/utils/combine_data.sh | 32 ++ .../s5/utils/convert_ctm.pl | 83 +++ .../s5/utils/eps2disambig.pl | 23 + .../s5/utils/filter_scp.pl | 41 ++ .../s5/utils/find_arpa_oovs.pl | 64 +++ .../s5/utils/fix_data_dir.sh | 80 +++ .../s5/utils/format_lm.sh | 84 +++ .../s5/utils/format_lm_sri.sh | 110 ++++ .../s5/utils/gen_topo.pl | 63 +++ .../s5/utils/int2sym.pl | 71 +++ egs/kaldi-vystadial-recipe/s5/utils/ln.pl | 58 ++ .../s5/utils/make_lexicon_fst.pl | 122 +++++ .../s5/utils/make_unigram_grammar.pl | 54 ++ .../s5/utils/mkgraph.sh | 122 +++++ .../s5/utils/nnet/analyze_alignments.sh | 71 +++ .../s5/utils/nnet/gen_dct_mat.py | 53 ++ .../s5/utils/nnet/gen_hamm_mat.py | 45 ++ .../s5/utils/nnet/gen_mlp_init.py | 83 +++ .../s5/utils/nnet/train_nnet_scheduler.sh | 119 +++++ .../s5/utils/parse_options.sh | 84 +++ .../s5/utils/prepare_lang.sh | 275 ++++++++++ egs/kaldi-vystadial-recipe/s5/utils/queue.pl | 263 +++++++++ .../s5/utils/remove_oovs.pl | 43 ++ .../s5/utils/rnnlm_compute_scores.sh | 69 +++ egs/kaldi-vystadial-recipe/s5/utils/run.pl | 123 +++++ egs/kaldi-vystadial-recipe/s5/utils/s2eps.pl | 27 + .../s5/utils/shuffle_list.pl | 31 ++ .../s5/utils/spk2utt_to_utt2spk.pl | 27 + .../s5/utils/split_data.sh | 97 ++++ .../s5/utils/split_scp.pl | 221 ++++++++ .../s5/utils/subset_data_dir.sh | 119 +++++ .../s5/utils/subset_scp.pl | 84 +++ .../s5/utils/summarize_warnings.pl | 46 ++ .../s5/utils/sym2int.pl | 99 ++++ .../s5/utils/utt2spk_to_spk2utt.pl | 39 ++ .../s5/utils/validate_dict_dir.pl | 142 +++++ .../s5/utils/validate_lang.pl | 501 ++++++++++++++++++ egs/voxforge/online_demo/.gitignore | 9 + src/.gitignore | 115 ++++ src/Makefile | 5 + src/configure | 19 +- src/makefiles/cygwin.mk | 1 + src/makefiles/darwin_10_5.mk | 1 + src/makefiles/darwin_10_6.mk | 1 + src/makefiles/darwin_10_7.mk | 1 + src/makefiles/darwin_10_8.mk | 1 + src/makefiles/linux_atlas.mk | 1 + src/makefiles/linux_atlas_64bit.mk | 1 + src/makefiles/linux_clapack.mk | 1 + src/makefiles/linux_openblas.mk | 4 +- src/python-kaldi-decoding/.gitignore | 17 + src/python-kaldi-decoding/Makefile | 82 +++ src/python-kaldi-decoding/README.md | 65 +++ .../compute-mfcc-feats-test.c | 5 + .../compute-mfcc-feats.cc | 185 +++++++ .../compute-mfcc-feats.h | 16 + src/python-kaldi-decoding/compute-wer-test.c | 5 + src/python-kaldi-decoding/compute-wer.cc | 144 +++++ src/python-kaldi-decoding/compute-wer.h | 16 + .../gmm-latgen-faster-test.c | 5 + .../gmm-latgen-faster.cc | 196 +++++++ src/python-kaldi-decoding/gmm-latgen-faster.h | 16 + .../lattice-best-path-test.c | 5 + .../lattice-best-path.cc | 136 +++++ 
src/python-kaldi-decoding/lattice-best-path.h | 16 + .../little_wavs_data_void_en.scp | 4 + .../online-wav-gmm-decode-faster-test.c | 5 + .../online-wav-gmm-decode-faster.cc | 247 +++++++++ .../online-wav-gmm-decode-faster.h | 17 + .../ordereddefaultdict.py | 46 ++ src/python-kaldi-decoding/run.py | 291 ++++++++++ .../test_cffi_python_dyn.h | 39 ++ src/vystadial-decoder/.ycm_extra_conf.py | 145 +++++ src/vystadial-decoder/README.md | 86 +++ tools/.gitignore | 7 + tools/Makefile | 2 +- tools/extras/install_portaudio.sh | 2 +- 155 files changed, 14774 insertions(+), 8 deletions(-) create mode 100644 INSTALL.md create mode 100644 README.md create mode 100755 egs/babel/s5/local/annotatedKwlist2KWs.pl create mode 100755 egs/babel/s5/local/buildEditDistanceFst.pl create mode 100755 egs/babel/s5/local/count2logprob.pl create mode 100755 egs/babel/s5/local/subsetATWV.pl create mode 100644 egs/kaldi-vystadial-recipe/.gitingore create mode 100644 egs/kaldi-vystadial-recipe/README.md create mode 100644 egs/kaldi-vystadial-recipe/s5/.gitignore create mode 100644 egs/kaldi-vystadial-recipe/s5/cmd.sh create mode 100644 egs/kaldi-vystadial-recipe/s5/conf/decode.config create mode 100644 egs/kaldi-vystadial-recipe/s5/conf/mfcc.conf create mode 100755 egs/kaldi-vystadial-recipe/s5/conf/train_conf.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/decode/decode-lattice.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/decode/decode-online.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/local/make_trans.py create mode 100755 egs/kaldi-vystadial-recipe/s5/local/results.py create mode 100755 egs/kaldi-vystadial-recipe/s5/local/save_check_conf.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/local/score.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/local/vystadial_data_prep.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/local/vystadial_format_data.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/local/vystadial_prepare_dict.sh create mode 100644 egs/kaldi-vystadial-recipe/s5/logs/README create mode 100755 egs/kaldi-vystadial-recipe/s5/path.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/run.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/steps/align_fmllr.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/steps/align_sgmm.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/steps/align_sgmm2.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/steps/align_si.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/steps/compute_cmvn_stats.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/steps/decode.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/steps/decode_basis_fmllr.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/steps/decode_biglm.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/steps/decode_combine.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/steps/decode_fmllr.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/steps/decode_fmmi.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/steps/decode_fromlats.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/steps/decode_nnet.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/steps/decode_sgmm.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/steps/decode_sgmm2.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/steps/decode_sgmm2_rescore.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/steps/decode_sgmm2_rescore_project.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/steps/decode_sgmm_rescore.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/steps/decode_si.sh create mode 100755 
egs/kaldi-vystadial-recipe/s5/steps/get_fmllr_basis.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/steps/lmrescore.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/steps/make_bn_feats.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/steps/make_denlats.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/steps/make_denlats_sgmm.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/steps/make_denlats_sgmm2.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/steps/make_fbank.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/steps/make_mfcc.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/steps/make_plp.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/steps/mixup.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/steps/rnnlmrescore.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/steps/train_deltas.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/steps/train_diag_ubm.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/steps/train_lda_mllt.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/steps/train_mmi.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/steps/train_mmi_fmmi.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/steps/train_mmi_fmmi_indirect.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/steps/train_mmi_sgmm.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/steps/train_mmi_sgmm2.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/steps/train_mono.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/steps/train_mpe.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/steps/train_nnet.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/steps/train_quick.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/steps/train_sat.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/steps/train_sgmm.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/steps/train_sgmm2.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/steps/train_ubm.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/steps/word_align_lattices.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/utils/add_disambig.pl create mode 100755 egs/kaldi-vystadial-recipe/s5/utils/add_lex_disambig.pl create mode 100755 egs/kaldi-vystadial-recipe/s5/utils/apply_map.pl create mode 100755 egs/kaldi-vystadial-recipe/s5/utils/best_wer.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/utils/combine_data.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/utils/convert_ctm.pl create mode 100755 egs/kaldi-vystadial-recipe/s5/utils/eps2disambig.pl create mode 100755 egs/kaldi-vystadial-recipe/s5/utils/filter_scp.pl create mode 100755 egs/kaldi-vystadial-recipe/s5/utils/find_arpa_oovs.pl create mode 100755 egs/kaldi-vystadial-recipe/s5/utils/fix_data_dir.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/utils/format_lm.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/utils/format_lm_sri.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/utils/gen_topo.pl create mode 100755 egs/kaldi-vystadial-recipe/s5/utils/int2sym.pl create mode 100755 egs/kaldi-vystadial-recipe/s5/utils/ln.pl create mode 100755 egs/kaldi-vystadial-recipe/s5/utils/make_lexicon_fst.pl create mode 100755 egs/kaldi-vystadial-recipe/s5/utils/make_unigram_grammar.pl create mode 100755 egs/kaldi-vystadial-recipe/s5/utils/mkgraph.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/utils/nnet/analyze_alignments.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/utils/nnet/gen_dct_mat.py create mode 100755 egs/kaldi-vystadial-recipe/s5/utils/nnet/gen_hamm_mat.py create mode 100755 egs/kaldi-vystadial-recipe/s5/utils/nnet/gen_mlp_init.py create mode 
100755 egs/kaldi-vystadial-recipe/s5/utils/nnet/train_nnet_scheduler.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/utils/parse_options.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/utils/prepare_lang.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/utils/queue.pl create mode 100755 egs/kaldi-vystadial-recipe/s5/utils/remove_oovs.pl create mode 100755 egs/kaldi-vystadial-recipe/s5/utils/rnnlm_compute_scores.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/utils/run.pl create mode 100755 egs/kaldi-vystadial-recipe/s5/utils/s2eps.pl create mode 100755 egs/kaldi-vystadial-recipe/s5/utils/shuffle_list.pl create mode 100755 egs/kaldi-vystadial-recipe/s5/utils/spk2utt_to_utt2spk.pl create mode 100755 egs/kaldi-vystadial-recipe/s5/utils/split_data.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/utils/split_scp.pl create mode 100755 egs/kaldi-vystadial-recipe/s5/utils/subset_data_dir.sh create mode 100755 egs/kaldi-vystadial-recipe/s5/utils/subset_scp.pl create mode 100755 egs/kaldi-vystadial-recipe/s5/utils/summarize_warnings.pl create mode 100755 egs/kaldi-vystadial-recipe/s5/utils/sym2int.pl create mode 100755 egs/kaldi-vystadial-recipe/s5/utils/utt2spk_to_spk2utt.pl create mode 100755 egs/kaldi-vystadial-recipe/s5/utils/validate_dict_dir.pl create mode 100755 egs/kaldi-vystadial-recipe/s5/utils/validate_lang.pl create mode 100644 egs/voxforge/online_demo/.gitignore create mode 100644 src/.gitignore create mode 100644 src/python-kaldi-decoding/.gitignore create mode 100644 src/python-kaldi-decoding/Makefile create mode 100644 src/python-kaldi-decoding/README.md create mode 100644 src/python-kaldi-decoding/compute-mfcc-feats-test.c create mode 100644 src/python-kaldi-decoding/compute-mfcc-feats.cc create mode 100644 src/python-kaldi-decoding/compute-mfcc-feats.h create mode 100644 src/python-kaldi-decoding/compute-wer-test.c create mode 100644 src/python-kaldi-decoding/compute-wer.cc create mode 100644 src/python-kaldi-decoding/compute-wer.h create mode 100644 src/python-kaldi-decoding/gmm-latgen-faster-test.c create mode 100644 src/python-kaldi-decoding/gmm-latgen-faster.cc create mode 100644 src/python-kaldi-decoding/gmm-latgen-faster.h create mode 100644 src/python-kaldi-decoding/lattice-best-path-test.c create mode 100644 src/python-kaldi-decoding/lattice-best-path.cc create mode 100644 src/python-kaldi-decoding/lattice-best-path.h create mode 100644 src/python-kaldi-decoding/little_wavs_data_void_en.scp create mode 100644 src/python-kaldi-decoding/online-wav-gmm-decode-faster-test.c create mode 100644 src/python-kaldi-decoding/online-wav-gmm-decode-faster.cc create mode 100644 src/python-kaldi-decoding/online-wav-gmm-decode-faster.h create mode 100644 src/python-kaldi-decoding/ordereddefaultdict.py create mode 100755 src/python-kaldi-decoding/run.py create mode 100644 src/python-kaldi-decoding/test_cffi_python_dyn.h create mode 100644 src/vystadial-decoder/.ycm_extra_conf.py create mode 100644 src/vystadial-decoder/README.md create mode 100644 tools/.gitignore diff --git a/INSTALL b/INSTALL index faa8f61829a..2dbf318118c 100644 --- a/INSTALL +++ b/INSTALL @@ -1,4 +1,4 @@ - +This is the official Kaldi INSTALL. Look also at INSTALL.md for the git mirror installation. 
 [for native Windows install, see windows/INSTALL]
 (1)
diff --git a/INSTALL.md b/INSTALL.md
new file mode 100644
index 00000000000..8a7047558d2
--- /dev/null
+++ b/INSTALL.md
@@ -0,0 +1,168 @@
+Installation TIPS for KALDI and installation INSTRUCTIONS for my additional repositories
+=================================================================================
+Intro
+-----
+Kaldi has very good instructions and a tutorial
+for building it from source. It is easy and straightforward.
+However, I also needed to build shared libraries,
+and maybe you will face some of my problems too.
+That is the reason for writing my build procedure down.
+
+Installing external dependencies
+================================
+See `kaldi-trunk/tools/INSTALL` for info.
+Basically it tells you to use `kaldi-trunk/tools/Makefile`, which I also used.
+
+How have I installed OpenBlas?
+----------------------
+Simple enough:
+```bash
+make openblas
+```
+
+How have I installed Openfst?
+----------------------
+In order to install the shared libraries as well,
+I changed line 37 in
+`kaldi-trunk/tools/Makefile`:
+
+```sh
+*** Makefile
+************
+*** 34,38 ****
+
+openfst-1.3.2/Makefile: openfst-1.3.2/.patched
+	cd openfst-1.3.2/; \
+!	./configure --prefix=`pwd` --enable-static --disable-shared --enable-far --enable-ngram-fsts
+
+--- 34,38 ----
+
+openfst-1.3.2/Makefile: openfst-1.3.2/.patched
+	cd openfst-1.3.2/; \
+!	./configure --prefix=`pwd` --enable-static --enable-shared --enable-far --enable-ngram-fsts
+
+```
+Then I ran
+```bash
+make openfst_tgt
+```
+
+How have I installed PortAudio?
+--------------------------
+NOTE: Necessary only for the Kaldi online decoder.
+
+In kaldi-trunk/tools/extras/install_portaudio.sh
+I changed the line
+```
+./configure --prefix=`pwd`/install
+```
+to
+```
+./configure --prefix=`pwd`/install --with-pic
+```
+
+Then I ran
+```bash
+extras/install_portaudio.sh
+```
+
+
+How have I built Kaldi?
+------------------
+```bash
+./configure --openblas-root=`pwd`/../tools/OpenBLAS/install --fst-root=`pwd`/../tools/openfst --static-math=no
+```
+
+Edit `kaldi.mk` and add the `-fPIC` flag.
+TODO It would be nice to do something like
+```bash
+EXTRA_CXXFLAGS=-fPIC make
+EXTRA_CXXFLAGS=-fPIC make ext
+```
+But the local makefiles override `EXTRA_CXXFLAGS`.
+
+If you updated from the svn repository, do not forget to run `make depend`,
+since by *default it is turned off! I always forget about that!*
+```
+# DO NOT FORGET TO CHANGE kaldi.mk TODO SCRIPT IT!
+# make depend and make ext_depend are necessary only if dependencies changed
+make depend && make ext_depend && make && make ext
+```
+
+How have I updated Kaldi src code?
+----------------------------
+I check out the kaldi-trunk version.
+
+[Kaldi install instructions](http://kaldi.sourceforge.net/install.html)
+
+Note: If you checked out Kaldi before March 2013, you need to relocate the svn. See the instructions in the link above!
+
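+
+Before moving on, it is worth checking that the build really produced
+dynamically linked binaries. A minimal sketch of the checks I would run
+(the binary name is only an example; the paths follow the layout above):
+```bash
+cd kaldi-trunk/src
+# A dynamically linked binary should resolve libfst.so from the openfst tree:
+ldd gmmbin/gmm-latgen-faster | grep -i fst
+# The shared OpenFst libraries should exist under the prefix given to ./configure:
+ls ../tools/openfst/lib/libfst.so*
+```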
+
+What setup did I use?
+--------------------
+In order to use Kaldi binaries everywhere, I add them to `PATH`.
+In addition, since I compiled Kaldi dynamically linked against `openfst`, I needed to add the `openfst` library directory to `LD_LIBRARY_PATH`. To conclude, I added the following lines to my `.bashrc`.
+```bash
+############# Kaldi ###########
+kaldisrc=/net/work/people/oplatek/kaldi/src
+export PATH="$PATH":$kaldisrc/bin:$kaldisrc/fgmmbin:$kaldisrc/gmmbin:$kaldisrc/nnetbin:$kaldisrc/sgmm2bin:$kaldisrc/tiedbin:$kaldisrc/featbin:$kaldisrc/fstbin:$kaldisrc/latbin:$kaldisrc/onlinebin:$kaldisrc/sgmmbin
+
+### Openfst ###
+openfst=/ha/home/oplatek/50GBmax/kaldi/tools/openfst
+export PATH="$PATH":$openfst/bin
+export LD_LIBRARY_PATH="$LD_LIBRARY_PATH":$openfst/lib
+```
+
+Which tool for building a Language Model (LM) have I used?
+---------------------------------------------------------
+None. I received an already-built LM in ARPA format.
+
+NOTE: Probably, I should build my own LM.
+
+
+How have I installed Atlas?
+--------------------
+NOTE: I decided NOT to use Atlas, I USE OpenBlas INSTEAD. It is open source and it allows me to compile both shared and static libraries in one run.
+
+Nevertheless, here is how I installed Atlas:
+
+ * I installed version atlas3.10.1.tar.bz2 (available at sourceforge)
+ * I unpacked it under `kaldi-trunk/tools`, which created `kaldi-trunk/tools/ATLAS`
+ * The main problem with building ATLAS for me was disabling CPU throttling.
+ * I solved it by
+
+```bash
+# running the following command under root in my Ubuntu 12.10
+# In fact it does not turn off CPU throttling, but I do not need things optimized for my local machine
+# I ran it for all of my 4 cores
+# for n in 0 1 2 3 ; do echo 'performance' > /sys/devices/system/cpu/cpu${n}/cpufreq/scaling_governor ; done
+```
+
+ * Then I needed to install a Fortran compiler (the error from configure was a little obscured by subsequent errors):
+
+```bash
+sudo apt-get install gfortran
+```
+
+ * On Ubuntu 12.04 I had an issue with
+
+```bash
+/usr/include/features.h:323:26: fatal error: bits/predefs.h
+```
+
+ which I solved by
+
+```bash
+sudo apt-get install --reinstall libc6-dev
+```
+
+ * Finally, in `kaldi-trunk/tools/ATLAS` I ran:
+
+```bash
+mkdir build
+mkdir ../atlas_install
+cd build
+../configure --shared --incdir=`pwd`/../../atlas_install
+make
+make install
+```
diff --git a/README.md b/README.md
new file mode 100644
index 00000000000..b67f5ae3fa9
--- /dev/null
+++ b/README.md
@@ -0,0 +1,25 @@
+ABOUT
+=====
+ * This is a Git mirror of the [Svn trunk of the Kaldi project](http://sourceforge.net/projects/kaldi/)
+   `svn://svn.code.sf.net/p/kaldi/code/trunk`
+ * In the branch `master` I commit my work. In the branch `svn_mirror` I mirror `svn://svn.code.sf.net/p/kaldi/code/trunk`. In the branch `sandbox-oplatek` I am developing changes which I would like to contribute back to Kaldi.
+ * Currently, I mirror the repository manually, as often as needed.
+ * The main purpose of mirroring is that I want to build my own decoder and train my models for decoding based on an up-to-date Kaldi version.
+ * The recipe for training the models can be found at `egs/kaldi-vystadial-recipe`
+ * The source code for the Python wrapper for the online decoder is at `src/python-kaldi-decoding`
+ * Remarks about the new decoder are located at `src/vystadial-decoder`
+ * I use the `Fake submodules` approach to merge the 3 subprojects into this repository. More about `Fake submodules` [at this blog](http://debuggable.com/posts/git-fake-submodules:4b563ee4-f3cc-4061-967e-0e48cbdd56cb).
+ * I mirror the svn via `git svn`. [Nice intro to git svn](http://viget.com/extend/effectively-using-git-with-subversion), [Walk through](http://blog.shinetech.com/2009/02/17/my-git-svn-workflow/) and [Multiple svn-remotes](http://blog.shuningbian.net/2011/05/git-with-multiple-svn-remotes.html)
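+
+For illustration, a manual mirror update along these lines could look as follows; this is just a sketch with the branch names from this README, and it assumes the svn remote is already configured via `git svn`:
+```bash
+git checkout svn_mirror
+git svn rebase        # pull new revisions from svn://svn.code.sf.net/p/kaldi/code/trunk
+git checkout master
+git merge svn_mirror  # resolve conflicts such as the ones listed in this merge commit
+```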
+
+OTHER INFO
+----------
+ * Read `INSTALL.md` and `INSTALL` first!
+ * For training models read `egs/kaldi-vystadial-recipe/s5/README.md`
+ * For building and developing the decoder callable from Python read `src/python-kaldi-decoding/README.md`
+ * For information about the new decoder read `src/vystadial-decoder/README.md`
+ * This work is done under the [Vystadial project](https://sites.google.com/site/filipjurcicek/projects/vystadial).
+
+LICENSE
+--------
+ * We release all the changes at pyKaldi under the `Apache 2.0` license (Kaldi also uses the `Apache 2.0` license).
+ * We also want to publicly release the training data in autumn 2013.
diff --git a/README.txt b/README.txt
index fa0a7a21b93..e482e3ae176 100644
--- a/README.txt
+++ b/README.txt
@@ -1,4 +1,5 @@
-
+This is the official Kaldi readme. You are now in a Kaldi/trunk mirror.
+Read README.md and INSTALL.md first!
 
 See http://kaldi.sourceforge.net/ for documentation
diff --git a/egs/babel/s5/local/annotatedKwlist2KWs.pl b/egs/babel/s5/local/annotatedKwlist2KWs.pl
new file mode 100755
index 00000000000..566005bc89a
--- /dev/null
+++ b/egs/babel/s5/local/annotatedKwlist2KWs.pl
@@ -0,0 +1,124 @@
+#!/usr/bin/perl
+
+# Copyright 2012  Johns Hopkins University (Author: Guoguo Chen)
+# Apache 2.0.
+#
+
+use strict;
+use warnings;
+use Getopt::Long;
+
+my $Usage = <<EOU;
+Usage: annotatedKwlist2KWs.pl <kwlist_annotated_xml|-> <keyword_list|-> [category]
+ e.g.: annotatedKwlist2KWs.pl kwlist.annot.list keywords.list "NGram Order:2,3,4"
+
+This script reads an annotated kwlist xml file and writes a list of keywords, according
+to the given categories. The "category" is a "key:value" pair in the annotated kwlist xml
+file. For example:
+1. "NGram Order:2,3,4"
+2. "NGram Order:2"
+3. "NGram Order:-"
+where "NGram Order" is the category name. The first line means print keywords that are
+bigram, trigram and 4gram; the second line means print keywords only for bigram; the last
+line means print all possible ngram keywords.
+If no "category" is specified, the script will print out the possible categories.
+
+Allowed options:
+EOU
+
+GetOptions();
+
+@ARGV >= 2 || die $Usage;
+
+# Work out the input/output source
+my $kwlist_filename = shift @ARGV;
+my $kws_filename = shift @ARGV;
+
+my $source = "STDIN";
+if ($kwlist_filename ne "-") {
+  open(KWLIST, "<$kwlist_filename") || die "Fail to open kwlist file: $kwlist_filename\n";
+  $source = "KWLIST";
+}
+
+# Process kwlist.annot.xml
+my %attr;
+my %attr_kws;
+my $kwid="";
+my $name="";
+my $value="";
+while (<$source>) {
+  chomp;
+  if (m/<kw /) {($kwid) = /kwid="(\S+)"/; next;}
+  if (m/<name>/) {($name) = /(.*)<\/name>/; next;}
+  if (m/<value>/) {
+    ($value) = /(.*)<\/value>/;
+    if (defined($attr{$name})) {
+      $attr{"$name"}->{"$value"} = 1;
+    } else {
+      $attr{"$name"} = {"$value", 1};
+    }
+    if (defined($attr_kws{"${name}_$value"})) {
+      $attr_kws{"${name}_$value"}->{"$kwid"} = 1;
+    } else {
+      $attr_kws{"${name}_$value"} = {"$kwid", 1};
+    }
+  }
+}
+
+my $output = "";
+if (@ARGV == 0) {
+  # If no category provided, print out the possible categories
+  $output .= "Possible categories are:\n\n";
+  foreach my $name (keys %attr) {
+    $output .= "$name:";
+    my $count = 0;
+    foreach my $value (keys %{$attr{$name}}) {
+      if ($value eq "") {$value = "\"\"";}
+      if ($count == 0) {
+        $output .= "$value";
+        $count ++; next;
+      }
+      if ($count == 6) {
+        $output .= ", ...";
+        last;
+      }
+      $output .= ",$value"; $count ++;
+    }
+    $output .= "\n";
+  }
+  print STDERR $output;
+  $output = "";
+} else {
+  my %keywords;
+  while (@ARGV > 0) {
+    my $category = shift @ARGV;
+    my @col = split(/:/, $category);
+    @col == 2 || die "Bad category \"$category\"\n";
+    $name = $col[0];
+    if ($col[1] eq "-") {
+      foreach my $value (keys %{$attr{$name}}) {
+        foreach my $kw (keys %{$attr_kws{"${name}_$value"}}) {
+          $keywords{$kw} = 1;
+        }
+      }
+    } else {
+      my @col1 = split(/,/, $col[1]);
+      foreach my $value (@col1) {
+        foreach my $kw (keys %{$attr_kws{"${name}_$value"}}) {
+          $keywords{$kw} = 1;
+        }
+      }
+    }
+  }
+  foreach my $kw (keys %keywords) {
+    $output .= "$kw\n";
+  }
+}
+
+if ($kwlist_filename ne "-") {close(KWLIST);}
+if ($kws_filename eq "-") { print $output;}
+else {
+  open(O, ">$kws_filename") || die "Fail to open file $kws_filename\n";
+  print O $output;
+  close(O);
+}
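Together with subsetATWV.pl, which is added later in this patch, this supports a small select-then-score workflow; a hypothetical invocation (file names are illustrative only):

```bash
# Select all bigram keywords from the annotated kwlist ...
local/annotatedKwlist2KWs.pl kwlist.annot.xml bigram.list "NGram Order:2"
# ... then report the ATWV contribution of just that subset from F4DE's bsum.txt.
local/subsetATWV.pl --subset-name "2gram" bigram.list bsum.txt
```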
(string, default="") + --ins-cost : Insertion cost (double, default=1 ) + --del-cost : Deletion cost (double, default=1 ) + --subs-cost : substitution cost (double, default=1 ) + --boundary-ins-cost : Cost for insertions at work boundary (double, default=0.1) + --boundary-off : No insertions at word boundary (boolean, default=true) +EOU + +my $confusion_matrix = ""; +my $insertion_cost = 1; +my $deletion_cost = 1; +my $substitution_cost = 1; +my $boundary_ins_cost = 0.1; +my $boundary_off="true"; +GetOptions('confusion-matrix=s' => \$confusion_matrix, + 'ins-cost=f' => \$insertion_cost, + 'del-cost=f' => \$deletion_cost, + 'subs-cost=f' => \$substitution_cost, + 'boundary-ins-cost=f' => \$boundary_ins_cost, + 'boundary-off=s' => \$boundary_off); + +@ARGV == 2 || die $Usage; + +$boundary_off eq "true" || $boundary_off eq "false" || die "$0: Bad value for option --boundary-off\n"; + +# Workout the input and output parameters +my $phone_in = shift @ARGV; +my $fst_out = shift @ARGV; + +open(I, "<$phone_in") || die "$0: Fail to open lexicon $phone_in\n"; +open(O, ">$fst_out") || die "$0: Fail to write FST $fst_out\n"; + +# Read confusion matrix +my %confusion; +if ($confusion_matrix ne "") { + open(M, "<$confusion_matrix") || die "$0: Fail to open confusion matrix $confusion_matrix\n"; + while () { + chomp; + my @col = split(); + @col == 3 || die "$0: Bad line in confusion matrix \"$_\"\n"; + $confusion{"$col[0]_$col[1]"} = $col[2]; + } + close(M); +} + +# Start processing +my @phones; +while () { + chomp; + my @col = split(); + @col == 1 || die "$0: Bad number of columns in phone list \"$_\"\n"; + if ($col[0] eq "") {next;} + push(@phones, $col[0]); +} + +# Add insertions, deletions +my $fst = ""; +foreach my $p (@phones) { + if ($confusion_matrix eq "") { + $fst .= "1 1 $p $deletion_cost\n"; # Deletions + $fst .= "1 1 $p $insertion_cost\n"; # Insertions + if ($boundary_off eq "false") { + $fst .= "0 0 $p $boundary_ins_cost\n"; + $fst .= "0 1 $p $boundary_ins_cost\n"; + $fst .= "2 2 $p $boundary_ins_cost\n"; + $fst .= "1 2 $p $boundary_ins_cost\n"; + } + } else { + my $key = "${p}_"; + if (defined($confusion{$key})) { + $fst .= "1 1 $p $confusion{$key}\n"; + } + $key = "_${p}"; + if (defined($confusion{$key})) { + $fst .= "1 1 $p $confusion{$key}\n"; + if ($boundary_off eq "false") { + $fst .= "0 0 $p $confusion{$key}\n"; + $fst .= "0 1 $p $confusion{$key}\n"; + $fst .= "2 2 $p $confusion{$key}\n"; + $fst .= "1 2 $p $confusion{$key}\n"; + } + } + } +} +foreach my $p1 (@phones) { + foreach my $p2 (@phones) { + if ($p1 eq $p2) { + $fst .= "1 1 $p1 $p2 0\n"; + } else { + if ($confusion_matrix eq "") { + $fst .= "1 1 $p1 $p2 $substitution_cost\n"; + } else { + my $key = "${p1}_${p2}"; + if (defined($confusion{$key})) { + $fst .= "1 1 $p1 $p2 $confusion{$key}\n"; + } + } + } + } +} +if ($boundary_off eq "false") { + $fst .= "0 1 0\n"; + $fst .= "1 2 0\n"; + $fst .= "2\n"; +} else { + $fst .= "1\n"; +} + +print O $fst; + +close(I); +close(O); diff --git a/egs/babel/s5/local/count2logprob.pl b/egs/babel/s5/local/count2logprob.pl new file mode 100755 index 00000000000..378a8b8dd97 --- /dev/null +++ b/egs/babel/s5/local/count2logprob.pl @@ -0,0 +1,94 @@ +#!/usr/bin/perl + +# Copyright 2012 Johns Hopkins University (Author: Guoguo Chen) +# Apache 2.0. +# + +use strict; +use warnings; +use Getopt::Long; + +my $Usage = < + This script takes in the confusion phone pair counts and converts + the counts into negated log probabilities. 
diff --git a/egs/babel/s5/local/count2logprob.pl b/egs/babel/s5/local/count2logprob.pl
new file mode 100755
index 00000000000..378a8b8dd97
--- /dev/null
+++ b/egs/babel/s5/local/count2logprob.pl
@@ -0,0 +1,94 @@
+#!/usr/bin/perl
+
+# Copyright 2012  Johns Hopkins University (Author: Guoguo Chen)
+# Apache 2.0.
+#
+
+use strict;
+use warnings;
+use Getopt::Long;
+
+my $Usage = <<EOU;
+Usage: count2logprob.pl [options] <counts_in> <confusion_matrix_out>
+ This script takes in the confusion phone pair counts and converts
+ the counts into negated log probabilities. The counts should be in
+ the following format:
+   p1 p2 count1        // For substitution
+   p3 <eps> count2     // For deletion
+   <eps> p4 count3     // For insertion
+
+Allowed options:
+  --cutoff : Minimal count to be considered (int, default=1)
+EOU
+
+my $cutoff = 1;
+GetOptions('cutoff=i' => \$cutoff);
+
+@ARGV == 2 || die $Usage;
+
+# Work out the input and output parameters
+my $cm_in = shift @ARGV;
+my $cm_out = shift @ARGV;
+
+open(I, "<$cm_in") || die "$0: Fail to open counts file $cm_in\n";
+open(O, ">$cm_out") || die "$0: Fail to write confusion matrix $cm_out\n";
+
+# Collect counts
+my %ins;
+my %del;
+my %subs;
+my %phone_count;
+my $ins_count = 0;
+my $del_count = 0;
+while (<I>) {
+  chomp;
+  my @col = split();
+  @col == 3 || die "$0: Bad line in confusion matrix file: $_\n";
+  my ($p1, $p2, $count) = ($col[0], $col[1], $col[2]);
+  $count >= $cutoff || next;
+  if ($p1 eq "<eps>" && $p2 ne "<eps>") {
+    $ins{$p2} = $count;
+    $ins_count += $count;
+  } elsif ($p1 ne "<eps>" && $p2 eq "<eps>") {
+    $del{$p1} = $count;
+    $del_count += $count;
+  } elsif ($p1 ne "<eps>" && $p2 ne "<eps>") {
+    $p1 ne $p2 || next;    # Skip identical phone pairs
+    $subs{"${p1}_$p2"} = $count;
+    if (defined($phone_count{$p1})) {
+      $phone_count{$p1} += $count;
+    } else {
+      $phone_count{$p1} = $count;
+    }
+  }
+}
+
+# Compute negated log probability
+foreach my $key (keys %ins) {
+  $ins{$key} = -log($ins{$key}/$ins_count);
+}
+foreach my $key (keys %del) {
+  $del{$key} = -log($del{$key}/$del_count);
+}
+foreach my $key (keys %subs) {
+  my @col = split(/_/, $key);
+  $subs{$key} = -log($subs{$key}/$phone_count{$col[0]});
+}
+
+# Print results
+my $output = "";
+foreach my $key (keys %ins) {
+  $output .= "<eps> $key $ins{$key}\n";
+}
+foreach my $key (keys %del) {
+  $output .= "$key <eps> $del{$key}\n";
+}
+foreach my $key (keys %subs) {
+  my @col = split(/_/, $key);
+  $output .= "$col[0] $col[1] $subs{$key}\n";
+}
+
+print O $output;
+
+close(I);
+close(O);
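As a toy check of the conversion (the counts below are made up): substitution probabilities are normalized by the total count of the source phone before the log is negated, and the output is in exactly the three-column format that `buildEditDistanceFst.pl --confusion-matrix` above consumes:

```bash
cat > counts.txt <<EOF
a b 1
a c 3
EOF
local/count2logprob.pl counts.txt confusion.txt
cat confusion.txt
# a b 1.386...   (i.e. -log(1/4); digits as Perl happens to print them)
# a c 0.2876...  (i.e. -log(3/4))
```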
+my $kws = ""; +while (<$source>) { + chomp; + my @col = split(); + @col == 1 || die "Bad line $_\n"; + if ($kws eq "") { + $kws = $col[0]; + } else { + $kws .= "|$col[0]"; + } +} + +# Process bsum.txt +my $targ_sum = 0; +my $corr_sum = 0; +my $fa_sum = 0; +my $miss_sum = 0; +my $twv_sum = 0; +my $count = 0; +my $subset_count = 0; +my $flag = 0; +if ($kws ne "") { + while () { + chomp; + # Workout the total keywords that have occurrence in the search collection + if (/^Summary Totals/) {$flag = 0;} + if (/^Keyword/) {$flag = 1;} + my @col; + if ($flag == 1) { + # Figure out keywords that don't have occurrences in the search collection + @col = split(/\|/, $_); + $col[2] =~ s/^\s+//; + $col[2] =~ s/\s+$//; + $col[2] ne "" || next; + $count ++; + } else { + next; + } + + # Only collect statistics for given subset + m/$kws/ || next; + + # Keywods that are in the given subset, and have occurrences + $targ_sum += $col[2]; + $corr_sum += $col[3]; + $fa_sum += $col[4]; + $miss_sum += $col[5]; + $twv_sum += $col[6]; + $subset_count ++; + } +} + +# Compute ATWV +my $subset_atwv = ($subset_count == 0) ? 0 : $twv_sum/$subset_count; +my $atwv = ($count == 0) ? 0 : $twv_sum/$count; +my $bp_atwv = ($count == 0) ? 0 : $subset_count/$count; + +# Format the numbers +my $format = "%-${width}d"; +$subset_count = sprintf($format, $subset_count); +$targ_sum = sprintf($format, $targ_sum); +$corr_sum = sprintf($format, $corr_sum); +$fa_sum = sprintf($format, $fa_sum); +$miss_sum = sprintf($format, $miss_sum); +$subset_atwv = sprintf("% .4f", $subset_atwv); +$atwv = sprintf("% .4f", $atwv); +$bp_atwv = sprintf("% .4f", $bp_atwv); + +# Print +if ($subset_name ne "") {print "$subset_name: ";} +print "#Keywords=$subset_count, #Targ=$targ_sum, #Corr=$corr_sum, #FA=$fa_sum, #Miss=$miss_sum, "; +print "Contributed ATWV=$atwv, Best Possible Contributed ATWV=$bp_atwv, ATWV=$subset_atwv\n"; + +if ($kws_filename ne "-") {close(KWS);} +close(BSUM); diff --git a/egs/kaldi-vystadial-recipe/.gitingore b/egs/kaldi-vystadial-recipe/.gitingore new file mode 100644 index 00000000000..bbd86a25b01 --- /dev/null +++ b/egs/kaldi-vystadial-recipe/.gitingore @@ -0,0 +1,2 @@ +data +exp diff --git a/egs/kaldi-vystadial-recipe/README.md b/egs/kaldi-vystadial-recipe/README.md new file mode 100644 index 00000000000..7d4fcfe7d56 --- /dev/null +++ b/egs/kaldi-vystadial-recipe/README.md @@ -0,0 +1,54 @@ +SUMMARY +------- +KALDI recipe based on voxforge KALDI recipe +http://vpanayotov.blogspot.cz/2012/07/voxforge-scripts-for-kaldi.html . +Requires KALDI installation and Linux environment. (Tested on Ubuntu 10.04 and 12.10.) +Written in Bash an Python 2.7.3. + +DESCRIPTION +----------- + * Our scripts prepare the data to expected format in s5/data. + * Stores experiments in s5/exp + * steps/ contains common scripts from wsj/s5/utils + * utils/ cotains common scritps from wsj/s5/utils + * local/ contains scripts for data preparation to prepare s5/data structure + * conf/ contains a few configuration files for KALDI + + +Runnning experiments +-------------------- +Before running the experiments check the following files: + * `conf` directory contains different configuration related for the training + * `path.sh` just set up path for running Kaldi binaries and path to data. + You should also setup `njobs` according your computer capabalities. + * `cmd.sh` set training commands e.g. for SGE grid. + * If you set up everything right, just launch `run.sh` It will create `mfcc`, `data` and `exp` directories. 
+   If any of them exists, it will ask you if you want them to be overwritten.
+   ```bash
+   ./run.sh | tee mylog.log  # I always store the output to the log
+   ```
+ * I wrote a quick-and-dirty script for collecting results. It's really beta software. It may crash, but it works for me.
+   ```bash
+$ local/results.py exp  # specify the experiment directory and wait a while
+exp           RT coef         WER          SER
+_ri3b_fmmi_b  2.42235533333   (19.45, 13)  (44.67, 11)
+tri2b_mpe     0.37968465      (20.83, 20)  (47.2, 14)
+mono          0.9478559       (52.42, 15)  (77.33, 14)
+tri3b_mmi     0.357894733333  (19.77, 16)  (46.0, 11)
+tri1          0.6558491       (27.12, 18)  (57.33, 20)
+...
+... and other results in plaintext
+...
+==================
+\begin{tabular}{cccc}
+exp & RT coef & WER & SER \\
+_ri3b_fmmi_b & 2.42235533333 & (19.45, 13) & (44.67, 11)\\
+tri2b_mpe & 0.37968465 & (20.83, 20) & (47.2, 14) \\
+mono & 0.9478559 & (52.42, 15) & (77.33, 14)\\
+tri3b_mmi & 0.357894733333 & (19.77, 16) & (46.0, 11) \\
+tri1 & 0.6558491 & (27.12, 18) & (57.33, 20)\\
+...
+... and the same results in TeX
+...
+
+   ```
diff --git a/egs/kaldi-vystadial-recipe/s5/.gitignore b/egs/kaldi-vystadial-recipe/s5/.gitignore
new file mode 100644
index 00000000000..35e801d3f8c
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/.gitignore
@@ -0,0 +1,7 @@
+data
+exp*
+mfcc
+tools
+data_voip_en*
+Results
+voip
diff --git a/egs/kaldi-vystadial-recipe/s5/cmd.sh b/egs/kaldi-vystadial-recipe/s5/cmd.sh
new file mode 100644
index 00000000000..fb1d5d951d6
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/cmd.sh
@@ -0,0 +1,13 @@
+# "queue.pl" uses qsub. The options to it are
+# options to qsub. If you have GridEngine installed,
+# change this to a queue you have access to.
+# Otherwise, use "run.pl", which will run jobs locally
+# (make sure your --num-jobs options are no more than
+# the number of cpus on your machine).
+
+#export train_cmd="queue.pl -q all.q@a*.clsp.jhu.edu"
+#export decode_cmd="queue.pl -q all.q@a*.clsp.jhu.edu"
+# export train_cmd="queue.pl -l mf=5g"
+# export decode_cmd="queue.pl -l mf=5g"
+export train_cmd=run.pl
+export decode_cmd=run.pl
diff --git a/egs/kaldi-vystadial-recipe/s5/conf/decode.config b/egs/kaldi-vystadial-recipe/s5/conf/decode.config
new file mode 100644
index 00000000000..332ae89de9d
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/conf/decode.config
@@ -0,0 +1,3 @@
+first_beam=10.0
+beam=13.0
+lat_beam=6.0
diff --git a/egs/kaldi-vystadial-recipe/s5/conf/mfcc.conf b/egs/kaldi-vystadial-recipe/s5/conf/mfcc.conf
new file mode 100644
index 00000000000..cd5dc059b45
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/conf/mfcc.conf
@@ -0,0 +1,7 @@
+--use-energy=false  # non-default option: false -> use C0 instead of energy
+# NUMCEPS in HTK is without C0: there 12, here 13 (the default)
+--low-freq=125
+--high-freq=3800
+--htk-compat
+--remove-dc-offset  # equivalent to ZMEANSOURCE in HTK
+# --subtract-mean  # not recommended to do it this way
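A config like this is consumed by Kaldi's feature extraction binary (the recipe invokes it through steps/make_mfcc.sh); a minimal sketch of the underlying call, with placeholder paths:

```bash
compute-mfcc-feats --config=conf/mfcc.conf \
  scp:data/train/wav.scp ark:mfcc/raw_mfcc_train.ark
```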
diff --git a/egs/kaldi-vystadial-recipe/s5/conf/train_conf.sh b/egs/kaldi-vystadial-recipe/s5/conf/train_conf.sh
new file mode 100755
index 00000000000..a0947dcde96
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/conf/train_conf.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+# How big a portion of the available data to use
+# everyN=3 -> we use one third of the data
+everyN=1
+
+# Train monophone models on a subset of the data of this size
+monoTrainData=1000
+
+# Number of states (pdfs) for phoneme training
+pdf=1200
+
+# Maximum number of Gaussians used for training
+gauss=19200
+
+# Test-time language model order
+# We are just copying the ARPA LM (3rd order)
+lm_order=3
+
+train_mmi_boost=0.05
diff --git a/egs/kaldi-vystadial-recipe/s5/decode/decode-lattice.sh b/egs/kaldi-vystadial-recipe/s5/decode/decode-lattice.sh
new file mode 100755
index 00000000000..53e2a8b3284
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/decode/decode-lattice.sh
@@ -0,0 +1,112 @@
+#!/bin/bash
+# -*- coding: utf-8 -*-
+# Author: Ondrej Platek, 2013; the code is without any warranty!
+# Created: 10:48:02 09/04/2013
+# Modified: 16:52:59 10/04/2013
+
+# Set the paths to the binaries and scripts needed
+fwd=`dirname $0`
+KALDI_ROOT=$fwd/../../../..
+export PATH="$PATH":$fwd/../steps/:$fwd/../utils/:$KALDI_ROOT/src/onlinebin:$KALDI_ROOT/src/bin
+
+# Change this to "tri2a" if you would like to test using an ML-trained model
+ac_model_type=tri2a
+exp="$fwd/../Results/expc0bcaa8acd2732dce7c25c27b945d566d80ca7a6"
+data="$fwd/../data_voip_en1/test"
+
+# Alignments and decoding results are saved in this directory (simulated decoding only)
+decode_dir="$fwd/../exp-decode-lat"
+
+# Change this to "live" either here or using a command line switch like:
+# --test-mode live  # NOT SUPPORTED YET
+test_mode="simulated"
+
+# decoding parameters
+cmd=run.pl
+nj=1  # we do not do data_split as in steps/decode.sh
+max_active=7000
+beam=13.0
+latbeam=6.0
+acwt=0.083333  # note: only really affects pruning (scoring is on lattices).
+lmwt=9  # TODO set up according to experiments
+feat_type='delta'
+
+. $fwd/../path.sh;  # source the path.
+. parse_options.sh || exit 1;
+
+
+ac_model="$exp/$ac_model_type"
+
+if [ ! -d $ac_model ]; then
+  echo "The directory for the AC model does not exist: $ac_model"
+  exit 1
+fi
+
+case $test_mode in
+  live)
+    echo
+    echo "CURRENTLY NOT SUPPORTED!"
+    echo -e "  LIVE DEMO MODE - you can use a microphone and say something\n"
+    echo "Using model in $ac_model directory"
+    echo "CURRENTLY NOT SUPPORTED!"
+    echo
+    exit 1;;
+  simulated)
+    echo
+    echo -e "  SIMULATED ONLINE DECODING - pre-recorded audio is used\n"
+    echo "Test files are from directory $data"
+    echo "Using model in $ac_model directory"
+    echo
+    ;;
+
+  *)
+    echo "Invalid test mode! Should be either \"live\" or \"simulated\"!";
+    exit 1;;
+esac
+
+# Estimate the error rate for the simulated decoding
+if [ $test_mode == "simulated" ]; then
+  mkdir -p $decode_dir
+  # Reset the files - do not append
+  rm -f $decode_dir/wav.scp "$decode_dir/ref.txt" "$decode_dir/utt2spk"
+  for f in "$data"/*.wav; do
+    name=`basename $f`
+    echo "$name $f" >> $decode_dir/wav.scp
+    echo "$name $name" >> $decode_dir/utt2spk
+    # symbols=`sym2int.pl $ac_model/graph/words.txt < "${f}.trn"`
+    symbols=`cat "${f}.trn"`
+    echo "$name $symbols" >> $decode_dir/ref.txt
+  done
+
+  # utt2spk_to_spk2utt.pl (in utils) creates spk2utt from utt2spk
+  utt2spk_to_spk2utt.pl "$decode_dir"/utt2spk > "$decode_dir/spk2utt" || exit 1
+  # make_mfcc.sh (in steps) creates feats.scp  FIXME creates a wrong scp
+  mkdir -p $decode_dir/mfcc
+  time ( make_mfcc.sh --cmd "$cmd" --nj $nj $decode_dir $decode_dir $decode_dir/mfcc || exit 1 )
+  # compute_cmvn_stats.sh (in steps) creates cmvn.scp
+  time ( compute_cmvn_stats.sh $decode_dir $decode_dir $decode_dir/mfcc || exit 1 )
+
+  # Decoding: Based on steps/decode.sh and local/score.sh
+  case $feat_type in
+    delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$decode_dir/utt2spk scp:$decode_dir/cmvn.scp scp:$decode_dir/feats.scp ark:- | add-deltas ark:- ark:- |";;
+    # lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";;
+    *) echo "Invalid feature type $feat_type" && exit 1;
+  esac
+
+  # TODO How is gmm-latgen parallelized? Over data -> bad for us!
+  # TODO $nj == 1: does it depend on the data? IMHO yes (see steps/decode.sh)
+  time ( $cmd JOB=1:$nj $decode_dir/decodeLattice.JOB.log \
+    gmm-latgen-faster --max-active=$max_active --beam=$beam --lattice-beam=$latbeam \
+    --acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$ac_model/graph/words.txt \
+    $ac_model/final.mdl $ac_model/graph/HCLG.fst "$feats" "ark:|gzip -c > $decode_dir/lat.JOB.gz" || exit 1 )
+
+  time ( lattice-best-path --lm-scale=$lmwt --word-symbol-table=$ac_model/graph/words.txt \
+    "ark:gunzip -c $decode_dir/lat.*.gz|" ark,t:$decode_dir/trans.txt || exit 1 )
+
+  # Finally compute WER
+  cat $decode_dir/trans.txt | \
+    utils/int2sym.pl -f 2- $ac_model/graph/words.txt | sed 's:\::g' | \
+    compute-wer --text --mode=present \
+    ark:$decode_dir/ref.txt ark,p:- >& $decode_dir/wer || exit 1;
+
+fi
diff --git a/egs/kaldi-vystadial-recipe/s5/decode/decode-online.sh b/egs/kaldi-vystadial-recipe/s5/decode/decode-online.sh
new file mode 100755
index 00000000000..0b302d3bf42
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/decode/decode-online.sh
@@ -0,0 +1,97 @@
+#!/bin/bash
+# FIXME in general in bad shape
+# 1) WER is wrong
+
+# Copyright 2013 Ondrej Platek, based on a Vassil Panayotov script
+# Apache 2.0
+
+# Set the paths to the binaries and scripts needed
+fwd=`dirname $0`
+KALDI_ROOT=$fwd/../../../..
+export PATH=$fwd/../../s5/utils/:$KALDI_ROOT/src/onlinebin:$KALDI_ROOT/src/bin:$PATH
+
+# Change this to "tri2a" if you would like to test using an ML-trained model
+ac_model_type=tri2a
+exp="$fwd/../Results/expc0bcaa8acd2732dce7c25c27b945d566d80ca7a6"
+data="$fwd/../data_voip_en/test"
+
+# Alignments and decoding results are saved in this directory (simulated decoding only)
+decode_dir="$fwd/../exp-decode"
+
+# Change this to "live" either here or using a command line switch like:
+# --test-mode live
+test_mode="simulated"
+
+. parse_options.sh
+
+ac_model="$exp/$ac_model_type"
+trans_matrix=""
+
+
+if [ ! -d $ac_model ]; then
+  echo "The directory for the AC model does not exist: $ac_model"
+  exit 1
+fi
+
+if [ -s $ac_model/matrix ]; then
+  trans_matrix=$ac_model/matrix  # lda matrix
+fi
+
+case $test_mode in
+  live)
+    echo
+    echo -e "  LIVE DEMO MODE - you can use a microphone and say something\n"
+    echo "Using model in $ac_model directory"
+    echo
+    online-gmm-decode-faster --rt-min=0.5 --rt-max=0.7 --max-active=4000 \
+      --beam=12.0 --acoustic-scale=0.0769 $ac_model/final.mdl $ac_model/graph/HCLG.fst \
+      $ac_model/graph/words.txt '1:2:3:4:5' $trans_matrix;;
+
+  simulated)
+    echo
+    echo -e "  SIMULATED ONLINE DECODING - pre-recorded audio is used\n"
+    echo "Test files are from directory $data"
+    echo "Using model in $ac_model directory"
+    echo
+    ;;
+
+  *)
+    echo "Invalid test mode! Should be either \"live\" or \"simulated\"!";
+    exit 1;;
+esac
+
+# Estimate the error rate for the simulated decoding
+if [ $test_mode == "simulated" ]; then
+  mkdir -p $decode_dir
+  # Create a new input.scp file
+  rm -f $decode_dir/input.scp
+  for f in "$data"/*.wav; do
+    bf=`basename $f`
+    bf=${bf%.wav}
+    echo $bf $f >> $decode_dir/input.scp
+  done
+  # Decode
+  online-wav-gmm-decode-faster --verbose=1 --rt-min=0.8 --rt-max=0.85 \
+    --max-active=4000 --beam=12.0 --acoustic-scale=0.0769 \
+    scp:$decode_dir/input.scp $ac_model/final.mdl $ac_model/graph/HCLG.fst \
+    $ac_model/graph/words.txt '1:2:3:4:5' ark,t:$decode_dir/trans.txt \
+    ark,t:$decode_dir/ali.txt $trans_matrix
+
+  # Create a new ref.txt file
+  rm -f "$decode_dir/ref.txt"
+  cat $decode_dir/input.scp | tr -s ' ' | cut -d ' ' -f 2- |\
+  while read wav_file ; do
+    # Convert the reference transcripts from symbols to word IDs
+    symbols=`sym2int.pl $ac_model/graph/words.txt < "$wav_file.trn"`
+    name=`basename "$wav_file"`
+    name=${name%.wav}
+    echo "$name $symbols" >> $decode_dir/ref.txt
+  done
+
+  # Compact the hypotheses belonging to the same test utterance
+  cat $decode_dir/trans.txt | tr -s ' ' | sed -r 's:_[0-9]+-[0-9]+\>::' |\
+    gawk '{key=$1; $1=""; arr[key]=arr[key] " " $0; } END { for (k in arr) { print k " " arr[k]} }' > $decode_dir/hyp.txt
+
+  # Finally compute WER
+  compute-wer --mode=all --verbose=100 ark,t:$decode_dir/ref.txt ark,t:$decode_dir/hyp.txt
+fi
diff --git a/egs/kaldi-vystadial-recipe/s5/local/make_trans.py b/egs/kaldi-vystadial-recipe/s5/local/make_trans.py
new file mode 100755
index 00000000000..59ced4e3747
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/local/make_trans.py
@@ -0,0 +1,50 @@
+#!/usr/bin/env python
+
+# Copyright 2012 Vassil Panayotov
+# Apache 2.0
+
+"""
+Takes a "PROMPTS" file with lines like:
+1snoke-20120412-hge/mfc/a0405 IT SEEMED THE ORDAINED ORDER OF THINGS THAT DOGS SHOULD WORK
+
+, an ID prefix and a list of audio file names (e.g. for the above example the list will contain "a0405").
+It checks whether the prompts file has a transcription for all audio files in the list and,
+if this is the case, produces a transcript line for each file in the format:
+prefix_a0405 IT SEEMED THE ORDAINED ORDER OF THINGS THAT DOGS SHOULD WORK
+"""
+
+import sys
+
+def err(msg):
+    print >> sys.stderr, msg
+
+if len(sys.argv) != 4:
+    err("Usage: %s <prompts_file> <id_prefix> <utt_id_list>" % sys.argv[0])
+    sys.exit(1)
+
+#err(str(sys.argv))
+id_prefix = sys.argv[2]
+utt_ids = sys.argv[3].strip().split()
+utt2trans = dict()
+unnorm_utt = set()
+for l in file(sys.argv[1]):
+    u, trans = l.split(None, 1)
+    u = u.strip().split('/')[-1]
+    trans = trans.strip().replace("-", " ")
+    if not trans.isupper() or \
+       not trans.strip().replace(' ', '').replace("'", "").isalpha():
+        err("The transcript for '%s' (user '%s') is not properly normalized - skipped!"
+            % (u, id_prefix))
+        err(trans)
+        unnorm_utt.add(u)
+        continue
+    utt2trans[u] = trans
+
+for uid in utt_ids:
+    if uid in unnorm_utt:
+        continue  # avoid double reporting the same problem
+    if uid not in utt2trans:
+        err("No transcript found for %s_%s" % (id_prefix, uid))
+        continue
+    print "%s_%s %s" % (id_prefix, uid, utt2trans[uid])
+
diff --git a/egs/kaldi-vystadial-recipe/s5/local/results.py b/egs/kaldi-vystadial-recipe/s5/local/results.py
new file mode 100755
index 00000000000..3586197ada8
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/local/results.py
@@ -0,0 +1,164 @@
+#!/usr/bin/env python
+# Author: Ondrej Platek, 2013; the code is without any warranty!
+# Created: 14:29:00 07/03/2013
+# Modified: 14:29:00 07/03/2013
+
+import argparse
+import subprocess
+import re
+from numpy import mean
+
+
+def getLog(path):
+    try:
+        rt = subprocess.check_output(['grep', '-r', 'real-time factor', path])
+        wer = subprocess.check_output(['grep', '-r', '%WER', path])
+        ser = subprocess.check_output(['grep', '-r', '%SER', path])
+        return (rt, wer, ser)
+    except subprocess.CalledProcessError as err:
+        print err
+
+
+def readLogs(rtpath, werpath):
+    # let the exception be seen
+    rt = open(rtpath).read()
+    wer = open(werpath).read()
+    ser = wer  # ATTENTION: in our setting the same log
+    return (rt, wer, ser)
+
+
+def extractResults(rt, wer, ser):
+    rt = rt.splitlines()
+    wer = wer.splitlines()
+    ser = ser.splitlines()
+    # expp follows the naming convention for the exp directories:
+    # exp/ expBlabalabla/ exp_asdfasdf/ ...
+    expp = re.compile(r'exp.*?/(.*?)/')
+    rtp = re.compile(r'([0-9\.]+)$')
+    werp = re.compile(r'%WER ([0-9\.]+)')
+    serp = re.compile(r'%SER ([0-9\.]+)')
+    itp = re.compile(r'wer_([0-9][0-9]?):%[SW]ER')
+
+    # for l in ser:  # debugging
+    #     print l
+    #     print expp.search(l).group(1)
+    try:
+        rts = [(expp.search(l).group(1), rtp.search(l).group(1)) for l in rt]
+    except:
+        print rt
+        raise
+    try:
+        wers = [(expp.search(l).group(1), werp.search(l).group(1), itp.search(l).group(1))
+                for l in wer]
+    except:
+        print wer
+        raise
+    try:
+        sers = [(expp.search(l).group(1), serp.search(l).group(1), itp.search(l).group(1))
+                for l in ser]
+    except:
+        print ser
+        raise
+
+    exp_names = list(
+        set([n for (n, _) in rts] + [n for (n, _, _) in wers]))
+    results = {}
+    for e in exp_names:
+        w = [(float(wr), int(it)) for (exp, wr, it) in wers if exp == e]
+        s = [(float(sr), int(it)) for (exp, sr, it) in sers if exp == e]
+        r = [float(r_) for (exp, r_) in rts if exp == e]
+        w.sort()
+        s.sort()
+        r.sort()
+        results[e] = (w, s, r)
+    return results
+
+
+class Table(object):
+    def __init__(self, data=[], colnames=[]):
+        self.data = data
+        self.colnames = colnames
+        self.colSep = '\t'
+        self.lineSep = '\n'
+
+    def data2str(self):
+        strdata = []
+        for r in self.data:
+            strdata.append([str(c) for c in r])
+        return strdata
+
+    def __str__(self):
+        sd = self.data2str()
+        colwidth = [len(c) for c in self.colnames]
+        for j in range(len(colwidth)):
+            for r in sd:
+                colwidth[j] = max(colwidth[j], len(r[j]))
+
+        gaps = [m - len(c) for (m, c) in zip(colwidth, self.colnames)]
+        rows = [self.colSep.join(
+            [c + ' ' * gap for c, gap in zip(self.colnames, gaps)])]
+        for r in sd:
+            gaps = [m - len(c) for (m, c) in zip(colwidth, r)]
+            rows.append(
+                self.colSep.join([c + ' ' * d for c, d in zip(r, gaps)]))
+        return self.lineSep.join(rows)
+
+
+class LatexTable(Table):
+    def __init__(self, data=[], colnames=[]):
+        Table.__init__(self, data, colnames)
+        nc = len(colnames)
+        self.header = '\\begin{tabular}{%s}' % ('c' * nc)
+        self.tail = '\\end{tabular}'
+        self.colSep = ' & '
+        self.lineSep = '\\\\ \n'
+
+    def __str__(self):
+        table_s = super(LatexTable, self).__str__()
+        return '%s\n%s\n%s\n' % (self.header, table_s, self.tail)
+
+
+def Table2LatexTable(table):
+    return LatexTable(table.data, table.colnames)
+
+
+def createSmallTable(r):
+    d = []
+    for k, v in r.iteritems():
+        w, s, r = v
+        if w == []:
+            minw = None
+        else:
+            minw = min(w)  # returns a tuple if w is a list of tuples
+        if s == []:
+            mins = None
+        else:
+            mins = min(s)  # returns a tuple if s is a list of tuples
+        d.append([k, mean(r), minw, mins])
+    t = Table(d, ['exp', 'RT coef', 'WER', 'SER'])
+    return t
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description='Parse WER and real-time ratio logs generated by run.sh')
+    # TODO parser with commands log and exp e.g.:
+    # http://pymotw.com/2/argparse/#mutually-exclusive-options
+
+    exp = True
+    if exp:
+        parser.add_argument('expath', type=str, action='store')
+        p = parser.parse_args()
+        rt, wer, ser = getLog(p.expath)
+    else:
+        parser.add_argument('--werlog', action='store', type=str)
+        parser.add_argument('--rtlog', action='store', type=str)
+        p = parser.parse_args()
+        rt, wer, ser = readLogs(p.rtlog, p.werlog)
+
+    r = extractResults(rt, wer, ser)
+    t = createSmallTable(r)
+    print t
+    print '=================='
+    t2 = Table2LatexTable(t)
+    print t2
00000000000..b1e7565b6c0
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/local/save_check_conf.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+# Author: Ondrej Platek, 2013, Apache 2.0
+# Created: 09:56:45 13/03/2013
+# Modified: 09:56:45 13/03/2013
+
+if [ ! -d "$DATA_ROOT" ]; then
+  echo "You need to set the \"DATA_ROOT\" variable in path.sh to point to the directory hosting the data"
+  exit 1
+fi
+
+# Ask about REMOVING the exp and data directories
+if [ "$(ls -A exp 2>/dev/null)" ]; then
+  read -p "Directory 'exp' is NON-EMPTY. Do you want it to be OVERWRITTEN (y/n)? "
+  case $REPLY in
+    [Yy]* ) echo 'Deleting exp directory'; rm -rf exp;;
+    [Nn]* ) echo 'Keeping exp directory';;
+    * ) echo 'Keeping exp directory and cancelling...'; exit 1;;
+  esac
+fi
+
+if [ "$(ls -A data 2>/dev/null)" ]; then
+  read -p "Directory 'data' is NON-EMPTY. Do you want it to be OVERWRITTEN (y/n)? "
+  case $REPLY in
+    [Yy]* ) echo 'Deleting data directory'; rm -rf data;;
+    [Nn]* ) echo 'Reusing DATA SPLIT, LM and MFCCs. SEE THE SCRIPT!';
+        mkdir -p exp/conf  # exp may have been deleted above
+        echo 'REUSING DATA from previous experiment!' \
+            'Check that everyN is THE SAME' >> exp/conf/train_conf.sh ;;
+    * ) echo 'Keeping the data directory and cancelling...'
+        exit 1;;
+  esac
+fi
+
+if [ "$(ls -A ${MFCC_DIR} 2>/dev/null)" ]; then
+  read -p "Directory '${MFCC_DIR}' is NON-EMPTY. Do you want it to be OVERWRITTEN (y/n)? "
+  case $REPLY in
+    [Yy]* ) echo "Deleting ${MFCC_DIR}"; rm -rf "${MFCC_DIR}";;
+    [Nn]* ) echo "Reusing MFCCs at ${MFCC_DIR}!";
+        mkdir -p exp/conf  # exp may have been deleted above
+        echo 'REUSING MFCC from previous experiment!' \
+            'Check that the settings are THE SAME!' >> exp/conf/mfcc.conf
+        ;;
+    * ) echo 'Keeping the data directory and cancelling...';
+        exit 1;;
+  esac
+fi
+
+# make sure that the directories exist
+mkdir -p "$MFCC_DIR"
+mkdir -p "exp"
+mkdir -p "data"
+
+# Copy the current settings to the exp directory
+cp -r conf exp
+cp cmd.sh path.sh exp/conf
+git log -1 > exp/conf/git_log_state.log
+git diff > exp/conf/git_diff_state.log
diff --git a/egs/kaldi-vystadial-recipe/s5/local/score.sh b/egs/kaldi-vystadial-recipe/s5/local/score.sh
new file mode 100755
index 00000000000..e5737c01a65
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/local/score.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+# Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
+# Apache 2.0
+
+# begin configuration section.
+cmd=run.pl
+min_lmwt=9
+max_lmwt=20
+#end configuration section.
+
+[ -f ./path.sh ] && . ./path.sh
+. parse_options.sh || exit 1;
+
+if [ $# -ne 3 ]; then
+  echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>"
+  echo " Options:"
+  echo "    --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes."
+  echo "    --min_lmwt <int>                # minimum LM-weight for lattice rescoring "
+  echo "    --max_lmwt <int>                # maximum LM-weight for lattice rescoring "
+  exit 1;
+fi
+
+data=$1
+lang_or_graph=$2
+dir=$3
+
+symtab=$lang_or_graph/words.txt
+
+for f in $symtab $dir/lat.1.gz $data/text; do
+  [ ! -f $f ] && echo "score.sh: no such file $f" && exit 1;
+done
+
+mkdir -p $dir/scoring/log
+
+cat $data/text | sed 's:<NOISE>::g' | sed 's:<SPOKEN_NOISE>::g' > $dir/scoring/test_filt.txt
+
+$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \
+  lattice-best-path --lm-scale=LMWT --word-symbol-table=$symtab \
+    "ark:gunzip -c $dir/lat.*.gz|" ark,t:$dir/scoring/LMWT.tra || exit 1;
+
+# Note: the double level of quoting for the sed command
+$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \
+  cat $dir/scoring/LMWT.tra \| \
+    utils/int2sym.pl -f 2- $symtab \| sed 's:OOV::g' \| \
+    compute-wer --text --mode=present \
+      ark:$dir/scoring/test_filt.txt ark,p:- ">&" $dir/wer_LMWT || exit 1;
+
+# Show results
+for f in $dir/wer_*; do echo $f; egrep '(WER)|(SER)' < $f; done
+
+exit 0;
diff --git a/egs/kaldi-vystadial-recipe/s5/local/vystadial_data_prep.sh b/egs/kaldi-vystadial-recipe/s5/local/vystadial_data_prep.sh
new file mode 100755
index 00000000000..4e50619ae14
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/local/vystadial_data_prep.sh
@@ -0,0 +1,83 @@
+#!/bin/bash
+# Author: Ondrej Platek, Copyright 2012, code is without any warranty!
+# Created: 11:06:13 16/11/2012
+# Modified: 11:06:13 16/11/2012
+#
+#
+# Makes the train/test splits
+# local/voxforge_data_prep.sh --nspk_test ${nspk_test} ${SELECTED} || exit 1
+# creates the files: (TYPE=train|test)
+# a) ${TYPE}_trans.txt: ID transcription; capitalized, no punctuation
+# b) ${TYPE}_wav.scp: ID path2ID.wav
+# c) $TYPE.utt2spk: ID-recording ID-speaker
+# d) $TYPE.spk2utt
+# e) $TYPE.spk2gender  all speakers are male
+# we have ID-recording = ID-speaker
+
+renice 20 $$
+
+
+every_n=1
+[ -f path.sh ] && . ./path.sh # source the path.
+. utils/parse_options.sh || exit 1;
+
+
+msg="Usage: $0 [--every-n 30] <data-directory>";
+if [ $# -ne 1 ] ; then
+  echo "$msg"; exit 1;
+fi
+
+DATA=$1
+
+echo "=== Starting initial Vystadial data preparation ..."
+echo "--- Making test/train data split from $DATA taking every $every_n recording ..."
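+# For illustration only -- a hypothetical corpus layout this script expects
+# (the file names are made up):
+#   $DATA/train/rec_0001.wav       audio recording
+#   $DATA/train/rec_0001.wav.trn   its transcription
+# A produced line of train_wav.scp then looks like:
+#   rec_0001.wav /full/path/train/rec_0001.wav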
+
+locdata=data/local
+loctmp=$locdata/tmp
+rm -rf $loctmp >/dev/null 2>&1
+mkdir -p $locdata
+mkdir -p $loctmp
+
+i=0
+for d in test train ; do
+  ls $DATA/$d/ | sed -n /.*wav$/p |\
+  while read wav ; do
+    # echo "DEBUGGING wav: $wav"
+    ((i++)) # bash specific
+    if [ $i -ge $every_n ] ; then
+      i=0
+      pwav=$DATA/$d/$wav
+      echo "$wav $pwav" >> ${loctmp}/${d}_wav.scp.unsorted
+      echo "$wav $wav" >> ${loctmp}/${d}.utt2spk.unsorted
+      echo "$wav $wav" >> ${loctmp}/${d}.spk2utt.unsorted
+      # transcription of $wav
+      trn=`cat $DATA/$d/$wav.trn`
+      # echo "DEBUGGING trn: $trn"
+      echo "$wav $trn" >> ${loctmp}/${d}_trans.txt.unsorted
+      echo "$wav M" >> ${loctmp}/spk2gender.unsorted
+    fi
+  done # while read wav
+
+  # Sorting
+  for unsorted in _wav.scp.unsorted _trans.txt.unsorted \
+                  .spk2utt.unsorted .utt2spk.unsorted
+  do
+    u="${d}${unsorted}"
+    s=`echo "$u" | sed -e s:.unsorted::`
+    sort "${loctmp}/$u" -k1 > "${locdata}/$s"
+  done # for unsorted
+
+  #### copy to data dir ###
+  mkdir -p data/$d
+  cp $locdata/${d}_wav.scp data/$d/wav.scp || exit 1;
+  cp $locdata/${d}_trans.txt data/$d/text || exit 1;
+  cp $locdata/$d.spk2utt data/$d/spk2utt || exit 1;
+  cp $locdata/$d.utt2spk data/$d/utt2spk || exit 1;
+done # for d in test train
+
+# spk2gender covers speakers from both test and train (OK for the 1:1 utt-to-spk mapping)
+sort "${loctmp}/spk2gender.unsorted" -k1 > "${locdata}/spk2gender"
+for d in test train ; do
+  utils/filter_scp.pl data/$d/spk2utt $locdata/spk2gender > data/$d/spk2gender || exit 1;
+done
diff --git a/egs/kaldi-vystadial-recipe/s5/local/vystadial_format_data.sh b/egs/kaldi-vystadial-recipe/s5/local/vystadial_format_data.sh
new file mode 100755
index 00000000000..08d94d223dd
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/local/vystadial_format_data.sh
@@ -0,0 +1,76 @@
+#!/bin/bash
+
+# Copyright 2012 Vassil Panayotov
+# Apache 2.0
+
+source ./path.sh
+
+echo "=== Formatting train and test data ..."
+srcdir=data/local
+lmdir=data/local/
+tmpdir=data/local/lm_tmp
+lexicon=data/local/dict/lexicon.txt
+mkdir -p $tmpdir
+
+# NOTE: the copying below is already done by local/vystadial_data_prep.sh
+# for x in train test; do
+#   mkdir -p data/$x
+#   cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1;
+#   cp $srcdir/${x}_trans.txt data/$x/text || exit 1;
+#   cp $srcdir/$x.spk2utt data/$x/spk2utt || exit 1;
+#   cp $srcdir/$x.utt2spk data/$x/utt2spk || exit 1;
+#   utils/filter_scp.pl data/$x/spk2utt $srcdir/spk2gender > data/$x/spk2gender || exit 1;
+# done
+
+
+# Next, for each type of language model, create the corresponding FST
+# and the corresponding lang_test_* directory.
+
+echo "--- Preparing the grammar transducer (G.fst) for testing ..."
+
+test=data/lang_test
+mkdir -p $test
+for f in phones.txt words.txt L.fst L_disambig.fst phones/; do
+  cp -r data/lang/$f $test
+done
+cat $lmdir/lm.arpa | \
+  utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs.txt
+
+# grep -v '<s> <s>' because the LM seems to have some strange and useless
+# stuff in it with multiple <s>'s in the history.  Encountered some other similar
+# things in a LM from Geoff.  Removing all "illegal" combinations of <s> and </s>,
+# which are supposed to occur only at begin/end of utt.  These can cause
+# determinization failures of CLG [ends up being epsilon cycles].
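+# For illustration, hypothetical ARPA entries that the three greps below drop
+# (the weights are made up):
+#   -2.5361 <s> <s>   -0.4771
+#   -3.0103 </s> <s>
+#   -2.8451 </s> </s>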
+cat $lmdir/lm.arpa | \
+  grep -v '<s> <s>' | \
+  grep -v '</s> <s>' | \
+  grep -v '</s> </s>' | \
+  arpa2fst - | fstprint | \
+  utils/remove_oovs.pl $tmpdir/oovs.txt | \
+  utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
+    --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
+  fstrmepsilon > $test/G.fst
+fstisstochastic $test/G.fst
+# The output is like:
+# 9.14233e-05 -0.259833
+# we do expect the first of these 2 numbers to be close to zero (the second is
+# nonzero because the backoff weights make the states sum to >1).
+# Because of the <s> fiasco for these particular LMs, the first number is not
+# as close to zero as it could be.
+
+# Everything below is only for diagnostics.
+# Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
+# this might cause determinization failure of CLG.
+# #0 is treated as an empty word.
+mkdir -p $tmpdir/g
+awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \
+  < "$lexicon" > $tmpdir/g/select_empty.fst.txt
+fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt \
+  $tmpdir/g/select_empty.fst.txt | \
+fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
+fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
+  echo "Language model has cycles with empty words" && exit 1
+rm -rf $tmpdir
+
+echo "*** Succeeded in formatting data."
+
diff --git a/egs/kaldi-vystadial-recipe/s5/local/vystadial_prepare_dict.sh b/egs/kaldi-vystadial-recipe/s5/local/vystadial_prepare_dict.sh
new file mode 100755
index 00000000000..bd916013af3
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/local/vystadial_prepare_dict.sh
@@ -0,0 +1,91 @@
+#!/bin/bash
+
+# Copyright 2012 Vassil Panayotov
+# Apache 2.0
+
+renice 20 $$
+
+locdata=data/local
+locdict=$locdata/dict
+
+echo "=== Preparing the dictionary ..."
+
+if [ ! -f $locdict/cmudict/cmudict.0.7a ]; then
+  echo "--- Downloading CMU dictionary ..."
+  mkdir -p $locdict
+  svn co http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict \
+    $locdict/cmudict || exit 1;
+fi
+
+echo "--- Stripping stress and pronunciation variant markers from cmudict ..."
+perl $locdict/cmudict/scripts/make_baseform.pl \
+  $locdict/cmudict/cmudict.0.7a /dev/stdout |\
+  sed -e 's:^\([^\s(]\+\)([0-9]\+)\(\s\+\)\(.*\):\1\2\3:' > $locdict/cmudict-plain.txt
+
+echo "--- Searching for OOV words ..."
+gawk 'NR==FNR{words[$1]; next;} !($1 in words)' \
+  $locdict/cmudict-plain.txt $locdata/vocab-full.txt |\
+  egrep -v '<.?s>' > $locdict/vocab-oov.txt
+
+gawk 'NR==FNR{words[$1]; next;} ($1 in words)' \
+  $locdata/vocab-full.txt $locdict/cmudict-plain.txt |\
+  egrep -v '<.?s>' > $locdict/lexicon-iv.txt
+
+wc -l $locdict/vocab-oov.txt
+wc -l $locdict/lexicon-iv.txt
+
+### BEGIN SKIPPING GENERATION OF PRONUNCIATIONS FOR OOV WORDS ####
+# pyver=`python --version 2>&1 | sed -e 's:.*\([2-3]\.[0-9]\+\).*:\1:g'`
+# if [ ! -f tools/g2p/lib/python${pyver}/site-packages/g2p.py ]; then
+#   echo "--- Downloading Sequitur G2P ..."
+#   echo "NOTE: it assumes that you have Python, NumPy and SWIG installed on your system!"
+#   wget -P tools http://www-i6.informatik.rwth-aachen.de/web/Software/g2p-r1668.tar.gz
+#   tar xf tools/g2p-r1668.tar.gz -C tools
+#   cd tools/g2p
+#   echo '#include ' >> Utility.hh # won't compile on my system w/o this "patch"
+#   python setup.py install --prefix=.
+#   cd ../..
+#   if [ ! -f tools/g2p/lib/python${pyver}/site-packages/g2p.py ]; then
+#     echo "Sequitur G2P is not found - installation failed?"
+#     exit 1
+#   fi
+# fi
+#
+# if [ ! -f conf/g2p_model ]; then
+#   echo "--- Downloading a pre-trained Sequitur G2P model ..."
+#   wget http://sourceforge.net/projects/kaldi/files/sequitur-model4 -O conf/g2p_model
+#   if [ ! -f conf/g2p_model ]; then
+#     echo "Failed to download the g2p model!"
+#     exit 1
+#   fi
+# fi
+#
+# echo "--- Preparing pronunciations for OOV words ..."
+# python tools/g2p/lib/python${pyver}/site-packages/g2p.py \
+#   --model=conf/g2p_model --apply $locdict/vocab-oov.txt > $locdict/lexicon-oov.txt
+
+# HANDLING OOV WORDS: the unknown word 'OOV' gets the pronunciation SPN (SPoken Noise)
+echo "OOV SPN" > $locdict/lexicon-oov.txt
+echo "_INHALE_ SPN" >> $locdict/lexicon-oov.txt
+echo "_LAUGH_ SPN" >> $locdict/lexicon-oov.txt
+echo "_EHM_HMM_ SPN" >> $locdict/lexicon-oov.txt
+echo "_NOISE_ SPN" >> $locdict/lexicon-oov.txt
+
+cat $locdict/lexicon-oov.txt $locdict/lexicon-iv.txt |\
+  sort > $locdict/lexicon.txt
+
+echo "--- Preparing the phone lists ..."
+echo SIL > $locdict/silence_phones.txt
+echo _SIL_ >> $locdict/silence_phones.txt
+echo SIL > $locdict/optional_silence.txt
+grep -v -w sil $locdict/lexicon.txt | \
+  awk '{for(n=2;n<=NF;n++) { p[$n]=1; }} END{for(x in p) {print x}}' |\
+  sort > $locdict/nonsilence_phones.txt
+
+echo "--- Adding SIL to the lexicon ..."
+echo -e "!SIL\tSIL" >> $locdict/lexicon.txt
+
+# Some downstream scripts expect this file to exist, even if empty
+touch $locdict/extra_questions.txt
+
+echo "*** Dictionary preparation finished!"
diff --git a/egs/kaldi-vystadial-recipe/s5/logs/README b/egs/kaldi-vystadial-recipe/s5/logs/README
new file mode 100644
index 00000000000..5eecd9f8e69
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/logs/README
@@ -0,0 +1 @@
+Directory to store logs
diff --git a/egs/kaldi-vystadial-recipe/s5/path.sh b/egs/kaldi-vystadial-recipe/s5/path.sh
new file mode 100755
index 00000000000..4b50cebbb7e
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/path.sh
@@ -0,0 +1,16 @@
+# The number of parallel jobs to be started for some parts of the recipe
+# Make sure you have enough resources (CPUs and RAM) to accommodate this number of jobs
+njobs=10
+
+# Needed for "correct" sorting
+export LC_ALL=C
+
+export KALDI_ROOT=`pwd`/../../..
+export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$PWD:$PATH
+
+
+# Vystadial data: 1 channel, 16000 Hz, 16-bit
+export DATA_ROOT="./data_voip_en"
+
+# Storage dir for MFCCs. Needs a lot of space.
+export MFCC_DIR=./mfcc
diff --git a/egs/kaldi-vystadial-recipe/s5/run.sh b/egs/kaldi-vystadial-recipe/s5/run.sh
new file mode 100755
index 00000000000..9c692fe5f26
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/run.sh
@@ -0,0 +1,201 @@
+#!/bin/bash
+renice 20 $$
+
+# Copyright Ondrej Platek, Apache 2.0
+# based on the copyrighted 2012 Vassil Panayotov recipe
+# at egs/voxforge/s5/run.sh (Apache 2.0)
+
+. ./path.sh
+
+# If you have a cluster of machines running GridEngine you may want to
+# change the train and decode commands in the file below
+. ./cmd.sh
+
+# Load a few variables for changing the parameters of the training
+. ./conf/train_conf.sh
+
+# Copy the configuration files to the exp directory.
+# Writes WARNINGs into exp if reusing settings from another experiment!
+local/save_check_conf.sh || exit 1;
+
+if [ ! "$(ls -A data 2>/dev/null)" ]; then
"$(ls -A data 2>/dev/null)" ]; then + + # local/voxforge_data_prep.sh --nspk_test ${nspk_test} ${SELECTED} || exit 1 + local/vystadial_data_prep.sh --every_n $everyN ${DATA_ROOT} || exit 1 + + # prepare an ARPA LM and wordlist + mkdir -p data/local + # LEAVING it with OOV -> Allow train Kaldi for OOV model + # cp -f ${DATA_ROOT}/arpa_trigram data/local/lm.arpa + # NOT ALLOWING OOV WORDS training & also in decoding + grep -v -w OOV ${DATA_ROOT}/arpa_trigram > data/local/lm.arpa + echo '' > data/local/vocab-full.txt + tail -n +3 ${DATA_ROOT}/classic.v3.dct | cut -d ' ' -f 1 |\ + sort | uniq >> data/local/vocab-full.txt + + # Prepare the lexicon and various phone lists + # DISABLED Sequitor model: Pronunciations for OOV words are obtained using a pre-trained Sequitur model + local/vystadial_prepare_dict.sh || exit 1 + + # Prepare data/lang and data/local/lang directories read it IO param describtion + utils/prepare_lang.sh data/local/dict 'OOV' data/local/lang data/lang || exit 1 + + # Prepare G.fst and data/{train,test} directories + local/vystadial_format_data.sh || exit 1 +fi +# end of generating data directory + + +###### TRAINING SETTINGS ####### + +# if ${MFCC_DIR} is empty then generate the content +if [ ! "$(ls -A ${MFCC_DIR} 2>/dev/null)" ]; then + # Creating MFCC features and storing at ${MFCC_DIR} (Could be large). + for x in train test ; do + steps/make_mfcc.sh --cmd "$train_cmd" --nj $njobs \ + data/$x exp/make_mfcc/$x ${MFCC_DIR} || exit 1; + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x ${MFCC_DIR} || exit 1; + done +fi + + +# Train monophone models on a subset of the data +utils/subset_data_dir.sh data/train $monoTrainData data/train.1k || exit 1; +steps/train_mono.sh --nj $njobs --cmd "$train_cmd" data/train.1k data/lang exp/mono || exit 1; + +# Monophone decoding +utils/mkgraph.sh --mono data/lang_test exp/mono exp/mono/graph || exit 1 +# note: local/decode.sh calls the command line once for each +# test, and afterwards averages the WERs into (in this case +# exp/mono/decode/ +steps/decode.sh --config conf/decode.config --nj $njobs --cmd "$decode_cmd" \ + exp/mono/graph data/test exp/mono/decode + +# Get alignments from monophone system. 
+steps/align_si.sh --nj $njobs --cmd "$train_cmd" \ + data/train data/lang exp/mono exp/mono_ali || exit 1; + +# train tri1 [first triphone pass] +steps/train_deltas.sh --cmd "$train_cmd" \ + $pdf $gauss data/train data/lang exp/mono_ali exp/tri1 || exit 1; + +# decode tri1 +utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph || exit 1; +steps/decode.sh --config conf/decode.config --nj $njobs --cmd "$decode_cmd" \ + exp/tri1/graph data/test exp/tri1/decode + +# draw-tree data/lang/phones.txt exp/tri1/tree | dot -Tps -Gsize=8,10.5 | ps2pdf - tree.pdf + +#align tri1 +steps/align_si.sh --nj $njobs --cmd "$train_cmd" \ + --use-graphs true data/train data/lang exp/tri1 exp/tri1_ali || exit 1; + +# train tri2a [delta+delta-deltas] +steps/train_deltas.sh --cmd "$train_cmd" $pdf $gauss \ + data/train data/lang exp/tri1_ali exp/tri2a || exit 1; + +# decode tri2a +utils/mkgraph.sh data/lang_test exp/tri2a exp/tri2a/graph +steps/decode.sh --config conf/decode.config --nj $njobs --cmd "$decode_cmd" \ + exp/tri2a/graph data/test exp/tri2a/decode + +# train and decode tri2b [LDA+MLLT] +steps/train_lda_mllt.sh --cmd "$train_cmd" $pdf $gauss \ + data/train data/lang exp/tri1_ali exp/tri2b || exit 1; +utils/mkgraph.sh data/lang_test exp/tri2b exp/tri2b/graph +steps/decode.sh --config conf/decode.config --nj $njobs --cmd "$decode_cmd" \ + exp/tri2b/graph data/test exp/tri2b/decode + +# Align all data with LDA+MLLT system (tri2b) +steps/align_si.sh --nj $njobs --cmd "$train_cmd" \ + --use-graphs true data/train data/lang exp/tri2b exp/tri2b_ali || exit 1; + +# Do MMI on top of LDA+MLLT. +steps/make_denlats.sh --nj $njobs --cmd "$train_cmd" \ + data/train data/lang exp/tri2b exp/tri2b_denlats || exit 1; +steps/train_mmi.sh data/train data/lang exp/tri2b_ali exp/tri2b_denlats exp/tri2b_mmi || exit 1; +steps/decode.sh --config conf/decode.config --iter 4 --nj $njobs --cmd "$decode_cmd" \ + exp/tri2b/graph data/test exp/tri2b_mmi/decode_it4 +steps/decode.sh --config conf/decode.config --iter 3 --nj $njobs --cmd "$decode_cmd" \ + exp/tri2b/graph data/test exp/tri2b_mmi/decode_it3 + +# Do the same with boosting. train_mmi_boost is a number e.g. 0.05 +steps/train_mmi.sh --boost ${train_mmi_boost} data/train data/lang \ + exp/tri2b_ali exp/tri2b_denlats exp/tri2b_mmi_b${train_mmi_boost} || exit 1; +steps/decode.sh --config conf/decode.config --iter 4 --nj $njobs --cmd "$decode_cmd" \ + exp/tri2b/graph data/test exp/tri2b_mmi_b${train_mmi_boost}/decode_it4 || exit 1; +steps/decode.sh --config conf/decode.config --iter 3 --nj $njobs --cmd "$decode_cmd" \ + exp/tri2b/graph data/test exp/tri2b_mmi_b${train_mmi_boost}/decode_it3 || exit 1; + +# Do MPE. +steps/train_mpe.sh data/train data/lang exp/tri2b_ali exp/tri2b_denlats exp/tri2b_mpe || exit 1; +steps/decode.sh --config conf/decode.config --iter 4 --nj $njobs --cmd "$decode_cmd" \ + exp/tri2b/graph data/test exp/tri2b_mpe/decode_it4 || exit 1; +steps/decode.sh --config conf/decode.config --iter 3 --nj $njobs --cmd "$decode_cmd" \ + exp/tri2b/graph data/test exp/tri2b_mpe/decode_it3 || exit 1; + + +# Do LDA+MLLT+SAT, and decode. 
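+# (Note: SAT estimates per-speaker fMLLR transforms during training, so the
+# resulting tri3b models are decoded with the two-pass steps/decode_fmllr.sh
+# below rather than the plain steps/decode.sh.)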
+steps/train_sat.sh $pdf $gauss data/train data/lang exp/tri2b_ali exp/tri3b || exit 1;
+utils/mkgraph.sh data/lang_test exp/tri3b exp/tri3b/graph || exit 1;
+steps/decode_fmllr.sh --config conf/decode.config --nj $njobs --cmd "$decode_cmd" \
+  exp/tri3b/graph data/test exp/tri3b/decode || exit 1;
+
+
+# Align all data with the LDA+MLLT+SAT system (tri3b)
+steps/align_fmllr.sh --nj $njobs --cmd "$train_cmd" --use-graphs true \
+  data/train data/lang exp/tri3b exp/tri3b_ali || exit 1;
+
+# MMI on top of tri3b (i.e. LDA+MLLT+SAT+MMI)
+steps/make_denlats.sh --config conf/decode.config \
+  --nj $njobs --cmd "$train_cmd" --transform-dir exp/tri3b_ali \
+  data/train data/lang exp/tri3b exp/tri3b_denlats || exit 1;
+steps/train_mmi.sh data/train data/lang exp/tri3b_ali exp/tri3b_denlats exp/tri3b_mmi || exit 1;
+
+steps/decode_fmllr.sh --config conf/decode.config --nj $njobs --cmd "$decode_cmd" \
+  --alignment-model exp/tri3b/final.alimdl --adapt-model exp/tri3b/final.mdl \
+  exp/tri3b/graph data/test exp/tri3b_mmi/decode || exit 1;
+
+# Do a decoding that uses the exp/tri3b/decode directory to get transforms from.
+steps/decode.sh --config conf/decode.config --nj $njobs --cmd "$decode_cmd" \
+  --transform-dir exp/tri3b/decode exp/tri3b/graph data/test exp/tri3b_mmi/decode2 || exit 1;
+
+
+# First, train a UBM for the fMMI experiments.
+steps/train_diag_ubm.sh --silence-weight 0.5 --nj $njobs --cmd "$train_cmd" \
+  250 data/train data/lang exp/tri3b_ali exp/dubm3b
+
+# Next, various fMMI+MMI configurations.
+steps/train_mmi_fmmi.sh --learning-rate 0.0025 \
+  --boost 0.1 --cmd "$train_cmd" data/train data/lang exp/tri3b_ali exp/dubm3b exp/tri3b_denlats \
+  exp/tri3b_fmmi_b || exit 1;
+
+for iter in 3 4 5 6 7 8; do
+  steps/decode_fmmi.sh --nj $njobs --config conf/decode.config --cmd "$decode_cmd" --iter $iter \
+    --transform-dir exp/tri3b/decode exp/tri3b/graph data/test exp/tri3b_fmmi_b/decode_it$iter &
+done
+
+steps/train_mmi_fmmi.sh --learning-rate 0.001 \
+  --boost 0.1 --cmd "$train_cmd" data/train data/lang exp/tri3b_ali exp/dubm3b exp/tri3b_denlats \
+  exp/tri3b_fmmi_c || exit 1;
+
+for iter in 3 4 5 6 7 8; do
+  steps/decode_fmmi.sh --nj $njobs --config conf/decode.config --cmd "$decode_cmd" --iter $iter \
+    --transform-dir exp/tri3b/decode exp/tri3b/graph data/test exp/tri3b_fmmi_c/decode_it$iter &
+done
+
+# For the indirect version, use twice the learning rate.
+steps/train_mmi_fmmi_indirect.sh --learning-rate 0.002 --schedule "fmmi fmmi fmmi fmmi mmi mmi mmi mmi" \
+  --boost 0.1 --cmd "$train_cmd" data/train data/lang exp/tri3b_ali exp/dubm3b exp/tri3b_denlats \
+  exp/tri3b_fmmi_d || exit 1;
+
+for iter in 3 4 5 6 7 8; do
+  steps/decode_fmmi.sh --nj $njobs --config conf/decode.config --cmd "$decode_cmd" --iter $iter \
+    --transform-dir exp/tri3b/decode exp/tri3b/graph data/test exp/tri3b_fmmi_d/decode_it$iter &
+done
+
+# SKIPPING the mixture-splitting and speaker-dependent (SGMM) setups.
+# You don't have to run all 3 of the below, e.g. you can just run run_sgmm2x.sh:
+# local/run_sgmm.sh
+# local/run_sgmm2.sh
+# local/run_sgmm2x.sh
diff --git a/egs/kaldi-vystadial-recipe/s5/steps/align_fmllr.sh b/egs/kaldi-vystadial-recipe/s5/steps/align_fmllr.sh
new file mode 100755
index 00000000000..937c61010f1
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/steps/align_fmllr.sh
@@ -0,0 +1,147 @@
+#!/bin/bash
+# Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
+# Apache 2.0
+
+# Computes training alignments; assumes features are (LDA+MLLT or delta+delta-delta)
+# It first computes an alignment with the final.alimdl (or the final.mdl if final.alimdl
+# is not present), then does 2 iterations of fMLLR estimation.
+
+# If you supply the --use-graphs option, it will use the training
+# graphs from the source directory (where the model is). In this
+# case the number of jobs must match the source directory.
+
+
+# Begin configuration section.
+stage=0
+nj=4
+cmd=run.pl
+use_graphs=false
+# Begin configuration.
+scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+beam=10
+retry_beam=40
+boost_silence=1.0 # factor by which to boost silence during alignment.
+fmllr_update_type=full
+# End configuration options.
+
+echo "$0 $@"  # Print the command line for logging
+
+[ -f path.sh ] && . ./path.sh # source the path.
+. parse_options.sh || exit 1;
+
+if [ $# != 4 ]; then
+  echo "usage: steps/align_fmllr.sh <data-dir> <lang-dir> <src-dir> <align-dir>"
+  echo "e.g.:  steps/align_fmllr.sh data/train data/lang exp/tri1 exp/tri1_ali"
+  echo "main options (for others, see top of script file)"
+  echo "  --config <config-file>                           # config containing options"
+  echo "  --nj <nj>                                        # number of parallel jobs"
+  echo "  --use-graphs true                                # use graphs in src-dir"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --fmllr-update-type (full|diag|offset|none)      # default full."
+  exit 1;
+fi
+
+data=$1
+lang=$2
+srcdir=$3
+dir=$4
+
+oov=`cat $lang/oov.int` || exit 1;
+silphonelist=`cat $lang/phones/silence.csl` || exit 1;
+sdata=$data/split$nj
+
+mkdir -p $dir/log
+echo $nj > $dir/num_jobs
+[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
+
+cp $srcdir/{tree,final.mdl} $dir || exit 1;
+cp $srcdir/final.occs $dir;
+splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
+cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options.
+
+
+if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
+echo "$0: feature type is $feat_type"
+
+case $feat_type in
+  delta) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
+  lda) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
+    cp $srcdir/final.mat $dir
+    ;;
+  *) echo "Invalid feature type $feat_type" && exit 1;
+esac
+
+## Set up model and alignment model.
+mdl=$srcdir/final.mdl
+if [ -f $srcdir/final.alimdl ]; then
+  alimdl=$srcdir/final.alimdl
+else
+  alimdl=$srcdir/final.mdl
+fi
+[ ! -f $mdl ] && echo "$0: no such model $mdl" && exit 1;
+alimdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $alimdl - |"
+mdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $mdl - |"
+
+
+## Work out where we're getting the graphs from.
+if $use_graphs; then
+  [ "$nj" != "`cat $srcdir/num_jobs`" ] && \
+    echo "$0: you specified --use-graphs true, but #jobs mismatch." && exit 1;
+  [ !
-f $srcdir/fsts.1.gz ] && echo "No graphs in $srcdir" && exit 1; + graphdir=$srcdir +else + graphdir=$dir + if [ $stage -le 0 ]; then + echo "$0: compiling training graphs" + tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|"; + $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \ + compile-train-graphs $dir/tree $dir/final.mdl $lang/L.fst "$tra" \ + "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1; + fi +fi + + +if [ $stage -le 1 ]; then + echo "$0: aligning data in $data using $alimdl and speaker-independent features." + $cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \ + gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$alimdl_cmd" \ + "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$sifeats" "ark:|gzip -c >$dir/pre_ali.JOB.gz" || exit 1; +fi + +if [ $stage -le 2 ]; then + echo "$0: computing fMLLR transforms" + if [ "$alimdl" != "$mdl" ]; then + $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \ + ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \ + weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \ + gmm-post-to-gpost $alimdl "$sifeats" ark:- ark:- \| \ + gmm-est-fmllr-gpost --fmllr-update-type=$fmllr_update_type \ + --spk2utt=ark:$sdata/JOB/spk2utt $mdl "$sifeats" \ + ark,s,cs:- ark:$dir/trans.JOB || exit 1; + else + $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \ + ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \ + weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \ + gmm-est-fmllr --fmllr-update-type=$fmllr_update_type \ + --spk2utt=ark:$sdata/JOB/spk2utt $mdl "$sifeats" \ + ark,s,cs:- ark:$dir/trans.JOB || exit 1; + fi +fi + +feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |" + +if [ $stage -le 3 ]; then + echo "$0: doing final alignment." + $cmd JOB=1:$nj $dir/log/align_pass2.JOB.log \ + gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$mdl_cmd" \ + "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; +fi + +rm $dir/pre_ali.*.gz + +echo "$0: done aligning data." + +utils/summarize_warnings.pl $dir/log + +exit 0; diff --git a/egs/kaldi-vystadial-recipe/s5/steps/align_sgmm.sh b/egs/kaldi-vystadial-recipe/s5/steps/align_sgmm.sh new file mode 100755 index 00000000000..6bc58dfa2b0 --- /dev/null +++ b/egs/kaldi-vystadial-recipe/s5/steps/align_sgmm.sh @@ -0,0 +1,193 @@ +#!/bin/bash +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 + +# Computes training alignments and (if needed) speaker-vectors, given an +# SGMM system. If the system is built on top of SAT, you should supply +# transforms with the --transform-dir option. + +# If you supply the --use-graphs option, it will use the training +# graphs from the source directory. + +# Begin configuration section. +stage=0 +nj=4 +cmd=run.pl +use_graphs=false # use graphs from srcdir +use_gselect=false # use gselect info from srcdir [regardless, we use + # Gaussian-selection info, we might have to compute it though.] +gselect=15 # Number of Gaussian-selection indices for SGMMs. +# Begin configuration. +scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +beam=10 +retry_beam=40 +transform_dir= # directory to find fMLLR transforms in. +# End configuration options. + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh # source the path. +. 
parse_options.sh || exit 1; + +if [ $# != 4 ]; then + echo "usage: steps/align_sgmm.sh " + echo "e.g.: steps/align_sgmm.sh --transform-dir exp/tri3b data/train data/lang \\" + echo " exp/sgmm4a exp/sgmm5a_ali" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --use-graphs true # use graphs in src-dir" + echo " --transform-dir # directory to find fMLLR transforms" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + exit 1; +fi + +data=$1 +lang=$2 +srcdir=$3 +dir=$4 + +oov=`cat $lang/oov.int` || exit 1; +silphonelist=`cat $lang/phones/silence.csl` || exit 1; +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options. +sdata=$data/split$nj + +mkdir -p $dir/log +cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options. +echo $nj > $dir/num_jobs +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; + +cp $srcdir/{tree,final.mdl} $dir || exit 1; +[ -f $srcdir/final.alimdl ] && cp $srcdir/final.alimdl $dir +cp $srcdir/final.occs $dir; + +## Set up features. +if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "$0: feature type is $feat_type" + +case $feat_type in + delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" + cp $srcdir/final.mat $dir + ;; + *) echo "Invalid feature type $feat_type" && exit 1; +esac +if [ ! -z "$transform_dir" ]; then + echo "$0: using transforms from $transform_dir" + [ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1; + [ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \ + && echo "$0: #jobs mismatch with transform-dir." && exit 1; + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |" +elif grep 'transform-feats --utt2spk' $srcdir/log/acc.0.1.log 2>/dev/null; then + echo "$0: **WARNING**: you seem to be using an SGMM system trained with transforms," + echo " but you are not providing the --transform-dir option during alignment." +fi +## + +## Set up model and alignment model. +mdl=$srcdir/final.mdl +if [ -f $srcdir/final.alimdl ]; then + alimdl=$srcdir/final.alimdl +else + alimdl=$srcdir/final.mdl +fi +[ ! -f $mdl ] && echo "$0: no such model $mdl" && exit 1; + +## Work out where we're getting the graphs from. +if $use_graphs; then + [ "$nj" != "`cat $srcdir/num_jobs`" ] && \ + echo "$0: you specified --use-graphs true, but #jobs mismatch." && exit 1; + [ ! -f $srcdir/fsts.1.gz ] && echo "No graphs in $srcdir" && exit 1; + graphdir=$srcdir + ln.pl $srcdir/fsts.*.gz $dir +else + graphdir=$dir + if [ $stage -le 0 ]; then + echo "$0: compiling training graphs" + tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|"; + $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \ + compile-train-graphs $dir/tree $dir/final.mdl $lang/L.fst "$tra" \ + "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1; + fi +fi + +## Work out where we're getting the Gaussian-selection info from +if $use_gselect; then + [ "$nj" != "`cat $srcdir/num_jobs`" ] && \ + echo "$0: you specified --use-gselect true, but #jobs mismatch." && exit 1; + [ ! 
-f $srcdir/gselect.1.gz ] && echo "No gselect info in $srcdir" && exit 1; + graphdir=$srcdir + gselect_opt="--gselect=ark:gunzip -c $srcdir/gselect.JOB.gz|" + ln.pl $srcdir/gselect.*.gz $dir +else + graphdir=$dir + if [ $stage -le 1 ]; then + echo "$0: computing Gaussian-selection info" + # Note: doesn't matter whether we use $alimdl or $mdl, they will + # have the same gselect info. + $cmd JOB=1:$nj $dir/log/gselect.JOB.log \ + sgmm-gselect --full-gmm-nbest=$gselect $alimdl \ + "$feats" "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1; + fi + gselect_opt="--gselect=ark:gunzip -c $dir/gselect.JOB.gz|" +fi + + +if [ $alimdl == $mdl ]; then + # Speaker-independent decoding-- just one pass. Not normal. + T=`sgmm-info $mdl | grep 'speaker vector space' | awk '{print $NF}'` || exit 1; + [ "$T" -ne 0 ] && echo "No alignment model, yet speaker vector space nonempty" && exit 1; + + if [ $stage -le 2 ]; then + echo "$0: aligning data in $data using model $mdl (no speaker-vectors)" + $cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \ + sgmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam $alimdl \ + "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; + fi + echo "$0: done aligning data." + exit 0; +fi + +# Continue with system with speaker vectors. +if [ $stage -le 2 ]; then + echo "$0: aligning data in $data using model $alimdl" + $cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \ + sgmm-align-compiled $scale_opts "$gselect_opt" --beam=$beam --retry-beam=$retry_beam $alimdl \ + "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/pre_ali.JOB.gz" || exit 1; +fi + +if [ $stage -le 3 ]; then + echo "$0: computing speaker vectors (1st pass)" + $cmd JOB=1:$nj $dir/log/spk_vecs1.JOB.log \ + ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \ + weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \ + sgmm-post-to-gpost "$gselect_opt" $alimdl "$feats" ark:- ark:- \| \ + sgmm-est-spkvecs-gpost --spk2utt=ark:$sdata/JOB/spk2utt \ + $mdl "$feats" ark,s,cs:- ark:$dir/pre_vecs.JOB || exit 1; +fi + +if [ $stage -le 4 ]; then + echo "$0: computing speaker vectors (2nd pass)" + $cmd JOB=1:$nj $dir/log/spk_vecs2.JOB.log \ + ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \ + weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \ + sgmm-est-spkvecs --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" \ + --spk-vecs=ark:$dir/pre_vecs.JOB $mdl "$feats" ark,s,cs:- ark:$dir/vecs.JOB || exit 1; + rm $dir/pre_vecs.* +fi + +if [ $stage -le 5 ]; then + echo "$0: doing final alignment." + $cmd JOB=1:$nj $dir/log/align_pass2.JOB.log \ + sgmm-align-compiled $scale_opts "$gselect_opt" --beam=$beam --retry-beam=$retry_beam \ + --utt2spk=ark:$sdata/JOB/utt2spk --spk-vecs=ark:$dir/vecs.JOB \ + $mdl "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; +fi + +rm $dir/pre_ali.*.gz + +echo "$0: done aligning data." + +utils/summarize_warnings.pl $dir/log + +exit 0; diff --git a/egs/kaldi-vystadial-recipe/s5/steps/align_sgmm2.sh b/egs/kaldi-vystadial-recipe/s5/steps/align_sgmm2.sh new file mode 100755 index 00000000000..58af0677b8c --- /dev/null +++ b/egs/kaldi-vystadial-recipe/s5/steps/align_sgmm2.sh @@ -0,0 +1,193 @@ +#!/bin/bash +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 + +# Computes training alignments and (if needed) speaker-vectors, given an +# SGMM system. If the system is built on top of SAT, you should supply +# transforms with the --transform-dir option. 
+ +# If you supply the --use-graphs option, it will use the training +# graphs from the source directory. + +# Begin configuration section. +stage=0 +nj=4 +cmd=run.pl +use_graphs=false # use graphs from srcdir +use_gselect=false # use gselect info from srcdir [regardless, we use + # Gaussian-selection info, we might have to compute it though.] +gselect=15 # Number of Gaussian-selection indices for SGMMs. +# Begin configuration. +scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +beam=10 +retry_beam=40 +transform_dir= # directory to find fMLLR transforms in. +# End configuration options. + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh # source the path. +. parse_options.sh || exit 1; + +if [ $# != 4 ]; then + echo "usage: steps/align_sgmm.sh " + echo "e.g.: steps/align_sgmm.sh --transform-dir exp/tri3b data/train data/lang \\" + echo " exp/sgmm4a exp/sgmm5a_ali" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --use-graphs true # use graphs in src-dir" + echo " --transform-dir # directory to find fMLLR transforms" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + exit 1; +fi + +data=$1 +lang=$2 +srcdir=$3 +dir=$4 + +oov=`cat $lang/oov.int` || exit 1; +silphonelist=`cat $lang/phones/silence.csl` || exit 1; +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options. +sdata=$data/split$nj + +mkdir -p $dir/log +cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options. +echo $nj > $dir/num_jobs +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; + +cp $srcdir/{tree,final.mdl} $dir || exit 1; +[ -f $srcdir/final.alimdl ] && cp $srcdir/final.alimdl $dir +cp $srcdir/final.occs $dir; + +## Set up features. +if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "$0: feature type is $feat_type" + +case $feat_type in + delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" + cp $srcdir/final.mat $dir + ;; + *) echo "Invalid feature type $feat_type" && exit 1; +esac +if [ ! -z "$transform_dir" ]; then + echo "$0: using transforms from $transform_dir" + [ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1; + [ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \ + && echo "$0: #jobs mismatch with transform-dir." && exit 1; + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |" +elif grep 'transform-feats --utt2spk' $srcdir/log/acc.0.1.log 2>/dev/null; then + echo "$0: **WARNING**: you seem to be using an SGMM system trained with transforms," + echo " but you are not providing the --transform-dir option during alignment." +fi +## + +## Set up model and alignment model. +mdl=$srcdir/final.mdl +if [ -f $srcdir/final.alimdl ]; then + alimdl=$srcdir/final.alimdl +else + alimdl=$srcdir/final.mdl +fi +[ ! -f $mdl ] && echo "$0: no such model $mdl" && exit 1; + +## Work out where we're getting the graphs from. 
+if $use_graphs; then + [ "$nj" != "`cat $srcdir/num_jobs`" ] && \ + echo "$0: you specified --use-graphs true, but #jobs mismatch." && exit 1; + [ ! -f $srcdir/fsts.1.gz ] && echo "No graphs in $srcdir" && exit 1; + graphdir=$srcdir + ln.pl $srcdir/fsts.*.gz $dir +else + graphdir=$dir + if [ $stage -le 0 ]; then + echo "$0: compiling training graphs" + tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|"; + $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \ + compile-train-graphs $dir/tree $dir/final.mdl $lang/L.fst "$tra" \ + "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1; + fi +fi + +## Work out where we're getting the Gaussian-selection info from +if $use_gselect; then + [ "$nj" != "`cat $srcdir/num_jobs`" ] && \ + echo "$0: you specified --use-gselect true, but #jobs mismatch." && exit 1; + [ ! -f $srcdir/gselect.1.gz ] && echo "No gselect info in $srcdir" && exit 1; + graphdir=$srcdir + gselect_opt="--gselect=ark:gunzip -c $srcdir/gselect.JOB.gz|" + ln.pl $srcdir/gselect.*.gz $dir +else + graphdir=$dir + if [ $stage -le 1 ]; then + echo "$0: computing Gaussian-selection info" + # Note: doesn't matter whether we use $alimdl or $mdl, they will + # have the same gselect info. + $cmd JOB=1:$nj $dir/log/gselect.JOB.log \ + sgmm2-gselect --full-gmm-nbest=$gselect $alimdl \ + "$feats" "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1; + fi + gselect_opt="--gselect=ark:gunzip -c $dir/gselect.JOB.gz|" +fi + + +if [ $alimdl == $mdl ]; then + # Speaker-independent decoding-- just one pass. Not normal. + T=`sgmm2-info $mdl | grep 'speaker vector space' | awk '{print $NF}'` || exit 1; + [ "$T" -ne 0 ] && echo "No alignment model, yet speaker vector space nonempty" && exit 1; + + if [ $stage -le 2 ]; then + echo "$0: aligning data in $data using model $mdl (no speaker-vectors)" + $cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \ + sgmm2-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam $alimdl \ + "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; + fi + echo "$0: done aligning data." + exit 0; +fi + +# Continue with system with speaker vectors. +if [ $stage -le 2 ]; then + echo "$0: aligning data in $data using model $alimdl" + $cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \ + sgmm2-align-compiled $scale_opts "$gselect_opt" --beam=$beam --retry-beam=$retry_beam $alimdl \ + "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/pre_ali.JOB.gz" || exit 1; +fi + +if [ $stage -le 3 ]; then + echo "$0: computing speaker vectors (1st pass)" + $cmd JOB=1:$nj $dir/log/spk_vecs1.JOB.log \ + ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \ + weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \ + sgmm2-post-to-gpost "$gselect_opt" $alimdl "$feats" ark:- ark:- \| \ + sgmm2-est-spkvecs-gpost --spk2utt=ark:$sdata/JOB/spk2utt \ + $mdl "$feats" ark,s,cs:- ark:$dir/pre_vecs.JOB || exit 1; +fi + +if [ $stage -le 4 ]; then + echo "$0: computing speaker vectors (2nd pass)" + $cmd JOB=1:$nj $dir/log/spk_vecs2.JOB.log \ + ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \ + weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \ + sgmm2-est-spkvecs --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" \ + --spk-vecs=ark:$dir/pre_vecs.JOB $mdl "$feats" ark,s,cs:- ark:$dir/vecs.JOB || exit 1; + rm $dir/pre_vecs.* +fi + +if [ $stage -le 5 ]; then + echo "$0: doing final alignment." 
+ $cmd JOB=1:$nj $dir/log/align_pass2.JOB.log \ + sgmm2-align-compiled $scale_opts "$gselect_opt" --beam=$beam --retry-beam=$retry_beam \ + --utt2spk=ark:$sdata/JOB/utt2spk --spk-vecs=ark:$dir/vecs.JOB \ + $mdl "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; +fi + +rm $dir/pre_ali.*.gz + +echo "$0: done aligning data." + +utils/summarize_warnings.pl $dir/log + +exit 0; diff --git a/egs/kaldi-vystadial-recipe/s5/steps/align_si.sh b/egs/kaldi-vystadial-recipe/s5/steps/align_si.sh new file mode 100755 index 00000000000..d525550f111 --- /dev/null +++ b/egs/kaldi-vystadial-recipe/s5/steps/align_si.sh @@ -0,0 +1,89 @@ +#!/bin/bash +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 + +# Computes training alignments using a model with delta or +# LDA+MLLT features. + +# If you supply the "--use-graphs true" option, it will use the training +# graphs from the source directory (where the model is). In this +# case the number of jobs must match with the source directory. + + +# Begin configuration section. +nj=4 +cmd=run.pl +use_graphs=false +# Begin configuration. +scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +beam=10 +retry_beam=40 +boost_silence=1.0 # Factor by which to boost silence during alignment. +# End configuration options. + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh # source the path. +. parse_options.sh || exit 1; + +if [ $# != 4 ]; then + echo "usage: steps/align_si.sh " + echo "e.g.: steps/align_si.sh data/train data/lang exp/tri1 exp/tri1_ali" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --use-graphs true # use graphs in src-dir" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + exit 1; +fi + +data=$1 +lang=$2 +srcdir=$3 +dir=$4 + +oov=`cat $lang/oov.int` || exit 1; +mkdir -p $dir/log +echo $nj > $dir/num_jobs +sdata=$data/split$nj +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options. +cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options. +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; + +cp $srcdir/{tree,final.mdl} $dir || exit 1; +cp $srcdir/final.occs $dir; + + +if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "$0: feature type is $feat_type" + +case $feat_type in + delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" + cp $srcdir/final.mat $dir + ;; + *) echo "$0: invalid feature type $feat_type" && exit 1; +esac + +echo "$0: aligning data in $data using model from $srcdir, putting alignments in $dir" + +mdl="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $dir/final.mdl - |" + +if $use_graphs; then + [ $nj != "`cat $srcdir/num_jobs`" ] && echo "$0: mismatch in num-jobs" && exit 1; + [ ! 
-f $srcdir/fsts.1.gz ] && echo "$0: no such file $srcdir/fsts.1.gz" && exit 1; + + $cmd JOB=1:$nj $dir/log/align.JOB.log \ + gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$mdl" \ + "ark:gunzip -c $srcdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; +else + tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|"; + # We could just use gmm-align in the next line, but it's less efficient as it compiles the + # training graphs one by one. + $cmd JOB=1:$nj $dir/log/align.JOB.log \ + compile-train-graphs $dir/tree $dir/final.mdl $lang/L.fst "$tra" ark:- \| \ + gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$mdl" ark:- \ + "$feats" "ark,t:|gzip -c >$dir/ali.JOB.gz" || exit 1; +fi + +echo "$0: done aligning data." diff --git a/egs/kaldi-vystadial-recipe/s5/steps/compute_cmvn_stats.sh b/egs/kaldi-vystadial-recipe/s5/steps/compute_cmvn_stats.sh new file mode 100755 index 00000000000..a340a9f54aa --- /dev/null +++ b/egs/kaldi-vystadial-recipe/s5/steps/compute_cmvn_stats.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 +# To be run from .. (one directory up from here) +# see ../run.sh for example + +# Compute cepstral mean and variance statistics per speaker. +# We do this in just one job; it's fast. +# This script takes no options. +# +# Note: there is no option to do CMVN per utterance. The idea is +# that if you did it per utterance it would not make sense to do +# per-speaker fMLLR on top of that (since you'd be doing fMLLR on +# top of different offsets). Therefore what would be the use +# of the speaker information? In this case you should probably +# make the speaker-ids identical to the utterance-ids. The +# speaker information does not have to correspond to actual +# speakers, it's just the level you want to adapt at. + +echo "$0 $@" # Print the command line for logging + +if [ $# != 3 ]; then + echo "usage: compute_cmvn_stats.sh "; + exit 1; +fi + +if [ -f path.sh ]; then . ./path.sh; fi + +data=$1 +logdir=$2 +cmvndir=$3 + +# make $cmvndir an absolute pathname. +cmvndir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $cmvndir ${PWD}` + +# use "name" as part of name of the archive. +name=`basename $data` + +mkdir -p $cmvndir || exit 1; +mkdir -p $logdir || exit 1; + + +required="$data/feats.scp" + +for f in $required; do + if [ ! -f $f ]; then + echo "make_cmvn.sh: no such file $f" + exit 1; + fi +done + +! compute-cmvn-stats --spk2utt=ark:$data/spk2utt scp:$data/feats.scp ark,scp:$cmvndir/cmvn_$name.ark,$cmvndir/cmvn_$name.scp \ + 2> $logdir/cmvn_$name.log && echo "Error computing CMVN stats" && exit 1; + +cp $cmvndir/cmvn_$name.scp $data/cmvn.scp || exit 1; + +nc=`cat $data/cmvn.scp | wc -l` +nu=`cat $data/spk2utt | wc -l` +if [ $nc -ne $nu ]; then + echo "Error: it seems not all of the speakers got cmvn stats ($nc != $nu);" + exit 1; +fi + +echo "Succeeded creating CMVN stats for $name" diff --git a/egs/kaldi-vystadial-recipe/s5/steps/decode.sh b/egs/kaldi-vystadial-recipe/s5/steps/decode.sh new file mode 100755 index 00000000000..b4618cb1439 --- /dev/null +++ b/egs/kaldi-vystadial-recipe/s5/steps/decode.sh @@ -0,0 +1,97 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 + +# Begin configuration section. +transform_dir= +iter= +model= # You can specify the model to use (e.g. 
if you want to use the .alimdl)
+nj=4
+cmd=run.pl
+max_active=7000
+beam=13.0
+latbeam=6.0
+acwt=0.083333 # note: only really affects pruning (scoring is on lattices).
+min_lmwt=9
+max_lmwt=20
+# End configuration section.
+
+echo "$0 $@"  # Print the command line for logging
+
+[ -f ./path.sh ] && . ./path.sh; # source the path.
+. parse_options.sh || exit 1;
+
+if [ $# != 3 ]; then
+  echo "Usage: steps/decode.sh [options] <graph-dir> <data-dir> <decode-dir>"
+  echo "... where <decode-dir> is assumed to be a sub-directory of the directory"
+  echo " where the model is."
+  echo "e.g.: steps/decode.sh exp/mono/graph_tgpr data/test_dev93 exp/mono/decode_dev93_tgpr"
+  echo ""
+  echo "This script works on CMN + (delta+delta-delta | LDA+MLLT) features; it works out"
+  echo "what type of features you used (assuming it's one of these two)"
+  echo ""
+  echo "main options (for others, see top of script file)"
+  echo "  --config <config-file>                          # config containing options"
+  echo "  --nj <nj>                                       # number of parallel jobs"
+  echo "  --iter <iter>                                   # Iteration of model to test."
+  echo "  --model <model>                                 # which model to use (e.g. to"
+  echo "                                                  # specify the final.alimdl)"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --transform-dir <trans-dir>                     # dir to find fMLLR transforms "
+  echo "  --acwt <float>                                  # acoustic scale used for lattice generation "
+  echo "  --min-lmwt <int>                                # minimum LM-weight for lattice rescoring "
+  echo "  --max-lmwt <int>                                # maximum LM-weight for lattice rescoring "
+  echo "                                                  # speaker-adapted decoding"
+  exit 1;
+fi
+
+
+graphdir=$1
+data=$2
+dir=$3
+srcdir=`dirname $dir`; # The model directory is one level up from decoding directory.
+sdata=$data/split$nj;
+
+mkdir -p $dir/log
+[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
+echo $nj > $dir/num_jobs
+
+if [ -z "$model" ]; then # if --model was not specified on the command line...
+  if [ -z $iter ]; then model=$srcdir/final.mdl;
+  else model=$srcdir/$iter.mdl; fi
+fi
+
+for f in $sdata/1/feats.scp $sdata/1/cmvn.scp $model $graphdir/HCLG.fst; do
+  [ ! -f $f ] && echo "decode.sh: no such file $f" && exit 1;
+done
+
+if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
+echo "decode.sh: feature type is $feat_type";
+
+splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
+
+case $feat_type in
+  delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
+  lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";;
+  *) echo "Invalid feature type $feat_type" && exit 1;
+esac
+if [ ! -z "$transform_dir" ]; then # add transforms to features...
+  echo "Using fMLLR transforms from $transform_dir"
+  [ ! -f $transform_dir/trans.1 ] && echo "Expected $transform_dir/trans.1 to exist."
+  [ "`cat $transform_dir/num_jobs`" -ne $nj ] && \
+    echo "Mismatch in number of jobs with $transform_dir";
+  feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
fi
+
+
+$cmd JOB=1:$nj $dir/log/decode.JOB.log \
+  gmm-latgen-faster --max-active=$max_active --beam=$beam --lattice-beam=$latbeam \
+    --acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \
+  $model $graphdir/HCLG.fst "$feats" "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
+
+[ ! -x local/score.sh ] && \
+  echo "Not scoring because local/score.sh does not exist or not executable."
&& exit 1; +local/score.sh --cmd "$cmd" --min_lmwt $min_lmwt --max_lmwt $max_lmwt $data $graphdir $dir + +exit 0; diff --git a/egs/kaldi-vystadial-recipe/s5/steps/decode_basis_fmllr.sh b/egs/kaldi-vystadial-recipe/s5/steps/decode_basis_fmllr.sh new file mode 100755 index 00000000000..b0521aa59b3 --- /dev/null +++ b/egs/kaldi-vystadial-recipe/s5/steps/decode_basis_fmllr.sh @@ -0,0 +1,206 @@ +#!/bin/bash + +# Copyright 2012 Carnegie Mellon University (Author: Yajie Miao) +# Johns Hopkins University (Author: Daniel Povey) + +# Decoding script that does basis fMLLR. This can be on top of delta+delta-delta, +# or LDA+MLLT features. + +# There are 3 models involved potentially in this script, +# and for a standard, speaker-independent system they will all be the same. +# The "alignment model" is for the 1st-pass decoding and to get the +# Gaussian-level alignments for the "adaptation model" the first time we +# do fMLLR. The "adaptation model" is used to estimate fMLLR transforms +# and to generate state-level lattices. The lattices are then rescored +# with the "final model". +# +# The following table explains where we get these 3 models from. +# Note: $srcdir is one level up from the decoding directory. +# +# Model Default source: +# +# "alignment model" $srcdir/final.alimdl --alignment-model +# (or $srcdir/final.mdl if alimdl absent) +# "adaptation model" $srcdir/final.mdl --adapt-model +# "final model" $srcdir/final.mdl --final-model + + +# Begin configuration section +first_beam=10.0 # Beam used in initial, speaker-indep. pass +first_max_active=2000 # max-active used in initial pass. +alignment_model= +adapt_model= +final_model= +stage=0 +acwt=0.083333 # Acoustic weight used in getting fMLLR transforms, and also in + # lattice generation. + +# Parameters in alignment of training data +scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +align_beam=10 +retry_beam=40 + +max_active=7000 +beam=13.0 +lattice_beam=6.0 +nj=4 +silence_weight=0.01 +cmd=run.pl +si_dir= +# End configuration section + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Usage: steps/decode_basis_fmllr.sh [options] " + echo " e.g.: steps/decode_basis_fmllr.sh exp/tri2b/graph_tgpr data/train_si84 data/test_dev93 exp/tri2b/decode_dev93_tgpr" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd # Command to run in parallel with" + echo " --adapt-model # Model to compute transforms with" + echo " --alignment-model # Model to get Gaussian-level alignments for" + echo " # 1st pass of transform computation." + echo " --final-model # Model to finally decode with" + echo " --si-dir # use this to skip 1st pass of decoding" + echo " # Caution-- must be with same tree" + echo " --acwt # default 0.08333 ... used to get posteriors" + + exit 1; +fi + + +graphdir=$1 +data=$2 +dir=`echo $3 | sed 's:/$::g'` # remove any trailing slash. + +srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory. +sdata=$data/split$nj; + +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options. + +silphonelist=`cat $graphdir/phones/silence.csl` || exit 1; + +# Some checks. 
Note: we don't need $srcdir/tree but we expect +# it should exist, given the current structure of the scripts. +for f in $graphdir/HCLG.fst $data/feats.scp $srcdir/tree $srcdir/fmllr.basis; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +## Work out name of alignment model. ## +if [ -z "$alignment_model" ]; then + if [ -f "$srcdir/final.alimdl" ]; then alignment_model=$srcdir/final.alimdl; + else alignment_model=$srcdir/final.mdl; fi +fi +[ ! -f "$alignment_model" ] && echo "$0: no alignment model $alignment_model " && exit 1; +## + +## Do the speaker-independent decoding, if --si-dir option not present. ## +if [ -z "$si_dir" ]; then # we need to do the speaker-independent decoding pass. + si_dir=${dir}.si # Name it as our decoding dir, but with suffix ".si". + if [ $stage -le 0 ]; then + steps/decode.sh --acwt $acwt --nj $nj --cmd "$cmd" --beam $first_beam --model $alignment_model --max-active $first_max_active $graphdir $data $si_dir || exit 1; + fi +fi +## + +## Some checks, and setting of defaults for variables. +[ "$nj" -ne "`cat $si_dir/num_jobs`" ] && echo "Mismatch in #jobs with si-dir" && exit 1; +[ ! -f "$si_dir/lat.1.gz" ] && echo "No such file $si_dir/lat.1.gz" && exit 1; +[ -z "$adapt_model" ] && adapt_model=$srcdir/final.mdl +[ -z "$final_model" ] && final_model=$srcdir/final.mdl +for f in $adapt_model $final_model; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done +## + +## Set up the unadapted features "$sifeats" for testing set +if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "$0: feature type is $feat_type"; +case $feat_type in + delta) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";; + *) echo "Invalid feature type $feat_type" && exit 1; +esac +## + +## Now get the first-pass fMLLR transforms. +## We give all the default parameters in gmm-est-basis-fmllr +if [ $stage -le 1 ]; then + echo "$0: getting first-pass fMLLR transforms." + $cmd JOB=1:$nj $dir/log/fmllr_pass1.JOB.log \ + gunzip -c $si_dir/lat.JOB.gz \| \ + lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ + weight-silence-post $silence_weight $silphonelist $alignment_model ark:- ark:- \| \ + gmm-post-to-gpost $alignment_model "$sifeats" ark:- ark:- \| \ + gmm-est-basis-fmllr-gpost --spk2utt=ark:$sdata/JOB/spk2utt \ + --fmllr-min-count=200 --num-iters=10 --size-scale=0.2 \ + --step-size-iters=3 --write-weights=ark:$dir/pre_wgt.JOB \ + $adapt_model $srcdir/fmllr.basis "$sifeats" ark,s,cs:- \ + ark:$dir/pre_trans.JOB || exit 1; +fi +## + +pass1feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/pre_trans.JOB ark:- ark:- |" + +## Do the main lattice generation pass. Note: we don't determinize the lattices at +## this stage, as we're going to use them in acoustic rescoring with the larger +## model, and it's more correct to store the full state-level lattice for this purpose. 
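+# (Illustrative aside, not part of the original script: a state-level
+# lattice written below can later be determinized on its own, e.g. for
+# job 1:
+#   lattice-determinize-pruned --acoustic-scale=$acwt --beam=$lattice_beam \
+#     "ark:gunzip -c $dir/lat.tmp.1.gz|" "ark:|gzip -c > $dir/lat.det.1.gz"
+# which is exactly what the final rescoring stage does right after
+# gmm-rescore-lattice.)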
+if [ $stage -le 2 ]; then + echo "$0: doing main lattice generation phase" + $cmd JOB=1:$nj $dir/log/decode.JOB.log \ + gmm-latgen-faster --max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \ + --acoustic-scale=$acwt \ + --determinize-lattice=false --allow-partial=true --word-symbol-table=$graphdir/words.txt \ + $adapt_model $graphdir/HCLG.fst "$pass1feats" "ark:|gzip -c > $dir/lat.tmp.JOB.gz" \ + || exit 1; +fi +## + +## Do a second pass of estimating the transform-- this time with the lattices +## generated from the alignment model. Compose the transforms to get +## $dir/trans.1, etc. +if [ $stage -le 3 ]; then + echo "$0: estimating fMLLR transforms a second time." + $cmd JOB=1:$nj $dir/log/fmllr_pass2.JOB.log \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=4.0 \ + "ark:gunzip -c $dir/lat.tmp.JOB.gz|" ark:- \| \ + lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ + weight-silence-post $silence_weight $silphonelist $adapt_model ark:- ark:- \| \ + gmm-est-basis-fmllr --fmllr-min-count=200 \ + --spk2utt=ark:$sdata/JOB/spk2utt --write-weights=ark:$dir/trans_tmp_wgt.JOB \ + $adapt_model $srcdir/fmllr.basis "$pass1feats" ark,s,cs:- ark:$dir/trans_tmp.JOB '&&' \ + compose-transforms --b-is-affine=true ark:$dir/trans_tmp.JOB ark:$dir/pre_trans.JOB \ + ark:$dir/trans.JOB || exit 1; +fi +## + +feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |" + +# Rescore the state-level lattices with the final adapted features, and the final model +# (which by default is $srcdir/final.mdl, but which may be specified on the command line, +# useful in case of discriminatively trained systems). +# At this point we prune and determinize the lattices and write them out, ready for +# language model rescoring. + +if [ $stage -le 4 ]; then + echo "$0: doing a final pass of acoustic rescoring." + $cmd JOB=1:$nj $dir/log/acoustic_rescore.JOB.log \ + gmm-rescore-lattice $final_model "ark:gunzip -c $dir/lat.tmp.JOB.gz|" "$feats" ark:- \| \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=$lattice_beam ark:- \ + "ark:|gzip -c > $dir/lat.JOB.gz" '&&' rm $dir/lat.tmp.JOB.gz || exit 1; +fi + +[ ! -x local/score.sh ] && \ + echo "$0: not scoring because local/score.sh does not exist or not executable." && exit 1; +local/score.sh --cmd "$cmd" $data $graphdir $dir + +rm $dir/{trans_tmp,pre_trans}.* + +exit 0; diff --git a/egs/kaldi-vystadial-recipe/s5/steps/decode_biglm.sh b/egs/kaldi-vystadial-recipe/s5/steps/decode_biglm.sh new file mode 100755 index 00000000000..1586db1bb13 --- /dev/null +++ b/egs/kaldi-vystadial-recipe/s5/steps/decode_biglm.sh @@ -0,0 +1,84 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 + +# Begin configuration. +nj=4 +cmd=run.pl +maxactive=7000 +beam=13.0 +latbeam=6.0 +acwt=0.083333 +# End configuration. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + +if [ $# != 5 ]; then + echo "Usage: steps/decode_si_biglm.sh [options] " + echo "... where is assumed to be a sub-directory of the directory" + echo " where the model is." 
+ echo "e.g.: steps/decode_si.sh exp/mono/graph_tgpr data/test_dev93 exp/mono/decode_dev93_tgpr" + echo "" + echo "This script works on CMN + (delta+delta-delta | LDA+MLLT) features; it works out" + echo "what type of features you used (assuming it's one of these two)" + echo "" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + exit 1; +fi + + +graphdir=$1 +oldlm_fst=$2 +newlm_fst=$3 +data=$4 +dir=$5 + +srcdir=`dirname $dir`; # The model directory is one level up from decoding directory. +sdata=$data/split$nj; +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` + +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + + +for f in $sdata/1/feats.scp $sdata/1/cmvn.scp $srcdir/final.mdl $graphdir/HCLG.fst $oldlm_fst $newlm_fst; do + [ ! -f $f ] && echo "decode_si.sh: no such file $f" && exit 1; +done + + +if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "decode_si.sh: feature type is $feat_type" + +case $feat_type in + delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";; + *) echo "Invalid feature type $feat_type" && exit 1; +esac + +[ -f `dirname $oldlm_fst`/words.txt ] && ! cmp `dirname $oldlm_fst`/words.txt $graphdir/words.txt && \ + echo "Warning: old LM words.txt does not match with that in $graphdir .. probably will not work."; +[ -f `dirname $newlm_fst`/words.txt ] && ! cmp `dirname $oldlm_fst`/words.txt $graphdir/words.txt && \ + echo "Warning: new LM words.txt does not match with that in $graphdir .. probably will not work."; + +oldlm_cmd="fstproject --project_output=true $oldlm_fst | fstarcsort --sort_type=ilabel |" +newlm_cmd="fstproject --project_output=true $newlm_fst | fstarcsort --sort_type=ilabel |" + +$cmd JOB=1:$nj $dir/log/decode.JOB.log \ + gmm-latgen-biglm-faster --max-active=$maxactive --beam=$beam --lattice-beam=$latbeam \ + --acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \ + $srcdir/final.mdl $graphdir/HCLG.fst "$oldlm_cmd" "$newlm_cmd" "$feats" \ + "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; + +[ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; +local/score.sh --cmd "$cmd" $data $graphdir $dir + +exit 0; diff --git a/egs/kaldi-vystadial-recipe/s5/steps/decode_combine.sh b/egs/kaldi-vystadial-recipe/s5/steps/decode_combine.sh new file mode 100755 index 00000000000..b8ac5ede10b --- /dev/null +++ b/egs/kaldi-vystadial-recipe/s5/steps/decode_combine.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. + +# Combine two decoding directories by composing the lattices (we +# apply a weight to each of the original weights, by default 0.5 each). + +# Begin configuration section. +weight1=0.5 # Weight on 1st set of lattices. +cmd=run.pl +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. 
parse_options.sh || exit 1; + +if [ $# -ne 5 ]; then + echo "Usage: steps/decode_combine.sh [options] " + echo " e.g.: steps/decode_combine.sh data/lang data/test exp/dir1/decode exp/dir2/decode exp/combine_1_2/decode" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --cmd # Command to run in parallel with" + echo " --weight1 # Weight on 1st set of lattices (default 0.5)" + exit 1; +fi + +data=$1 +lang_or_graphdir=$2 +srcdir1=$3 +srcdir2=$4 +dir=$5 + +for f in $data/utt2spk $lang_or_graphdir/phones.txt $srcdir1/lat.1.gz $srcdir2/lat.1.gz; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +nj1=`cat $srcdir1/num_jobs` || exit 1; +nj2=`cat $srcdir2/num_jobs` || exit 1; +[ $nj1 -ne $nj2 ] && echo "$0: mismatch in number of jobs $nj1 versus $nj2" && exit 1; +nj=$nj1 + +mkdir -p $dir/log +echo $nj > $dir/num_jobs + +# The lattice-interp command does the score interpolation (with composition), +# and the lattice-copy-backoff replaces the result with the 1st lattice, in +# cases where the composed result was empty. +$cmd JOB=1:$nj $dir/log/interp.JOB.log \ + lattice-interp --alpha=$weight1 "ark:gunzip -c $srcdir1/lat.JOB.gz|" \ + "ark,s,cs:gunzip -c $srcdir2/lat.JOB.gz|" ark:- \| \ + lattice-copy-backoff "ark,s,cs:gunzip -c $srcdir1/lat.JOB.gz|" ark,s,cs:- \ + "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1; + +[ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; +local/score.sh --cmd "$cmd" $data $lang_or_graphdir $dir + +exit 0; diff --git a/egs/kaldi-vystadial-recipe/s5/steps/decode_fmllr.sh b/egs/kaldi-vystadial-recipe/s5/steps/decode_fmllr.sh new file mode 100755 index 00000000000..0b17e0bf3a6 --- /dev/null +++ b/egs/kaldi-vystadial-recipe/s5/steps/decode_fmllr.sh @@ -0,0 +1,198 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) + +# Decoding script that does fMLLR. This can be on top of delta+delta-delta, or +# LDA+MLLT features. + +# There are 3 models involved potentially in this script, +# and for a standard, speaker-independent system they will all be the same. +# The "alignment model" is for the 1st-pass decoding and to get the +# Gaussian-level alignments for the "adaptation model" the first time we +# do fMLLR. The "adaptation model" is used to estimate fMLLR transforms +# and to generate state-level lattices. The lattices are then rescored +# with the "final model". +# +# The following table explains where we get these 3 models from. +# Note: $srcdir is one level up from the decoding directory. +# +# Model Default source: +# +# "alignment model" $srcdir/final.alimdl --alignment-model +# (or $srcdir/final.mdl if alimdl absent) +# "adaptation model" $srcdir/final.mdl --adapt-model +# "final model" $srcdir/final.mdl --final-model + + +# Begin configuration section +first_beam=10.0 # Beam used in initial, speaker-indep. pass +first_max_active=2000 # max-active used in initial pass. +alignment_model= +adapt_model= +final_model= +stage=0 +acwt=0.083333 # Acoustic weight used in getting fMLLR transforms, and also in + # lattice generation. +max_active=7000 +beam=13.0 +lattice_beam=6.0 +nj=4 +silence_weight=0.01 +cmd=run.pl +si_dir= +fmllr_update_type=full +# End configuration section + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. 
parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Usage: steps/decode_fmllr.sh [options] " + echo " e.g.: steps/decode_fmllr.sh exp/tri2b/graph_tgpr data/test_dev93 exp/tri2b/decode_dev93_tgpr" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd # Command to run in parallel with" + echo " --adapt-model # Model to compute transforms with" + echo " --alignment-model # Model to get Gaussian-level alignments for" + echo " # 1st pass of transform computation." + echo " --final-model # Model to finally decode with" + echo " --si-dir # use this to skip 1st pass of decoding" + echo " # Caution-- must be with same tree" + echo " --acwt # default 0.08333 ... used to get posteriors" + + exit 1; +fi + + +graphdir=$1 +data=$2 +dir=`echo $3 | sed 's:/$::g'` # remove any trailing slash. + +srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory. +sdata=$data/split$nj; + +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options. + +silphonelist=`cat $graphdir/phones/silence.csl` || exit 1; + +# Some checks. Note: we don't need $srcdir/tree but we expect +# it should exist, given the current structure of the scripts. +for f in $graphdir/HCLG.fst $data/feats.scp $srcdir/tree; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +## Work out name of alignment model. ## +if [ -z "$alignment_model" ]; then + if [ -f "$srcdir/final.alimdl" ]; then alignment_model=$srcdir/final.alimdl; + else alignment_model=$srcdir/final.mdl; fi +fi +[ ! -f "$alignment_model" ] && echo "$0: no alignment model $alignment_model " && exit 1; +## + +## Do the speaker-independent decoding, if --si-dir option not present. ## +if [ -z "$si_dir" ]; then # we need to do the speaker-independent decoding pass. + si_dir=${dir}.si # Name it as our decoding dir, but with suffix ".si". + if [ $stage -le 0 ]; then + steps/decode.sh --acwt $acwt --nj $nj --cmd "$cmd" --beam $first_beam --model $alignment_model --max-active $first_max_active $graphdir $data $si_dir || exit 1; + fi +fi +## + +## Some checks, and setting of defaults for variables. +[ "$nj" -ne "`cat $si_dir/num_jobs`" ] && echo "Mismatch in #jobs with si-dir" && exit 1; +[ ! -f "$si_dir/lat.1.gz" ] && echo "No such file $si_dir/lat.1.gz" && exit 1; +[ -z "$adapt_model" ] && adapt_model=$srcdir/final.mdl +[ -z "$final_model" ] && final_model=$srcdir/final.mdl +for f in $adapt_model $final_model; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done +## + +## Set up the unadapted features "$sifeats" +if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "$0: feature type is $feat_type"; +case $feat_type in + delta) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";; + *) echo "Invalid feature type $feat_type" && exit 1; +esac +## + +## Now get the first-pass fMLLR transforms. +if [ $stage -le 1 ]; then + echo "$0: getting first-pass fMLLR transforms." 
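+  # The pipeline below: lattice-to-post converts the SI lattices to
+  # per-frame posteriors; weight-silence-post scales silence frames by
+  # $silence_weight so they barely affect adaptation; gmm-post-to-gpost
+  # converts them to Gaussian-level posteriors under the alignment model;
+  # gmm-est-fmllr-gpost then estimates one transform per speaker (via
+  # --spk2utt) with the adaptation model.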
+ $cmd JOB=1:$nj $dir/log/fmllr_pass1.JOB.log \ + gunzip -c $si_dir/lat.JOB.gz \| \ + lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ + weight-silence-post $silence_weight $silphonelist $alignment_model ark:- ark:- \| \ + gmm-post-to-gpost $alignment_model "$sifeats" ark:- ark:- \| \ + gmm-est-fmllr-gpost --fmllr-update-type=$fmllr_update_type \ + --spk2utt=ark:$sdata/JOB/spk2utt $adapt_model "$sifeats" ark,s,cs:- \ + ark:$dir/pre_trans.JOB || exit 1; +fi +## + +pass1feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/pre_trans.JOB ark:- ark:- |" + +## Do the main lattice generation pass. Note: we don't determinize the lattices at +## this stage, as we're going to use them in acoustic rescoring with the larger +## model, and it's more correct to store the full state-level lattice for this purpose. +if [ $stage -le 2 ]; then + echo "$0: doing main lattice generation phase" + $cmd JOB=1:$nj $dir/log/decode.JOB.log \ + gmm-latgen-faster --max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \ + --acoustic-scale=$acwt \ + --determinize-lattice=false --allow-partial=true --word-symbol-table=$graphdir/words.txt \ + $adapt_model $graphdir/HCLG.fst "$pass1feats" "ark:|gzip -c > $dir/lat.tmp.JOB.gz" \ + || exit 1; +fi +## + +## Do a second pass of estimating the transform-- this time with the lattices +## generated from the alignment model. Compose the transforms to get +## $dir/trans.1, etc. +if [ $stage -le 3 ]; then + echo "$0: estimating fMLLR transforms a second time." + $cmd JOB=1:$nj $dir/log/fmllr_pass2.JOB.log \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=4.0 \ + "ark:gunzip -c $dir/lat.tmp.JOB.gz|" ark:- \| \ + lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ + weight-silence-post $silence_weight $silphonelist $adapt_model ark:- ark:- \| \ + gmm-est-fmllr --fmllr-update-type=$fmllr_update_type \ + --spk2utt=ark:$sdata/JOB/spk2utt $adapt_model "$pass1feats" \ + ark,s,cs:- ark:$dir/trans_tmp.JOB '&&' \ + compose-transforms --b-is-affine=true ark:$dir/trans_tmp.JOB ark:$dir/pre_trans.JOB \ + ark:$dir/trans.JOB || exit 1; +fi +## + +feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |" + +# Rescore the state-level lattices with the final adapted features, and the final model +# (which by default is $srcdir/final.mdl, but which may be specified on the command line, +# useful in case of discriminatively trained systems). +# At this point we prune and determinize the lattices and write them out, ready for +# language model rescoring. + +if [ $stage -le 4 ]; then + echo "$0: doing a final pass of acoustic rescoring." + $cmd JOB=1:$nj $dir/log/acoustic_rescore.JOB.log \ + gmm-rescore-lattice $final_model "ark:gunzip -c $dir/lat.tmp.JOB.gz|" "$feats" ark:- \| \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=$lattice_beam ark:- \ + "ark:|gzip -c > $dir/lat.JOB.gz" '&&' rm $dir/lat.tmp.JOB.gz || exit 1; +fi + +[ ! -x local/score.sh ] && \ + echo "$0: not scoring because local/score.sh does not exist or not executable." 
&& exit 1; +local/score.sh --cmd "$cmd" $data $graphdir $dir + +rm $dir/{trans_tmp,pre_trans}.* + +exit 0; + diff --git a/egs/kaldi-vystadial-recipe/s5/steps/decode_fmmi.sh b/egs/kaldi-vystadial-recipe/s5/steps/decode_fmmi.sh new file mode 100755 index 00000000000..5f1571faefc --- /dev/null +++ b/egs/kaldi-vystadial-recipe/s5/steps/decode_fmmi.sh @@ -0,0 +1,95 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 +# Decoding of fMMI or fMPE models (feature-space discriminative training). +# If transform-dir supplied, expects e.g. fMLLR transforms in that dir. + +# Begin configuration section. +iter=final +nj=4 +cmd=run.pl +maxactive=7000 +beam=13.0 +latbeam=6.0 +acwt=0.083333 # note: only really affects pruning (scoring is on lattices). +ngselect=2; # Just use the 2 top Gaussians for fMMI/fMPE. Should match train. +transform_dir= +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Usage: steps/decode_fmmi.sh [options] " + echo "... where is assumed to be a sub-directory of the directory" + echo " where the model is." + echo "e.g.: steps/decode_fmmi.sh exp/mono/graph_tgpr data/test_dev93 exp/mono/decode_dev93_tgpr" + echo "" + echo "This script works on CMN + (delta+delta-delta | LDA+MLLT) features; it works out" + echo "what type of features you used (assuming it's one of these two)" + echo "You can also use fMLLR features-- you have to supply --transform-dir option." + echo "" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --iter # Iteration of model to test." + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --transform-dir # where to find fMLLR transforms." + exit 1; +fi + + +graphdir=$1 +data=$2 +dir=$3 +srcdir=`dirname $dir`; # The model directory is one level up from decoding directory. +sdata=$data/split$nj; +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` + +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + +model=$srcdir/$iter.mdl + +for f in $sdata/1/feats.scp $sdata/1/cmvn.scp $model $graphdir/HCLG.fst; do + [ ! -f $f ] && echo "decode_fmmi.sh: no such file $f" && exit 1; +done + +if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "decode_fmmi.sh: feature type is $feat_type"; + +case $feat_type in + delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";; + *) echo "Invalid feature type $feat_type" && exit 1; +esac + +if [ ! -z "$transform_dir" ]; then # add transforms to features... + echo "Using fMLLR transforms from $transform_dir" + [ ! -f $transform_dir/trans.1 ] && echo "Expected $transform_dir/trans.1 to exist." 
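+  # (Illustrative, with hypothetical paths: a suitable --transform-dir is
+  # usually the output of an fMLLR decode of the underlying SAT system,
+  # e.g.
+  #   steps/decode_fmllr.sh --nj $nj exp/tri3b/graph data/test exp/tri3b/decode_test
+  # which writes the per-job transforms exp/tri3b/decode_test/trans.1 ... trans.$nj.)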
+ [ "`cat $transform_dir/num_jobs`" -ne $nj ] && \ + echo "Mismatch in number of jobs with $transform_dir"; + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |" +fi + +fmpefeats="$feats fmpe-apply-transform $srcdir/$iter.fmpe ark:- 'ark,s,cs:gunzip -c $dir/gselect.JOB.gz|' ark:- |" + +# Get Gaussian selection info. +$cmd JOB=1:$nj $dir/log/gselect.JOB.log \ + gmm-gselect --n=$ngselect $srcdir/$iter.fmpe "$feats" \ + "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1; + +$cmd JOB=1:$nj $dir/log/decode.JOB.log \ + gmm-latgen-faster --max-active=$maxactive --beam=$beam --lattice-beam=$latbeam \ + --acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \ + $model $graphdir/HCLG.fst "$fmpefeats" "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; + +[ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; +local/score.sh --cmd "$cmd" $data $graphdir $dir + +exit 0; diff --git a/egs/kaldi-vystadial-recipe/s5/steps/decode_fromlats.sh b/egs/kaldi-vystadial-recipe/s5/steps/decode_fromlats.sh new file mode 100755 index 00000000000..5b8f41a868f --- /dev/null +++ b/egs/kaldi-vystadial-recipe/s5/steps/decode_fromlats.sh @@ -0,0 +1,90 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 + +# Decode, limited to the word-sequences that were present in a set +# of lattices on disk. The other lattices do not have to be built +# with the same tree or the same context size-- however, you do +# have to be using the same vocabulary (words.txt)-- if not you'd +# have to map the vocabulary somehow. + +# Note: if the trees are identical, you can use gmm-rescore-lattice. + +# Mechanism: create an unweighted acceptor (on words) for each utterance, +# compose that with G, determinize, and then use compile-train-graphs-fsts +# to compile a graph for each utterance, to decode with. + +# Begin configuration. +cmd=run.pl +maxactive=7000 +beam=20.0 +latbeam=7.0 +acwt=0.083333 +batch_size=75 # Limits memory blowup in compile-train-graphs-fsts +scale_opts="--transition-scale=1.0 --self-loop-scale=0.1" +# End configuration. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + + + +if [ $# != 4 ]; then + echo "Usage: steps/decode_si_fromlats.sh [options] " + echo "e.g.: steps/decode_si_fromlats.sh data/test_dev93 data/lang_test_tg exp/tri2b/decode_tgpr_dev93 exp/tri2a/decode_tgpr_dev93_fromlats" + echo "" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + exit 1; +fi + + +data=$1 +lang=$2 +olddir=$3 +dir=$4 +srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory. + +mkdir -p $dir/log + +nj=`cat $olddir/num_jobs` || exit 1; +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` +sdata=$data/split$nj +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj >$dir/num_jobs + +for f in $sdata/1/feats.scp $sdata/1/cmvn.scp $srcdir/final.mdl $olddir/lat.1.gz \ + $srcdir/tree $lang/L_disambig.fst $lang/phones.txt; do + [ ! 
-f $f ] && echo "decode_si_fromlats.sh: no such file $f" && exit 1; +done + + +if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "decode_si.sh: feature type is $feat_type" + +case $feat_type in + delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";; + *) echo "Invalid feature type $feat_type" && exit 1; +esac + + +$cmd JOB=1:$nj $dir/log/decode_lats.JOB.log \ + lattice-to-fst "ark:gunzip -c $olddir/lat.JOB.gz|" ark:- \| \ + fsttablecompose "fstproject --project_output=true $lang/G.fst | fstarcsort |" ark:- ark:- \| \ + fstdeterminizestar ark:- ark:- \| \ + compile-train-graphs-fsts --read-disambig-syms=$lang/phones/disambig.int \ + --batch-size=$batch_size $scale_opts $srcdir/tree $srcdir/final.mdl $lang/L_disambig.fst ark:- ark:- \| \ + gmm-latgen-faster --max-active=$maxactive --beam=$beam --lattice-beam=$latbeam --acoustic-scale=$acwt \ + --allow-partial=true --word-symbol-table=$lang/words.txt \ + $srcdir/final.mdl ark:- "$feats" "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; + +[ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; +local/score.sh --cmd "$cmd" $data $lang $dir + +exit 0; diff --git a/egs/kaldi-vystadial-recipe/s5/steps/decode_nnet.sh b/egs/kaldi-vystadial-recipe/s5/steps/decode_nnet.sh new file mode 100755 index 00000000000..8bc37539c60 --- /dev/null +++ b/egs/kaldi-vystadial-recipe/s5/steps/decode_nnet.sh @@ -0,0 +1,125 @@ +#!/bin/bash + +# Copyright 2012 Karel Vesely, Daniel Povey +# Apache 2.0 + +# Begin configuration section. +iter= +nnet= # You can specify the nnet to use (e.g. if you want to use the .alinnet) +model= # You can specify the transition model to use (e.g. if you want to use the .alimdl) + +nj=4 +cmd=run.pl +max_active=7000 +beam=19.0 # GMM:13.0 +latbeam=9.0 # GMM:6.0 +acwt=0.12 # GMM:0.0833, note: only really affects pruning (scoring is on lattices). +min_lmwt=4 +max_lmwt=15 +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Usage: $0 [options] " + echo "... where is assumed to be a sub-directory of the directory" + echo " where the model is." + echo "e.g.: $0 exp/mono/graph_tgpr data/test_dev93 exp/mono/decode_dev93_tgpr" + echo "" + echo "This script works on CMN + (delta+delta-delta | LDA+MLLT) features; it works out" + echo "what type of features you used (assuming it's one of these two)" + echo "" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --iter # Iteration of model to test." + echo " --nnet # which nnet to use (e.g. to" + echo " --model # which model to use (e.g. to" + echo " # specify the final.nnet)" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --transform-dir # dir to find fMLLR transforms " + echo " # speaker-adapted decoding" + exit 1; +fi + + +graphdir=$1 +data=$2 +dir=$3 +srcdir=`dirname $dir`; # The model directory is one level up from decoding directory. 
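+# (Illustrative, hypothetical path: with dir=exp/tri2a_nnet/decode_test,
+# srcdir resolves to exp/tri2a_nnet, which is where final.nnet, final.mdl,
+# hamm_dct.mat and cmvn_glob.mat are looked up below.)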
+sdata=$data/split$nj; + +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + +if [ -z "$nnet" ]; then # if --nnet was not specified on the command line... + if [ -z $iter ]; then nnet=$srcdir/final.nnet; + else nnet=$(find $srcdir/nnet/ -name nnet_*_iter{,0}${iter}_lrate*); fi +fi +[ -z "$nnet" ] && echo "Error nnet '$nnet' does not exist!" && exit 1; + +if [ -z "$model" ]; then # if --model was not specified on the command line... + model=$srcdir/final.mdl; +fi + +#hard-select feature-extraction files +hamm_dct=$srcdir/hamm_dct.mat +cmvn_g=$srcdir/cmvn_glob.mat + +#remove the softmax from the nnet +nnet_i=$nnet; nnet=$dir/$(basename $nnet)_nosoftmax; +nnet-trim-n-last-transforms --n=1 --binary=false $nnet_i $nnet 2>$dir/$(basename $nnet)_nosoftmax_log || exit 1; + +for f in $sdata/1/feats.scp $sdata/1/cmvn.scp $hamm_dct $cmvn_g $nnet_i $nnet $model $graphdir/HCLG.fst; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +# PREPARE THE LOG-POSTERIOR COMPUTATION PIPELINE +norm_vars=$(cat $srcdir/norm_vars 2>/dev/null) +splice_opts=$(cat $srcdir/splice_opts 2>/dev/null) +feat_type=$(cat $srcdir/feat_type 2>/dev/null) + +# We use the pre-computed CMVN as well as pre-defined splicing +feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- |" + +# Transform feats +echo "Feature type : $feat_type" +case $feat_type in + plain) + ;; + traps) + transf=$srcdir/hamm_dct.mat + feats="$feats transform-feats $transf ark:- ark:- |" + ;; + transf) + feats="$feats transform-feats $srcdir/final.mat ark:- ark:- |" + ;; + transf-sat) + echo yet unimplemented... + exit 1; + ;; + *) + echo "Unknown feature type $feat_type" + exit 1 + ;; +esac + +# Global normalization and the MLP +feats="$feats apply-cmvn --norm-vars=true $cmvn_g ark:- ark:- | nnet-forward --no-softmax=true --class-frame-counts=$srcdir/ali_train.counts $nnet ark:- ark:- |" + +# Run the decoding in the queue +$cmd JOB=1:$nj $dir/log/decode.JOB.log \ + latgen-faster-mapped --max-active=$max_active --beam=$beam --lattice-beam=$latbeam \ + --acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \ + $model $graphdir/HCLG.fst "$feats" "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; + +# Run the scoring +[ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; +local/score.sh --min-lmwt $min_lmwt --max-lmwt $max_lmwt --cmd "$cmd" $data $graphdir $dir 2>$dir/scoring.log || exit 1; + +exit 0; diff --git a/egs/kaldi-vystadial-recipe/s5/steps/decode_sgmm.sh b/egs/kaldi-vystadial-recipe/s5/steps/decode_sgmm.sh new file mode 100755 index 00000000000..211cb03921c --- /dev/null +++ b/egs/kaldi-vystadial-recipe/s5/steps/decode_sgmm.sh @@ -0,0 +1,254 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. + +# This script does decoding with an SGMM system, with speaker vectors. +# If the SGMM system was +# built on top of fMLLR transforms from a conventional system, you should +# provide the --transform-dir option. + +# Begin configuration section. +stage=1 +alignment_model= +transform_dir= # dir to find fMLLR transforms. +nj=4 # number of decoding jobs. +acwt=0.1 # Just a default value, used for adaptation and beam-pruning.. +cmd=run.pl +beam=15.0 +gselect=15 # Number of Gaussian-selection indices for SGMMs. 
[Note:
+            # the first_pass_gselect variable is used for the 1st pass of
+            # decoding and can be tighter.]
+first_pass_gselect=3 # Use a smaller number of Gaussian-selection indices in
+                     # the 1st pass of decoding (lattice generation).
+max_active=7000
+lat_beam=8.0 # Beam we use in lattice generation.
+vecs_beam=4.0 # Beam we use to prune lattices while getting posteriors for
+              # speaker-vector computation. Can be quite tight (actually we
+              # could probably just do best-path).
+use_fmllr=false
+fmllr_iters=10
+fmllr_min_count=1000
+
+# End configuration section.
+
+echo "$0 $@" # Print the command line for logging
+
+[ -f ./path.sh ] && . ./path.sh; # source the path.
+. parse_options.sh || exit 1;
+
+if [ $# -ne 3 ]; then
+ echo "Usage: steps/decode_sgmm.sh [options] <graph-dir> <data-dir> <decode-dir>"
+ echo " e.g.: steps/decode_sgmm.sh --transform-dir exp/tri3b/decode_dev93_tgpr \\"
+ echo " exp/sgmm3a/graph_tgpr data/test_dev93 exp/sgmm3a/decode_dev93_tgpr"
+ echo "main options (for others, see top of script file)"
+ echo " --transform-dir <decoding-dir> # directory of previous decoding"
+ echo " # where we can find transforms for SAT systems."
+ echo " --alignment-model <ali-mdl> # Model for the first-pass decoding."
+ echo " --config <config-file> # config containing options"
+ echo " --nj <nj> # number of parallel jobs"
+ echo " --cmd <cmd> # Command to run in parallel with"
+ echo " --beam <beam> # Decoding beam; default 15.0"
+ exit 1;
+fi
+
+graphdir=$1
+data=$2
+dir=$3
+srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory.
+
+for f in $graphdir/HCLG.fst $data/feats.scp $srcdir/final.mdl; do
+ [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
+done
+
+sdata=$data/split$nj;
+silphonelist=`cat $graphdir/phones/silence.csl` || exit 1
+splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
+gselect_opt="--gselect=ark:gunzip -c $dir/gselect.JOB.gz|"
+gselect_opt_1stpass="$gselect_opt copy-gselect --n=$first_pass_gselect ark:- ark:- |"
+
+mkdir -p $dir/log
+[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
+echo $nj > $dir/num_jobs
+
+
+## Set up features.
+if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
+echo "$0: feature type is $feat_type"
+
+case $feat_type in
+ delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
+ lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
+ ;;
+ *) echo "$0: invalid feature type $feat_type" && exit 1;
+esac
+if [ ! -z "$transform_dir" ]; then
+ echo "$0: using transforms from $transform_dir"
+ [ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1;
+ [ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \
+   && echo "$0: #jobs mismatch with transform-dir." && exit 1;
+ feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |"
+elif grep 'transform-feats --utt2spk' $srcdir/log/acc.0.1.log 2>/dev/null; then
+ echo "$0: **WARNING**: you seem to be using an SGMM system trained with transforms,"
+ echo " but you are not providing the --transform-dir option at test time."
+fi
+##
+
+## Calculate fMLLR pre-transforms if needed. We are doing this here since this
+## step is required by models both with and without speaker vectors.
+if $use_fmllr; then
+ if [ !
-f $srcdir/final.fmllr_mdl ] || [ $srcdir/final.fmllr_mdl -ot $srcdir/final.mdl ]; then + echo "$0: computing pre-transform for fMLLR computation." + sgmm-comp-prexform $srcdir/final.mdl $srcdir/final.occs $srcdir/final.fmllr_mdl || exit 1; + fi +fi + +## Save Gaussian-selection info to disk. +# Note: we can use final.mdl regardless of whether there is an alignment model-- +# they use the same UBM. +if [ $stage -le 1 ]; then + $cmd JOB=1:$nj $dir/log/gselect.JOB.log \ + sgmm-gselect --full-gmm-nbest=$gselect $srcdir/final.mdl \ + "$feats" "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1; +fi + +## Work out name of alignment model. ## +if [ -z "$alignment_model" ]; then + if [ -f "$srcdir/final.alimdl" ]; then alignment_model=$srcdir/final.alimdl; + else alignment_model=$srcdir/final.mdl; fi +fi +[ ! -f "$alignment_model" ] && echo "$0: no alignment model $alignment_model " && exit 1; + +# Generate state-level lattice which we can rescore. This is done with the +# alignment model and no speaker-vectors. +if [ $stage -le 2 ]; then + $cmd JOB=1:$nj $dir/log/decode_pass1.JOB.log \ + sgmm-latgen-faster --max-active=$max_active --beam=$beam --lattice-beam=$lat_beam \ + --acoustic-scale=$acwt --determinize-lattice=false --allow-partial=true \ + --word-symbol-table=$graphdir/words.txt "$gselect_opt_1stpass" $alignment_model \ + $graphdir/HCLG.fst "$feats" "ark:|gzip -c > $dir/pre_lat.JOB.gz" || exit 1; +fi + +## Check if the model has speaker vectors +spkdim=`sgmm-info $srcdir/final.mdl | grep 'speaker vector' | awk '{print $NF}'` + +if [ $spkdim -gt 0 ]; then ### For models with speaker vectors: + +# Estimate speaker vectors (1st pass). Prune before determinizing +# because determinization can take a while on un-pruned lattices. +# Note: the sgmm-post-to-gpost stage is necessary because we have +# a separate alignment-model and final model, otherwise we'd skip it +# and use sgmm-est-spkvecs. + if [ $stage -le 3 ]; then + $cmd JOB=1:$nj $dir/log/vecs_pass1.JOB.log \ + gunzip -c $dir/pre_lat.JOB.gz \| \ + lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ + lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ + weight-silence-post 0.0 $silphonelist $alignment_model ark:- ark:- \| \ + sgmm-post-to-gpost "$gselect_opt" $alignment_model "$feats" ark:- ark:- \| \ + sgmm-est-spkvecs-gpost --spk2utt=ark:$sdata/JOB/spk2utt \ + $srcdir/final.mdl "$feats" ark,s,cs:- "ark:$dir/pre_vecs.JOB" || exit 1; + fi + +# Estimate speaker vectors (2nd pass). Since we already have spk vectors, +# at this point we need to rescore the lattice to get the correct posteriors. 
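+# Concretely, the pipeline below re-attaches acoustic scores computed with
+# the pass-1 vectors (sgmm-rescore-lattice), then prunes, determinizes,
+# converts to posteriors, down-weights silence, and re-estimates vecs.JOB
+# per speaker with sgmm-est-spkvecs.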
+ if [ $stage -le 4 ]; then + $cmd JOB=1:$nj $dir/log/vecs_pass2.JOB.log \ + gunzip -c $dir/pre_lat.JOB.gz \| \ + sgmm-rescore-lattice --spk-vecs=ark:$dir/pre_vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk \ + "$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \ + lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ + lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ + weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \ + sgmm-est-spkvecs --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" --spk-vecs=ark:$dir/pre_vecs.JOB \ + $srcdir/final.mdl "$feats" ark,s,cs:- "ark:$dir/vecs.JOB" || exit 1; + fi + rm $dir/pre_vecs.* + + if $use_fmllr; then + # Estimate fMLLR transforms (note: these may be on top of any + # fMLLR transforms estimated with the baseline GMM system. + if [ $stage -le 5 ]; then # compute fMLLR transforms. + echo "$0: computing fMLLR transforms." + $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \ + gunzip -c $dir/pre_lat.JOB.gz \| \ + sgmm-rescore-lattice --spk-vecs=ark:$dir/vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk \ + "$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \ + lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ + lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ + weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \ + sgmm-est-fmllr --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" --spk-vecs=ark:$dir/vecs.JOB \ + --fmllr-iters=$fmllr_iters --fmllr-min-count=$fmllr_min_count \ + $srcdir/final.fmllr_mdl "$feats" ark,s,cs:- "ark:$dir/trans.JOB" || exit 1; + fi + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |" + fi + +# Now rescore the state-level lattices with the adapted features and the +# corresponding model. Prune and determinize the lattices to limit +# their size. + if [ $stage -le 6 ]; then + $cmd JOB=1:$nj $dir/log/rescore.JOB.log \ + sgmm-rescore-lattice "$gselect_opt" --utt2spk=ark:$sdata/JOB/utt2spk --spk-vecs=ark:$dir/vecs.JOB \ + $srcdir/final.mdl "ark:gunzip -c $dir/pre_lat.JOB.gz|" "$feats" ark:- \| \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=$lat_beam ark:- \ + "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; + fi + rm $dir/pre_lat.*.gz + +else ### For models without speaker vectors: + + if $use_fmllr; then + # Estimate fMLLR transforms (note: these may be on top of any + # fMLLR transforms estimated with the baseline GMM system. + if [ $stage -le 5 ]; then # compute fMLLR transforms. + echo "$0: computing fMLLR transforms." 
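+      # Same recipe as the speaker-vector branch above, minus the
+      # --spk-vecs options: rescore, prune, determinize, convert to
+      # posteriors, down-weight silence, then estimate per-speaker fMLLR
+      # transforms against final.fmllr_mdl.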
+ $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \ + gunzip -c $dir/pre_lat.JOB.gz \| \ + sgmm-rescore-lattice --utt2spk=ark:$sdata/JOB/utt2spk \ + "$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \ + lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ + lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ + weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \ + sgmm-est-fmllr --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" \ + --fmllr-iters=$fmllr_iters --fmllr-min-count=$fmllr_min_count \ + $srcdir/final.fmllr_mdl "$feats" ark,s,cs:- "ark:$dir/trans.JOB" || exit 1; + fi + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |" + fi + +# Now rescore the state-level lattices with the adapted features and the +# corresponding model. Prune and determinize the lattices to limit +# their size. + if [ $stage -le 6 ] && $use_fmllr; then + $cmd JOB=1:$nj $dir/log/rescore.JOB.log \ + sgmm-rescore-lattice "$gselect_opt" --utt2spk=ark:$sdata/JOB/utt2spk \ + $srcdir/final.mdl "ark:gunzip -c $dir/pre_lat.JOB.gz|" "$feats" ark:- \| \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=$lat_beam ark:- \ + "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; + rm $dir/pre_lat.*.gz + else # Already done with decoding if no adaptation needed. + for n in `seq 1 $nj`; do + mv $dir/pre_lat.${n}.gz $dir/lat.${n}.gz + done + fi + +fi + +# The output of this script is the files "lat.*.gz"-- we'll rescore this at +# different acoustic scales to get the final output. + + +if [ $stage -le 7 ]; then + [ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; + echo "score best paths" + local/score.sh --cmd "$cmd" $data $graphdir $dir + echo "score confidence and timing with sclite" + #local/score_sclite_conf.sh --cmd "$cmd" --language turkish $data $graphdir $dir +fi +echo "Decoding done." +exit 0; diff --git a/egs/kaldi-vystadial-recipe/s5/steps/decode_sgmm2.sh b/egs/kaldi-vystadial-recipe/s5/steps/decode_sgmm2.sh new file mode 100755 index 00000000000..53c2f67e3a3 --- /dev/null +++ b/egs/kaldi-vystadial-recipe/s5/steps/decode_sgmm2.sh @@ -0,0 +1,190 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. + +# This script does decoding with an SGMM system, with speaker vectors. +# If the SGMM system was +# built on top of fMLLR transforms from a conventional system, you should +# provide the --transform-dir option. + +# Begin configuration section. +stage=1 +transform_dir= # dir to find fMLLR transforms. +nj=4 # number of decoding jobs. +acwt=0.1 # Just a default value, used for adaptation and beam-pruning.. +cmd=run.pl +beam=13.0 +gselect=15 # Number of Gaussian-selection indices for SGMMs. [Note: + # the first_pass_gselect variable is used for the 1st pass of + # decoding and can be tighter. +first_pass_gselect=3 # Use a smaller number of Gaussian-selection indices in + # the 1st pass of decoding (lattice generation). +max_active=7000 +lat_beam=6.0 # Beam we use in lattice generation. +vecs_beam=4.0 # Beam we use to prune lattices while getting posteriors for + # speaker-vector computation. Can be quite tight (actually we could + # probably just do best-path. +use_fmllr=false +fmllr_iters=10 +fmllr_min_count=1000 +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . 
./path.sh; # source the path. +. parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: steps/decode_sgmm2.sh [options] " + echo " e.g.: steps/decode_sgmm2.sh --transform-dir exp/tri3b/decode_dev93_tgpr \\" + echo " exp/sgmm3a/graph_tgpr data/test_dev93 exp/sgmm3a/decode_dev93_tgpr" + echo "main options (for others, see top of script file)" + echo " --transform-dir # directory of previous decoding" + echo " # where we can find transforms for SAT systems." + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd # Command to run in parallel with" + echo " --beam # Decoding beam; default 13.0" + exit 1; +fi + +graphdir=$1 +data=$2 +dir=$3 +srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory. + +for f in $graphdir/HCLG.fst $data/feats.scp $srcdir/final.mdl; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +sdata=$data/split$nj; +silphonelist=`cat $graphdir/phones/silence.csl` || exit 1 +gselect_opt="--gselect=ark:gunzip -c $dir/gselect.JOB.gz|" +gselect_opt_1stpass="$gselect_opt copy-gselect --n=$first_pass_gselect ark:- ark:- |" + +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options. + +## Set up features. +if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "$0: feature type is $feat_type" + +case $feat_type in + delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" + ;; + *) echo "$0: invalid feature type $feat_type" && exit 1; +esac +if [ ! -z "$transform_dir" ]; then + echo "$0: using transforms from $transform_dir" + [ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1; + [ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \ + && echo "$0: #jobs mismatch with transform-dir." && exit 1; + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |" +elif grep 'transform-feats --utt2spk' $srcdir/log/acc.0.1.log 2>/dev/null; then + echo "$0: **WARNING**: you seem to be using an SGMM system trained with transforms," + echo " but you are not providing the --transform-dir option in test time." +fi +## + +## Save Gaussian-selection info to disk. +# Note: we can use final.mdl regardless of whether there is an alignment model-- +# they use the same UBM. + +if [ $stage -le 1 ]; then + $cmd JOB=1:$nj $dir/log/gselect.JOB.log \ + sgmm2-gselect --full-gmm-nbest=$gselect $srcdir/final.mdl \ + "$feats" "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1; +fi + +# Generate state-level lattice which we can rescore. This is done with the alignment +# model and no speaker-vectors. 
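+# (Note: unlike decode_sgmm.sh, which falls back to final.mdl when no
+# final.alimdl exists, this script passes $srcdir/final.alimdl to the
+# decoder below unconditionally, so the alignment model must be present.)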
+if [ $stage -le 2 ]; then + $cmd JOB=1:$nj $dir/log/decode_pass1.JOB.log \ + sgmm2-latgen-faster --max-active=$max_active --beam=$beam --lattice-beam=$lat_beam \ + --acoustic-scale=$acwt --determinize-lattice=false --allow-partial=true \ + --word-symbol-table=$graphdir/words.txt "$gselect_opt_1stpass" $srcdir/final.alimdl \ + $graphdir/HCLG.fst "$feats" "ark:|gzip -c > $dir/pre_lat.JOB.gz" || exit 1; +fi + +# Estimate speaker vectors (1st pass). Prune before determinizing +# because determinization can take a while on un-pruned lattices. +# Note: the sgmm2-post-to-gpost stage is necessary because we have +# a separate alignment-model and final model, otherwise we'd skip it +# and use sgmm2-est-spkvecs. +if [ $stage -le 3 ]; then + $cmd JOB=1:$nj $dir/log/vecs_pass1.JOB.log \ + gunzip -c $dir/pre_lat.JOB.gz \| \ + lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ + lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ + weight-silence-post 0.0 $silphonelist $srcdir/final.alimdl ark:- ark:- \| \ + sgmm2-post-to-gpost "$gselect_opt" $srcdir/final.alimdl "$feats" ark:- ark:- \| \ + sgmm2-est-spkvecs-gpost --spk2utt=ark:$sdata/JOB/spk2utt \ + $srcdir/final.mdl "$feats" ark,s,cs:- "ark:$dir/pre_vecs.JOB" || exit 1; +fi + +# Estimate speaker vectors (2nd pass). Since we already have spk vectors, +# at this point we need to rescore the lattice to get the correct posteriors. +if [ $stage -le 4 ]; then + $cmd JOB=1:$nj $dir/log/vecs_pass2.JOB.log \ + gunzip -c $dir/pre_lat.JOB.gz \| \ + sgmm2-rescore-lattice --spk-vecs=ark:$dir/pre_vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk \ + "$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \ + lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ + lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ + weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \ + sgmm2-est-spkvecs --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" --spk-vecs=ark:$dir/pre_vecs.JOB \ + $srcdir/final.mdl "$feats" ark,s,cs:- "ark:$dir/vecs.JOB" || exit 1; +fi +rm $dir/pre_vecs.* + +if $use_fmllr; then + # Estimate fMLLR transforms (note: these may be on top of any + # fMLLR transforms estimated with the baseline GMM system. + if [ $stage -le 5 ]; then # compute fMLLR transforms. + echo "$0: computing fMLLR transforms." + if [ ! -f $srcdir/final.fmllr_mdl ] || [ $srcdir/final.fmllr_mdl -ot $srcdir/final.mdl ]; then + echo "$0: computing pre-transform for fMLLR computation." 
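+      # sgmm2-comp-prexform below derives final.fmllr_mdl from final.mdl
+      # and final.occs; the -ot test above only recomputes it when it is
+      # missing or older than final.mdl.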
+ sgmm2-comp-prexform $srcdir/final.mdl $srcdir/final.occs $srcdir/final.fmllr_mdl || exit 1; + fi + $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \ + gunzip -c $dir/pre_lat.JOB.gz \| \ + sgmm2-rescore-lattice --spk-vecs=ark:$dir/vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk \ + "$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \ + lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ + lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ + weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \ + sgmm2-est-fmllr --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" --spk-vecs=ark:$dir/vecs.JOB \ + --fmllr-iters=$fmllr_iters --fmllr-min-count=$fmllr_min_count \ + $srcdir/final.fmllr_mdl "$feats" ark,s,cs:- "ark:$dir/trans.JOB" || exit 1; + fi + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |" +fi + +# Now rescore the state-level lattices with the adapted features and the +# corresponding model. Prune and determinize the lattices to limit +# their size. +if [ $stage -le 6 ]; then + $cmd JOB=1:$nj $dir/log/rescore.JOB.log \ + sgmm2-rescore-lattice "$gselect_opt" --utt2spk=ark:$sdata/JOB/utt2spk --spk-vecs=ark:$dir/vecs.JOB \ + $srcdir/final.mdl "ark:gunzip -c $dir/pre_lat.JOB.gz|" "$feats" ark:- \| \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=$lat_beam ark:- \ + "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; +fi +rm $dir/pre_lat.*.gz + +# The output of this script is the files "lat.*.gz"-- we'll rescore this at different +# acoustic scales to get the final output. + + +if [ $stage -le 7 ]; then + [ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; + local/score.sh --cmd "$cmd" $data $graphdir $dir +fi +exit 0; diff --git a/egs/kaldi-vystadial-recipe/s5/steps/decode_sgmm2_rescore.sh b/egs/kaldi-vystadial-recipe/s5/steps/decode_sgmm2_rescore.sh new file mode 100755 index 00000000000..c8467a66924 --- /dev/null +++ b/egs/kaldi-vystadial-recipe/s5/steps/decode_sgmm2_rescore.sh @@ -0,0 +1,107 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. + +# This script does decoding with an SGMM system, by rescoring lattices +# generated from a previous SGMM system. The directory with the lattices +# is assumed to contain speaker vectors, if used. Basically it rescores +# the lattices one final time, using the same setup as the final decoding +# pass of the source dir. The assumption is that the model may have +# been discriminatively trained. + +# If the system was built on top of fMLLR transforms from a conventional system, +# you should provide the --transform-dir option. + +# Begin configuration section. +transform_dir= # dir to find fMLLR transforms. +cmd=run.pl +iter=final +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + +if [ $# -ne 4 ]; then + echo "Usage: steps/decode_sgmm_rescore.sh [options] " + echo " e.g.: steps/decode_sgmm_rescore.sh --transform-dir exp/tri3b/decode_dev93_tgpr \\" + echo " exp/sgmm3a/graph_tgpr data/test_dev93 exp/sgmm3a/decode_dev93_tgpr exp/sgmm3a_mmi/decode_dev93_tgpr" + echo "main options (for others, see top of script file)" + echo " --transform-dir # directory of previous decoding" + echo " # where we can find transforms for SAT systems." 
+ echo " --config # config containing options" + echo " --cmd # Command to run in parallel with" + echo " --iter # iteration of model to use (default: final)" + exit 1; +fi + +graphdir=$1 +data=$2 +olddir=$3 +dir=$4 +srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory. + +for f in $graphdir/words.txt $data/feats.scp $olddir/lat.1.gz $olddir/gselect.1.gz \ + $srcdir/$iter.mdl; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +nj=`cat $olddir/num_jobs` || exit 1; +sdata=$data/split$nj; +gselect_opt="--gselect=ark:gunzip -c $olddir/gselect.JOB.gz|" +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` + +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + +if [ -f $olddir/vecs.1 ]; then + echo "$0: using speaker vectors from $olddir" + spkvecs_opt="--spk-vecs=ark:$olddir/vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk" +else + echo "$0: no speaker vectors found." + spkvecs_opt= +fi + + +## Set up features. +if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "$0: feature type is $feat_type" + +case $feat_type in + delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" + ;; + *) echo "$0: invalid feature type $feat_type" && exit 1; +esac +if [ ! -z "$transform_dir" ]; then + echo "$0: using transforms from $transform_dir" + [ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1; + [ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \ + && echo "$0: #jobs mismatch with transform-dir." && exit 1; + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |" +elif grep 'transform-feats --utt2spk' $srcdir/log/acc.0.1.log 2>/dev/null; then + echo "$0: **WARNING**: you seem to be using an SGMM system trained with transforms," + echo " but you are not providing the --transform-dir option in test time." +fi + +if [ -f $olddir/trans.1 ]; then + echo "$0: using (in addition to any previous transforms) transforms from $olddir" + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$olddir/trans.JOB ark:- ark:- |" +fi +## + +# Rescore the state-level lattices with the model provided. Just +# one command in this script. +echo "$0: rescoring lattices with SGMM model in $srcdir/$iter.mdl" +$cmd JOB=1:$nj $dir/log/rescore.JOB.log \ + sgmm2-rescore-lattice "$gselect_opt" $spkvecs_opt \ + $srcdir/$iter.mdl "ark:gunzip -c $olddir/lat.JOB.gz|" "$feats" \ + "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; + +[ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; +local/score.sh --cmd "$cmd" $data $graphdir $dir + +exit 0; diff --git a/egs/kaldi-vystadial-recipe/s5/steps/decode_sgmm2_rescore_project.sh b/egs/kaldi-vystadial-recipe/s5/steps/decode_sgmm2_rescore_project.sh new file mode 100755 index 00000000000..eb8347f7532 --- /dev/null +++ b/egs/kaldi-vystadial-recipe/s5/steps/decode_sgmm2_rescore_project.sh @@ -0,0 +1,172 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. 
+ +# This script does decoding with an SGMM system, by rescoring lattices +# generated from a previous SGMM system. This version does the "predictive" +# SGMM, where we subtract some constant times the log-prob of the left +# few spliced frames, and the same for the right few. +# The directory with the lattices +# is assumed to contain any speaker vectors, if used. This script just +# adds into the acoustic scores, (some constant, default -0.25) times +# the acoustic score of the left model, and the same for the right model. + +# the lattices one final time, using the same setup as the final decoding +# pass of the source dir. The assumption is that the model may have +# been discriminatively trained. + +# If the system was built on top of fMLLR transforms from a conventional system, +# you should provide the --transform-dir option. + +# Begin configuration section. +stage=0 +transform_dir= # dir to find fMLLR transforms. +cmd=run.pl +iter=final +prob_scale=-0.25 +dimensions=0:13:104:117 +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + +if [ $# -ne 5 ]; then + echo "Usage: steps/decode_sgmm_rescore_project.sh [options] " + echo " e.g.: steps/decode_sgmm_rescore_project.sh --transform-dir exp/tri3b/decode_dev93_tgpr \\" + echo " exp/tri2b/full.mat exp/sgmm3a/graph_tgpr data/test_dev93 exp/sgmm3a/decode_dev93_tgpr exp/sgmm3a/decode_dev93_tgpr_predict" + echo "main options (for others, see top of script file)" + echo " --transform-dir # directory of previous decoding" + echo " # where we can find transforms for SAT systems." + echo " --config # config containing options" + echo " --cmd # Command to run in parallel with" + echo " --prob-scale # Default -0.25, scale on left and right models." + exit 1; +fi + +full_lda_mat=$1 +graphdir=$2 +data=$3 +olddir=$4 +dir=$5 +srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory. + +for f in $full_lda_mat $graphdir/words.txt $data/feats.scp $olddir/lat.1.gz \ + $olddir/gselect.1.gz $srcdir/$iter.mdl; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +nj=`cat $olddir/num_jobs` || exit 1; +sdata=$data/split$nj; +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` + +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + +if [ -f $olddir/vecs.1 ]; then + echo "$0: using speaker vectors from $olddir" + spkvecs_opt="--spk-vecs=ark:$olddir/vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk" +else + echo "$0: no speaker vectors found." + spkvecs_opt= +fi + +if [ $stage -le 0 ]; then + # Get full LDA+MLLT mat and its inverse. Note: the full LDA+MLLT mat is + # the LDA+MLLT mat, plus the "rejected" rows of the LDA matrix. + $cmd $dir/log/get_full_lda.log \ + get-full-lda-mat $srcdir/final.mat $full_lda_mat $dir/full.mat $dir/full_inv.mat || exit 1; +fi + +if [ $stage -le 1 ]; then + left_start=`echo $dimensions | cut '-d:' -f 1`; + left_end=`echo $dimensions | cut '-d:' -f 2`; + right_start=`echo $dimensions | cut '-d:' -f 3`; + right_end=`echo $dimensions | cut '-d:' -f 4`; + + # Prepare left and right models. For now, the dimensions are hardwired (e.g., 13 MFCCs and splice 9 frames). + # Note: the choice of dividing by the prob of the left 4 and the right 4 frames is a bit arbitrary and + # we could investigate different configurations. 
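+ # With the default dimensions=0:13:104:117 and 13-dim MFCCs spliced across
+ # 9 frames (9 * 13 = 117 dims in the full, pre-LDA space), dims [0,13) are
+ # roughly the static MFCCs of the leftmost frame and dims [104,117) those of
+ # the rightmost frame, so the left and right models cover context that the
+ # main model's LDA projection mostly discards.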
+ $cmd $dir/log/left.log \
+ sgmm2-project --start-dim=$left_start --end-dim=$left_end $srcdir/final.mdl $dir/full.mat $dir/left.mdl $dir/left.mat || exit 1;
+ $cmd $dir/log/right.log \
+ sgmm2-project --start-dim=$right_start --end-dim=$right_end $srcdir/final.mdl $dir/full.mat $dir/right.mdl $dir/right.mat || exit 1;
+fi
+
+
+# We want the final scores to be (old scores) + prob_scale * (new scores), but
+# sgmm2-rescore-lattice can only scale the *old* acoustic probs, not the new
+# ones. So we scale the old probs by 1/prob_scale here, add the new probs
+# unscaled, and invert the scaling again at the end.
+inverse_prob_scale=`perl -e "print (1.0 / $prob_scale);"`
+cur_lats="ark:gunzip -c $olddir/lat.JOB.gz | lattice-scale --acoustic-scale=$inverse_prob_scale ark:- ark:- |"
+
+## Set up features. Note: we only support LDA+MLLT features; this
+## is inherent in the method, we could not support deltas.
+
+for model_type in left right; do
+
+ feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- |" # spliced features.
+ if [ ! -z "$transform_dir" ]; then # using speaker-specific transforms.
+ # we want to transform in the sequence: $dir/full.mat, then the result of
+ # (extend-transform-dim $transform_dir/trans.JOB), then $dir/full_inv.mat to
+ # get back to the spliced space, then the left.mat or right.mat. But
+ # note that compose-transforms operates in matrix-multiplication order,
+ # which is opposite from the "order of applying the transforms" order.
+ new_dim=$[`copy-matrix --binary=false $dir/full.mat - | wc -l` - 1]; # 117 in normal case.
+ feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk 'ark:extend-transform-dim --new-dimension=$new_dim ark:$transform_dir/trans.JOB ark:- | compose-transforms ark:- $dir/full.mat ark:- | compose-transforms $dir/full_inv.mat ark:- ark:- | compose-transforms $dir/${model_type}.mat ark:- ark:- |' ark:- ark:- |"
+ else # else, we transform with the "left" or "right" matrix; these transform from the
+ # spliced space.
+ feats="$feats transform-feats $dir/${model_type}.mat ark:- ark:- |"
+ # If we don't have the --transform-dir option, make sure the model was
+ # trained in the same way.
+ if grep 'transform-feats --utt2spk' $srcdir/log/acc.0.1.log 2>/dev/null; then
+ echo "$0: **WARNING**: you seem to be using an SGMM system trained with transforms,"
+ echo " but you are not providing the --transform-dir option in test time."
+ fi
+ fi
+ if [ -f $olddir/trans.1 ]; then
+ echo "$0: warning: not using transforms in $olddir (this is just a "
+ echo " limitation of the script right now, and could be fixed)."
+ fi
+
+ if [ $stage -le 2 ]; then
+ echo "Getting gselect info for $model_type model."
+ $cmd JOB=1:$nj $dir/log/gselect.$model_type.JOB.log \
+ sgmm2-gselect $dir/$model_type.mdl "$feats" \
+ "ark,t:|gzip -c >$dir/gselect.$model_type.JOB.gz" || exit 1;
+ fi
+ gselect_opt="--gselect=ark,s,cs:gunzip -c $dir/gselect.$model_type.JOB.gz|"
+
+
+ # Rescore the state-level lattices with the model provided. Just
+ # one command in this script.
+ # The --old-acoustic-scale=1.0 option means we just add the scores
+ # to the old scores.
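+ # Concretely: the old scores were scaled by 1/prob_scale above, each of the
+ # two passes adds the new model's unscaled scores on top, and the final
+ # lattice-scale by prob_scale (stage 4) leaves, per arc,
+ # old_score + prob_scale * (left_score + right_score),
+ # i.e. with the default prob_scale=-0.25 we subtract a quarter of the
+ # left and right models' log-probs.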
+ if [ $stage -le 3 ]; then + echo "$0: rescoring lattices with $model_type model" + $cmd JOB=1:$nj $dir/log/rescore.${model_type}.JOB.log \ + sgmm2-rescore-lattice --old-acoustic-scale=1.0 "$gselect_opt" $spkvecs_opt \ + $dir/$model_type.mdl "$cur_lats" "$feats" \ + "ark:|gzip -c > $dir/lat.${model_type}.JOB.gz" || exit 1; + fi + cur_lats="ark:gunzip -c $dir/lat.${model_type}.JOB.gz |" +done + +if [ $stage -le 4 ]; then + echo "$0: getting final lattices." + $cmd JOB=1:$nj $dir/log/scale_lats.JOB.log \ + lattice-scale --acoustic-scale=$prob_scale "$cur_lats" "ark:|gzip -c >$dir/lat.JOB.gz" \ + || exit 1; +fi + +rm $dir/lat.{left,right}.*.gz 2>/dev/null # note: if these still exist, it will + # confuse the scoring script. + +[ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; +local/score.sh --cmd "$cmd" $data $graphdir $dir + +exit 0; diff --git a/egs/kaldi-vystadial-recipe/s5/steps/decode_sgmm_rescore.sh b/egs/kaldi-vystadial-recipe/s5/steps/decode_sgmm_rescore.sh new file mode 100755 index 00000000000..8650776539b --- /dev/null +++ b/egs/kaldi-vystadial-recipe/s5/steps/decode_sgmm_rescore.sh @@ -0,0 +1,107 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. + +# This script does decoding with an SGMM system, by rescoring lattices +# generated from a previous SGMM system. The directory with the lattices +# is assumed to contain speaker vectors, if used. Basically it rescores +# the lattices one final time, using the same setup as the final decoding +# pass of the source dir. The assumption is that the model may have +# been discriminatively trained. + +# If the system was built on top of fMLLR transforms from a conventional system, +# you should provide the --transform-dir option. + +# Begin configuration section. +transform_dir= # dir to find fMLLR transforms. +cmd=run.pl +iter=final +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + +if [ $# -ne 4 ]; then + echo "Usage: steps/decode_sgmm_rescore.sh [options] " + echo " e.g.: steps/decode_sgmm_rescore.sh --transform-dir exp/tri3b/decode_dev93_tgpr \\" + echo " exp/sgmm3a/graph_tgpr data/test_dev93 exp/sgmm3a/decode_dev93_tgpr exp/sgmm3a_mmi/decode_dev93_tgpr" + echo "main options (for others, see top of script file)" + echo " --transform-dir # directory of previous decoding" + echo " # where we can find transforms for SAT systems." + echo " --config # config containing options" + echo " --cmd # Command to run in parallel with" + echo " --iter # iteration of model to use (default: final)" + exit 1; +fi + +graphdir=$1 +data=$2 +olddir=$3 +dir=$4 +srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory. + +for f in $graphdir/words.txt $data/feats.scp $olddir/lat.1.gz $olddir/gselect.1.gz \ + $srcdir/$iter.mdl; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +nj=`cat $olddir/num_jobs` || exit 1; +sdata=$data/split$nj; +gselect_opt="--gselect=ark:gunzip -c $olddir/gselect.JOB.gz|" +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` + +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + +if [ -f $olddir/vecs.1 ]; then + echo "$0: using speaker vectors from $olddir" + spkvecs_opt="--spk-vecs=ark:$olddir/vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk" +else + echo "$0: no speaker vectors found." 
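+ # (rescoring will then run without speaker-vector adaptation, which is
+ # what we want if the source decode was speaker-independent.)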
+ spkvecs_opt= +fi + + +## Set up features. +if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "$0: feature type is $feat_type" + +case $feat_type in + delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" + ;; + *) echo "$0: invalid feature type $feat_type" && exit 1; +esac +if [ ! -z "$transform_dir" ]; then + echo "$0: using transforms from $transform_dir" + [ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1; + [ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \ + && echo "$0: #jobs mismatch with transform-dir." && exit 1; + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |" +elif grep 'transform-feats --utt2spk' $srcdir/log/acc.0.1.log 2>/dev/null; then + echo "$0: **WARNING**: you seem to be using an SGMM system trained with transforms," + echo " but you are not providing the --transform-dir option in test time." +fi + +if [ -f $olddir/trans.1 ]; then + echo "$0: using (in addition to any previous transforms) transforms from $olddir" + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$olddir/trans.JOB ark:- ark:- |" +fi +## + +# Rescore the state-level lattices with the model provided. Just +# one command in this script. +echo "$0: rescoring lattices with SGMM model in $srcdir/$iter.mdl" +$cmd JOB=1:$nj $dir/log/rescore.JOB.log \ + sgmm-rescore-lattice "$gselect_opt" $spkvecs_opt \ + $srcdir/$iter.mdl "ark:gunzip -c $olddir/lat.JOB.gz|" "$feats" \ + "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; + +[ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; +local/score.sh --cmd "$cmd" $data $graphdir $dir + +exit 0; diff --git a/egs/kaldi-vystadial-recipe/s5/steps/decode_si.sh b/egs/kaldi-vystadial-recipe/s5/steps/decode_si.sh new file mode 100755 index 00000000000..b4618cb1439 --- /dev/null +++ b/egs/kaldi-vystadial-recipe/s5/steps/decode_si.sh @@ -0,0 +1,97 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 + +# Begin configuration section. +transform_dir= +iter= +model= # You can specify the model to use (e.g. if you want to use the .alimdl) +nj=4 +cmd=run.pl +max_active=7000 +beam=13.0 +latbeam=6.0 +acwt=0.083333 # note: only really affects pruning (scoring is on lattices). +min_lmwt=9 +max_lmwt=20 +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Usage: steps/decode.sh [options] " + echo "... where is assumed to be a sub-directory of the directory" + echo " where the model is." + echo "e.g.: steps/decode.sh exp/mono/graph_tgpr data/test_dev93 exp/mono/decode_dev93_tgpr" + echo "" + echo "This script works on CMN + (delta+delta-delta | LDA+MLLT) features; it works out" + echo "what type of features you used (assuming it's one of these two)" + echo "" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --iter # Iteration of model to test." 
+ echo " --model # which model to use (e.g. to" + echo " # specify the final.alimdl)" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --transform-dir # dir to find fMLLR transforms " + echo " --acwt # acoustic scale used for lattice generation " + echo " --min-lmwt # minumum LM-weight for lattice rescoring " + echo " --max-lmwt # maximum LM-weight for lattice rescoring " + echo " # speaker-adapted decoding" + exit 1; +fi + + +graphdir=$1 +data=$2 +dir=$3 +srcdir=`dirname $dir`; # The model directory is one level up from decoding directory. +sdata=$data/split$nj; + +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + +if [ -z "$model" ]; then # if --model was not specified on the command line... + if [ -z $iter ]; then model=$srcdir/final.mdl; + else model=$srcdir/$iter.mdl; fi +fi + +for f in $sdata/1/feats.scp $sdata/1/cmvn.scp $model $graphdir/HCLG.fst; do + [ ! -f $f ] && echo "decode.sh: no such file $f" && exit 1; +done + +if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "decode.sh: feature type is $feat_type"; + +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` + +case $feat_type in + delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";; + *) echo "Invalid feature type $feat_type" && exit 1; +esac +if [ ! -z "$transform_dir" ]; then # add transforms to features... + echo "Using fMLLR transforms from $transform_dir" + [ ! -f $transform_dir/trans.1 ] && echo "Expected $transform_dir/trans.1 to exist." + [ "`cat $transform_dir/num_jobs`" -ne $nj ] && \ + echo "Mismatch in number of jobs with $transform_dir"; + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |" +fi + + +$cmd JOB=1:$nj $dir/log/decode.JOB.log \ + gmm-latgen-faster --max-active=$max_active --beam=$beam --lattice-beam=$latbeam \ + --acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \ + $model $graphdir/HCLG.fst "$feats" "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; + +[ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; +local/score.sh --cmd "$cmd" --min_lmwt $min_lmwt --max_lmwt $max_lmwt $data $graphdir $dir + +exit 0; diff --git a/egs/kaldi-vystadial-recipe/s5/steps/get_fmllr_basis.sh b/egs/kaldi-vystadial-recipe/s5/steps/get_fmllr_basis.sh new file mode 100755 index 00000000000..9ae46bc245d --- /dev/null +++ b/egs/kaldi-vystadial-recipe/s5/steps/get_fmllr_basis.sh @@ -0,0 +1,95 @@ +#!/bin/bash + +# Copyright 2012 Carnegie Mellon University (Author: Yajie Miao) +# Johns Hopkins University (Author: Daniel Povey) + +# Decoding script that computes basis for basis-fMLLR (see decode_fmllr_basis.sh). +# This can be on top of delta+delta-delta, or LDA+MLLT features. + +stage=0 +# Parameters in alignment of training data +scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +per_utt=true # If true, then treat each utterance as a separate speaker for purposes of + # basis training... 
this is recommended if the number of actual speakers in your + # training set is less than (feature-dim) * (feature-dim+1). +align_beam=10 +retry_beam=40 +silence_weight=0.01 +cmd=run.pl +# End configuration section + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Usage: steps/get_fmllr_basis.sh [options] " + echo " e.g.: steps/decode_basis_fmllr.sh data/train_si84 data/lang exp/tri3b/" + echo "Note: we currently assume that this is the same data you trained the model with." + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --cmd # Command to run in parallel with" + exit 1; +fi + +data=$1 +lang=$2 +dir=$3 + +nj=`cat $dir/num_jobs` || exit 1; +sdata=$data/split$nj; +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; + +splice_opts=`cat $dir/splice_opts 2>/dev/null` # frame-splicing options. + +silphonelist=`cat $lang/phones/silence.csl` || exit 1; + +for f in $data/feats.scp $dir/final.alimdl $dir/final.mdl $dir/ali.1.gz; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + + +# Set up the unadapted features "$sifeats". +if [ -f $dir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "$0: feature type is $feat_type"; +case $feat_type in + delta) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |";; + *) echo "Invalid feature type $feat_type" && exit 1; +esac + + # Set up the adapted features "$feats" for training set. +if [ -f $srcdir/trans.1 ]; then + feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$sdata/trans.JOB ark:- ark:- |"; +else + feats="$sifeats"; +fi + + +if $per_utt; then + spk2utt_opt= # treat each utterance as separate speaker when computing basis. + echo "Doing per-utterance adaptation for purposes of computing the basis." +else + echo "Doing per-speaker adaptation for purposes of computing the basis." + [ `cat $sdata/spk2utt | wc -l` -lt $[41*40] ] && \ + echo "Warning: number of speakers is small, might be better to use --per-utt=true." + spk2utt_opt="--spk2utt=ark:$sdata/JOB/spk2utt" +fi + +# Note: we get Gaussian level alignments with the "final.mdl" and the +# speaker adapted features. +$cmd JOB=1:$nj $dir/log/basis_acc.JOB.log \ + ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \ + weight-silence-post $silence_weight $silphonelist $dir/final.mdl ark:- ark:- \| \ + gmm-post-to-gpost $dir/final.mdl "$feats" ark:- ark:- \| \ + gmm-basis-fmllr-accs-gpost $spk2utt_opt \ + $dir/final.mdl "$sifeats" ark,s,cs:- $dir/basis.acc.JOB || exit 1; + +# Compute the basis matrices. +$cmd $dir/log/basis_training.log \ + gmm-basis-fmllr-training $dir/final.mdl $dir/fmllr.basis $dir/basis.acc.* || exit 1; +rm $dir/basis.acc.* 2>/dev/null + +exit 0; + diff --git a/egs/kaldi-vystadial-recipe/s5/steps/lmrescore.sh b/egs/kaldi-vystadial-recipe/s5/steps/lmrescore.sh new file mode 100755 index 00000000000..3553a40ea33 --- /dev/null +++ b/egs/kaldi-vystadial-recipe/s5/steps/lmrescore.sh @@ -0,0 +1,117 @@ +#!/bin/bash + +# Begin configuration section. 
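+# mode=1 is the original, inexact method; mode=2 is equivalent to 1 but built
+# from more basic operations; modes 3 and 4 are exact: 3 subtracts the old LM
+# scores and composes in the new LM with a phi (backoff) matcher, and 4
+# discards the old graph scores entirely and rebuilds them from the lexicon,
+# the new grammar and the transition model. See the case statement below.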
+mode=4 +cmd=run.pl +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +for x in `seq 2`; do + [ "$1" == "--cmd" ] && cmd=$2 && shift 2; + [ "$1" == "--mode" ] && mode=$2 && shift 2; +done + +if [ $# != 5 ]; then + echo "Do language model rescoring of lattices (remove old LM, add new LM)" + echo "Usage: steps/lmrescore.sh [options] " + echo "options: [--cmd (run.pl|queue.pl [queue opts])] [--mode (1|2|3|4)]" + exit 1; +fi + +[ -f path.sh ] && . ./path.sh; + +oldlang=$1 +newlang=$2 +data=$3 +indir=$4 +outdir=$5 + +oldlm=$oldlang/G.fst +newlm=$newlang/G.fst +! cmp $oldlang/words.txt $newlang/words.txt && echo "Warning: vocabularies may be incompatible." +[ ! -f $oldlm ] && echo Missing file $oldlm && exit 1; +[ ! -f $newlm ] && echo Missing file $newlm && exit 1; +! ls $indir/lat.*.gz >/dev/null && echo "No lattices input directory $indir" && exit 1; + +oldlmcommand="fstproject --project_output=true $oldlm |" +newlmcommand="fstproject --project_output=true $newlm |" + +mkdir -p $outdir/log + +phi=`grep -w '#0' $newlang/words.txt | awk '{print $2}'` + +if [ "$mode" == 4 ]; then + # we have to prepare $outdir/Ldet.fst in this case: determinized + # lexicon (determinized on phones), with disambig syms removed. + # take L_disambig.fst; get rid of transition with "#0 #0" on it; determinize + # with epsilon removal; remove disambiguation symbols. + fstprint $newlang/L_disambig.fst | awk '{if($4 != '$phi'){print;}}' | fstcompile | \ + fstdeterminizestar | fstrmsymbols $newlang/phones/disambig.int >$outdir/Ldet.fst || exit 1; +fi + +nj=`cat $indir/num_jobs` || exit 1; +cp $indir/num_jobs $outdir + + +#for lat in $indir/lat.*.gz; do +# number=`basename $lat | cut -d. -f2`; +# newlat=$outdir/`basename $lat` + +case "$mode" in + 1) # 1 is inexact, it's the original way of doing it. + $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ + lattice-lmrescore --lm-scale=-1.0 "ark:gunzip -c $indir/lat.JOB.gz|" "$oldlmcommand" ark:- \| \ + lattice-lmrescore --lm-scale=1.0 ark:- "$newlmcommand" "ark,t:|gzip -c>$outdir/lat.JOB.gz" \ + || exit 1; + ;; + 2) # 2 is equivalent to 1, but using more basic operations, combined. + $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ + gunzip -c $indir/lat.JOB.gz \| \ + lattice-scale --acoustic-scale=-1 --lm-scale=-1 ark:- ark:- \| \ + lattice-compose ark:- "fstproject --project_output=true $oldlm |" ark:- \| \ + lattice-determinize ark:- ark:- \| \ + lattice-scale --acoustic-scale=-1 --lm-scale=-1 ark:- ark:- \| \ + lattice-compose ark:- "fstproject --project_output=true $newlm |" ark:- \| \ + lattice-determinize ark:- ark:- \| \ + gzip -c \>$outdir/lat.JOB.gz || exit 1; + ;; + 3) # 3 is "exact" in that we remove the old LM scores accepting any path + # through G.fst (which is what we want as that happened in lattice + # generation), but we add the new one with "phi matcher", only taking + # backoff arcs if an explicit arc did not exist. 
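+ # ($phi is the integer id of the #0 backoff symbol in words.txt; with
+ # --phi-label, composition takes a #0 arc out of a state only when the
+ # label being matched has no explicit arc there, which mirrors the
+ # ARPA backoff semantics.)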
+ $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \
+ gunzip -c $indir/lat.JOB.gz \| \
+ lattice-scale --acoustic-scale=-1 --lm-scale=-1 ark:- ark:- \| \
+ lattice-compose ark:- "fstproject --project_output=true $oldlm |" ark:- \| \
+ lattice-determinize ark:- ark:- \| \
+ lattice-scale --acoustic-scale=-1 --lm-scale=-1 ark:- ark:- \| \
+ lattice-compose --phi-label=$phi ark:- $newlm ark:- \| \
+ lattice-determinize ark:- ark:- \| \
+ gzip -c \>$outdir/lat.JOB.gz || exit 1;
+ ;;
+ 4) # 4 is also exact (like 3), but instead of subtracting the old LM-scores,
+ # it removes the old graph scores entirely and adds in the lexicon,
+ # grammar and transition weights.
+ mdl=`dirname $indir`/final.mdl
+ [ ! -f $mdl ] && echo No such model $mdl && exit 1;
+ $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \
+ gunzip -c $indir/lat.JOB.gz \| \
+ lattice-scale --lm-scale=0.0 ark:- ark:- \| \
+ lattice-to-phone-lattice $mdl ark:- ark:- \| \
+ lattice-compose ark:- $outdir/Ldet.fst ark:- \| \
+ lattice-determinize ark:- ark:- \| \
+ lattice-compose --phi-label=$phi ark:- $newlm ark:- \| \
+ lattice-add-trans-probs --transition-scale=1.0 --self-loop-scale=0.1 \
+ $mdl ark:- ark:- \| \
+ gzip -c \>$outdir/lat.JOB.gz || exit 1;
+ ;;
+esac
+
+rm $outdir/Ldet.fst 2>/dev/null
+
+[ ! -x local/score.sh ] && \
+ echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
+local/score.sh --cmd "$cmd" $data $newlang $outdir
+
+exit 0;
diff --git a/egs/kaldi-vystadial-recipe/s5/steps/make_bn_feats.sh b/egs/kaldi-vystadial-recipe/s5/steps/make_bn_feats.sh
new file mode 100755
index 00000000000..0ee91959d5a
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/steps/make_bn_feats.sh
@@ -0,0 +1,141 @@
+#!/bin/bash
+
+# Copyright 2012 Karel Vesely, Daniel Povey
+# Apache 2.0
+# To be run from .. (one directory up from here)
+# see ../run.sh for example
+
+# Begin configuration section.
+nj=4
+cmd=run.pl
+trim_transforms=4
+# End configuration section.
+
+echo "$0 $@" # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+if [ $# != 5 ]; then
+ echo "usage: $0 [options] <data-dir> <src-data-dir> <nnet-dir> <log-dir> <bn-feat-dir>";
+ echo "options: "
+ echo " --trim-transforms <N> # number of NNet Components to remove from the end"
+ echo " --nj <nj> # number of parallel jobs"
+ echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+ exit 1;
+fi
+
+data=$1
+srcdata=$2
+nndir=$3
+logdir=$4
+bnfeadir=$5
+
+######## CONFIGURATION
+norm_vars=$(cat $nndir/norm_vars)
+splice_opts=$(cat $nndir/splice_opts)
+feat_type=$(cat $nndir/feat_type)
+cmvn_g=$nndir/cmvn_glob.mat
+
+# copy the dataset metadata from srcdata.
+mkdir -p $data || exit 1;
+cp $srcdata/* $data 2>/dev/null; rm $data/feats.scp $data/cmvn.scp 2>/dev/null;
+
+# make $bnfeadir an absolute pathname.
+bnfeadir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $bnfeadir ${PWD}`
+
+# use "name" as part of name of the archive.
+name=`basename $data`
+
+mkdir -p $bnfeadir || exit 1;
+mkdir -p $data || exit 1;
+mkdir -p $logdir || exit 1;
+
+
+srcscp=$srcdata/feats.scp
+scp=$data/feats.scp
+
+required="$srcscp $nndir/final.nnet $cmvn_g $srcdata/cmvn.scp"
+
+for f in $required; do
+ if [ ! -f $f ]; then
+ echo "$0: no such file $f"
+ exit 1;
+ fi
+done
+
+if [ ! -d $srcdata/split$nj -o $srcdata/split$nj -ot $srcdata/feats.scp ]; then
+ utils/split_data.sh $srcdata $nj
+fi
+
+
+# cut the MLP: remove the last $trim_transforms components, so that the
+# forward pass below outputs the bottleneck-layer activations.
+nnet=$bnfeadir/feature_extractor.nnet
+nnet-trim-n-last-transforms --n=$trim_transforms --binary=false $nndir/final.nnet $nnet 2>$logdir/feature_extractor.log
+
+
+rm $data/.error 2>/dev/null
+
+echo "Creating bn-feats into $data"
+
+
+# note: in general, the double-parenthesis construct in bash "((" is "C-style
+# syntax" where we can get rid of the $ for variable names, and omit spaces.
+# The "for" loop in this style is a special construct.
+for ((n=1; n<=nj; n++)); do
+ log=$logdir/make_bnfeats.$n.log
+ # Prepare features : do per-speaker CMVN and splicing
+ feats="ark:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$srcdata/split$nj/$n/utt2spk scp:$srcdata/cmvn.scp scp:$srcdata/split$nj/$n/feats.scp ark:- | splice-feats --print-args=false $splice_opts ark:- ark:- |"
+ # Choose further processing according to feat_type
+ case $feat_type in
+ plain)
+ ;;
+ traps)
+ transf=$nndir/hamm_dct.mat
+ feats="$feats transform-feats --print-args=false $transf ark:- ark:- |"
+ ;;
+ transf)
+ feats="$feats transform-feats $nndir/final.mat ark:- ark:- |"
+ ;;
+ transf-sat)
+ echo "$0: feat_type transf-sat is not yet implemented"
+ exit 1;
+ ;;
+ *)
+ echo "Unknown feature type $feat_type"
+ exit 1;
+ esac
+ # Rescale to zero mean and unit variance
+ feats="$feats apply-cmvn --print-args=false --norm-vars=true $cmvn_g ark:- ark:- |"
+
+ # MLP forward
+ $cmd $log \
+ nnet-forward $nnet "$feats" \
+ ark,scp:$bnfeadir/raw_bnfea_$name.$n.ark,$bnfeadir/raw_bnfea_$name.$n.scp \
+ || touch $data/.error &
+
+done
+wait;
+
+N0=$(cat $srcdata/feats.scp | wc -l)
+N1=$(cat $bnfeadir/raw_bnfea_$name.*.scp | wc -l)
+if [[ -f $data/.error && "$N0" != "$N1" ]]; then
+ echo "Error producing bnfea features for $name:"
+ echo "Original feats : $N0 Bottleneck feats : $N1"
+ exit 1;
+fi
+
+if [[ -f $data/.error ]]; then
+ echo "Warning: a job touched .error while producing bnfea features, but all the $N1 features were computed...";
+fi
+
+# concatenate the .scp files together.
+for ((n=1; n<=nj; n++)); do
+ cat $bnfeadir/raw_bnfea_$name.$n.scp >> $data/feats.scp
+done
+
+
+echo "Succeeded creating MLP-BN features for $name ($data)"
+
diff --git a/egs/kaldi-vystadial-recipe/s5/steps/make_denlats.sh b/egs/kaldi-vystadial-recipe/s5/steps/make_denlats.sh
new file mode 100755
index 00000000000..be0fe5e9fb8
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/steps/make_denlats.sh
@@ -0,0 +1,139 @@
+#!/bin/bash
+# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
+
+# Create denominator lattices for MMI/MPE training.
+# Creates its output in $dir/lat.*.gz
+
+# Begin configuration section.
+nj=4
+cmd=run.pl
+sub_split=1
+beam=13.0
+lattice_beam=7.0
+acwt=0.1
+max_active=5000
+transform_dir=
+max_mem=20000000 # This will stop the processes getting too large.
+# This is in bytes, but not "real" bytes-- you have to multiply
+# by something like 5 or 10 to get real bytes (not sure why so large)
+# End configuration section.
+
+echo "$0 $@" # Print the command line for logging
+
+[ -f ./path.sh ] && . ./path.sh; # source the path.
+. parse_options.sh || exit 1;
+
+if [ $# != 4 ]; then
+ echo "Usage: steps/make_denlats.sh [options] <data-dir> <lang-dir> <src-dir> <exp-dir>"
+ echo " e.g.: steps/make_denlats.sh data/train data/lang exp/tri1 exp/tri1_denlats"
+ echo "Works for (delta|lda) features, and (with --transform-dir option) such features"
+ echo " plus transforms."
+ echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --sub-split # e.g. 40; use this for " + echo " # large databases so your jobs will be smaller and" + echo " # will (individually) finish reasonably soon." + echo " --transform-dir # directory to find fMLLR transforms." + exit 1; +fi + +data=$1 +lang=$2 +srcdir=$3 +dir=$4 + +sdata=$data/split$nj +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + +oov=`cat $lang/oov.int` || exit 1; + +mkdir -p $dir + +cp -r $lang $dir/ + +# Compute grammar FST which corresponds to unigram decoding graph. + +cat $data/text | utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt | \ + awk '{for(n=2;n<=NF;n++){ printf("%s ", $n); } printf("\n"); }' | \ + utils/make_unigram_grammar.pl | fstcompile > $dir/lang/G.fst \ + || exit 1; + +# mkgraph.sh expects a whole directory "lang", so put everything in one directory... +# it gets L_disambig.fst and G.fst (among other things) from $dir/lang, and +# final.mdl from $srcdir; the output HCLG.fst goes in $dir/graph. + + +if [ -s $dir/dengraph/HCLG.fst ]; then + echo "Graph $dir/dengraph/HCLG.fst already exists: skipping graph creation." +else + utils/mkgraph.sh $dir/lang $srcdir $dir/dengraph || exit 1; +fi + +if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "align_si.sh: feature type is $feat_type" + +case $feat_type in + delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" + cp $srcdir/final.mat $dir + ;; + *) echo "Invalid feature type $feat_type" && exit 1; +esac + +if [ ! -z "$transform_dir" ]; then # add transforms to features... + echo "$0: using fMLLR transforms from $transform_dir" + [ ! -f $transform_dir/trans.1 ] && echo "Expected $transform_dir/trans.1 to exist." + [ "`cat $transform_dir/num_jobs`" -ne "$nj" ] \ + && echo "$0: mismatch in number of jobs with $transform_dir" && exit 1; + [ -f $srcdir/final.mat ] && ! cmp $transform_dir/final.mat $srcdir/final.mat && \ + echo "$0: LDA transforms differ between $srcdir and $transform_dir" + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |" +else + if [ -f $srcdir/final.alimdl ]; then + echo "$0: you seem to have a SAT system but you did not supply the --transform-dir option."; + exit 1; + fi +fi + + +if [ $sub_split -eq 1 ]; then + $cmd JOB=1:$nj $dir/log/decode_den.JOB.log \ + gmm-latgen-faster --beam=$beam --lattice-beam=$lattice_beam --acoustic-scale=$acwt \ + --max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $srcdir/final.mdl \ + $dir/dengraph/HCLG.fst "$feats" "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1; +else + for n in `seq $nj`; do + if [ -f $dir/.done.$n ]; then + echo "Not processing subset $n as already done (delete $dir/.done.$n if not)"; + else + sdata2=$data/split$nj/$n/split$sub_split; + if [ ! 
-d $sdata2 ] || [ $sdata2 -ot $sdata/$n/feats.scp ]; then + split_data.sh --per-utt $sdata/$n $sub_split || exit 1; + fi + mkdir -p $dir/log/$n + mkdir -p $dir/part + feats_subset=`echo $feats | sed "s/trans.JOB/trans.$n/g" | sed s:JOB/:$n/split$sub_split/JOB/:g` + $cmd JOB=1:$sub_split $dir/log/$n/decode_den.JOB.log \ + gmm-latgen-faster --beam=$beam --lattice-beam=$lattice_beam --acoustic-scale=$acwt \ + --max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $srcdir/final.mdl \ + $dir/dengraph/HCLG.fst "$feats_subset" "ark:|gzip -c >$dir/lat.$n.JOB.gz" || exit 1; + echo Merging archives for data subset $n + rm $dir/.error 2>/dev/null; + for k in `seq $sub_split`; do + gunzip -c $dir/lat.$n.$k.gz || touch $dir/.error; + done | gzip -c > $dir/lat.$n.gz || touch $dir/.error; + [ -f $dir/.error ] && echo Merging lattices for subset $n failed && exit 1; + rm $dir/lat.$n.*.gz + touch $dir/.done.$n + fi + done +fi + + +echo "$0: done generating denominator lattices." diff --git a/egs/kaldi-vystadial-recipe/s5/steps/make_denlats_sgmm.sh b/egs/kaldi-vystadial-recipe/s5/steps/make_denlats_sgmm.sh new file mode 100755 index 00000000000..a18934d04ed --- /dev/null +++ b/egs/kaldi-vystadial-recipe/s5/steps/make_denlats_sgmm.sh @@ -0,0 +1,157 @@ +#!/bin/bash +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. + +# Create denominator lattices for MMI/MPE training, with SGMM models. If the +# features have fMLLR transforms you have to supply the --transform-dir option. +# It gets any speaker vectors from the "alignment dir" ($alidir). Note: this is +# possibly a slight mismatch because the speaker vectors come from supervised +# adaptation. + +# Begin configuration section. +nj=4 +cmd=run.pl +sub_split=1 +beam=13.0 +lattice_beam=7.0 +acwt=0.1 +max_active=5000 +transform_dir= +max_mem=20000000 # This will stop the processes getting too large. +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + +if [ $# != 4 ]; then + echo "Usage: steps/make_denlats_sgmm.sh [options] " + echo " e.g.: steps/make_denlats_sgmm.sh data/train data/lang exp/sgmm4a_ali exp/sgmm4a_denlats" + echo "Works for (delta|lda) features, and (with --transform-dir option) such features" + echo " plus transforms." + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --sub-split # e.g. 40; use this for " + echo " # large databases so your jobs will be smaller and" + echo " # will (individually) finish reasonably soon." + echo " --transform-dir # directory to find fMLLR transforms." + exit 1; +fi + +data=$1 +lang=$2 +alidir=$3 # could also be $srcdir, but only if no vectors supplied. +dir=$4 + +sdata=$data/split$nj +splice_opts=`cat $alidir/splice_opts 2>/dev/null` +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + +oov=`cat $lang/oov.int` || exit 1; + +mkdir -p $dir + +cp -r $lang $dir/ + +# Compute grammar FST which corresponds to unigram decoding graph. 
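+# (The pipeline below converts the training transcripts to integer symbols,
+# strips the utterance ids, and estimates a unigram grammar over them; a
+# deliberately weak LM like this is usual for denominator lattices, so they
+# are not over-constrained by the language model.)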
+
+cat $data/text | utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt | \
+ awk '{for(n=2;n<=NF;n++){ printf("%s ", $n); } printf("\n"); }' | \
+ utils/make_unigram_grammar.pl | fstcompile > $dir/lang/G.fst \
+ || exit 1;
+
+# mkgraph.sh expects a whole directory "lang", so put everything in one directory...
+# it gets L_disambig.fst and G.fst (among other things) from $dir/lang, and
+# final.mdl from $alidir; the output HCLG.fst goes in $dir/dengraph.
+
+if [ -s $dir/dengraph/HCLG.fst ]; then
+ echo "Graph $dir/dengraph/HCLG.fst already exists: skipping graph creation."
+else
+ utils/mkgraph.sh $dir/lang $alidir $dir/dengraph || exit 1;
+fi
+
+if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
+echo "$0: feature type is $feat_type"
+
+case $feat_type in
+ delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
+ lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
+ cp $alidir/final.mat $dir
+ ;;
+ *) echo "Invalid feature type $feat_type" && exit 1;
+esac
+
+if [ ! -z "$transform_dir" ]; then # add transforms to features...
+ echo "$0: using fMLLR transforms from $transform_dir"
+ [ ! -f $transform_dir/trans.1 ] && echo "Expected $transform_dir/trans.1 to exist." && exit 1;
+ [ "`cat $transform_dir/num_jobs`" -ne "$nj" ] \
+ && echo "$0: mismatch in number of jobs with $transform_dir" && exit 1;
+ [ -f $alidir/final.mat ] && ! cmp $transform_dir/final.mat $alidir/final.mat && \
+ echo "$0: LDA transforms differ between $alidir and $transform_dir"
+ feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
+else
+ echo "Assuming you don't have a SAT system, since no --transform-dir option supplied"
+fi
+
+if [ -f $alidir/gselect.1.gz ]; then
+ gselect_opt="--gselect=ark:gunzip -c $alidir/gselect.JOB.gz|"
+else
+ echo "$0: no such file $alidir/gselect.1.gz" && exit 1;
+fi
+
+if [ -f $alidir/vecs.1 ]; then
+ spkvecs_opt="--spk-vecs=ark:$alidir/vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk"
+else
+ if [ -f $alidir/final.alimdl ]; then
+ echo "You seem to have an SGMM system with speaker vectors,"
+ echo "yet we can't find speaker vectors. Perhaps you supplied"
+ echo "the model directory instead of the alignment directory?"
+ exit 1;
+ fi
+fi
+
+if [ $sub_split -eq 1 ]; then
+ $cmd JOB=1:$nj $dir/log/decode_den.JOB.log \
+ sgmm-latgen-faster $spkvecs_opt "$gselect_opt" --beam=$beam \
+ --lattice-beam=$lattice_beam --acoustic-scale=$acwt \
+ --max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $alidir/final.mdl \
+ $dir/dengraph/HCLG.fst "$feats" "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1;
+else
+ for n in `seq $nj`; do
+ if [ -f $dir/.done.$n ]; then
+ echo "Not processing subset $n as already done (delete $dir/.done.$n if not)";
+ else
+ sdata2=$data/split$nj/$n/split$sub_split;
+ if [ ! -d $sdata2 ] || [ $sdata2 -ot $sdata/$n/feats.scp ]; then
+ split_data.sh --per-utt $sdata/$n $sub_split || exit 1;
+ fi
+ mkdir -p $dir/log/$n
+ mkdir -p $dir/part
+ feats_subset=`echo $feats | sed "s/trans.JOB/trans.$n/g" | sed s:JOB/:$n/split$sub_split/JOB/:g`
+ spkvecs_opt_subset=`echo $spkvecs_opt | sed "s/JOB/$n/g"`
+ gselect_opt_subset=`echo $gselect_opt | sed "s/JOB/$n/g"`
+ $cmd JOB=1:$sub_split $dir/log/$n/decode_den.JOB.log \
+ sgmm-latgen-faster $spkvecs_opt_subset "$gselect_opt_subset" \
+ --beam=$beam --lattice-beam=$lattice_beam \
+ --acoustic-scale=$acwt --max-mem=$max_mem --max-active=$max_active \
+ --word-symbol-table=$lang/words.txt $alidir/final.mdl \
+ $dir/dengraph/HCLG.fst "$feats_subset" "ark:|gzip -c >$dir/lat.$n.JOB.gz" || exit 1;
+ echo Merging archives for data subset $n
+ rm $dir/.error 2>/dev/null;
+ for k in `seq $sub_split`; do
+ gunzip -c $dir/lat.$n.$k.gz || touch $dir/.error;
+ done | gzip -c > $dir/lat.$n.gz || touch $dir/.error;
+ [ -f $dir/.error ] && echo Merging lattices for subset $n failed && exit 1;
+ rm $dir/lat.$n.*.gz
+ touch $dir/.done.$n
+ fi
+ done
+fi
+
+
+echo "$0: done generating denominator lattices with SGMMs."
diff --git a/egs/kaldi-vystadial-recipe/s5/steps/make_denlats_sgmm2.sh b/egs/kaldi-vystadial-recipe/s5/steps/make_denlats_sgmm2.sh
new file mode 100755
index 00000000000..dc5dccdf684
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/steps/make_denlats_sgmm2.sh
@@ -0,0 +1,157 @@
+#!/bin/bash
+# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
+
+# Create denominator lattices for MMI/MPE training, with SGMM models. If the
+# features have fMLLR transforms you have to supply the --transform-dir option.
+# It gets any speaker vectors from the "alignment dir" ($alidir). Note: this is
+# possibly a slight mismatch because the speaker vectors come from supervised
+# adaptation.
+
+# Begin configuration section.
+nj=4
+cmd=run.pl
+sub_split=1
+beam=13.0
+lattice_beam=7.0
+acwt=0.1
+max_active=5000
+transform_dir=
+max_mem=20000000 # This will stop the processes getting too large.
+# End configuration section.
+
+echo "$0 $@" # Print the command line for logging
+
+[ -f ./path.sh ] && . ./path.sh; # source the path.
+. parse_options.sh || exit 1;
+
+if [ $# != 4 ]; then
+ echo "Usage: steps/make_denlats_sgmm2.sh [options] <data-dir> <lang-dir> <ali-dir> <exp-dir>"
+ echo " e.g.: steps/make_denlats_sgmm2.sh data/train data/lang exp/sgmm4a_ali exp/sgmm4a_denlats"
+ echo "Works for (delta|lda) features, and (with --transform-dir option) such features"
+ echo " plus transforms."
+ echo ""
+ echo "Main options (for others, see top of script file)"
+ echo " --config <config-file> # config containing options"
+ echo " --nj <nj> # number of parallel jobs"
+ echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+ echo " --sub-split <n-split> # e.g. 40; use this for "
+ echo " # large databases so your jobs will be smaller and"
+ echo " # will (individually) finish reasonably soon."
+ echo " --transform-dir <transform-dir> # directory to find fMLLR transforms."
+ exit 1;
+fi
+
+data=$1
+lang=$2
+alidir=$3 # could also be $srcdir, but only if no vectors supplied.
+dir=$4
+
+sdata=$data/split$nj
+splice_opts=`cat $alidir/splice_opts 2>/dev/null`
+mkdir -p $dir/log
+[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
+echo $nj > $dir/num_jobs
+
+oov=`cat $lang/oov.int` || exit 1;
+
+mkdir -p $dir
+
+cp -r $lang $dir/
+
+# Compute grammar FST which corresponds to unigram decoding graph.
+ +cat $data/text | utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt | \ + awk '{for(n=2;n<=NF;n++){ printf("%s ", $n); } printf("\n"); }' | \ + utils/make_unigram_grammar.pl | fstcompile > $dir/lang/G.fst \ + || exit 1; + +# mkgraph.sh expects a whole directory "lang", so put everything in one directory... +# it gets L_disambig.fst and G.fst (among other things) from $dir/lang, and +# final.mdl from $alidir; the output HCLG.fst goes in $dir/graph. + +if [ -s $dir/dengraph/HCLG.fst ]; then + echo "Graph $dir/dengraph/HCLG.fst already exists: skipping graph creation." +else + utils/mkgraph.sh $dir/lang $alidir $dir/dengraph || exit 1; +fi + +if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "align_si.sh: feature type is $feat_type" + +case $feat_type in + delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |" + cp $alidir/final.mat $dir + ;; + *) echo "Invalid feature type $feat_type" && exit 1; +esac + +if [ ! -z "$transform_dir" ]; then # add transforms to features... + echo "$0: using fMLLR transforms from $transform_dir" + [ ! -f $transform_dir/trans.1 ] && echo "Expected $transform_dir/trans.1 to exist." + [ "`cat $transform_dir/num_jobs`" -ne "$nj" ] \ + && echo "$0: mismatch in number of jobs with $transform_dir" && exit 1; + [ -f $alidir/final.mat ] && ! cmp $transform_dir/final.mat $alidir/final.mat && \ + echo "$0: LDA transforms differ between $alidir and $transform_dir" + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |" +else + echo "Assuming you don't have a SAT system, since no --transform-dir option supplied " +fi + +if [ -f $alidir/gselect.1.gz ]; then + gselect_opt="--gselect=ark:gunzip -c $alidir/gselect.JOB.gz|" +else + echo "$0: no such file $alidir/gselect.1.gz" && exit 1; +fi + +if [ -f $alidir/vecs.1 ]; then + spkvecs_opt="--spk-vecs=ark:$alidir/vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk" +else + if [ -f $alidir/final.alimdl ]; then + echo "$0: You seem to have an SGMM system with speaker vectors," + echo "yet we can't find speaker vectors. Perhaps you supplied" + echo "the model director instead of the alignment directory?" + exit 1; + fi +fi + +if [ $sub_split -eq 1 ]; then + $cmd JOB=1:$nj $dir/log/decode_den.JOB.log \ + sgmm2-latgen-faster $spkvecs_opt "$gselect_opt" --beam=$beam \ + --lattice-beam=$lattice_beam --acoustic-scale=$acwt \ + --max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $alidir/final.mdl \ + $dir/dengraph/HCLG.fst "$feats" "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1; +else + for n in `seq $nj`; do + if [ -f $dir/.done.$n ]; then + echo "Not processing subset $n as already done (delete $dir/.done.$n if not)"; + else + sdata2=$data/split$nj/$n/split$sub_split; + if [ ! 
-d $sdata2 ] || [ $sdata2 -ot $sdata/$n/feats.scp ]; then + split_data.sh --per-utt $sdata/$n $sub_split || exit 1; + fi + mkdir -p $dir/log/$n + mkdir -p $dir/part + feats_subset=`echo $feats | sed "s/trans.JOB/trans.$n/g" | sed s:JOB/:$n/split$sub_split/JOB/:g` + spkvecs_opt_subset=`echo $spkvecs_opt | sed "s/JOB/$n/g"` + gselect_opt_subset=`echo $gselect_opt | sed "s/JOB/$n/g"` + $cmd JOB=1:$sub_split $dir/log/$n/decode_den.JOB.log \ + sgmm2-latgen-faster $spkvecs_opt_subset "$gselect_opt_subset" \ + --beam=$beam --lattice-beam=$lattice_beam \ + --acoustic-scale=$acwt --max-mem=$max_mem --max-active=$max_active \ + --word-symbol-table=$lang/words.txt $alidir/final.mdl \ + $dir/dengraph/HCLG.fst "$feats_subset" "ark:|gzip -c >$dir/lat.$n.JOB.gz" || exit 1; + echo Merging archives for data subset $n + rm $dir/.error 2>/dev/null; + for k in `seq $sub_split`; do + gunzip -c $dir/lat.$n.$k.gz || touch $dir/.error; + done | gzip -c > $dir/lat.$n.gz || touch $dir/.error; + [ -f $dir/.error ] && echo Merging lattices for subset $n failed && exit 1; + rm $dir/lat.$n.*.gz + touch $dir/.done.$n + fi + done +fi + + +echo "$0: done generating denominator lattices with SGMMs." diff --git a/egs/kaldi-vystadial-recipe/s5/steps/make_fbank.sh b/egs/kaldi-vystadial-recipe/s5/steps/make_fbank.sh new file mode 100755 index 00000000000..d482e4737ae --- /dev/null +++ b/egs/kaldi-vystadial-recipe/s5/steps/make_fbank.sh @@ -0,0 +1,111 @@ +#!/bin/bash + +# Copyright 2012 Karel Vesely Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 +# To be run from .. (one directory up from here) +# see ../run.sh for example + +# Begin configuration section. +nj=4 +cmd=run.pl +fbank_config=conf/fbank.conf +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "usage: make_fbank.sh [options] "; + echo "options: " + echo " --fbank-config # config passed to compute-fbank-feats " + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + exit 1; +fi + +data=$1 +logdir=$2 +fbankdir=$3 + + +# make $fbankdir an absolute pathname. +fbankdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $fbankdir ${PWD}` + +# use "name" as part of name of the archive. +name=`basename $data` + +mkdir -p $fbankdir || exit 1; +mkdir -p $logdir || exit 1; + +scp=$data/wav.scp + +required="$scp $fbank_config" + +for f in $required; do + if [ ! -f $f ]; then + echo "make_fbank.sh: no such file $f" + exit 1; + fi +done + +# note: in general, the double-parenthesis construct in bash "((" is "C-style +# syntax" where we can get rid of the $ for variable names, and omit spaces. +# The "for" loop in this style is a special construct. + + +if [ -f $data/segments ]; then + echo "$0 [info]: segments file exists: using that." + split_segments="" + for ((n=1; n<=nj; n++)); do + split_segments="$split_segments $logdir/segments.$n" + done + + utils/split_scp.pl $data/segments $split_segments || exit 1; + rm $logdir/.error 2>/dev/null + + $cmd JOB=1:$nj $logdir/make_fbank.JOB.log \ + extract-segments scp:$scp $logdir/segments.JOB ark:- \| \ + compute-fbank-feats --verbose=2 --config=$fbank_config ark:- \ + ark,scp:$fbankdir/raw_fbank_$name.JOB.ark,$fbankdir/raw_fbank_$name.JOB.scp \ + || exit 1; + +else + echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance." 
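+ # wav.scp is split into nj pieces and each job writes an archive plus a
+ # matching scp index (the "ark,scp:..." output below); the per-job scp
+ # files are concatenated into $data/feats.scp at the end of the script.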
+ split_scps=""
+ for ((n=1; n<=nj; n++)); do
+ split_scps="$split_scps $logdir/wav.$n.scp"
+ done
+
+ utils/split_scp.pl $scp $split_scps || exit 1;
+
+ $cmd JOB=1:$nj $logdir/make_fbank.JOB.log \
+ compute-fbank-feats --verbose=2 --config=$fbank_config scp:$logdir/wav.JOB.scp \
+ ark,scp:$fbankdir/raw_fbank_$name.JOB.ark,$fbankdir/raw_fbank_$name.JOB.scp \
+ || exit 1;
+
+fi
+
+
+if [ -f $logdir/.error.$name ]; then
+ echo "Error producing fbank features for $name:"
+ tail $logdir/make_fbank.*.log
+ exit 1;
+fi
+
+# concatenate the .scp files together.
+for ((n=1; n<=nj; n++)); do
+ cat $fbankdir/raw_fbank_$name.$n.scp || exit 1;
+done > $data/feats.scp
+
+rm $logdir/wav.*.scp $logdir/segments.* 2>/dev/null
+
+nf=`cat $data/feats.scp | wc -l`
+nu=`cat $data/utt2spk | wc -l`
+if [ $nf -ne $nu ]; then
+ echo "It seems not all of the feature files were successfully processed ($nf != $nu);"
+ echo "consider using utils/fix_data_dir.sh $data"
+fi
+
+echo "Succeeded creating filterbank features for $name"
diff --git a/egs/kaldi-vystadial-recipe/s5/steps/make_mfcc.sh b/egs/kaldi-vystadial-recipe/s5/steps/make_mfcc.sh
new file mode 100755
index 00000000000..5951bf96fc0
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/steps/make_mfcc.sh
@@ -0,0 +1,111 @@
+#!/bin/bash
+
+# Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
+# Apache 2.0
+# To be run from .. (one directory up from here)
+# see ../run.sh for example
+
+# Begin configuration section.
+nj=4
+cmd=run.pl
+mfcc_config=conf/mfcc.conf
+# End configuration section.
+
+echo "$0 $@" # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+if [ $# != 3 ]; then
+ echo "usage: make_mfcc.sh [options] <data-dir> <log-dir> <path-to-mfccdir>";
+ echo "options: "
+ echo " --mfcc-config <config-file> # config passed to compute-mfcc-feats "
+ echo " --nj <nj> # number of parallel jobs"
+ echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+ exit 1;
+fi
+
+data=$1
+logdir=$2
+mfccdir=$3
+
+
+# make $mfccdir an absolute pathname.
+mfccdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $mfccdir ${PWD}`
+
+# use "name" as part of name of the archive.
+name=`basename $data`
+
+mkdir -p $mfccdir || exit 1;
+mkdir -p $logdir || exit 1;
+
+scp=$data/wav.scp
+
+required="$scp $mfcc_config"
+
+for f in $required; do
+ if [ ! -f $f ]; then
+ echo "make_mfcc.sh: no such file $f"
+ exit 1;
+ fi
+done
+
+# note: in general, the double-parenthesis construct in bash "((" is "C-style
+# syntax" where we can get rid of the $ for variable names, and omit spaces.
+# The "for" loop in this style is a special construct.
+
+
+if [ -f $data/segments ]; then
+ echo "$0 [info]: segments file exists: using that."
+ split_segments=""
+ for ((n=1; n<=nj; n++)); do
+ split_segments="$split_segments $logdir/segments.$n"
+ done
+
+ utils/split_scp.pl $data/segments $split_segments || exit 1;
+ rm $logdir/.error 2>/dev/null
+
+ $cmd JOB=1:$nj $logdir/make_mfcc.JOB.log \
+ extract-segments scp:$scp $logdir/segments.JOB ark:- \| \
+ compute-mfcc-feats --verbose=2 --config=$mfcc_config ark:- \
+ ark,scp:$mfccdir/raw_mfcc_$name.JOB.ark,$mfccdir/raw_mfcc_$name.JOB.scp \
+ || exit 1;
+
+else
+ echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance."
+ split_scps=""
+ for ((n=1; n<=nj; n++)); do
+ split_scps="$split_scps $logdir/wav.$n.scp"
+ done
+
+ utils/split_scp.pl $scp $split_scps || exit 1;
+
+ $cmd JOB=1:$nj $logdir/make_mfcc.JOB.log \
+ compute-mfcc-feats --verbose=2 --config=$mfcc_config scp:$logdir/wav.JOB.scp \
+ ark,scp:$mfccdir/raw_mfcc_$name.JOB.ark,$mfccdir/raw_mfcc_$name.JOB.scp \
+ || exit 1;
+
+fi
+
+
+if [ -f $logdir/.error.$name ]; then
+ echo "Error producing mfcc features for $name:"
+ tail $logdir/make_mfcc.*.log
+ exit 1;
+fi
+
+# concatenate the .scp files together.
+for ((n=1; n<=nj; n++)); do
+ cat $mfccdir/raw_mfcc_$name.$n.scp || exit 1;
+done > $data/feats.scp
+
+rm $logdir/wav.*.scp $logdir/segments.* 2>/dev/null
+
+nf=`cat $data/feats.scp | wc -l`
+nu=`cat $data/utt2spk | wc -l`
+if [ $nf -ne $nu ]; then
+ echo "It seems not all of the feature files were successfully processed ($nf != $nu);"
+ echo "consider using utils/fix_data_dir.sh $data"
+fi
+
+echo "Succeeded creating MFCC features for $name"
diff --git a/egs/kaldi-vystadial-recipe/s5/steps/make_plp.sh b/egs/kaldi-vystadial-recipe/s5/steps/make_plp.sh
new file mode 100755
index 00000000000..0e58e9aa058
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/steps/make_plp.sh
@@ -0,0 +1,111 @@
+#!/bin/bash
+
+# Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
+# Apache 2.0
+# To be run from .. (one directory up from here)
+# see ../run.sh for example
+
+# Begin configuration section.
+nj=4
+cmd=run.pl
+plp_config=conf/plp.conf
+# End configuration section.
+
+echo "$0 $@" # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+if [ $# != 3 ]; then
+ echo "usage: make_plp.sh [options] <data-dir> <log-dir> <path-to-plpdir>";
+ echo "options: "
+ echo " --plp-config <config-file> # config passed to compute-plp-feats "
+ echo " --nj <nj> # number of parallel jobs"
+ echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+ exit 1;
+fi
+
+data=$1
+logdir=$2
+plpdir=$3
+
+
+# make $plpdir an absolute pathname.
+plpdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $plpdir ${PWD}`
+
+# use "name" as part of name of the archive.
+name=`basename $data`
+
+mkdir -p $plpdir || exit 1;
+mkdir -p $logdir || exit 1;
+
+scp=$data/wav.scp
+
+required="$scp $plp_config"
+
+for f in $required; do
+ if [ ! -f $f ]; then
+ echo "make_plp.sh: no such file $f"
+ exit 1;
+ fi
+done
+
+# note: in general, the double-parenthesis construct in bash "((" is "C-style
+# syntax" where we can get rid of the $ for variable names, and omit spaces.
+# The "for" loop in this style is a special construct.
+
+
+if [ -f $data/segments ]; then
+ echo "$0 [info]: segments file exists: using that."
+ split_segments=""
+ for ((n=1; n<=nj; n++)); do
+ split_segments="$split_segments $logdir/segments.$n"
+ done
+
+ utils/split_scp.pl $data/segments $split_segments || exit 1;
+ rm $logdir/.error 2>/dev/null
+
+ $cmd JOB=1:$nj $logdir/make_plp.JOB.log \
+ extract-segments scp:$scp $logdir/segments.JOB ark:- \| \
+ compute-plp-feats --verbose=2 --config=$plp_config ark:- \
+ ark,scp:$plpdir/raw_plp_$name.JOB.ark,$plpdir/raw_plp_$name.JOB.scp \
+ || exit 1;
+
+else
+ echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance."
+ split_scps=""
+ for ((n=1; n<=nj; n++)); do
+ split_scps="$split_scps $logdir/wav.$n.scp"
+ done
+
+ utils/split_scp.pl $scp $split_scps || exit 1;
+
+ $cmd JOB=1:$nj $logdir/make_plp.JOB.log \
+ compute-plp-feats --verbose=2 --config=$plp_config scp:$logdir/wav.JOB.scp \
+ ark,scp:$plpdir/raw_plp_$name.JOB.ark,$plpdir/raw_plp_$name.JOB.scp \
+ || exit 1;
+
+fi
+
+
+if [ -f $logdir/.error.$name ]; then
+ echo "Error producing plp features for $name:"
+ tail $logdir/make_plp.*.log
+ exit 1;
+fi
+
+# Concatenate the per-job .scp files together: write them to stdout and let
+# the single redirection on "done" collect them.
+for ((n=1; n<=nj; n++)); do
+ cat $plpdir/raw_plp_$name.$n.scp || exit 1;
+done > $data/feats.scp
+
+rm $logdir/wav.*.scp $logdir/segments.* 2>/dev/null
+
+nf=`cat $data/feats.scp | wc -l`
+nu=`cat $data/utt2spk | wc -l`
+if [ $nf -ne $nu ]; then
+ echo "It seems not all of the feature files were successfully processed ($nf != $nu);"
+ echo "consider using utils/fix_data_dir.sh $data"
+fi
+
+echo "Succeeded creating PLP features for $name"
diff --git a/egs/kaldi-vystadial-recipe/s5/steps/mixup.sh b/egs/kaldi-vystadial-recipe/s5/steps/mixup.sh
new file mode 100755
index 00000000000..f22d51244ca
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/steps/mixup.sh
@@ -0,0 +1,146 @@
+#!/bin/bash
+
+# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
+
+# mix up (or down); do 3 iters of model training; realign; then do two more
+# iterations of model training.
+
+# Begin configuration section.
+cmd=run.pl
+beam=10
+retry_beam=40
+boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment
+scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+num_iters=5
+realign_iters=3 # Space-separated list of iterations to realign on.
+stage=0
+# End configuration section.
+
+echo "$0 $@" # Print the command line for logging
+
+[ -f path.sh ] && . ./path.sh;
+. parse_options.sh || exit 1;
+
+if [ $# != 5 ]; then
+ echo "Usage: steps/mixup.sh <num-gauss> <data-dir> <lang-dir> <old-exp-dir> <exp-dir>"
+ echo " e.g.: steps/mixup.sh 20000 data/train_si84 data/lang exp/tri3b exp/tri3b_20k"
+ echo "main options (for others, see top of script file)"
+ echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+ echo " --config <config-file> # config containing options"
+ echo " --stage <stage> # stage to do partial re-run from."
+ exit 1;
+fi
+
+numgauss=$1
+data=$2
+lang=$3
+srcdir=$4
+dir=$5
+
+for f in $data/feats.scp $srcdir/final.mdl $srcdir/final.mat; do
+ [ ! -f $f ] && echo "mixup.sh: no such file $f" && exit 1;
+done
+
+nj=`cat $srcdir/num_jobs` || exit 1;
+sdata=$data/split$nj;
+splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
+
+mkdir -p $dir/log
+cp $srcdir/splice_opts $dir 2>/dev/null
+echo $nj > $dir/num_jobs
+[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
+
+cp $srcdir/tree $dir
+
+
+## Set up features.
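+# The presence of final.mat marks an LDA+MLLT (or LDA+MLLT+SAT) system;
+# otherwise delta features are assumed. $sifeats below are the
+# speaker-independent features; fMLLR transforms from $srcdir are applied
+# on top of them if they exist.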
+if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
+echo "$0: feature type is $feat_type"
+
+case $feat_type in
+ delta) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
+ lda) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
+ cp $srcdir/final.mat $dir
+ ;;
+ *) echo "Invalid feature type $feat_type" && exit 1;
+esac
+if [ -f $srcdir/trans.1 ]; then
+ echo Using transforms from $srcdir;
+ rm $dir/trans.* 2>/dev/null
+ ln.pl $srcdir/trans.* $dir # Link those transforms to current directory.
+ feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |"
+else
+ feats="$sifeats"
+fi
+## Done setting up features.
+
+rm $dir/fsts.*.gz 2>/dev/null
+ln.pl $srcdir/fsts.*.gz $dir # Link training-graph FSTs to current directory.
+
+## Mix up old model
+if [ $stage -le 0 ]; then
+ echo Mixing up old model to $numgauss Gaussians
+# Note: this script also works for mixing down.
+ $cmd $dir/log/mixup.log \
+ gmm-mixup --mix-up=$numgauss --mix-down=$numgauss \
+ $srcdir/final.mdl $srcdir/final.occs $dir/1.mdl || exit 1;
+fi
+## Done.
+
+cur_alidir=$srcdir # dir to find alignments.
+[ -z "$realign_iters" ] && ln.pl $srcdir/ali.*.gz $dir; # link alignments, if
+ # we won't be generating them.
+
+x=1
+while [ $x -le $num_iters ]; do
+ echo "$0: iteration $x"
+ if echo $realign_iters | grep -w $x >/dev/null; then
+ if [ $stage -le $x ]; then
+ echo "$0: realigning data"
+ mdl="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $dir/$x.mdl - |"
+ $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \
+ gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$mdl" \
+ "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \
+ "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
+ fi
+ cur_alidir=$dir
+ fi
+ if [ $stage -le $x ]; then
+ echo "$0: accumulating statistics"
+ $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
+ gmm-acc-stats-ali $dir/$x.mdl "$feats" \
+ "ark,s,cs:gunzip -c $cur_alidir/ali.JOB.gz|" $dir/$x.JOB.acc || exit 1;
+ echo "$0: re-estimating model"
+ [ "`ls $dir/$x.*.acc | wc -w`" -ne $nj ] && echo "$0: wrong #accs" && exit 1;
+ $cmd $dir/log/update.$x.log \
+ gmm-est --write-occs=$dir/$[$x+1].occs $dir/$x.mdl \
+ "gmm-sum-accs - $dir/$x.*.acc |" $dir/$[$x+1].mdl || exit 1;
+ rm $dir/$x.mdl $dir/$x.*.acc
+ rm $dir/$x.occs 2>/dev/null
+ fi
+ x=$[$x+1]
+done
+
+rm $dir/final.mdl $dir/final.occs 2>/dev/null
+ln -s $x.mdl $dir/final.mdl
+ln -s $x.occs $dir/final.occs
+
+if [ -f $dir/trans.1 ]; then
+ echo "$0: accumulating stats for alignment model."
+ $cmd JOB=1:$nj $dir/log/acc_alimdl.JOB.log \
+ ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \
+ gmm-acc-stats-twofeats $dir/$x.mdl "$feats" "$sifeats" \
+ ark,s,cs:- $dir/$x.JOB.acc || exit 1;
+ [ "`ls $dir/$x.*.acc | wc -w`" -ne $nj ] && echo "$0: wrong #accs" && exit 1;
+ echo "$0: Re-estimating alignment model."
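+ # The alignment model is estimated on top of the speaker-independent
+ # features ($sifeats, via gmm-acc-stats-twofeats above), so it can later
+ # be used to get first-pass alignments for data that has no fMLLR
+ # transforms yet.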
+ $cmd $dir/log/est_alimdl.log \
+ gmm-est --write-occs=$dir/final.occs --remove-low-count-gaussians=false $dir/$x.mdl \
+ "gmm-sum-accs - $dir/$x.*.acc|" $dir/$x.alimdl || exit 1;
+ rm $dir/$x.*.acc
+ rm $dir/final.alimdl 2>/dev/null
+ ln -s $x.alimdl $dir/final.alimdl
+fi
+
+utils/summarize_warnings.pl $dir/log
+
+echo Done
diff --git a/egs/kaldi-vystadial-recipe/s5/steps/rnnlmrescore.sh b/egs/kaldi-vystadial-recipe/s5/steps/rnnlmrescore.sh
new file mode 100755
index 00000000000..e204e1acd65
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/steps/rnnlmrescore.sh
@@ -0,0 +1,176 @@
+#!/bin/bash
+
+
+# Begin configuration section.
+N=10
+inv_acwt=12
+cmd=run.pl
+use_phi=false # This is kind of an obscure option. If true, we'll remove the old
+ # LM weights (times 1-RNN_scale) using a phi (failure) matcher, which is
+ # appropriate if the old LM weights were added in this way, e.g. by
+ # lmrescore.sh. Otherwise we'll use normal composition, which is appropriate
+ # if the lattices came directly from decoding. This won't actually make much
+ # difference (if any) to WER, it's more so we know we are doing the right thing.
+test=false # Activate a testing option.
+stage=1 # Stage of this script, for partial reruns.
+# End configuration section.
+
+echo "$0 $@" # Print the command line for logging
+
+[ -f ./path.sh ] && . ./path.sh
+. utils/parse_options.sh
+
+
+if [ $# != 6 ]; then
+ echo "Do language model rescoring of lattices (partially remove old LM, add new LM)"
+ echo "This version applies an RNNLM and mixes it with the LM scores"
+ echo "previously in the lattices, controlled by the first parameter (rnnlm-weight)."
+ echo ""
+ echo "Usage: utils/rnnlmrescore.sh <rnn-weight> <old-lang-dir> <rnn-dir> <data-dir> <input-decode-dir> <output-decode-dir>"
+ echo "Main options:"
+ echo " --inv-acwt <inv-acwt> # default 12. e.g. --inv-acwt 17. Equivalent to LM scale to use."
+ echo " # for N-best list generation... note, we'll score at different acwt's"
+ echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+ echo " --use-phi (true|false) # Should be set to true if the source lattices were created"
+ echo " # by lmrescore.sh, false if they came from decoding."
+ echo " --N <N> # Value of N in N-best rescoring (default: 10)"
+ exit 1;
+fi
+
+
+
+rnnweight=$1
+oldlang=$2
+rnndir=$3
+data=$4
+indir=$5
+dir=$6
+
+
+acwt=`perl -e "print (1.0/$inv_acwt);"` # Note: we'll actually produce lattices
+ # that will be scored at a range of acoustic weights. This acwt should be close
+ # to the final one we'll pick, though, for best performance (it controls the
+ # N-best list generation).
+
+for f in $oldlang/G.fst $rnndir/rnnlm $data/feats.scp $indir/lat.1.gz; do
+ [ ! -f $f ] && echo "$0: expected file $f to exist." && exit 1;
+done
+
+nj=`cat $indir/num_jobs` || exit 1;
+oldlm=$oldlang/G.fst
+adir=$dir/archives
+
+mkdir -p $dir;
+phi=`grep -w '#0' $oldlang/words.txt | awk '{print $2}'`
+
+rm $dir/.error 2>/dev/null
+mkdir -p $dir/log
+
+# First convert lattice to N-best. Be careful because this
+# will be quite sensitive to the acoustic scale; this should be close
+# to the one we'll finally get the best WERs with.
+# Note: the lattice-rmali part here is just because we don't
+# need the alignments for what we're doing.
+if [ $stage -le 1 ]; then
+ echo "$0: converting lattices to N-best."
+ $cmd JOB=1:$nj $dir/log/lat2nbest.JOB.log \
+ lattice-to-nbest --acoustic-scale=$acwt --n=$N \
+ "ark:gunzip -c $indir/lat.JOB.gz|" ark:- \| \
+ lattice-rmali ark:- "ark:|gzip -c >$dir/nbest1.JOB.gz" || exit 1;
+fi
+
+# next remove part of the old LM probs.
+if $use_phi; then
+ if [ $stage -le 2 ]; then
+ echo "$0: removing old LM scores."
+ # Use the phi-matcher style of composition.. this is appropriate + # if the old LM scores were added e.g. by lmrescore.sh, using + # phi-matcher composition. + $cmd JOB=1:$nj $dir/log/remove_old.JOB.log \ + lattice-compose --phi-label=$phi "ark:gunzip -c $dir/nbest1.JOB.gz|" $oldlm \ + "ark:|gzip -c >$dir/nbest2.JOB.gz" || exit 1; + fi +else + if [ $stage -le 2 ]; then + echo "$0: removing old LM scores." + # this approach chooses the best path through the old LM FST, while + # subtracting the old scores. If the lattices came straight from decoding, + # this is what we want. + $cmd JOB=1:$nj $dir/log/remove_old.JOB.log \ + lattice-scale --acoustic-scale=-1 --lm-scale=-1 "ark:gunzip -c $dir/nbest1.JOB.gz|" ark:- \| \ + lattice-compose ark:- "fstproject --project_output=true $oldlm |" ark:- \| \ + lattice-1best ark:- ark:- \| \ + lattice-scale --acoustic-scale=-1 --lm-scale=-1 ark:- "ark:|gzip -c >$dir/nbest2.JOB.gz" \ + || exit 1; + fi +fi + +if [ $stage -le 3 ]; then +# Decompose the n-best lists into 4 archives. + echo "$0: creating separate-archive form of N-best lists." + $cmd JOB=1:$nj $dir/log/make_new_archives.JOB.log \ + mkdir -p $adir.JOB '&&' \ + nbest-to-linear "ark:gunzip -c $dir/nbest2.JOB.gz|" \ + "ark,t:$adir.JOB/ali" "ark,t:$adir.JOB/words" \ + "ark,t:$adir.JOB/lmwt.nolm" "ark,t:$adir.JOB/acwt" || exit 1; +fi + +if [ $stage -le 4 ]; then + echo "$0: doing the same with old LM scores." +# Create an archive with the LM scores before we +# removed the LM probs (will help us do interpolation). +$cmd JOB=1:$nj $dir/log/make_old_archives.JOB.log \ + nbest-to-linear "ark:gunzip -c $dir/nbest1.JOB.gz|" "ark:/dev/null" \ + "ark:/dev/null" "ark,t:$adir.JOB/lmwt.withlm" "ark:/dev/null" || exit 1; +fi + +if $test; then # This branch is a sanity check that at the acwt where we generated + # the N-best list, we get the same WER. + echo "$0 [testing branch]: generating lattices without changing scores." + $cmd JOB=1:$nj $dir/log/test.JOB.log \ + linear-to-nbest "ark:$adir.JOB/ali" "ark:$adir.JOB/words" "ark:$adir.JOB/lmwt.withlm" \ + "ark:$adir.JOB/acwt" ark:- \| \ + nbest-to-lattice ark:- "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1; + exit 0; +fi + +if [ $stage -le 5 ]; then + echo "$0: Creating archives with text-form of words, and LM scores without graph scores." + # Do some small tasks; for these we don't use the queue, it will only slow us down. + for n in `seq $nj`; do + utils/int2sym.pl -f 2- $oldlang/words.txt < $adir.$n/words > $adir.$n/words_text || exit 1; + mkdir -p $adir.$n/temp + paste $adir.$n/lmwt.nolm $adir.$n/lmwt.withlm | awk '{print $1, ($4-$2);}' > \ + $adir.$n/lmwt.lmonly || exit 1; + done +fi +if [ $stage -le 6 ]; then + echo "$0: invoking rnnlm_compute_scores.sh which calls rnnlm, to get RNN LM scores." + $cmd JOB=1:$nj $dir/log/rnnlm_compute_scores.JOB.log \ + utils/rnnlm_compute_scores.sh $rnndir $adir.JOB/temp $adir.JOB/words_text $adir.JOB/lmwt.rnn \ + || exit 1; +fi +if [ $stage -le 7 ]; then + echo "$0: reconstructing total LM+graph scores including interpolation of RNNLM and old LM scores." + for n in `seq $nj`; do + paste $adir.$n/lmwt.nolm $adir.$n/lmwt.lmonly $adir.$n/lmwt.rnn | awk -v rnnweight=$rnnweight \ + '{ key=$1; graphscore=$2; lmscore=$4; rnnscore=$6; + score = graphscore+(rnnweight*rnnscore)+((1-rnnweight)*lmscore); + print $1,score; } ' > $adir.$n/lmwt.interp.$rnnweight || exit 1; + done +fi + +if [ $stage -le 8 ]; then + echo "$0: reconstructing archives back into lattices." 
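+ # linear-to-nbest reassembles each N-best entry from the four archives
+ # (alignments, words, interpolated LM+graph scores, acoustic scores), and
+ # nbest-to-lattice merges the entries for each utterance back into a
+ # single lattice that can be scored as usual.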
+ $cmd JOB=1:$nj $dir/log/reconstruct_lattice.JOB.log \
+ linear-to-nbest "ark:$adir.JOB/ali" "ark:$adir.JOB/words" \
+ "ark:$adir.JOB/lmwt.interp.$rnnweight" "ark:$adir.JOB/acwt" ark:- \| \
+ nbest-to-lattice ark:- "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1;
+fi
+
+[ ! -x local/score.sh ] && \
+ echo "Not scoring because local/score.sh does not exist or is not executable." && exit 1;
+local/score.sh --cmd "$cmd" $data $oldlang $dir
+
+exit 0;
+
diff --git a/egs/kaldi-vystadial-recipe/s5/steps/train_deltas.sh b/egs/kaldi-vystadial-recipe/s5/steps/train_deltas.sh
new file mode 100755
index 00000000000..daa23acddec
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/steps/train_deltas.sh
@@ -0,0 +1,142 @@
+#!/bin/bash
+
+# Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
+# Apache 2.0
+
+# Begin configuration.
+stage=-4 # This allows restarting partway through, when something went wrong.
+config=
+cmd=run.pl
+scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+realign_iters="10 20 30";
+num_iters=35 # Number of iterations of training
+max_iter_inc=25 # Last iter to increase #Gauss on.
+beam=10
+retry_beam=40
+boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment
+power=0.2 # Exponent for number of gaussians according to occurrence counts
+cluster_thresh=-1 # for build-tree: controls the final bottom-up clustering of leaves
+# End configuration.
+
+echo "$0 $@" # Print the command line for logging
+
+[ -f path.sh ] && . ./path.sh;
+. parse_options.sh || exit 1;
+
+if [ $# != 6 ]; then
+ echo "Usage: steps/train_deltas.sh <num-leaves> <tot-gauss> <data-dir> <lang-dir> <alignment-dir> <exp-dir>"
+ echo "e.g.: steps/train_deltas.sh 2000 10000 data/train_si84_half data/lang exp/mono_ali exp/tri1"
+ echo "main options (for others, see top of script file)"
+ echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+ echo " --config <config-file> # config containing options"
+ echo " --stage <stage> # stage to do partial re-run from."
+ exit 1;
+fi
+
+numleaves=$1
+totgauss=$2
+data=$3
+lang=$4
+alidir=$5
+dir=$6
+
+for f in $alidir/final.mdl $alidir/ali.1.gz $data/feats.scp $lang/phones.txt; do
+ [ ! -f $f ] && echo "train_deltas.sh: no such file $f" && exit 1;
+done
+
+numgauss=$numleaves
+incgauss=$[($totgauss-$numgauss)/$max_iter_inc] # per-iter increment for #Gauss
+oov=`cat $lang/oov.int` || exit 1;
+ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1;
+nj=`cat $alidir/num_jobs` || exit 1;
+mkdir -p $dir/log
+echo $nj > $dir/num_jobs
+
+sdata=$data/split$nj;
+[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
+
+feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |"
+
+rm $dir/.error 2>/dev/null
+
+if [ $stage -le -3 ]; then
+ echo "$0: accumulating tree stats"
+ $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \
+ acc-tree-stats --ci-phones=$ciphonelist $alidir/final.mdl "$feats" \
+ "ark:gunzip -c $alidir/ali.JOB.gz|" $dir/JOB.treeacc || exit 1;
+ sum-tree-stats $dir/treeacc $dir/*.treeacc 2>$dir/log/sum_tree_acc.log || exit 1;
+ rm $dir/*.treeacc
+fi
+
+if [ $stage -le -2 ]; then
+ echo "$0: getting questions for tree-building, via clustering"
+ # preparing questions, roots file...
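+ # cluster-phones derives the "questions" (sets of phones) automatically by
+ # clustering the tree statistics; the hand-specified extra_questions.int
+ # from the lang directory is appended before compile-questions turns them
+ # into the form build-tree expects.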
+ cluster-phones $dir/treeacc $lang/phones/sets.int $dir/questions.int 2> $dir/log/questions.log || exit 1; + cat $lang/phones/extra_questions.int >> $dir/questions.int + compile-questions $lang/topo $dir/questions.int $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1; + + echo "$0: building the tree" + $cmd $dir/log/build_tree.log \ + build-tree --verbose=1 --max-leaves=$numleaves \ + --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \ + $dir/questions.qst $lang/topo $dir/tree || exit 1; + + gmm-init-model --write-occs=$dir/1.occs \ + $dir/tree $dir/treeacc $lang/topo $dir/1.mdl 2> $dir/log/init_model.log || exit 1; + grep 'no stats' $dir/log/init_model.log && echo "This is a bad warning."; + + gmm-mixup --mix-up=$numgauss $dir/1.mdl $dir/1.occs $dir/1.mdl 2>$dir/log/mixup.log || exit 1; + rm $dir/treeacc +fi + +if [ $stage -le -1 ]; then + # Convert the alignments. + echo "$0: converting alignments from $alidir to use current tree" + $cmd JOB=1:$nj $dir/log/convert.JOB.log \ + convert-ali $alidir/final.mdl $dir/1.mdl $dir/tree \ + "ark:gunzip -c $alidir/ali.JOB.gz|" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; +fi + +if [ $stage -le 0 ]; then + echo "$0: compiling graphs of transcripts" + $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \ + compile-train-graphs $dir/tree $dir/1.mdl $lang/L.fst \ + "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $data/split$nj/JOB/text |" \ + "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1; +fi + +x=1 +while [ $x -lt $num_iters ]; do + echo "$0: training pass $x" + if [ $stage -le $x ]; then + if echo $realign_iters | grep -w $x >/dev/null; then + echo "$0: aligning data" + mdl="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $dir/$x.mdl - |" + $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \ + gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$mdl" \ + "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \ + "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; + fi + $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \ + gmm-acc-stats-ali $dir/$x.mdl "$feats" \ + "ark,s,cs:gunzip -c $dir/ali.JOB.gz|" $dir/$x.JOB.acc || exit 1; + $cmd $dir/log/update.$x.log \ + gmm-est --mix-up=$numgauss --power=$power \ + --write-occs=$dir/$[$x+1].occs $dir/$x.mdl \ + "gmm-sum-accs - $dir/$x.*.acc |" $dir/$[$x+1].mdl || exit 1; + rm $dir/$x.mdl $dir/$x.*.acc + rm $dir/$x.occs + fi + [ $x -le $max_iter_inc ] && numgauss=$[$numgauss+$incgauss]; + x=$[$x+1]; +done + +rm $dir/final.mdl 2>/dev/null +ln -s $x.mdl $dir/final.mdl +ln -s $x.occs $dir/final.occs + +# Summarize warning messages... +utils/summarize_warnings.pl $dir/log + +echo "$0: Done training system with delta+delta-delta features in $dir" + diff --git a/egs/kaldi-vystadial-recipe/s5/steps/train_diag_ubm.sh b/egs/kaldi-vystadial-recipe/s5/steps/train_diag_ubm.sh new file mode 100755 index 00000000000..e43a9cb5b8b --- /dev/null +++ b/egs/kaldi-vystadial-recipe/s5/steps/train_diag_ubm.sh @@ -0,0 +1,125 @@ +#!/bin/bash + +# Copyright Johns Hopkins University (Author: Daniel Povey), 2012. +# Apache 2.0. + +# Train a diagonal mixture of Gaussians. This is trained without +# reference to class labels-- except that, optionally, you can down-weight +# silence phones, and alignments are needed for that. +# +# The current use for this is in fMMI training. + +# Begin configuration section. 
+nj=4
+cmd=run.pl
+num_iters=3
+silence_weight=
+stage=-2
+# The value "intermediate" is a number of Gaussians we first obtain by clustering
+# the Gaussians within each state of the model, before clustering down to
+# $num_gauss. This is for efficiency. It's not a very important parameter,
+# as far as I know.
+intermediate=2000
+num_gselect=50 # Number of Gaussian-selection indices to use while training
+ # the model.
+# End configuration section.
+
+echo "$0 $@" # Print the command line for logging
+
+[ -f ./path.sh ] && . ./path.sh; # source the path.
+. parse_options.sh || exit 1;
+
+
+if [ $# != 5 ]; then
+ echo "Usage: steps/train_diag_ubm.sh <num-gauss> <data-dir> <lang-dir> <ali-dir> <exp-dir>"
+ echo " e.g.: steps/train_diag_ubm.sh 400 data/train_si84 data/lang exp/tri2b_ali_si84 exp/ubm3c"
+ echo "Options: "
+ echo " --silence-weight <sil-weight> # if set (e.g. 0.5), use to down-weight silence."
+ echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+ echo " --nj <nj> # number of parallel jobs to run."
+ echo " --num-iters <num-iters> # number of iterations of training (default: $num_iters)"
+ echo " --stage <stage> # stage to do partial re-run from."
+ exit 1;
+fi
+
+num_gauss=$1
+data=$2
+lang=$3
+alidir=$4
+dir=$5
+
+silphonelist=`cat $lang/phones/silence.csl` || exit 1;
+
+sdata=$data/split$nj
+splice_opts=`cat $alidir/splice_opts 2>/dev/null`
+mkdir -p $dir/log
+[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
+echo $nj > $dir/num_jobs
+
+if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
+echo "$0: feature type is $feat_type"
+
+case $feat_type in
+ delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
+ lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
+ cp $alidir/final.mat $dir
+ ;;
+ *) echo "Invalid feature type $feat_type" && exit 1;
+esac
+
+if [ -f $alidir/trans.1 ]; then
+ echo Using transforms from $alidir;
+ [ "$nj" -ne "`cat $alidir/num_jobs`" ] && \
+ echo "The number of jobs differs from alignment directory $alidir." && exit 1;
+ feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$alidir/trans.JOB ark:- ark:- |"
+fi
+
+if [ ! -z "$silence_weight" ]; then
+ [ ! -f $alidir/ali.1.gz ] && \
+ echo "You specified weighting for silence but $alidir/ali.1.gz does not exist." && exit 1;
+ [ "$nj" -ne "`cat $alidir/num_jobs`" ] && \
+ echo "You specified silence weight but $alidir has different #jobs." && exit 1;
+ weights="--weights='ark,s,cs:gunzip -c $alidir/ali.JOB.gz | ali-to-post ark:- ark:- | weight-silence-post $silence_weight $silphonelist $alidir/final.mdl ark:- ark:- | post-to-weights ark:- ark:- |'"
+else
+ weights=
+fi
+
+# $intermediate should be more than $num_gauss..
+[ $[$num_gauss*2] -gt $intermediate ] && intermediate=$[$num_gauss*2] \
+ && echo "Setting intermediate=$intermediate (it was too small)";
+
+if [ $stage -le -2 ]; then
+ echo "Clustering Gaussians in $alidir/final.mdl"
+ $cmd $dir/log/cluster.log \
+ init-ubm --fullcov-ubm=false --intermediate-num-gauss=$intermediate \
+ --ubm-num-gauss=$num_gauss $alidir/final.mdl $alidir/final.occs $dir/0.dubm || exit 1;
+fi
+
+# Store Gaussian selection indices on disk-- this speeds up the training passes.
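+# In the passes below only the $num_gselect best-matching Gaussians per frame
+# are evaluated exactly; the remaining components contribute negligible
+# posterior mass, so this pruning should barely affect the result while
+# greatly reducing computation.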
+if [ $stage -le -1 ]; then
+ echo Getting Gaussian-selection info
+ $cmd JOB=1:$nj $dir/log/gselect.JOB.log \
+ gmm-gselect --n=$num_gselect $dir/0.dubm "$feats" \
+ "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1;
+fi
+
+for x in `seq 0 $[$num_iters-1]`; do
+ echo "Training pass $x"
+ if [ $stage -le $x ]; then
+ # Accumulate stats.
+ $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
+ gmm-global-acc-stats $weights "--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" \
+ $dir/$x.dubm "$feats" $dir/$x.JOB.acc || exit 1;
+ if [ $x -lt $[$num_iters-1] ]; then # Don't remove low-count Gaussians till last iter,
+ opt="--remove-low-count-gaussians=false" # or gselect info won't be valid any more.
+ else
+ opt="--remove-low-count-gaussians=true" # set explicitly, so the value from the previous iteration doesn't carry over.
+ fi
+ $cmd $dir/log/update.$x.log \
+ gmm-global-est $opt $dir/$x.dubm "gmm-global-sum-accs - $dir/$x.*.acc|" \
+ $dir/$[$x+1].dubm || exit 1;
+ rm $dir/$x.*.acc $dir/$x.dubm
+ fi
+done
+
+rm $dir/gselect.*.gz
+mv $dir/$num_iters.dubm $dir/final.dubm || exit 1;
+exit 0;
diff --git a/egs/kaldi-vystadial-recipe/s5/steps/train_lda_mllt.sh b/egs/kaldi-vystadial-recipe/s5/steps/train_lda_mllt.sh
new file mode 100755
index 00000000000..7bd283c8658
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/steps/train_lda_mllt.sh
@@ -0,0 +1,191 @@
+#!/bin/bash
+
+# Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
+# Apache 2.0.
+
+# Begin configuration.
+cmd=run.pl
+config=
+stage=-4
+scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+realign_iters="10 20 30";
+mllt_iters="2 4 6 12";
+num_iters=35 # Number of iterations of training
+max_iter_inc=25 # Last iter to increase #Gauss on.
+dim=40
+beam=10
+retry_beam=40
+boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment
+power=0.2 # Exponent for number of gaussians according to occurrence counts
+randprune=4.0 # This is approximately the ratio by which we will speed up the
+ # LDA and MLLT calculations via randomized pruning.
+splice_opts=
+cluster_thresh=-1 # for build-tree: controls the final bottom-up clustering of leaves
+# End configuration.
+
+echo "$0 $@" # Print the command line for logging
+
+[ -f path.sh ] && . ./path.sh
+. parse_options.sh || exit 1;
+
+if [ $# != 6 ]; then
+ echo "Usage: steps/train_lda_mllt.sh [options] <#leaves> <#gauss> <data> <lang> <alignments> <exp-dir>"
+ echo " e.g.: steps/train_lda_mllt.sh 2500 15000 data/train_si84 data/lang exp/tri1_ali_si84 exp/tri2b"
+ echo "Main options (for others, see top of script file)"
+ echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+ echo " --config <config-file> # config containing options"
+ echo " --stage <stage> # stage to do partial re-run from."
+ exit 1;
+fi
+
+numleaves=$1
+totgauss=$2
+data=$3
+lang=$4
+alidir=$5
+dir=$6
+
+for f in $alidir/final.mdl $alidir/ali.1.gz $data/feats.scp $lang/phones.txt; do
+ [ ! -f $f ] && echo "train_lda_mllt.sh: no such file $f" && exit 1;
+done
+
+numgauss=$numleaves
+incgauss=$[($totgauss-$numgauss)/$max_iter_inc] # per-iter #gauss increment
+oov=`cat $lang/oov.int` || exit 1;
+nj=`cat $alidir/num_jobs` || exit 1;
+silphonelist=`cat $lang/phones/silence.csl` || exit 1;
+ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1;
+
+mkdir -p $dir/log
+echo $nj >$dir/num_jobs
+echo "$splice_opts" >$dir/splice_opts # keep track of frame-splicing options
+ # so that later stages of system building can know what they were.
+ +sdata=$data/split$nj; +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; + + +splicedfeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- |" +# Note: $feats gets overwritten later in the script. +feats="$splicedfeats transform-feats $dir/0.mat ark:- ark:- |" + + + +if [ $stage -le -4 ]; then + echo "Accumulating LDA statistics." + $cmd JOB=1:$nj $dir/log/lda_acc.JOB.log \ + ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \ + weight-silence-post 0.0 $silphonelist $alidir/final.mdl ark:- ark:- \| \ + acc-lda --rand-prune=$randprune $alidir/final.mdl "$splicedfeats" ark,s,cs:- \ + $dir/lda.JOB.acc || exit 1; + est-lda --write-full-matrix=$dir/full.mat --dim=$dim $dir/0.mat $dir/lda.*.acc \ + 2>$dir/log/lda_est.log || exit 1; + rm $dir/lda.*.acc +fi + +cur_lda_iter=0 + +if [ $stage -le -3 ]; then + echo "Accumulating tree stats" + $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \ + acc-tree-stats --ci-phones=$ciphonelist $alidir/final.mdl "$feats" \ + "ark:gunzip -c $alidir/ali.JOB.gz|" $dir/JOB.treeacc || exit 1; + [ `ls $dir/*.treeacc | wc -w` -ne "$nj" ] && echo "Wrong #tree-accs" && exit 1; + $cmd $dir/log/sum_tree_acc.log \ + sum-tree-stats $dir/treeacc $dir/*.treeacc || exit 1; + rm $dir/*.treeacc +fi + + +if [ $stage -le -2 ]; then + echo "Getting questions for tree clustering." + # preparing questions, roots file... + cluster-phones $dir/treeacc $lang/phones/sets.int $dir/questions.int 2> $dir/log/questions.log || exit 1; + cat $lang/phones/extra_questions.int >> $dir/questions.int + compile-questions $lang/topo $dir/questions.int $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1; + + echo "Building the tree" + $cmd $dir/log/build_tree.log \ + build-tree --verbose=1 --max-leaves=$numleaves \ + --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \ + $dir/questions.qst $lang/topo $dir/tree || exit 1; + + gmm-init-model --write-occs=$dir/1.occs \ + $dir/tree $dir/treeacc $lang/topo $dir/1.mdl 2> $dir/log/init_model.log || exit 1; + grep 'no stats' $dir/log/init_model.log && echo "This is a bad warning."; + + # could mix up if we wanted: + # gmm-mixup --mix-up=$numgauss $dir/1.mdl $dir/1.occs $dir/1.mdl 2>$dir/log/mixup.log || exit 1; + rm $dir/treeacc +fi + + +if [ $stage -le -1 ]; then + # Convert the alignments. 
+ echo "Converting alignments from $alidir to use current tree" + $cmd JOB=1:$nj $dir/log/convert.JOB.log \ + convert-ali $alidir/final.mdl $dir/1.mdl $dir/tree \ + "ark:gunzip -c $alidir/ali.JOB.gz|" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; +fi + +if [ $stage -le 0 ]; then + echo "Compiling graphs of transcripts" + $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \ + compile-train-graphs $dir/tree $dir/1.mdl $lang/L.fst \ + "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $data/split$nj/JOB/text |" \ + "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1; +fi + + +x=1 +while [ $x -lt $num_iters ]; do + echo Training pass $x + if echo $realign_iters | grep -w $x >/dev/null && [ $stage -le $x ]; then + echo Aligning data + mdl="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $dir/$x.mdl - |" + $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \ + gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$mdl" \ + "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \ + "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; + fi + if echo $mllt_iters | grep -w $x >/dev/null; then + if [ $stage -le $x ]; then + echo "Estimating MLLT" + $cmd JOB=1:$nj $dir/log/macc.$x.JOB.log \ + ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \ + weight-silence-post 0.0 $silphonelist $dir/$x.mdl ark:- ark:- \| \ + gmm-acc-mllt --rand-prune=$randprune $dir/$x.mdl "$feats" ark:- $dir/$x.JOB.macc \ + || exit 1; + est-mllt $dir/$x.mat.new $dir/$x.*.macc 2> $dir/log/mupdate.$x.log || exit 1; + gmm-transform-means $dir/$x.mat.new $dir/$x.mdl $dir/$x.mdl \ + 2> $dir/log/transform_means.$x.log || exit 1; + compose-transforms --print-args=false $dir/$x.mat.new $dir/$cur_lda_iter.mat $dir/$x.mat || exit 1; + rm $dir/$x.*.macc + fi + feats="$splicedfeats transform-feats $dir/$x.mat ark:- ark:- |" + cur_lda_iter=$x + fi + + if [ $stage -le $x ]; then + $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \ + gmm-acc-stats-ali $dir/$x.mdl "$feats" \ + "ark,s,cs:gunzip -c $dir/ali.JOB.gz|" $dir/$x.JOB.acc || exit 1; + $cmd $dir/log/update.$x.log \ + gmm-est --write-occs=$dir/$[$x+1].occs --mix-up=$numgauss --power=$power \ + $dir/$x.mdl "gmm-sum-accs - $dir/$x.*.acc |" $dir/$[$x+1].mdl || exit 1; + rm $dir/$x.mdl $dir/$x.*.acc $dir/$x.occs + fi + [ $x -le $max_iter_inc ] && numgauss=$[$numgauss+$incgauss]; + x=$[$x+1]; +done + +rm $dir/final.{mdl,mat,occs} 2>/dev/null +ln -s $x.mdl $dir/final.mdl +ln -s $x.occs $dir/final.occs +ln -s $cur_lda_iter.mat $dir/final.mat + +# Summarize warning messages... + +utils/summarize_warnings.pl $dir/log + +echo Done training system with LDA+MLLT features in $dir diff --git a/egs/kaldi-vystadial-recipe/s5/steps/train_mmi.sh b/egs/kaldi-vystadial-recipe/s5/steps/train_mmi.sh new file mode 100755 index 00000000000..b4b976de199 --- /dev/null +++ b/egs/kaldi-vystadial-recipe/s5/steps/train_mmi.sh @@ -0,0 +1,144 @@ +#!/bin/bash +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. + +# MMI training (or optionally boosted MMI, if you give the --boost option). +# 4 iterations (by default) of Extended Baum-Welch update. +# +# For the numerator we have a fixed alignment rather than a lattice-- +# this actually follows from the way lattices are defined in Kaldi, which +# is to have a single path for each word (output-symbol) sequence. + +# Begin configuration section. +cmd=run.pl +num_iters=4 +boost=0.0 +cancel=true # if true, cancel num and den counts on each frame. 
+tau=400
+weight_tau=10
+acwt=0.1
+stage=0
+# End configuration section
+
+echo "$0 $@" # Print the command line for logging
+
+[ -f ./path.sh ] && . ./path.sh; # source the path.
+. parse_options.sh || exit 1;
+
+if [ $# -ne 5 ]; then
+ echo "Usage: steps/train_mmi.sh <data> <lang> <ali-dir> <denlat-dir> <exp-dir>"
+ echo " e.g.: steps/train_mmi.sh data/train_si84 data/lang exp/tri2b_ali_si84 exp/tri2b_denlats_si84 exp/tri2b_mmi"
+ echo "Main options (for others, see top of script file)"
+ echo " --boost <boost-weight> # (e.g. 0.1), for boosted MMI. (default 0)"
+ echo " --cancel (true|false) # cancel stats (true by default)"
+ echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+ echo " --config <config-file> # config containing options"
+ echo " --stage <stage> # stage to do partial re-run from."
+ echo " --tau <tau> # tau for i-smooth to last iter (default 400)"
+
+ exit 1;
+fi
+
+data=$1
+lang=$2
+alidir=$3
+denlatdir=$4
+dir=$5
+mkdir -p $dir/log
+
+for f in $data/feats.scp $alidir/{tree,final.mdl,ali.1.gz} $denlatdir/lat.1.gz; do
+ [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
+done
+nj=`cat $alidir/num_jobs` || exit 1;
+[ "$nj" -ne "`cat $denlatdir/num_jobs`" ] && \
+ echo "$alidir and $denlatdir have different num-jobs" && exit 1;
+
+sdata=$data/split$nj
+splice_opts=`cat $alidir/splice_opts 2>/dev/null`
+mkdir -p $dir/log
+cp $alidir/splice_opts $dir 2>/dev/null
+[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
+echo $nj > $dir/num_jobs
+
+cp $alidir/{final.mdl,tree} $dir
+
+silphonelist=`cat $lang/phones/silence.csl` || exit 1;
+
+# Set up features.
+
+if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
+echo "$0: feature type is $feat_type"
+
+case $feat_type in
+ delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
+ lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
+ cp $alidir/final.mat $dir
+ ;;
+ *) echo "Invalid feature type $feat_type" && exit 1;
+esac
+
+[ -f $alidir/trans.1 ] && echo Using transforms from $alidir && \
+ feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$alidir/trans.JOB ark:- ark:- |"
+
+lats="ark:gunzip -c $denlatdir/lat.JOB.gz|"
+if [[ "$boost" != "0.0" && "$boost" != 0 ]]; then
+ lats="$lats lattice-boost-ali --b=$boost --silence-phones=$silphonelist $alidir/final.mdl ark:- 'ark,s,cs:gunzip -c $alidir/ali.JOB.gz|' ark:- |"
+fi
+
+
+cur_mdl=$alidir/final.mdl
+x=0
+while [ $x -lt $num_iters ]; do
+ echo "Iteration $x of MMI training"
+ # Note: the num and den stats are accumulated at the same time, so we
+ # can cancel them per frame.
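+ # Pipeline below: gmm-rescore-lattice replaces the lattice acoustic scores
+ # using the current model; lattice-to-post converts the lattice into
+ # per-frame posteriors; sum-post (with --scale1=-1) subtracts the
+ # denominator posteriors from the numerator (alignment) ones;
+ # gmm-acc-stats2 then accumulates the positive and negative stats in a
+ # single pass.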
+ if [ $stage -le $x ]; then
+ $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
+ gmm-rescore-lattice $cur_mdl "$lats" "$feats" ark:- \| \
+ lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
+ sum-post --merge=$cancel --scale1=-1 \
+ ark:- "ark,s,cs:gunzip -c $alidir/ali.JOB.gz | ali-to-post ark:- ark:- |" ark:- \| \
+ gmm-acc-stats2 $cur_mdl "$feats" ark,s,cs:- \
+ $dir/num_acc.$x.JOB.acc $dir/den_acc.$x.JOB.acc || exit 1;
+
+ n=`echo $dir/{num,den}_acc.$x.*.acc | wc -w`;
+ [ "$n" -ne $[$nj*2] ] && \
+ echo "Wrong number of MMI accumulators $n versus 2*$nj" && exit 1;
+ $cmd $dir/log/den_acc_sum.$x.log \
+ gmm-sum-accs $dir/den_acc.$x.acc $dir/den_acc.$x.*.acc || exit 1;
+ rm $dir/den_acc.$x.*.acc
+ $cmd $dir/log/num_acc_sum.$x.log \
+ gmm-sum-accs $dir/num_acc.$x.acc $dir/num_acc.$x.*.acc || exit 1;
+ rm $dir/num_acc.$x.*.acc
+
+ # note: this tau value is for smoothing towards model parameters, as in
+ # the Boosted MMI paper, not towards the ML stats as in the earlier
+ # work on discriminative training (e.g. my thesis).
+ # You could use gmm-ismooth-stats to smooth to the ML stats, if you had
+ # them available [here they're not available if cancel=true].
+
+ $cmd $dir/log/update.$x.log \
+ gmm-est-gaussians-ebw --tau=$tau $cur_mdl $dir/num_acc.$x.acc $dir/den_acc.$x.acc - \| \
+ gmm-est-weights-ebw --weight-tau=$weight_tau - $dir/num_acc.$x.acc $dir/den_acc.$x.acc $dir/$[$x+1].mdl || exit 1;
+ rm $dir/{den,num}_acc.$x.acc
+ fi
+ cur_mdl=$dir/$[$x+1].mdl
+
+ # Some diagnostics: the objective function progress and auxiliary-function
+ # improvement.
+
+ tail -n 50 $dir/log/acc.$x.*.log | perl -e '$acwt=shift @ARGV; while(<STDIN>) { if(m/gmm-acc-stats2.+Overall weighted acoustic likelihood per frame was (\S+) over (\S+) frames/) { $tot_aclike += $1*$2; $tot_frames1 += $2; } if(m|lattice-to-post.+Overall average log-like/frame is (\S+) over (\S+) frames. Average acoustic like/frame is (\S+)|) { $tot_den_lat_like += $1*$2; $tot_frames2 += $2; $tot_den_aclike += $3*$2; } } if (abs($tot_frames1 - $tot_frames2) > 0.01*($tot_frames1 + $tot_frames2)) { print STDERR "Frame-counts disagree $tot_frames1 versus $tot_frames2\n"; } $tot_den_lat_like /= $tot_frames2; $tot_den_aclike /= $tot_frames2; $tot_aclike *= ($acwt / $tot_frames1); $num_like = $tot_aclike + $tot_den_aclike; $per_frame_objf = $num_like - $tot_den_lat_like; print "$per_frame_objf $tot_frames1\n"; ' $acwt > $dir/tmpf
+ objf=`cat $dir/tmpf | awk '{print $1}'`;
+ nf=`cat $dir/tmpf | awk '{print $2}'`;
+ rm $dir/tmpf
+ impr=`grep -w Overall $dir/log/update.$x.log | awk '{x += $10*$12;} END{print x;}'`
+ impr=`perl -e "print ($impr*$acwt/$nf);"` # We multiply by acwt, and divide by $nf which is the "real" number of frames.
+ echo "Iteration $x: objf was $objf, MMI auxf change was $impr" | tee $dir/objf.$x.log
+ x=$[$x+1]
+done
+
+echo "MMI training finished"
+
+rm $dir/final.mdl 2>/dev/null
+ln -s $x.mdl $dir/final.mdl
+
+exit 0;
diff --git a/egs/kaldi-vystadial-recipe/s5/steps/train_mmi_fmmi.sh b/egs/kaldi-vystadial-recipe/s5/steps/train_mmi_fmmi.sh
new file mode 100755
index 00000000000..b78ffa98f78
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/steps/train_mmi_fmmi.sh
@@ -0,0 +1,221 @@
+#!/bin/bash
+# by Johns Hopkins University (Author: Daniel Povey), 2012. Apache 2.0.
+
+# This script does MMI discriminative training, including
+# feature-space (like fMPE) and model-space components.
+# If you give the --boost option it does "boosted MMI" (BMMI).
+# On the iterations of training it alternates feature-space
+# and model-space training. We do 8 iterations in total--
+# 4 of each type ((B)MMI, f(B)MMI)
+
+
+# Begin configuration section.
+cmd=run.pl
+schedule="fmmi fmmi fmmi fmmi mmi mmi mmi mmi"
+boost=0.0
+learning_rate=0.01
+tau=400 # For model. Note: we're doing smoothing "to the previous iteration"
+ # (--smooth-from-model), so 400 seems like a more sensible default
+ # than 100. We smooth to the previous iteration because now
+ # we are discriminatively training the features (and not using
+ # the indirect differential), so it seems like it wouldn't make
+ # sense to use any element of ML.
+weight_tau=10 # for model weights.
+cancel=true # if true, cancel num and den counts as described in
+ # the boosted MMI paper.
+indirect=true # if true, use indirect derivative.
+acwt=0.1
+stage=-1
+ngselect=2; # Just the 2 top Gaussians. Beyond that, adding more Gaussians
+ # wouldn't make much difference since the posteriors would be very small.
+# End configuration section.
+
+echo "$0 $@" # Print the command line for logging
+
+[ -f ./path.sh ] && . ./path.sh;
+. parse_options.sh || exit 1;
+
+
+if [ $# != 6 ]; then
+ echo "Usage: steps/train_mmi_fmmi.sh <data> <lang> <ali-dir> <dubm-dir> <denlat-dir> <exp-dir>"
+ echo " e.g.: steps/train_mmi_fmmi.sh data/train_si84 data/lang exp/tri2b_ali_si84 exp/ubm2d exp/tri2b_denlats_si84 exp/tri2b_fmmi"
+ echo "Main options (for others, see top of script file)"
+ echo " --boost <boost-weight> # (e.g. 0.1) ... boosted MMI."
+ echo " --cancel (true|false) # cancel stats (true by default)"
+ echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+ echo " --config <config-file> # config containing options"
+ echo " --stage <stage> # stage to do partial re-run from."
+ echo " --tau <tau> # tau for i-smooth to last iter (default 400)"
+ echo " --learning-rate <learning-rate> # learning rate for fMMI, default 0.01"
+ echo " --schedule <schedule> # learning schedule: by default,"
+ echo " # \"fmmi fmmi fmmi fmmi mmi mmi mmi mmi\""
+ exit 1;
+fi
+
+
+data=$1
+lang=$2
+alidir=$3
+dubmdir=$4 # where diagonal UBM is.
+denlatdir=$5
+dir=$6
+
+silphonelist=`cat $lang/phones/silence.csl`
+mkdir -p $dir/log
+
+for f in $data/feats.scp $lang/phones.txt $dubmdir/final.dubm $alidir/final.mdl \
+ $alidir/ali.1.gz $denlatdir/lat.1.gz; do
+ [ ! -f $f ] && echo "Expected file $f to exist" && exit 1;
+done
+cp $alidir/final.mdl $alidir/tree $dir || exit 1;
+nj=`cat $alidir/num_jobs` || exit 1;
+[ "$nj" -ne "`cat $denlatdir/num_jobs`" ] && \
+ echo "$alidir and $denlatdir have different num-jobs" && exit 1;
+sdata=$data/split$nj
+splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options.
+mkdir -p $dir/log
+cp $alidir/splice_opts $dir 2>/dev/null # frame-splicing options.
+[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
+
+
+if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
+echo "$0: feature type is $feat_type"
+
+# Note: $feats is the features before fMPE.
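+# ($fmpefeats, set further below, is $feats with fmpe-apply-transform
+# appended once an fMPE/fMMI transform has been estimated; the transform
+# adds a learned offset to the base features.)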
+case $feat_type in + delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |" + cp $alidir/final.mat $dir + ;; + *) echo "Invalid feature type $feat_type" && exit 1; +esac + +[ -f $alidir/trans.1 ] && echo Using transforms from $alidir && \ + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$alidir/trans.JOB ark:- ark:- |" + +lats="ark:gunzip -c $denlatdir/lat.JOB.gz|" +if [[ "$boost" != "0.0" && "$boost" != 0 ]]; then + lats="$lats lattice-boost-ali --b=$boost --silence-phones=$silphonelist $alidir/final.mdl ark:- 'ark,s,cs:gunzip -c $alidir/ali.JOB.gz|' ark:- |" +fi + + +fmpefeats="$feats" # At first, the features "after fMPE" are the same as the + # base features. + + +# Initialize the fMPE object. Note: we call it .fmpe because +# that's what it was called in the original paper, but since +# we're using the MMI objective function, it's really fMMI. + +fmpe-init $dubmdir/final.dubm $dir/0.fmpe 2>$dir/log/fmpe_init.log || exit 1; + + +if [ $stage -le -1 ]; then + # Get the gselect (Gaussian selection) info for fMPE. + # Note: fMPE object starts with GMM object, so can be read + # as one. + $cmd JOB=1:$nj $dir/log/gselect.JOB.log \ + gmm-gselect --n=$ngselect $dir/0.fmpe "$feats" \ + "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1; +fi + +cp $alidir/final.mdl $dir/0.mdl + +x=0 +num_iters=`echo $schedule | wc -w` + +while [ $x -lt $num_iters ]; do + iter_type=`echo $schedule | cut -d ' ' -f $[$x+1]` + case $iter_type in + fmmi) + echo "Iteration $x: doing fMMI" + if [ $stage -le $x ]; then + numpost="ark,s,cs:gunzip -c $alidir/ali.JOB.gz| ali-to-post ark:- ark:-|" + # Note: the command gmm-fmpe-acc-stats below requires the pre-fMPE features. + $cmd JOB=1:$nj $dir/log/acc_fmmi.$x.JOB.log \ + gmm-rescore-lattice $dir/$x.mdl "$lats" "$fmpefeats" ark:- \| \ + lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ + sum-post --scale1=-1 ark:- "$numpost" ark:- \| \ + gmm-fmpe-acc-stats $dir/$x.mdl $dir/$x.fmpe "$feats" \ + "ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" ark,s,cs:- \ + $dir/$x.JOB.fmpe_acc || exit 1; + + ( fmpe-sum-accs $dir/$x.fmpe_acc $dir/$x.*.fmpe_acc && \ + rm $dir/$x.*.fmpe_acc && \ + fmpe-est --learning-rate=$learning_rate $dir/$x.fmpe $dir/$x.fmpe_acc $dir/$[$x+1].fmpe ) \ + 2>$dir/log/est_fmpe.$x.log || exit 1; + fi + # We need to set the features to use the correct fMPE object. + fmpefeats="$feats fmpe-apply-transform $dir/$[$x+1].fmpe ark:- 'ark,s,cs:gunzip -c $dir/gselect.JOB.gz|' ark:- |" + rm $dir/$[x+1].mdl 2>/dev/null; ln -s $x.mdl $dir/$[$x+1].mdl # link previous model. + # Now, diagnostics. + objf_nf=`grep Overall $dir/log/acc_fmmi.$x.*.log | grep gmm-fmpe-acc-stats | awk '{ p+=$10*$12; nf+=$12; } END{print p/nf, nf;}'` + objf=`echo $objf_nf | awk '{print $1}'`; + nf=`echo $objf_nf | awk '{print $2}'`; + impr=`grep Objf $dir/log/est_fmpe.$x.log | awk '{print $NF}'` + impr=`perl -e "print ($impr/$nf);"` # normalize by #frames. + echo On iter $x, objf was $objf, auxf improvement from fMMI was $impr | tee $dir/objf.$x.log + ;; + mmi) # MMI iteration. + echo "Iteration $x: doing MMI (getting stats)..." + # Get denominator stats... 
For simplicity we rescore the lattice
+ # on all iterations, even though it shouldn't be necessary on the zeroth
+ # (but we want this script to work even if $alidir doesn't contain the
+ # model used to generate the lattice).
+ if [ $stage -le $x ]; then
+ $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
+ gmm-rescore-lattice $dir/$x.mdl "$lats" "$fmpefeats" ark:- \| \
+ lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
+ sum-post --merge=$cancel --scale1=-1 \
+ ark:- "ark,s,cs:gunzip -c $alidir/ali.JOB.gz | ali-to-post ark:- ark:- |" ark:- \| \
+ gmm-acc-stats2 $dir/$x.mdl "$fmpefeats" ark,s,cs:- \
+ $dir/num_acc.$x.JOB.acc $dir/den_acc.$x.JOB.acc || exit 1;
+
+ n=`echo $dir/{num,den}_acc.$x.*.acc | wc -w`;
+ [ "$n" -ne $[$nj*2] ] && \
+ echo "Wrong number of MMI accumulators $n versus 2*$nj" && exit 1;
+ $cmd $dir/log/den_acc_sum.$x.log \
+ gmm-sum-accs $dir/den_acc.$x.acc $dir/den_acc.$x.*.acc || exit 1;
+ rm $dir/den_acc.$x.*.acc
+ $cmd $dir/log/num_acc_sum.$x.log \
+ gmm-sum-accs $dir/num_acc.$x.acc $dir/num_acc.$x.*.acc || exit 1;
+ rm $dir/num_acc.$x.*.acc
+
+ # note: this tau value is for smoothing to model parameters;
+ # you need to use gmm-ismooth-stats to smooth to the ML stats,
+ # but anyway this script does canceling of num and den stats on
+ # each frame (as suggested in the Boosted MMI paper) which would
+ # make smoothing to ML impossible without accumulating extra stats.
+ $cmd $dir/log/update.$x.log \
+ gmm-est-gaussians-ebw --tau=$tau $dir/$x.mdl $dir/num_acc.$x.acc $dir/den_acc.$x.acc - \| \
+ gmm-est-weights-ebw --weight-tau=$weight_tau - $dir/num_acc.$x.acc $dir/den_acc.$x.acc $dir/$[$x+1].mdl || exit 1;
+ else
+ echo "not doing this iteration because --stage=$stage"
+ fi
+
+ # Some diagnostics.. note, this objf is somewhat comparable to the
+ # MMI objective function divided by the acoustic weight, and differences in it
+ # are comparable to the auxf improvement printed by the update program.
+ objf_nf=`grep Overall $dir/log/acc.$x.*.log | grep gmm-acc-stats2 | awk '{ p+=$10*$12; nf+=$12; } END{print p/nf, nf;}'`
+ objf=`echo $objf_nf | awk '{print $1}'`;
+ nf=`echo $objf_nf | awk '{print $2}'`;
+ impr=`grep -w Overall $dir/log/update.$x.log | awk '{x += $10*$12;} END{print x;}'`
+ impr=`perl -e "print ($impr/$nf);"` # renormalize by "real" #frames, to correct
+ # for the canceling of stats.
+ echo On iter $x, objf was $objf, auxf improvement was $impr | tee $dir/objf.$x.log
+ rm $dir/$[x+1].fmpe 2>/dev/null; ln -s $x.fmpe $dir/$[$x+1].fmpe # link previous fMPE transform
+ ;;
+ *) echo "Invalid --schedule option: expected only mmi or fmmi."; exit 1;;
+ esac
+ x=$[$x+1]
+done
+
+echo "Succeeded with $num_iters iterations of MMI+fMMI training (boosting factor = $boost)"
+
+rm $dir/final.mdl 2>/dev/null; ln -s $num_iters.mdl $dir/final.mdl
+rm $dir/final.fmpe 2>/dev/null; ln -s $num_iters.fmpe $dir/final.fmpe
+
+# Now do some cleanup.
+rm $dir/gselect.*.gz $dir/*.acc $dir/*.fmpe_acc
+exit 0;
+
diff --git a/egs/kaldi-vystadial-recipe/s5/steps/train_mmi_fmmi_indirect.sh b/egs/kaldi-vystadial-recipe/s5/steps/train_mmi_fmmi_indirect.sh
new file mode 100755
index 00000000000..2bed327a3a6
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/steps/train_mmi_fmmi_indirect.sh
@@ -0,0 +1,244 @@
+#!/bin/bash
+# by Johns Hopkins University (Author: Daniel Povey), 2012. Apache 2.0.
+
+# This script does MMI discriminative training, including
+# feature-space (like fMPE) and model-space components.
+# If you give the --boost option it does "boosted MMI" (BMMI).
+# On the iterations of training it alternates feature-space
+# and model-space training. We do 8 iterations in total--
+# 4 of each type ((B)MMI, f(B)MMI)
+
+
+# Begin configuration section.
+cmd=run.pl
+schedule="fmmi mmi fmmi mmi fmmi mmi fmmi mmi"
+boost=0.0
+learning_rate=0.02
+tau=200 # For model. Note: we're doing smoothing "to the previous iteration"
+ # (--smooth-from-model), so 200 seems like a more sensible default
+ # than 100. We smooth to the previous iteration because now
+ # we are discriminatively training the features (and not using
+ # the indirect differential), so it seems like it wouldn't make
+ # sense to use any element of ML.
+cancel=true # if true, cancel num and den counts as described in
+ # the boosted MMI paper.
+indirect=true # if true, use indirect derivative.
+acwt=0.1
+stage=-1
+ngselect=2; # Just the 2 top Gaussians. Beyond that, adding more Gaussians
+ # wouldn't make much difference since the posteriors would be very small.
+# End configuration section.
+
+echo "$0 $@" # Print the command line for logging
+
+[ -f ./path.sh ] && . ./path.sh;
+. parse_options.sh || exit 1;
+
+
+if [ $# != 6 ]; then
+ echo "Usage: steps/train_mmi_fmmi_indirect.sh <data> <lang> <ali-dir> <dubm-dir> <denlat-dir> <exp-dir>"
+ echo " e.g.: steps/train_mmi_fmmi_indirect.sh data/train_si84 data/lang exp/tri2b_ali_si84 exp/ubm2d exp/tri2b_denlats_si84 exp/tri2b_fmmi"
+ echo "Main options (for others, see top of script file)"
+ echo " --boost <boost-weight> # (e.g. 0.1) ... boosted MMI."
+ echo " --cancel (true|false) # cancel stats (true by default)"
+ echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+ echo " --config <config-file> # config containing options"
+ echo " --stage <stage> # stage to do partial re-run from."
+ echo " --tau <tau> # tau for i-smooth to last iter (default 200)"
+ echo " --learning-rate <learning-rate> # learning rate for fMMI, default 0.02"
+ echo " --schedule <schedule> # learning schedule: by default,"
+ echo " # \"fmmi mmi fmmi mmi fmmi mmi fmmi mmi\""
+ exit 1;
+fi
+
+
+data=$1
+lang=$2
+alidir=$3
+dubmdir=$4 # where diagonal UBM is.
+denlatdir=$5
+dir=$6
+
+silphonelist=`cat $lang/phones/silence.csl`
+mkdir -p $dir/log
+
+for f in $data/feats.scp $lang/phones.txt $dubmdir/final.dubm $alidir/final.mdl \
+ $alidir/ali.1.gz $denlatdir/lat.1.gz; do
+ [ ! -f $f ] && echo "Expected file $f to exist" && exit 1;
+done
+cp $alidir/final.mdl $alidir/tree $dir || exit 1;
+nj=`cat $alidir/num_jobs` || exit 1;
+[ "$nj" -ne "`cat $denlatdir/num_jobs`" ] && \
+ echo "$alidir and $denlatdir have different num-jobs" && exit 1;
+sdata=$data/split$nj
+splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options.
+mkdir -p $dir/log
+cp $alidir/splice_opts $dir 2>/dev/null # frame-splicing options.
+[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
+
+
+if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
+echo "$0: feature type is $feat_type"
+
+# Note: $feats is the features before fMPE.
+case $feat_type in + delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |" + cp $alidir/final.mat $dir + ;; + *) echo "Invalid feature type $feat_type" && exit 1; +esac + +[ -f $alidir/trans.1 ] && echo Using transforms from $alidir && \ + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$alidir/trans.JOB ark:- ark:- |" + +lats="ark:gunzip -c $denlatdir/lat.JOB.gz|" +if [[ "$boost" != "0.0" && "$boost" != 0 ]]; then + lats="$lats lattice-boost-ali --b=$boost --silence-phones=$silphonelist $alidir/final.mdl ark:- 'ark,s,cs:gunzip -c $alidir/ali.JOB.gz|' ark:- |" +fi + + +fmpefeats="$feats" # At first, the features "after fMPE" are the same as the + # base features. + + +# Initialize the fMPE object. Note: we call it .fmpe because +# that's what it was called in the original paper, but since +# we're using the MMI objective function, it's really fMMI. + +fmpe-init $dubmdir/final.dubm $dir/0.fmpe 2>$dir/log/fmpe_init.log || exit 1; + + +if [ $stage -le -1 ]; then + # Get the gselect (Gaussian selection) info for fMPE. + # Note: fMPE object starts with GMM object, so can be read + # as one. + $cmd JOB=1:$nj $dir/log/gselect.JOB.log \ + gmm-gselect --n=$ngselect $dir/0.fmpe "$feats" \ + "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1; +fi + +cp $alidir/final.mdl $dir/0.mdl + +x=0 +num_iters=`echo $schedule | wc -w` + +while [ $x -lt $num_iters ]; do + iter_type=`echo $schedule | cut -d ' ' -f $[$x+1]` + case $iter_type in + fmmi) fmmi_iter=true; local_cancel=false;; + mmi) fmmi_iter=false; local_cancel=$cancel;; + *) echo "Bad iteration type $iter_type"; exit 1;; + esac + + echo "Getting MMI stats (needed for fMMI and MMI iterations)."; + if [ $stage -le $x ]; then + $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \ + gmm-rescore-lattice $dir/$x.mdl "$lats" "$fmpefeats" ark:- \| \ + lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ + sum-post --merge=$local_cancel --scale1=-1 \ + ark:- "ark,s,cs:gunzip -c $alidir/ali.JOB.gz | ali-to-post ark:- ark:- |" ark:- \| \ + gmm-acc-stats2 $dir/$x.mdl "$fmpefeats" ark,s,cs:- \ + $dir/num_acc.$x.JOB.acc $dir/den_acc.$x.JOB.acc || exit 1; + n=`echo $dir/{num,den}_acc.$x.*.acc | wc -w`; + [ "$n" -ne $[$nj*2] ] && \ + echo "Wrong number of MMI accumulators $n versus 2*$nj" && exit 1; + rm $dir/.error 2>/dev/null + $cmd $dir/log/den_acc_sum.$x.log \ + gmm-sum-accs $dir/den_acc.$x.acc $dir/den_acc.$x.*.acc || touch $dir/.error & + $cmd $dir/log/num_acc_sum.$x.log \ + gmm-sum-accs $dir/num_acc.$x.acc $dir/num_acc.$x.*.acc || touch $dir/.error & + wait + [ -f $dir/.error ] && echo "Error summing accs" && exit 1; + rm $dir/den_acc.$x.*.acc + rm $dir/num_acc.$x.*.acc + fi + + if $fmmi_iter; then + echo "Iteration $x: doing fMMI" + if [ $stage -le $x ]; then + # Get model derivative. Note: the "ml accumulator" is the same as the "numerator" + # since this is MMI. We avoided doing the "canceling of stats" on this iteration + # so that this would be true (this canceling wouldn't affect the derivative anyway, + # so can have no benefit for fMMI, unlike MMI). 
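+ # gmm-get-stats-deriv turns the (num, den, ml) accumulators into a
+ # derivative of the objective function w.r.t. the model parameters;
+ # passing it to gmm-fmpe-acc-stats below adds the "indirect differential"
+ # that this script's name refers to.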
+ $cmd $dir/log/get_stats_deriv.$x.log \ + gmm-get-stats-deriv $dir/$x.mdl $dir/num_acc.$x.acc $dir/den_acc.$x.acc \ + $dir/num_acc.$x.acc $dir/model_deriv.$x.gmmacc + numpost="ark,s,cs:gunzip -c $alidir/ali.JOB.gz| ali-to-post ark:- ark:-|" + # Note: the command gmm-fmpe-acc-stats below requires the pre-fMPE features. + $cmd JOB=1:$nj $dir/log/acc_fmmi.$x.JOB.log \ + gmm-rescore-lattice $dir/$x.mdl "$lats" "$fmpefeats" ark:- \| \ + lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ + sum-post --merge=false --scale1=-1 ark:- "$numpost" ark:- \| \ + gmm-fmpe-acc-stats --model-derivative=$dir/model_deriv.$x.gmmacc \ + $dir/$x.mdl $dir/$x.fmpe "$feats" \ + "ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" ark,s,cs:- \ + $dir/$x.JOB.fmpe_acc || exit 1; + + ( fmpe-sum-accs $dir/$x.fmpe_acc $dir/$x.*.fmpe_acc && \ + rm $dir/$x.*.fmpe_acc && \ + fmpe-est --learning-rate=$learning_rate $dir/$x.fmpe $dir/$x.fmpe_acc $dir/$[$x+1].fmpe ) \ + 2>$dir/log/est_fmpe.$x.log || exit 1; + + fmpefeats="$feats fmpe-apply-transform $dir/$[$x+1].fmpe ark:- 'ark,s,cs:gunzip -c $dir/gselect.JOB.gz|' ark:- |" + # OK, now we do one iteration of the "rescaling update" where we use the + # old and new ML accs, and we shift and rescale the model to match the new + # features. + $cmd JOB=1:$nj $dir/log/acc_ml.$x.JOB.log \ + gmm-acc-stats-ali $dir/$x.mdl "$fmpefeats" "ark:gunzip -c $alidir/ali.JOB.gz|" \ + $dir/new_ml_acc.$x.JOB.acc || exit 1; + $cmd $dir/log/new_ml_acc_sum.$x.log \ + gmm-sum-accs $dir/new_ml_acc.$x.acc $dir/new_ml_acc.$x.*.acc || exit 1; + $cmd $dir/log/update_rescale.$x.log \ + gmm-est-rescale $dir/$x.mdl $dir/num_acc.$x.acc $dir/new_ml_acc.$x.acc \ + $dir/$[$x+1].mdl || exit 1; + fi + # We need to set the features to use the correct fMPE object. + # This is a repeat of a command above-- in case we didn't do this stage. + fmpefeats="$feats fmpe-apply-transform $dir/$[$x+1].fmpe ark:- 'ark,s,cs:gunzip -c $dir/gselect.JOB.gz|' ark:- |" + # Now, diagnostics. + objf_nf=`grep Overall $dir/log/acc_fmmi.$x.*.log | grep gmm-fmpe-acc-stats | awk '{ p+=$10*$12; nf+=$12; } END{print p/nf, nf;}'` + objf=`echo $objf_nf | awk '{print $1}'`; + nf=`echo $objf_nf | awk '{print $2}'`; + impr=`grep Objf $dir/log/est_fmpe.$x.log | awk '{print $NF}'` + impr=`perl -e "print ($impr/$nf);"` # normalize by #frames. + echo On iter $x, objf was $objf, auxf improvement from fMMI was $impr | tee $dir/objf.$x.log + else # MMI iteration-- on this iteration do model-space update. + echo "Iteration $x: doing MMI update" + # note: this tau value is for smoothing to model parameters; + # you need to use gmm-ismooth-stats to smooth to the ML stats, + # but anyway this script does canceling of num and den stats on + # each frame (as suggested in the Boosted MMI paper) which would + # make smoothing to ML impossible without accumulating extra stats. + if [ $stage -le $x ]; then + $cmd $dir/log/update.$x.log \ + gmm-est-gaussians-ebw --tau=$tau $dir/$x.mdl $dir/num_acc.$x.acc $dir/den_acc.$x.acc - \| \ + gmm-est-weights-ebw - $dir/num_acc.$x.acc $dir/den_acc.$x.acc $dir/$[$x+1].mdl || exit 1; + else + echo "not doing this iteration because --stage=$stage" + fi + + # Some diagnostics.. note, this objf is somewhat comparable to the + # MMI objective function divided by the acoustic weight, and differences in it + # are comparable to the auxf improvement printed by the update program. 
+    objf_nf=`grep Overall $dir/log/acc.$x.*.log | grep gmm-acc-stats2 | awk '{ p+=$10*$12; nf+=$12; } END{print p/nf, nf;}'`
+    objf=`echo $objf_nf | awk '{print $1}'`;
+    nf=`echo $objf_nf | awk '{print $2}'`;
+    impr=`grep Overall $dir/log/update.$x.log | head -1 | awk '{print $10*$12;}'`
+    impr=`perl -e "print ($impr/$nf);"` # renormalize by "real" #frames, to correct
+                                        # for the canceling of stats.
+    echo On iter $x, objf was $objf, auxf improvement was $impr | tee $dir/objf.$x.log
+    rm $dir/$[x+1].fmpe 2>/dev/null; ln -s $x.fmpe $dir/$[$x+1].fmpe # link previous fMPE transform
+  fi
+  x=$[$x+1]
+done
+
+echo "Succeeded with $num_iters iterations of MMI+fMMI training (boosting factor = $boost)"
+
+rm $dir/final.mdl 2>/dev/null; ln -s $num_iters.mdl $dir/final.mdl
+rm $dir/final.fmpe 2>/dev/null; ln -s $num_iters.fmpe $dir/final.fmpe
+
+# Now do some cleanup.
+rm $dir/gselect.*.gz $dir/*.acc $dir/*.fmpe_acc
+exit 0;
+
diff --git a/egs/kaldi-vystadial-recipe/s5/steps/train_mmi_sgmm.sh b/egs/kaldi-vystadial-recipe/s5/steps/train_mmi_sgmm.sh
new file mode 100755
index 00000000000..9f7b081ca82
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/steps/train_mmi_sgmm.sh
@@ -0,0 +1,153 @@
+#!/bin/bash
+# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
+
+# MMI training (or optionally boosted MMI, if you give the --boost option),
+# for SGMMs.  4 iterations (by default) of Extended Baum-Welch update.
+#
+# Begin configuration section.
+cmd=run.pl
+num_iters=4
+boost=0.0
+cancel=true # if true, cancel num and den counts on each frame.
+acwt=0.1
+stage=0
+
+update_opts=
+transform_dir=
+# End configuration section
+
+echo "$0 $@"  # Print the command line for logging
+
+[ -f ./path.sh ] && . ./path.sh; # source the path.
+. parse_options.sh || exit 1;
+
+if [ $# -ne 5 ]; then
+  echo "Usage: steps/train_mmi_sgmm.sh <data> <lang> <ali-dir> <denlat-dir> <exp-dir>"
+  echo " e.g.: steps/train_mmi_sgmm.sh data/train_si84 data/lang exp/tri2b_ali_si84 exp/tri2b_denlats_si84 exp/tri2b_mmi"
+  echo "Main options (for others, see top of script file)"
+  echo "  --boost <boost-weight>                           # (e.g. 0.1), for boosted MMI.  (default 0)"
+  echo "  --cancel (true|false)                            # cancel stats (true by default)"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --config <config-file>                           # config containing options"
+  echo "  --stage <stage>                                  # stage to do partial re-run from."
+  echo "  --transform-dir <transform-dir>                  # directory to find fMLLR transforms."
+  exit 1;
+fi
+
+data=$1
+lang=$2
+alidir=$3
+denlatdir=$4
+dir=$5
+mkdir -p $dir/log
+
+for f in $data/feats.scp $alidir/{tree,final.mdl,ali.1.gz} $denlatdir/lat.1.gz; do
+  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
+done
+nj=`cat $alidir/num_jobs` || exit 1;
+[ "$nj" -ne "`cat $denlatdir/num_jobs`" ] && \
+  echo "$alidir and $denlatdir have different num-jobs" && exit 1;
+
+sdata=$data/split$nj
+splice_opts=`cat $alidir/splice_opts 2>/dev/null`
+mkdir -p $dir/log
+[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
+cp $alidir/splice_opts $dir 2>/dev/null
+echo $nj > $dir/num_jobs
+
+cp $alidir/{final.mdl,tree} $dir
+
+silphonelist=`cat $lang/phones/silence.csl` || exit 1;
+
+# Set up features
+
+if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
+echo "$0: feature type is $feat_type"
+
+case $feat_type in
+  delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
+  lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
+    cp $alidir/final.mat $dir
+    ;;
+  *) echo "Invalid feature type $feat_type" && exit 1;
+esac
+
+if [ ! -z "$transform_dir" ]; then
+  echo "$0: using transforms from $transform_dir"
+  [ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" \
+    && exit 1;
+  feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |"
+else
+  echo "$0: no fMLLR transforms."
+fi
+
+if [ -f $alidir/vecs.1 ]; then
+  echo "$0: using speaker vectors from $alidir"
+  spkvecs_opt="--spk-vecs=ark:$alidir/vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk"
+else
+  echo "$0: no speaker vectors."
+  spkvecs_opt=
+fi
+
+if [ -f $alidir/gselect.1.gz ]; then
+  echo "$0: using Gaussian-selection info from $alidir"
+  gselect_opt="--gselect=ark:gunzip -c $alidir/gselect.JOB.gz|"
+else
+  echo "$0: error: no Gaussian-selection info found" && exit 1;
+fi
+
+lats="ark:gunzip -c $denlatdir/lat.JOB.gz|"
+if [[ "$boost" != "0.0" && "$boost" != 0 ]]; then
+  lats="$lats lattice-boost-ali --b=$boost --silence-phones=$silphonelist $alidir/final.mdl ark:- 'ark,s,cs:gunzip -c $alidir/ali.JOB.gz|' ark:- |"
+fi
+
+
+cur_mdl=$alidir/final.mdl
+x=0
+while [ $x -lt $num_iters ]; do
+  echo "Iteration $x of MMI training"
+  # Note: the num and den states are accumulated at the same time, so we
+  # can cancel them per frame.
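+  # Editor's aside (illustrative sketch, not part of the original script):
+  # with --scale1=-1, sum-post forms (-1 * den + num) posteriors, and
+  # --merge=true cancels mass the two sides share on a frame: e.g. den=0.7
+  # and num=1.0 on the same pdf leave a single entry of +0.3.  Standalone
+  # shape of the call (den.post/num.post/diff.post are placeholder archives):
+  #   sum-post --merge=true --scale1=-1 ark:den.post ark:num.post ark:diff.post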
+  if [ $stage -le $x ]; then
+    $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
+      sgmm-rescore-lattice "$gselect_opt" $spkvecs_opt $cur_mdl "$lats" "$feats" ark:- \| \
+      lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
+      sum-post --merge=$cancel --scale1=-1 \
+      ark:- "ark,s,cs:gunzip -c $alidir/ali.JOB.gz | ali-to-post ark:- ark:- |" ark:- \| \
+      sgmm-acc-stats2 "$gselect_opt" $spkvecs_opt $cur_mdl "$feats" ark,s,cs:- \
+      $dir/num_acc.$x.JOB.acc $dir/den_acc.$x.JOB.acc || exit 1;
+
+    n=`echo $dir/{num,den}_acc.$x.*.acc | wc -w`;
+    [ "$n" -ne $[$nj*2] ] && \
+      echo "Wrong number of MMI accumulators $n versus 2*$nj" && exit 1;
+    $cmd $dir/log/den_acc_sum.$x.log \
+      sgmm-sum-accs $dir/den_acc.$x.acc $dir/den_acc.$x.*.acc || exit 1;
+    rm $dir/den_acc.$x.*.acc
+    $cmd $dir/log/num_acc_sum.$x.log \
+      sgmm-sum-accs $dir/num_acc.$x.acc $dir/num_acc.$x.*.acc || exit 1;
+    rm $dir/num_acc.$x.*.acc
+
+    $cmd $dir/log/update.$x.log \
+      sgmm-est-ebw $update_opts $cur_mdl $dir/num_acc.$x.acc $dir/den_acc.$x.acc $dir/$[$x+1].mdl || exit 1;
+  fi
+  cur_mdl=$dir/$[$x+1].mdl
+
+
+  # Some diagnostics: the objective function progress and auxiliary-function
+  # improvement.  Note: this code is same as in train_mmi.sh
+  tail -n 50 $dir/log/acc.$x.*.log | perl -e '$acwt=shift @ARGV; while(<STDIN>) { if(m/gmm-acc-stats2.+Overall weighted acoustic likelihood per frame was (\S+) over (\S+) frames/) { $tot_aclike += $1*$2; $tot_frames1 += $2; } if(m|lattice-to-post.+Overall average log-like/frame is (\S+) over (\S+) frames. Average acoustic like/frame is (\S+)|) { $tot_den_lat_like += $1*$2; $tot_frames2 += $2; $tot_den_aclike += $3*$2; } } if (abs($tot_frames1 - $tot_frames2) > 0.01*($tot_frames1 + $tot_frames2)) { print STDERR "Frame-counts disagree $tot_frames1 versus $tot_frames2\n"; } $tot_den_lat_like /= $tot_frames2; $tot_den_aclike /= $tot_frames2; $tot_aclike *= ($acwt / $tot_frames1); $num_like = $tot_aclike + $tot_den_aclike; $per_frame_objf = $num_like - $tot_den_lat_like; print "$per_frame_objf $tot_frames1\n"; ' $acwt > $dir/tmpf
+  objf=`cat $dir/tmpf | awk '{print $1}'`;
+  nf=`cat $dir/tmpf | awk '{print $2}'`;
+  rm $dir/tmpf
+  impr=`grep -w Overall $dir/log/update.$x.log | awk '{x += $10*$12;} END{print x;}'`
+  impr=`perl -e "print ($impr*$acwt/$nf);"` # We multiply by acwt, and divide by $nf which is the "real" number of frames.
+  echo "Iteration $x: objf was $objf, MMI auxf change was $impr" | tee $dir/objf.$x.log
+  x=$[$x+1]
+done
+
+echo "MMI training finished"
+
+rm $dir/final.mdl 2>/dev/null
+ln -s $x.mdl $dir/final.mdl
+
+exit 0;
diff --git a/egs/kaldi-vystadial-recipe/s5/steps/train_mmi_sgmm2.sh b/egs/kaldi-vystadial-recipe/s5/steps/train_mmi_sgmm2.sh
new file mode 100755
index 00000000000..ef45769fbbf
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/steps/train_mmi_sgmm2.sh
@@ -0,0 +1,152 @@
+#!/bin/bash
+# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
+
+# MMI training (or optionally boosted MMI, if you give the --boost option),
+# for SGMMs.  4 iterations (by default) of Extended Baum-Welch update.
+#
+# Begin configuration section.
+cmd=run.pl
+num_iters=4
+boost=0.0
+cancel=true # if true, cancel num and den counts on each frame.
+acwt=0.1
+stage=0
+update_opts=
+transform_dir=
+# End configuration section
+
+echo "$0 $@"  # Print the command line for logging
+
+[ -f ./path.sh ] && . ./path.sh; # source the path.
+. parse_options.sh || exit 1;
+
+if [ $# -ne 5 ]; then
+  echo "Usage: steps/train_mmi_sgmm2.sh <data> <lang> <ali-dir> <denlat-dir> <exp-dir>"
+  echo " e.g.: steps/train_mmi_sgmm2.sh data/train_si84 data/lang exp/tri2b_ali_si84 exp/tri2b_denlats_si84 exp/tri2b_mmi"
+  echo "Main options (for others, see top of script file)"
+  echo "  --boost <boost-weight>                           # (e.g. 0.1), for boosted MMI.  (default 0)"
+  echo "  --cancel (true|false)                            # cancel stats (true by default)"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --config <config-file>                           # config containing options"
+  echo "  --stage <stage>                                  # stage to do partial re-run from."
+  echo "  --transform-dir <transform-dir>                  # directory to find fMLLR transforms."
+  exit 1;
+fi
+
+data=$1
+lang=$2
+alidir=$3
+denlatdir=$4
+dir=$5
+mkdir -p $dir/log
+
+for f in $data/feats.scp $alidir/{tree,final.mdl,ali.1.gz} $denlatdir/lat.1.gz; do
+  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
+done
+nj=`cat $alidir/num_jobs` || exit 1;
+[ "$nj" -ne "`cat $denlatdir/num_jobs`" ] && \
+  echo "$alidir and $denlatdir have different num-jobs" && exit 1;
+
+sdata=$data/split$nj
+splice_opts=`cat $alidir/splice_opts 2>/dev/null`
+mkdir -p $dir/log
+cp $alidir/splice_opts $dir 2>/dev/null
+[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
+echo $nj > $dir/num_jobs
+
+cp $alidir/{final.mdl,tree} $dir
+
+silphonelist=`cat $lang/phones/silence.csl` || exit 1;
+
+# Set up features
+
+if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
+echo "$0: feature type is $feat_type"
+
+case $feat_type in
+  delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
+  lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
+    cp $alidir/final.mat $dir
+    ;;
+  *) echo "Invalid feature type $feat_type" && exit 1;
+esac
+
+if [ ! -z "$transform_dir" ]; then
+  echo "$0: using transforms from $transform_dir"
+  [ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" \
+    && exit 1;
+  feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |"
+else
+  echo "$0: no fMLLR transforms."
+fi
+
+if [ -f $alidir/vecs.1 ]; then
+  echo "$0: using speaker vectors from $alidir"
+  spkvecs_opt="--spk-vecs=ark:$alidir/vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk"
+else
+  echo "$0: no speaker vectors."
+  spkvecs_opt=
+fi
+
+if [ -f $alidir/gselect.1.gz ]; then
+  echo "$0: using Gaussian-selection info from $alidir"
+  gselect_opt="--gselect=ark:gunzip -c $alidir/gselect.JOB.gz|"
+else
+  echo "$0: error: no Gaussian-selection info found" && exit 1;
+fi
+
+lats="ark:gunzip -c $denlatdir/lat.JOB.gz|"
+if [[ "$boost" != "0.0" && "$boost" != 0 ]]; then
+  lats="$lats lattice-boost-ali --b=$boost --silence-phones=$silphonelist $alidir/final.mdl ark:- 'ark,s,cs:gunzip -c $alidir/ali.JOB.gz|' ark:- |"
+fi
+
+
+cur_mdl=$alidir/final.mdl
+x=0
+while [ $x -lt $num_iters ]; do
+  echo "Iteration $x of MMI training"
+  # Note: the num and den states are accumulated at the same time, so we
+  # can cancel them per frame.
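+  # Editor's aside (illustrative sketch, not part of the original script):
+  # the "$gselect_opt" used below points each job at a precomputed per-frame
+  # shortlist of Gaussian indices, produced along the lines of
+  #   sgmm2-gselect final.mdl "$feats" "ark,t:|gzip -c >gselect.1.gz"
+  # (paths here are placeholders); the rescoring and accumulation binaries
+  # then score only the shortlisted components instead of the full mixture.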
+  if [ $stage -le $x ]; then
+    $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
+      sgmm2-rescore-lattice "$gselect_opt" $spkvecs_opt $cur_mdl "$lats" "$feats" ark:- \| \
+      lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
+      sum-post --merge=$cancel --scale1=-1 \
+      ark:- "ark,s,cs:gunzip -c $alidir/ali.JOB.gz | ali-to-post ark:- ark:- |" ark:- \| \
+      sgmm2-acc-stats2 "$gselect_opt" $spkvecs_opt $cur_mdl "$feats" ark,s,cs:- \
+      $dir/num_acc.$x.JOB.acc $dir/den_acc.$x.JOB.acc || exit 1;
+
+    n=`echo $dir/{num,den}_acc.$x.*.acc | wc -w`;
+    [ "$n" -ne $[$nj*2] ] && \
+      echo "Wrong number of MMI accumulators $n versus 2*$nj" && exit 1;
+    $cmd $dir/log/den_acc_sum.$x.log \
+      sgmm2-sum-accs $dir/den_acc.$x.acc $dir/den_acc.$x.*.acc || exit 1;
+    rm $dir/den_acc.$x.*.acc
+    $cmd $dir/log/num_acc_sum.$x.log \
+      sgmm2-sum-accs $dir/num_acc.$x.acc $dir/num_acc.$x.*.acc || exit 1;
+    rm $dir/num_acc.$x.*.acc
+
+    $cmd $dir/log/update.$x.log \
+      sgmm2-est-ebw $update_opts $cur_mdl $dir/num_acc.$x.acc $dir/den_acc.$x.acc $dir/$[$x+1].mdl || exit 1;
+  fi
+  cur_mdl=$dir/$[$x+1].mdl
+
+
+  # Some diagnostics: the objective function progress and auxiliary-function
+  # improvement.  Note: this code is same as in train_mmi.sh
+  tail -n 50 $dir/log/acc.$x.*.log | perl -e '$acwt=shift @ARGV; while(<STDIN>) { if(m/sgmm2-acc-stats2.+Overall weighted acoustic likelihood per frame was (\S+) over (\S+) frames/) { $tot_aclike += $1*$2; $tot_frames1 += $2; } if(m|lattice-to-post.+Overall average log-like/frame is (\S+) over (\S+) frames. Average acoustic like/frame is (\S+)|) { $tot_den_lat_like += $1*$2; $tot_frames2 += $2; $tot_den_aclike += $3*$2; } } if (abs($tot_frames1 - $tot_frames2) > 0.01*($tot_frames1 + $tot_frames2)) { print STDERR "Frame-counts disagree $tot_frames1 versus $tot_frames2\n"; } $tot_den_lat_like /= $tot_frames2; $tot_den_aclike /= $tot_frames2; $tot_aclike *= ($acwt / $tot_frames1); $num_like = $tot_aclike + $tot_den_aclike; $per_frame_objf = $num_like - $tot_den_lat_like; print "$per_frame_objf $tot_frames1\n"; ' $acwt > $dir/tmpf
+  objf=`cat $dir/tmpf | awk '{print $1}'`;
+  nf=`cat $dir/tmpf | awk '{print $2}'`;
+  rm $dir/tmpf
+  impr=`grep -w Overall $dir/log/update.$x.log | awk '{x += $10*$12;} END{print x;}'`
+  impr=`perl -e "print ($impr*$acwt/$nf);"` # We multiply by acwt, and divide by $nf which is the "real" number of frames.
+  echo "Iteration $x: objf was $objf, MMI auxf change was $impr" | tee $dir/objf.$x.log
+  x=$[$x+1]
+done
+
+echo "MMI training finished"
+
+rm $dir/final.mdl 2>/dev/null
+ln -s $x.mdl $dir/final.mdl
+
+exit 0;
diff --git a/egs/kaldi-vystadial-recipe/s5/steps/train_mono.sh b/egs/kaldi-vystadial-recipe/s5/steps/train_mono.sh
new file mode 100755
index 00000000000..41aab425c15
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/steps/train_mono.sh
@@ -0,0 +1,135 @@
+#!/bin/bash
+# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
+# Apache 2.0
+
+
+# To be run from ..
+# Flat start and monophone training, with delta-delta features.
+# This script applies cepstral mean normalization (per speaker).
+
+# Begin configuration section.
+nj=4
+cmd=run.pl
+scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+num_iters=40    # Number of iterations of training
+max_iter_inc=30 # Last iter to increase #Gauss on.
+totgauss=1000   # Target #Gaussians.
+boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment
+realign_iters="1 2 3 4 5 6 7 8 9 10 12 14 16 18 20 23 26 29 32 35 38";
+config=  # name of config file.
+stage=-4
+power=0.2 # exponent to determine number of gaussians from occurrence counts
+feat_dim=39
+# End configuration section.
+
+echo "$0 $@"  # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+if [ $# != 3 ]; then
+  echo "Usage: steps/train_mono.sh [options] <data-dir> <lang-dir> <exp-dir>"
+  echo " e.g.: steps/train_mono.sh data/train.1k data/lang exp/mono"
+  echo "main options (for others, see top of script file)"
+  echo "  --config <config-file>                           # config containing options"
+  echo "  --nj <nj>                                        # number of parallel jobs"
+  echo "  --feat_dim <dim>                                 # dimension of feature vector (39)"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  exit 1;
+fi
+
+data=$1
+lang=$2
+dir=$3
+
+oov_sym=`cat $lang/oov.int` || exit 1;
+
+mkdir -p $dir/log
+echo $nj > $dir/num_jobs
+sdata=$data/split$nj;
+[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
+
+
+feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |"
+example_feats="`echo '$feats' | sed s/JOB/1/g`";
+
+echo "$0: Initializing monophone system."
+
+[ ! -f $lang/phones/sets.int ] && exit 1;
+shared_phones_opt="--shared-phones=$lang/phones/sets.int"
+
+if [ $stage -le -3 ]; then
+# Note: JOB=1 just uses the 1st part of the features-- we only need a subset anyway.
+  $cmd JOB=1 $dir/log/init.log \
+    gmm-init-mono $shared_phones_opt "--train-feats=$feats subset-feats --n=10 ark:- ark:-|" $lang/topo $feat_dim \
+    $dir/0.mdl $dir/tree || exit 1;
+fi
+
+numgauss=`gmm-info --print-args=false $dir/0.mdl | grep gaussians | awk '{print $NF}'`
+incgauss=$[($totgauss-$numgauss)/$max_iter_inc] # per-iter increment for #Gauss
+
+if [ $stage -le -2 ]; then
+  echo "$0: Compiling training graphs"
+  $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
+    compile-train-graphs $dir/tree $dir/0.mdl $lang/L.fst \
+    "ark:sym2int.pl --map-oov $oov_sym -f 2- $lang/words.txt < $sdata/JOB/text|" \
+    "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
+fi
+
+if [ $stage -le -1 ]; then
+  echo "$0: Aligning data equally (pass 0)"
+  $cmd JOB=1:$nj $dir/log/align.0.JOB.log \
+    align-equal-compiled "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" ark,t:- \| \
+    gmm-acc-stats-ali --binary=true $dir/0.mdl "$feats" ark:- \
+    $dir/0.JOB.acc || exit 1;
+fi
+
+# In the following steps, the --min-gaussian-occupancy=3 option is important, otherwise
+# we fail to estimate "rare" phones and later on, they never align properly.
+
+if [ $stage -le 0 ]; then
+  gmm-est --min-gaussian-occupancy=3 --mix-up=$numgauss --power=$power \
+    $dir/0.mdl "gmm-sum-accs - $dir/0.*.acc|" $dir/1.mdl 2> $dir/log/update.0.log || exit 1;
+  rm $dir/0.*.acc
+fi
+
+
+beam=6 # will change to 10 below after 1st pass
+# note: using slightly wider beams for WSJ vs. RM.
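+# Editor's aside (illustrative, not part of the original script): with the
+# defaults above (totgauss=1000, max_iter_inc=30), if gmm-init-mono had
+# produced, say, 130 Gaussians, the per-pass increment would be
+#   echo $(( (1000-130)/30 ))   # -> 29
+# i.e. 29 Gaussians are added on each pass through iteration 30, after which
+# the model size stays fixed for the remaining passes.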
+x=1
+while [ $x -lt $num_iters ]; do
+  echo "$0: Pass $x"
+  if [ $stage -le $x ]; then
+    if echo $realign_iters | grep -w $x >/dev/null; then
+      echo "$0: Aligning data"
+      mdl="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $dir/$x.mdl - |"
+      $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \
+        gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$[$beam*4] "$mdl" \
+        "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" "ark,t:|gzip -c >$dir/ali.JOB.gz" \
+        || exit 1;
+    fi
+    $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
+      gmm-acc-stats-ali $dir/$x.mdl "$feats" "ark:gunzip -c $dir/ali.JOB.gz|" \
+      $dir/$x.JOB.acc || exit 1;
+
+    $cmd $dir/log/update.$x.log \
+      gmm-est --write-occs=$dir/$[$x+1].occs --mix-up=$numgauss --power=$power $dir/$x.mdl \
+      "gmm-sum-accs - $dir/$x.*.acc|" $dir/$[$x+1].mdl || exit 1;
+    rm $dir/$x.mdl $dir/$x.*.acc $dir/$x.occs 2>/dev/null
+  fi
+  if [ $x -le $max_iter_inc ]; then
+    numgauss=$[$numgauss+$incgauss];
+  fi
+  beam=10
+  x=$[$x+1]
+done
+
+( cd $dir; rm final.{mdl,occs} 2>/dev/null; ln -s $x.mdl final.mdl; ln -s $x.occs final.occs )
+
+utils/summarize_warnings.pl $dir/log
+
+echo Done
+
+# example of showing the alignments:
+# show-alignments data/lang/phones.txt $dir/30.mdl "ark:gunzip -c $dir/ali.0.gz|" | head -4
+
diff --git a/egs/kaldi-vystadial-recipe/s5/steps/train_mpe.sh b/egs/kaldi-vystadial-recipe/s5/steps/train_mpe.sh
new file mode 100755
index 00000000000..0808dea6a27
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/steps/train_mpe.sh
@@ -0,0 +1,158 @@
+#!/bin/bash
+# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
+
+# MPE training (optionally on boosted lattices, if you give the --boost
+# option).  4 iterations (by default) of Extended Baum-Welch update.
+#
+# For the numerator we have a fixed alignment rather than a lattice--
+# this actually follows from the way lattices are defined in Kaldi, which
+# is to have a single path for each word (output-symbol) sequence.
+
+# Begin configuration section.
+cmd=run.pl
+num_iters=4
+boost=0.0
+cancel=true # if true, cancel num and den counts on each frame.
+tau=400
+weight_tau=10
+acwt=0.1
+stage=0
+smooth_to_model=true
+# End configuration section
+
+echo "$0 $@"  # Print the command line for logging
+
+[ -f ./path.sh ] && . ./path.sh; # source the path.
+. parse_options.sh || exit 1;
+
+if [ $# -ne 5 ]; then
+  echo "Usage: steps/train_mpe.sh <data> <lang> <ali-dir> <denlat-dir> <exp-dir>"
+  echo " e.g.: steps/train_mpe.sh data/train_si84 data/lang exp/tri2b_ali_si84 exp/tri2b_denlats_si84 exp/tri2b_mpe"
+  echo "Main options (for others, see top of script file)"
+  echo "  --boost <boost-weight>                           # (e.g. 0.1), for boosted lattices.  (default 0)"
+  echo "  --cancel (true|false)                            # cancel stats (true by default)"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --config <config-file>                           # config containing options"
+  echo "  --stage <stage>                                  # stage to do partial re-run from."
+  echo "  --tau <tau>                                      # tau for i-smooth to last iter (default 400)"
+
+  exit 1;
+fi
+
+data=$1
+lang=$2
+alidir=$3
+denlatdir=$4
+dir=$5
+mkdir -p $dir/log
+
+for f in $data/feats.scp $alidir/{tree,final.mdl,ali.1.gz} $denlatdir/lat.1.gz; do
+  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
+done
+nj=`cat $alidir/num_jobs` || exit 1;
+[ "$nj" -ne "`cat $denlatdir/num_jobs`" ] && \
+  echo "$alidir and $denlatdir have different num-jobs" && exit 1;
+
+sdata=$data/split$nj
+splice_opts=`cat $alidir/splice_opts 2>/dev/null`
+mkdir -p $dir/log
+cp $alidir/splice_opts $dir 2>/dev/null
+[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
+echo $nj > $dir/num_jobs
+
+cp $alidir/{final.mdl,tree} $dir
+
+silphonelist=`cat $lang/phones/silence.csl` || exit 1;
+
+# Set up features
+
+if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
+echo "$0: feature type is $feat_type"
+
+case $feat_type in
+  delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
+  lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
+    cp $alidir/final.mat $dir
+    ;;
+  *) echo "Invalid feature type $feat_type" && exit 1;
esac
+
+[ -f $alidir/trans.1 ] && echo Using transforms from $alidir && \
+  feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$alidir/trans.JOB ark:- ark:- |"
+
+lats="ark:gunzip -c $denlatdir/lat.JOB.gz|"
+if [[ "$boost" != "0.0" && "$boost" != 0 ]]; then
+  lats="$lats lattice-boost-ali --b=$boost --silence-phones=$silphonelist $alidir/final.mdl ark:- 'ark,s,cs:gunzip -c $alidir/ali.JOB.gz|' ark:- |"
+fi
+
+
+cur_mdl=$alidir/final.mdl
+x=0
+while [ $x -lt $num_iters ]; do
+  echo "Iteration $x of MPE training"
+  # Note: the num and den states are accumulated at the same time, so we
+  # can cancel them per frame.
+  if [ $stage -le $x ]; then
+    $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
+      gmm-rescore-lattice $cur_mdl "$lats" "$feats" ark:- \| \
+      lattice-to-mpe-post --acoustic-scale=$acwt $cur_mdl \
+      "ark,s,cs:gunzip -c $alidir/ali.JOB.gz | ali-to-post ark:- ark:- |" ark:- ark:- \| \
+      gmm-acc-stats2 $cur_mdl "$feats" ark,s,cs:- \
+      $dir/num_acc.$x.JOB.acc $dir/den_acc.$x.JOB.acc || exit 1;
+
+    n=`echo $dir/{num,den}_acc.$x.*.acc | wc -w`;
+    [ "$n" -ne $[$nj*2] ] && \
+      echo "Wrong number of MMI accumulators $n versus 2*$nj" && exit 1;
+    $cmd $dir/log/den_acc_sum.$x.log \
+      gmm-sum-accs $dir/den_acc.$x.acc $dir/den_acc.$x.*.acc || exit 1;
+    rm $dir/den_acc.$x.*.acc
+    $cmd $dir/log/num_acc_sum.$x.log \
+      gmm-sum-accs $dir/num_acc.$x.acc $dir/num_acc.$x.*.acc || exit 1;
+    rm $dir/num_acc.$x.*.acc
+
+    # note: this tau value is for smoothing towards the model parameters, as in
+    # the Boosted MMI paper, not towards the ML stats as in the earlier
+    # work on discriminative training (e.g. my thesis).
+    # You could use gmm-ismooth-stats to smooth to the ML stats, if you had
+    # them available [here they're not available if cancel=true].
+    if ! $smooth_to_model; then
+      echo "Iteration $x of MPE: computing ml (smoothing) stats"
+      $cmd JOB=1:$nj $dir/log/acc_ml.$x.JOB.log \
+        gmm-acc-stats $cur_mdl "$feats" \
+        "ark,s,cs:gunzip -c $alidir/ali.JOB.gz | ali-to-post ark:- ark:- |" \
+        $dir/ml.$x.JOB.acc || exit 1;
+      $cmd $dir/log/acc_ml_sum.$x.log \
+        gmm-sum-accs $dir/ml.$x.acc $dir/ml.$x.*.acc || exit 1;
+      rm $dir/ml.$x.*.acc
+      num_stats="gmm-ismooth-stats --tau=$tau $dir/ml.$x.acc $dir/num_acc.$x.acc -|"
+    else
+      num_stats="gmm-ismooth-stats --smooth-from-model=true --tau=$tau $cur_mdl $dir/num_acc.$x.acc -|"
+    fi
+
+    $cmd $dir/log/update.$x.log \
+      gmm-est-gaussians-ebw $cur_mdl "$num_stats" $dir/den_acc.$x.acc - \| \
+      gmm-est-weights-ebw - $dir/num_acc.$x.acc $dir/den_acc.$x.acc $dir/$[$x+1].mdl || exit 1;
+    rm $dir/{den,num}_acc.$x.acc
+  fi
+  cur_mdl=$dir/$[$x+1].mdl
+
+  # Some diagnostics: the objective function progress and auxiliary-function
+  # improvement.
+
+  tail -n 50 $dir/log/acc.$x.*.log | perl -e 'while(<STDIN>) { if(m/lattice-to-mpe-post.+Overall average frame-accuracy is (\S+) over (\S+) frames/) { $tot_objf += $1*$2; $tot_frames += $2; }} $tot_objf /= $tot_frames; print "$tot_objf $tot_frames\n"; ' > $dir/tmpf
+  objf=`cat $dir/tmpf | awk '{print $1}'`;
+  nf=`cat $dir/tmpf | awk '{print $2}'`;
+  rm $dir/tmpf
+  impr=`grep -w Overall $dir/log/update.$x.log | awk '{x += $10*$12;} END{print x;}'`
+  impr=`perl -e "print ($impr*$acwt/$nf);"` # We multiply by acwt, and divide by $nf which is the "real" number of frames.
+  # This gives us a projected objective function improvement.
+  echo "Iteration $x: objf was $objf, MPE auxf change was $impr" | tee $dir/objf.$x.log
+  x=$[$x+1]
+done
+
+echo "MPE training finished"
+
+rm $dir/final.mdl 2>/dev/null
+ln -s $x.mdl $dir/final.mdl
+
+exit 0;
diff --git a/egs/kaldi-vystadial-recipe/s5/steps/train_nnet.sh b/egs/kaldi-vystadial-recipe/s5/steps/train_nnet.sh
new file mode 100755
index 00000000000..dde713fbbd1
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/steps/train_nnet.sh
@@ -0,0 +1,284 @@
+#!/bin/bash
+
+# Copyright 2012  Karel Vesely (Brno University of Technology)
+# Apache 2.0
+
+# Begin configuration.
+cmd=run.pl
+
+# nnet config
+model_size=3000000 # nr. of parameters in MLP
+hid_layers=2       # nr. of hidden layers (prior to softmax or bottleneck)
+bn_dim=            # set value to get a bottleneck network
+hid_dim=           # set this to override the $model_size
+mlp_init=          # set this to override MLP initialization
+# training config
+learn_rate=0.008 # initial learning rate
+momentum=0.0     # momentum
+l1_penalty=0.0   # L1 regularization constant (lasso)
+l2_penalty=0.0   # L2 regularization constant (weight decay)
+# data processing config
+bunch_size=256   # size of the training block
+cache_size=16384 # size of the randomization cache
+randomize=true   # do the frame level randomization
+# feature config
+norm_vars=false # normalize the FBANKs (CVN)
+splice_lr=15    # temporal splicing
+feat_type=traps
+dct_basis=16    # nr. of DCT bases
+# scheduling config
+min_iters=   # set to enforce minimum number of iterations
+max_iters=20 # maximum number of iterations
+start_halving_inc=0.5 # frm-accuracy improvement to begin learn_rate reduction
+end_halving_inc=0.1   # frm-accuracy improvement to terminate the training
+halving_factor=0.5    # factor to multiply learn_rate
+# tool config
+TRAIN_TOOL="nnet-train-xent-hardlab-frmshuff" # training tool used for training / cross validation
+# End configuration.
+
+echo "$0 $@"  # Print the command line for logging
+
+[ -f path.sh ] && . ./path.sh;
+. parse_options.sh || exit 1;
+
+
+if [ $# != 6 ]; then
+  echo "Usage: $0 <data-train> <data-cv> <lang-dir> <ali-train> <ali-cv> <exp-dir>"
+  echo " e.g.: $0 data/train data/cv data/lang exp/mono_ali exp/mono_ali_cv exp/mono_nnet"
+  echo "main options (for others, see top of script file)"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --config <config-file>                           # config containing options"
+  exit 1;
+fi
+
+data=$1
+data_cv=$2
+lang=$3
+alidir=$4
+alidir_cv=$5
+dir=$6
+
+for f in $alidir/final.mdl $alidir/ali.1.gz $alidir_cv/ali.1.gz $data/feats.scp $data_cv/feats.scp $data/cmvn.scp $data_cv/cmvn.scp; do
+  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
+done
+
+echo "$0 [info]: Training Neural Network"
+printf "\t dir       : $dir \n"
+printf "\t Train-set : $data $alidir \n"
+printf "\t CV-set    : $data_cv $alidir_cv \n"
+
+mkdir -p $dir/{log,nnet}
+
+###### PREPARE ALIGNMENTS ######
+echo "Preparing alignments"
+#convert ali to pdf
+labels_tr="ark:$dir/ali_train.pdf"
+ali-to-pdf $alidir/final.mdl "ark:gunzip -c $alidir/ali.*.gz |" $labels_tr 2> $dir/ali_train.pdf_log || exit 1
+if [[ "$alidir" == "$alidir_cv" ]]; then
+  labels=$labels_tr
+else
+  #convert ali to pdf (cv set)
+  labels_cv="ark:$dir/ali_cv.pdf"
+  ali-to-pdf $alidir/final.mdl "ark:gunzip -c $alidir_cv/ali.*.gz |" $labels_cv 2> $dir/ali_cv.pdf_log || exit 1
+  #merge the two parts (scheduler expects one file in $labels)
+  labels="ark:$dir/ali_train_and_cv.pdf"
+  cat $dir/ali_train.pdf $dir/ali_cv.pdf > $dir/ali_train_and_cv.pdf
+fi
+
+#get the priors, count the class examples from alignments
+analyze-counts --binary=false $labels_tr $dir/ali_train.counts 2>$dir/ali_train.counts_log || exit 1
+#copy the old transition model, will be needed by decoder
+copy-transition-model --binary=false $alidir/final.mdl $dir/final.mdl 2>$dir/final.mdl_log || exit 1
+cp $alidir/tree $dir/tree || exit 1
+
+#analyze the train/cv alignments
+utils/nnet/analyze_alignments.sh "TRAINING SET" "ark:gunzip -c $alidir/ali.*.gz |" $dir/final.mdl $lang > $dir/__ali_stats_train
+utils/nnet/analyze_alignments.sh "VALIDATION SET" "ark:gunzip -c $alidir_cv/ali.*.gz |" $dir/final.mdl $lang > $dir/__ali_stats_cv
+
+
+###### PREPARE FEATURES ######
+# shuffle the list
+echo "Preparing train/cv lists"
+cat $data/feats.scp | utils/shuffle_list.pl ${seed:-777} > $dir/train.scp
+cp $data_cv/feats.scp $dir/cv.scp
+# print the list sizes
+wc -l $dir/train.scp $dir/cv.scp
+
+#get feature dim
+echo -n "Getting feature dim : "
+feat_dim=$(feat-to-dim --print-args=false scp:$dir/train.scp -)
+echo $feat_dim
+
+#add per-speaker CMVN
+echo "Will use CMVN statistics : $data/cmvn.scp, $data_cv/cmvn.scp"
+cmvn="scp:$data/cmvn.scp"
+cmvn_cv="scp:$data_cv/cmvn.scp"
+feats_tr="ark:apply-cmvn --print-args=false --norm-vars=$norm_vars --utt2spk=ark:$data/utt2spk $cmvn scp:$dir/train.scp ark:- |"
+feats_cv="ark:apply-cmvn --print-args=false --norm-vars=$norm_vars --utt2spk=ark:$data_cv/utt2spk $cmvn_cv scp:$dir/cv.scp ark:- |"
+# keep track of norm_vars option
+echo "$norm_vars" >$dir/norm_vars
+
+#add splicing
+splice_opts="--left-context=$splice_lr --right-context=$splice_lr"
+feats_tr="$feats_tr splice-feats --print-args=false $splice_opts ark:- ark:- |"
+feats_cv="$feats_cv splice-feats --print-args=false $splice_opts ark:- ark:- |"
+# keep track of splice_opts
+echo "$splice_opts" >$dir/splice_opts
+
+#choose further processing of spliced features
+echo "Feature type : $feat_type"
+case $feat_type in
+  plain)
+    ;;
+  traps)
+    #generate hamming+dct transform
+    transf=$dir/hamm_dct.mat
+    echo "Preparing Hamming DCT transform : $transf"
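+    # Editor's aside (illustrative, not part of the original script): with
+    # splice_lr=15 the spliced window spans 2*15+1 = 31 frames, so for an
+    # assumed 23-dim FBANK input the composed transform maps 23*31 = 713
+    # inputs down to 23*16 = 368 DCT coefficients (dct_basis=16):
+    #   echo $(( 23*(2*15+1) )) $(( 23*16 ))   # -> 713 368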
+    utils/nnet/gen_hamm_mat.py --fea-dim=$feat_dim --splice=$splice_lr > $dir/hamm.mat
+    utils/nnet/gen_dct_mat.py --fea-dim=$feat_dim --splice=$splice_lr --dct-basis=$dct_basis > $dir/dct.mat
+    compose-transforms --binary=false $dir/dct.mat $dir/hamm.mat $transf 2>${transf}_log || exit 1
+    #convert transform to NNET format
+    {
+      echo "<biasedlinearity> $((feat_dim*dct_basis)) $((feat_dim*(2*splice_lr+1)))"
+      cat $transf
+      echo -n ' [ '
+      for i in $(seq $((feat_dim*dct_basis))); do echo -n '0 '; done
+      echo ']'
+    } > $transf.net
+    #append transform to features
+    feats_tr="$feats_tr nnet-forward --print-args=false --silent=true $transf.net ark:- ark:- |"
+    feats_cv="$feats_cv nnet-forward --print-args=false --silent=true $transf.net ark:- ark:- |"
+    ;;
+  transf)
+    transf=$dir/final.mat
+    [ ! -f $alidir/final.mat ] && echo "Missing transform $alidir/final.mat" && exit 1;
+    cp $alidir/final.mat $transf
+    echo "Copied transform $transf"
+    feats_tr="$feats_tr transform-feats $transf ark:- ark:- |"
+    feats_cv="$feats_cv transform-feats $transf ark:- ark:- |"
+    ;;
+  transf-sat)
+    echo yet unimplemented...
+    exit 1;
+    ;;
+  *)
+    echo "Unknown feature type $feat_type"
+    exit 1;
+    ;;
+esac
+# keep track of feat_type
+echo $feat_type > $dir/feat_type
+
+#renormalize the MLP input to zero mean and unit variance
+cmvn_g="$dir/cmvn_glob.mat"
+echo "Renormalizing MLP input features by : $cmvn_g"
+compute-cmvn-stats --binary=false "$feats_tr" $cmvn_g 2>${cmvn_g}_log || exit 1
+feats_tr="$feats_tr apply-cmvn --print-args=false --norm-vars=true $cmvn_g ark:- ark:- |"
+feats_cv="$feats_cv apply-cmvn --print-args=false --norm-vars=true $cmvn_g ark:- ark:- |"
+
+
+###### INITIALIZE THE NNET ######
+
+if [ "" != "$mlp_init" ]; then
+  echo "Using pre-initialized network $mlp_init";
+else
+  echo -n "Initializing MLP : "
+  num_fea=$((feat_dim*dct_basis))
+  num_tgt=$(hmm-info --print-args=false $alidir/final.mdl | grep pdfs | awk '{ print $NF }')
+  # What is the topology?
+  if [ "" == "$bn_dim" ]; then #MLP w/o bottleneck
+    case "$hid_layers" in
+      1) #3-layer MLP
+        if [ "" != "$hid_dim" ]; then
+          num_hid=$hid_dim
+        else
+          num_hid=$((model_size/(num_fea+num_tgt)))
+        fi
+        mlp_init=$dir/nnet_${num_fea}_${num_hid}_${num_tgt}.init
+        echo " $mlp_init"
+        utils/nnet/gen_mlp_init.py --dim=${num_fea}:${num_hid}:${num_tgt} \
+          --gauss --negbias --seed=777 > $mlp_init
+        ;;
+      2|3|4|5|6|7|8|9|10) #(>3)-layer MLP
+        if [ "" != "$hid_dim" ]; then
+          num_hid=$hid_dim
+        else
+          a=$((hid_layers-1))
+          b=$((num_fea+num_tgt))
+          c=$((-model_size))
+          num_hid=$(awk "BEGIN{ num_hid= -$b/(2*$a) + sqrt($b^2 -4*$a*$c)/(2*$a); print int(num_hid) }")
+        fi
+        mlp_init=$dir/nnet_${num_fea}
+        dim_arg=${num_fea}
+        for i in $(seq $hid_layers); do
+          mlp_init=${mlp_init}_$num_hid
+          dim_arg=${dim_arg}:${num_hid}
+        done
+        mlp_init=${mlp_init}_${num_tgt}.init
+        dim_arg=${dim_arg}:${num_tgt}
+        echo " $mlp_init"
+        utils/nnet/gen_mlp_init.py --dim=${dim_arg} --gauss --negbias --seed=777 > $mlp_init
+        ;;
+      *)
+        echo "Unsupported number of hidden layers $hid_layers"
+        exit 1;
+    esac
+  else #bn-system
+    num_bn=$bn_dim
+    case "$hid_layers" in # i.e. number of layers in front of bottleneck
+      1) #5-layer MLP
+        if [ "" != "$hid_dim" ]; then
+          num_hid=$hid_dim
+        else
+          num_hid=$((model_size/(num_fea+num_tgt+(2*num_bn))))
+        fi
+        mlp_init=$dir/nnet_${num_fea}_${num_hid}_${num_bn}_${num_hid}_${num_tgt}.init
+        echo " $mlp_init"
+        utils/nnet/gen_mlp_init.py --dim=${num_fea}:${num_hid}:${num_bn}:${num_hid}:${num_tgt} --gauss --negbias --seed=777 --linBNdim=$num_bn > $mlp_init
+        ;;
+      2|3|4|5|6|7|8|9|10) #(>5)-layer MLP
+        if [ "" != "$hid_dim" ]; then
+          num_hid=$hid_dim
+        else
+          a=$((hid_layers-1))
+          b=$((num_fea+2*num_bn+num_tgt))
+          c=$((-model_size))
+          num_hid=$(awk "BEGIN{ num_hid= -$b/(2*$a) + sqrt($b^2 -4*$a*$c)/(2*$a); print int(num_hid) }")
+        fi
+        mlp_init=$dir/nnet_${num_fea}
+        dim_arg=${num_fea}
+        for i in $(seq $hid_layers); do
+          mlp_init=${mlp_init}_$num_hid
+          dim_arg=${dim_arg}:${num_hid}
+        done
+        mlp_init=${mlp_init}_${num_bn}lin_${num_hid}_${num_tgt}.init
+        dim_arg=${dim_arg}:${num_bn}:${num_hid}:${num_tgt}
+        echo " $mlp_init"
+        utils/nnet/gen_mlp_init.py --dim=${dim_arg} --gauss --negbias --seed=777 --linBNdim=$num_bn > $mlp_init
+        ;;
+      *)
+        echo "Unsupported number of hidden layers $hid_layers"
+        exit 1;
+    esac
+  fi
+fi
+
+
+
+###### TRAIN ######
+echo "Starting training : "
+source utils/nnet/train_nnet_scheduler.sh
+echo "Training finished."
+echo
+if [ "" == "$mlp_final" ]; then
+  echo "No final network returned!";
+  exit 1;
+else
+  ( cd $dir; ln -s nnet/${mlp_final##*/} final.nnet; )
+  echo "Final network $mlp_final linked to $dir/final.nnet";
+fi
+
+echo "Succeeded training the Neural Network : $dir/final.nnet"
+
+
+
diff --git a/egs/kaldi-vystadial-recipe/s5/steps/train_quick.sh b/egs/kaldi-vystadial-recipe/s5/steps/train_quick.sh
new file mode 100755
index 00000000000..80638b3c8c4
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/steps/train_quick.sh
@@ -0,0 +1,191 @@
+#!/bin/bash
+# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
+
+
+# Train a model on top of existing features (no feature-space learning of any
+# kind is done).  This script initializes the model from each stage of the
+# previous system's model, judging the similarities based on overlap of counts
+# in the tree stats.
+
+# Begin configuration..
+cmd=run.pl
+scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+realign_iters="10 15"; # Only realign twice.
+num_iters=20   # Number of iterations of training
+maxiterinc=15  # Last iter to increase #Gauss on.
+batch_size=750 # batch size to use while compiling graphs... memory/speed tradeoff.
+beam=10 # alignment beam.
+retry_beam=40
+stage=-5
+cluster_thresh=-1 # for build-tree control final bottom-up clustering of leaves
+# End configuration section.
+
+echo "$0 $@"  # Print the command line for logging
+
+[ -f path.sh ] && . ./path.sh
+. parse_options.sh || exit 1;
+
+if [ $# != 6 ]; then
+  echo "Usage: steps/train_quick.sh <#leaves> <#gauss> <data> <lang> <ali-dir> <exp-dir>"
+  echo " e.g.: steps/train_quick.sh 2500 15000 data/train_si284 data/lang exp/tri3c_ali_si284 exp/tri4b"
+  echo "Main options (for others, see top of script file)"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --config <config-file>                           # config containing options"
+  echo "  --stage <stage>                                  # stage to do partial re-run from."
+  exit 1;
+fi
+
+numleaves=$1
+totgauss=$2
+data=$3
+lang=$4
+alidir=$5
+dir=$6
+
+for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl; do
+  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
+done
+
+# Set various variables.
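+# Editor's aside (illustrative, not part of the original script): starting
+# numgauss at half the target below leaves room to mix up gradually; e.g.
+# with totgauss=15000 and maxiterinc=15,
+#   numgauss = 15000/2 = 7500;  incgauss = (15000-7500)/15 = 500
+# so 500 Gaussians are added per pass through iteration 15.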
+oov=`cat $lang/oov.int`
+silphonelist=`cat $lang/phones/silence.csl`
+ciphonelist=`cat $lang/phones/context_indep.csl`
+numgauss=$[totgauss/2] # Start with half the total number of Gaussians.  We won't have
+   # to mix up much probably, as we're initializing with the old (already mixed-up) pdf's.
+[ $numgauss -lt $numleaves ] && numgauss=$numleaves
+incgauss=$[($totgauss-$numgauss)/$maxiterinc] # per-iter increment for #Gauss
+nj=`cat $alidir/num_jobs` || exit 1;
+sdata=$data/split$nj
+splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options.
+
+mkdir -p $dir/log
+echo $nj >$dir/num_jobs
+cp $alidir/splice_opts $dir 2>/dev/null
+[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
+
+## Set up features.
+if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
+echo "$0: feature type is $feat_type"
+
+case $feat_type in
+  delta) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
+  lda) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
+    cp $alidir/final.mat $dir
+    ;;
+  *) echo "Invalid feature type $feat_type" && exit 1;
+esac
+if [ -f $alidir/trans.1 ]; then
+  echo "$0: using transforms from $alidir"
+  ln.pl $alidir/trans.* $dir  # Link them to dest dir.
+  feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |"
+fi
+##
+
+
+if [ $stage -le -5 ]; then
+  echo "$0: accumulating tree stats"
+  $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \
+    acc-tree-stats --ci-phones=$ciphonelist $alidir/final.mdl "$feats" \
+    "ark:gunzip -c $alidir/ali.JOB.gz|" $dir/JOB.treeacc || exit 1;
+  [ "`ls $dir/*.treeacc | wc -w`" -ne "$nj" ] && echo "$0: Wrong #tree-stats" && exit 1;
+  sum-tree-stats $dir/treeacc $dir/*.treeacc 2>$dir/log/sum_tree_acc.log || exit 1;
+  rm $dir/*.treeacc
+fi
+
+if [ $stage -le -4 ]; then
+  echo "$0: Getting questions for tree clustering."
+  # preparing questions, roots file...
+  cluster-phones $dir/treeacc $lang/phones/sets.int $dir/questions.int 2> $dir/log/questions.log || exit 1;
+  cat $lang/phones/extra_questions.int >> $dir/questions.int
+  compile-questions $lang/topo $dir/questions.int $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1;
+
+  echo "$0: Building the tree"
+  $cmd $dir/log/build_tree.log \
+    build-tree --verbose=1 --max-leaves=$numleaves \
+    --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \
+    $dir/questions.qst $lang/topo $dir/tree || exit 1;
+fi

+if [ $stage -le -3 ]; then
+  echo "$0: Initializing the model"
+
+  # The gmm-init-model command (with more than the normal # of command-line args)
+  # will initialize the p.d.f.'s to the p.d.f.'s in the alignment model.
+
+  gmm-init-model  --write-occs=$dir/1.occs \
+    $dir/tree $dir/treeacc $lang/topo $dir/tmp.mdl $alidir/tree $alidir/final.mdl \
+    2>$dir/log/init_model.log || exit 1;
+
+  grep 'no stats' $dir/log/init_model.log && echo "$0: This is a bad warning.";
+  rm $dir/treeacc
+fi
+
+if [ $stage -le -2 ]; then
+  echo "$0: mixing up old model."
+  # We do both mixing-down and mixing-up to get the target #Gauss in each state,
+  # since the initial model may have either more or fewer Gaussians than we want.
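+  # Editor's aside (illustrative sketch, not part of the original script):
+  # passing equal --mix-down and --mix-up targets clamps the model to exactly
+  # that many Gaussians, merging or splitting per state according to the
+  # occupancies in 1.occs.  Standalone shape of the call (file names are
+  # placeholders for the paths used below):
+  #   gmm-mixup --mix-down=7500 --mix-up=7500 tmp.mdl 1.occs 1.mdl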
+  gmm-mixup --mix-down=$numgauss --mix-up=$numgauss $dir/tmp.mdl $dir/1.occs $dir/1.mdl \
+    2> $dir/log/mixup.log || exit 1;
+  rm $dir/tmp.mdl
+fi
+
+# Convert alignments to the new tree.
+if [ $stage -le -1 ]; then
+  echo "$0: converting old alignments"
+  $cmd JOB=1:$nj $dir/log/convert.JOB.log \
+    convert-ali $alidir/final.mdl $dir/1.mdl $dir/tree \
+    "ark:gunzip -c $alidir/ali.JOB.gz|" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
+fi
+
+if [ $stage -le 0 ]; then
+  echo "$0: compiling training graphs"
+  $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
+    compile-train-graphs --batch-size=$batch_size $dir/tree $dir/1.mdl $lang/L.fst \
+    "ark:sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $sdata/JOB/text |" \
+    "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
+fi
+
+x=1
+while [ $x -lt $num_iters ]; do
+  echo "$0: pass $x"
+  if echo $realign_iters | grep -w $x >/dev/null && [ $stage -le $x ]; then
+    echo "$0: aligning data"
+    $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \
+      gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam $dir/$x.mdl \
+      "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" \
+      || exit 1;
+  fi
+  if [ $stage -le $x ]; then
+    $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
+      gmm-acc-stats-ali $dir/$x.mdl "$feats" \
+      "ark,s,cs:gunzip -c $dir/ali.JOB.gz|" $dir/$x.JOB.acc || exit 1;
+    [ "`ls $dir/$x.*.acc | wc -w`" -ne "$nj" ] && echo "$0: wrong #accs" && exit 1;
+    $cmd $dir/log/update.$x.log \
+      gmm-est --write-occs=$dir/$[$x+1].occs --mix-up=$numgauss $dir/$x.mdl \
+      "gmm-sum-accs - $dir/$x.*.acc |" $dir/$[$x+1].mdl || exit 1;
+    rm $dir/$x.mdl $dir/$x.*.acc $dir/$x.occs
+  fi
+  [[ $x -le $maxiterinc ]] && numgauss=$[$numgauss+$incgauss];
+  x=$[$x+1];
+done
+
+if [ -f $alidir/trans.1 ]; then
+  echo "$0: estimating alignment model"
+  $cmd JOB=1:$nj $dir/log/acc_alimdl.JOB.log \
+    ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \
+    gmm-acc-stats-twofeats $dir/$x.mdl "$feats" "$sifeats" \
+    ark,s,cs:- $dir/$x.JOB.acc || exit 1;
+  [ "`ls $dir/$x.*.acc | wc -w`" -ne "$nj" ] && echo "$0: wrong #accs" && exit 1;
+
+  $cmd $dir/log/est_alimdl.log \
+    gmm-est --write-occs=$dir/final.occs --remove-low-count-gaussians=false $dir/$x.mdl \
+    "gmm-sum-accs - $dir/$x.*.acc|" $dir/$x.alimdl || exit 1;
+  rm $dir/$x.*.acc
+  rm $dir/final.alimdl 2>/dev/null
+  ln -s $x.alimdl $dir/final.alimdl
+fi
+
+rm $dir/final.mdl 2>/dev/null
+ln -s $x.mdl $dir/final.mdl
+
+echo Done
diff --git a/egs/kaldi-vystadial-recipe/s5/steps/train_sat.sh b/egs/kaldi-vystadial-recipe/s5/steps/train_sat.sh
new file mode 100755
index 00000000000..b9356cf2d6f
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/steps/train_sat.sh
@@ -0,0 +1,238 @@
+#!/bin/bash
+# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
+
+
+# This does Speaker Adapted Training (SAT), i.e. train on
+# fMLLR-adapted features.  It can be done on top of either LDA+MLLT, or
+# delta and delta-delta features.  If there are no transforms supplied
+# in the alignment directory, it will estimate transforms itself before
+# building the tree (and in any case, it estimates transforms a number
+# of times during training).
+
+
+# Begin configuration section.
+stage=-5
+fmllr_update_type=full
+cmd=run.pl
+scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+beam=10
+retry_beam=40
+boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment
+realign_iters="10 20 30";
+fmllr_iters="2 4 6 12";
+silence_weight=0.0 # Weight on silence in fMLLR estimation.
+num_iters=35    # Number of iterations of training
+max_iter_inc=25 # Last iter to increase #Gauss on.
+power=0.2 # Exponent for number of gaussians according to occurrence counts
+cluster_thresh=-1 # for build-tree control final bottom-up clustering of leaves
+# End configuration section.
+
+echo "$0 $@"  # Print the command line for logging
+
+[ -f path.sh ] && . ./path.sh
+. parse_options.sh || exit 1;
+
+if [ $# != 6 ]; then
+  echo "Usage: steps/train_sat.sh <#leaves> <#gauss> <data> <lang> <ali-dir> <exp-dir>"
+  echo " e.g.: steps/train_sat.sh 2500 15000 data/train_si84 data/lang exp/tri2b_ali_si84 exp/tri3b"
+  echo "Main options (for others, see top of script file)"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --config <config-file>                           # config containing options"
+  echo "  --stage <stage>                                  # stage to do partial re-run from."
+  exit 1;
+fi
+
+numleaves=$1
+totgauss=$2
+data=$3
+lang=$4
+alidir=$5
+dir=$6
+
+for f in $data/feats.scp $lang/phones.txt $alidir/final.mdl $alidir/ali.1.gz; do
+  [ ! -f $f ] && echo "train_sat.sh: no such file $f" && exit 1;
+done
+
+numgauss=$numleaves
+incgauss=$[($totgauss-$numgauss)/$max_iter_inc] # per-iter #gauss increment
+oov=`cat $lang/oov.int`
+nj=`cat $alidir/num_jobs` || exit 1;
+silphonelist=`cat $lang/phones/silence.csl`
+ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1;
+sdata=$data/split$nj;
+splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options.
+
+mkdir -p $dir/log
+cp $alidir/splice_opts $dir 2>/dev/null # frame-splicing options.
+
+echo $nj >$dir/num_jobs
+[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
+
+# Set up features.
+
+if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
+echo "$0: feature type is $feat_type"
+
+## Set up speaker-independent features.
+case $feat_type in
+  delta) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
+  lda) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
+    cp $alidir/final.mat $dir
+    ;;
+  *) echo "$0: invalid feature type $feat_type" && exit 1;
+esac
+
+## Get initial fMLLR transforms (possibly from alignment dir)
+if [ -f $alidir/trans.1 ]; then
+  echo "$0: Using transforms from $alidir"
+  feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$alidir/trans.JOB ark:- ark:- |"
+  cur_trans_dir=$alidir
+else
+  if [ $stage -le -4 ]; then
+    echo "$0: obtaining initial fMLLR transforms since not present in $alidir"
+    $cmd JOB=1:$nj $dir/log/fmllr.0.JOB.log \
+      ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
+      weight-silence-post $silence_weight $silphonelist $alidir/final.mdl ark:- ark:- \| \
+      gmm-est-fmllr --fmllr-update-type=$fmllr_update_type \
+      --spk2utt=ark:$sdata/JOB/spk2utt $alidir/final.mdl "$sifeats" \
+      ark:- ark:$dir/trans.JOB || exit 1;
+  fi
+  feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |"
+  cur_trans_dir=$dir
+fi
+
+if [ $stage -le -3 ]; then
+  # Get tree stats.
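+  # Editor's aside (illustrative sketch, not part of the original script):
+  # acc-tree-stats collects, per phone-in-context, the Gaussian sufficient
+  # statistics (counts and mean/variance stats) that build-tree later clusters
+  # into the state-tying tree.  Standalone shape of the two steps (paths and
+  # the ci-phone list are placeholders):
+  #   acc-tree-stats --ci-phones=1:2:3 final.mdl "$feats" "ark:gunzip -c ali.1.gz|" 1.treeacc
+  #   sum-tree-stats treeacc 1.treeacc 2.treeacc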
+ echo "$0: Accumulating tree stats" + $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \ + acc-tree-stats --ci-phones=$ciphonelist $alidir/final.mdl "$feats" \ + "ark:gunzip -c $alidir/ali.JOB.gz|" $dir/JOB.treeacc || exit 1; + [ "`ls $dir/*.treeacc | wc -w`" -ne "$nj" ] && echo "$0: Wrong #tree-accs" && exit 1; + $cmd $dir/log/sum_tree_acc.log \ + sum-tree-stats $dir/treeacc $dir/*.treeacc || exit 1; + rm $dir/*.treeacc +fi + +if [ $stage -le -2 ]; then + echo "$0: Getting questions for tree clustering." + # preparing questions, roots file... + cluster-phones $dir/treeacc $lang/phones/sets.int $dir/questions.int 2> $dir/log/questions.log || exit 1; + cat $lang/phones/extra_questions.int >> $dir/questions.int + compile-questions $lang/topo $dir/questions.int $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1; + + echo "$0: Building the tree" + $cmd $dir/log/build_tree.log \ + build-tree --verbose=1 --max-leaves=$numleaves \ + --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \ + $dir/questions.qst $lang/topo $dir/tree || exit 1; + + gmm-init-model --write-occs=$dir/1.occs \ + $dir/tree $dir/treeacc $lang/topo $dir/1.mdl 2> $dir/log/init_model.log || exit 1; + grep 'no stats' $dir/log/init_model.log && echo "$0: This is a bad warning."; + + rm $dir/treeacc +fi + + +if [ $stage -le -1 ]; then + # Convert the alignments. + echo "$0: Converting alignments from $alidir to use current tree" + $cmd JOB=1:$nj $dir/log/convert.JOB.log \ + convert-ali $alidir/final.mdl $dir/1.mdl $dir/tree \ + "ark:gunzip -c $alidir/ali.JOB.gz|" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; +fi + +if [ $stage -le 0 ]; then + echo "$0: Compiling graphs of transcripts" + $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \ + compile-train-graphs $dir/tree $dir/1.mdl $lang/L.fst \ + "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $sdata/JOB/text |" \ + "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1; +fi + +x=1 +while [ $x -lt $num_iters ]; do + echo Pass $x + if echo $realign_iters | grep -w $x >/dev/null && [ $stage -le $x ]; then + echo Aligning data + mdl="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $dir/$x.mdl - |" + $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \ + gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$mdl" \ + "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \ + "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; + fi + + if echo $fmllr_iters | grep -w $x >/dev/null; then + if [ $stage -le $x ]; then + echo Estimating fMLLR transforms + # We estimate a transform that's additional to the previous transform; + # we'll compose them. + $cmd JOB=1:$nj $dir/log/fmllr.$x.JOB.log \ + ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \ + weight-silence-post $silence_weight $silphonelist $dir/$x.mdl ark:- ark:- \| \ + gmm-est-fmllr --fmllr-update-type=$fmllr_update_type \ + --spk2utt=ark:$sdata/JOB/spk2utt $dir/$x.mdl \ + "$feats" ark:- ark:$dir/tmp_trans.JOB || exit 1; + for n in `seq $nj`; do + ! 
+        ! ( compose-transforms --b-is-affine=true \
+          ark:$dir/tmp_trans.$n ark:$cur_trans_dir/trans.$n ark:$dir/composed_trans.$n \
+          && mv $dir/composed_trans.$n $dir/trans.$n && \
+          rm $dir/tmp_trans.$n ) 2>$dir/log/compose_transforms.$x.log \
+          && echo "$0: Error composing transforms" && exit 1;
+      done
+    fi
+    feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |"
+    cur_trans_dir=$dir
+  fi
+
+  if [ $stage -le $x ]; then
+    $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
+      gmm-acc-stats-ali $dir/$x.mdl "$feats" \
+      "ark,s,cs:gunzip -c $dir/ali.JOB.gz|" $dir/$x.JOB.acc || exit 1;
+    [ `ls $dir/$x.*.acc | wc -w` -ne "$nj" ] && echo "$0: Wrong #accs" && exit 1;
+    $cmd $dir/log/update.$x.log \
+      gmm-est --power=$power --write-occs=$dir/$[$x+1].occs --mix-up=$numgauss $dir/$x.mdl \
+      "gmm-sum-accs - $dir/$x.*.acc |" $dir/$[$x+1].mdl || exit 1;
+    rm $dir/$x.mdl $dir/$x.*.acc
+    rm $dir/$x.occs
+  fi
+  [ $x -le $max_iter_inc ] && numgauss=$[$numgauss+$incgauss];
+  x=$[$x+1];
+done
+
+
+if [ $stage -le $x ]; then
+  # Accumulate stats for "alignment model"-- this model is
+  # computed with the speaker-independent features, but matches Gaussian-for-Gaussian
+  # with the final speaker-adapted model.
+  $cmd JOB=1:$nj $dir/log/acc_alimdl.JOB.log \
+    ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \
+    gmm-acc-stats-twofeats $dir/$x.mdl "$feats" "$sifeats" \
+    ark,s,cs:- $dir/$x.JOB.acc || exit 1;
+  [ `ls $dir/$x.*.acc | wc -w` -ne "$nj" ] && echo "$0: Wrong #accs" && exit 1;
+  # Update model.
+  $cmd $dir/log/est_alimdl.log \
+    gmm-est --power=$power --remove-low-count-gaussians=false $dir/$x.mdl \
+    "gmm-sum-accs - $dir/$x.*.acc|" $dir/$x.alimdl || exit 1;
+  rm $dir/$x.*.acc
+fi
+
+rm $dir/final.{mdl,alimdl,occs} 2>/dev/null
+ln -s $x.mdl $dir/final.mdl
+ln -s $x.occs $dir/final.occs
+ln -s $x.alimdl $dir/final.alimdl
+
+
+
+utils/summarize_warnings.pl $dir/log
+(
+  echo "$0: Likelihood evolution:"
+  for x in `seq $[$num_iters-1]`; do
+    tail -n 30 $dir/log/acc.$x.*.log | awk '/Overall avg like/{l += $(NF-3)*$(NF-1); t += $(NF-1); }
+        /Overall average logdet/{d += $(NF-3)*$(NF-1); t2 += $(NF-1);}
+        END{ d /= t2; l /= t; printf("%s ", d+l); } '
+  done
+  echo
+) | tee $dir/log/summary.log
+
+echo Done
diff --git a/egs/kaldi-vystadial-recipe/s5/steps/train_sgmm.sh b/egs/kaldi-vystadial-recipe/s5/steps/train_sgmm.sh
new file mode 100755
index 00000000000..8c866a3961a
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/steps/train_sgmm.sh
@@ -0,0 +1,273 @@
+#!/bin/bash
+
+# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
+
+# SGMM training, with speaker vectors.  This script would normally be called on
+# top of fMLLR features obtained from a conventional system, but it also works
+# on top of any type of speaker-independent features (based on
+# deltas+delta-deltas or LDA+MLLT).  For more info on SGMMs, see the paper "The
+# subspace Gaussian mixture model--A structured model for speech recognition".
+# (Computer Speech and Language, 2011).
+
+# Begin configuration section.
+nj=4
+cmd=run.pl
+stage=-6
+context_opts=  # e.g. set it to "--context-width=5 --central-position=2" for a
+               # quinphone system.
+scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+num_iters=25   # Total number of iterations
+num_iters_alimdl=3 # Number of iterations for estimating alignment model.
+max_iter_inc=15 # Last iter to increase #substates on.
+realign_iters="5 10 15"; # Iters to realign on.
+spkvec_iters="5 8 12 17" # Iters to estimate speaker vectors on.
+increase_dim_iters="6 8"; # Iters on which to increase phn dim and/or spk dim;
+   # rarely necessary, and if it is, only the 1st will normally be necessary.
+rand_prune=0.1 # Randomized-pruning parameter for posteriors, to speed up training.
+phn_dim=  # You can use this to set the phonetic subspace dim. [default: feat-dim+1]
+spk_dim=  # You can use this to set the speaker subspace dim. [default: feat-dim]
+power=0.2 # Exponent for number of gaussians according to occurrence counts
+beam=8
+retry_beam=40
+cluster_thresh=-1 # for build-tree control final bottom-up clustering of leaves
+# End configuration section.
+
+echo "$0 $@"  # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+
+if [ $# != 7 ]; then
+  echo "Usage: steps/train_sgmm.sh <num-leaves> <tot-substates> <data> <lang> <ali-dir> <ubm> <exp-dir>"
+  echo " e.g.: steps/train_sgmm.sh 3500 10000 data/train_si84 data/lang \\"
+  echo "                           exp/tri3b_ali_si84 exp/ubm4a/final.ubm exp/sgmm4a"
+  echo "main options (for others, see top of script file)"
+  echo "  --config <config-file>                           # config containing options"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --silence-weight <sil-weight>                    # weight for silence (e.g. 0.5 or 0.0)"
+  echo "  --num-iters <#iters>                             # Number of iterations of E-M"
+  exit 1;
+fi
+
+
+num_leaves=$1
+totsubstates=$2
+data=$3
+lang=$4
+alidir=$5
+ubm=$6
+dir=$7
+
+# Check some files.
+for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $ubm; do
+  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
+done
+
+
+# Set some variables.
+oov=`cat $lang/oov.int`
+silphonelist=`cat $lang/phones/silence.csl`
+ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1;
+numsubstates=$num_leaves # Initial #-substates.
+incsubstates=$[($totsubstates-$numsubstates)/$max_iter_inc] # per-iter increment for #substates
+feat_dim=`gmm-info $alidir/final.mdl 2>/dev/null | awk '/feature dimension/{print $NF}'` || exit 1;
+[ $feat_dim -eq $feat_dim ] || exit 1; # make sure it's numeric.
+[ -z $phn_dim ] && phn_dim=$[$feat_dim+1]
+[ -z $spk_dim ] && spk_dim=$feat_dim
+nj=`cat $alidir/num_jobs` || exit 1;
+
+mkdir -p $dir/log
+echo $nj > $dir/num_jobs
+sdata=$data/split$nj;
+splice_opts=`cat $alidir/splice_opts 2>/dev/null`
+cp $alidir/splice_opts $dir 2>/dev/null
+[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
+
+spkvecs_opt=  # Empty option for now, until we estimate the speaker vectors.
+gselect_opt="--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|"
+
+## Set up features.
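+# Editor's aside (illustrative, not part of the original script): the "feats"
+# strings built below are Kaldi rspecifiers that embed a shell pipeline; the
+# consuming binary substitutes JOB and runs it per job, so the delta case
+# behaves like
+#   apply-cmvn --norm-vars=false --utt2spk=ark:utt2spk scp:cmvn.scp \
+#     scp:feats.scp ark:- | add-deltas ark:- ark:-
+# (utt2spk/cmvn.scp/feats.scp stand in for the per-job split files).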
+if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
+echo "$0: feature type is $feat_type"
+
+case $feat_type in
+  delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
+  lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
+    cp $alidir/final.mat $dir
+    ;;
+  *) echo "$0: invalid feature type $feat_type" && exit 1;
+esac
+if [ -f $alidir/trans.1 ]; then
+  echo "$0: using transforms from $alidir"
+  feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$alidir/trans.JOB ark:- ark:- |"
+fi
+##
+
+
+if [ $stage -le -6 ]; then
+  echo "$0: accumulating tree stats"
+  $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \
+    acc-tree-stats --ci-phones=$ciphonelist $alidir/final.mdl "$feats" \
+    "ark:gunzip -c $alidir/ali.JOB.gz|" $dir/JOB.treeacc || exit 1;
+  [ "`ls $dir/*.treeacc | wc -w`" -ne "$nj" ] && echo "$0: Wrong #tree-stats" && exit 1;
+  sum-tree-stats $dir/treeacc $dir/*.treeacc 2>$dir/log/sum_tree_acc.log || exit 1;
+  rm $dir/*.treeacc
+fi
+
+if [ $stage -le -5 ]; then
+  echo "$0: Getting questions for tree clustering."
+  # preparing questions, roots file...
+  cluster-phones $dir/treeacc $lang/phones/sets.int $dir/questions.int 2> $dir/log/questions.log || exit 1;
+  cat $lang/phones/extra_questions.int >> $dir/questions.int
+  compile-questions $lang/topo $dir/questions.int $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1;
+
+  echo "$0: Building the tree"
+  $cmd $dir/log/build_tree.log \
+    build-tree --verbose=1 --max-leaves=$num_leaves \
+    --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \
+    $dir/questions.qst $lang/topo $dir/tree || exit 1;
+fi
+
+if [ $stage -le -4 ]; then
+  echo "$0: Initializing the model"
+  # Note: if phn_dim > feat_dim+1 or spk_dim > feat_dim, these dims
+  # will be truncated on initialization.
+  $cmd $dir/log/init_sgmm.log \
+    sgmm-init --phn-space-dim=$phn_dim --spk-space-dim=$spk_dim $lang/topo \
+    $dir/tree $ubm $dir/0.mdl || exit 1;
+fi
+
+if [ $stage -le -3 ]; then
+  echo "$0: doing Gaussian selection"
+  $cmd JOB=1:$nj $dir/log/gselect.JOB.log \
+    sgmm-gselect $dir/0.mdl "$feats" \
+    "ark,t:|gzip -c >$dir/gselect.JOB.gz" || exit 1;
+fi
+
+if [ $stage -le -2 ]; then
+  echo "$0: compiling training graphs"
+  text="ark:sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $sdata/JOB/text|"
+  $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
+    compile-train-graphs $dir/tree $dir/0.mdl $lang/L.fst \
+    "$text" "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
+fi
+
+if [ $stage -le -1 ]; then
+  echo "$0: Converting alignments"
+  $cmd JOB=1:$nj $dir/log/convert_ali.JOB.log \
+    convert-ali $alidir/final.mdl $dir/0.mdl $dir/tree "ark:gunzip -c $alidir/ali.JOB.gz|" \
+    "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
+fi
+
+x=0
+while [ $x -lt $num_iters ]; do
+  echo "$0: training pass $x ..."
+  if echo $realign_iters | grep -w $x >/dev/null && [ $stage -le $x ]; then
+    echo "$0: re-aligning data"
+    $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \
+      sgmm-align-compiled $spkvecs_opt $scale_opts "$gselect_opt" \
+      --utt2spk=ark:$sdata/JOB/utt2spk --beam=$beam --retry-beam=$retry_beam \
+      $dir/$x.mdl "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \
+      "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
+  fi
+  if [ $spk_dim -gt 0 ] && echo $spkvec_iters | grep -w $x >/dev/null; then
+    if [ $stage -le $x ]; then
+      $cmd JOB=1:$nj $dir/log/spkvecs.$x.JOB.log \
+        ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \
+        weight-silence-post 0.01 $silphonelist $dir/$x.mdl ark:- ark:- \| \
+        sgmm-est-spkvecs --rand-prune=$rand_prune --spk2utt=ark:$sdata/JOB/spk2utt \
+        $spkvecs_opt "$gselect_opt" $dir/$x.mdl "$feats" ark,s,cs:- \
+        ark:$dir/tmp_vecs.JOB '&&' mv $dir/tmp_vecs.JOB $dir/vecs.JOB || exit 1;
+    fi
+    spkvecs_opt="--spk-vecs=ark:$dir/vecs.JOB"
+  fi
+  if [ $x -eq 0 ]; then
+    flags=vwcSt # on the first iteration, don't update projections M or N
+  elif [ $spk_dim -gt 0 -a $[$x%2] -eq 1 -a $x -ge `echo $spkvec_iters | awk '{print $1}'` ]; then
+    # Update N if we have speaker-vector space and x is odd,
+    # and we've already updated the speaker vectors...
+    flags=vNwcSt
+  else
+    # otherwise update M.
+    flags=vMwcSt
+  fi
+
+  if [ $stage -le $x ]; then
+    $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
+      sgmm-acc-stats $spkvecs_opt --utt2spk=ark:$sdata/JOB/utt2spk \
+      --update-flags=$flags "$gselect_opt" --rand-prune=$rand_prune \
+      $dir/$x.mdl "$feats" "ark,s,cs:gunzip -c $dir/ali.JOB.gz | ali-to-post ark:- ark:-|" \
+      $dir/$x.JOB.acc || exit 1;
+  fi
+
+  # The next option is needed if the user specifies a phone or speaker sub-space
+  # dimension that's higher than the "normal" one.
+  increase_dim_opts=
+  if echo $increase_dim_iters | grep -w $x >/dev/null; then
+    increase_dim_opts="--increase-phn-dim=$phn_dim --increase-spk-dim=$spk_dim"
+    # Note: the command below might have a null effect on some iterations.
+    if [ $spk_dim -gt $feat_dim ]; then
+      $cmd JOB=1:$nj $dir/log/copy_vecs.$x.JOB.log \
+        copy-vector --print-args=false --change-dim=$spk_dim \
+        ark:$dir/vecs.JOB ark:$dir/vecs_tmp.JOB '&&' \
+        mv $dir/vecs_tmp.JOB $dir/vecs.JOB || exit 1;
+    fi
+  fi
+
+  if [ $stage -le $x ]; then
+    $cmd $dir/log/update.$x.log \
+      sgmm-est --update-flags=$flags --split-substates=$numsubstates $increase_dim_opts \
+      --power=$power --write-occs=$dir/$[$x+1].occs $dir/$x.mdl "sgmm-sum-accs - $dir/$x.*.acc|" \
+      $dir/$[$x+1].mdl || exit 1;
+    rm $dir/$x.mdl $dir/$x.*.acc $dir/$x.occs 2>/dev/null
+  fi
+
+  if [ $x -lt $max_iter_inc ]; then
+    numsubstates=$[$numsubstates+$incsubstates]
+  fi
+  x=$[$x+1];
+done
+
+rm $dir/final.mdl $dir/final.occs 2>/dev/null
+ln -s $x.mdl $dir/final.mdl
+ln -s $x.occs $dir/final.occs
+
+if [ $spk_dim -gt 0 ]; then
+  # We need to create an "alignment model" that's been trained
+  # without the speaker vectors, to do the first-pass decoding with
+  # at test time.
+
+  # We do this for a few iters, in this recipe.
+  final_mdl=$dir/$x.mdl
+  cur_alimdl=$dir/$x.mdl
+  while [ $x -lt $[$num_iters+$num_iters_alimdl] ]; do
+    echo "$0: building alignment model (pass $x)"
+    if [ $x -eq $num_iters ]; then # 1st pass of building alimdl.
+      flags=MwcS # don't update v the first time.  Note-- we never update transitions;
+      # they wouldn't change anyway as we use the same alignment as previously.
+    else
+      flags=vMwcS
+    fi
+    if [ $stage -le $x ]; then
+      $cmd JOB=1:$nj $dir/log/acc_ali.$x.JOB.log \
+        ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \
+        sgmm-post-to-gpost $spkvecs_opt "$gselect_opt" \
+        --utt2spk=ark:$sdata/JOB/utt2spk $final_mdl "$feats" ark,s,cs:- ark:- \| \
+        sgmm-acc-stats-gpost --rand-prune=$rand_prune --update-flags=$flags \
+        $cur_alimdl "$feats" ark,s,cs:- $dir/$x.JOB.aliacc || exit 1;
+      $cmd $dir/log/update_ali.$x.log \
+        sgmm-est --update-flags=$flags --remove-speaker-space=true --power=$power $cur_alimdl \
+        "sgmm-sum-accs - $dir/$x.*.aliacc|" $dir/$[$x+1].alimdl || exit 1;
+      rm $dir/$x.*.aliacc || exit 1;
+      [ $x -gt $num_iters ] && rm $dir/$x.alimdl
+    fi
+    cur_alimdl=$dir/$[$x+1].alimdl
+    x=$[$x+1]
+  done
+  rm $dir/final.alimdl 2>/dev/null
+  ln -s $x.alimdl $dir/final.alimdl
+fi
+
+utils/summarize_warnings.pl $dir/log
+
+echo Done
diff --git a/egs/kaldi-vystadial-recipe/s5/steps/train_sgmm2.sh b/egs/kaldi-vystadial-recipe/s5/steps/train_sgmm2.sh
new file mode 100755
index 00000000000..9cfce3ae6ab
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/steps/train_sgmm2.sh
@@ -0,0 +1,292 @@
+#!/bin/bash
+
+# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
+
+# SGMM training, with speaker vectors.  This script would normally be called on
+# top of fMLLR features obtained from a conventional system, but it also works
+# on top of any type of speaker-independent features (based on
+# deltas+delta-deltas or LDA+MLLT).  For more info on SGMMs, see the paper "The
+# subspace Gaussian mixture model--A structured model for speech recognition"
+# (Computer Speech and Language, 2011).
+
+# Begin configuration section.
+nj=4
+cmd=run.pl
+stage=-6  # use this to resume partially finished training
+context_opts=  # e.g. set it to "--context-width=5 --central-position=2" for a
+               # quinphone system.
+scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+num_iters=25        # Total number of iterations of training
+num_iters_alimdl=3  # Number of iterations for estimating alignment model.
+max_iter_inc=15     # Last iter to increase #substates on.
+realign_iters="5 10 15";      # Iters to realign on.
+spkvec_iters="5 8 12 17"      # Iters to estimate speaker vectors on.
+increase_dim_iters="6 10 14"; # Iters on which to increase phn dim and/or spk dim;
+                              # rarely necessary, and if it is, only the 1st will normally be necessary.
+rand_prune=0.1  # Randomized-pruning parameter for posteriors, to speed up training.
+                # Bigger -> more pruning; zero = no pruning.
+phn_dim=   # You can use this to set the phonetic subspace dim. [default: feat-dim+1]
+spk_dim=   # You can use this to set the speaker subspace dim. [default: feat-dim]
+power=0.2  # Exponent for number of Gaussians according to occurrence counts
+beam=8
+self_weight=0.9
+retry_beam=40
+leaves_per_group=5  # Relates to the SCTM (state-clustered tied-mixture) aspect:
+                    # average number of pdfs in a "group" of pdfs.
+update_m_iter=4
+spk_dep_weights=true  # [Symmetric SGMM] set this to false if you don't want "u"
+                      # (i.e. to turn off the symmetric SGMM).
+# End configuration section.
+
+echo "$0 $@"  # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+
+if [ $# != 7 ]; then
+  echo "Usage: steps/train_sgmm2.sh <num-leaves> <tot-substates> <data> <lang> <ali-dir> <ubm> <exp-dir>"
+  echo " e.g.: steps/train_sgmm2.sh 5000 8000 data/train_si84 data/lang \\"
+  echo "                            exp/tri3b_ali_si84 exp/ubm4a/final.ubm exp/sgmm4a"
+  echo "main options (for others, see top of script file)"
+  echo "  --config <config-file>                           # config containing options"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --silence-weight <sil-weight>                    # weight for silence (e.g. 0.5 or 0.0)"
+  echo "  --num-iters <#iters>                             # Number of iterations of E-M"
+  echo "  --leaves-per-group <#leaves>                     # Average #leaves shared in one group"
+  exit 1;
+fi
+
+num_pdfs=$1  # final #leaves, at 2nd level of tree.
+totsubstates=$2
+data=$3
+lang=$4
+alidir=$5
+ubm=$6
+dir=$7
+
+num_groups=$[$num_pdfs/$leaves_per_group]
+first_spkvec_iter=`echo $spkvec_iters | awk '{print $1}'` || exit 1;
+
+# Check some files.
+for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $ubm; do
+  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
+done
+
+
+# Set some variables.
+oov=`cat $lang/oov.int`
+silphonelist=`cat $lang/phones/silence.csl`
+ciphonelist=`cat $lang/phones/context_indep.csl`  # needed by acc-tree-stats below.
+if [ "$self_weight" == "1.0" ]; then
+  numsubstates=$num_groups # Initial #-substates.
+else
+  numsubstates=$num_pdfs # Initial #-substates.
+fi
+incsubstates=$[($totsubstates-$numsubstates)/$max_iter_inc] # per-iter increment for #substates
+feat_dim=`gmm-info $alidir/final.mdl 2>/dev/null | awk '/feature dimension/{print $NF}'` || exit 1;
+[ $feat_dim -eq $feat_dim ] || exit 1; # make sure it's numeric.
+[ -z $phn_dim ] && phn_dim=$[$feat_dim+1]
+[ -z $spk_dim ] && spk_dim=$feat_dim
+nj=`cat $alidir/num_jobs` || exit 1;
+splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options.
+
+mkdir -p $dir/log
+cp $alidir/splice_opts $dir 2>/dev/null # frame-splicing options.
+echo $nj > $dir/num_jobs
+sdata=$data/split$nj;
+[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
+
+spkvecs_opt=  # Empty option for now, until we estimate the speaker vectors.
+gselect_opt="--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|"
+
+## Set up features.
+if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
+echo "$0: feature type is $feat_type"
+
+case $feat_type in
+  delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
+  lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
+    cp $alidir/final.mat $dir
+    ;;
+  *) echo "$0: invalid feature type $feat_type" && exit 1;
+esac
+if [ -f $alidir/trans.1 ]; then
+  echo "$0: using transforms from $alidir"
+  feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$alidir/trans.JOB ark:- ark:- |"
+fi
+##
+
+
+if [ $stage -le -6 ]; then
+  echo "$0: accumulating tree stats"
+  $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \
+    acc-tree-stats --ci-phones=$ciphonelist $alidir/final.mdl "$feats" \
+    "ark:gunzip -c $alidir/ali.JOB.gz|" $dir/JOB.treeacc || exit 1;
+  [ "`ls $dir/*.treeacc | wc -w`" -ne "$nj" ] && echo "$0: Wrong #tree-stats" && exit 1;
+  sum-tree-stats $dir/treeacc $dir/*.treeacc 2>$dir/log/sum_tree_acc.log || exit 1;
+  rm $dir/*.treeacc
+fi
+
+if [ $stage -le -5 ]; then
+  echo "$0: Getting questions for tree clustering."
+  # preparing questions, roots file...
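+  # (A sketch of what this stage produces, assuming the standard Kaldi layout:
+  # questions.int holds phone clusters usable as decision-tree questions, and
+  # questions.qst is the compiled form consumed by build-tree-two-level below.)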
+  cluster-phones $dir/treeacc $lang/phones/sets.int $dir/questions.int 2> $dir/log/questions.log || exit 1;
+  cat $lang/phones/extra_questions.int >> $dir/questions.int
+  compile-questions $lang/topo $dir/questions.int $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1;
+
+  echo "$0: Building the tree"
+  $cmd $dir/log/build_tree.log \
+    build-tree-two-level --binary=false --verbose=1 --max-leaves-first=$num_groups \
+    --max-leaves-second=$num_pdfs $dir/treeacc $lang/phones/roots.int \
+    $dir/questions.qst $lang/topo $dir/tree $dir/pdf2group.map || exit 1;
+fi
+
+if [ $stage -le -4 ]; then
+  echo "$0: Initializing the model"
+  # Note: if phn_dim > feat_dim+1 or spk_dim > feat_dim, these dims
+  # will be truncated on initialization.
+  $cmd $dir/log/init_sgmm.log \
+    sgmm2-init --spk-dep-weights=$spk_dep_weights --self-weight=$self_weight \
+    --pdf-map=$dir/pdf2group.map --phn-space-dim=$phn_dim \
+    --spk-space-dim=$spk_dim $lang/topo $dir/tree $ubm $dir/0.mdl || exit 1;
+fi
+
+if [ $stage -le -3 ]; then
+  echo "$0: doing Gaussian selection"
+  $cmd JOB=1:$nj $dir/log/gselect.JOB.log \
+    sgmm2-gselect $dir/0.mdl "$feats" \
+    "ark,t:|gzip -c >$dir/gselect.JOB.gz" || exit 1;
+fi
+
+if [ $stage -le -2 ]; then
+  echo "$0: compiling training graphs"
+  text="ark:sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $sdata/JOB/text|"
+  $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
+    compile-train-graphs $dir/tree $dir/0.mdl $lang/L.fst \
+    "$text" "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
+fi
+
+if [ $stage -le -1 ]; then
+  echo "$0: converting alignments"
+  $cmd JOB=1:$nj $dir/log/convert_ali.JOB.log \
+    convert-ali $alidir/final.mdl $dir/0.mdl $dir/tree "ark:gunzip -c $alidir/ali.JOB.gz|" \
+    "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
+fi
+
+
+x=0
+while [ $x -lt $num_iters ]; do
+  echo "$0: training pass $x ..."
+  if echo $realign_iters | grep -w $x >/dev/null && [ $stage -le $x ]; then
+    echo "$0: re-aligning data"
+    $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \
+      sgmm2-align-compiled $spkvecs_opt $scale_opts "$gselect_opt" \
+      --utt2spk=ark:$sdata/JOB/utt2spk --beam=$beam --retry-beam=$retry_beam \
+      $dir/$x.mdl "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \
+      "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
+  fi
+  if [ $spk_dim -gt 0 ] && echo $spkvec_iters | grep -w $x >/dev/null; then
+    if [ $stage -le $x ]; then
+      $cmd JOB=1:$nj $dir/log/spkvecs.$x.JOB.log \
+        ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \
+        weight-silence-post 0.01 $silphonelist $dir/$x.mdl ark:- ark:- \| \
+        sgmm2-est-spkvecs --rand-prune=$rand_prune --spk2utt=ark:$sdata/JOB/spk2utt \
+        $spkvecs_opt "$gselect_opt" $dir/$x.mdl "$feats" ark,s,cs:- \
+        ark:$dir/tmp_vecs.JOB '&&' mv $dir/tmp_vecs.JOB $dir/vecs.JOB || exit 1;
+    fi
+    spkvecs_opt="--spk-vecs=ark:$dir/vecs.JOB"
+  fi
+  if [ $x -eq 0 ]; then
+    flags=vwcSt # on the first iteration, don't update projections M or N
+  elif [ $spk_dim -gt 0 -a $[$x%2] -eq 1 -a $x -ge $first_spkvec_iter ]; then
+    # Update N if we have speaker-vector space and x is odd,
+    # and we've already updated the speaker vectors...
+    flags=vNwSct
+  else
+    if [ $x -ge $update_m_iter ]; then
+      flags=vMwSct # update M.
+    else
+      flags=vwSct  # no M on early iters, if --update-m-iter option given.
+    fi
+  fi
+  $spk_dep_weights && [ $x -ge $first_spkvec_iter ] && flags=${flags}u;  # update
+  # spk-weight projections "u".
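+  # (A reading of the flag letters, assuming the conventions of the sgmm2
+  # tools: v=state vectors, M=phonetic-subspace projections, N=speaker-subspace
+  # projections, w=weight projections, c=substate weights, S=covariances,
+  # t=transitions, u=speaker-dependent weight projections.  So with the
+  # defaults above, a symmetric-SGMM run would accumulate with e.g.
+  # flags=vNwSctu on iteration 17 -- an illustrative trace, not logged output.)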
+
+  if [ $stage -le $x ]; then
+    $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
+      sgmm2-acc-stats $spkvecs_opt --utt2spk=ark:$sdata/JOB/utt2spk \
+      --update-flags=$flags "$gselect_opt" --rand-prune=$rand_prune \
+      $dir/$x.mdl "$feats" "ark,s,cs:gunzip -c $dir/ali.JOB.gz | ali-to-post ark:- ark:-|" \
+      $dir/$x.JOB.acc || exit 1;
+  fi
+
+  # The next option is needed if the user specifies a phone or speaker sub-space
+  # dimension that's higher than the "normal" one.
+  increase_dim_opts=
+  if echo $increase_dim_iters | grep -w $x >/dev/null; then
+    increase_dim_opts="--increase-phn-dim=$phn_dim --increase-spk-dim=$spk_dim"
+    # Note: the command below might have a null effect on some iterations.
+    if [ $spk_dim -gt $feat_dim ]; then
+      $cmd JOB=1:$nj $dir/log/copy_vecs.$x.JOB.log \
+        copy-vector --print-args=false --change-dim=$spk_dim \
+        ark:$dir/vecs.JOB ark:$dir/vecs_tmp.JOB '&&' \
+        mv $dir/vecs_tmp.JOB $dir/vecs.JOB || exit 1;
+    fi
+  fi
+
+  if [ $stage -le $x ]; then
+    $cmd $dir/log/update.$x.log \
+      sgmm2-est --update-flags=$flags --split-substates=$numsubstates \
+      $increase_dim_opts --power=$power --write-occs=$dir/$[$x+1].occs \
+      $dir/$x.mdl "sgmm2-sum-accs - $dir/$x.*.acc|" $dir/$[$x+1].mdl || exit 1;
+    rm $dir/$x.mdl $dir/$x.*.acc $dir/$x.occs 2>/dev/null
+  fi
+  if [ $x -lt $max_iter_inc ]; then
+    numsubstates=$[$numsubstates+$incsubstates]
+  fi
+  x=$[$x+1];
+done
+
+rm $dir/final.mdl $dir/final.occs 2>/dev/null
+ln -s $x.mdl $dir/final.mdl
+ln -s $x.occs $dir/final.occs
+
+if [ $spk_dim -gt 0 ]; then
+  # We need to create an "alignment model" that's been trained
+  # without the speaker vectors, to do the first-pass decoding with
+  # at test time.
+
+  # We do this for a few iters, in this recipe.
+  final_mdl=$dir/$x.mdl
+  cur_alimdl=$dir/$x.mdl
+  while [ $x -lt $[$num_iters+$num_iters_alimdl] ]; do
+    echo "$0: building alignment model (pass $x)"
+    if [ $x -eq $num_iters ]; then # 1st pass of building alimdl.
+      flags=MwcS # don't update v the first time.  Note-- we never update transitions;
+      # they wouldn't change anyway as we use the same alignment as previously.
+    else
+      flags=vMwcS
+    fi
+    if [ $stage -le $x ]; then
+      $cmd JOB=1:$nj $dir/log/acc_ali.$x.JOB.log \
+        ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \
+        sgmm2-post-to-gpost $spkvecs_opt "$gselect_opt" \
+        --utt2spk=ark:$sdata/JOB/utt2spk $final_mdl "$feats" ark,s,cs:- ark:- \| \
+        sgmm2-acc-stats-gpost --rand-prune=$rand_prune --update-flags=$flags \
+        $cur_alimdl "$feats" ark,s,cs:- $dir/$x.JOB.aliacc || exit 1;
+      $cmd $dir/log/update_ali.$x.log \
+        sgmm2-est --update-flags=$flags --remove-speaker-space=true --power=$power \
+        $cur_alimdl "sgmm2-sum-accs - $dir/$x.*.aliacc|" $dir/$[$x+1].alimdl || exit 1;
+      rm $dir/$x.*.aliacc || exit 1;
+      [ $x -gt $num_iters ] && rm $dir/$x.alimdl
+    fi
+    cur_alimdl=$dir/$[$x+1].alimdl
+    x=$[$x+1]
+  done
+  rm $dir/final.alimdl 2>/dev/null
+  ln -s $x.alimdl $dir/final.alimdl
+fi
+
+utils/summarize_warnings.pl $dir/log
+
+echo Done
diff --git a/egs/kaldi-vystadial-recipe/s5/steps/train_ubm.sh b/egs/kaldi-vystadial-recipe/s5/steps/train_ubm.sh
new file mode 100755
index 00000000000..768025e25c6
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/steps/train_ubm.sh
@@ -0,0 +1,128 @@
+#!/bin/bash
+# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
+
+# This trains a UBM (i.e. a mixture of Gaussians), by clustering
+# the Gaussians from a trained HMM/GMM system and then doing a few
+# iterations of UBM training.
+# We mostly use this for SGMM systems.
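+#
+# A typical call sequence feeding an SGMM build looks like the following
+# (a sketch; the directory names are illustrative, not prescribed):
+#   steps/train_ubm.sh 400 data/train data/lang exp/tri3b_ali exp/ubm4a
+#   steps/train_sgmm2.sh 5000 8000 data/train data/lang exp/tri3b_ali \
+#     exp/ubm4a/final.ubm exp/sgmm4a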
+
+# Begin configuration section.
+nj=4
+cmd=run.pl
+silence_weight=   # You can set it to e.g. 0.0, to weight down silence in training.
+stage=-2
+num_gselect1=50   # first stage of Gaussian-selection
+num_gselect2=25   # second stage.
+intermediate_num_gauss=2000
+num_iters=3
+# End configuration section.
+
+echo "$0 $@"  # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+
+if [ $# != 5 ]; then
+  echo "Usage: steps/train_ubm.sh <num-gauss> <data> <lang> <ali-dir> <exp-dir>"
+  echo " e.g.: steps/train_ubm.sh 400 data/train_si84 data/lang exp/tri2b_ali_si84 exp/ubm3c"
+  echo "main options (for others, see top of script file)"
+  echo "  --config <config-file>                           # config containing options"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --silence-weight <sil-weight>                    # weight for silence (e.g. 0.5 or 0.0)"
+  echo "  --num-iters <#iters>                             # Number of iterations of E-M"
+  exit 1;
+fi
+
+num_gauss=$1
+data=$2
+lang=$3
+alidir=$4
+dir=$5
+
+for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl; do
+  [ ! -f $f ] && echo "No such file $f" && exit 1;
+done
+
+if [ $[$num_gauss*2] -gt $intermediate_num_gauss ]; then
+  echo "intermediate_num_gauss was too small $intermediate_num_gauss"
+  intermediate_num_gauss=$[$num_gauss*2];
+  echo "setting it to $intermediate_num_gauss"
+fi
+
+
+# Set various variables.
+silphonelist=`cat $lang/phones/silence.csl` || exit 1;
+nj=`cat $alidir/num_jobs` || exit 1;
+
+mkdir -p $dir/log
+echo $nj > $dir/num_jobs
+sdata=$data/split$nj;
+[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
+splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options.
+
+## Set up features.
+if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
+echo "$0: feature type is $feat_type"
+
+case $feat_type in
+  delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
+  lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
+    cp $alidir/final.mat $dir
+    ;;
+  *) echo "$0: invalid feature type $feat_type" && exit 1;
+esac
+if [ -f $alidir/trans.1 ]; then
+  echo "$0: using transforms from $alidir"
+  feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$alidir/trans.JOB ark:- ark:- |"
+fi
+##
+
+if [ ! -z "$silence_weight" ]; then
+  weights_opt="--weights='ark,s,cs:gunzip -c $alidir/ali.JOB.gz | ali-to-post ark:- ark:- | weight-silence-post $silence_weight $silphonelist $alidir/final.mdl ark:- ark:- | post-to-weights ark:- ark:- |'"
+else
+  weights_opt=
+fi
+
+if [ $stage -le -2 ]; then
+  echo "$0: clustering model $alidir/final.mdl to get initial UBM"
+  $cmd $dir/log/cluster.log \
+    init-ubm --intermediate-num-gauss=$intermediate_num_gauss --ubm-num-gauss=$num_gauss \
+    --verbose=2 --fullcov-ubm=true $alidir/final.mdl $alidir/final.occs \
+    $dir/0.ubm || exit 1;
+fi
+
+# Do initial phase of Gaussian selection and save it to disk -- later on we'll
+# do more Gaussian selection to further prune, as the model changes.
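+# (I.e. the top $num_gselect1=50 Gaussian indices computed once here are
+# re-pruned to the best $num_gselect2=25 per frame inside each training pass
+# below -- a cheap two-stage scheme, since the top-50 set changes little
+# between iterations while the model is refined.)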
+
+
+if [ $stage -le -1 ]; then
+  echo "$0: doing Gaussian selection"
+  $cmd JOB=1:$nj $dir/log/gselect.JOB.log \
+    gmm-gselect --n=$num_gselect1 "fgmm-global-to-gmm $dir/0.ubm - |" "$feats" \
+    "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1;
+fi
+
+
+x=0
+while [ $x -lt $num_iters ]; do
+  echo "Pass $x"
+  $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
+    gmm-gselect --n=$num_gselect2 "--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" \
+    "fgmm-global-to-gmm $dir/$x.ubm - |" "$feats" ark:- \| \
+    fgmm-global-acc-stats $weights_opt --gselect=ark,s,cs:- $dir/$x.ubm "$feats" \
+    $dir/$x.JOB.acc || exit 1;
+  lowcount_opt="--remove-low-count-gaussians=false"
+  [ $[$x+1] -eq $num_iters ] && lowcount_opt=  # Only remove low-count Gaussians
+  # on last iter-- we can't do it earlier, or the Gaussian-selection info would
+  # be mismatched.
+  $cmd $dir/log/update.$x.log \
+    fgmm-global-est $lowcount_opt --verbose=2 $dir/$x.ubm "fgmm-global-sum-accs - $dir/$x.*.acc |" \
+    $dir/$[$x+1].ubm || exit 1;
+  rm $dir/$x.*.acc $dir/$x.ubm
+  x=$[$x+1]
+done
+
+rm $dir/gselect.*.gz
+rm $dir/final.ubm 2>/dev/null
+mv $dir/$x.ubm $dir/final.ubm || exit 1;
diff --git a/egs/kaldi-vystadial-recipe/s5/steps/word_align_lattices.sh b/egs/kaldi-vystadial-recipe/s5/steps/word_align_lattices.sh
new file mode 100755
index 00000000000..2adcfdec606
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/steps/word_align_lattices.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+# Copyright Johns Hopkins University (Author: Daniel Povey) 2012
+# Apache 2.0.
+
+# Begin configuration section.
+silence_label=0
+cmd=run.pl
+# End configuration section.
+
+echo "$0 $@"  # Print the command line for logging
+
+for x in `seq 2`; do
+  [ "$1" == "--silence-label" ] && silence_label=$2 && shift 2;
+  [ "$1" == "--cmd" ] && cmd="$2" && shift 2;
+done
+
+if [ $# != 3 ]; then
+  echo "Word-align lattices (make the arcs sync up with words)"
+  echo ""
+  echo "Usage: scripts/walign_lats.sh [options] <lang-dir> <input-decode-dir> <output-decode-dir>"
+  echo "options: [--cmd (run.pl|queue.pl [queue opts])] [--silence-label <silence-label>]"
+  exit 1;
+fi
+
+. ./path.sh || exit 1;
+
+lang=$1
+indir=$2
+outdir=$3
+
+mdl=`dirname $indir`/final.mdl
+wbfile=$lang/phones/word_boundary.int
+
+for f in $mdl $wbfile $indir/num_jobs; do
+  [ ! -f $f ] && echo "word_align_lattices.sh: no such file $f" && exit 1;
+done
+
+mkdir -p $outdir/log
+
+
+cp $indir/num_jobs $outdir;
+nj=`cat $indir/num_jobs`
+
+$cmd JOB=1:$nj $outdir/log/align.JOB.log \
+  lattice-align-words --silence-label=$silence_label --test=true \
+  $wbfile $mdl "ark:gunzip -c $indir/lat.JOB.gz|" "ark,t:|gzip -c >$outdir/lat.JOB.gz" || exit 1;
+
diff --git a/egs/kaldi-vystadial-recipe/s5/utils/add_disambig.pl b/egs/kaldi-vystadial-recipe/s5/utils/add_disambig.pl
new file mode 100755
index 00000000000..c605659e105
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/utils/add_disambig.pl
@@ -0,0 +1,58 @@
+#!/usr/bin/perl
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Adds some specified number of disambig symbols to a symbol table.
+# Adds these as #1, #2, etc.
+# If the --include-zero option is specified, includes an extra one,
+# #0.
+
+$include_zero = 0;
+if($ARGV[0] eq "--include-zero") {
+  $include_zero = 1;
+  shift @ARGV;
+}
+
+if(@ARGV != 2) {
+  die "Usage: add_disambig.pl [--include-zero] symtab.txt num_extra > symtab_out.txt ";
+}
+
+
+$input = $ARGV[0];
+$nsyms = $ARGV[1];
+
+open(F, "<$input") || die "Opening file $input";
+
+while(<F>) {
+  @A = split(" ", $_);
+  @A == 2 || die "Bad line $_";
+  $lastsym = $A[1];
+  print;
+}
+
+if(!defined($lastsym)){
+  die "Empty symbol file?";
+}
+
+if($include_zero) {
+  $lastsym++;
+  print "#0 $lastsym\n";
+}
+
+for($n = 1; $n <= $nsyms; $n++) {
+  $y = $n + $lastsym;
+  print "#$n $y\n";
+}
diff --git a/egs/kaldi-vystadial-recipe/s5/utils/add_lex_disambig.pl b/egs/kaldi-vystadial-recipe/s5/utils/add_lex_disambig.pl
new file mode 100755
index 00000000000..9f9054e1795
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/utils/add_lex_disambig.pl
@@ -0,0 +1,101 @@
+#!/usr/bin/perl
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Adds disambiguation symbols to a lexicon.
+# Outputs still in the normal lexicon format.
+# Disambig syms are numbered #1, #2, #3, etc. (#0
+# is reserved for use in the grammar, G.fst).
+# Outputs the number of disambig syms to the standard output.
+
+if(@ARGV != 2) {
+  die "Usage: add_lex_disambig.pl lexicon.txt lexicon_disambig.txt "
+}
+
+
+$lexfn = shift @ARGV;
+$lexoutfn = shift @ARGV;
+
+open(L, "<$lexfn") || die "Error opening lexicon $lexfn";
+
+# (1) Read in the lexicon.
+@L = ( );
+while(<L>) {
+  @A = split(" ", $_);
+  push @L, join(" ", @A);
+}
+
+# (2) Work out the count of each phone-sequence in the
+# lexicon.
+
+foreach $l (@L) {
+  @A = split(" ", $l);
+  shift @A; # Remove word.
+  $count{join(" ",@A)}++;
+}
+
+# (3) For each left sub-sequence of each phone-sequence, note down
+# that it exists (for identifying prefixes of longer strings).
+
+foreach $l (@L) {
+  @A = split(" ", $l);
+  shift @A; # Remove word.
+  while(@A > 0) {
+    pop @A; # Remove last phone
+    $issubseq{join(" ",@A)} = 1;
+  }
+}
+
+# (4) For each entry in the lexicon:
+# if the phone sequence is unique and is not a
+# prefix of another word, no disambig symbol.
+# Else output #1, or #2, #3, ... if the same phone-seq
+# has already been assigned a disambig symbol.
+
+
+open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n";
+
+$max_disambig = 0;
+foreach $l (@L) {
+  @A = split(" ", $l);
+  $word = shift @A;
+  $phnseq = join(" ",@A);
+  if(!defined $issubseq{$phnseq}
+     && $count{$phnseq}==1) {
+    ;  # Do nothing.
+  } else {
+    if($phnseq eq "") { # need disambig symbols for the empty string
+      # that are not used anywhere else.
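+      # (For example, a lexicon entry with an empty pronunciation would get
+      # the pron "#1", and "#1" is then reserved so that no ordinary set of
+      # homophones re-uses it -- an illustrative trace, not real data.)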
+      $max_disambig++;
+      $reserved{$max_disambig} = 1;
+      $phnseq = "#$max_disambig";
+    } else {
+      $curnumber = $disambig_of{$phnseq};
+      if(!defined $curnumber) { $curnumber = 0; }
+      $curnumber++; # now 1 or 2, ...
+      while(defined $reserved{$curnumber} ) { $curnumber++; } # skip over reserved symbols
+      if($curnumber > $max_disambig) {
+        $max_disambig = $curnumber;
+      }
+      $disambig_of{$phnseq} = $curnumber;
+      $phnseq = $phnseq . " #" . $curnumber;
+    }
+  }
+  print O "$word\t$phnseq\n";
+}
+
+print $max_disambig . "\n";
+
diff --git a/egs/kaldi-vystadial-recipe/s5/utils/apply_map.pl b/egs/kaldi-vystadial-recipe/s5/utils/apply_map.pl
new file mode 100755
index 00000000000..4f89d584b36
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/utils/apply_map.pl
@@ -0,0 +1,54 @@
+#!/usr/bin/perl -w
+# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
+# Apache 2.0.
+
+# This program is a bit like ./sym2int.pl in that it applies a map
+# to things in a file, but it's a bit more general in that it doesn't
+# assume the things being mapped to are single tokens, they could
+# be sequences of tokens.
+
+# This program takes one argument, the map file.  The map file must have
+# lines with one or more fields, interpreted as a map from the first field
+# (a string) to a list of strings; the data to be mapped is read from the
+# standard input.
+# if the input has as one of its lines
+# A x y
+# and the map has the lines
+# A A
+# x P
+# y Q R
+# then the output of this program will be
+# A P Q R
+#
+# Note that if a token does not appear as the first field of the map, this
+# program will die with an error rather than map it to the empty string.
+
+if(@ARGV != 1) {
+  print STDERR "Usage: apply_map.pl map <input >output\n" .
+    "e.g.: echo A B | apply_map.pl a2b.map\n";
+  exit(1);
+}
+
+$map = shift @ARGV;
+open(M, "<$map") || die "Error opening map file $map";
+
+while(<M>) {
+  @A = split(" ", $_);
+  @A >= 1 || die "apply_map.pl: empty line.";
+  $i = shift @A;
+  $o = join(" ", @A);
+  $map{$i} = $o;
+}
+
+while(<STDIN>) {
+  @A = split(" ", $_);
+  for ($x = 0; $x < @A; $x++) {
+    $a = $A[$x];
+    if (!defined $map{$a}) { die "apply_map.pl: undefined key $a\n"; }
+    $A[$x] = $map{$a};
+  }
+  print join(" ", @A) . "\n";
+}
diff --git a/egs/kaldi-vystadial-recipe/s5/utils/best_wer.sh b/egs/kaldi-vystadial-recipe/s5/utils/best_wer.sh
new file mode 100755
index 00000000000..126d59bb87a
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/utils/best_wer.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+#
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# To be run from one directory above this script.
+
+perl -e 'while(<>){
+  if (m/[WS]ER (\S+)/ && (!defined $bestwer || $bestwer > $1)){ $bestwer = $1; $bestline=$_; } # kaldi "compute-wer" tool.
+  elsif (m: (Mean|Sum/Avg|)\s+\|\s+\S+\s+\S+\s+\|\s+\S+\s+\S+\s+\S+\s+\S+\s+(\S+)\s+\S+\s+\|:
+         && (!defined $bestwer || $bestwer > $2)){ $bestwer = $2; $bestline=$_; } }  # sclite.
+  if (defined $bestline){ print $bestline; } '
+
diff --git a/egs/kaldi-vystadial-recipe/s5/utils/combine_data.sh b/egs/kaldi-vystadial-recipe/s5/utils/combine_data.sh
new file mode 100755
index 00000000000..7b2e2062336
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/utils/combine_data.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
+
+
+if [ $# -le 2 ]; then
+  echo "Usage: combine_data.sh <dest-data-dir> <src-data-dir1> <src-data-dir2> ..."
+  exit 1
+fi
+
+dest=$1;
+shift;
+
+first_src=$1;
+
+mkdir -p $dest;
+
+export LC_ALL=C
+
+for file in utt2spk feats.scp text cmvn.scp segments reco2file_and_channel wav.scp; do
+  if [ -f $first_src/$file ]; then
+    ( for f in $*; do cat $f/$file; done ) | sort -k1 > $dest/$file || exit 1;
+    echo "$0: combined $file"
+  else
+    echo "$0 [info]: not combining $file as it does not exist"
+  fi
+done
+
+utils/utt2spk_to_spk2utt.pl <$dest/utt2spk >$dest/spk2utt
+
+utils/fix_data_dir.sh $dest || exit 1;
+
+exit 0
diff --git a/egs/kaldi-vystadial-recipe/s5/utils/convert_ctm.pl b/egs/kaldi-vystadial-recipe/s5/utils/convert_ctm.pl
new file mode 100755
index 00000000000..7676a1d1321
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/utils/convert_ctm.pl
@@ -0,0 +1,83 @@
+#!/usr/bin/perl
+
+# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
+
+# This takes as standard input a ctm file that's "relative to the utterance",
+# i.e. times are measured relative to the beginning of the segments, and it
+# uses a "segments" file (format:
+# utterance-id recording-id start-time end-time
+# ) and a "reco2file_and_channel" file (format:
+# recording-id basename-of-file channel
+# ) to produce a ctm that's relative to the recording.
+
+if (@ARGV < 2 || @ARGV > 3) {
+  print STDERR "Usage: convert_ctm.pl <segments-file> <reco2file_and_channel-file> [<utterance-ctm>] > real-ctm\n";
+  exit(1);
+}
+
+$segments = shift @ARGV;
+$reco2file_and_channel = shift @ARGV;
+
+open(S, "<$segments") || die "opening segments file $segments";
+while(<S>) {
+  @A = split(" ", $_);
+  @A == 4 || die "Bad line in segments file: $_";
+  ($utt, $recording_id, $begin_time, $end_time) = @A;
+  $utt2reco{$utt} = $recording_id;
+  $begin{$utt} = $begin_time;
+  $end{$utt} = $end_time;
+}
+close(S);
+open(R, "<$reco2file_and_channel") || die "open reco2file_and_channel file $reco2file_and_channel";
+while(<R>) {
+  @A = split(" ", $_);
+  @A == 3 || die "Bad line in reco2file_and_channel file: $_";
+  ($recording_id, $file, $channel) = @A;
+  $reco2file{$recording_id} = $file;
+  $reco2channel{$recording_id} = $channel;
+}
+
+
+# Now process the ctm file, which is either the standard input or the third
+# command-line argument.
+while(<>) {
+  @A= split(" ", $_);
+  ( @A == 5 || @A == 6 ) || die "Unexpected ctm format: $_";
+  # lines look like:
+  # utterance-id 1 begin-time length word [ confidence ]
+  ($utt, $one, $wbegin, $wlen, $w, $conf) = @A;
+  $reco = $utt2reco{$utt};
+  if (!defined $reco) { die "Utterance-id $utt not defined in segments file $segments"; }
+  $file = $reco2file{$reco};
+  $channel = $reco2channel{$reco};
+  if (!defined $file || !defined $channel) {
+    die "Recording-id $reco not defined in reco2file_and_channel file $reco2file_and_channel";
+  }
+  $b = $begin{$utt};
+  $e = $end{$utt};
+  $wbegin_r = $wbegin + $b; # Make it relative to beginning of the recording.
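+  # (E.g. a word at 8.0s inside a segment that starts at 10.0s maps to 18.0s
+  # in the recording -- cf. the test example after __END__ below.)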
+  $wbegin_r = sprintf("%.2f", $wbegin_r);
+  $wlen = sprintf("%.2f", $wlen);
+  if (defined $conf) {
+    $line = "$file $channel $wbegin_r $wlen $w $conf\n";
+  } else {
+    $line = "$file $channel $wbegin_r $wlen $w\n";
+  }
+  if ($wbegin_r + $wlen > $e + 0.01) {
+    print STDERR "Warning: word appears to be past end of recording; line is $line";
+  }
+  print $line; # goes to stdout.
+}
+
+__END__
+
+# Test example [also test it without the 0.5's]
+echo utt reco 10.0 20.0 > segments
+echo reco file A > reco2file_and_channel
+echo utt 1 8.0 1.0 word 0.5 > ctm_in
+echo file A 18.00 1.00 word 0.5 > ctm_out
+utils/convert_ctm.pl segments reco2file_and_channel ctm_in | cmp - ctm_out || echo error
+rm segments reco2file_and_channel ctm_in ctm_out
+
+
+
+
diff --git a/egs/kaldi-vystadial-recipe/s5/utils/eps2disambig.pl b/egs/kaldi-vystadial-recipe/s5/utils/eps2disambig.pl
new file mode 100755
index 00000000000..fecbdc83368
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/utils/eps2disambig.pl
@@ -0,0 +1,23 @@
+#!/usr/bin/perl
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# This script replaces epsilon with #0 on the input side only, of the G.fst
+# acceptor.
+
+while(<>){
+  s:^(\d+\s+\d+\s+)\<eps\>(\s+):$1#0$2:;
+  print;
+}
diff --git a/egs/kaldi-vystadial-recipe/s5/utils/filter_scp.pl b/egs/kaldi-vystadial-recipe/s5/utils/filter_scp.pl
new file mode 100755
index 00000000000..dfe4b13d14d
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/utils/filter_scp.pl
@@ -0,0 +1,41 @@
+#!/usr/bin/perl -w
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# This script takes a list of utterance-ids or any file whose first field
+# of each line is an utterance-id, and filters an scp
+# file (or any file whose first field is an utterance id), printing
+# out only those lines whose first field is in id_list.
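+#
+# For example (hypothetical files):
+#   utils/filter_scp.pl data/train_dev/utt2spk all_feats.scp > data/train_dev/feats.scp
+# keeps only the feature entries for utterances listed in utt2spk.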
+
+if(@ARGV < 1 || @ARGV > 2) {
+  die "Usage: filter_scp.pl id_list [in.scp] > out.scp ";
+}
+
+$idlist = shift @ARGV;
+open(F, "<$idlist") || die "Could not open id-list file $idlist";
+while(<F>) {
+  @A = split;
+  @A>=1 || die "Invalid id-list file line $_";
+  $seen{$A[0]} = 1;
+}
+
+while(<>) {
+  @A = split;
+  @A > 0 || die "Invalid scp file line $_";
+  if($seen{$A[0]}) {
+    print $_;
+  }
+}
diff --git a/egs/kaldi-vystadial-recipe/s5/utils/find_arpa_oovs.pl b/egs/kaldi-vystadial-recipe/s5/utils/find_arpa_oovs.pl
new file mode 100755
index 00000000000..abd63f65e7a
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/utils/find_arpa_oovs.pl
@@ -0,0 +1,64 @@
+#!/usr/bin/perl
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+if ( @ARGV < 1 || @ARGV > 2) {
+  die "Usage: find_arpa_oovs.pl words.txt [lm.arpa]\n";
+  # This program finds words in the arpa file that are not symbols
+  # in the OpenFst-format symbol table words.txt.  It prints them
+  # on the standard output, one per line.
+}
+
+$symtab = shift @ARGV;
+open(S, "<$symtab") || die "Failed opening symbol table file $symtab\n";
+while(<S>){
+  @A = split(" ", $_);
+  @A == 2 || die "Bad line in symbol table file: $_";
+  $seen{$A[0]} = 1;
+}
+
+$curgram=0;
+while(<>) { # Find the \data\ marker.
+  if(m:^\\data\\$:) { last; }
+}
+while(<>) {
+  if(m/^\\(\d+)\-grams:\s*$/) {
+    $curgram = $1;
+    if($curgram > 1) {
+      last; # This is an optimization as we can get the vocab from the 1-grams
+    }
+  } elsif($curgram > 0) {
+    @A = split(" ", $_);
+    if(@A > 1) {
+      shift @A;
+      for($n=0;$n<$curgram;$n++) {
+        $word = $A[$n];
+        if(!defined $word) { print STDERR "Unusual line $_ (line $.) in arpa file.\n"; }
+        $in_arpa{$word} = 1;
+      }
+    } else {
+      if(@A > 0 && $A[0] !~ m:\\end\\:) {
+        print STDERR "Unusual line $_ (line $.) in arpa file\n";
+      }
+    }
+  }
+}
+
+foreach $w (keys %in_arpa) {
+  if(!defined $seen{$w} && $w ne "<s>" && $w ne "</s>") {
+    print "$w\n";
+  }
+}
diff --git a/egs/kaldi-vystadial-recipe/s5/utils/fix_data_dir.sh b/egs/kaldi-vystadial-recipe/s5/utils/fix_data_dir.sh
new file mode 100755
index 00000000000..d8b937ac21b
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/utils/fix_data_dir.sh
@@ -0,0 +1,80 @@
+#!/bin/bash
+
+# This script makes sure that only the segments present in
+# all of "feats.scp", "wav.scp" [if present], segments [if present],
+# text, and utt2spk are present in any of them.
+# It puts the original contents of data-dir into
+# data-dir/.backup
+
+if [ $# != 1 ]; then
+  echo "Usage: fix_data_dir.sh data-dir"
+  exit 1
+fi
+
+data=$1
+mkdir -p $data/.backup
+
+[ ! -d $data ] && echo "$0: no such directory $data" && exit 1;
+
+[ ! -f $data/utt2spk ] && echo "$0: no such file $data/utt2spk" && exit 1;
+
+cat $data/utt2spk | awk '{print $1}' > $data/utts
+
+# Do a check.
+export LC_ALL=C
+! cat $data/utt2spk | sort | cmp - $data/utt2spk && \
+  echo "utt2spk is not in sorted order (fix this yourself)" && exit 1;
+
+! cat $data/utt2spk | sort -k2 | cmp - $data/utt2spk && \
+  echo "utt2spk is not in sorted order when sorted first on speaker-id " && \
+  echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1;
+
+! cat $data/spk2utt | sort | cmp - $data/spk2utt && \
+  echo "spk2utt is not in sorted order (fix this yourself)" && exit 1;
+
+maybe_wav=
+[ ! -f $data/segments ] && maybe_wav=wav.scp  # wav.scp indexed by utts only if segments does not exist.
+for x in feats.scp text segments $maybe_wav; do
+  if [ -f $data/$x ]; then
+    utils/filter_scp.pl $data/$x $data/utts > $data/utts.tmp
+    mv $data/utts.tmp $data/utts
+  fi
+done
+[ ! -s $data/utts ] && echo "fix_data_dir.sh: no utterances remained: not doing anything." && \
+  rm $data/utts && exit 1;
+
+nutts=`cat $data/utts | wc -l`
+if [ -f $data/feats.scp ]; then
+  nfeats=`cat $data/feats.scp | wc -l`
+else
+  nfeats=0
+fi
+ntext=`cat $data/text | wc -l`
+if [ "$nutts" -ne "$nfeats" -o "$nutts" -ne "$ntext" ]; then
+  echo "fix_data_dir.sh: kept $nutts utterances, vs. $nfeats features and $ntext transcriptions."
+else
+  echo "fix_data_dir.sh: kept all $nutts utterances."
+fi
+
+for x in utt2spk feats.scp text segments $maybe_wav; do
+  if [ -f $data/$x ]; then
+    mv $data/$x $data/.backup/$x
+    utils/filter_scp.pl $data/utts $data/.backup/$x > $data/$x
+  fi
+done
+
+
+if [ -f $data/segments ]; then
+  awk '{print $2}' $data/segments | sort | uniq > $data/reco # reco means the id's of the recordings.
+  [ -f $data/wav.scp ] && mv $data/wav.scp $data/.backup/ && \
+    utils/filter_scp.pl $data/reco $data/.backup/wav.scp >$data/wav.scp
+  [ -f $data/reco2file_and_channel ] && mv $data/reco2file_and_channel $data/.backup/ && \
+    utils/filter_scp.pl $data/reco $data/.backup/reco2file_and_channel >$data/reco2file_and_channel
+  rm $data/reco
+fi
+
+utils/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt
+
+rm $data/utts
+
+echo "fix_data_dir.sh: old files are kept in $data/.backup"
diff --git a/egs/kaldi-vystadial-recipe/s5/utils/format_lm.sh b/egs/kaldi-vystadial-recipe/s5/utils/format_lm.sh
new file mode 100755
index 00000000000..b6ba4ce7d1c
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/utils/format_lm.sh
@@ -0,0 +1,84 @@
+#!/bin/bash -u
+
+# Copyright 2012  Arnab Ghoshal
+# Copyright 2010-2011  Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+set -o errexit
+
+if [ $# -ne 4 ]; then
+  printf "Usage: %s lang_dir LM lexicon out_dir\n" `basename $0`
+  echo "  Convert ARPA-format language models to FSTs.";
+  exit 1;
+fi
+
+lang_dir=$1
+lm=$2
+lexicon=$3
+out_dir=$4
+mkdir -p $out_dir
+
+[ -f ./path.sh ] && . ./path.sh
+
+echo "Converting '$lm' to FST"
+
+for f in phones.txt words.txt L.fst L_disambig.fst phones/; do
+  cp -r $lang_dir/$f $out_dir
+done
+
+lm_base=$(basename $lm '.gz')
+gunzip -c $lm | utils/find_arpa_oovs.pl $out_dir/words.txt \
+  > $out_dir/oovs_${lm_base}.txt
+
+# Removing all "illegal" combinations of <s> and </s>, which are supposed to
+# occur only at begin/end of utt.  These can cause determinization failures
+# of CLG [ends up being epsilon cycles].
+gunzip -c $lm \
+  | egrep -v '<s> <s>|</s> <s>|</s> </s>' \
+  | arpa2fst - | fstprint \
+  | utils/remove_oovs.pl $out_dir/oovs_${lm_base}.txt \
+  | utils/eps2disambig.pl | utils/s2eps.pl \
+  | fstcompile --isymbols=$out_dir/words.txt --osymbols=$out_dir/words.txt \
+     --keep_isymbols=false --keep_osymbols=false \
+  | fstrmepsilon > $out_dir/G.fst
+set +e
+fstisstochastic $out_dir/G.fst
+set -e
+# The output is like:
+# 9.14233e-05 -0.259833
+# we do expect the first of these 2 numbers to be close to zero (the second is
+# nonzero because the backoff weights make the states sum to >1).
+
+# Everything below is only for diagnostics.
+# Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
+# this might cause determinization failure of CLG.
+# #0 is treated as an empty word.
+mkdir -p $out_dir/tmpdir.g
+awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }}
+     END{print "0 0 #0 #0"; print "0";}' \
+  < "$lexicon" > $out_dir/tmpdir.g/select_empty.fst.txt
+
+fstcompile --isymbols=$out_dir/words.txt --osymbols=$out_dir/words.txt \
+  $out_dir/tmpdir.g/select_empty.fst.txt \
+  | fstarcsort --sort_type=olabel \
+  | fstcompose - $out_dir/G.fst > $out_dir/tmpdir.g/empty_words.fst
+
+fstinfo $out_dir/tmpdir.g/empty_words.fst | grep cyclic | grep -w 'y' \
+  && echo "Language model has cycles with empty words" && exit 1
+
+rm -r $out_dir/tmpdir.g
+
+
+echo "Succeeded in formatting LM: '$lm'"
diff --git a/egs/kaldi-vystadial-recipe/s5/utils/format_lm_sri.sh b/egs/kaldi-vystadial-recipe/s5/utils/format_lm_sri.sh
new file mode 100755
index 00000000000..8489267911f
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/utils/format_lm_sri.sh
@@ -0,0 +1,110 @@
+#!/bin/bash -u
+
+# Copyright 2012  Arnab Ghoshal
+# Copyright 2010-2011  Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+set -o errexit
+
+# Begin configuration section.
+srilm_opts="-subset -prune-lowprobs -unk -tolower"
+# end configuration sections
+
+help_message="Usage: "`basename $0`" [options] lang_dir LM lexicon out_dir
+Convert ARPA-format language models to FSTs. Change the LM vocabulary using SRILM.\n
+options:
+  --help               # print this message and exit
+  --srilm-opts STRING  # options to pass to SRILM tools (default: '$srilm_opts')
+";
+
+. utils/parse_options.sh
+
+if [ $# -ne 4 ]; then
+  printf "$help_message\n";
+  exit 1;
+fi
+
+lang_dir=$1
+lm=$2
+lexicon=$3
+out_dir=$4
+mkdir -p $out_dir
+
+[ -f ./path.sh ] && . ./path.sh
+( which change-lm-vocab >&/dev/null && which ngram >&/dev/null ) \
+  || { echo "SRILM not found on PATH. Exiting ..."; exit 1; }
+
+echo "Converting '$lm' to FST"
+tmpdir=$(mktemp -d);
+trap 'rm -rf "$tmpdir"' EXIT
+
+for f in phones.txt words.txt L.fst L_disambig.fst phones/; do
+  cp -r $lang_dir/$f $out_dir
+done
+
+lm_base=$(basename $lm '.gz')
+gunzip -c $lm | utils/find_arpa_oovs.pl $out_dir/words.txt \
+  > $out_dir/oovs_${lm_base}.txt
+
+# Removing all "illegal" combinations of <s> and </s>, which are supposed to
+# occur only at begin/end of utt.  These can cause determinization failures
+# of CLG [ends up being epsilon cycles].
+gunzip -c $lm \
+  | egrep -v '<s> <s>|</s> <s>|</s> </s>' \
+  | gzip -c > $tmpdir/lm.gz
+
+awk '{print $1}' $out_dir/words.txt > $tmpdir/voc
+
+# Change the LM vocabulary to be the intersection of the current LM vocabulary
+# and the set of words in the pronunciation lexicon. This also renormalizes the
+# LM by recomputing the backoff weights, and removes those n-grams whose
+# probabilities are lower than the backed-off estimates.
+change-lm-vocab -vocab $tmpdir/voc -lm $tmpdir/lm.gz -write-lm $tmpdir/out_lm \
+  $srilm_opts
+
+arpa2fst $tmpdir/out_lm | fstprint \
+  | utils/eps2disambig.pl | utils/s2eps.pl \
+  | fstcompile --isymbols=$out_dir/words.txt --osymbols=$out_dir/words.txt \
+     --keep_isymbols=false --keep_osymbols=false \
+  | fstrmepsilon > $out_dir/G.fst
+set +e
+fstisstochastic $out_dir/G.fst
+set -e
+# The output is like:
+# 9.14233e-05 -0.259833
+# we do expect the first of these 2 numbers to be close to zero (the second is
+# nonzero because the backoff weights make the states sum to >1).
+
+# Everything below is only for diagnostics.
+# Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
+# this might cause determinization failure of CLG.
+# #0 is treated as an empty word.
+mkdir -p $out_dir/tmpdir.g
+awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }}
+     END{print "0 0 #0 #0"; print "0";}' \
+  < "$lexicon" > $out_dir/tmpdir.g/select_empty.fst.txt
+
+fstcompile --isymbols=$out_dir/words.txt --osymbols=$out_dir/words.txt \
+  $out_dir/tmpdir.g/select_empty.fst.txt \
+  | fstarcsort --sort_type=olabel \
+  | fstcompose - $out_dir/G.fst > $out_dir/tmpdir.g/empty_words.fst
+
+fstinfo $out_dir/tmpdir.g/empty_words.fst | grep cyclic | grep -w 'y' \
+  && echo "Language model has cycles with empty words" && exit 1
+
+rm -r $out_dir/tmpdir.g
+
+
+echo "Succeeded in formatting LM: '$lm'"
diff --git a/egs/kaldi-vystadial-recipe/s5/utils/gen_topo.pl b/egs/kaldi-vystadial-recipe/s5/utils/gen_topo.pl
new file mode 100755
index 00000000000..1488a884d8e
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/utils/gen_topo.pl
@@ -0,0 +1,63 @@
+#!/usr/bin/perl
+
+# Copyright 2012  Johns Hopkins University (author: Daniel Povey)
+
+# Generate a topology file.  This allows control of the number of states in the
+# non-silence HMMs, and in the silence HMMs.
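+# (E.g. "utils/gen_topo.pl 3 5 4:5:6:7:8:9:10 1:2:3" emits a <Topology> block
+# with a 3-state left-to-right entry for phones 4-10 and a 5-state, more
+# densely connected entry for silence phones 1-3 -- a sketch of intended use.)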
+
+if(@ARGV != 4) {
+  print STDERR "Usage: utils/gen_topo.pl <num-nonsilence-states> <num-silence-states> <colon-separated-nonsilence-phones> <colon-separated-silence-phones>\n";
+  print STDERR "e.g.: utils/gen_topo.pl 3 5 4:5:6:7:8:9:10 1:2:3\n";
+  exit (1);
+}
+
+($num_nonsil_states, $num_sil_states, $nonsil_phones, $sil_phones) = @ARGV;
+
+( $num_nonsil_states >= 1 && $num_nonsil_states <= 100 ) || die "Unexpected number of nonsilence-model states $num_nonsil_states\n";
+( $num_sil_states >= 3 && $num_sil_states <= 100 ) || die "Unexpected number of silence-model states $num_sil_states\n";
+
+$nonsil_phones =~ s/:/ /g;
+$sil_phones =~ s/:/ /g;
+$nonsil_phones =~ m/^\d[ \d]+$/ || die "$0: bad arguments @ARGV\n";
+$sil_phones =~ m/^\d[ \d]+$/ || die "$0: bad arguments @ARGV\n";
+
+print "<Topology>\n";
+print "<TopologyEntry>\n";
+print "<ForPhones>\n";
+print "$nonsil_phones\n";
+print "</ForPhones>\n";
+for ($state = 0; $state < $num_nonsil_states; $state++) {
+  $statep1 = $state+1;
+  print "<State> $state <PdfClass> $state <Transition> $state 0.75 <Transition> $statep1 0.25 </State>\n";
+}
+print "<State> $num_nonsil_states </State>\n"; # non-emitting final state.
+print "</TopologyEntry>\n";
+# Now silence phones.  They have a different topology-- apart from the first and
+# last states, it's fully connected.
+$transp = 1.0 / ($num_sil_states-1);
+
+print "<TopologyEntry>\n";
+print "<ForPhones>\n";
+print "$sil_phones\n";
+print "</ForPhones>\n";
+print "<State> 0 <PdfClass> 0 ";
+for ($nextstate = 0; $nextstate < $num_sil_states-1; $nextstate++) { # Transitions to all but last
+  # emitting state.
+  print "<Transition> $nextstate $transp ";
+}
+print "</State>\n";
+for ($state = 1; $state < $num_sil_states-1; $state++) { # the central states all have transitions to
+  # themselves and to the last emitting state.
+  print "<State> $state <PdfClass> $state ";
+  for ($nextstate = 1; $nextstate < $num_sil_states; $nextstate++) {
+    print "<Transition> $nextstate $transp ";
+  }
+  print "</State>\n";
+}
+# Final emitting state (non-skippable).
+$state = $num_sil_states-1;
+print "<State> $state <PdfClass> $state <Transition> $state 0.75 <Transition> $num_sil_states 0.25 </State>\n";
+# Final nonemitting state:
+print "<State> $num_sil_states </State>\n";
+print "</TopologyEntry>\n";
+print "</Topology>\n";
diff --git a/egs/kaldi-vystadial-recipe/s5/utils/int2sym.pl b/egs/kaldi-vystadial-recipe/s5/utils/int2sym.pl
new file mode 100755
index 00000000000..13cc5ae9b1d
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/utils/int2sym.pl
@@ -0,0 +1,71 @@
+#!/usr/bin/perl
+# Copyright 2010-2012 Microsoft Corporation  Johns Hopkins University (Author: Daniel Povey)
+# Apache 2.0.
+
+undef $field_begin;
+undef $field_end;
+
+
+if ($ARGV[0] eq "-f") {
+  shift @ARGV;
+  $field_spec = shift @ARGV;
+  if ($field_spec =~ m/^\d+$/) {
+    $field_begin = $field_spec - 1; $field_end = $field_spec - 1;
+  }
+  if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesy (properly, 1-10)
+    if ($1 ne "") {
+      $field_begin = $1 - 1;  # Change to zero-based indexing.
+    }
+    if ($2 ne "") {
+      $field_end = $2 - 1;    # Change to zero-based indexing.
+    }
+  }
+  if (!defined $field_begin && !defined $field_end) {
+    die "Bad argument to -f option: $field_spec";
+  }
+}
+$symtab = shift @ARGV;
+if(!defined $symtab) {
+  print STDERR "Usage: int2sym.pl [options] symtab [input] > output\n" .
+    "options: [-f (<field>|<field-start>-<field-end>)]\n" .
+    "e.g.: -f 2, or -f 3-4\n";
+  exit(1);
+}
+
+open(F, "<$symtab") || die "Error opening symbol table file $symtab";
+while(<F>) {
+  @A = split(" ", $_);
+  @A == 2 || die "bad line in symbol table file: $_";
+  $int2sym{$A[1]} = $A[0];
+}
+
+sub int2sym {
+  my $a = shift @_;
+  my $pos = shift @_;
+  if($a !~  m:^\d+$:) { # not all digits..
+    $pos1 = $pos+1; # make it one-based.
+    die "int2sym.pl: found noninteger token $a [in position $pos1]\n";
+  }
+  $s = $int2sym{$a};
+  if(!defined ($s)) {
+    die "int2sym.pl: integer $a not in symbol table $symtab.";
+  }
+  return $s;
+}
+
+$error = 0;
+while (<>) {
+  @A = split(" ", $_);
+  for ($pos = 0; $pos <= $#A; $pos++) {
+    $a = $A[$pos];
+    if ( (!defined $field_begin || $pos >= $field_begin)
+         && (!defined $field_end || $pos <= $field_end)) {
+      $a = int2sym($a, $pos);
+    }
+    print $a . " ";
+  }
+  print "\n";
+}
+
+
+
diff --git a/egs/kaldi-vystadial-recipe/s5/utils/ln.pl b/egs/kaldi-vystadial-recipe/s5/utils/ln.pl
new file mode 100755
index 00000000000..594d3924ec8
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/utils/ln.pl
@@ -0,0 +1,58 @@
+#!/usr/bin/perl
+use File::Spec;
+
+if ( @ARGV < 2 ) {
+  print STDERR "usage: ln.pl input1 input2 dest-dir\n" .
+    "This script does a soft link of input1, input2, etc.\n" .
+    "to dest-dir, using relative links where possible.\n" .
+    "Note: input-n and dest-dir may both be absolute pathnames,\n" .
+    "or relative pathnames, relative to the current directory.\n";
+  exit(1);
+}
+
+$dir = pop @ARGV;
+if ( ! -d $dir ) {
+  print STDERR "ln.pl: last argument must be a directory ($dir is not a directory)\n";
+  exit(1);
+}
+
+$ans = 1; # true.
+
+$absdir = File::Spec->rel2abs($dir); # Get $dir as abs path.
+defined $absdir || die "No such directory $dir";
+foreach $file (@ARGV) {
+  $absfile = File::Spec->rel2abs($file); # Get $file as abs path.
+  defined $absfile || die "No such file or directory: $file";
+  @absdir_split = split("/", $absdir);
+  @absfile_split = split("/", $absfile);
+
+  $newfile = $absdir . "/" . $absfile_split[$#absfile_split]; # we'll use this
+  # as the destination in the link command.
+  $num_removed = 0;
+  while (@absdir_split > 0 && $absdir_split[0] eq $absfile_split[0]) {
+    shift @absdir_split;
+    shift @absfile_split;
+    $num_removed++;
+  }
+  if (-l $newfile) { # newfile is already a link -> safe to delete it.
+    unlink($newfile); # "unlink" just means delete.
+  }
+  if ($num_removed == 0) { # will use absolute pathnames.
+    $oldfile = "/" . join("/", @absfile_split);
+    $ret = symlink($oldfile, $newfile);
+  } else {
+    $num_dots = @absdir_split;
+    $oldfile = join("/", @absfile_split);
+    for ($n = 0; $n < $num_dots; $n++) {
+      $oldfile = "../" . $oldfile;
+    }
+    $ret = symlink($oldfile, $newfile);
+  }
+  $ans = $ans && $ret;
+  if (! $ret) {
+    print STDERR "Error linking $oldfile to $newfile\n";
+  }
+}
+
+exit ($ans == 1 ? 0 : 1);
+
diff --git a/egs/kaldi-vystadial-recipe/s5/utils/make_lexicon_fst.pl b/egs/kaldi-vystadial-recipe/s5/utils/make_lexicon_fst.pl
new file mode 100755
index 00000000000..a5334279c8c
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/utils/make_lexicon_fst.pl
@@ -0,0 +1,122 @@
+#!/usr/bin/perl -w
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# makes lexicon FST (no pron-probs involved).
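+# Output is a text-format FST, one arc per line ("src dest phone word [cost]").
+# E.g. with no optional silence, a word "ab" with pronunciation "A B" becomes
+# (a sketch): "0 1 A ab" followed by "1 0 B <eps>", looping back to state 0.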
+
+if(@ARGV != 1 && @ARGV != 3 && @ARGV != 4) {
+  die "Usage: make_lexicon_fst.pl lexicon.txt [silprob silphone [sil_disambig_sym]] >lexiconfst.txt"
+}
+
+$lexfn = shift @ARGV;
+if(@ARGV == 0) {
+  $silprob = 0.0;
+} elsif (@ARGV == 2){
+  ($silprob,$silphone) = @ARGV;
+} else {
+  ($silprob,$silphone,$sildisambig) = @ARGV;
+}
+if($silprob != 0.0) {
+  $silprob < 1.0 || die "Sil prob cannot be >= 1.0";
+  $silcost = -log($silprob);
+  $nosilcost = -log(1.0 - $silprob);
+}
+
+
+open(L, "<$lexfn") || die "Error opening lexicon $lexfn";
+
+
+
+sub is_sil {
+  # Return true (1) if provided with a phone-sequence
+  # that means silence.
+  # @_ is the parameters of the function
+  # This function returns true if @_ equals ( $silphone )
+  # or something of the form ( "#0", $silphone, "#1" )
+  # where the "#0" and "#1" are disambiguation symbols.
+  return ( @_ == 1 && $_[0] eq $silphone ||
+           (@_ == 3 && $_[1] eq $silphone &&
+            $_[0] =~ m/^\#\d+$/ &&
+            $_[2] =~ m/^\#\d+$/));
+}
+
+if( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state which is numbered zero.
+  $loopstate = 0;
+  $nextstate = 1; # next unallocated state.
+  while(<L>) {
+    @A = split(" ", $_);
+    $w = shift @A;
+
+    $s = $loopstate;
+    $word_or_eps = $w;
+    while (@A > 0) {
+      $p = shift @A;
+      if(@A > 0) {
+        $ns = $nextstate++;
+      } else {
+        $ns = $loopstate;
+      }
+      print "$s\t$ns\t$p\t$word_or_eps\n";
+      $word_or_eps = "<eps>";
+      $s = $ns;
+    }
+  }
+  print "$loopstate\t0\n"; # final-cost.
+} else { # have silence probs.
+  $startstate = 0;
+  $loopstate = 1;
+  $silstate = 2; # state from where we go to loopstate after emitting silence.
+  print "$startstate\t$loopstate\t<eps>\t<eps>\t$nosilcost\n"; # no silence.
+  if (!defined $sildisambig) {
+    print "$startstate\t$loopstate\t$silphone\t<eps>\t$silcost\n"; # silence.
+    print "$silstate\t$loopstate\t$silphone\t<eps>\n"; # no cost.
+    $nextstate = 3;
+  } else {
+    $disambigstate = 3;
+    $nextstate = 4;
+    print "$startstate\t$disambigstate\t$silphone\t<eps>\t$silcost\n"; # silence.
+    print "$silstate\t$disambigstate\t$silphone\t<eps>\n"; # no cost.
+    print "$disambigstate\t$loopstate\t$sildisambig\t<eps>\n"; # silence disambiguation symbol.
+  }
+  while(<L>) {
+    @A = split(" ", $_);
+    $w = shift @A;
+    @pron = @A; # save the full pronunciation: @A is consumed below, and
+                # is_sil() expects the whole phone sequence.
+
+    $s = $loopstate;
+    $word_or_eps = $w;
+    while (@A > 0) {
+      $p = shift @A;
+      if(@A > 0) {
+        $ns = $nextstate++;
+        print "$s\t$ns\t$p\t$word_or_eps\n";
+        $word_or_eps = "<eps>";
+        $s = $ns;
+      } else {
+        if(!is_sil(@pron)){
+          # This is non-deterministic but relatively compact,
+          # and avoids epsilons.
+          print "$s\t$loopstate\t$p\t$word_or_eps\t$nosilcost\n";
+          print "$s\t$silstate\t$p\t$word_or_eps\t$silcost\n";
+        } else {
+          # no point putting opt-sil after silence word.
+          print "$s\t$loopstate\t$p\t$word_or_eps\n";
+        }
+        $word_or_eps = "<eps>";
+      }
+    }
+  }
+  print "$loopstate\t0\n"; # final-cost.
+}
diff --git a/egs/kaldi-vystadial-recipe/s5/utils/make_unigram_grammar.pl b/egs/kaldi-vystadial-recipe/s5/utils/make_unigram_grammar.pl
new file mode 100755
index 00000000000..314a66a10cf
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/utils/make_unigram_grammar.pl
@@ -0,0 +1,54 @@
+#!/usr/bin/perl
+# Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# This script is used in discriminative training.
+# This script makes a simple unigram-loop version of G.fst
+# using a unigram grammar estimated from some training transcripts.
+# This is for MMI training.
+# We don't have any silences in G.fst; these are supplied by the
+# optional silences in the lexicon.
+
+# Note: the symbols in the transcripts become the input and output
+# symbols of G.txt; these can be numeric or not.
+
+if(@ARGV != 0) {
+  die "Usage: make_unigram_grammar.pl < text-transcripts > G.txt"
+}
+
+$totcount = 0;
+$nl = 0;
+while (<>) {
+  @A = split(" ", $_);
+  foreach $a (@A) {
+    $count{$a}++;
+    $totcount++;
+  }
+  $nl++;
+  $totcount++; # Treat end-of-sentence as a symbol for purposes of
+               # $totcount, so the grammar is properly stochastic. This doesn't
+               # become </s>, it just becomes the final-prob.
+}
+
+foreach $a (keys %count) {
+  $prob = $count{$a} / $totcount;
+  $cost = -log($prob); # Negated natural-log probs.
+  print "0\t0\t$a\t$a\t$cost\n";
+}
+# Final cost on the single state, from the end-of-sentence probability.
+$final_prob = $nl / $totcount;
+$final_cost = -log($final_prob);
+print "0\t$final_cost\n";
+
diff --git a/egs/kaldi-vystadial-recipe/s5/utils/mkgraph.sh b/egs/kaldi-vystadial-recipe/s5/utils/mkgraph.sh
new file mode 100755
index 00000000000..1134ba778c0
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/utils/mkgraph.sh
@@ -0,0 +1,122 @@
+#!/bin/bash
+# Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
+# Apache 2.0
+
+# This script creates a fully expanded decoding graph (HCLG) that represents
+# all the language-model, pronunciation dictionary (lexicon), context-dependency,
+# and HMM structure in our model. The output is a Finite State Transducer
+# that has word-ids on the output, and pdf-ids on the input (these are indexes
+# that resolve to Gaussian Mixture Models).
+# See
+#  http://kaldi.sourceforge.net/graph_recipe_test.html
+# (this is compiled from this repository using Doxygen,
+# the source for this part is in src/doc/graph_recipe_test.dox)
+
+
+N=3
+P=1
+reverse=false
+
+for x in `seq 2`; do
+  [ "$1" == "--mono" ] && N=1 && P=0 && shift;
+  [ "$1" == "--quinphone" ] && N=5 && P=2 && shift;
+  [ "$1" == "--reverse" ] && reverse=true && shift;
+done
+
+if [ $# != 3 ]; then
+  echo "Usage: utils/mkgraph.sh [options] <lang-dir> <model-dir> <graph-dir>"
+  echo "e.g.: utils/mkgraph.sh data/lang_test exp/tri1/ exp/tri1/graph"
+  echo " Options:"
+  echo " --mono # For monophone models."
+  echo " --quinphone # For models with 5-phone context (3 is default)"
+  exit 1;
+fi
+
+if [ -f path.sh ]; then . ./path.sh; fi
+
+lang=$1
+tree=$2/tree
+model=$2/final.mdl
+dir=$3
+
+mkdir -p $dir
+
+tscale=1.0
+loopscale=0.1
+
+# If $lang/tmp/LG.fst does not exist or is older than its sources, make it...
+# (note: the [[ ]] brackets make the || type operators work (inside [ ], we
+# would have to use -o instead), -f means file exists, and -ot means older than).
+
+required="$lang/L.fst $lang/G.fst $lang/phones.txt $lang/words.txt $lang/phones/silence.csl $lang/phones/disambig.int $model $tree"
+for f in $required; do
+  [ !
-f $f ] && echo "mkgraph.sh: expected $f to exist" && exit 1; +done + +mkdir -p $lang/tmp +# Note: [[ ]] is like [ ] but enables certain extra constructs, e.g. || in +# place of -o +if [[ ! -s $lang/tmp/LG.fst || $lang/tmp/LG.fst -ot $lang/G.fst || \ + $lang/tmp/LG.fst -ot $lang/L_disambig.fst ]]; then + fsttablecompose $lang/L_disambig.fst $lang/G.fst | fstdeterminizestar --use-log=true | \ + fstminimizeencoded > $lang/tmp/LG.fst || exit 1; + fstisstochastic $lang/tmp/LG.fst || echo "[info]: LG not stochastic." +fi + + +clg=$lang/tmp/CLG_${N}_${P}.fst + +if [[ ! -s $clg || $clg -ot $lang/tmp/LG.fst ]]; then + fstcomposecontext --context-size=$N --central-position=$P \ + --read-disambig-syms=$lang/phones/disambig.int \ + --write-disambig-syms=$lang/tmp/disambig_ilabels_${N}_${P}.int \ + $lang/tmp/ilabels_${N}_${P} < $lang/tmp/LG.fst >$clg + fstisstochastic $clg || echo "[info]: CLG not stochastic." +fi + +if [[ ! -s $dir/Ha.fst || $dir/Ha.fst -ot $model \ + || $dir/Ha.fst -ot $lang/tmp/ilabels_${N}_${P} ]]; then + if $reverse; then + make-h-transducer --reverse=true --push_weights=true \ + --disambig-syms-out=$dir/disambig_tid.int \ + --transition-scale=$tscale $lang/tmp/ilabels_${N}_${P} $tree $model \ + > $dir/Ha.fst || exit 1; + else + make-h-transducer --disambig-syms-out=$dir/disambig_tid.int \ + --transition-scale=$tscale $lang/tmp/ilabels_${N}_${P} $tree $model \ + > $dir/Ha.fst || exit 1; + fi +fi + +if [[ ! -s $dir/HCLGa.fst || $dir/HCLGa.fst -ot $dir/Ha.fst || \ + $dir/HCLGa.fst -ot $clg ]]; then + fsttablecompose $dir/Ha.fst $clg | fstdeterminizestar --use-log=true \ + | fstrmsymbols $dir/disambig_tid.int | fstrmepslocal | \ + fstminimizeencoded > $dir/HCLGa.fst || exit 1; + fstisstochastic $dir/HCLGa.fst || echo "HCLGa is not stochastic" +fi + +if [[ ! -s $dir/HCLG.fst || $dir/HCLG.fst -ot $dir/HCLGa.fst ]]; then + add-self-loops --self-loop-scale=$loopscale --reorder=true \ + $model < $dir/HCLGa.fst > $dir/HCLG.fst || exit 1; + + if [ $tscale == 1.0 -a $loopscale == 1.0 ]; then + # No point doing this test if transition-scale not 1, as it is bound to fail. + fstisstochastic $dir/HCLG.fst || echo "[info]: final HCLG is not stochastic." + fi +fi + +# keep a copy of the lexicon and a list of silence phones with HCLG... +# this means we can decode without reference to the $lang directory. + + +cp $lang/words.txt $dir/ || exit 1; +mkdir -p $dir/phones +cp $lang/phones/word_boundary.* $dir/phones/ 2>/dev/null # might be needed for ctm scoring, + # but ignore the error if it's not there. +cp $lang/phones/silence.csl $dir/phones/ || exit 1; +cp $lang/phones.txt $dir/ 2>/dev/null # ignore the error if it's not there. + +# to make const fst: +# fstconvert --fst_type=const $dir/HCLG.fst $dir/HCLG_c.fst + diff --git a/egs/kaldi-vystadial-recipe/s5/utils/nnet/analyze_alignments.sh b/egs/kaldi-vystadial-recipe/s5/utils/nnet/analyze_alignments.sh new file mode 100755 index 00000000000..dc01bb872e0 --- /dev/null +++ b/egs/kaldi-vystadial-recipe/s5/utils/nnet/analyze_alignments.sh @@ -0,0 +1,71 @@ +#!/bin/bash +# Copyright 2012 Karel Vesely + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# To be run from ..
+
+
+if [ $# != 4 ]; then
+  echo "Usage: $0 <tag> <alignment-rspecifier> <model-file> <lang-dir>"
+  echo " e.g.: $0 'TRAINING SET' 'ark:gunzip -c \$alidir/ali.gz |' tri1/final.mdl <lang-dir>"
+  exit 1;
+fi
+
+if [ -f path.sh ]; then . path.sh; fi
+
+tag=$1
+ali=$2
+model=$3
+lang=$4
+
+tmpfile=$(mktemp)
+
+echo "%%%%%% .pdf STATS, $tag %%%%%%"
+analyze-counts --binary=false --rescale-to-probs=true --show-histogram=true \
+  "ark:ali-to-pdf --print-args=false $model \"$ali\" ark:- 2>/dev/null |" \
+  $tmpfile.0 2>&1
+echo
+
+echo "%%%%%% .phone STATS, $tag %%%%%%"
+#prob stats
+analyze-counts --binary=false --rescale-to-probs=true --show-histogram=true \
+  "ark:ali-to-phones --print-args=false --per-frame=true $model \"$ali\" ark:- |" \
+  $tmpfile.1 2>&1
+#frame stats
+analyze-counts --binary=false \
+  "ark:ali-to-phones --print-args=false --per-frame=true $model \"$ali\" ark:- |" \
+  $tmpfile.2 2>/dev/null
+echo
+
+echo "%%%%%% .ali STATS, $tag %%%%%%"
+analyze-counts --binary=false --rescale-to-probs=true --show-histogram=true "$ali" /dev/null 2>&1
+echo
+
+echo "%%%%%% .phone STATS (VERBOSE), $tag %%%%%%"
+#paste and show the logs
+cat $tmpfile.1 | sed -e 's|^\s*\[ ||' -e 's|\]||' | tr ' ' '\n' >$tmpfile.1a
+cat $tmpfile.2 | sed -e 's|^\s*\[ ||' -e 's|\]||' | tr ' ' '\n' >$tmpfile.2a
+paste $tmpfile.1a $tmpfile.2a > $tmpfile
+paste $lang/phones.txt $tmpfile | awk '{printf "%10s %4d %f %d\n", $1, $2, $3, $4;}'
+echo
+
+echo "%%%%%% .pdf STATS (VERBOSE), $tag %%%%%%"
+cat $tmpfile.0
+echo "%%%%%% END"
+
+rm $tmpfile{,.0,.1,.2,.1a,.2a}
+
+
+
diff --git a/egs/kaldi-vystadial-recipe/s5/utils/nnet/gen_dct_mat.py b/egs/kaldi-vystadial-recipe/s5/utils/nnet/gen_dct_mat.py
new file mode 100755
index 00000000000..bff014af447
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/utils/nnet/gen_dct_mat.py
@@ -0,0 +1,53 @@
+#!/usr/bin/python -u
+
+# ./gen_dct_mat.py
+# script generates matrix with DCT transform
+#
+# author: Karel Vesely
+#
+
+from math import *
+import sys
+
+
+from optparse import OptionParser
+
+parser = OptionParser()
+parser.add_option('--fea-dim', dest='dim', help='feature dimension')
+parser.add_option('--splice', dest='splice', help='applied splice value')
+parser.add_option('--dct-basis', dest='dct_basis', help='number of DCT basis')
+(options, args) = parser.parse_args()
+
+if(options.dim == None):
+    parser.print_help()
+    sys.exit(1)
+
+dim=int(options.dim)
+splice=int(options.splice)
+dct_basis=int(options.dct_basis)
+
+timeContext=2*splice+1
+
+
+#generate the DCT matrix
+M_PI = 3.1415926535897932384626433832795
+M_SQRT2 = 1.4142135623730950488016887
+
+
+#generate small DCT matrix
+print '['
+for k in range(dct_basis):
+    for m in range(dim):
+        for n in range(timeContext):
+            if(n==0):
+                print m*'0 ',
+            else:
+                print (dim-1)*'0 ',
+            print str(sqrt(2.0/timeContext)*cos(M_PI/timeContext*k*(n+0.5))),
+            if(n==timeContext-1):
+                print (dim-m-1)*'0 ',
+        print
+    print
+
+print ']'
+
diff --git a/egs/kaldi-vystadial-recipe/s5/utils/nnet/gen_hamm_mat.py b/egs/kaldi-vystadial-recipe/s5/utils/nnet/gen_hamm_mat.py
new file mode 100755
index
00000000000..31a6d877d00
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/utils/nnet/gen_hamm_mat.py
@@ -0,0 +1,45 @@
+#!/usr/bin/python -u
+
+# ./gen_hamm_mat.py
+# script generates diagonal matrix with hamming window values
+#
+# author: Karel Vesely
+#
+
+from math import *
+import sys
+
+
+from optparse import OptionParser
+
+parser = OptionParser()
+parser.add_option('--fea-dim', dest='dim', help='feature dimension')
+parser.add_option('--splice', dest='splice', help='applied splice value')
+(options, args) = parser.parse_args()
+
+if(options.dim == None):
+    parser.print_help()
+    sys.exit(1)
+
+dim=int(options.dim)
+splice=int(options.splice)
+
+
+#generate the diagonal matrix with hammings
+M_2PI = 6.283185307179586476925286766559005
+
+dim_mat=(2*splice+1)*dim
+timeContext=2*splice+1
+print '['
+for row in range(dim_mat):
+    for col in range(dim_mat):
+        if col!=row:
+            print '0',
+        else:
+            i=int(row/dim)
+            print str(0.54 - 0.46*cos((M_2PI * i) / (timeContext-1))),
+    print
+
+print ']'
+
+
diff --git a/egs/kaldi-vystadial-recipe/s5/utils/nnet/gen_mlp_init.py b/egs/kaldi-vystadial-recipe/s5/utils/nnet/gen_mlp_init.py
new file mode 100755
index 00000000000..305bd853c3f
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/utils/nnet/gen_mlp_init.py
@@ -0,0 +1,83 @@
+#!/usr/bin/python -u
+
+# ./gen_mlp_init.py
+# script generating NN initialization for training with TNet
+#
+# author: Karel Vesely
+#
+
+import math, random
+import sys
+
+
+from optparse import OptionParser
+
+parser = OptionParser()
+parser.add_option('--dim', dest='dim', help='d1:d2:d3 layer dimensions in the network')
+parser.add_option('--gauss', dest='gauss', help='use gaussian noise for weights', action='store_true', default=False)
+parser.add_option('--negbias', dest='negbias', help='use uniform [-4.1,-3.9] for bias (default: all 0.0)', action='store_true', default=False)
+parser.add_option('--inputscale', dest='inputscale', help='scale the weights by 3/sqrt(Ninputs)', action='store_true', default=False)
+parser.add_option('--linBNdim', dest='linBNdim', help='dim of linear bottleneck (sigmoids will be omitted, bias will be zero)',default=0)
+parser.add_option('--seed', dest='seedval', help='seed for random generator',default=0)
+(options, args) = parser.parse_args()
+
+if(options.dim == None):
+    parser.print_help()
+    sys.exit(1)
+
+#seeding
+seedval=int(options.seedval)
+if(seedval != 0):
+    random.seed(seedval)
+
+
+dimStrL = options.dim.split(':')
+
+dimL = []
+for i in range(len(dimStrL)):
+    dimL.append(int(dimStrL[i]))
+
+
+#print dimL,'linBN',options.linBNdim
+
+for layer in range(len(dimL)-1):
+    print '<biasedlinearity>', dimL[layer+1], dimL[layer]
+    #weight matrix
+    print '['
+    for row in range(dimL[layer+1]):
+        for col in range(dimL[layer]):
+            if(options.gauss):
+                if(options.inputscale):
+                    print 3/math.sqrt(dimL[layer])*random.gauss(0.0,1.0),
+                else:
+                    print 0.1*random.gauss(0.0,1.0),
+            else:
+                if(options.inputscale):
+                    print (random.random()-0.5)*2*3/math.sqrt(dimL[layer]),
+                else:
+                    print random.random()/5.0-0.1,
+        print #newline for each row
+    print ']'
+    #bias vector
+    print '[',
+    for idx in range(dimL[layer+1]):
+        if(int(options.linBNdim) == dimL[layer+1]):
+            print '0.0',
+        elif(layer == len(dimL)-2): #last layer (softmax)
+            print '0.0',
+        elif(options.negbias):
+            print random.random()/5.0-4.1,
+        else:
+            print '0.0',
+    print ']'
+
+    if(int(options.linBNdim) != dimL[layer+1]):
+        if(layer == len(dimL)-2):
+            print '<softmax>', dimL[layer+1], dimL[layer+1]
+        else:
+            print '<sigmoid>', dimL[layer+1], dimL[layer+1]
+
+
+
+
+
diff --git
a/egs/kaldi-vystadial-recipe/s5/utils/nnet/train_nnet_scheduler.sh b/egs/kaldi-vystadial-recipe/s5/utils/nnet/train_nnet_scheduler.sh new file mode 100755 index 00000000000..612a365316c --- /dev/null +++ b/egs/kaldi-vystadial-recipe/s5/utils/nnet/train_nnet_scheduler.sh @@ -0,0 +1,119 @@ +#!/bin/bash + +############################## +#check for obligatory parameters +echo +echo %%% CONFIG +echo learn_rate: ${learn_rate?$0: learn_rate not specified} +echo momentum: ${momentum?$0: momentum not specified} +echo l1_penalty: ${l1_penalty?$0: l1_penalty not specified} +echo l2_penalty: ${l2_penalty?$0: l2_penalty not specified} +echo +echo bunch_size: ${bunch_size?$0: bunch_size not specified} +echo cache_size: ${cache_size?$0: cache_size not specified} +echo randomize: ${randomize?$0: randomize not specified} +echo +echo max_iters: ${max_iters?$0: max_iters not specified} +echo start_halving_inc: ${start_halving_inc?$0: start_halving_inc not specified} +echo end_halving_inc: ${end_halving_inc?$0: end_halving_inc not specified} +echo halving_factor: ${halving_factor?$0: halving_factor not specified} +echo +echo TRAIN_TOOL: ${TRAIN_TOOL?$0: TRAIN_TOOL not specified} +echo +echo feats_cv: ${feats_cv?$0: feats_cv not specified} +echo feats_tr: ${feats_tr?$0: feats_tr not specified} +echo labels: ${labels?$0: labels not specified} +echo mlp_init: ${mlp_init?$0: mlp_init not specified} +echo ${feature_transform:+feature_transform: $feature_transform} +echo ${min_iters:+min_iters: $min_iters} +echo %%% CONFIG +echo + + +############################## +#start training + +#prerun cross-validation +$TRAIN_TOOL --cross-validate=true \ + --bunchsize=$bunch_size --cachesize=$cache_size \ + ${feature_transform:+ --feature-transform=$feature_transform} \ + $mlp_init "$feats_cv" "$labels" \ + 2> $dir/log/prerun.log || exit 1; + +acc=$(cat $dir/log/prerun.log | awk '/FRAME_ACCURACY/{ acc=$3; sub(/%/,"",acc); } END{print acc}') +echo "CROSSVAL PRERUN ACCURACY $acc" + +#training +mlp_best=$mlp_init +mlp_base=${mlp_init##*/}; mlp_base=${mlp_base%.*} + +iter=0 +halving=0 +for iter in $(seq -w $max_iters); do + echo -n "ITERATION $iter: " + mlp_next=$dir/nnet/${mlp_base}_iter${iter} + + #training + $TRAIN_TOOL \ + --learn-rate=$learn_rate --momentum=$momentum --l1-penalty=$l1_penalty --l2-penalty=$l2_penalty \ + --bunchsize=$bunch_size --cachesize=$cache_size --randomize=$randomize \ + ${feature_transform:+ --feature-transform=$feature_transform} \ + $mlp_best "$feats_tr" "$labels" $mlp_next \ + 2> $dir/log/iter$iter.log || exit 1; + + tr_acc=$(cat $dir/log/iter$iter.log | awk '/FRAME_ACCURACY/{ acc=$3; sub(/%/,"",acc); } END{print acc}') + echo -n "TRAIN ACCURACY $(printf "%.2f" $tr_acc) LRATE $(printf "%.6g" $learn_rate), " + + #cross-validation + $TRAIN_TOOL --cross-validate=true \ + --bunchsize=$bunch_size --cachesize=$cache_size \ + ${feature_transform:+ --feature-transform=$feature_transform} \ + $mlp_next "$feats_cv" "$labels" \ + 2>>$dir/log/iter$iter.log || exit 1; + + acc_new=$(cat $dir/log/iter$iter.log | awk '/FRAME_ACCURACY/{ acc=$3; sub(/%/,"",acc); } END{print acc}') + echo -n "CROSSVAL ACCURACY $(printf "%.2f" $acc_new), " + + #accept or reject new parameters + acc_prev=$acc + if [ "1" == "$(awk "BEGIN{print($acc_new>$acc);}")" ]; then + acc=$acc_new + mlp_best=$dir/nnet/${mlp_base}_iter${iter}_learnrate${learn_rate}_tr$(printf "%.2f" $tr_acc)_cv$(printf "%.2f" $acc_new) + mv $mlp_next $mlp_best + echo "nnet accepted ($(basename $mlp_best))" + else + 
mlp_reject=$dir/nnet/${mlp_base}_iter${iter}_learnrate${learn_rate}_tr$(printf "%.2f" $tr_acc)_cv$(printf "%.2f" $acc_new)_rejected
+    mv $mlp_next $mlp_reject
+    echo "nnet rejected ($(basename $mlp_reject))"
+  fi
+
+  #stopping criterion
+  if [[ "1" == "$halving" && "1" == "$(awk "BEGIN{print($acc < $acc_prev+$end_halving_inc)}")" ]]; then
+    if [[ "$min_iters" != "" ]]; then
+      if [ $min_iters -gt $iter ]; then
+        echo we were supposed to finish, but we continue, min_iters : $min_iters
+        continue
+      fi
+    fi
+    echo finished, too small improvement $(awk "BEGIN{print($acc-$acc_prev)}")
+    break
+  fi
+
+  #start annealing when improvement is low
+  if [ "1" == "$(awk "BEGIN{print($acc < $acc_prev+$start_halving_inc)}")" ]; then
+    halving=1
+  fi
+
+  #do annealing
+  if [ "1" == "$halving" ]; then
+    learn_rate=$(awk "BEGIN{print($learn_rate*$halving_factor)}")
+  fi
+done
+
+#select the best network
+if [ $mlp_best != $mlp_init ]; then
+  mlp_final=${mlp_best}_final_
+  ( cd $dir/nnet; ln -s $(basename $mlp_best) $(basename $mlp_final); )
+fi
+
+
diff --git a/egs/kaldi-vystadial-recipe/s5/utils/parse_options.sh b/egs/kaldi-vystadial-recipe/s5/utils/parse_options.sh
new file mode 100755
index 00000000000..fa7113a0cc6
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/utils/parse_options.sh
@@ -0,0 +1,84 @@
+#!/bin/bash
+
+# Copyright 2012 Johns Hopkins University (Author: Daniel Povey);
+#                Arnab Ghoshal
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Parse command-line options.
+# To be sourced by another script (as in ". parse_options.sh").
+# Option format is: --option-name arg
+# and shell variable "option_name" gets set to value "arg."
+# The exception is --help, which takes no arguments, but prints the
+# $help_message variable (if defined).
+
+# The following assignment allows the --config variable to be specified
+# in all cases.
+# The following test will work even if the calling script disallows unset
+# variables (using set -u or set -o nounset).
+[ -z "${config:-}" ] && config=
+
+while true; do
+  [ -z "${1:-}" ] && break; # break if there are no arguments
+  case "$1" in
+    # If the enclosing script is called with --help option, print the help
+    # message and exit. Scripts should put help messages in $help_message
+    --help) if [ -z "$help_message" ]; then echo "No help found.";
+            else printf "$help_message\n"; fi;
+            exit 0 ;;
+    # If the first command-line argument begins with "--" (e.g. --foo-bar),
+    # then work out the variable name as $name, which will equal "foo_bar".
+    --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
+      # Next we test whether the variable in question is undefined-- if so it's
+      # an invalid option and we die. Note: $0 evaluates to the name of the
+      # enclosing script.
+      # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
+      # is undefined. We then have to wrap this test inside "eval" because
+      # foo_bar is itself inside a variable ($name).
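+      # Illustration only (not part of the original script): if the caller
+      # is invoked as "somescript.sh --foo-bar baz", then $name becomes
+      # "foo_bar" and the test below expands to
+      #   eval '[ -z "${foo_bar+xxx}" ]'
+      # which succeeds only when foo_bar was never declared, making the
+      # option invalid; a script that set "foo_bar=true" beforehand would
+      # accept it, and the later assignment becomes: eval foo_bar="baz".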
+ eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" && exit 1; + + oldval="`eval echo \\$$name`"; + # Work out whether we seem to be expecting a Boolean argument. + if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then + was_bool=true; + else + was_bool=false; + fi + + # Set the variable to the right value-- the escaped quotes make it work if + # the option had spaces, like --cmd "queue.pl -sync y" + eval $name=\"$2\"; + + # Check that Boolean-valued arguments are really Boolean. + if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then + echo "$0: expected \"true\" or \"false\": --$name $2" + exit 1; + fi + shift 2; + ;; + *) break; + esac +done + + +# Override any of the options, if --config was specified. +[ -z "$config" ] || . $config || exit 1; + +# Check for an empty argument to the --cmd option, which can easily occur as a +# result of scripting errors. +[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" && exit 1; + +true; # so this script returns code zero. + diff --git a/egs/kaldi-vystadial-recipe/s5/utils/prepare_lang.sh b/egs/kaldi-vystadial-recipe/s5/utils/prepare_lang.sh new file mode 100755 index 00000000000..c729de6474f --- /dev/null +++ b/egs/kaldi-vystadial-recipe/s5/utils/prepare_lang.sh @@ -0,0 +1,275 @@ +#!/bin/bash +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey); +# Arnab Ghoshal + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +# This script prepares a directory such as data/lang/, in the standard format, +# given a source directory containing a dictionary lexicon.txt in a form like: +# word phone1 phone2 ... phonen +# per line (alternate prons would be separate lines). +# and also files silence_phones.txt, nonsilence_phones.txt, optional_silence.txt +# and extra_questions.txt +# Here, silence_phones.txt and nonsilence_phones.txt are lists of silence and +# non-silence phones respectively (where silence includes various kinds of +# noise, laugh, cough, filled pauses etc., and nonsilence phones includes the +# "real" phones.) +# In each line of those files is a list of phones, and the phones on each line +# are assumed to correspond to the same "base phone", i.e. they will be +# different stress or tone variations of the same basic phone. +# The file "optional_silence.txt" contains just a single phone (typically SIL) +# which is used for optional silence in the lexicon. +# extra_questions.txt might be empty; typically will consist of lists of phones, +# all members of each list with the same stress or tone; and also possibly a +# list for the silence phones. This will augment the automtically generated +# questions (note: the automatically generated ones will treat all the +# stress/tone versions of a phone the same, so will not "get to ask" about +# stress or tone). + +# This script adds word-position-dependent phones and constructs a host of other +# derived files, that go in data/lang/. + +# Begin configuration section. 
+num_sil_states=5
+num_nonsil_states=3
+position_dependent_phones=true
+# may be set to false when position-dependent phones and word_boundary.txt
+# have been generated by another source
+reverse=false
+share_silence_phones=false # if true, then share pdfs of different silence
+                           # phones together.
+sil_prob=0.5
+# end configuration sections
+
+. utils/parse_options.sh
+
+if [ $# -ne 4 ]; then
+  echo "usage: utils/prepare_lang.sh <dict-src-dir> <oov-dict-entry> <tmp-dir> <lang-dir>"
+  echo "e.g.: utils/prepare_lang.sh data/local/dict \"<SPOKEN_NOISE>\" data/local/lang data/lang"
+  echo "options: "
+  echo " --num-sil-states <number of states>      # default: 5, #states in silence models."
+  echo " --num-nonsil-states <number of states>   # default: 3, #states in non-silence models."
+  echo " --position-dependent-phones (true|false) # default: true; if true, use _B, _E, _S & _I"
+  echo "                                          # markers on phones to indicate word-internal positions. "
+  echo " --reverse (true|false)                   # reverse lexicon."
+  echo " --share-silence-phones (true|false)      # default: false; if true, share pdfs of "
+  echo "                                          # all silence phones. "
+  echo " --sil-prob <probability of silence>      # default: 0.5 [must have 0 < silprob < 1]"
+  exit 1;
+fi
+
+srcdir=$1
+oov_word=$2
+tmpdir=$3
+dir=$4
+mkdir -p $dir $tmpdir $dir/phones
+
+[ -f path.sh ] && . ./path.sh
+
+utils/validate_dict_dir.pl $srcdir || exit 1;
+
+if $position_dependent_phones; then
+  # Create $tmpdir/lexicon.original from $srcdir/lexicon.txt by
+  # adding the markers _B, _E, _S, _I depending on word position.
+  # In this recipe, these markers apply to silence also.
+
+  perl -ane '@A=split(" ",$_); $w = shift @A; @A>0||die;
+        if(@A==1) { print "$w $A[0]_S\n"; } else { print "$w $A[0]_B ";
+        for($n=1;$n<@A-1;$n++) { print "$A[$n]_I "; } print "$A[$n]_E\n"; } ' \
+    < $srcdir/lexicon.txt > $tmpdir/lexicon.original || exit 1;
+
+  # create $tmpdir/phone_map.txt
+  # this has the format (on each line)
+  # <original phone> <version 1 of original phone> <version 2> ...
+  # where the versions depend on the position of the phone within a word.
+  # For instance, we'd have:
+  # AA AA_B AA_E AA_I AA_S
+  # for (B)egin, (E)nd, (I)nternal and (S)ingleton
+  # and in the case of silence
+  # SIL SIL SIL_B SIL_E SIL_I SIL_S
+  # [because SIL on its own is one of the variants; this is for when it doesn't
+  # occur inside a word but as an option in the lexicon.]
+
+  # This phone map expands the phone lists into all the word-position-dependent
+  # versions of the phone lists.
+
+  cat <(for x in `cat $srcdir/silence_phones.txt`; do for y in "" "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \
+    <(for x in `cat $srcdir/nonsilence_phones.txt`; do for y in "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \
+    > $tmpdir/phone_map.txt
+else
+  cp $srcdir/lexicon.txt $tmpdir/lexicon.original
+  # there might be clusters of phones on one line
+  cat $srcdir/silence_phones.txt $srcdir/nonsilence_phones.txt | \
+    sed 's/ /\n/g' | awk '(NF>0){print}' > $tmpdir/phones
+  paste -d' ' $tmpdir/phones $tmpdir/phones > $tmpdir/phone_map.txt
+fi
+
+if $reverse; then
+  echo "reversing lexicon."
+  cat $tmpdir/lexicon.original \
+    | awk '{printf "%s ",$1;for(i=NF;i>1;i--){printf "%s ",$i;}printf "\n"}' \
+    > $tmpdir/lexicon.txt
+else
+  mv $tmpdir/lexicon.original $tmpdir/lexicon.txt
+fi
+
+
+mkdir -p $dir/phones # various sets of phones...
+
+# Sets of phones for use in clustering, and making monophone systems.
+
+if $share_silence_phones; then
+  # build a roots file that will force all the silence phones to share the
+  # same pdf's. [three distinct states, only the transitions will differ.]
+  # 'shared'/'not-shared' means, do we share the 3 states of the HMM
+  # in the same tree-root?
+  # Sharing across models (phones) is achieved by writing several phones
+  # into one line of roots.txt (shared/not-shared doesn't affect this).
+  # 'shared split' means we have 1 tree-root for the 3 states of the HMM
+  # (but we get to ask about the HMM-position when we split).
+  # 'not-shared not-split' means we have separate tree roots for the 3 states,
+  # but we never split the tree so they remain stumps,
+  # so all phones in the line correspond to the same model.
+
+  cat $srcdir/silence_phones.txt | awk '{printf("%s ", $0); } END{printf("\n");}' | cat - $srcdir/nonsilence_phones.txt | \
+    utils/apply_map.pl $tmpdir/phone_map.txt > $dir/phones/sets.txt
+  cat $dir/phones/sets.txt | awk '{if(NR==1) print "not-shared", "not-split", $0; else print "shared", "split", $0;}' > $dir/phones/roots.txt
+else
+  # different silence phones will have different GMMs. [note: here, all "shared split" means
+  # is that we may have one GMM for all the states, or we can split on states. because they're
+  # context-independent phones, they don't see the context.]
+  cat $srcdir/{,non}silence_phones.txt | utils/apply_map.pl $tmpdir/phone_map.txt > $dir/phones/sets.txt
+  cat $dir/phones/sets.txt | awk '{print "shared", "split", $0;}' > $dir/phones/roots.txt
+fi
+
+cat $srcdir/silence_phones.txt | utils/apply_map.pl $tmpdir/phone_map.txt | \
+  awk '{for(n=1;n<=NF;n++) print $n;}' > $dir/phones/silence.txt
+cat $srcdir/nonsilence_phones.txt | utils/apply_map.pl $tmpdir/phone_map.txt | \
+  awk '{for(n=1;n<=NF;n++) print $n;}' > $dir/phones/nonsilence.txt
+cp $srcdir/optional_silence.txt $dir/phones/optional_silence.txt
+cp $dir/phones/silence.txt $dir/phones/context_indep.txt
+
+cat $srcdir/extra_questions.txt | utils/apply_map.pl $tmpdir/phone_map.txt \
+  >$dir/phones/extra_questions.txt
+
+# Want extra questions about the word-start/word-end stuff. Make it separate for
+# silence and non-silence. Probably doesn't matter, as silence will rarely
+# be inside a word.
+if $position_dependent_phones; then
+  for suffix in _B _E _I _S; do
+    (for x in `cat $srcdir/nonsilence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt
+  done
+  for suffix in "" _B _E _I _S; do
+    (for x in `cat $srcdir/silence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt
+  done
+fi
+
+# add disambig symbols to the lexicon in $tmpdir/lexicon.txt
+# and produce $tmpdir/lexicon_disambig.txt
+
+ndisambig=`utils/add_lex_disambig.pl $tmpdir/lexicon.txt $tmpdir/lexicon_disambig.txt`
+ndisambig=$[$ndisambig+1]; # add one disambig symbol for silence in lexicon FST.
+echo $ndisambig > $tmpdir/lex_ndisambig
+
+# Format of lexicon_disambig.txt:
+# !SIL SIL_S
+# <SPOKEN_NOISE> SPN_S #1
+# <UNK> SPN_S #2
+# <NOISE> NSN_S
+# !EXCLAMATION-POINT EH2_B K_I S_I K_I L_I AH0_I M_I EY1_I SH_I AH0_I N_I P_I OY2_I N_I T_E
+
+( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) >$dir/phones/disambig.txt
+
+# Create phone symbol table.
+echo "<eps>" | cat - $dir/phones/{silence,nonsilence,disambig}.txt | \
+  awk '{n=NR-1; print $1, n;}' > $dir/phones.txt
+
+# Create a file that describes the word-boundary information for
+# each phone. 5 categories.
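+# Illustration only (not from the original script): with
+# position-dependent phones the resulting word_boundary.txt pairs each
+# phone with one of the 5 categories, e.g.:
+#   AA_B begin
+#   AA_E end
+#   AA_I internal
+#   AA_S singleton
+#   SIL nonword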
+if $position_dependent_phones; then
+  cat $dir/phones/{silence,nonsilence}.txt | \
+    awk '/_I$/{print $1, "internal"; next;} /_B$/{print $1, "begin"; next; }
+         /_S$/{print $1, "singleton"; next;} /_E$/{print $1, "end"; next; }
+         {print $1, "nonword";} ' > $dir/phones/word_boundary.txt
+else
+  # word_boundary.txt might have been generated by another source
+  [ -f $srcdir/word_boundary.txt ] && cp $srcdir/word_boundary.txt $dir/phones/word_boundary.txt
+fi
+
+# Create word symbol table.
+cat $tmpdir/lexicon.txt | awk '{print $1}' | sort | uniq | \
+  awk 'BEGIN{print "<eps> 0";} {printf("%s %d\n", $1, NR);} END{printf("#0 %d\n", NR+1);} ' \
+  > $dir/words.txt || exit 1;
+
+# format of $dir/words.txt:
+#<eps> 0
+#!EXCLAMATION-POINT 1
+#!SIL 2
+#"CLOSE-QUOTE 3
+#...
+
+silphone=`cat $srcdir/optional_silence.txt` || exit 1;
+
+# Create the basic L.fst without disambiguation symbols, for use
+# in training.
+utils/make_lexicon_fst.pl $tmpdir/lexicon.txt $sil_prob $silphone | \
+  fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \
+    --keep_isymbols=false --keep_osymbols=false | \
+  fstarcsort --sort_type=olabel > $dir/L.fst || exit 1;
+
+# The file oov.txt contains a word that we will map any OOVs to during
+# training.
+echo "$oov_word" > $dir/oov.txt || exit 1;
+cat $dir/oov.txt | utils/sym2int.pl $dir/words.txt >$dir/oov.int # integer version of oov
+# symbol, used in some scripts.
+
+
+
+# Create these lists of phones in colon-separated integer list form too,
+# for purposes of being given to programs as command-line options.
+for f in silence nonsilence optional_silence disambig context_indep; do
+  utils/sym2int.pl $dir/phones.txt <$dir/phones/$f.txt >$dir/phones/$f.int
+  utils/sym2int.pl $dir/phones.txt <$dir/phones/$f.txt | \
+    awk '{printf(":%d", $1);} END{printf "\n"}' | sed s/:// > $dir/phones/$f.csl || exit 1;
+done
+
+for x in sets extra_questions; do
+  utils/sym2int.pl $dir/phones.txt <$dir/phones/$x.txt > $dir/phones/$x.int || exit 1;
+done
+
+utils/sym2int.pl -f 3- $dir/phones.txt <$dir/phones/roots.txt \
+  > $dir/phones/roots.int || exit 1;
+
+#if $position_dependent_phones; then
+if [ -f $dir/phones/word_boundary.txt ]; then
+  utils/sym2int.pl -f 1 $dir/phones.txt <$dir/phones/word_boundary.txt \
+    > $dir/phones/word_boundary.int || exit 1;
+fi
+
+silphonelist=`cat $dir/phones/silence.csl`
+nonsilphonelist=`cat $dir/phones/nonsilence.csl`
+utils/gen_topo.pl $num_nonsil_states $num_sil_states $nonsilphonelist $silphonelist >$dir/topo
+
+
+# Create the lexicon FST with disambiguation symbols, and put it in lang_test.
+# There is an extra step where we create a loop to "pass through" the
+# disambiguation symbols from G.fst.
+phone_disambig_symbol=`grep \#0 $dir/phones.txt | awk '{print $2}'`
+word_disambig_symbol=`grep \#0 $dir/words.txt | awk '{print $2}'`
+
+utils/make_lexicon_fst.pl $tmpdir/lexicon_disambig.txt $sil_prob $silphone '#'$ndisambig | \
+  fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \
+    --keep_isymbols=false --keep_osymbols=false | \
+  fstaddselfloops "echo $phone_disambig_symbol |" "echo $word_disambig_symbol |" | \
+  fstarcsort --sort_type=olabel > $dir/L_disambig.fst || exit 1;
diff --git a/egs/kaldi-vystadial-recipe/s5/utils/queue.pl b/egs/kaldi-vystadial-recipe/s5/utils/queue.pl
new file mode 100755
index 00000000000..8ceff97f4b6
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/utils/queue.pl
@@ -0,0 +1,263 @@
+#!/usr/bin/perl
+# Copyright 2012 Johns Hopkins University (Author: Daniel Povey).
+# Apache 2.0.
+use File::Basename; +use Cwd; + +# queue.pl has the same functionality as run.pl, except that +# it runs the job in question on the queue (Sun GridEngine). +# This version of queue.pl uses the task array functionality +# of the grid engine. Note: it's different from the queue.pl +# in the s4 and earlier scripts. + +$qsub_opts = ""; +$sync = 0; + +for ($x = 1; $x <= 3; $x++) { # This for-loop is to + # allow the JOB=1:n option to be interleaved with the + # options to qsub. + while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) { + $switch = shift @ARGV; + if ($switch eq "-V") { + $qsub_opts .= "-V "; + } else { + $option = shift @ARGV; + if ($switch eq "-sync" && $option =~ m/^[yY]/) { + $sync = 1; + } + $qsub_opts .= "$switch $option "; + if ($switch eq "-pe") { # e.g. -pe smp 5 + $option2 = shift @ARGV; + $qsub_opts .= "$option2 "; + } + } + } + if ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+):(\d+)$/) { + $jobname = $1; + $jobstart = $2; + $jobend = $3; + shift; + if ($jobstart > $jobend) { + die "queue.pl: invalid job range $ARGV[0]"; + } + } elsif ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+)$/) { # e.g. JOB=1. + $jobname = $1; + $jobstart = $2; + $jobend = $2; + shift; + } elsif ($ARGV[0] =~ m/.+\=.*\:.*$/) { + print STDERR "Warning: suspicious first argument to queue.pl: $ARGV[0]\n"; + } +} + + +if (@ARGV < 2) { + print STDERR + "Usage: queue.pl [options to qsub] [JOB=1:n] log-file command-line arguments...\n" . + "e.g.: queue.pl foo.log echo baz\n" . + " (which will echo \"baz\", with stdout and stderr directed to foo.log)\n" . + "or: queue.pl -q all.q\@xyz foo.log echo bar \| sed s/bar/baz/ \n" . + " (which is an example of using a pipe; you can provide other escaped bash constructs)\n" . + "or: queue.pl -q all.q\@qyz JOB=1:10 foo.JOB.log echo JOB \n" . + " (which illustrates the mechanism to submit parallel jobs; note, you can use \n" . + " another string other than JOB)\n" . + "Note: if you pass the \"-sync y\" option to qsub, this script will take note\n" . + "and change its behavior. Otherwise it uses qstat to work out when the job finished\n"; + exit 1; +} + +$cwd = getcwd(); +$logfile = shift @ARGV; + +if (defined $jobname && $logfile !~ m/$jobname/ + && $jobend > $jobstart) { + print STDERR "run.pl: you are trying to run a parallel job but " + . "you are putting the output into just one log file ($logfile)\n"; + exit(1); +} + +# +# Work out the command; quote escaping is done here. +# Note: the rules for escaping stuff are worked out pretty +# arbitrarily, based on what we want it to do. Some things that +# we pass as arguments to queue.pl, such as "|", we want to be +# interpreted by bash, so we don't escape them. Other things, +# such as archive specifiers like 'ark:gunzip -c foo.gz|', we want +# to be passed, in quotes, to the Kaldi program. Our heuristic +# is that stuff with spaces in should be quoted. This doesn't +# always work. +# +$cmd = ""; + +foreach $x (@ARGV) { + if ($x =~ m/^\S+$/) { $cmd .= $x . " "; } # If string contains no spaces, take + # as-is. + elsif ($x =~ m:\":) { $cmd .= "'\''$x'\'' "; } # else if no dbl-quotes, use single + else { $cmd .= "\"$x\" "; } # else use double. +} + +# +# Work out the location of the script file, and open it for writing. +# +$dir = dirname($logfile); +$base = basename($logfile); +$qdir = "$dir/q"; +$qdir =~ s:/(log|LOG)/*q:/q:; # If qdir ends in .../log/q, make it just .../q. +$queue_logfile = "$qdir/$base"; + +if (!-d $dir) { system "mkdir $dir 2>/dev/null"; } # another job may be doing this... 
+if (!-d $dir) { die "Cannot make the directory $dir\n"; }
+if (!-d "$qdir") { system "mkdir $qdir 2>/dev/null"; } # make a directory called "q",
+# where we will put the log created by qsub... normally this doesn't contain
+# anything interesting, everything goes to $logfile.
+
+if (defined $jobname) { # It's an array job.
+  $queue_array_opt = "-t $jobstart:$jobend";
+  $logfile =~ s/$jobname/\$SGE_TASK_ID/g; # This variable will get
+  # replaced by qsub, in each job, with the job-id.
+  $cmd =~ s/$jobname/\$SGE_TASK_ID/g; # same for the command...
+  $queue_logfile =~ s/\.?$jobname//; # the log file in the q/ subdirectory
+  # is for the queue to put its log, and this doesn't need the task array subscript
+  # so we remove it.
+}
+
+# queue_scriptfile is as $queue_logfile [e.g. dir/q/foo.log] but
+# with the suffix .sh.
+$queue_scriptfile = $queue_logfile;
+($queue_scriptfile =~ s/\.[a-zA-Z]{1,5}$/.sh/) || ($queue_scriptfile .= ".sh");
+if ($queue_scriptfile !~ m:^/:) {
+  $queue_scriptfile = $cwd . "/" . $queue_scriptfile; # just in case.
+}
+
+# We'll write to the standard input of "qsub" (the file-handle Q),
+# the job that we want it to execute.
+# Also keep our current PATH around, just in case there was something
+# in it that we need (although we also source ./path.sh)
+
+$syncfile = "$qdir/done.$$";
+
+system("rm $queue_logfile $syncfile 2>/dev/null");
+
+# The qsub command line we will submit with (it is also echoed into the
+# script below, for reference).
+$qsub_cmd = "qsub -S /bin/bash -v PATH -cwd -j y -o $queue_logfile $qsub_opts $queue_array_opt $queue_scriptfile >>$queue_logfile 2>&1";
+
+#
+# Write to the script file, and then close it.
+#
+open(Q, ">$queue_scriptfile") || die "Failed to write to $queue_scriptfile";
+
+print Q "#!/bin/bash\n";
+print Q "cd $cwd\n";
+print Q ". ./path.sh\n";
+print Q "( echo '#' Running on \`hostname\`\n";
+print Q "  echo '#' Started at \`date\`\n";
+print Q "  echo -n '# '; cat <$logfile\n";
+print Q "  ( $cmd ) 2>>$logfile >>$logfile\n";
+print Q "ret=\$?\n";
+print Q "echo '#' Finished at \`date\` with status \$ret >>$logfile\n";
+if (!defined $jobname) { # not an array job
+  print Q "touch $syncfile\n"; # so we know it's done.
+} else {
+  print Q "touch $syncfile.\$SGE_TASK_ID\n"; # touch a bunch of sync-files.
+}
+print Q "exit \$[\$ret ? 1 : 0]\n"; # avoid status 100 which grid-engine
+print Q "## submitted with:\n";     # treats specially.
+print Q "# $qsub_cmd\n";
+if (!close(Q)) { # close was not successful... || die "Could not close script file $shfile";
+  die "Failed to close the script file (full disk?)";
+}
+
+$ret = system ($qsub_cmd);
+if ($ret != 0) {
+  if ($sync && $ret == 256) { # this is the exit status when a job failed (bad exit status)
+    if (defined $jobname) { $logfile =~ s/\$SGE_TASK_ID/*/g; }
+    print STDERR "queue.pl: job writing to $logfile failed\n";
+  } else {
+    print STDERR "queue.pl: error submitting jobs to queue (return status was $ret)\n";
+    print STDERR `tail $queue_logfile`;
+  }
+  exit(1);
+}
+
+if (! $sync) { # We're not submitting with -sync y, so we
+  # need to wait for the jobs to finish. We wait for the
+  # sync-files we "touched" in the script to exist.
+  @syncfiles = ();
+  if (!defined $jobname) { # not an array job.
+    push @syncfiles, $syncfile;
+  } else {
+    for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) {
+      push @syncfiles, "$syncfile.$jobid";
+    }
+  }
+  $wait = 0.1;
+  foreach $f (@syncfiles) {
+    # wait for them to finish one by one.
+    while (! -f $f) {
+      sleep($wait);
+      $wait *= 1.2;
+      if ($wait > 1.0) {
+        $wait = 1.0; # never wait more than 1 second.
+ } + } + } + $all_syncfiles = join(" ", @syncfiles); + system("rm $all_syncfiles 2>/dev/null"); +} + +# OK, at this point we are synced; we know the job is done. +# But we don't know about its exit status. We'll look at $logfile for this. +# First work out an array @logfiles of file-locations we need to +# read (just one, unless it's an array job). +@logfiles = (); +if (!defined $jobname) { # not an array job. + push @logfiles, $logfile; +} else { + for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) { + $l = $logfile; + $l =~ s/\$SGE_TASK_ID/$jobid/g; + push @logfiles, $l; + } +} + +$num_failed = 0; +foreach $l (@logfiles) { + @wait_times = (0.1, 0.2, 0.2, 0.3, 0.5, 0.5, 1.0, 2.0, 5.0, 5.0, 5.0, 10.0, 25.0); + for ($iter = 0; $iter <= @wait_times; $iter++) { + $line = `tail -1 $l 2>/dev/null`; + if ($line =~ m/with status (\d+)/) { + $status = $1; + last; + } else { + if ($iter < @wait_times) { + sleep($wait_times[$iter]); + } else { + if (! -f $l) { + print STDERR "Log-file $l does not exist.\n"; + } else { + print STDERR "The last line of log-file $l does not seem to indicate the " + . "return status as expected\n"; + } + exit(1); # Something went wrong with the queue, or the + # machine it was running on, probably. + } + } + } + # OK, now we have $status, which is the return-status of + # the command in the job. + if ($status != 0) { $num_failed++; } +} +if ($num_failed == 0) { exit(0); } +else { # we failed. + if (@logfiles == 1) { + if (defined $jobname) { $logfile =~ s/\$SGE_TASK_ID/$jobstart/g; } + print STDERR "queue.pl: job failed with status $status, log is in $logfile\n"; + if ($logfile =~ m/JOB/) { + print STDERR "queue.pl: probably you forgot to put JOB=1:\$nj in your script.\n"; + } + } else { + if (defined $jobname) { $logfile =~ s/\$SGE_TASK_ID/*/g; } + $numjobs = 1 + $jobend - $jobstart; + print STDERR "queue.pl: $num_failed / $numjobs failed, log is in $logfile\n"; + } + exit(1); +} diff --git a/egs/kaldi-vystadial-recipe/s5/utils/remove_oovs.pl b/egs/kaldi-vystadial-recipe/s5/utils/remove_oovs.pl new file mode 100755 index 00000000000..5bcab59840c --- /dev/null +++ b/egs/kaldi-vystadial-recipe/s5/utils/remove_oovs.pl @@ -0,0 +1,43 @@ +#!/usr/bin/perl +# Copyright 2010-2011 Microsoft Corporation + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +# This script removes lines that contain these OOVs on either the +# third or fourth fields of the line. It is intended to remove arcs +# with OOVs on, from FSTs (probably compiled from ARPAs with OOVs in). 
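+# For example (illustration only, assuming unk_list.txt contains the
+# single word FOO): an arc line of a printed G.fst such as
+#   0 7 FOO FOO 3.25
+# has FOO in its third field, so the line is removed; arcs whose third
+# and fourth fields are in-vocabulary words pass through unchanged.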
+
+if ( @ARGV < 1 || @ARGV > 2) {
+  die "Usage: remove_oovs.pl unk_list.txt [ printed-fst ]\n";
+}
+
+$unklist = shift @ARGV;
+open(S, "<$unklist") || die "Failed opening unknown-symbol list $unklist\n";
+while(<S>){
+  @A = split(" ", $_);
+  @A == 1 || die "Bad line in unknown-symbol list: $_";
+  $unk{$A[0]} = 1;
+}
+
+$num_removed = 0;
+while(<>){
+  @A = split(" ", $_);
+  if(defined $unk{$A[2]} || defined $unk{$A[3]}) {
+    $num_removed++;
+  } else {
+    print;
+  }
+}
+print STDERR "remove_oovs.pl: removed $num_removed lines.\n";
+
diff --git a/egs/kaldi-vystadial-recipe/s5/utils/rnnlm_compute_scores.sh b/egs/kaldi-vystadial-recipe/s5/utils/rnnlm_compute_scores.sh
new file mode 100755
index 00000000000..d904fdc995f
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/utils/rnnlm_compute_scores.sh
@@ -0,0 +1,69 @@
+#!/bin/bash
+
+# Compute scores from RNNLM. This script takes a directory
+# $dir (e.g. dir=local/rnnlm/rnnlm.voc30.hl30 ),
+# where it expects the files:
+# rnnlm wordlist.rnn unk.probs,
+# and also an input file location where it can get the sentences to score, and
+# an output file location to put the scores (negated logprobs) for each
+# sentence. This script uses the Kaldi-style "archive" format, so the input and
+# output files will have a first field that corresponds to some kind of
+# utterance-id or, in practice, utterance-id-1, utterance-id-2, etc., for the
+# N-best list.
+#
+# Here, "wordlist.rnn" is the set of words, like a vocabulary,
+# that the RNN was trained on (note, it won't include <s> or </s>),
+# plus <RNN_UNK> which is a kind of class where we put low-frequency
+# words; unk.probs gives the probs for words given this class, and it
+# has, on each line, "word prob".
+
+. ./path.sh || exit 1;
+
+rnnlm=$KALDI_ROOT/tools/rnnlm-0.3e/rnnlm
+
+[ ! -f $rnnlm ] && echo No such program $rnnlm && exit 1;
+
+if [ $# != 4 ]; then
+  echo "Usage: rnnlm_compute_scores.sh <rnn-dir> <temp-dir> <input-text> <output-scores>"
+  exit 1;
+fi
+
+dir=$1
+tempdir=$2
+text_in=$3
+scores_out=$4
+
+for x in rnnlm wordlist.rnn unk.probs; do
+  if [ ! -f $dir/$x ]; then
+    echo "rnnlm_compute_scores.sh: expected file $dir/$x to exist."
+    exit 1;
+  fi
+done
+
+mkdir -p $tempdir
+cat $text_in | awk '{for (x=2;x<=NF;x++) {printf("%s ", $x)} printf("\n");}' >$tempdir/text
+cat $text_in | awk '{print $1}' > $tempdir/ids # e.g. utterance ids.
+cat $tempdir/text | awk -v voc=$dir/wordlist.rnn -v unk=$dir/unk.probs \
+  -v logprobs=$tempdir/loglikes.oov \
+  'BEGIN{ while((getline<voc)>0) { invoc[$1]=1; } while ((getline<unk)>0){ unkprob[$1]=$2;} }
+   { logprob=0; for (x=1;x<=NF;x++) { w=$x;
+     if (invoc[w]) { printf("%s ",w); } else {
+       printf("<RNN_UNK> ");
+       if (unkprob[w] != 0) { logprob += log(unkprob[w]); }
+       else { print "Warning: unknown word ", w >"/dev/stderr"; logprob += log(1.0e-07); }}}
+     printf("\n"); print logprob > logprobs } ' > $tempdir/text.nounk
+
+# OK, now we compute the scores on the text with OOVs replaced
+# with <RNN_UNK>
+
+$rnnlm -independent -rnnlm $dir/rnnlm -test $tempdir/text.nounk -nbest -debug 0 | \
+  awk '{print $1*log(10);}' > $tempdir/loglikes.rnn
+
+[ `cat $tempdir/loglikes.rnn | wc -l` -ne `cat $tempdir/loglikes.oov | wc -l` ] && \
+  echo "rnnlm rescoring failed" && exit 1;
+
+paste $tempdir/loglikes.rnn $tempdir/loglikes.oov | awk '{print -($1+$2);}' >$tempdir/scores
+
+# scores out, with utterance-ids.
+paste $tempdir/ids $tempdir/scores > $scores_out + diff --git a/egs/kaldi-vystadial-recipe/s5/utils/run.pl b/egs/kaldi-vystadial-recipe/s5/utils/run.pl new file mode 100755 index 00000000000..efb2ed4f8f9 --- /dev/null +++ b/egs/kaldi-vystadial-recipe/s5/utils/run.pl @@ -0,0 +1,123 @@ +#!/usr/bin/perl -w + +# In general, doing +# run.pl some.log a b c is like running the command a b c in +# the bash shell, and putting the standard error and output into some.log. +# To run parallel jobs (backgrounded on the host machine), you can do (e.g.) +# run.pl JOB=1:4 some.JOB.log a b c JOB is like running the command a b c JOB +# and putting it in some.JOB.log, for each one. [Note: JOB can be any identifier]. +# If any of the jobs fails, this script will fail. + +# A typical example is: +# run.pl some.log my-prog "--opt=foo bar" foo \| other-prog baz +# and run.pl will run something like: +# ( my-prog '--opt=foo bar' foo | other-prog baz ) >& some.log +# +# Basically it takes the command-line arguments, quotes them +# as necessary to preserve spaces, and evaluates them with bash. +# In addition it puts the command line at the top of the log, and +# the start and end times of the command at the beginning and end. +# The reason why this is useful is so that we can create a different +# version of this program that uses a queueing system instead. + +@ARGV < 2 && die "usage: run.pl log-file command-line arguments..."; + +$jobstart=1; +$jobend=1; + +# First parse an option like JOB=1:4 + +if (@ARGV > 0) { + if ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+):(\d+)$/) { + $jobname = $1; + $jobstart = $2; + $jobend = $3; + shift; + if ($jobstart > $jobend) { + die "queue.pl: invalid job range $ARGV[0]"; + } + } elsif ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+)$/) { # e.g. JOB=1. + $jobname = $1; + $jobstart = $2; + $jobend = $2; + shift; + } elsif ($ARGV[0] =~ m/.+\=.*\:.*$/) { + print STDERR "Warning: suspicious first argument to queue.pl: $ARGV[0]\n"; + } +} + +$logfile = shift @ARGV; + +if (defined $jobname && $logfile !~ m/$jobname/ && + $jobend > $jobstart) { + print STDERR "run.pl: you are trying to run a parallel job but " + . "you are putting the output into just one log file ($logfile)\n"; + exit(1); +} + +$cmd = ""; + +foreach $x (@ARGV) { + if ($x =~ m/^\S+$/) { $cmd .= $x . " "; } + elsif ($x =~ m:\":) { $cmd .= "'$x' "; } + else { $cmd .= "\"$x\" "; } +} + + +for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) { + $childpid = fork(); + if (!defined $childpid) { die "Error forking in run.pl (writing to $logfile)"; } + if ($childpid == 0) { # We're in the child... this branch + # executes the job and returns (possibly with an error status). + if (defined $jobname) { + $cmd =~ s/$jobname/$jobid/g; + $logfile =~ s/$jobname/$jobid/g; + } + system("mkdir -p `dirname $logfile` 2>/dev/null"); + open(F, ">$logfile") || die "Error opening log file $logfile"; + print F "# " . $cmd . "\n"; + print F "# Started at " . `date`; + $starttime = `date +'%s'`; + print F "#\n"; + close(F); + + # Pipe into bash.. make sure we're not using any other shell. + open(B, "|bash") || die "Error opening shell command"; + print B "( " . $cmd . ") 2>>$logfile >> $logfile"; + close(B); # If there was an error, exit status is in $? + $ret = $?; + + $endtime = `date +'%s'`; + open(F, ">>$logfile") || die "Error opening log file $logfile (again)"; + $enddate = `date`; + chop $enddate; + print F "# Ended (code $ret) at " . $enddate . ", elapsed time " . ($endtime-$starttime) . " seconds\n"; + close(F); + exit($ret == 0 ? 
0 : 1);
+  }
+}
+
+$ret = 0;
+$numfail = 0;
+for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) {
+  $r = wait();
+  if ($r == -1) { die "Error waiting for child process"; } # should never happen.
+  if ($? != 0) { $numfail++; $ret = 1; } # The child process failed.
+}
+
+if ($ret != 0) {
+  $njobs = $jobend - $jobstart + 1;
+  if ($njobs == 1) {
+    print STDERR "run.pl: job failed, log is in $logfile\n";
+    if ($logfile =~ m/JOB/) {
+      print STDERR "run.pl: probably you forgot to put JOB=1:\$nj in your script.\n";
+    }
+  }
+  else {
+    $logfile =~ s/$jobname/*/g;
+    print STDERR "run.pl: $numfail / $njobs failed, log is in $logfile\n";
+  }
+}
+
+
+exit ($ret);
diff --git a/egs/kaldi-vystadial-recipe/s5/utils/s2eps.pl b/egs/kaldi-vystadial-recipe/s5/utils/s2eps.pl
new file mode 100755
index 00000000000..de993db67f7
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/utils/s2eps.pl
@@ -0,0 +1,27 @@
+#!/usr/bin/perl
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# This script replaces <s> and </s> with <eps> (on both input and output sides),
+# for the G.fst acceptor.
+
+while(<>){
+  @A = split(" ", $_);
+  if ( @A >= 4 ) {
+    if ($A[2] eq "<s>" || $A[2] eq "</s>") { $A[2] = "<eps>"; }
+    if ($A[3] eq "<s>" || $A[3] eq "</s>") { $A[3] = "<eps>"; }
+  }
+  print join("\t", @A) . "\n";
+}
diff --git a/egs/kaldi-vystadial-recipe/s5/utils/shuffle_list.pl b/egs/kaldi-vystadial-recipe/s5/utils/shuffle_list.pl
new file mode 100755
index 00000000000..3144c263053
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/utils/shuffle_list.pl
@@ -0,0 +1,31 @@
+#!/usr/bin/perl
+
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# seeding is optional...
+if ($#ARGV==0) {
+  srand($ARGV[0]);
+} else {
+  srand(0); # Seems to give inconsistent behavior if we don't seed.
+}
+
+
+# This script shuffles lines of a list.
+# The list is read from stdin and written to stdout.
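+# Example usage (illustration only):
+#   utils/shuffle_list.pl 777 < train.scp > train_shuffled.scp
+# shuffles train.scp with seed 777; with no seed argument srand(0) is
+# used, so repeated runs produce the same "random" order.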
+@X = <STDIN>;
+@X = sort { rand() <=> rand() } @X;
+print @X;
diff --git a/egs/kaldi-vystadial-recipe/s5/utils/spk2utt_to_utt2spk.pl b/egs/kaldi-vystadial-recipe/s5/utils/spk2utt_to_utt2spk.pl
new file mode 100755
index 00000000000..ca8a6a1249c
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/utils/spk2utt_to_utt2spk.pl
@@ -0,0 +1,27 @@
+#!/usr/bin/perl
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+while(<>){
+  @A = split(" ", $_);
+  @A > 1 || die "Invalid line in spk2utt file: $_";
+  $s = shift @A;
+  foreach $u ( @A ) {
+    print "$u $s\n";
+  }
+}
+
+
diff --git a/egs/kaldi-vystadial-recipe/s5/utils/split_data.sh b/egs/kaldi-vystadial-recipe/s5/utils/split_data.sh
new file mode 100755
index 00000000000..bee31a29643
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/utils/split_data.sh
@@ -0,0 +1,97 @@
+#!/bin/bash
+# Copyright 2010-2012 Microsoft Corporation; Johns Hopkins University (Author: Daniel Povey)
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+split_per_spk=true
+if [ "$1" == "--per-utt" ]; then
+  split_per_spk=false
+  shift
+fi
+
+if [ $# != 2 ]; then
+  echo "Usage: split_data.sh [--per-utt] <data-dir> <num-to-split>"
+  exit 1
+fi
+
+data=$1
+numsplit=$2
+
+if [ $numsplit -le 0 ]; then
+  echo "Invalid num-split argument $numsplit";
+  exit 1;
+fi
+
+n=0;
+feats=""
+wavs=""
+utt2spks=""
+texts=""
+
+nu=`cat $data/utt2spk | wc -l`
+nf=`cat $data/feats.scp | wc -l`
+nt=`cat $data/text | wc -l`
+if [ $nu -ne $nf ]; then
+  echo "split_data.sh: warning: #lines in (utt2spk,feats.scp) is ($nu,$nf); this script "
+  echo " may produce incorrectly split data."
+  echo "Use utils/fix_data_dir.sh to fix this."
+fi
+if [ $nt -ne 0 -a $nu -ne $nt ]; then
+  echo "split_data.sh: warning: #lines in (utt2spk,text) is ($nu,$nt); this script "
+  echo " may produce incorrectly split data."
+  echo "Use utils/fix_data_dir.sh to fix this."
+fi
+
+# `utils/get_split.pl` returns "0 1 2 3" or "00 01 .. 18 19" or whatever.
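+# For example, "split_data.sh data/train 4" (a hypothetical data dir)
+# creates data/train/split4/{1,2,3,4}, each holding its share of
+# feats.scp, text and utt2spk, as the loop below shows.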
+for n in `seq $numsplit`; do
+  mkdir -p $data/split$numsplit/$n
+  feats="$feats $data/split$numsplit/$n/feats.scp"
+  texts="$texts $data/split$numsplit/$n/text"
+  utt2spks="$utt2spks $data/split$numsplit/$n/utt2spk"
+done
+
+if $split_per_spk; then
+  utt2spk_opt="--utt2spk=$data/utt2spk"
+else
+  utt2spk_opt=
+fi
+
+utils/split_scp.pl $utt2spk_opt $data/utt2spk $utt2spks || exit 1
+
+utils/split_scp.pl $utt2spk_opt $data/feats.scp $feats || exit 1
+[ -f $data/text ] && \
+  utils/split_scp.pl $utt2spk_opt $data/text $texts
+
+for n in `seq $numsplit`; do
+  dsn=$data/split$numsplit/$n
+  utils/utt2spk_to_spk2utt.pl $dsn/utt2spk > $dsn/spk2utt || exit 1;
+  # for completeness, also split the spk2gender file
+  [ -f $data/spk2gender ] && \
+    utils/filter_scp.pl $dsn/spk2utt $data/spk2gender > $dsn/spk2gender
+  [ -f $data/cmvn.scp ] && \
+    utils/filter_scp.pl $dsn/spk2utt $data/cmvn.scp > $dsn/cmvn.scp
+  if [ -f $data/segments ]; then
+    utils/filter_scp.pl $dsn/utt2spk $data/segments > $dsn/segments
+    awk '{print $2;}' $dsn/segments |sort|uniq > $data/tmp.reco # recording-ids.
+    [ -f $data/reco2file_and_channel ] &&
+      utils/filter_scp.pl $data/tmp.reco $data/reco2file_and_channel > $dsn/reco2file_and_channel
+    [ -f $data/wav.scp ] && utils/filter_scp.pl $data/tmp.reco $data/wav.scp > $dsn/wav.scp
+    rm $data/tmp.reco
+  else # else wav is indexed by utterance -> filter on this.
+    [ -f $data/wav.scp ] &&
+      utils/filter_scp.pl $dsn/utt2spk $data/wav.scp > $dsn/wav.scp
+  fi
+done
+
+exit 0
diff --git a/egs/kaldi-vystadial-recipe/s5/utils/split_scp.pl b/egs/kaldi-vystadial-recipe/s5/utils/split_scp.pl
new file mode 100755
index 00000000000..18abcdb2fb1
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/utils/split_scp.pl
@@ -0,0 +1,221 @@
+#!/usr/bin/perl -w
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+
+# This program splits up any kind of .scp or archive-type file.
+# If there is no utt2spk option it will work on any text file and
+# will split it up with an approximately equal number of lines in
+# each.
+# With the --utt2spk option it will work on anything that has the
+# utterance-id as the first entry on each line; the utt2spk file is
+# of the form "utterance speaker" (on each line).
+# It splits it into equal-size chunks as far as it can. If you use
+# the utt2spk option it will make sure these chunks coincide with
+# speaker boundaries. In this case, if there are more chunks
+# than speakers (and in some other circumstances), some of the
+# resulting chunks will be empty and it
+# will print a warning.
+# You will normally call this like:
+# split_scp.pl scp scp.1 scp.2 scp.3 ...
+# or
+# split_scp.pl --utt2spk=utt2spk scp scp.1 scp.2 scp.3 ...
+# Note that you can use this script to split the utt2spk file itself,
+# e.g. split_scp.pl --utt2spk=utt2spk utt2spk utt2spk.1 utt2spk.2 ...
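+# A worked example of the case without --utt2spk (sizes are hypothetical):
+# a 10-line scp split into 3 outputs gets 4, 3 and 3 lines, since the
+# first (10 mod 3) = 1 output file receives one extra line; see the
+# else-branch at the bottom of this script.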
+
+# You can also call the scripts like:
+# split_scp.pl -j 3 0 scp scp.0
+# [note: with this option, it assumes zero-based indexing of the split parts,
+# i.e. the second number must be 0 <= n < num-jobs.]
+
+$num_jobs = 0;
+$job_id = 0;
+$utt2spk_file = "";
+
+for ($x = 1; $x <= 2; $x++) {
+  if ($ARGV[0] eq "-j") {
+    shift @ARGV;
+    $num_jobs = shift @ARGV;
+    $job_id = shift @ARGV;
+    if ($num_jobs <= 0 || $job_id < 0 || $job_id >= $num_jobs) {
+      die "Invalid num-jobs and job-id: $num_jobs and $job_id";
+    }
+  }
+  if ($ARGV[0] =~ "--utt2spk=(.+)") {
+    $utt2spk_file=$1;
+    shift;
+  }
+}
+
+if(($num_jobs == 0 && @ARGV < 2) || ($num_jobs > 0 && (@ARGV < 1 || @ARGV > 2))) {
+  die "Usage: split_scp.pl [--utt2spk=<utt2spk_file>] in.scp out1.scp out2.scp ... \n" .
+      " or: split_scp.pl -j num-jobs job-id [--utt2spk=<utt2spk_file>] in.scp [out.scp]\n" .
+      " ... where 0 <= job-id < num-jobs.";
+}
+
+$error = 0;
+$inscp = shift @ARGV;
+if ($num_jobs == 0) { # without -j option
+  @OUTPUTS = @ARGV;
+} else {
+  for ($j = 0; $j < $num_jobs; $j++) {
+    if ($j == $job_id) {
+      if (@ARGV > 0) { push @OUTPUTS, $ARGV[0]; }
+      else { push @OUTPUTS, "-"; }
+    } else {
+      push @OUTPUTS, "/dev/null";
+    }
+  }
+}
+
+if ($utt2spk_file ne "") { # We have the --utt2spk option...
+  open(U, "<$utt2spk_file") || die "Failed to open utt2spk file $utt2spk_file";
+  while(<U>) {
+    @A = split;
+    @A == 2 || die "Bad line $_ in utt2spk file $utt2spk_file";
+    ($u,$s) = @A;
+    $utt2spk{$u} = $s;
+  }
+  open(I, "<$inscp") || die "Opening input scp file $inscp";
+  @spkrs = ();
+  while(<I>) {
+    @A = split;
+    if(@A == 0) { die "Empty or space-only line in scp file $inscp"; }
+    $u = $A[0];
+    $s = $utt2spk{$u};
+    if(!defined $s) { die "No such utterance $u in utt2spk file $utt2spk_file"; }
+    if(!defined $spk_count{$s}) {
+      push @spkrs, $s;
+      $spk_count{$s} = 0;
+      $spk_data{$s} = "";
+    }
+    $spk_count{$s}++;
+    $spk_data{$s} = $spk_data{$s} . $_;
+  }
+  # Now split as equally as possible ..
+  # First allocate spks to files by allocating an approximately
+  # equal number of speakers.
+  $numspks = @spkrs;   # number of speakers.
+  $numscps = @OUTPUTS; # number of output files.
+  for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
+    $scparray[$scpidx] = []; # [] is array reference.
+  }
+  for ($spkidx = 0; $spkidx < $numspks; $spkidx++) {
+    $scpidx = int(($spkidx*$numscps) / $numspks);
+    $spk = $spkrs[$spkidx];
+    push @{$scparray[$scpidx]}, $spk;
+    $scpcount[$scpidx] += $spk_count{$spk};
+  }
+
+  # Now we will try to reassign beginning + ending speakers
+  # to different scp's and see if it gets more balanced.
+  # Suppose the objective function we're minimizing is
+  # sum_i (num utts in scp[i] - average)^2.
+  # We can show that when considering changing just 2 scp's, we minimize
+  # this by minimizing the squared difference in sizes. This is
+  # equivalent to minimizing the absolute difference in sizes. This
+  # shows this method is bound to converge.
+
+  $changed = 1;
+  while($changed) {
+    $changed = 0;
+    for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
+      # First try to reassign ending spk of this scp.
+      if($scpidx < $numscps-1) {
+        $sz = @{$scparray[$scpidx]};
+        if($sz > 0) {
+          $spk = $scparray[$scpidx]->[$sz-1];
+          $count = $spk_count{$spk};
+          $nutt1 = $scpcount[$scpidx];
+          $nutt2 = $scpcount[$scpidx+1];
+          if( abs( ($nutt2+$count) - ($nutt1-$count))
+              < abs($nutt2 - $nutt1)) { # Would decrease
+            # size-diff by reassigning spk...
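+            # e.g. with sizes 12 and 4 and a boundary speaker holding 3
+            # utterances (hypothetical numbers), the swap gives 9 and 7:
+            # |9-7| < |12-4|, so $changed is set and the loop repeats.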
+            $scpcount[$scpidx+1] += $count;
+            $scpcount[$scpidx] -= $count;
+            pop @{$scparray[$scpidx]};
+            unshift @{$scparray[$scpidx+1]}, $spk;
+            $changed = 1;
+          }
+        }
+      }
+      if($scpidx > 0 && @{$scparray[$scpidx]} > 0) {
+        $spk = $scparray[$scpidx]->[0];
+        $count = $spk_count{$spk};
+        $nutt1 = $scpcount[$scpidx-1];
+        $nutt2 = $scpcount[$scpidx];
+        if( abs( ($nutt2-$count) - ($nutt1+$count))
+            < abs($nutt2 - $nutt1)) { # Would decrease
+          # size-diff by reassigning spk...
+          $scpcount[$scpidx-1] += $count;
+          $scpcount[$scpidx] -= $count;
+          shift @{$scparray[$scpidx]};
+          push @{$scparray[$scpidx-1]}, $spk;
+          $changed = 1;
+        }
+      }
+    }
+  }
+  # Now print out the files...
+  for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
+    $scpfn = $OUTPUTS[$scpidx];
+    open(F, ">$scpfn") || die "Could not open scp file $scpfn for writing.";
+    $count = 0;
+    if(@{$scparray[$scpidx]} == 0) {
+      print STDERR "Error: split_scp.pl producing empty .scp file $scpfn (too many splits and too few speakers?)\n";
+      $error = 1;
+    } else {
+      foreach $spk ( @{$scparray[$scpidx]} ) {
+        print F $spk_data{$spk};
+        $count += $spk_count{$spk};
+      }
+      if($count != $scpcount[$scpidx]) { die "Count mismatch [code error]"; }
+    }
+    close(F);
+  }
+} else {
+  # This block is the "normal" case where there is no --utt2spk
+  # option and we just break into equal-size chunks.
+
+  open(I, "<$inscp") || die "Opening input scp file $inscp";
+
+  $numscps = @OUTPUTS; # size of array.
+  @F = ();
+  while(<I>) {
+    push @F, $_;
+  }
+  $numlines = @F;
+  if($numlines == 0) {
+    print STDERR "split_scp.pl: error: empty input scp file $inscp";
+    $error = 1;
+  }
+  $linesperscp = int( $numlines / $numscps); # the "whole part"..
+  $linesperscp >= 1 || die "You are splitting into too many pieces!";
+  $remainder = $numlines - ($linesperscp * $numscps);
+  ($remainder >= 0 && $remainder < $numlines) || die "bad remainder $remainder";
+  # [just doing int() rounds down].
+  $n = 0;
+  for($scpidx = 0; $scpidx < @OUTPUTS; $scpidx++) {
+    $scpfile = $OUTPUTS[$scpidx];
+    open(O, ">$scpfile") || die "Opening output scp file $scpfile";
+    for($k = 0; $k < $linesperscp + ($scpidx < $remainder ? 1 : 0); $k++) {
+      print O $F[$n++];
+    }
+    close(O) || die "Closing scp file $scpfile";
+  }
+  $n == $numlines || die "split_scp.pl: code error: $n != $numlines";
+}
+
+exit ($error ? 1 : 0);
diff --git a/egs/kaldi-vystadial-recipe/s5/utils/subset_data_dir.sh b/egs/kaldi-vystadial-recipe/s5/utils/subset_data_dir.sh
new file mode 100755
index 00000000000..6afb49481db
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/utils/subset_data_dir.sh
@@ -0,0 +1,119 @@
+#!/bin/bash
+# Copyright 2010-2012 Microsoft Corporation; Johns Hopkins University (Author: Daniel Povey)
+# Apache 2.0
+
+
+# This script operates on a directory, such as in data/train/,
+# that contains some subset of the following files:
+#  feats.scp
+#  wav.scp
+#  spk2utt
+#  utt2spk
+#  text
+# It creates a subset of that data, consisting of some specified
+# number of utterances. (The selected utterances are distributed
+# evenly throughout the file, by the program ./subset_scp.pl).
+
+
+# There are four options, mutually exclusive:
+
+# If you give the --per-spk option, it will attempt to select the supplied
+# number of utterances for each speaker (typically you would supply a much
+# smaller number in this case).
+
+# If you give the --shortest option, it will give you the n shortest utterances.
+
+# If you give the --first option it will just give you the n first utterances.
+
+# If you give the --last option it will just give you the n last utterances.
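+# Example invocations (data-dir names are hypothetical):
+#   utils/subset_data_dir.sh data/train 1000 data/train_1k
+#   utils/subset_data_dir.sh --per-spk data/train 10 data/train_10perspk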
+
+shortest=false
+perspk=false
+first_opt=""
+
+if [ "$1" == "--per-spk" ]; then
+  perspk=true;
+  shift;
+elif [ "$1" == "--shortest" ]; then
+  shortest=true;
+  shift;
+elif [ "$1" == "--first" ]; then
+  first_opt="--first";
+  shift;
+elif [ "$1" == "--last" ]; then
+  first_opt="--last";
+  shift;
+fi
+
+
+
+if [ $# != 3 ]; then
+  echo "Usage: subset_data_dir.sh [--per-spk|--shortest|--first|--last] <srcdir> <num-utt> <destdir>"
+  exit 1;
+fi
+
+srcdir=$1
+numutt=$2
+destdir=$3
+
+
+if [ ! -f $srcdir/utt2spk ]; then
+  echo "subset_data_dir.sh: no such file $srcdir/utt2spk"
+  exit 1;
+fi
+
+
+function do_filtering {
+  # assumes the utt2spk and spk2utt files already exist.
+  [ -f $srcdir/feats.scp ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/feats.scp >$destdir/feats.scp
+  [ -f $srcdir/wav.scp ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/wav.scp >$destdir/wav.scp
+  [ -f $srcdir/text ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/text >$destdir/text
+  [ -f $srcdir/spk2gender ] && utils/filter_scp.pl $destdir/spk2utt <$srcdir/spk2gender >$destdir/spk2gender
+  [ -f $srcdir/cmvn.scp ] && utils/filter_scp.pl $destdir/spk2utt <$srcdir/cmvn.scp >$destdir/cmvn.scp
+  if [ -f $srcdir/segments ]; then
+    utils/filter_scp.pl $destdir/utt2spk <$srcdir/segments >$destdir/segments
+    awk '{print $2;}' $destdir/segments | sort | uniq > $destdir/reco # recordings.
+    # The next line overrides the wav.scp created above (filtered on utt2spk),
+    # which is what we want here, since with a segments file wav.scp is
+    # indexed by recording-id rather than by utterance.
+    [ -f $srcdir/wav.scp ] && utils/filter_scp.pl $destdir/reco <$srcdir/wav.scp >$destdir/wav.scp
+    [ -f $srcdir/reco2file_and_channel ] && \
+      utils/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel
+    rm $destdir/reco
+  fi
+  srcutts=`cat $srcdir/utt2spk | wc -l`
+  destutts=`cat $destdir/utt2spk | wc -l`
+  echo "Retained $numutt utterances per speaker from data-dir $srcdir and put it in $destdir, reducing #utt from $srcutts to $destutts"
+}
+
+
+## scripting note: $perspk evaluates to true or false
+## so this becomes the command true or false.
+if $perspk; then
+  mkdir -p $destdir
+  awk '{ n='$numutt'; printf("%s ",$1); skip=1; while(n*(skip+1) <= NF-1) { skip++; }
+         for(x=2; x<=NF && x <= n*skip; x += skip) { printf("%s ", $x); }
+         printf("\n"); }' <$srcdir/spk2utt >$destdir/spk2utt
+  utils/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk
+  do_filtering; # bash function.
+  exit 0;
+else
+  if [ $numutt -gt `cat $srcdir/feats.scp | wc -l` ]; then
+    echo "subset_data_dir.sh: cannot subset to more utterances than you originally had."
+    exit 1;
+  fi
+  mkdir -p $destdir || exit 1;
+
+  ## scripting note: $shortest evaluates to true or false
+  ## so this becomes the command true or false.
+  if $shortest; then
+    # select the n shortest utterances.
+    . ./path.sh
+    [ ! -f $srcdir/feats.scp ] && echo "$0: you selected --shortest but no feats.scp exists." && exit 1;
+    feat-to-len scp:$srcdir/feats.scp ark,t:$destdir/tmp.len || exit 1;
+    sort -n -k2 $destdir/tmp.len | awk '{print $1}' | head -$numutt >$destdir/tmp.uttlist
+    utils/filter_scp.pl $destdir/tmp.uttlist $srcdir/utt2spk >$destdir/utt2spk
+    rm $destdir/tmp.uttlist $destdir/tmp.len
+  else
+    utils/subset_scp.pl $first_opt $numutt $srcdir/utt2spk > $destdir/utt2spk || exit 1;
+  fi
+  utils/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt
+  do_filtering;
+  exit 0;
+fi
diff --git a/egs/kaldi-vystadial-recipe/s5/utils/subset_scp.pl b/egs/kaldi-vystadial-recipe/s5/utils/subset_scp.pl
new file mode 100755
index 00000000000..2ee3b338027
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/utils/subset_scp.pl
@@ -0,0 +1,84 @@
+#!/usr/bin/perl -w
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# This program selects a subset of N elements in the scp.
+
+# By default, it selects them evenly from throughout the scp, in order to avoid
+# selecting too many from the same speaker. It prints them on the standard
+# output.
+# With the option --first, it just selects the N first utterances; with
+# --last, the N last ones.
+
+
+$first = 0;
+$last = 0;
+if ($ARGV[0] eq "--first") {
+  shift;
+  $first = 1;
+}
+if ($ARGV[0] eq "--last") {
+  shift;
+  $last = 1;
+}
+
+if(@ARGV < 2 ) {
+  die "Usage: subset_scp.pl [--first|--last] N in.scp ";
+}
+
+$N = shift @ARGV;
+if($N == 0) {
+  die "First command-line parameter to subset_scp.pl must be an integer, got \"$N\"";
+}
+$inscp = shift @ARGV;
+open(I, "<$inscp") || die "Opening input scp file $inscp";
+
+@F = ();
+while(<I>) {
+  push @F, $_;
+}
+$numlines = @F;
+if($N > $numlines) {
+  die "You requested from subset_scp.pl more elements than available: $N > $numlines";
+}
+
+sub select_n {
+  my ($start,$end,$num_needed) = @_;
+  my $diff = $end - $start;
+  if($num_needed > $diff) { die "select_n: code error"; }
+  if($diff == 1 ) {
+    if($num_needed > 0) {
+      print $F[$start];
+    }
+  } else {
+    my $halfdiff = int($diff/2);
+    my $halfneeded = int($num_needed/2);
+    select_n($start, $start+$halfdiff, $halfneeded);
+    select_n($start+$halfdiff, $end, $num_needed - $halfneeded);
+  }
+}
+
+if ( ! $first && ! $last) {
+  select_n(0, $numlines, $N);
+} else {
+  if ($first) { # --first option: same as head.
+    for ($n = 0; $n < $N; $n++) {
+      print $F[$n];
+    }
+  } else { # --last option: same as tail.
+    for ($n = @F - $N; $n < @F; $n++) {
+      print $F[$n];
+    }
+  }
+}
diff --git a/egs/kaldi-vystadial-recipe/s5/utils/summarize_warnings.pl b/egs/kaldi-vystadial-recipe/s5/utils/summarize_warnings.pl
new file mode 100755
index 00000000000..ccbeb4186b9
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/utils/summarize_warnings.pl
@@ -0,0 +1,46 @@
+#!/usr/bin/perl
+
+# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
+
+@ARGV != 1 && print STDERR "Usage: summarize_warnings.pl <log-dir>\n" && exit 1;
+
+$dir = $ARGV[0];
+
+! -d $dir && print STDERR "summarize_warnings.pl: no such directory $dir\n" && exit 1;
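+# Typical use (the exp path is hypothetical):
+#   utils/summarize_warnings.pl exp/tri1/log
+# prints one "N warnings in <pattern>" line per family of log files.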
+
+$dir =~ s:/$::; # Remove trailing slash.
+
+
+# Group the files into categories where all have the same base-name.
+foreach $f (glob ("$dir/*.log")) {
+  $f_category = $f;
+  # do next expression twice; s///g doesn't work as they overlap.
+  $f_category =~ s:\.\d+\.:.*.:;
+  $f_category =~ s:\.\d+\.:.*.:;
+  $fmap{$f_category} .= " $f";
+}
+
+sub split_hundreds { # split list of filenames into groups of 100.
+  my $names = shift @_;
+  my @A = split(" ", $names);
+  my @ans = ();
+  while (@A > 0) {
+    my $group = "";
+    for ($x = 0; $x < 100 && @A>0; $x++) {
+      $fname = pop @A;
+      $group .= "$fname ";
+    }
+    push @ans, $group;
+  }
+  return @ans;
+}
+
+foreach $c (keys %fmap) {
+  $n = 0;
+  foreach $fgroup (split_hundreds($fmap{$c})) {
+    $n += `grep -w WARNING $fgroup | wc -l`;
+  }
+  if ($n != 0) {
+    print "$n warnings in $c\n"
+  }
+}
diff --git a/egs/kaldi-vystadial-recipe/s5/utils/sym2int.pl b/egs/kaldi-vystadial-recipe/s5/utils/sym2int.pl
new file mode 100755
index 00000000000..f7334b7b4dd
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/utils/sym2int.pl
@@ -0,0 +1,99 @@
+#!/usr/bin/perl
+# Copyright 2010-2012 Microsoft Corporation; Johns Hopkins University (Author: Daniel Povey)
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+$ignore_oov = 0;
+$ignore_first_field = 0;
+for($x = 0; $x < 2; $x++) {
+  if ($ARGV[0] eq "--map-oov") {
+    shift @ARGV; $map_oov = shift @ARGV;
+  }
+  if ($ARGV[0] eq "-f") {
+    shift @ARGV;
+    $field_spec = shift @ARGV;
+    if ($field_spec =~ m/^\d+$/) {
+      $field_begin = $field_spec - 1; $field_end = $field_spec - 1;
+    }
+    if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesy (properly, 1-10)
+      if ($1 ne "") {
+        $field_begin = $1 - 1; # Change to zero-based indexing.
+      }
+      if ($2 ne "") {
+        $field_end = $2 - 1; # Change to zero-based indexing.
+      }
+    }
+    if (!defined $field_begin && !defined $field_end) {
+      die "Bad argument to -f option: $field_spec";
+    }
+  }
+}
+
+$symtab = shift @ARGV;
+if (!defined $symtab) {
+  print STDERR "Usage: sym2int.pl [options] symtab [input transcriptions] > output transcriptions\n" .
+    "options: [--map-oov <oov-symbol>] [-f <field-range>]\n" .
+    "note: <field-range> can look like 4-5, or 4-, or 5-, or 1.\n";
+  exit(1);
+}
+open(F, "<$symtab") || die "Error opening symbol table file $symtab";
+while(<F>) {
+  @A = split(" ", $_);
+  @A == 2 || die "bad line in symbol table file: $_";
+  $sym2int{$A[0]} = $A[1] + 0;
+}
+
+if (defined $map_oov && $map_oov !~ m/^\d+$/) { # not numeric -> look it up
+  if (!defined $sym2int{$map_oov}) { die "OOV symbol $map_oov not defined."; }
+  $map_oov = $sym2int{$map_oov};
+}
+
+$num_warning = 0;
+# $max_warning = 20; # Original
+$max_warning = 2000; # Ondra Changed
+
+while (<>) {
+  @A = split(" ", $_);
+  if (@A == 0) {
+    die "Empty line in transcriptions input.";
+  }
+  @B = ();
+  for ($n = 0; $n < @A; $n++) {
+    $a = $A[$n];
+    if ( (!defined $field_begin || $n >= $field_begin)
+         && (!defined $field_end || $n <= $field_end)) {
+      $i = $sym2int{$a};
+      if (!defined ($i)) {
+        if (defined $map_oov) {
+          if ($num_warning++ < $max_warning) {
+            print STDERR "sym2int.pl: replacing $a with $map_oov\n";
+            if ($num_warning == $max_warning) {
+              print STDERR "sym2int.pl: not warning for OOVs any more times\n";
+            }
+          }
+          $i = $map_oov;
+        } else {
+          $pos = $n+1;
+          die "sym2int.pl: undefined symbol $a (in position $pos)\n";
+        }
+      }
+      $a = $i;
+    }
+    push @B, $a;
+  }
+  print join(" ", @B);
+  print "\n";
+}
+
+exit(0);
diff --git a/egs/kaldi-vystadial-recipe/s5/utils/utt2spk_to_spk2utt.pl b/egs/kaldi-vystadial-recipe/s5/utils/utt2spk_to_spk2utt.pl
new file mode 100755
index 00000000000..0c9e6417c82
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/utils/utt2spk_to_spk2utt.pl
@@ -0,0 +1,39 @@
+#!/usr/bin/perl
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# Converts an utt2spk file to a spk2utt file.
+# Takes input from stdin or from a file argument;
+# output goes to standard out.
+
+if ( @ARGV > 1 ) {
+  die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt";
+}
+
+while(<>){
+  @A = split(" ", $_);
+  @A == 2 || die "Invalid line in utt2spk file: $_";
+  ($u,$s) = @A;
+  if(!$seen_spk{$s}) {
+    $seen_spk{$s} = 1;
+    push @spklist, $s;
+  }
+  $uttlist{$s} = $uttlist{$s} . "$u ";
+}
+foreach $s (@spklist) {
+  $l = $uttlist{$s};
+  $l =~ s: $::; # remove trailing space.
+ print "$s $l\n"; +} diff --git a/egs/kaldi-vystadial-recipe/s5/utils/validate_dict_dir.pl b/egs/kaldi-vystadial-recipe/s5/utils/validate_dict_dir.pl new file mode 100755 index 00000000000..7654e8ffcdb --- /dev/null +++ b/egs/kaldi-vystadial-recipe/s5/utils/validate_dict_dir.pl @@ -0,0 +1,142 @@ +#!/usr/bin/perl + +# Guoguo Chen (guoguo@jhu.edu) +# +# Validation script for data/local/dict + +if(@ARGV != 1) { + die "Usage: validate_dict_dir.pl dict_directory\n"; +} + +$dict = shift @ARGV; + +$exit = 0; +# Checking silence_phones.txt ------------------------------- +print "Checking $dict/silence_phones.txt ...\n"; +if(-z "$dict/silence_phones.txt") {print "--> ERROR: $dict/silence_phones.txt is empty or not exists\n"; exit 1;} +if(!open(S, "<$dict/silence_phones.txt")) {print "--> ERROR: fail to open $dict/silence_phones.txt\n"; exit 1;} +$idx = 1; +%silence = (); +$success = 1; +print "--> reading $dict/silence_phones.txt\n"; +while() { + chomp; + my @col = split(" ", $_); + foreach(0 .. @col-1) { + if($silence{@col[$_]}) {$exit = 1; print "--> ERROR: phone \"@col[$_]\" duplicates in $dict/silence_phones.txt (line $idx)\n"; $success = 0;} + else {$silence{@col[$_]} = 1;} + } + $idx ++; +} +close(S); +$success == 0 || print "--> $dict/silence_phones.txt is OK\n"; +print "\n"; + +# Checking optional_silence.txt ------------------------------- +print "Checking $dict/optional_silence.txt ...\n"; +if(-z "$dict/optional_silence.txt") {print "--> ERROR: $dict/optional_silence.txt is empty or not exists\n"; exit 1;} +if(!open(OS, "<$dict/optional_silence.txt")) {print "--> ERROR: fail to open $dict/optional_silence.txt\n"; exit 1;} +$idx = 1; +$success = 1; +print "--> reading $dict/optional_silence.txt\n"; +while() { + chomp; + my @col = split(" ", $_); + if ($idx > 1 or @col > 1) { + $exit = 1; print "--> ERROR: only 1 phone expected in $dict/optional_silence.txt\n"; $success = 0; + } elsif (!$silence{$col[0]}) { + $exit = 1; print "--> ERROR: phone $col[0] not found in $dict/silence_phones.txt\n"; $success = 0; + } + $idx ++; +} +close(OS); +$success == 0 || print "--> $dict/optional_silence.txt is OK\n"; +print "\n"; + +# Checking nonsilence_phones.txt ------------------------------- +print "Checking $dict/nonsilence_phones.txt ...\n"; +if(-z "$dict/nonsilence_phones.txt") {print "--> ERROR: $dict/nonsilence_phones.txt is empty or not exists\n"; exit 1;} +if(!open(NS, "<$dict/nonsilence_phones.txt")) {print "--> ERROR: fail to open $dict/nonsilence_phones.txt\n"; exit 1;} +$idx = 1; +%nonsilence = (); +$success = 1; +print "--> reading $dict/nonsilence_phones.txt\n"; +while() { + chomp; + my @col = split(" ", $_); + foreach(0 .. 
+    if($nonsilence{@col[$_]}) {$exit = 1; print "--> ERROR: phone \"@col[$_]\" duplicates in $dict/nonsilence_phones.txt (line $idx)\n"; $success = 0;}
+    else {$nonsilence{@col[$_]} = 1;}
+  }
+  $idx ++;
+}
+close(NS);
+$success == 0 || print "--> $dict/nonsilence_phones.txt is OK\n";
+print "\n";
+
+# Checking disjoint -------------------------------
+sub intersect {
+  my ($a, $b) = @_;
+  @itset = ();
+  %itset = ();
+  foreach(keys %$a) {
+    if(exists $b->{$_} and !$itset{$_}) {
+      push(@itset, $_);
+      $itset{$_} = 1;
+    }
+  }
+  return @itset;
+}
+
+print "Checking disjoint: silence_phones.txt, nonsilence_phones.txt\n";
+@itset = intersect(\%silence, \%nonsilence);
+if(@itset == 0) {print "--> disjoint property is OK.\n";}
+else {$exit = 1; print "--> ERROR: silence_phones.txt and nonsilence_phones.txt have overlap: "; foreach(@itset) {print "$_ ";} print "\n";}
+print "\n";
+
+# Checking lexicon.txt -------------------------------
+print "Checking $dict/lexicon.txt\n";
+if(-z "$dict/lexicon.txt") {$exit = 1; print "--> ERROR: $dict/lexicon.txt is empty or does not exist\n";}
+if(!open(L, "<$dict/lexicon.txt")) {$exit = 1; print "--> ERROR: failed to open $dict/lexicon.txt\n";}
+$idx = 1;
+$success = 1;
+print "--> reading $dict/lexicon.txt\n";
+while(<L>) {
+  chomp;
+  my @col = split(" ", $_);
+  $word = shift @col;
+  foreach(0 .. @col-1) {
+    if(!$silence{@col[$_]} and !$nonsilence{@col[$_]}) {
+      $exit = 1; print "--> ERROR: phone \"@col[$_]\" is not in {,non}silence_phones.txt (line $idx)\n";
+      $success = 0;
+    }
+  }
+  $idx ++;
+}
+close(L);
+$success == 0 || print "--> $dict/lexicon.txt is OK\n";
+print "\n";
+
+# Checking extra_questions.txt -------------------------------
+print "Checking $dict/extra_questions.txt ...\n";
+if(-s "$dict/extra_questions.txt") {
+  if(!open(EX, "<$dict/extra_questions.txt")) {$exit = 1; print "--> ERROR: failed to open $dict/extra_questions.txt\n";}
+  $idx = 1;
+  $success = 1;
+  print "--> reading $dict/extra_questions.txt\n";
+  while(<EX>) {
+    chomp;
+    my @col = split(" ", $_);
+    foreach(0 .. @col-1) {
+      if(!$silence{@col[$_]} and !$nonsilence{@col[$_]}) {
+        $exit = 1; print "--> ERROR: phone \"@col[$_]\" is not in {,non}silence_phones.txt (line $idx, block ", $_+1, ")\n";
+        $success = 0;
+      }
+    }
+    $idx ++;
+  }
+  close(EX);
+  $success == 0 || print "--> $dict/extra_questions.txt is OK\n";
+} else {print "--> $dict/extra_questions.txt is empty\n";}
+
+if($exit == 1) {exit 1;}
diff --git a/egs/kaldi-vystadial-recipe/s5/utils/validate_lang.pl b/egs/kaldi-vystadial-recipe/s5/utils/validate_lang.pl
new file mode 100755
index 00000000000..ab74dbda983
--- /dev/null
+++ b/egs/kaldi-vystadial-recipe/s5/utils/validate_lang.pl
@@ -0,0 +1,501 @@
+#!/usr/bin/perl
+
+# Guoguo Chen (guoguo@jhu.edu)
+#
+# Validation script for data/lang
+
+if(@ARGV != 1) {
+  die "Usage: validate_lang.pl lang_directory\n";
+}
+
+$lang = shift @ARGV;
+$exit = 0;
+# Checking phones.txt -------------------------------
+print "Checking $lang/phones.txt ...\n";
+if(-z "$lang/phones.txt") {print "--> ERROR: $lang/phones.txt is empty or does not exist\n"; exit 1;}
+if(!open(P, "<$lang/phones.txt")) {print "--> ERROR: failed to open $lang/phones.txt\n"; exit 1;}
+$idx = 1;
+%psymtab = ();
+while(<P>) {
+  chomp;
+  my @col = split(" ", $_);
+  if(@col != 2) {print "--> ERROR: expect 2 columns in $lang/phones.txt (break at line $idx)\n"; exit 1;}
+  my $phone = shift @col;
+  my $id = shift @col;
+  $psymtab{$phone} = $id;
+  $idx ++;
+}
+close(P);
+%pint2sym = ();
+foreach(keys %psymtab) {
+  if($pint2sym{$psymtab{$_}}) {print "--> ERROR: ID \"$psymtab{$_}\" duplicates\n"; exit 1;}
+  else {$pint2sym{$psymtab{$_}} = $_;}
+}
+print "--> $lang/phones.txt is OK\n";
+print "\n";
+
+# Checking words.txt -------------------------------
+print "Checking words.txt: #0 ...\n";
+if(-z "$lang/words.txt") {print "--> ERROR: $lang/words.txt is empty or does not exist\n"; exit 1;}
+if(!open(W, "<$lang/words.txt")) {print "--> ERROR: failed to open $lang/words.txt\n"; exit 1;}
+$idx = 1;
+%wsymtab = ();
+while(<W>) {
+  chomp;
+  my @col = split(" ", $_);
+  if(@col != 2) {print "--> ERROR: expect 2 columns in $lang/words.txt (line $idx)\n"; exit 1;}
+  $word = shift @col;
+  $id = shift @col;
+  $wsymtab{$word} = $id;
+  $idx ++;
+}
+close(W);
+%wint2sym = ();
+foreach(keys %wsymtab) {
+  if($wint2sym{$wsymtab{$_}}) {print "--> ERROR: ID \"$wsymtab{$_}\" duplicates\n"; exit 1;}
+  else {$wint2sym{$wsymtab{$_}} = $_;}
+}
+if(exists $wsymtab{"#0"}) {
+  print "--> $lang/words.txt has \"#0\"\n";
+  print "--> $lang/words.txt is OK\n";
+} else {print "--> ERROR: $lang/words.txt doesn't have \"#0\"\n"; $exit = 1;}
+print "\n";
+
+# Checking phones/* -------------------------------
+sub check_txt_int_csl {
+  my ($cat, $symtab) = @_;
+  print "Checking $cat.\{txt, int, csl\} ...\n";
+  if(-z "$cat.txt") {$exit = 1; return print "--> ERROR: $cat.txt is empty or does not exist\n";}
+  if(-z "$cat.int") {$exit = 1; return print "--> ERROR: $cat.int is empty or does not exist\n";}
+  if(-z "$cat.csl") {$exit = 1; return print "--> ERROR: $cat.csl is empty or does not exist\n";}
+  if(!open(TXT, "<$cat.txt")) {$exit = 1; return print "--> ERROR: failed to open $cat.txt\n";}
+  if(!open(INT, "<$cat.int")) {$exit = 1; return print "--> ERROR: failed to open $cat.int\n";}
+  if(!open(CSL, "<$cat.csl")) {$exit = 1; return print "--> ERROR: failed to open $cat.csl\n";}
+
+  $idx1 = 1;
+  while(<TXT>) {
+    chomp;
+    my @col = split(" ", $_);
+    if(@col != 1) {$exit = 1; return print "--> ERROR: expect 1 column in $cat.txt (break at line $idx1)\n";}
+    $entry[$idx1] = shift @col;
+    $idx1 ++;
+  }
+  close(TXT); $idx1 --;
+  print "--> $idx1 entry/entries in $cat.txt\n";
+
+  $idx2 = 1;
+  while(<INT>) {
+    chomp;
+    my @col = split(" ", $_);
+    if(@col != 1) {$exit = 1; return print "--> ERROR: expect 1 column in $cat.int (break at line $idx2)\n";}
+    if($symtab->{$entry[$idx2]} ne shift @col) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line $idx2)\n";}
+    $idx2 ++;
+  }
+  close(INT); $idx2 --;
+  if($idx1 != $idx2) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line ", $idx2+1, ")\n";}
+  print "--> $cat.int corresponds to $cat.txt\n";
+
+  $idx3 = 1;
+  while(<CSL>) {
+    chomp;
+    my @col = split(":", $_);
+    if(@col != $idx1) {$exit = 1; return print "--> ERROR: expect $idx1 block/blocks in $cat.csl (break at line $idx3)\n";}
+    foreach(1 .. $idx1) {
+      if($symtab->{$entry[$_]} ne @col[$_-1]) {$exit = 1; return print "--> ERROR: $cat.csl doesn't correspond to $cat.txt (break at line $idx3, block $_)\n";}
+    }
+    $idx3 ++;
+  }
+  close(CSL); $idx3 --;
+  if($idx3 != 1) {$exit = 1; return print "--> ERROR: expect 1 row in $cat.csl (break at line ", $idx3+1, ")\n";}
+  print "--> $cat.csl corresponds to $cat.txt\n";
+
+  return print "--> $cat.\{txt, int, csl\} are OK\n";
+}
+
+sub check_txt_int {
+  my ($cat, $symtab) = @_;
+  print "Checking $cat.\{txt, int\} ...\n";
+  if(-z "$cat.txt") {$exit = 1; return print "--> ERROR: $cat.txt is empty or does not exist\n";}
+  if(-z "$cat.int") {$exit = 1; return print "--> ERROR: $cat.int is empty or does not exist\n";}
+  if(!open(TXT, "<$cat.txt")) {$exit = 1; return print "--> ERROR: failed to open $cat.txt\n";}
+  if(!open(INT, "<$cat.int")) {$exit = 1; return print "--> ERROR: failed to open $cat.int\n";}
+
+  $idx1 = 1;
+  while(<TXT>) {
+    chomp;
+    s/^(shared|not-shared) (split|not-split) //g;
+    s/ nonword$//g;
+    s/ begin$//g;
+    s/ end$//g;
+    s/ internal$//g;
+    s/ singleton$//g;
+    $entry[$idx1] = $_;
+    $idx1 ++;
+  }
+  close(TXT); $idx1 --;
+  print "--> $idx1 entry/entries in $cat.txt\n";
+
+  $idx2 = 1;
+  while(<INT>) {
+    chomp;
+    s/^(shared|not-shared) (split|not-split) //g;
+    s/ nonword$//g;
+    s/ begin$//g;
+    s/ end$//g;
+    s/ internal$//g;
+    s/ singleton$//g;
+    my @col = split(" ", $_);
+    @set = split(" ", $entry[$idx2]);
+    if(@set != @col) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line $idx2)\n";}
+    foreach(0 .. @set-1) {
+      if($symtab->{@set[$_]} ne @col[$_]) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line $idx2, block ", $_+1, ")\n";}
+    }
+    $idx2 ++;
+  }
+  close(INT); $idx2 --;
+  if($idx1 != $idx2) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line ", $idx2+1, ")\n";}
+  print "--> $cat.int corresponds to $cat.txt\n";
+
+  return print "--> $cat.\{txt, int\} are OK\n";
+}
+
+@list1 = ("context_indep", "disambig", "nonsilence", "silence", "optional_silence");
+@list2 = ("extra_questions", "roots", "sets");
+foreach(@list1) {
+  check_txt_int_csl("$lang/phones/$_", \%psymtab); print "\n";
+}
+foreach(@list2) {
+  check_txt_int("$lang/phones/$_", \%psymtab); print "\n";
+}
+if(-e "$lang/phones/word_boundary.txt") {
+  check_txt_int("$lang/phones/word_boundary", \%psymtab); print "\n";
+}
+
+# Check disjoint and summation -------------------------------
+sub intersect {
+  my ($a, $b) = @_;
+  @itset = ();
+  %itset = ();
+  foreach(keys %$a) {
+    if(exists $b->{$_} and !$itset{$_}) {
+      push(@itset, $_);
+      $itset{$_} = 1;
+    }
+  }
+  return @itset;
+}
+
+sub check_disjoint {
+  print "Checking disjoint: silence.txt, nonsilence.txt, disambig.txt ...\n";
+  if(!open(S, "<$lang/phones/silence.txt")) {$exit = 1; return print "--> ERROR: failed to open $lang/phones/silence.txt\n";}
+  if(!open(N, "<$lang/phones/nonsilence.txt")) {$exit = 1; return print "--> ERROR: failed to open $lang/phones/nonsilence.txt\n";}
+  if(!open(D, "<$lang/phones/disambig.txt")) {$exit = 1; return print "--> ERROR: failed to open $lang/phones/disambig.txt\n";}
+
+  $idx = 1;
+  while(<S>) {
+    chomp;
+    my @col = split(" ", $_);
+    $phone = shift @col;
+    if($silence{$phone}) {$exit = 1; print "--> ERROR: phone \"$phone\" duplicates in $lang/phones/silence.txt (line $idx)\n";}
+    $silence{$phone} = 1;
+    push(@silence, $phone);
+    $idx ++;
+  }
+  close(S);
+
+  $idx = 1;
+  while(<N>) {
+    chomp;
+    my @col = split(" ", $_);
+    $phone = shift @col;
+    if($nonsilence{$phone}) {$exit = 1; print "--> ERROR: phone \"$phone\" duplicates in $lang/phones/nonsilence.txt (line $idx)\n";}
+    $nonsilence{$phone} = 1;
+    push(@nonsilence, $phone);
+    $idx ++;
+  }
+  close(N);
+
+  $idx = 1;
+  while(<D>) {
+    chomp;
+    my @col = split(" ", $_);
+    $phone = shift @col;
+    if($disambig{$phone}) {$exit = 1; print "--> ERROR: phone \"$phone\" duplicates in $lang/phones/disambig.txt (line $idx)\n";}
+    $disambig{$phone} = 1;
+    $idx ++;
+  }
+  close(D);
+
+  my @itsect1 = intersect(\%silence, \%nonsilence);
+  my @itsect2 = intersect(\%silence, \%disambig);
+  my @itsect3 = intersect(\%disambig, \%nonsilence);
+
+  $success = 1;
+  if(@itsect1 != 0) {
+    $success = 0;
+    $exit = 1; print "--> ERROR: silence.txt and nonsilence.txt have intersection -- ";
+    foreach(@itsect1) {
+      print $_, " ";
+    }
+    print "\n";
+  } else {print "--> silence.txt and nonsilence.txt are disjoint\n";}
+
+  if(@itsect2 != 0) {
+    $success = 0;
+    $exit = 1; print "--> ERROR: silence.txt and disambig.txt have intersection -- ";
+    foreach(@itsect2) {
+      print $_, " ";
+    }
+    print "\n";
+  } else {print "--> silence.txt and disambig.txt are disjoint\n";}
+
+  if(@itsect3 != 0) {
+    $success = 0;
+    $exit = 1; print "--> ERROR: disambig.txt and nonsilence.txt have intersection -- ";
+    foreach(@itsect3) {
+      print $_, " ";
+    }
+    print "\n";
+  } else {print "--> disambig.txt and nonsilence.txt are disjoint\n";}
+
+  $success == 0 || print "--> disjoint property is OK\n";
+  return;
+}
+
+sub check_summation {
+  print "Checking summation: silence.txt, nonsilence.txt, disambig.txt ...\n";
+  if(scalar(keys %silence) == 0) {$exit = 1; return print "--> ERROR: $lang/phones/silence.txt is empty or does not exist\n";}
+  if(scalar(keys %nonsilence) == 0) {$exit = 1; return print "--> ERROR: $lang/phones/nonsilence.txt is empty or does not exist\n";}
+  if(scalar(keys %disambig) == 0) {$exit = 1; return print "--> ERROR: $lang/phones/disambig.txt is empty or does not exist\n";}
+
+  %sum = (%silence, %nonsilence, %disambig);
+  $sum{"<eps>"} = 1;
+
+  my @itset = intersect(\%sum, \%psymtab);
+  my @key1 = keys %sum;
+  my @key2 = keys %psymtab;
+  my %itset = (); foreach(@itset) {$itset{$_} = 1;}
+  if(@itset < @key1) {
+    $exit = 1; print "--> ERROR: phones in silence.txt, nonsilence.txt, disambig.txt but not in phones.txt -- ";
+    foreach(@key1) {
+      if(!$itset{$_}) {print "$_ ";}
+    }
+    print "\n";
+  }
+
+  if(@itset < @key2) {
+    $exit = 1; print "--> ERROR: phones in phones.txt but not in silence.txt, nonsilence.txt, disambig.txt -- ";
+    foreach(@key2) {
+      if(!$itset{$_}) {print "$_ ";}
+    }
+    print "\n";
+  }
+
+  if(@itset == @key1 and @itset == @key2) {
+    print "--> summation property is OK\n";
+  }
+  return;
+}
+
+%silence = ();
+@silence = ();
+%nonsilence = ();
+@nonsilence = ();
+%disambig = ();
+check_disjoint; print "\n";
+check_summation; print "\n";
+
+# Checking optional_silence.txt -------------------------------
+print "Checking optional_silence.txt ...\n";
+$idx = 1;
+$success = 1;
+if(-z "$lang/phones/optional_silence.txt") {$exit = 1; $success = 0; print "--> ERROR: $lang/phones/optional_silence.txt is empty or does not exist\n";}
+if(!open(OS, "<$lang/phones/optional_silence.txt")) {$exit = 1; $success = 0; print "--> ERROR: failed to open $lang/phones/optional_silence.txt\n";}
+print "--> reading $lang/phones/optional_silence.txt\n";
+while(<OS>) {
+  chomp;
+  my @col = split(" ", $_);
+  if ($idx > 1 or @col > 1) {
+    $exit = 1; print "--> ERROR: only 1 phone expected in $lang/phones/optional_silence.txt\n"; $success = 0;
+  } elsif (!$silence{$col[0]}) {
+    $exit = 1; print "--> ERROR: phone $col[0] not found in $lang/phones/silence.txt\n"; $success = 0;
+  }
+  $idx ++;
+}
+close(OS);
+$success == 0 || print "--> $lang/phones/optional_silence.txt is OK\n";
+print "\n";
+
+# Check disambiguation symbols -------------------------------
+print "Checking disambiguation symbols: #0 and #1\n";
+if(scalar(keys %disambig) == 0) {$exit = 1; print "--> ERROR: $lang/phones/disambig.txt is empty or does not exist\n";}
+if(exists $disambig{"#0"} and exists $disambig{"#1"}) {
+  print "--> $lang/phones/disambig.txt has \"#0\" and \"#1\"\n";
+  print "--> $lang/phones/disambig.txt is OK\n\n";
+} else {
+  $exit = 1; print "--> ERROR: $lang/phones/disambig.txt doesn't have \"#0\" or \"#1\"\n";
+}
+
+
+# Check topo -------------------------------
+print "Checking topo ...\n";
+if(-z "$lang/topo") {$exit = 1; print "--> ERROR: $lang/topo is empty or does not exist\n";}
+if(!open(T, "<$lang/topo")) {$exit = 1; print "--> ERROR: failed to open $lang/topo\n";}
+$idx = 1;
+while(<T>) {
+  chomp;
+  next if(m/^<.*>[ ]*$/);
+  if($idx == 1) {$nonsilence_seq = $_; $idx ++;}
+  if($idx == 2) {$silence_seq = $_;}
+}
+close(T);
+if($silence_seq == 0 || $nonsilence_seq == 0) {$exit = 1; print "--> ERROR: $lang/topo doesn't have nonsilence section or silence section\n";}
+@silence_seq = split(" ", $silence_seq);
+@nonsilence_seq = split(" ", $nonsilence_seq);
+$success1 = 1;
+if(@nonsilence_seq != @nonsilence) {$exit = 1; print "--> ERROR: $lang/topo's nonsilence section doesn't correspond to nonsilence.txt\n";}
+else {
+  foreach(0 .. scalar(@nonsilence)-1) {
+    if($psymtab{@nonsilence[$_]} ne @nonsilence_seq[$_]) {
+      $exit = 1; print "--> ERROR: $lang/topo's nonsilence section doesn't correspond to nonsilence.txt\n";
+      $success1 = 0;
+    }
+  }
+}
+$success1 != 1 || print "--> $lang/topo's nonsilence section is OK\n";
+$success2 = 1;
+if(@silence_seq != @silence) {$exit = 1; print "--> ERROR: $lang/topo's silence section doesn't correspond to silence.txt\n";}
+else {
+  foreach(0 .. scalar(@silence)-1) {
+    if($psymtab{@silence[$_]} ne @silence_seq[$_]) {
+      $exit = 1; print "--> ERROR: $lang/topo's silence section doesn't correspond to silence.txt\n";
+      $success2 = 0;
+    }
+  }
+}
+$success2 != 1 || print "--> $lang/topo's silence section is OK\n";
+$success1 != 1 or $success2 != 1 || print "--> $lang/topo is OK\n";
+print "\n";
+
+# Check word_boundary -------------------------------
+$nonword = "";
+$begin = "";
+$end = "";
+$internal = "";
+$singleton = "";
+if(-s "$lang/phones/word_boundary.txt") {
+  print "Checking word_boundary.txt: silence.txt, nonsilence.txt, disambig.txt ...\n";
+  if(!open (W, "<$lang/phones/word_boundary.txt")) {$exit = 1; print "--> ERROR: failed to open $lang/phones/word_boundary.txt\n";}
+  $idx = 1;
+  %wb = ();
+  while(<W>) {
+    chomp;
+    my @col;
+    if (m/^.*nonword$/  ) {s/ nonword//g;    @col = split(" ", $_); if (@col == 1) {$nonword .= "$col[0] ";}}
+    if (m/^.*begin$/    ) {s/ begin$//g;     @col = split(" ", $_); if (@col == 1) {$begin .= "$col[0] ";}}
+    if (m/^.*end$/      ) {s/ end$//g;       @col = split(" ", $_); if (@col == 1) {$end .= "$col[0] ";}}
+    if (m/^.*internal$/ ) {s/ internal$//g;  @col = split(" ", $_); if (@col == 1) {$internal .= "$col[0] ";}}
+    if (m/^.*singleton$/) {s/ singleton$//g; @col = split(" ", $_); if (@col == 1) {$singleton .= "$col[0] ";}}
+    if(@col != 1) {$exit = 1; print "--> ERROR: expect 1 column in $lang/phones/word_boundary.txt (line $idx)\n";}
+    $wb{shift @col} = 1;
+    $idx ++;
+  }
+  close(W);
+
+  @itset = intersect(\%disambig, \%wb);
+  $success1 = 1;
+  if(@itset != 0) {
+    $success1 = 0;
+    $exit = 1; print "--> ERROR: $lang/phones/word_boundary.txt has disambiguation symbols -- ";
+    foreach(@itset) {print "$_ ";}
+    print "\n";
+  }
+  $success1 == 0 || print "--> $lang/phones/word_boundary.txt doesn't include disambiguation symbols\n";
+
+  %sum = (%silence, %nonsilence);
+  @itset = intersect(\%sum, \%wb);
+  %itset = (); foreach(@itset) {$itset{$_} = 1;}
+  $success2 = 1;
+  if(@itset < scalar(keys %sum)) {
+    $success2 = 0;
+    $exit = 1; print "--> ERROR: phones in nonsilence.txt and silence.txt but not in word_boundary.txt -- ";
+    foreach(keys %sum) {
+      if(!$itset{$_}) {print "$_ ";}
+    }
+    print "\n";
+  }
+  if(@itset < scalar(keys %wb)) {
+    $success2 = 0;
+    $exit = 1; print "--> ERROR: phones in word_boundary.txt but not in nonsilence.txt or silence.txt -- ";
+    foreach(keys %wb) {
+      if(!$itset{$_}) {print "$_ ";}
+    }
+    print "\n";
+  }
+  $success2 == 0 || print "--> $lang/phones/word_boundary.txt is the union of nonsilence.txt and silence.txt\n";
+  $success1 != 1 or $success2 != 1 || print "--> $lang/phones/word_boundary.txt is OK\n";
+
+
+  # Check L.fst -------------------------------
+  print "--> checking L.fst and L_disambig.fst...\n";
+  $nonword =~ s/ $//g;
+  $nonword =~ s/ / |/g;
+  $begin =~ s/ $//g;
+  $begin =~ s/ / |/g;
+  $end =~ s/ $//g;
+  $end =~ s/ / |/g;
+  $internal =~ s/ $//g;
+  $internal =~ s/ / |/g;
+  $singleton =~ s/ $//g;
+  $singleton =~ s/ / |/g;
+
+  # Now handle the escape characters
+  foreach $esc(("^", "\$", "(", ")", "/", "@", "[", "]", "{", "}", "?", ".", "+", "*")) {
+    $tmp = "\\" . $esc;
+    $nonword =~ s/$tmp/\\$esc/g;
+    $begin =~ s/$tmp/\\$esc/g;
+    $end =~ s/$tmp/\\$esc/g;
+    $internal =~ s/$tmp/\\$esc/g;
+    $singleton =~ s/$tmp/\\$esc/g;
+  }
+
+  $wlen = int(rand(100)) + 1;
+  print "--> generating a $wlen-word sequence\n";
+  $wordseq = "";
+  $sid = 0;
+  foreach(1 ..
$wlen) { + $id = int(rand(scalar(%wint2sym))); + while($wint2sym{$id} =~ m/^#[0-9]*$/ or $id == 0) {$id = int(rand(scalar(%wint2sym)));} + $wordseq = $wordseq . "$sid ". ($sid + 1) . " $id $id 0\n"; + $sid ++; + } + $wordseq = $wordseq . "$sid 0"; + $phoneseq = `echo \"$wordseq" | fstcompile > tmp.fst; fstcompose $lang/L.fst tmp.fst | fstproject | fstrandgen | fstrmepsilon | fsttopsort | fstprint --isymbols=$lang/phones.txt --osymbols=$lang/phones.txt | awk '{if(NF > 2) {print \$3}}'; rm tmp.fst`; + $phoneseq =~ s/\s/ /g; + $phoneseq =~ m/^($nonword )*(((($begin )($internal )*($end ))|($singleton ))($nonword )*){$wlen}$/; + if(length($2) == 0) { + $exit = 1; print "--> ERROR: resulting phone sequence from L.fst doesn't correspond to the word sequence; check L.log.fst\n"; + open(LOG, ">L.log.fst"); print LOG $wordseq; close(LOG); + } else { + print "--> resulting phone sequence from L.fst corresponds to the word sequence\n"; + print "--> L.fst is OK\n"; + } + + $phoneseq = `echo \"$wordseq" | fstcompile > tmp.fst; fstcompose $lang/L_disambig.fst tmp.fst | fstproject | fstrandgen | fstrmepsilon | fsttopsort | fstprint --isymbols=$lang/phones.txt --osymbols=$lang/phones.txt | awk '{if(NF > 2) {print \$3}}'; rm tmp.fst`; + $phoneseq =~ s/\s/ /g; + $phoneseq =~ m/^(($nonword )(#[0-9]* )*)*(((($begin )($internal )*($end ))|($singleton ))(#[0-9]* )*(($nonword )(#[0-9]* )*)*){$wlen}$/; + if(length($4) == 0) { + $exit = 1; print "--> ERROR: resulting phone sequence from L_disambig.fst doesn't correspond to the word sequence; check L_disambig.log.fst\n"; + open(LOG, ">L_disambig.log.fst"); print LOG $wordseq; close(LOG); + } else { + print "--> resulting phone sequence from L_disambig.fst corresponds to the word sequence\n"; + print "--> L_disambig.fst is OK\n"; + } + print "\n"; +} + +# Check oov ------------------------------- +check_txt_int("$lang/oov", \%wsymtab); print "\n"; + + +if ($exit == 1) {exit 1;} diff --git a/egs/voxforge/online_demo/.gitignore b/egs/voxforge/online_demo/.gitignore new file mode 100644 index 00000000000..893758f420d --- /dev/null +++ b/egs/voxforge/online_demo/.gitignore @@ -0,0 +1,9 @@ +.gitignore +online-data +online-data.tar.bz2 +online-data_original/ +online-data_voxforge.zip +online-data_voxforge/ +online-data_vystadial.zip +online-data_vystadial/ +work/ diff --git a/src/.gitignore b/src/.gitignore new file mode 100644 index 00000000000..573171ce2b9 --- /dev/null +++ b/src/.gitignore @@ -0,0 +1,115 @@ +*.o +*.a +*.fst +*.scp +*.tmp +*.gz +tmpf +bin/build-pfile-from-ali +bin/copy-post +bin/duplicate-matrix +bin/extract-ctx +bin/get-post-on-ali +bin/latgen-tracking-mapped +bin/logprob-to-post +bin/matrix-logprob +bin/matrix-sum +bin/pdf-to-counts +bin/post-to-pdf-post +bin/post-to-phone-post +bin/prob-to-post +featbin/copy-feats-to-htk +featbin/interpolate-pitch +featbin/paste-feats +featbin/process-pitch-feats +featbin/select-feats +featbin/subsample-feats +fstbin/fstpushspecial +fstbin/fstrhocompose +fstbin/fsts-to-transcripts +gmmbin/gmm-adapt-map +gmmbin/gmm-est-fmllr-raw +gmmbin/gmm-est-fmllr-raw-gpost +gmmbin/gmm-latgen-faster-parallel +gmmbin/gmm-latgen-tracking +kwsbin/generate-proxy-keywords +kwsbin/kws-index-union +kwsbin/kws-search +kwsbin/lattice-to-kws-index +kwsbin/transcripts-to-fsts +latbin/lattice-add-penalty +latbin/lattice-align-phones +latbin/lattice-align-words-lexicon +latbin/lattice-combine +latbin/lattice-depth +latbin/lattice-determinize-pruned-parallel +latbin/lattice-push +latbin/lattice-rescore-mapped +latbin/lattice-reverse 
+latbin/lattice-to-smbr-post
+nnet-cpubin/nnet-align-compiled
+nnet-cpubin/nnet-am-average
+nnet-cpubin/nnet-am-combine
+nnet-cpubin/nnet-am-compute
+nnet-cpubin/nnet-am-copy
+nnet-cpubin/nnet-am-fix
+nnet-cpubin/nnet-am-info
+nnet-cpubin/nnet-am-init
+nnet-cpubin/nnet-am-limit-rank
+nnet-cpubin/nnet-am-mixup
+nnet-cpubin/nnet-am-rescale
+nnet-cpubin/nnet-am-shrink
+nnet-cpubin/nnet-am-stats
+nnet-cpubin/nnet-combine
+nnet-cpubin/nnet-combine-a
+nnet-cpubin/nnet-combine-fast
+nnet-cpubin/nnet-compute-prob
+nnet-cpubin/nnet-copy-egs
+nnet-cpubin/nnet-get-egs
+nnet-cpubin/nnet-get-preconditioner
+nnet-cpubin/nnet-gradient
+nnet-cpubin/nnet-init
+nnet-cpubin/nnet-insert
+nnet-cpubin/nnet-latgen-faster
+nnet-cpubin/nnet-latgen-faster-parallel
+nnet-cpubin/nnet-logprob
+nnet-cpubin/nnet-logprob-parallel
+nnet-cpubin/nnet-logprob2
+nnet-cpubin/nnet-logprob2-parallel
+nnet-cpubin/nnet-precondition
+nnet-cpubin/nnet-randomize-frames
+nnet-cpubin/nnet-select-egs
+nnet-cpubin/nnet-shrink
+nnet-cpubin/nnet-shuffle-egs
+nnet-cpubin/nnet-subset-egs
+nnet-cpubin/nnet-train
+nnet-cpubin/nnet-train-lbfgs
+nnet-cpubin/nnet-train-parallel
+nnet-cpubin/nnet-train-simple
+nnet-cpubin/nnet-train-transitions
+nnetbin/cmvn-to-nnet
+nnetbin/nnet-concat
+nnetbin/nnet-train-mpe-sequential
+nnetbin/nnet-train-xent-hardlab-frmshuff-prior
+nnetbin/transf-to-nnet
+onlinebin/online-gmm-decode-faster
+onlinebin/online-net-client
+onlinebin/online-server-gmm-decode-faster
+onlinebin/online-wav-gmm-decode-faster
+feat/tmp.test.wav.*
+fstext/push-special-test
+gmm/tmp_stats
+gmm/tmpfb
+gmm/tmpfeats
+nnet-cpu/nnet-precondition-test
+sgmm/estimate-am-sgmm-multi-test
+sgmm/tmpfb
+sgmm2/tmpfb
+thread/kaldi-task-sequence-test
+thread/kaldi-thread-test
+tied/tmpfb
+transform/fmllr-raw-test
+transform/tmp_regtree
+transform/tmp_stats
+transform/tmpfb
+sgmm2bin/sgmm2-latgen-faster-parallel
diff --git a/src/Makefile b/src/Makefile
index e6af5e650c2..ab2d6089e8f 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -10,7 +10,7 @@ SUBDIRS = base matrix util feat tree thread gmm tied transform sgmm \
           nnetbin latbin sgmm2 sgmm2bin nnet-cpu nnet-cpubin kwsbin
 
 # Optional subdirectories
-EXT_SUBDIRS = online onlinebin
+EXT_SUBDIRS = online onlinebin python-kaldi-decoding
 
 all: test_install kaldi.mk $(SUBDIRS)
 	echo Done
@@ -89,5 +89,6 @@ nnet: base util matrix cudamatrix
 nnet-cpu: base util matrix thread
 #3)Dependencies for optional parts of Kaldi
 onlinebin: base matrix util feat tree optimization gmm tied transform sgmm sgmm2 fstext hmm lm decoder lat cudamatrix nnet nnet-cpu online
+python-kaldi-decoding: base matrix util feat tree optimization thread gmm tied transform sgmm sgmm2 fstext hmm decoder lat online
 online: decoder
 kwsbin: fstext lat base util
diff --git a/src/configure b/src/configure
index cd3748767de..13afd9791a7 100755
--- a/src/configure
+++ b/src/configure
@@ -62,13 +62,14 @@ unset MKLLIBDIR
 function usage {
    echo 'Usage: ./configure [--threaded-atlas={yes|no}] [--atlas-root=ATLASROOT] [--fst-root=FSTROOT]
     [--openblas-root=OPENBLASROOOT] [--clapack-root=CLAPACKROOT] [--mkl-root=MKLROOT] [--mkl-libdir=MKLLIBDIR]
-    [--omp-libdir=OMPDIR] [--static-math={yes|no}] [--threaded-math={yes|no}] [--mathlib=ATLAS|MKL|CLAPACK|OPENBLAS]
+    [--omp-libdir=OMPDIR] [--static-fst={yes|no}] [--static-math={yes|no}] [--threaded-math={yes|no}] [--mathlib=ATLAS|MKL|CLAPACK|OPENBLAS]
     [--use-cuda={yes|no}] [--cudatk-dir=CUDATKDIR]';
 }
 
 threaded_atlas=false # By default, use the un-threaded version of ATLAS.
threaded_math=${threaded_atlas} static_math=false +static_fst=false use_cuda=true while [ $# -gt 0 ]; @@ -93,6 +94,10 @@ do static_math=true; shift ;; --static-math=no) static_math=false; shift ;; + --static-fst=yes) + static_fst=true; shift ;; + --static-fst=no) + static_fst=false; shift ;; --fst-root=*) FSTROOT=`read_dirname $1`; shift ;; --clapack-root=*) @@ -531,9 +536,17 @@ if [ "`uname -o`" == "Cygwin" ]; then fi if [ "`uname`" == "Linux" ]; then - if [ ! -f $FSTROOT/lib/libfst.a ]; then - failure "Static OpenFST library not found: See ../tools/INSTALL" + if $static_fst ; then + OPENFSTLIBS="$FSTROOT/lib/libfst.a" + fst_type='a' + else + OPENFSTLIBS="-L${FSTROOT}/lib -lfst -Wl,-rpath=${FSTROOT}/lib" + fst_type='so' + fi + if [ ! -f "$FSTROOT/lib/libfst.${fst_type}" ]; then + failure "Static=[$static_fst] OpenFST library not found: See ../tools/INSTALL" fi + echo OPENFSTLIBS = $OPENFSTLIBS >> kaldi.mk echo FSTROOT = $FSTROOT >> kaldi.mk echo "On Linux: Checking for linear algebra header files ..." diff --git a/src/makefiles/cygwin.mk b/src/makefiles/cygwin.mk index a1dce3f3e65..db49336cfb2 100644 --- a/src/makefiles/cygwin.mk +++ b/src/makefiles/cygwin.mk @@ -5,6 +5,7 @@ $(error FSTROOT not defined.) endif CXXFLAGS = -msse -msse2 -Wall -I.. -DKALDI_DOUBLEPRECISION=0 \ + -fPIC \ -DHAVE_POSIX_MEMALIGN -DHAVE_CLAPACK -I ../../tools/CLAPACK/ \ -Wno-sign-compare -Winit-self \ -I ../../tools/CLAPACK/ \ diff --git a/src/makefiles/darwin_10_5.mk b/src/makefiles/darwin_10_5.mk index 30a392fcfd4..a543b47a042 100644 --- a/src/makefiles/darwin_10_5.mk +++ b/src/makefiles/darwin_10_5.mk @@ -5,6 +5,7 @@ $(error FSTROOT not defined.) endif CXXFLAGS = -msse -msse2 -Wall -I.. \ + -fPIC \ -DKALDI_DOUBLEPRECISION=0 \ -Wno-sign-compare -Winit-self \ -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H \ diff --git a/src/makefiles/darwin_10_6.mk b/src/makefiles/darwin_10_6.mk index 6fd0d360c28..62802ee6b5b 100644 --- a/src/makefiles/darwin_10_6.mk +++ b/src/makefiles/darwin_10_6.mk @@ -5,6 +5,7 @@ $(error FSTROOT not defined.) endif CXXFLAGS = -msse -msse2 -Wall -I.. \ + -fPIC \ -DKALDI_DOUBLEPRECISION=0 -DHAVE_POSIX_MEMALIGN \ -Wno-sign-compare -Winit-self \ -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -rdynamic \ diff --git a/src/makefiles/darwin_10_7.mk b/src/makefiles/darwin_10_7.mk index 63d762a6758..1138202043e 100644 --- a/src/makefiles/darwin_10_7.mk +++ b/src/makefiles/darwin_10_7.mk @@ -5,6 +5,7 @@ $(error FSTROOT not defined.) endif CXXFLAGS = -msse -msse2 -Wall -I.. \ + -fPIC \ -DKALDI_DOUBLEPRECISION=0 -DHAVE_POSIX_MEMALIGN \ -Wno-sign-compare -Winit-self \ -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -rdynamic \ diff --git a/src/makefiles/darwin_10_8.mk b/src/makefiles/darwin_10_8.mk index 63d762a6758..1138202043e 100644 --- a/src/makefiles/darwin_10_8.mk +++ b/src/makefiles/darwin_10_8.mk @@ -5,6 +5,7 @@ $(error FSTROOT not defined.) endif CXXFLAGS = -msse -msse2 -Wall -I.. \ + -fPIC \ -DKALDI_DOUBLEPRECISION=0 -DHAVE_POSIX_MEMALIGN \ -Wno-sign-compare -Winit-self \ -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -rdynamic \ diff --git a/src/makefiles/linux_atlas.mk b/src/makefiles/linux_atlas.mk index 95ff3a886d6..be9bb0b9cfd 100644 --- a/src/makefiles/linux_atlas.mk +++ b/src/makefiles/linux_atlas.mk @@ -14,6 +14,7 @@ endif CXXFLAGS = -msse -msse2 -Wall -I.. 
\ + -fPIC \ -DKALDI_DOUBLEPRECISION=0 -DHAVE_POSIX_MEMALIGN \ -Wno-sign-compare -Winit-self \ -DHAVE_EXECINFO_H=1 -rdynamic -DHAVE_CXXABI_H \ diff --git a/src/makefiles/linux_atlas_64bit.mk b/src/makefiles/linux_atlas_64bit.mk index b00a1f76cd5..444c917de81 100644 --- a/src/makefiles/linux_atlas_64bit.mk +++ b/src/makefiles/linux_atlas_64bit.mk @@ -36,6 +36,7 @@ endif CXXFLAGS = -msse -msse2 -Wall -I.. \ + -fPIC \ -DKALDI_DOUBLEPRECISION=0 -DHAVE_POSIX_MEMALIGN \ -Wno-sign-compare -Winit-self \ -DHAVE_EXECINFO_H=1 -rdynamic -DHAVE_CXXABI_H \ diff --git a/src/makefiles/linux_clapack.mk b/src/makefiles/linux_clapack.mk index 04947a1b9b5..8d826f5a957 100644 --- a/src/makefiles/linux_clapack.mk +++ b/src/makefiles/linux_clapack.mk @@ -1,6 +1,7 @@ # You have to make sure CLAPACKLIBS is set... CXXFLAGS = -msse -Wall -I.. \ + -fPIC \ -DKALDI_DOUBLEPRECISION=0 -msse2 -DHAVE_POSIX_MEMALIGN \ -Wno-sign-compare \ -DHAVE_EXECINFO_H=1 -rdynamic -DHAVE_CXXABI_H \ diff --git a/src/makefiles/linux_openblas.mk b/src/makefiles/linux_openblas.mk index c5a42a65d17..9b799d430fe 100644 --- a/src/makefiles/linux_openblas.mk +++ b/src/makefiles/linux_openblas.mk @@ -1,16 +1,18 @@ # You have to make sure CLAPACKLIBS is set... CXXFLAGS = -msse -Wall -I.. \ + -fPIC \ -DKALDI_DOUBLEPRECISION=0 -msse2 -DHAVE_POSIX_MEMALIGN \ -Wno-sign-compare \ -DHAVE_EXECINFO_H=1 -rdynamic -DHAVE_CXXABI_H \ -DUSE_KALDI_SVD -DHAVE_OPENBLAS -I $(OPENBLASROOT)/include \ -I ../../tools/openfst/include \ + -I $(FSTROOT)/include \ $(EXTRA_CXXFLAGS) \ -g # -O0 -DKALDI_PARANOID LDFLAGS = -rdynamic -LDLIBS = $(EXTRA_LDLIBS) ../../tools/openfst/lib/libfst.a -ldl $(OPENBLASLIBS) -lm -lpthread +LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(OPENBLASLIBS) -lm -lpthread -ldl CC = g++ CXX = g++ AR = ar diff --git a/src/python-kaldi-decoding/.gitignore b/src/python-kaldi-decoding/.gitignore new file mode 100644 index 00000000000..988c8be5cd9 --- /dev/null +++ b/src/python-kaldi-decoding/.gitignore @@ -0,0 +1,17 @@ +.valgrind +*.o +*.so +*.a +valgrind.out +.depend.mk +data_voip_en +decode +mfcc +utils +data_voip_en +compute-mfcc-feats-test +gmm-latgen-faster-test +online-wav-gmm-decode-faster-test +compute-wer-test +lattice-best-path-test +*.d diff --git a/src/python-kaldi-decoding/Makefile b/src/python-kaldi-decoding/Makefile new file mode 100644 index 00000000000..61591c68b4c --- /dev/null +++ b/src/python-kaldi-decoding/Makefile @@ -0,0 +1,82 @@ +# We suppose that this Makefile sits in kaldi-trunk/src/ThisDirectory +all: + +EXTRA_CXXFLAGS = -Wno-sign-compare -I ../../tools/portaudio/install/include +EXTRA_LDLIBS = + +include ../kaldi.mk +CC = gcc + +TESTFILES = compute-wer-test gmm-latgen-faster-test compute-mfcc-feats-test \ + lattice-best-path-test online-wav-gmm-decode-faster-test +OBJFILES = compute-wer.o gmm-latgen-faster.o compute-mfcc-feats.o \ + lattice-best-path.o online-wav-gmm-decode-faster.o + +LDLIBFILE = libkaldi-cffi.so +LIBFILE = kaldi-cffi.a + + +UNAME=$(shell uname) +ifeq ($(UNAME), Linux) + EXTRA_LDLIBS += ../../tools/portaudio/install/lib/libportaudio.a +ifneq ($(wildcard ../../tools/portaudio/install/include/pa_linux_alsa.h),) + EXTRA_LDLIBS += -lasound +else + EXTRA_LDLIBS += -lrt +endif +else + EXTRA_LDLIBS += -L $(PA_LDD) -lportaudio +endif + +all: $(LIBFILE) $(LDLIBFILE) + +# I can not use ../decoder/kaldi-decoder.a because there are two types of decoders for two types sgmm vs sgmm2 +$(LDLIBFILE): $(OBJFILES) \ + ../decoder/decodable-am-diag-gmm.o ../decoder/lattice-faster-decoder.o ../decoder/faster-decoder.o \ + 
../online/kaldi-online.a ../thread/kaldi-thread.a ../lat/kaldi-lat.a ../hmm/kaldi-hmm.a \
+	../transform/kaldi-transform.a ../gmm/kaldi-gmm.a ../fstext/kaldi-fstext.a ../tree/kaldi-tree.a \
+	../matrix/kaldi-matrix.a ../feat/kaldi-feature.a ../util/kaldi-util.a ../base/kaldi-base.a
+	$(CC) -fPIC -shared -o $@ -Wl,-export-dynamic,--whole-archive $^ -Wl,--no-whole-archive $(LDLIBS)
+
+
+$(LIBFILE): $(OBJFILES)
+	$(AR) -cru $(LIBFILE) $(OBJFILES)
+	$(RANLIB) $(LIBFILE)
+
+# The rule below expands to, e.g.:
+# ../base/kaldi-base.a:
+#	$(MAKE) -C ../base kaldi-base.a
+# (the -C option tells make to change into that directory first).
+%.a:
+	$(MAKE) -C ${@D} ${@F}
+
+clean:
+	-rm -f *.o *.a tmp* *.tmp *.so .depend.mk $(TESTFILES) valgrind.out
+
+depend:
+	-$(CXX) -M $(CXXFLAGS) *.cc > .depend.mk
+
+# Automatic regeneration of "depend" is disabled because it is quite slow.
+# .depend.mk: depend
+
+-include .depend.mk
+
+.valgrind: $(TESTFILES)
+
+.PHONY: test_dyn python_test test
+
+### TESTS - launch each command with --help ###
+test_dyn: $(LDLIBFILE) $(TESTFILES)
+	# Before running the binaries, set the LD_LIBRARY_PATH variable as we do here!
+	@result=0; for x in $(TESTFILES); do \
+	  echo -n "Running $$x : "; \
+	  LD_LIBRARY_PATH=$(OBLAS_LDD):$(FST_LDD):`pwd` ./$$x --help > /dev/null 2>&1; \
+	  if [ $$? -ne 0 ]; then echo "... FAIL"; result=1; else echo "... SUCCESS"; fi; \
+	done; exit $$result
+
+python_test: run.py $(LDLIBFILE)
+	LD_LIBRARY_PATH=$(OBLAS_LDD):$(FST_LDD):`pwd` python $<
+	echo $?
+
+# test: test_dyn python_test
+test: test_dyn
diff --git a/src/python-kaldi-decoding/README.md b/src/python-kaldi-decoding/README.md
new file mode 100644
index 00000000000..efdd7d48181
--- /dev/null
+++ b/src/python-kaldi-decoding/README.md
@@ -0,0 +1,65 @@
+Intro
+-----
+The goal of this project is to test the
+Kaldi decoding pipeline when called from Python.
+
+Prerequisites
+-------------
+
+ * Install *cffi*! See the docs at
+[http://cffi.readthedocs.org/](http://cffi.readthedocs.org/) for more info.
+ * Build Kaldi with `OpenBLAS` support and the `-fPIC` flag in `CXXFLAGS` or `EXTRA_CXXFLAGS` in the main Makefile.
+ * Before building Kaldi, build `OpenBLAS` and OpenFst by
+
+ ```sh
+ cd kaldi-trunk/tools
+ make openblas
+ ```
+
+ and
+
+```sh
+cd kaldi-trunk/tools
+# change line 37 of kaldi-trunk/tools/Makefile as in the following "patch",
+# i.e. switch from --disable-shared to --enable-shared
+*** Makefile
+************
+*** 34,38 ****
+
+openfst-1.3.2/Makefile: openfst-1.3.2/.patched
+	cd openfst-1.3.2/; \
+!	./configure --prefix=`pwd` --enable-static --disable-shared --enable-far --enable-ngram-fsts
+
+--- 34,38 ----
+
+openfst-1.3.2/Makefile: openfst-1.3.2/.patched
+	cd openfst-1.3.2/; \
+!	./configure --prefix=`pwd` --enable-static --enable-shared --enable-far --enable-ngram-fsts
+
+# and build it
+make openfst_tgt
+```
+
+
+Running and building examples
+-----------------------------
+
+To build the shared libraries and run the C test binaries:
+```sh
+make all
+```
+To run `run.py`, first specify where the shared libraries are, e.g.
+by running from `kaldi-trunk/src/python-kaldi-decoding`:
+
+```sh
+LD_LIBRARY_PATH=`pwd`/../../tools/OpenBLAS:`pwd`/../../tools/openfst/lib:`pwd` ./run.py
+```
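+
+For orientation, this is a condensed sketch of the pattern `run.py` uses to
+call one of the `*_like_main` entry points through *cffi* (the library and
+function names come from this directory; the rspecifier/wspecifier arguments
+below are placeholders):
+
+```py
+from cffi import FFI
+
+ffi = FFI()
+ffi.cdef('int compute_mfcc_feats_like_main(int argc, char **argv);')
+lib = ffi.dlopen('libkaldi-cffi.so')  # found via LD_LIBRARY_PATH set above
+
+# argv[0] is unused; the rest are ordinary command-line arguments.
+args = ['unused', 'scp:wav.scp', 'ark,scp:mfcc.ark,mfcc.scp']
+argv_keepalive = [ffi.new('char[]', a) for a in args]  # keep refs alive
+argv = ffi.new('char *[]', argv_keepalive)
+assert lib.compute_mfcc_feats_like_main(len(args), argv) == 0
+```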
+
+
+Remarks on linking
+------------------
+ * [How to use dlopen](http://www.isotton.com/devel/docs/C++-dlopen-mini-HOWTO/C++-dlopen-mini-HOWTO.html)
+ * [A slightly off-topic explanation on Stack Overflow](http://stackoverflow.com/questions/12762910/c-undefined-symbols-when-loading-shared-library-with-dlopen)
+ * [See "Missing the ATLAS implementation of CLAPACK"](http://kaldi.sourceforge.net/matrixwrap.html)
+ * I spent a lot of time getting the linking right:
+   I was linking the `lapack` libraries instead of `lapack_atlas`
+   and kept getting the error `undefined symbol: clapack_dgetrf`.
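+
+When chasing similar `undefined symbol` errors, it can help to inspect the
+built library directly with standard binutils tools (shown here on the
+library produced by this Makefile):
+
+```sh
+# Which shared libraries will be loaded at runtime?
+ldd libkaldi-cffi.so
+# Is the offending LAPACK symbol defined (T) or merely undefined (U)?
+nm -D libkaldi-cffi.so | grep clapack_dgetrf
+```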
"); + po.Register("vtln-warp", &vtln_warp, "Vtln warp factor (only applicable if vtln-map not specified)"); + po.Register("vtln-map", &vtln_map_rspecifier, "Map from utterance or speaker-id to vtln warp factor (rspecifier)"); + po.Register("utt2spk", &utt2spk_rspecifier, "Utterance to speaker-id map (if doing VTLN and you have warps per speaker)"); + po.Register("channel", &channel, "Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right)"); + po.Register("min-duration", &min_duration, "Minimum duration of segments to process (in seconds)."); + + // OPTION PARSING .......................................................... + // + + // parse options (+filling the registered variables) + po.Read(argc, argv); + + if (po.NumArgs() != 2) { + po.PrintUsage(); + exit(1); + } + + std::string wav_rspecifier = po.GetArg(1); + + std::string output_wspecifier = po.GetArg(2); + + Mfcc mfcc(mfcc_opts); + + SequentialTableReader reader(wav_rspecifier); + BaseFloatMatrixWriter kaldi_writer; // typedef to TableWriter. + TableWriter htk_writer; + + if (utt2spk_rspecifier != "") + KALDI_ASSERT(vtln_map_rspecifier != "" && "the utt2spk option is only " + "needed if the vtln-map option is used."); + RandomAccessBaseFloatReaderMapped vtln_map_reader(vtln_map_rspecifier, + utt2spk_rspecifier); + + if (output_format == "kaldi") { + if (!kaldi_writer.Open(output_wspecifier)) + KALDI_ERR << "Could not initialize output with wspecifier " + << output_wspecifier; + } else if (output_format == "htk") { + if (!htk_writer.Open(output_wspecifier)) + KALDI_ERR << "Could not initialize output with wspecifier " + << output_wspecifier; + } else { + KALDI_ERR << "Invalid output_format string " << output_format; + } + + int32 num_utts = 0, num_success = 0; + for (; !reader.Done(); reader.Next()) { + num_utts++; + std::string utt = reader.Key(); + const WaveData &wave_data = reader.Value(); + if (wave_data.Duration() < min_duration) { + KALDI_WARN << "File: " << utt << " is too short (" + << wave_data.Duration() << " sec): producing no output."; + continue; + } + int32 num_chan = wave_data.Data().NumRows(), this_chan = channel; + { // This block works out the channel (0=left, 1=right...) + KALDI_ASSERT(num_chan > 0); // should have been caught in + // reading code if no channels. + if (channel == -1) { + this_chan = 0; + if (num_chan != 1) + KALDI_WARN << "Channel not specified but you have data with " + << num_chan << " channels; defaulting to zero"; + } else { + if (this_chan >= num_chan) { + KALDI_WARN << "File with id " << utt << " has " + << num_chan << " channels but you specified channel " + << channel << ", producing no output."; + continue; + } + } + } + BaseFloat vtln_warp_local; // Work out VTLN warp factor. + if (vtln_map_rspecifier != "") { + if (!vtln_map_reader.HasKey(utt)) { + KALDI_WARN << "No vtln-map entry for utterance-id (or speaker-id) " + << utt; + continue; + } + vtln_warp_local = vtln_map_reader.Value(utt); + } else { + vtln_warp_local = vtln_warp; + } + if (mfcc_opts.frame_opts.samp_freq != wave_data.SampFreq()) + KALDI_ERR << "Sample frequency mismatch: you specified " + << mfcc_opts.frame_opts.samp_freq << " but data has " + << wave_data.SampFreq() << " (use --sample-frequency option)"; + + SubVector waveform(wave_data.Data(), this_chan); + Matrix features; + try { + mfcc.Compute(waveform, vtln_warp_local, &features, NULL); + } catch (...) 
+        KALDI_WARN << "Failed to compute features for utterance "
+                   << utt;
+        continue;
+      }
+      if (subtract_mean) {
+        Vector<BaseFloat> mean(features.NumCols());
+        mean.AddRowSumMat(1.0, features);
+        mean.Scale(1.0 / features.NumRows());
+        for (int32 i = 0; i < features.NumRows(); i++)
+          features.Row(i).AddVec(-1.0, mean);
+      }
+      if (output_format == "kaldi") {
+        kaldi_writer.Write(utt, features);
+      } else {
+        std::pair<Matrix<BaseFloat>, HtkHeader> p;
+        p.first.Resize(features.NumRows(), features.NumCols());
+        p.first.CopyFromMat(features);
+        HtkHeader header = {
+          features.NumRows(),
+          100000,  // 10ms shift
+          sizeof(float)*features.NumCols(),
+          006 |  // MFCC
+          (mfcc_opts.use_energy ? 0100 : 020000)  // energy; otherwise c0
+        };
+        p.second = header;
+        htk_writer.Write(utt, p);
+      }
+      if (num_utts % 10 == 0)
+        KALDI_LOG << "Processed " << num_utts << " utterances";
+      KALDI_VLOG(2) << "Processed features for key " << utt;
+      num_success++;
+    }
+    KALDI_LOG << " Done " << num_success << " out of " << num_utts
+              << " utterances.";
+    return (num_success != 0 ? 0 : 1);
+  } catch(const std::exception &e) {
+    std::cerr << e.what();
+    return -1;
+  }
+}
diff --git a/src/python-kaldi-decoding/compute-mfcc-feats.h b/src/python-kaldi-decoding/compute-mfcc-feats.h
new file mode 100644
index 00000000000..530bcf82c18
--- /dev/null
+++ b/src/python-kaldi-decoding/compute-mfcc-feats.h
@@ -0,0 +1,16 @@
+// -*- coding: utf-8 -*-
+
+#ifndef COMPUTE_MFCC_FEATS_H
+#define COMPUTE_MFCC_FEATS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int compute_mfcc_feats_like_main(int argc, char **argv);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // #ifndef COMPUTE_MFCC_FEATS_H
diff --git a/src/python-kaldi-decoding/compute-wer-test.c b/src/python-kaldi-decoding/compute-wer-test.c
new file mode 100644
index 00000000000..797b2651fe8
--- /dev/null
+++ b/src/python-kaldi-decoding/compute-wer-test.c
@@ -0,0 +1,5 @@
+#include "test_cffi_python_dyn.h"
+
+int main(int argc, char **argv) {
+  return testSharedLib("libkaldi-cffi.so", "compute_wer_like_main", argc, argv);
+}
diff --git a/src/python-kaldi-decoding/compute-wer.cc b/src/python-kaldi-decoding/compute-wer.cc
new file mode 100644
index 00000000000..76f694fbd9f
--- /dev/null
+++ b/src/python-kaldi-decoding/compute-wer.cc
@@ -0,0 +1,144 @@
+// bin/compute-wer.cc
+
+// Copyright 2009-2011  Microsoft Corporation
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
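+
+// As with the other files in this directory, this is a copy of a Kaldi
+// binary (bin/compute-wer.cc) with main() renamed to an extern "C"
+// *_like_main() entry point (declared in compute-wer.h) so that it can be
+// resolved with dlopen()/cffi from Python.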
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "util/parse-options.h"
+#include "tree/context-dep.h"
+#include "util/edit-distance.h"
+
+#include "compute-wer.h"
+
+
+int compute_wer_like_main(int argc, char *argv[]) {
+  using namespace kaldi;
+  typedef kaldi::int32 int32;
+
+  try {
+    const char *usage =
+        "Compute WER by comparing different transcriptions\n"
+        "Takes two transcription files, in kaldi integer format\n"
+        "Usage: compute-wer [options] <ref-rspecifier> <hyp-rspecifier>\n";
+    ParseOptions po(usage);
+
+    std::string mode = "strict";
+    bool text_input = false;  // if this is true, we expect symbols as strings,
+
+    po.Register("mode", &mode,
+                "Scoring mode: \"present\"|\"all\"|\"strict\":\n"
+                "  \"present\" means score those we have transcriptions for\n"
+                "  \"all\" means treat absent transcriptions as empty\n"
+                "  \"strict\" means die if all in ref not also in hyp");
+    po.Register("text", &text_input, "Expect strings, not integers, as input.");
+
+    po.Read(argc, argv);
+
+    if (po.NumArgs() != 2) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    std::string ref_rspecifier = po.GetArg(1);
+    std::string hyp_rspecifier = po.GetArg(2);
+
+    if (mode != "strict"
+        && mode != "present"
+        && mode != "all") {
+      KALDI_ERR << "--mode option invalid: expected \"present\"|\"all\"|\"strict\", got "
+                << mode;
+    }
+
+    int32 num_words = 0, word_errs = 0, num_sent = 0, sent_errs = 0,
+        num_absent_sents = 0, num_ins = 0, num_del = 0, num_sub = 0;
+
+    if (!text_input) {
+      SequentialInt32VectorReader ref_reader(ref_rspecifier);
+      RandomAccessInt32VectorReader hyp_reader(hyp_rspecifier);
+
+      for (; !ref_reader.Done(); ref_reader.Next()) {
+        std::string key = ref_reader.Key();
+        const std::vector<int32> &ref_sent = ref_reader.Value();
+        std::vector<int32> hyp_sent;
+        if (!hyp_reader.HasKey(key)) {
+          if (mode == "strict")
+            KALDI_ERR << "No hypothesis for key " << key << " and strict "
+                "mode specifier.";
+          num_absent_sents++;
+          if (mode == "present")  // do not score this one.
+            continue;
+        } else {
+          hyp_sent = hyp_reader.Value(key);
+        }
+        num_words += ref_sent.size();
+        int32 ins, del, sub;
+        word_errs += LevenshteinEditDistance(ref_sent, hyp_sent, &ins, &del, &sub);
+        num_ins += ins; num_del += del; num_sub += sub;
+
+        num_sent++;
+        sent_errs += (ref_sent != hyp_sent);
+      }
+    } else {
+      SequentialTokenVectorReader ref_reader(ref_rspecifier);
+      RandomAccessTokenVectorReader hyp_reader(hyp_rspecifier);
+
+      for (; !ref_reader.Done(); ref_reader.Next()) {
+        std::string key = ref_reader.Key();
+        const std::vector<std::string> &ref_sent = ref_reader.Value();
+        std::vector<std::string> hyp_sent;
+        if (!hyp_reader.HasKey(key)) {
+          if (mode == "strict")
+            KALDI_ERR << "No hypothesis for key " << key << " and strict "
+                "mode specifier.";
+          num_absent_sents++;
+          if (mode == "present")  // do not score this one.
+            continue;
+        } else {
+          hyp_sent = hyp_reader.Value(key);
+        }
+        num_words += ref_sent.size();
+        int32 ins, del, sub;
+        word_errs += LevenshteinEditDistance(ref_sent, hyp_sent, &ins, &del, &sub);
+        num_ins += ins; num_del += del; num_sub += sub;
+
+        num_sent++;
+        sent_errs += (ref_sent != hyp_sent);
+      }
+    }
+
+    BaseFloat percent_wer = 100.0 * static_cast<BaseFloat>(word_errs)
+        / static_cast<BaseFloat>(num_words);
+    std::cout.precision(2);
+    std::cerr.precision(2);
+    std::cout << "%WER " << percent_wer << " [ " << word_errs << " / "
+              << num_words << ", " << num_ins << " ins, " << num_del
+              << " del, " << num_sub << " sub ]" << std::endl;
+    BaseFloat percent_ser = 100.0 * static_cast<BaseFloat>(sent_errs)
+        / static_cast<BaseFloat>(num_sent);
+    std::cout << "%SER " << percent_ser << " [ " << sent_errs << " / "
+              << num_sent << " ]" << std::endl;
+    std::cout << "Scored " << num_sent << " sentences, " << num_absent_sents
+              << " not present in hyp." << std::endl;
+    return 0;
+  } catch(const std::exception &e) {
+    std::cerr << e.what();
+    return -1;
+  }
+}
diff --git a/src/python-kaldi-decoding/compute-wer.h b/src/python-kaldi-decoding/compute-wer.h
new file mode 100644
--- /dev/null
+++ b/src/python-kaldi-decoding/compute-wer.h
@@ -0,0 +1,16 @@
+// -*- coding: utf-8 -*-
+
+#ifndef COMPUTE_WER_H
+#define COMPUTE_WER_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int compute_wer_like_main(int argc, char **argv);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // #ifndef COMPUTE_WER_H
diff --git a/src/python-kaldi-decoding/gmm-latgen-faster-test.c b/src/python-kaldi-decoding/gmm-latgen-faster-test.c
new file mode 100644
--- /dev/null
+++ b/src/python-kaldi-decoding/gmm-latgen-faster-test.c
@@ -0,0 +1,5 @@
+#include "test_cffi_python_dyn.h"
+
+int main(int argc, char **argv) {
+  return testSharedLib("libkaldi-cffi.so", "gmm_latgen_faster_like_main", argc, argv);
+}
diff --git a/src/python-kaldi-decoding/gmm-latgen-faster.cc b/src/python-kaldi-decoding/gmm-latgen-faster.cc
new file mode 100644
--- /dev/null
+++ b/src/python-kaldi-decoding/gmm-latgen-faster.cc
@@ -0,0 +1,196 @@
+// gmmbin/gmm-latgen-faster.cc
+
+// Copyright 2009-2012  Microsoft Corporation
+//                      Johns Hopkins University (author: Daniel Povey)
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "gmm/am-diag-gmm.h"
+#include "hmm/transition-model.h"
+#include "fstext/fstext-lib.h"
+#include "decoder/lattice-faster-decoder.h"
+#include "decoder/decodable-am-diag-gmm.h"
+#include "util/timer.h"
+
+#include "gmm-latgen-faster.h"
+
+int gmm_latgen_faster_like_main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    typedef kaldi::int32 int32;
+    using fst::SymbolTable;
+    using fst::VectorFst;
+    using fst::StdArc;
+
+    const char *usage =
+        "Generate lattices using GMM-based model.\n"
+        "Usage: gmm-latgen-faster [options] model-in (fst-in|fsts-rspecifier) features-rspecifier"
+        " lattice-wspecifier [ words-wspecifier [alignments-wspecifier] ]\n";
+    ParseOptions po(usage);
+    Timer timer;
+    bool allow_partial = false;
+    BaseFloat acoustic_scale = 0.1;
+    LatticeFasterDecoderConfig config;
+
+    std::string word_syms_filename;
+    config.Register(&po);
+    po.Register("acoustic-scale", &acoustic_scale, "Scaling factor for acoustic likelihoods");
+    po.Register("word-symbol-table", &word_syms_filename, "Symbol table for words [for debug output]");
+    po.Register("allow-partial", &allow_partial, "If true, produce output even if end state was not reached.");
+
+    po.Read(argc, argv);
+
+    if (po.NumArgs() < 4 || po.NumArgs() > 6) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    std::string model_in_filename = po.GetArg(1),
+        fst_in_str = po.GetArg(2),
+        feature_rspecifier = po.GetArg(3),
+        lattice_wspecifier = po.GetArg(4),
+        words_wspecifier = po.GetOptArg(5),
+        alignment_wspecifier = po.GetOptArg(6);
+
+    TransitionModel trans_model;
+    AmDiagGmm am_gmm;
+    {
+      bool binary;
+      Input ki(model_in_filename, &binary);
+      trans_model.Read(ki.Stream(), binary);
+      am_gmm.Read(ki.Stream(), binary);
+    }
+
+    bool determinize = config.determinize_lattice;
+    CompactLatticeWriter compact_lattice_writer;
+    LatticeWriter lattice_writer;
+    if (! (determinize ?
compact_lattice_writer.Open(lattice_wspecifier) + : lattice_writer.Open(lattice_wspecifier))) + KALDI_ERR << "Could not open table for writing lattices: " + << lattice_wspecifier; + + Int32VectorWriter words_writer(words_wspecifier); + + Int32VectorWriter alignment_writer(alignment_wspecifier); + + fst::SymbolTable *word_syms = NULL; + if (word_syms_filename != "") + if (!(word_syms = fst::SymbolTable::ReadText(word_syms_filename))) + KALDI_ERR << "Could not read symbol table from file " + << word_syms_filename; + + double tot_like = 0.0; + kaldi::int64 frame_count = 0; + int num_done = 0, num_err = 0; + + if (ClassifyRspecifier(fst_in_str, NULL, NULL) == kNoRspecifier) { + SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier); + // Input FST is just one FST, not a table of FSTs. + VectorFst *decode_fst = NULL; + { + std::ifstream is(fst_in_str.c_str(), std::ifstream::binary); + if (!is.good()) KALDI_ERR << "Could not open decoding-graph FST " + << fst_in_str; + decode_fst = + VectorFst::Read(is, fst::FstReadOptions(fst_in_str)); + if (decode_fst == NULL) // fst code will warn. + exit(1); + } + + { + LatticeFasterDecoder decoder(*decode_fst, config); + + for (; !feature_reader.Done(); feature_reader.Next()) { + std::string utt = feature_reader.Key(); + Matrix features (feature_reader.Value()); + feature_reader.FreeCurrent(); + if (features.NumRows() == 0) { + KALDI_WARN << "Zero-length utterance: " << utt; + num_err++; + continue; + } + + DecodableAmDiagGmmScaled gmm_decodable(am_gmm, trans_model, features, + acoustic_scale); + + double like; + if (DecodeUtteranceLatticeFaster( + decoder, gmm_decodable, word_syms, utt, acoustic_scale, + determinize, allow_partial, &alignment_writer, &words_writer, + &compact_lattice_writer, &lattice_writer, &like)) { + tot_like += like; + frame_count += features.NumRows(); + num_done++; + } else num_err++; + } + } + delete decode_fst; // delete this only after decoder goes out of scope. + } else { // We have different FSTs for different utterances. 
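+      // Here fst_in_str is an rspecifier (e.g. "ark:HCLG.fsts"): a separate
+      // decoding graph is read for each utterance and matched to the
+      // features table by utterance key.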
+ SequentialTableReader fst_reader(fst_in_str); + RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier); + for (; !fst_reader.Done(); fst_reader.Next()) { + std::string utt = fst_reader.Key(); + if (!feature_reader.HasKey(utt)) { + KALDI_WARN << "Not decoding utterance " << utt + << " because no features available."; + num_err++; + continue; + } + const Matrix &features = feature_reader.Value(utt); + if (features.NumRows() == 0) { + KALDI_WARN << "Zero-length utterance: " << utt; + num_err++; + continue; + } + + LatticeFasterDecoder decoder(fst_reader.Value(), config); + DecodableAmDiagGmmScaled gmm_decodable(am_gmm, trans_model, features, + acoustic_scale); + double like; + if (DecodeUtteranceLatticeFaster( + decoder, gmm_decodable, word_syms, utt, acoustic_scale, + determinize, allow_partial, &alignment_writer, &words_writer, + &compact_lattice_writer, &lattice_writer, &like)) { + tot_like += like; + frame_count += features.NumRows(); + num_done++; + } else num_err++; + } + } + + double elapsed = timer.Elapsed(); + KALDI_LOG << "Time taken "<< elapsed + << "s: real-time factor assuming 100 frames/sec is " + << (elapsed*100.0/frame_count); + KALDI_LOG << "Done " << num_done << " utterances, failed for " + << num_err; + KALDI_LOG << "Overall log-likelihood per frame is " << (tot_like/frame_count) << " over " + << frame_count << " frames."; + + if (word_syms) delete word_syms; + if (num_done != 0) return 0; + else return 1; + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} diff --git a/src/python-kaldi-decoding/gmm-latgen-faster.h b/src/python-kaldi-decoding/gmm-latgen-faster.h new file mode 100644 index 00000000000..c5cd487275e --- /dev/null +++ b/src/python-kaldi-decoding/gmm-latgen-faster.h @@ -0,0 +1,16 @@ +// -*- coding: utf-8 -*- + +#ifndef GMM_LATGEN_FASTER_H +#define GMM_LATGEN_FASTER_H + +#ifdef __cplusplus +extern "C" { +#endif + +int gmm_latgen_faster_like_main(int argc, char **argv); + +#ifdef __cplusplus +} +#endif + +#endif // #ifndef GMM_LATGEN_FASTER_H diff --git a/src/python-kaldi-decoding/lattice-best-path-test.c b/src/python-kaldi-decoding/lattice-best-path-test.c new file mode 100644 index 00000000000..656052adc8e --- /dev/null +++ b/src/python-kaldi-decoding/lattice-best-path-test.c @@ -0,0 +1,5 @@ +#include "test_cffi_python_dyn.h" + +int main(int argc, char **argv) { + return testSharedLib("libkaldi-cffi.so", "lattice_best_path_like_main", argc, argv); +} diff --git a/src/python-kaldi-decoding/lattice-best-path.cc b/src/python-kaldi-decoding/lattice-best-path.cc new file mode 100644 index 00000000000..67b293f87fa --- /dev/null +++ b/src/python-kaldi-decoding/lattice-best-path.cc @@ -0,0 +1,136 @@ +// latbin/lattice-best-path.cc + +// Copyright 2009-2011 Microsoft Corporation + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
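+
+// In the run.py pipeline this is the scoring step: it reads the lattices
+// written by gmm_latgen_faster_like_main() and outputs 1-best transcriptions.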
+ + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "fstext/fstext-lib.h" +#include "lat/kaldi-lattice.h" +#include "lat/lattice-functions.h" + +#include "lattice-best-path.h" + +int lattice_best_path_like_main(int argc, char *argv[]) { + try { + using namespace kaldi; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + using fst::SymbolTable; + using fst::VectorFst; + using fst::StdArc; + + const char *usage = + "Generate 1-best path through lattices; output as transcriptions and alignments\n" + "Note: if you want output as FSTs, use lattice-1best; if you want output\n" + "with acoustic and LM scores, use lattice-1best | nbest-to-linear\n" + "Usage: lattice-best-path [options] lattice-rspecifier [ transcriptions-wspecifier [ alignments-wspecifier] ]\n" + " e.g.: lattice-best-path --acoustic-scale=0.1 ark:1.lats ark:1.tra ark:1.ali\n"; + + ParseOptions po(usage); + BaseFloat acoustic_scale = 1.0; + BaseFloat lm_scale = 1.0; + + std::string word_syms_filename; + po.Register("acoustic-scale", &acoustic_scale, "Scaling factor for acoustic likelihoods"); + po.Register("lm-scale", &lm_scale, "Scaling factor for LM probabilities. " + "Note: the ratio acoustic-scale/lm-scale is all that matters."); + po.Register("word-symbol-table", &word_syms_filename, "Symbol table for words [for debug output]"); + + po.Read(argc, argv); + + if (po.NumArgs() < 1 || po.NumArgs() > 3) { + po.PrintUsage(); + exit(1); + } + + std::string lats_rspecifier = po.GetArg(1), + transcriptions_wspecifier = po.GetOptArg(2), + alignments_wspecifier = po.GetOptArg(3); + + SequentialCompactLatticeReader clat_reader(lats_rspecifier); + + Int32VectorWriter transcriptions_writer(transcriptions_wspecifier); + + Int32VectorWriter alignments_writer(alignments_wspecifier); + + fst::SymbolTable *word_syms = NULL; + if (word_syms_filename != "") + if (!(word_syms = fst::SymbolTable::ReadText(word_syms_filename))) + KALDI_ERR << "Could not read symbol table from file " + << word_syms_filename; + + + int32 n_done = 0, n_fail = 0; + int64 n_frame = 0; + LatticeWeight tot_weight = LatticeWeight::One(); + + for (; !clat_reader.Done(); clat_reader.Next()) { + std::string key = clat_reader.Key(); + CompactLattice clat = clat_reader.Value(); + clat_reader.FreeCurrent(); + fst::ScaleLattice(fst::LatticeScale(lm_scale, acoustic_scale), &clat); + CompactLattice clat_best_path; + CompactLatticeShortestPath(clat, &clat_best_path); // A specialized + // implementation of shortest-path for CompactLattice. 
+ Lattice best_path; + ConvertLattice(clat_best_path, &best_path); + if (best_path.Start() == fst::kNoStateId) { + KALDI_WARN << "Best-path failed for key " << key; + n_fail++; + } else { + std::vector alignment; + std::vector words; + LatticeWeight weight; + GetLinearSymbolSequence(best_path, &alignment, &words, &weight); + KALDI_LOG << "For utterance " << key << ", best cost " + << weight.Value1() << " + " << weight.Value2() << " = " + << (weight.Value1() + weight.Value2()); + if (transcriptions_wspecifier != "") + transcriptions_writer.Write(key, words); + if (alignments_wspecifier != "") + alignments_writer.Write(key, alignment); + if (word_syms != NULL) { + std::cerr << key << ' '; + for (size_t i = 0; i < words.size(); i++) { + std::string s = word_syms->Find(words[i]); + if (s == "") + KALDI_ERR << "Word-id " << words[i] <<" not in symbol table."; + std::cerr << s << ' '; + } + std::cerr << '\n'; + } + n_done++; + n_frame += alignment.size(); + tot_weight = Times(tot_weight, weight); + } + } + + BaseFloat tot_weight_float = tot_weight.Value1() + tot_weight.Value2(); + KALDI_LOG << "Overall score per frame is " << (tot_weight_float/n_frame) + << " = " << (tot_weight.Value1()/n_frame) << " [graph]" + << " + " << (tot_weight.Value2()/n_frame) << " [acoustic]" + << " over " << n_frame << " frames."; + KALDI_LOG << "Done " << n_done << " lattices, failed for " << n_fail; + + if (word_syms) delete word_syms; + if (n_done != 0) return 0; + else return 1; + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} diff --git a/src/python-kaldi-decoding/lattice-best-path.h b/src/python-kaldi-decoding/lattice-best-path.h new file mode 100644 index 00000000000..d810489ff11 --- /dev/null +++ b/src/python-kaldi-decoding/lattice-best-path.h @@ -0,0 +1,16 @@ +// -*- coding: utf-8 -*- + +#ifndef LATTICE_BEST_PATH_H +#define LATTICE_BEST_PATH_H + +#ifdef __cplusplus +extern "C" { +#endif + +int lattice_best_path_like_main(int argc, char **argv); + +#ifdef __cplusplus +} +#endif + +#endif // #ifndef LATTICE_BEST_PATH_H diff --git a/src/python-kaldi-decoding/little_wavs_data_void_en.scp b/src/python-kaldi-decoding/little_wavs_data_void_en.scp new file mode 100644 index 00000000000..0a4368fd499 --- /dev/null +++ b/src/python-kaldi-decoding/little_wavs_data_void_en.scp @@ -0,0 +1,4 @@ +fj228x-001-100517_171607_0001813_0001882.wav ./data_voip_en/test/fj228x-001-100517_171607_0001813_0001882.wav +fj228x-001-100517_182933_0001674_0002000.wav ./data_voip_en/test/fj228x-001-100517_182933_0001674_0002000.wav +fj228x-001-100517_183334_0001637_0001921.wav ./data_voip_en/test/fj228x-001-100517_183334_0001637_0001921.wav +fj228x-001-100517_200151_0002054_0002337.wav ./data_voip_en/test/fj228x-001-100517_200151_0002054_0002337.wav diff --git a/src/python-kaldi-decoding/online-wav-gmm-decode-faster-test.c b/src/python-kaldi-decoding/online-wav-gmm-decode-faster-test.c new file mode 100644 index 00000000000..ac035502c03 --- /dev/null +++ b/src/python-kaldi-decoding/online-wav-gmm-decode-faster-test.c @@ -0,0 +1,5 @@ +#include "test_cffi_python_dyn.h" + +int main(int argc, char **argv) { + return testSharedLib("libkaldi-cffi.so", "online_wav_gmm_decode_faster_like_main", argc, argv); +} diff --git a/src/python-kaldi-decoding/online-wav-gmm-decode-faster.cc b/src/python-kaldi-decoding/online-wav-gmm-decode-faster.cc new file mode 100644 index 00000000000..de3261fb8d6 --- /dev/null +++ b/src/python-kaldi-decoding/online-wav-gmm-decode-faster.cc @@ -0,0 +1,247 @@ +// 
onlinebin/online-wav-gmm-decode-faster.cc + +// Copyright 2012 Cisco Systems (author: Matthias Paulik) + +// Modifications to the original contribution by Cisco Systems made by: +// Vassil Panayotov + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "feat/feature-mfcc.h" +#include "feat/wave-reader.h" +#include "online/online-audio-source.h" +#include "online/online-feat-input.h" +#include "online/online-decodable.h" +#include "online/online-faster-decoder.h" +#include "online/onlinebin-util.h" + +#include "online-wav-gmm-decode-faster.h" + +int online_wav_gmm_decode_faster_like_main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace fst; + + typedef kaldi::int32 int32; + typedef OnlineFeInput FeInput; + + // up to delta-delta derivative features are calculated (unless LDA is used) + const int32 kDeltaOrder = 2; + + const char *usage = + "Reads in wav file(s) and simulates online decoding.\n" + "Writes .tra and .ali files for WER computation. Utterance " + "segmentation is done on-the-fly.\n" + "Feature splicing/LDA transform is used, if the optional(last) argument " + "is given.\n" + "Otherwise delta/delta-delta(i.e. 2-nd order) features are produced.\n" + "Caution: the last few frames of the wav file may not be decoded properly.\n" + "Hence, don't use one wav file per utterance, but " + "rather use one wav file per show.\n\n" + "Usage: ./online-wav-gmm-decode-faster [options] wav-rspecifier model-in" + "fst-in word-symbol-table silence-phones transcript-wspecifier " + "alignments-wspecifier [lda-matrix-in]\n\n" + "Example: ./online-wav-gmm-decode-faster --rt-min=0.3 --rt-max=0.5 " + "--max-active=4000 --beam=12.0 --acoustic-scale=0.0769 " + "scp:wav.scp model HCLG.fst words.txt '1:2:3:4:5' ark,t:trans.txt ark,t:ali.txt"; + ParseOptions po(usage); + BaseFloat acoustic_scale = 0.1; + int32 cmn_window = 600, + min_cmn_window = 100; // adds 1 second latency, only at utterance start. + int32 channel = -1; + int32 right_context = 4, left_context = 4; + + OnlineFasterDecoderOpts decoder_opts; + decoder_opts.Register(&po, true); + OnlineFeatureMatrixOptions feature_reading_opts; + feature_reading_opts.Register(&po); + + po.Register("left-context", &left_context, "Number of frames of left context"); + po.Register("right-context", &right_context, "Number of frames of right context"); + po.Register("acoustic-scale", &acoustic_scale, + "Scaling factor for acoustic likelihoods"); + po.Register("cmn-window", &cmn_window, + "Number of feat. 
vectors used in the running average CMN calculation"); + po.Register("min-cmn-window", &min_cmn_window, + "Minumum CMN window used at start of decoding (adds " + "latency only at start)"); + po.Register("channel", &channel, + "Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right)"); + po.Read(argc, argv); + if (po.NumArgs() != 7 && po.NumArgs() != 8) { + po.PrintUsage(); + return 1; + } + if (po.NumArgs() == 7) + if (left_context % kDeltaOrder != 0 || left_context != right_context) + KALDI_ERR << "Invalid left/right context parameters!"; + + std::string wav_rspecifier = po.GetArg(1), + model_rspecifier = po.GetArg(2), + fst_rspecifier = po.GetArg(3), + word_syms_filename = po.GetArg(4), + silence_phones_str = po.GetArg(5), + words_wspecifier = po.GetArg(6), + alignment_wspecifier = po.GetArg(7), + lda_mat_rspecifier = po.GetOptArg(8); + + std::vector silence_phones; + if (!SplitStringToIntegers(silence_phones_str, ":", false, &silence_phones)) + KALDI_ERR << "Invalid silence-phones string " << silence_phones_str; + if (silence_phones.empty()) + KALDI_ERR << "No silence phones given!"; + + Int32VectorWriter words_writer(words_wspecifier); + Int32VectorWriter alignment_writer(alignment_wspecifier); + + Matrix lda_transform; + if (lda_mat_rspecifier != "") { + bool binary_in; + Input ki(lda_mat_rspecifier, &binary_in); + lda_transform.Read(ki.Stream(), binary_in); + } + + TransitionModel trans_model; + AmDiagGmm am_gmm; + { + bool binary; + Input ki(model_rspecifier, &binary); + trans_model.Read(ki.Stream(), binary); + am_gmm.Read(ki.Stream(), binary); + } + + fst::SymbolTable *word_syms = NULL; + if (!(word_syms = fst::SymbolTable::ReadText(word_syms_filename))) + KALDI_ERR << "Could not read symbol table from file " + << word_syms_filename; + + fst::Fst *decode_fst = ReadDecodeGraph(fst_rspecifier); + + // We are not properly registering/exposing MFCC and frame extraction options, + // because there are parts of the online decoding code, where some of these + // options are hardwired(ToDo: we should fix this at some point) + MfccOptions mfcc_opts; + mfcc_opts.use_energy = false; + int32 frame_length = mfcc_opts.frame_opts.frame_length_ms = 25; + int32 frame_shift = mfcc_opts.frame_opts.frame_shift_ms = 10; + + int32 window_size = right_context + left_context + 1; + decoder_opts.batch_size = std::max(decoder_opts.batch_size, window_size); + + OnlineFasterDecoder decoder(*decode_fst, decoder_opts, + silence_phones, trans_model); + SequentialTableReader reader(wav_rspecifier); + VectorFst out_fst; + for (; !reader.Done(); reader.Next()) { + std::string wav_key = reader.Key(); + std::cerr << "File: " << wav_key << std::endl; + const WaveData &wav_data = reader.Value(); + if(wav_data.SampFreq() != 16000) + KALDI_ERR << "Sampling rates other than 16kHz are not supported!"; + int32 num_chan = wav_data.Data().NumRows(), this_chan = channel; + { // This block works out the channel (0=left, 1=right...) + KALDI_ASSERT(num_chan > 0); // should have been caught in + // reading code if no channels. 
+ if (channel == -1) { + this_chan = 0; + if (num_chan != 1) + KALDI_WARN << "Channel not specified but you have data with " + << num_chan << " channels; defaulting to zero"; + } else { + if (this_chan >= num_chan) { + KALDI_WARN << "File with id " << wav_key << " has " + << num_chan << " channels but you specified channel " + << channel << ", producing no output."; + continue; + } + } + } + OnlineVectorSource au_src(wav_data.Data().Row(this_chan)); + Mfcc mfcc(mfcc_opts); + FeInput fe_input(&au_src, &mfcc, + frame_length*(wav_data.SampFreq()/1000), + frame_shift*(wav_data.SampFreq()/1000)); + OnlineCmnInput cmn_input(&fe_input, cmn_window, min_cmn_window); + OnlineFeatInputItf *feat_transform = 0; + if (lda_mat_rspecifier != "") { + feat_transform = new OnlineLdaInput( + &cmn_input, lda_transform, + left_context, right_context); + } else { + DeltaFeaturesOptions opts; + opts.order = kDeltaOrder; + // Note from Dan: keeping the next statement for back-compatibility, + // but I don't think this is really the right way to set the window-size + // in the delta computation: it should be a separate config. + opts.window = left_context / 2; + feat_transform = new OnlineDeltaInput(opts, &cmn_input); + } + + // feature_reading_opts contains timeout, batch size. + OnlineFeatureMatrix feature_matrix(feature_reading_opts, + feat_transform); + + OnlineDecodableDiagGmmScaled decodable(am_gmm, trans_model, acoustic_scale, + &feature_matrix); + int32 start_frame = 0; + bool partial_res = false; + while (1) { + OnlineFasterDecoder::DecodeState dstate = decoder.Decode(&decodable); + if (dstate & (decoder.kEndFeats | decoder.kEndUtt)) { + std::vector word_ids; + decoder.FinishTraceBack(&out_fst); + fst::GetLinearSymbolSequence(out_fst, + static_cast *>(0), + &word_ids, + static_cast(0)); + PrintPartialResult(word_ids, word_syms, partial_res || word_ids.size()); + partial_res = false; + + decoder.GetBestPath(&out_fst); + std::vector tids; + fst::GetLinearSymbolSequence(out_fst, + &tids, + &word_ids, + static_cast(0)); + std::stringstream res_key; + res_key << wav_key << '_' << start_frame << '-' << decoder.frame(); + if (!word_ids.empty()) + words_writer.Write(res_key.str(), word_ids); + alignment_writer.Write(res_key.str(), tids); + if (dstate == decoder.kEndFeats) + break; + start_frame = decoder.frame(); + } else { + std::vector word_ids; + if (decoder.PartialTraceback(&out_fst)) { + fst::GetLinearSymbolSequence(out_fst, + static_cast *>(0), + &word_ids, + static_cast(0)); + PrintPartialResult(word_ids, word_syms, false); + if (!partial_res) + partial_res = (word_ids.size() > 0); + } + } + } + if (feat_transform) delete feat_transform; + } + if (word_syms) delete word_syms; + if (decode_fst) delete decode_fst; + return 0; + } catch(const std::exception& e) { + std::cerr << e.what(); + return -1; + } +} // main() diff --git a/src/python-kaldi-decoding/online-wav-gmm-decode-faster.h b/src/python-kaldi-decoding/online-wav-gmm-decode-faster.h new file mode 100644 index 00000000000..334bbdb8cd3 --- /dev/null +++ b/src/python-kaldi-decoding/online-wav-gmm-decode-faster.h @@ -0,0 +1,17 @@ + +// -*- coding: utf-8 -*- + +#ifndef ONLINE_WAV_GMM_DECODE_FASTER_H +#define ONLINE_WAV_GMM_DECODE_FASTER_H + +#ifdef __cplusplus +extern "C" { +#endif + +int online_wav_gmm_decode_faster_like_main(int argc, char *argv[]) ; + +#ifdef __cplusplus +} +#endif + +#endif // #ifndef ONLINE_WAV_GMM_DECODE_FASTER_H diff --git a/src/python-kaldi-decoding/ordereddefaultdict.py b/src/python-kaldi-decoding/ordereddefaultdict.py new file 
mode 100644 index 00000000000..085836792ae --- /dev/null +++ b/src/python-kaldi-decoding/ordereddefaultdict.py @@ -0,0 +1,46 @@ +""" +Combine functionality from ordered and default dict. +The implementation is taken from: +http://stackoverflow.com/questions/6190331/can-i-do-an-ordered-default-dict-in-python +""" +from collections import OrderedDict, Callable + + +class DefaultOrderedDict(OrderedDict): + def __init__(self, default_factory=None, *a, **kw): + if (default_factory is not None and not isinstance(default_factory, Callable)): + raise TypeError('first argument must be callable') + OrderedDict.__init__(self, *a, **kw) + self.default_factory = default_factory + + def __getitem__(self, key): + try: + return OrderedDict.__getitem__(self, key) + except KeyError: + return self.__missing__(key) + + def __missing__(self, key): + if self.default_factory is None: + raise KeyError(key) + self[key] = value = self.default_factory() + return value + + def __reduce__(self): + if self.default_factory is None: + args = tuple() + else: + args = self.default_factory, + return type(self), args, None, None, self.items() + + def copy(self): + return self.__copy__() + + def __copy__(self): + return type(self)(self.default_factory, self) + + def __deepcopy__(self, memo): + import copy + return type(self)(self.default_factory, copy.deepcopy(self.items())) + + def __repr__(self): + return 'OrderedDefaultDict(%s, %s)' % (self.default_factory, OrderedDict.__repr__(self)) diff --git a/src/python-kaldi-decoding/run.py b/src/python-kaldi-decoding/run.py new file mode 100755 index 00000000000..e49dceb15ac --- /dev/null +++ b/src/python-kaldi-decoding/run.py @@ -0,0 +1,291 @@ +#!/usr/bin/env python +from cffi import FFI +from collections import namedtuple +# import sys +import os +import errno +from ordereddefaultdict import DefaultOrderedDict +from subprocess import check_output + +cwd = os.path.abspath(os.path.curdir) + +MfccParams = namedtuple( + 'MfccParams', ['mfcc_dir', 'mfcc_config', 'wav_scp', 'mfcc_ark', 'mfcc_scp']) +LatgenParams = namedtuple( + 'LatgenParams', ['decode_dir', 'max_active', 'beam', 'latbeam', 'acoustic_scale', 'wst', 'model', + 'hclg', 'utt2spk', 'cmvn_scp', 'feats_scp', 'lattice_arch']) +BestPathParams = namedtuple('BestPathParams', ['lm_scale', 'wst', 'lattice_arch', 'trans']) +WerParams = namedtuple('WerParams', ['reference', 'hypothesis']) +OnlineParams = namedtuple( + 'OnlineParams', ['decode_dir', 'rt_min', 'rt_max', 'max_active', 'beam', 'acoustic_scale', + 'wav_scp', 'wst', 'model', 'hclg', 'trans', 'align']) + + +class CffiKaldiError(Exception): + def __init__(self, retcode): + self.retcode = retcode + + def __str__(self): + return 'CffiKaldi with return code: %s' % repr(self.retcode) + + +def mymkdir(path): + try: + os.makedirs(path) + except OSError as exception: + if exception.errno != errno.EEXIST: + raise + + +def run_mfcc(ffi, mfcclib, mfccPar): + '''Settings and arguments based on /ha/work/people/oplatek/kaldi-trunk/egs/kaldi- + vystadial-recipe/s5/steps/make_mfcc.sh''' + mymkdir(mfccPar.mfcc_dir) + mfcc_args = ['mfcc_unused', '--verbose=2', + '--config=%s' % mfccPar.mfcc_config, + 'scp:%s' % mfccPar.wav_scp, + 'ark,scp:%(mfcc_ark)s,%(mfcc_scp)s' % mfccPar.__dict__] + + try: + mfcc_argkeepalive = [ffi.new("char[]", arg) for arg in mfcc_args] + mfcc_argv = ffi.new("char *[]", mfcc_argkeepalive) + retcode = mfcclib.compute_mfcc_feats_like_main( + len(mfcc_args), mfcc_argv) + if retcode != 0: + raise CffiKaldiError(retcode) + return mfccPar.mfcc_scp + except Exception as e: + 
print 'Failed running mfcc!'
+        print e
+        raise
+
+
+def run_decode(ffi, decodelib, latgenPar):
+    '''Settings and arguments based on /ha/work/people/oplatek/kaldi-trunk/egs/kaldi-
+    vystadial-recipe/s5/steps/decode.sh'''
+    mymkdir(latgenPar.decode_dir)
+    # feats for delta, not LDA
+    decode_args = ['decode_unused', '--max-active=%s' % latgenPar.max_active,
+                   '--beam=%s' % latgenPar.beam,
+                   '--lattice-beam=%s' % latgenPar.latbeam,
+                   '--acoustic-scale=%s' % latgenPar.acoustic_scale,
+                   '--allow-partial=true',
+                   '--word-symbol-table=%s' % latgenPar.wst,
+                   latgenPar.model,
+                   latgenPar.hclg,
+                   'ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:%(utt2spk)s scp:%(cmvn_scp)s scp:%(feats_scp)s ark:- | add-deltas ark:- ark:- |' % latgenPar.__dict__,
+                   'ark:|gzip -c > %s' % latgenPar.lattice_arch]
+
+    try:
+        decode_argkeepalive = [ffi.new("char[]", arg) for arg in decode_args]
+        decode_argv = ffi.new("char *[]", decode_argkeepalive)
+        retcode = decodelib.gmm_latgen_faster_like_main(
+            len(decode_args), decode_argv)
+        if retcode != 0:
+            raise CffiKaldiError(retcode)
+        print 'Running decode finished!'
+        return latgenPar.lattice_arch
+    except Exception as e:
+        print 'Failed running decode!'
+        print e
+        raise
+
+
+def run_bestpath(ffi, bestpathlib, bpPar):
+    ''' Settings and arguments based on /ha/work/people/oplatek/kaldi-trunk/egs/kaldi-
+    vystadial-recipe/s5/local/score.sh'''
+    bestpath_args = ['bestpath_unused', '--lm-scale=%s' % bpPar.lm_scale,
+                     '--word-symbol-table=%s' % bpPar.wst,
+                     'ark:gunzip -c %s|' % bpPar.lattice_arch,
+                     'ark,t:%s' % bpPar.trans]
+    try:
+        bestpath_argkeepalive = [ffi.new("char[]", arg)
+                                 for arg in bestpath_args]
+        bestpath_argv = ffi.new("char *[]", bestpath_argkeepalive)
+        retcode = bestpathlib.lattice_best_path_like_main(
+            len(bestpath_args), bestpath_argv)
+        if retcode != 0:
+            raise CffiKaldiError(retcode)
+        return bpPar.trans
+    except Exception as e:
+        print 'Failed running bestpath!'
+        print e
+        raise
+
+
+def computeWer(ffi, werlib, werPar):
+    '''Settings and arguments based on /ha/work/people/oplatek/kaldi-trunk/egs/kaldi-
+    vystadial-recipe/s5/local/score.sh
+    | compute-wer --text --mode=present ark:exp/tri2a/decode/scoring/test_filt.txt ark,p:- >&
+    exp/tri2a/decode/wer_15'''
+
+    wer_args = ['wer_unused', '--text',
+                '--mode=present',
+                'ark:%s' % werPar.reference,
+                'ark:%s' % werPar.hypothesis]
+    try:
+        wer_argkeepalive = [ffi.new("char[]", arg) for arg in wer_args]
+        wer_argv = ffi.new("char *[]", wer_argkeepalive)
+        retcode = werlib.compute_wer_like_main(len(wer_args), wer_argv)
+        if retcode != 0:
+            raise CffiKaldiError(retcode)
+    except Exception as e:
+        print 'Failed running compute_wer!'
+        print e
+        raise
+
+
+def buildReference(wav_scp, ref_path):
+    with open(ref_path, 'w') as w:
+        with open(wav_scp, 'r') as scp:
+            for line in scp:
+                name, wavpath = line.strip().split(' ', 1)
+                with open(wavpath + '.trn') as trn:
+                    trans = trn.read()
+                w.write('%s %s\n' % (name, trans))
+
+
+def int2txt(trans_path, trans_path_txt, wst, sym_OOV='<UNK>'):
+    ''' based on: cat exp/tri2a/decode/scoring/15.tra | utils/int2sym.pl -f 2-
+    exp/tri2a/graph/words.txt | sed s:\<UNK\>::g'''
+    with open(trans_path, 'rb') as r:
+        with open(trans_path_txt, 'wb') as w:
+            out = check_output(['utils/int2sym.pl', '-f', '2-', wst], stdin=r)
+            noUNK = out.replace(sym_OOV, '')
+            w.write(noUNK)
+
+
+def run_online(ffi, onlinelib, onlinePar):
+    ''' Based on kaldi-trunk/egs/voxforge/online_demo/run.sh'''
+    mymkdir(onlinePar.decode_dir)
+    online_args = ['online_unused',
+                   '--verbose=1',
+                   '--rt-min=%s' % onlinePar.rt_min,
+                   '--rt-max=%s' % onlinePar.rt_max,
+                   '--max-active=%s' % onlinePar.max_active,
+                   '--beam=%s' % onlinePar.beam,
+                   '--acoustic-scale=%s' % onlinePar.acoustic_scale,
+                   'scp:%s' % onlinePar.wav_scp,
+                   onlinePar.model, onlinePar.hclg,
+                   onlinePar.wst, '1:2:3:4:5',
+                   'ark,t:%s' % onlinePar.trans,
+                   'ark,t:%s' % onlinePar.align]
+    try:
+        online_argkeepalive = [ffi.new("char[]", arg) for arg in online_args]
+        online_argv = ffi.new("char *[]", online_argkeepalive)
+        retcode = onlinelib.online_wav_gmm_decode_faster_like_main(
+            len(online_args), online_argv)
+        if retcode != 0:
+            raise CffiKaldiError(retcode)
+        return onlinePar.trans
+    except Exception as e:
+        print 'Failed running online!'
+        print e
+        raise
+
+
+def compactHyp(hyp_path, comp_hyp_path):
+    d = DefaultOrderedDict(list)
+    with open(hyp_path, 'r') as hyp:
+        for line in hyp:
+            name_, align_dec = line.strip().split('wav_')
+            name, dec = name_ + 'wav', align_dec.strip().split()[1:]
+            d[name].extend(dec)
+    with open(comp_hyp_path, 'w') as w:
+        for wav, dec_list in d.iteritems():
+            w.write('%s %s\n' % (wav, ' '.join(dec_list)))
+
+if __name__ == '__main__':
+    ffi = FFI()
+
+    # FIXME check if preprocessor directives work in cffi
+    # with open('../base/kaldi-types.h', 'r') as r:
+    #     int_header = r.read()
+    # ffi.cdef(int_header)
+
+    header = '''
+    int compute_mfcc_feats_like_main(int argc, char **argv);
+    int gmm_latgen_faster_like_main(int argc, char **argv);
+    int lattice_best_path_like_main(int argc, char **argv);
+    int compute_wer_like_main(int argc, char **argv);
+    int online_wav_gmm_decode_faster_like_main(int argc, char *argv[]);
+    '''
+    ffi.cdef(header)
+    s5_dir = '../../egs/kaldi-vystadial-recipe/s5'
+    exp_dir = s5_dir + '/Results/exp_6_aa7263b3f5c151409a87e3d845d58e39335a4f0c'
+    data_dir = s5_dir + '/Results/data_6_aa7263b3f5c151409a87e3d845d58e39335a4f0c'
+    decodedir = cwd + '/decode'
+    try:
+        lib = ffi.dlopen('libkaldi-cffi.so')
+
+        mfccPar = MfccParams(
+            mfcc_dir='mfcc',
+            mfcc_config=s5_dir + '/conf/mfcc.conf',
+            wav_scp='little_wavs_data_void_en.scp',
+            mfcc_ark='mfcc/raw_mfcc.ark',
+            mfcc_scp='mfcc/raw_mfcc.scp')
+        run_mfcc(ffi, lib, mfccPar)
+        print 'running mfcc finished'
+
+        latgenPar = LatgenParams(
+            decode_dir=decodedir,
+            max_active='7000',
+            beam='13.0',
+            latbeam='6.0',
+            acoustic_scale='0.083333',
+            wst=exp_dir + '/tri2a/graph/words.txt',
+            model=exp_dir + '/tri2a/final.mdl',
+            hclg=exp_dir + '/tri2a/graph/HCLG.fst',
+            utt2spk=data_dir + '/test/utt2spk',
+            # TODO create the version of mfcc dir and change paths in cmvn!
+            cmvn_scp=data_dir + '/test/cmvn.scp',
+            feats_scp=mfccPar.mfcc_scp,
+            lattice_arch=decodedir + '/lat.gz')
+        run_decode(ffi, lib, latgenPar)
+        print 'running decode finished'
+
+        bpPar = BestPathParams(
+            lm_scale='15',
+            wst=latgenPar.wst,
+            lattice_arch=latgenPar.lattice_arch,
+            trans=latgenPar.decode_dir + '/trans')
+        run_bestpath(ffi, lib, bpPar)
+        print 'running bestpath finished'
+
+        onlinePar = OnlineParams(
+            decode_dir=decodedir,
+            rt_min='0.8',
+            rt_max='0.85',
+            max_active='4000',
+            beam='12.0',
+            acoustic_scale='0.0769',
+            wav_scp=mfccPar.wav_scp,
+            wst=latgenPar.wst,
+            model=latgenPar.model,
+            hclg=latgenPar.hclg,
+            trans=decodedir + '/online_trans',
+            align=decodedir + '/online_align')
+        run_online(ffi, lib, onlinePar)
+
+        ### Evaluating experiments
+        ref = decodedir + '/reference.txt'
+        buildReference(mfccPar.wav_scp, ref)
+
+        # Evaluate latgen decoding
+        lat_trans_text = bpPar.trans + '.txt'
+        int2txt(bpPar.trans, lat_trans_text, latgenPar.wst)
+        lat_werPar = WerParams(hypothesis=lat_trans_text, reference=ref)
+        computeWer(ffi, lib, lat_werPar)
+        print 'running WER for latgen finished'
+
+        # Evaluate online decoding
+        onl_transtxttmp, onl_transtxt = onlinePar.trans + '.tmp', onlinePar.trans + '.txt'
+        int2txt(onlinePar.trans, onl_transtxttmp, onlinePar.wst)
+        compactHyp(onl_transtxttmp, onl_transtxt)
+        onl_werPar = WerParams(hypothesis=onl_transtxt, reference=ref)
+        computeWer(ffi, lib, onl_werPar)
+        print 'running WER for online finished'
+    except OSError as e:
+        print 'Maybe you forgot to set LD_LIBRARY_PATH?'
+        print e
+        raise
diff --git a/src/python-kaldi-decoding/test_cffi_python_dyn.h b/src/python-kaldi-decoding/test_cffi_python_dyn.h
new file mode 100644
index 00000000000..668e6b0d590
--- /dev/null
+++ b/src/python-kaldi-decoding/test_cffi_python_dyn.h
@@ -0,0 +1,39 @@
+#ifndef TEST_CFFI_PYTHON_H
+#define TEST_CFFI_PYTHON_H
+#include <stdio.h>
+#include <dlfcn.h>
+
+
+/** Links ******
+ - http://www.isotton.com/devel/docs/C++-dlopen-mini-HOWTO/C++-dlopen-mini-HOWTO.html
+ - http://stackoverflow.com/questions/12762910/c-undefined-symbols-when-loading-shared-library-with-dlopen
+ - http://kaldi.sourceforge.net/matrixwrap.html  # see "Missing the ATLAS implementation of (parts of) CLAPACK"
+ - you have to choose lapack_atlas / lapack / clapack .. check the symbols
+
+**********************/
+
+typedef int (*f_t)(int c, char **ar);
+
+int testSharedLib(char *nameLib, char *nameFce, int argc, char **argv) {
+  void *lib = dlopen(nameLib, RTLD_NOW);
+  if (!lib) {
+    printf("Cannot open library: %s\n", dlerror());
+    return 1;
+  }
+
+  dlerror();  // reset errors
+  f_t f = (f_t)dlsym(lib, nameFce);
+  const char *dlsym_error = dlerror();
+  if (dlsym_error) {
+    printf("Cannot load symbol '%s', %s\n", nameFce, dlsym_error);
+    dlclose(lib);
+    return 1;
+  }
+
+  // using the function
+  int retval = f(argc, argv);
+
+  dlclose(lib);
+  return retval;
+}
+#endif  // #ifndef TEST_CFFI_PYTHON_H
diff --git a/src/vystadial-decoder/.ycm_extra_conf.py b/src/vystadial-decoder/.ycm_extra_conf.py
new file mode 100644
index 00000000000..8b835f3de42
--- /dev/null
+++ b/src/vystadial-decoder/.ycm_extra_conf.py
@@ -0,0 +1,145 @@
+# This file is NOT licensed under the GPLv3, which is the license for the rest
+# of YouCompleteMe.
+#
+# Here's the license text for this file:
+#
+# This is free and unencumbered software released into the public domain.
+# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# +# For more information, please refer to + +import os +import ycm_core +from clang_helpers import PrepareClangFlags + +# These are the compilation flags that will be used in case there's no +# compilation database set (by default, one is not set). +# CHANGE THIS LIST OF FLAGS. YES, THIS IS THE DROID YOU HAVE BEEN LOOKING FOR. +flags = [ + '-Wall', + '-Wextra', + '-Werror', + '-Wc++98-compat', + '-Wno-long-long', + '-Wno-variadic-macros', + '-fexceptions', + '-DNDEBUG', + '-DUSE_CLANG_COMPLETER', + # THIS IS IMPORTANT! Without a "-std=" flag, clang won't know which + # language to use when compiling headers. So it will guess. Badly. So C++ + # headers will be compiled as C headers. You don't want that so ALWAYS specify + # a "-std=". + # For a C project, you would set this to something like 'c99' instead of + # 'c++11'. + '-std=c++11', + # ...and the same thing goes for the magic -x option which specifies the + # language that the files to be compiled are written in. This is mostly + # relevant for c++ headers. + # For a C project, you would set this to 'c' instead of 'c++'. + '-x', + 'c++', + # Set for Kaldi project where all sits side by side in src + '-I', + '..', + # Set for Kaldi external dependencies sitting in ../../tools + '-isystem', + '../../tools/openfst/include', + '-isystem', + '../../tools/ATLAS/include', + '-I', + '.', +] + +# Set this to the absolute path to the folder (NOT the file!) containing the +# compile_commands.json file to use that instead of 'flags'. See here for +# more details: http://clang.llvm.org/docs/JSONCompilationDatabase.html +# +# Most projects will NOT need to set this to anything; you can just change the +# 'flags' list of compilation flags. Notice that YCM itself uses that approach. 
+compilation_database_folder = ''
+
+if compilation_database_folder:
+    database = ycm_core.CompilationDatabase(compilation_database_folder)
+else:
+    database = None
+
+
+def DirectoryOfThisScript():
+    return os.path.dirname(os.path.abspath(__file__))
+
+
+def MakeRelativePathsInFlagsAbsolute(flags, working_directory):
+    if not working_directory:
+        return flags
+    new_flags = []
+    make_next_absolute = False
+    path_flags = ['-isystem', '-I', '-iquote', '--sysroot=']
+    for flag in flags:
+        new_flag = flag
+
+        if make_next_absolute:
+            make_next_absolute = False
+            if not flag.startswith('/'):
+                new_flag = os.path.join(working_directory, flag)
+
+        for path_flag in path_flags:
+            if flag == path_flag:
+                make_next_absolute = True
+                break
+
+            if flag.startswith(path_flag):
+                path = flag[len(path_flag):]
+                new_flag = path_flag + os.path.join(working_directory, path)
+                break
+
+        if new_flag:
+            new_flags.append(new_flag)
+    return new_flags
+
+
+def FlagsForFile(filename):
+    if database:
+        # Bear in mind that compilation_info.compiler_flags_ does NOT return a
+        # python list, but a "list-like" StringVec object
+        compilation_info = database.GetCompilationInfoForFile(filename)
+        final_flags = PrepareClangFlags(
+            MakeRelativePathsInFlagsAbsolute(
+                compilation_info.compiler_flags_,
+                compilation_info.compiler_working_dir_),
+            filename)
+
+        # NOTE: This is just for YouCompleteMe; it's highly likely that your project
+        # does NOT need to remove the stdlib flag. DO NOT USE THIS IN YOUR
+        # ycm_extra_conf IF YOU'RE NOT 100% SURE YOU NEED IT.
+        try:
+            final_flags.remove('-stdlib=libc++')
+        except ValueError:
+            pass
+    else:
+        relative_to = DirectoryOfThisScript()
+        final_flags = MakeRelativePathsInFlagsAbsolute(flags, relative_to)
+
+    return {
+        'flags': final_flags,
+        'do_cache': True
+    }
diff --git a/src/vystadial-decoder/README.md b/src/vystadial-decoder/README.md
new file mode 100644
index 00000000000..43a45177cc1
--- /dev/null
+++ b/src/vystadial-decoder/README.md
@@ -0,0 +1,86 @@
+Intro
+-----
+This repository contains a first attempt to build an
+online Kaldi decoder that accepts raw audio packets.
+
+The decoder should have a simple interface,
+because the next step will be exposing the decoder's functionality to Python.
+
+Workflow of KALDI decoding in a few lines
+---------------------
+```cpp
+Classes:
+    OnlineFeInput
+    MfccOptions  // Usage: Mfcc mfcc(mfcc_opts);
+    Mfcc         // ?placeholder for mfcc features?
+
+    Mfcc mfcc(mfcc_opts);
+    FeInput fe_input(&au_src, &mfcc, ..)
+    OnlineCmnInput cmn_input(&fe_input, ..);
+    feat_transform = new OnlineLdaInput(&cmn_input, ..)
+    OnlineDecodableDiagGmmScaled decodable(feat_transform, ..)
+    while (1) {
+        OnlineFasterDecoder::DecodeState dstate = decoder.Decode(&decodable);
+        // different stuff for the online decoder: decoder.FinishTraceBack(&out_fst);
+```
+
+
+Classes for the online decoder
+--------------------------
+```cpp
+// in online/online-audio-source.h
+class OnlineVectorSource
+    int32 Read(VectorBase<BaseFloat> *data, uint32 *timeout = 0);
+
+// in online/online-decodable.h
+// A decodable, taking input from an OnlineFeatureInput object on-demand
+class OnlineDecodableDiagGmmScaled : public DecodableInterface
+    virtual BaseFloat LogLikelihood(int32 frame, int32 index);
+    virtual bool IsLastFrame(int32 frame);
+    virtual int32 NumIndices()  /// Indices are one-based!  This is for compatibility with OpenFst.
+
+// in online/online-feat-input.h
+class OnlineFeatInputItf
+    virtual bool Compute(Matrix<BaseFloat> *output, uint32 *timeout) = 0;
+
+// in online/online-feat-input.h
+class OnlineCmnInput
+    ApplyCmvn
+
+// in online/online-faster-decoder.h
+struct OnlineFasterDecoderOpts : public FasterDecoderOptions
+    void Register(ParseOptions *po, bool full)
+
+
+// in online/online-faster-decoder.h
+class OnlineFasterDecoder : public FasterDecoder
+    // Codes returned by Decode() to show the current state of the decoder
+    enum DecodeState {
+        kEndFeats = 1,  // No more scores are available from the Decodable
+        kEndUtt = 2,  // End of utterance, caused by e.g. a sufficiently long silence
+        kEndBatch = 4  // End of batch - end of utterance not reached yet
+    };
+    DecodeState Decode(DecodableInterface *decodable);
+
+    // Makes a linear graph, by tracing back from the last "immortal" token
+    // to the previous one
+    bool PartialTraceback(fst::MutableFst<LatticeArc> *out_fst);
+
+    // Makes a linear graph, by tracing back from the best currently active token
+    // to the last immortal token. This method is meant to be invoked at the end
+    // of an utterance in order to get the last chunk of the hypothesis
+    void FinishTraceBack(fst::MutableFst<LatticeArc> *fst_out);
+
+    // Returns "true" if the best current hypothesis ends with long enough silence
+    bool EndOfUtterance();
+
+    int32 frame() { return frame_; }
+
+```
+
+
+Reading the wav source
+----------------------
+```cpp
+    SequentialTableReader<WaveHolder> reader(std::string wav_rspecifier);
+```
diff --git a/tools/.gitignore b/tools/.gitignore
new file mode 100644
index 00000000000..2e3c19f3217
--- /dev/null
+++ b/tools/.gitignore
@@ -0,0 +1,7 @@
+ATLAS
+OpenBLAS
+openfst
+openfst-1.3.2
+portaudio
+sctk-2.4.0
+sph2pipe_v2.5
diff --git a/tools/Makefile b/tools/Makefile
index e2c57027d12..5c37e7bfd0c 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -47,7 +47,7 @@ openfst/lib:
 
 openfst-1.3.2/Makefile: openfst-1.3.2/.patched
 	cd openfst-1.3.2/; \
-	./configure --prefix=`pwd` --enable-static --disable-shared --enable-far --enable-ngram-fsts
+	./configure --prefix=`pwd` --enable-static --enable-shared --enable-far --enable-ngram-fsts
 
 .PHONY: openfst-1.3.2/.patched
diff --git a/tools/extras/install_portaudio.sh b/tools/extras/install_portaudio.sh
index b365af33d2f..07228025ac1 100755
--- a/tools/extras/install_portaudio.sh
+++ b/tools/extras/install_portaudio.sh
@@ -89,7 +89,7 @@ if [ -z "$MACOS" ]; then
     echo "${pa_patch}" | patch -p0 Makefile.in
 fi
 
-./configure --prefix=`pwd`/install
+./configure --prefix=`pwd`/install --with-pic
 
 if [ "$MACOS" != "" ]; then