[egs] Add recipe for Mandarin BN/BC data, for speech translation project. (#3866)
jinyiyang-jhu authored and danpovey committed Jan 22, 2020
1 parent 33255ed commit 1a46d94
Showing 54 changed files with 8,743 additions and 0 deletions.
34 changes: 34 additions & 0 deletions egs/mandarin_bn_bc/s5/README
@@ -0,0 +1,34 @@
This recipe contains the following corpora from LDC:

Audio:
Gale phase 2/3/4
LDC2013S08
LDC2013S04
LDC2014S09
LDC2015S06
LDC2015S13
LDC2016S03
LDC2017S25

TDT 2/3/4
LDC2001S93
LDC2001S95
LDC2005S11

Text:
Gale phase 2/3/4
LDC2013T20
LDC2013T08
LDC2014T28
LDC2015T09
LDC2015T25
LDC2016T12
LDC2017T18

TDT 2/3/4
LDC2001T57
LDC2001T58
LDC2005T16
In addition, the recipe uses the Chinese Gigaword corpus (simplified Mandarin) for LM training and for expanding the dictionary:
Gigaword (xin: simplified, cna: traditional; only xin is used)
LDC2003T09
15 changes: 15 additions & 0 deletions egs/mandarin_bn_bc/s5/RESULTS
@@ -0,0 +1,15 @@
# In the results below, "large_test" is the pruned 4-gram LM, which is used for
# lattice generation.

# Results with nnet3 tdnn+chain model
# local/chain/run_tdnn.sh
# (4-epoch training on speed-perturbed and volume-perturbed "cleaned" data, with a left-biphone model)
# num_params=20.7 M
%CER 8.83 [ 7901 / 89515, 929 ins, 1738 del, 5234 sub ] exp/chain_cleanup/tdnn_1d_sp/decode_dev_large_test/cer_9_0.0
%CER 9.03 [ 17749 / 196659, 3770 ins, 3988 del, 9991 sub ] exp/chain_cleanup/tdnn_1d_sp/decode_eval_large_test/cer_10_0.0

# Results with RNNLM rescoring of tdnn+chain model
%CER 8.49 [ 7600 / 89515, 863 ins, 1805 del, 4932 sub ] exp/chain_cleanup/tdnn_1d_sp/decode_dev_large_test_rnnlm_1a_nbest_rescore/cer_8_0.0
%CER 8.47 [ 7585 / 89515, 783 ins, 2027 del, 4775 sub ] exp/chain_cleanup/tdnn_1d_sp/decode_dev_large_test_rnnlm_1a_rescore/cer_9_0.0
%CER 8.82 [ 17342 / 196659, 3891 ins, 3809 del, 9642 sub ] exp/chain_cleanup/tdnn_1d_sp/decode_eval_large_test_rnnlm_1a_nbest_rescore/cer_8_0.0
%CER 8.72 [ 17142 / 196659, 3876 ins, 3766 del, 9500 sub ] exp/chain_cleanup/tdnn_1d_sp/decode_eval_large_test_rnnlm_1a_rescore/cer_9_0.0
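Each %CER line reports total character errors over total reference characters, broken down into insertions, deletions, and substitutions. A minimal sketch that reproduces the headline percentage from those counts (the helper name is illustrative, not part of the recipe):

```python
def cer(ins, dele, sub, total_ref_chars):
    """Character error rate in percent: (insertions + deletions + substitutions) / reference length."""
    return 100.0 * (ins + dele + sub) / total_ref_chars

# First dev-set line above: 929 ins + 1738 del + 5234 sub = 7901 errors over 89515 chars.
print(round(cer(929, 1738, 5234, 89515), 2))  # 8.83
```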
18 changes: 18 additions & 0 deletions egs/mandarin_bn_bc/s5/cmd.sh
@@ -0,0 +1,18 @@
# you can change cmd.sh depending on what type of queue you are using.
# If you have no queueing system and want to run on a local machine, you
# can change all instances of 'queue.pl' to 'run.pl' (but be careful, and run
# commands one by one: most recipes will exhaust the memory on your
# machine). queue.pl works with GridEngine (qsub); slurm.pl works
# with slurm. Different queues are configured differently, with different
# queue names and different ways of specifying things like memory;
# to account for these differences you can create and edit the file
# conf/queue.conf to match your queue's configuration. Search for
# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.

export train_cmd="queue.pl --mem 2G --config conf/queue.conf --allow-a09 false"
export decode_cmd="queue.pl --mem 4G --config conf/queue.conf --allow-a09 false"
export mkgraph_cmd="queue.pl --mem 8G --config conf/queue.conf --allow-a09 false"
# the use of cuda_cmd is deprecated, but it's still used in this example
# directory.
export cuda_cmd="queue.pl --gpu 1 --config conf/queue.conf"
39 changes: 39 additions & 0 deletions egs/mandarin_bn_bc/s5/conf/cmu2pinyin
@@ -0,0 +1,39 @@
AA A
AE A
AH A
AO UO
AW U
AY AI
B B
CH CH
D D
DH S I
EH AI
ER E
EY AI
F F
G G
HH H
IH I
IY I
JH ZH
K K
L L
M M
N N
NG N
OW UO
OY UO
P P
R R
S S
SH SH
T T
TH S
UH U
UW U
V W
W W
Y Y
Z Z
ZH X
Empty file.
2 changes: 2 additions & 0 deletions egs/mandarin_bn_bc/s5/conf/decode_dnn.config
@@ -0,0 +1,2 @@
beam=18.0 # beam for decoding. Was 13.0 in the scripts.
lattice_beam=10.0 # this has most effect on size of the lattices.
3 changes: 3 additions & 0 deletions egs/mandarin_bn_bc/s5/conf/fbank.conf
@@ -0,0 +1,3 @@
# No non-default options for now.
--sample-frequency=16000
--num-mel-bins=30
1 change: 1 addition & 0 deletions egs/mandarin_bn_bc/s5/conf/mfcc.conf
@@ -0,0 +1 @@
--use-energy=false # only non-default option.
10 changes: 10 additions & 0 deletions egs/mandarin_bn_bc/s5/conf/mfcc_hires.conf
@@ -0,0 +1,10 @@
# config for high-resolution MFCC features, intended for neural network training.
# Note: we keep all cepstra, so it has the same info as filterbank features,
# but MFCC is more easily compressible (because less correlated) which is why
# we prefer this method.
--use-energy=false # use average of log energy, not energy.
--sample-frequency=16000 # the BN/BC data is sampled at 16kHz
--num-mel-bins=40 # similar to Google's setup.
--num-ceps=40 # there is no dimensionality reduction.
--low-freq=40 # low cutoff frequency for mel bins
--high-freq=-400 # high cutoff frequency, relative to the Nyquist of 8000 (=7600)
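In Kaldi's feature configs, a non-positive `--high-freq` is interpreted as an offset below the Nyquist frequency. A small sketch of the effective mel-bin range for this config (the helper name is illustrative):

```python
def mel_cutoffs(sample_freq, low_freq, high_freq):
    """Effective mel filterbank range; a non-positive high_freq is an offset from Nyquist."""
    nyquist = sample_freq / 2.0
    hi = nyquist + high_freq if high_freq <= 0 else high_freq
    return low_freq, hi

# This config: 16 kHz audio, --low-freq=40, --high-freq=-400.
print(mel_cutoffs(16000, 40, -400))  # (40, 7600.0)
```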
1 change: 1 addition & 0 deletions egs/mandarin_bn_bc/s5/conf/online_cmvn.conf
@@ -0,0 +1 @@
# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh
1 change: 1 addition & 0 deletions egs/mandarin_bn_bc/s5/conf/online_pitch.conf
@@ -0,0 +1 @@
--sample-frequency=16000
58 changes: 58 additions & 0 deletions egs/mandarin_bn_bc/s5/conf/pinyin2cmu
@@ -0,0 +1,58 @@
A AA
AI AY
AN AE N
ANG AE NG
AO AW
B B
CH CH
C T S
D D
E ER
EI EY
EN AH N
ENG AH NG
ER AA R
F F
G G
H HH
IA IY AA
IANG IY AE NG
IAN IY AE N
IAO IY AW
IE IY EH
I IY
ING IY NG
IN IY N
IONG IY UH NG
IU IY UH
J J
K K
L L
M M
N N
O AO
ONG UH NG
OU OW
P P
Q Q
R R
SH SH
S S
T T
UAI UW AY
UANG UW AE NG
UAN UW AE N
UA UW AA
UI UW IY
UN UW AH N
UO UW AO
U UW
UE IY EH
VE IY EH
V IY UW
VN IY N
W W
X X
Y Y
ZH JH
Z Z
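The cmu2pinyin and pinyin2cmu tables above define per-unit mappings between pinyin initials/finals and CMU phones. A minimal sketch of how such a table might be applied to a syllable with greedy longest-prefix matching (the helper and the excerpted dictionary are illustrative, not code from the recipe):

```python
# A few entries excerpted from the pinyin2cmu table above.
PINYIN2CMU = {
    "ZH": "JH", "ONG": "UH NG", "SH": "SH", "ANG": "AE NG",
    "H": "HH", "AI": "AY", "N": "N", "I": "IY",
}

def pinyin_to_cmu(syllable):
    """Greedily match the longest pinyin unit at each position."""
    phones = []
    i = 0
    while i < len(syllable):
        for length in range(len(syllable) - i, 0, -1):  # longest match first
            unit = syllable[i:i + length]
            if unit in PINYIN2CMU:
                phones.append(PINYIN2CMU[unit])
                i += length
                break
        else:
            raise ValueError(f"no mapping for {syllable[i:]}")
    return " ".join(phones)

print(pinyin_to_cmu("ZHONG"))  # JH UH NG
```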
1 change: 1 addition & 0 deletions egs/mandarin_bn_bc/s5/conf/pitch.conf
@@ -0,0 +1 @@
--sample-frequency=16000
13 changes: 13 additions & 0 deletions egs/mandarin_bn_bc/s5/conf/queue.conf
@@ -0,0 +1,13 @@
# Default configuration
command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
option mem=* -l mem_free=$0,ram_free=$0
option mem=0 # Do not add anything to qsub_opts
option num_threads=* -pe smp $0
option num_threads=1 # Do not add anything to qsub_opts
option max_jobs_run=* -tc $0
default gpu=0
option gpu=0 -q all.q
option gpu=* -l gpu=$0 -q g.q
default allow_a09=false
option allow_a09=true
option allow_a09=false -l 'hostname=!a09*&!a17*&!a13*&!a14*&!a10*&!a11*&!c16*'
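Each `option name=*` line above tells queue.pl how to translate a generic option such as `--mem 4G` into scheduler flags, with `$0` standing in for the option's value. A small sketch of that substitution (an illustration of the config semantics, not queue.pl's actual implementation):

```python
def expand_option(template, value):
    """Substitute the option value for $0, as queue.pl does for 'option name=*' lines."""
    return template.replace("$0", value)

# The line 'option mem=* -l mem_free=$0,ram_free=$0' applied to --mem 4G:
print(expand_option("-l mem_free=$0,ram_free=$0", "4G"))  # -l mem_free=4G,ram_free=4G
```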
84 changes: 84 additions & 0 deletions egs/mandarin_bn_bc/s5/local/chain/run_chain_common.sh
@@ -0,0 +1,84 @@
#!/bin/bash

# This script contains common stages shared across chain recipes (adapted from
# the LibriSpeech recipe). It generates a new topology in a new lang directory,
# gets the alignments as lattices, and builds a tree for the new topology.
set -e

stage=11

# input directory names. These options are actually compulsory, and they have
# been named for convenience
gmm_dir=
ali_dir=
ali_nj=
lores_train_data_dir=
lang_original=
num_leaves=6000

# output directory names. They are also compulsory.
lang=
lat_dir=
tree_dir=
# End configuration section.
echo "$0 $@" # Print the command line for logging

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

[ -z $lang ] && echo "Set --lang; this specifies the new lang directory which will have the new topology" && exit 1;
[ -z $lat_dir ] && echo "Set --lat-dir; this specifies the experiment directory to store lattices" && exit 1;
[ -z $tree_dir ] && echo "Set --tree-dir; this specifies the directory to store the new tree" && exit 1;

for f in $gmm_dir/final.mdl $ali_dir/ali.1.gz $lores_train_data_dir/feats.scp; do
[ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1
done

if [ $stage -le 11 ]; then
echo "$0: creating lang directory with one state per phone."
# Create a version of the lang/ directory that has one state per phone in the
# topo file. [note, it really has two states.. the first one is only repeated
# once, the second one has zero or more repeats.]
if [ -d $lang ]; then
if [ $lang/L.fst -nt $lang_original/L.fst ]; then
echo "$0: $lang already exists, not overwriting it; continuing"
else
echo "$0: $lang already exists and seems to be older than $lang_original..."
echo " ... not sure what to do. Exiting."
exit 1;
fi
else
cp -r $lang_original $lang
silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
# Use our special topology... note that we may have to tune this
# topology later on.
steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
fi
fi

if [ $stage -le 12 ]; then
# Get the alignments as lattices (gives the chain training more freedom).
# use the same num-jobs as the alignments
nj=$(cat ${ali_dir}/num_jobs) || exit 1;
steps/align_fmllr_lats.sh --nj $ali_nj --cmd "$train_cmd" ${lores_train_data_dir} \
$lang $gmm_dir $lat_dir
rm $lat_dir/fsts.*.gz # save space
fi

if [ $stage -le 13 ]; then
# Build a tree using our new topology. We know we have alignments for the
# speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use
# those.
if [ -f $tree_dir/final.mdl ]; then
echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it."
exit 1;
fi
steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
--context-opts "--context-width=2 --central-position=1" \
--cmd "$train_cmd" $num_leaves ${lores_train_data_dir} $lang $ali_dir $tree_dir
fi

exit 0;
1 change: 1 addition & 0 deletions egs/mandarin_bn_bc/s5/local/chain/run_tdnn.sh
