updating fisher_swbd nnet3/chain scripts: removing non-chain training… #2136

Merged
merged 2 commits into from Jan 10, 2018
6 changes: 5 additions & 1 deletion egs/fisher_swbd/s5/local/fisher_swbd_prepare_dict.sh
@@ -112,8 +112,12 @@ srcdict=$srcdir/swb_ms98_transcriptions/sw-ms98-dict.text
# assume swbd_p1_data_prep.sh was done already.
[ ! -f "$srcdict" ] && echo "No such file $srcdict" && exit 1;

rm $dir/lexicon0.txt 2>/dev/null
cp $srcdict $dir/lexicon0.txt || exit 1;
patch <local/dict.patch $dir/lexicon0.txt || exit 1;
chmod +w $srcdict $dir/lexicon0.txt

# Use absolute path in case patch reports the "Invalid file name" error (a bug with patch)
patch <local/dict.patch `pwd`/$dir/lexicon0.txt || exit 1;

#(2a) Dictionary preparation:
# Pre-processing (remove comments)
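The absolute-path call to patch above works around the "Invalid file name" error mentioned in the comment. As a purely illustrative hardening of that step (not part of this PR, and assuming GNU patch with its --dry-run option), the patch could be verified before the lexicon is modified:

# Hypothetical guard around the patch step; $dir is the dict directory as in the script.
abs_lex=`pwd`/$dir/lexicon0.txt
if patch --dry-run <local/dict.patch "$abs_lex" >/dev/null; then
  patch <local/dict.patch "$abs_lex" || exit 1;
else
  echo "$0: local/dict.patch does not apply cleanly to $abs_lex"; exit 1;
fi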
97 changes: 47 additions & 50 deletions egs/fisher_swbd/s5/local/nnet3/run_ivector_common.sh 100644 → 100755
@@ -4,7 +4,7 @@
set -e
stage=1
train_stage=-10
generate_alignments=true # false if doing chain training
generate_alignments=false # false if doing chain training
speed_perturb=true

. ./path.sh
@@ -16,60 +16,42 @@ if [ "$speed_perturb" == "true" ]; then
if [ $stage -le 1 ]; then
#Although the nnet will be trained by high resolution data, we still have to perturbe the normal data to get the alignment
# _sp stands for speed-perturbed

for datadir in train_nodup; do
utils/perturb_data_dir_speed.sh 0.9 data/${datadir} data/temp1
utils/perturb_data_dir_speed.sh 1.1 data/${datadir} data/temp2
utils/combine_data.sh data/${datadir}_tmp data/temp1 data/temp2
utils/validate_data_dir.sh --no-feats data/${datadir}_tmp
rm -r data/temp1 data/temp2

mfccdir=mfcc_perturbed
steps/make_mfcc.sh --cmd "$train_cmd" --nj 50 \
data/${datadir}_tmp exp/make_mfcc/${datadir}_tmp $mfccdir || exit 1;
steps/compute_cmvn_stats.sh data/${datadir}_tmp exp/make_mfcc/${datadir}_tmp $mfccdir || exit 1;
utils/fix_data_dir.sh data/${datadir}_tmp

utils/copy_data_dir.sh --spk-prefix sp1.0- --utt-prefix sp1.0- data/${datadir} data/temp0
utils/combine_data.sh data/${datadir}_sp data/${datadir}_tmp data/temp0
utils/fix_data_dir.sh data/${datadir}_sp
rm -r data/temp0 data/${datadir}_tmp
done
echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)"
utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp
echo "$0: making MFCC features for low-resolution speed-perturbed data"
steps/make_mfcc.sh --nj 70 --cmd "$train_cmd" \
data/${train_set}_sp || exit 1
steps/compute_cmvn_stats.sh data/${train_set}_sp || exit 1
utils/fix_data_dir.sh data/${train_set}_sp || exit 1
fi
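
The perturb_data_dir_speed_3way.sh wrapper used above replaces the hand-rolled perturbation in the deleted loop; a rough sketch of the equivalent manual steps, assuming the conventional 0.9 and 1.1 factors plus an unperturbed sp1.0 copy (the wrapper also takes care of utterance/speaker prefixes), would be:

# Sketch only: roughly what the 3-way speed perturbation amounts to.
for factor in 0.9 1.1; do
  utils/perturb_data_dir_speed.sh $factor data/${train_set} data/temp_sp${factor}
done
utils/copy_data_dir.sh --spk-prefix sp1.0- --utt-prefix sp1.0- data/${train_set} data/temp_sp1.0
utils/combine_data.sh data/${train_set}_sp data/temp_sp0.9 data/temp_sp1.1 data/temp_sp1.0
utils/fix_data_dir.sh data/${train_set}_sp
rm -r data/temp_sp0.9 data/temp_sp1.1 data/temp_sp1.0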

if [ $stage -le 2 ] && [ "$generate_alignments" == "true" ]; then
#obtain the alignment of the perturbed data
steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \
data/train_nodup_sp data/lang exp/tri5a exp/tri5a_ali_nodup_sp || exit 1
data/${train_set}_sp data/lang exp/tri5a exp/tri5a_ali_nodup_sp || exit 1
fi
train_set=train_nodup_sp
train_set=${train_set}_sp
fi

if [ $stage -le 3 ]; then
# Create high-resolution MFCC features (with 40 cepstra instead of 13).
# this shows how you can split across multiple file-systems.
echo "$0: creating high-resolution MFCC features"
mfccdir=mfcc_hires
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then
date=$(date +'%m_%d_%H_%M')
utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/mfcc/fisher_swbd-$date/s5b/$mfccdir/storage $mfccdir/storage
fi

# the 100k_nodup directory is copied seperately, as
# we want to use exp/tri1b_ali_100k_nodup for lda_mllt training
# we want to use exp/tri1b_ali_100k_nodup for ivector extractor training
# the main train directory might be speed_perturbed
for dataset in $train_set train_100k_nodup; do
utils/copy_data_dir.sh data/$dataset data/${dataset}_hires

# scale the waveforms, this is useful as we don't use CMVN
data_dir=data/${dataset}_hires
cat $data_dir/wav.scp | python -c "
import sys, os, subprocess, re, random
scale_low = 1.0/8
scale_high = 2.0
for line in sys.stdin.readlines():
if len(line.strip()) == 0:
continue
print '{0} sox --vol {1} -t wav - -t wav - |'.format(line.strip(), random.uniform(scale_low, scale_high))
"| sort -k1,1 -u > $data_dir/wav.scp_scaled || exit 1;
mv $data_dir/wav.scp_scaled $data_dir/wav.scp
# do volume-perturbation on the training data prior to extracting hires
# features; this helps make trained nnets more invariant to test data volume.
utils/data/perturb_data_dir_volume.sh data/${dataset}_hires

steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \
--cmd "$train_cmd" data/${dataset}_hires exp/make_hires/$dataset $mfccdir;
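
The perturb_data_dir_volume.sh call above does essentially what the deleted inline Python did: append a random sox volume factor to each wav.scp command. A condensed sketch of that effect (illustrative only; the wrapper also handles non-piped wav.scp entries) is:

# Sketch: scale each piped wav.scp entry by a random factor in [1/8, 2.0].
awk 'BEGIN{srand()} NF>0 {v=0.125+rand()*1.875;
  printf("%s sox --vol %.3f -t wav - -t wav - |\n", $0, v)}' \
  data/${dataset}_hires/wav.scp | sort -k1,1 -u > data/${dataset}_hires/wav.scp_scaled
mv data/${dataset}_hires/wav.scp_scaled data/${dataset}_hires/wav.scp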
@@ -95,46 +77,61 @@ for line in sys.stdin.readlines():
utils/data/remove_dup_utts.sh 200 data/${train_set}_30k_hires data/${train_set}_30k_nodup_hires # 33hr
fi
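
The _30k subset capped by remove_dup_utts.sh above comes from context that is collapsed in this diff; a typical construction (an assumption based on the usual recipe pattern, not something shown in this PR) would be:

# Assumed construction of the 30k subset later used for PCA/UBM training.
utils/subset_data_dir.sh data/${train_set}_hires 30000 data/${train_set}_30k_hires
utils/data/remove_dup_utts.sh 200 data/${train_set}_30k_hires data/${train_set}_30k_nodup_hires  # 33hr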

# ivector extractor training
if [ $stage -le 5 ]; then
# We need to build a small system just because we need the LDA+MLLT transform
# to train the diag-UBM on top of. We use --num-iters 13 because after we get
# the transform (12th iter is the last), any further training is pointless.
# this decision is based on fisher_english
steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \
echo "$0: computing a PCA transform from the hires data."
steps/online/nnet2/get_pca_transform.sh --cmd "$train_cmd" \
--splice-opts "--left-context=3 --right-context=3" \
5500 90000 data/train_100k_nodup_hires \
data/lang exp/tri1b_ali exp/nnet3/tri2b
--max-utts 10000 --subsample 2 \
data/${train_set}_30k_nodup_hires exp/nnet3/pca
fi
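
If the PCA step needs a sanity check, the estimated transform can be dumped as text with standard Kaldi tools; a small sketch, assuming the script writes its transform to exp/nnet3/pca/final.mat like other transform-estimation scripts:

# Sketch: print the transform in text form to verify its dimensions.
copy-matrix --binary=false exp/nnet3/pca/final.mat - | head -n 3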

if [ $stage -le 6 ]; then
# To train a diagonal UBM we don't need very much data, so use the smallest subset.
steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 200000 \
data/${train_set}_30k_nodup_hires 512 exp/nnet3/tri2b exp/nnet3/diag_ubm
echo "$0: training the diagonal UBM."
steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 200000 \
data/${train_set}_30k_nodup_hires 512 exp/nnet3/pca exp/nnet3/diag_ubm
fi

if [ $stage -le 7 ]; then
# iVector extractors can be sensitive to the amount of data, but this one has a
# fairly small dim (defaults to 100) so we don't use all of it, we use just the
# 100k subset (just under half the data).
echo "$0: training the iVector extractor"
steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \
data/train_100k_nodup_hires exp/nnet3/diag_ubm exp/nnet3/extractor || exit 1;
fi
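
The comment above notes that the iVector dimension defaults to 100; if a different dimension is wanted, it can be set at training time. A sketch, assuming --ivector-dim is the relevant option of this script:

# Sketch: setting the iVector dimension explicitly (100 is the default noted above).
steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \
  --ivector-dim 100 \
  data/train_100k_nodup_hires exp/nnet3/diag_ubm exp/nnet3/extractor || exit 1;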

if [ $stage -le 8 ]; then
# We extract iVectors on all the train_nodup data, which will be what we
# train the system on.
# We extract iVectors on the speed-perturbed training data after combining
# short segments, which will be what we train the system on. With
# --utts-per-spk-max 2, the script pairs the utterances into twos, and treats
# each of these pairs as one speaker; this gives more diversity in iVectors..
# Note that these are extracted 'online'.

# note, we don't encode the 'max2' in the name of the ivectordir even though
# that's the data we extract the ivectors from, as it's still going to be
# valid for the non-'max2' data, the utterance list is the same.

ivectordir=exp/nnet3/ivectors_${train_set}
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $ivectordir/storage ]; then
utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/ivectors/fisher_swbd-$(date +'%m_%d_%H_%M')/s5/$ivectordir/storage $ivectordir/storage
fi


# having a larger number of speakers is helpful for generalization, and to
# handle per-utterance decoding well (iVector starts at zero).
steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/${train_set}_hires data/${train_set}_max2_hires
temp_data_root=${ivectordir}
utils/data/modify_speaker_info.sh --utts-per-spk-max 2 \
data/${train_set}_hires ${temp_data_root}/${train_set}_hires_max2

steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \
data/${train_set}_max2_hires exp/nnet3/extractor exp/nnet3/ivectors_$train_set || exit 1;
${temp_data_root}/${train_set}_hires_max2 \
exp/nnet3/extractor $ivectordir

# Also extract iVectors for the test data
for data_set in eval2000 rt03; do
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \
data/${data_set}_hires exp/nnet3/extractor exp/nnet3/ivectors_$data_set || exit 1;
data/${data_set}_hires exp/nnet3/extractor exp/nnet3/ivectors_${data_set} || exit 1;
done
fi
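
Downstream, these iVector directories are handed to the nnet3/chain training and decoding scripts; as a sketch of how the test-set iVectors get used at decode time (the chain-model directory names are placeholders, and --online-ivector-dir is assumed from the standard nnet3 decode script):

# Sketch: decoding eval2000 with the iVectors extracted above;
# exp/chain/tdnn and its graph directory are placeholder names.
steps/nnet3/decode.sh --nj 30 --cmd "$decode_cmd" \
  --online-ivector-dir exp/nnet3/ivectors_eval2000 \
  exp/chain/tdnn/graph data/eval2000_hires exp/chain/tdnn/decode_eval2000 || exit 1;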

158 changes: 0 additions & 158 deletions egs/fisher_swbd/s5/local/nnet3/run_lstm.sh

This file was deleted.