Commit

Merge branch 'master' into patch-1
danpovey committed Nov 13, 2023
2 parents 8c3c0bc + 21ae411 commit 3675219
Showing 122 changed files with 2,835 additions and 349 deletions.
2 changes: 1 addition & 1 deletion cmake/gen_cmake_skeleton.py
@@ -269,7 +269,7 @@ def gen_code(self):

if len(self.depends) > 0:
ret.append("target_link_libraries(" + self.target_name + " PUBLIC")
-for d in self.depends:
+for d in self.depends + ['-lcblas', '-llapack']:
ret.append(" " + d)
ret.append(")\n")

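This change makes every generated target link against the system CBLAS and LAPACK, so both libraries must be visible to the linker at build time. A quick sanity check on a Debian/Ubuntu host (a sketch; package names vary by distribution):

ldconfig -p | grep -E 'libcblas|liblapack'   # both should be listed
# if missing, e.g.: sudo apt-get install libatlas-base-dev liblapack-dev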
46 changes: 46 additions & 0 deletions docker/ubuntu22.04-cuda12.2.0/Dockerfile
@@ -0,0 +1,46 @@
FROM nvidia/cuda:12.2.0-devel-ubuntu22.04
LABEL maintainer="williamhilton.works@gmail.com"

RUN apt-get update && \
apt-get install -y --no-install-recommends \
build-essential \
g++ \
make \
automake \
bzip2 \
unzip \
wget \
sox \
libtool \
git \
subversion \
python2.7 \
python3 \
zlib1g-dev \
ca-certificates \
gfortran \
patch \
ffmpeg \
vim && \
apt-get update && \
apt-get install -y --no-install-recommends \
software-properties-common && \
apt-add-repository multiverse && \
apt-get update && \
yes | DEBIAN_FRONTEND=noninteractive apt-get install -yqq --no-install-recommends \
intel-mkl && \
rm -rf /var/lib/apt/lists/*

RUN ln -s /usr/bin/python2.7 /usr/bin/python

RUN git clone --depth 1 https://github.com/kaldi-asr/kaldi.git /opt/kaldi && \
cd /opt/kaldi/tools && \
make -j $(nproc) && \
cd /opt/kaldi/src && \
./configure --shared --use-cuda && \
make depend -j $(nproc) && \
make -j $(nproc) && \
find /opt/kaldi -type f \( -name "*.o" -o -name "*.la" -o -name "*.a" \) -exec rm {} \; && \
rm -rf /opt/kaldi/.git

WORKDIR /opt/kaldi/
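A minimal sketch for building and running this image (tag name assumed; --gpus requires the NVIDIA Container Toolkit):

docker build -t kaldi:cuda12.2.0 docker/ubuntu22.04-cuda12.2.0/
docker run --rm --gpus all -it kaldi:cuda12.2.0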
2 changes: 1 addition & 1 deletion egs/ami/s5/run_ihm.sh
@@ -17,7 +17,7 @@ set -euxo pipefail
# Path where AMI gets downloaded (or where locally available):
AMI_DIR=$PWD/wav_db # Default,
case $(hostname -d) in
-fit.vutbr.cz) AMI_DIR=/mnt/matylda5/iveselyk/KALDI_AMI_WAV ;; # BUT,
+fit.vutbr.cz) AMI_DIR=/mnt/matylda2/data/AMI_KALDI_DOWNLOAD ;; # BUT,
clsp.jhu.edu) AMI_DIR=/export/corpora4/ami/amicorpus ;; # JHU,
cstr.ed.ac.uk) AMI_DIR= ;; # Edinburgh,
esac
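The case statement keys on the host's DNS domain (hostname -d), so another site can pin its own local copy of the corpus by adding a branch; a sketch with a hypothetical domain and path:

case $(hostname -d) in
  example.edu) AMI_DIR=/data/corpora/amicorpus ;;  # hypothetical site
esac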
2 changes: 1 addition & 1 deletion egs/ami/s5/run_mdm.sh
@@ -10,7 +10,7 @@ mic=mdm$nmics
# Path where AMI gets downloaded (or where locally available):
AMI_DIR=$PWD/wav_db # Default,
case $(hostname -d) in
-fit.vutbr.cz) AMI_DIR=/mnt/matylda5/iveselyk/KALDI_AMI_WAV ;; # BUT,
+fit.vutbr.cz) AMI_DIR=/mnt/matylda2/data/AMI_KALDI_DOWNLOAD ;; # BUT,
clsp.jhu.edu) AMI_DIR=/export/corpora4/ami/amicorpus ;; # JHU,
cstr.ed.ac.uk) AMI_DIR= ;; # Edinburgh,
esac
2 changes: 1 addition & 1 deletion egs/ami/s5/run_sdm.sh
@@ -17,7 +17,7 @@ set -euxo pipefail
# Path where AMI gets downloaded (or where locally available):
AMI_DIR=$PWD/wav_db # Default,
case $(hostname -d) in
-fit.vutbr.cz) AMI_DIR=/mnt/matylda5/iveselyk/KALDI_AMI_WAV ;; # BUT,
+fit.vutbr.cz) AMI_DIR=/mnt/matylda2/data/AMI_KALDI_DOWNLOAD ;; # BUT,
clsp.jhu.edu) AMI_DIR=/export/corpora4/ami/amicorpus ;; # JHU,
cstr.ed.ac.uk) AMI_DIR= ;; # Edinburgh,
esac
2 changes: 1 addition & 1 deletion egs/ami/s5b/cmd.sh
@@ -15,7 +15,7 @@ export decode_cmd="queue.pl --mem 2G"
# the use of cuda_cmd is deprecated, used only in 'nnet1',
export cuda_cmd="queue.pl --gpu 1 --mem 20G"

-if [[ "$(hostname -f)" == "*.fit.vutbr.cz" ]]; then
+if [[ "$(hostname -d)" == "fit.vutbr.cz" ]]; then
queue_conf=$HOME/queue_conf/default.conf # see example /homes/kazi/iveselyk/queue_conf/default.conf,
export train_cmd="queue.pl --config $queue_conf --mem 2G --matylda 0.2"
export decode_cmd="queue.pl --config $queue_conf --mem 3G --matylda 0.1"
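This fix matters because inside [[ ... ]] a quoted right-hand side is compared literally rather than as a glob, so the old test against "*.fit.vutbr.cz" could never match a real FQDN. Comparing hostname -d (the domain part) with the exact string avoids pattern matching altogether. A small illustration:

host=server1.fit.vutbr.cz
[[ "$host" == "*.fit.vutbr.cz" ]] && echo match    # no output: quoted pattern is literal
[[ "$host" == *.fit.vutbr.cz ]] && echo match      # prints match: unquoted glob
[[ "${host#*.}" == "fit.vutbr.cz" ]] && echo match # prints match: compares the domain part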
50 changes: 50 additions & 0 deletions egs/ami/s5b/conf/ami_beamformit.cfg
@@ -0,0 +1,50 @@
#BeamformIt sample configuration file for AMI data (http://groups.inf.ed.ac.uk/ami/download/)

# scrolling size to compute the delays
scroll_size = 250

# cross correlation computation window size
window_size = 500

# maximum number of cross-correlation peaks taken into account
nbest_amount = 4

# flag whether to apply automatic noise thresholding
do_noise_threshold = 1

# percentage of frames with the lowest xcorr treated as noisy
noise_percent = 10

######## acoustic modelling parameters

#transition probabilities weight for multichannel decoding
trans_weight_multi = 25
trans_weight_nbest = 25

###

# flag whether to print the features after setting them, or not
print_features = 1

# flag whether to use the bad frames in the sum process
do_avoid_bad_frames = 1

#flag to use the best channel (SNR) as a reference
#defined from command line
do_compute_reference = 1

# flag whether to use a UEM file or not (otherwise process the whole file)
do_use_uem_file = 0

# flag whether to use an adaptive weights scheme or fixed weights
do_adapt_weights = 1

# flag whether to output the sph files or just run the system to create the auxiliary files
do_write_sph_files = 1

####directories where to store/retrieve info####
#channels_file = ./cfg-files/channels

# the show id normally needs to be passed as an argument; a default is given here just in case
#show_id = Ttmp

2 changes: 1 addition & 1 deletion egs/ami/s5b/run.sh
@@ -28,7 +28,7 @@ set -euo pipefail
# Path where AMI gets downloaded (or where locally available):
AMI_DIR=$PWD/wav_db # Default,
case $(hostname -d) in
-fit.vutbr.cz) AMI_DIR=/mnt/matylda5/iveselyk/KALDI_AMI_WAV ;; # BUT,
+fit.vutbr.cz) AMI_DIR=/mnt/matylda2/data/AMI_KALDI_DOWNLOAD ;; # BUT,
clsp.jhu.edu) AMI_DIR=/export/corpora4/ami/amicorpus ;; # JHU,
cstr.ed.ac.uk) AMI_DIR= ;; # Edinburgh,
esac
8 changes: 4 additions & 4 deletions egs/ami/s5c/run.sh
@@ -3,7 +3,7 @@
# Apache 2.0.
#
# This recipe performs diarization for the mix-headset data in the
-# AMI dataset. The x-vector extractor we use is trained on VoxCeleb v2
+# AMI dataset. The x-vector extractor we use is trained on VoxCeleb v2
# corpus with simulated RIRs. We use oracle SAD in this recipe.
# This recipe demonstrates the following:
# 1. Diarization using x-vector and clustering (AHC, VBx, spectral)
@@ -38,7 +38,7 @@ diarizer_type=spectral # must be one of (ahc, spectral, vbx)
# Path where AMI gets downloaded (or where locally available):
AMI_DIR=$PWD/wav_db # Default,
case $(hostname -d) in
-fit.vutbr.cz) AMI_DIR=/mnt/matylda5/iveselyk/KALDI_AMI_WAV ;; # BUT,
+fit.vutbr.cz) AMI_DIR=/mnt/matylda2/data/AMI_KALDI_DOWNLOAD ;; # BUT,
clsp.jhu.edu) AMI_DIR=/export/corpora5/amicorpus ;; # JHU,
cstr.ed.ac.uk) AMI_DIR= ;; # Edinburgh,
esac
@@ -57,7 +57,7 @@ if [ $stage -le 1 ]; then
local/ami_download.sh $mic $AMI_DIR
fi

-# Prepare data directories.
+# Prepare data directories.
if [ $stage -le 2 ]; then
# Download the data split and references from BUT's AMI setup
if ! [ -d AMI-diarization-setup ]; then
@@ -120,7 +120,7 @@ if [ $stage -le 6 ]; then
transform-vec $model_dir/xvectors_plda_train/transform.mat ark:- ark:- |\
ivector-normalize-length ark:- ark:- |" \
$model_dir/xvectors_plda_train/plda || exit 1;

cp $model_dir/xvectors_plda_train/plda $model_dir/
cp $model_dir/xvectors_plda_train/transform.mat $model_dir/
cp $model_dir/xvectors_plda_train/mean.vec $model_dir/
4 changes: 4 additions & 0 deletions egs/gop_speechocean762/s5/local/visualize_feats.py
@@ -8,6 +8,7 @@
import random
import kaldi_io
import seaborn as sns
+import numpy as np
from collections import Counter
from sklearn.manifold import TSNE
from utils import load_human_scores, load_phone_symbol_table
@@ -62,6 +63,9 @@ def main():
min(args.samples, len(lables)))
features, lables = list(zip(*sampled_paris))

+# Convert the tuple of arrays to a single 2D array
+features = np.vstack(features)

# Draw scatters
label_counter = Counter(lables)
colors = sns.color_palette("colorblind", len(label_counter))
2 changes: 1 addition & 1 deletion egs/wsj/s5/utils/fix_data_dir.sh
@@ -54,7 +54,7 @@ function check_sorted {
}

for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp vad.scp \
-reco2file_and_channel spk2gender utt2lang utt2uniq utt2dur reco2dur utt2num_frames; do
+reco2file_and_channel spk2gender utt2lang utt2uniq utt2dur reco2dur utt2num_frames $utt_extra_files $spk_extra_files; do
if [ -f $data/$x ]; then
cp $data/$x $data/.backup/$x
check_sorted $data/$x
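With this change, any files named in the script's --utt-extra-files and --spk-extra-files options are backed up and sort-checked alongside the standard files. A sketch of an invocation (the extra file name is assumed):

utils/fix_data_dir.sh --utt-extra-files "text.orig" data/train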
11 changes: 11 additions & 0 deletions egs/xbmu_amdo31/README.txt
@@ -0,0 +1,11 @@
About the XBMU-AMDO31 corpus

XBMU-AMDO31 is an open-source Amdo Tibetan speech corpus published by Northwest Minzu University.

The XBMU-AMDO31 dataset is a speech recognition corpus of the Amdo dialect of Tibetan. The open-source corpus contains 31 hours of speech data and resources for building speech recognition systems, including transcribed texts and a Tibetan pronunciation lexicon. (The lexicon is a Lhasa-dialect Tibetan lexicon, reused for the Amdo dialect because of the uniformity of written Tibetan.) The dataset can be used to train models for Amdo Tibetan automatic speech recognition (ASR).

The database can be downloaded from OpenSLR:
http://www.openslr.org/133/

For more details, please visit:
https://huggingface.co/datasets/syzym/xbmu_amdo31

This recipe trains several different ASR models on XBMU-AMDO31.
8 changes: 8 additions & 0 deletions egs/xbmu_amdo31/s5/RESULTS
@@ -0,0 +1,8 @@
%WER 46.16 [ 15522 / 33628, 380 ins, 2208 del, 12934 sub ] exp/mono/decode_test/wer_10_0.0
%WER 24.60 [ 8274 / 33628, 330 ins, 860 del, 7084 sub ] exp/tri1/decode_test/wer_13_0.0
%WER 24.42 [ 8213 / 33628, 323 ins, 847 del, 7043 sub ] exp/tri2/decode_test/wer_13_0.0
%WER 22.93 [ 7712 / 33628, 336 ins, 814 del, 6562 sub ] exp/tri3a/decode_test/wer_12_0.0
%WER 20.17 [ 6783 / 33628, 275 ins, 764 del, 5744 sub ] exp/tri4a/decode_test/wer_15_0.0
%WER 19.03 [ 6400 / 33628, 292 ins, 667 del, 5441 sub ] exp/tri5a/decode_test/wer_14_0.0
%WER 15.45 [ 5196 / 33628, 229 ins, 646 del, 4321 sub ] exp/nnet3/tdnn_sp/decode_test/wer_16_0.0
%WER 15.57 [ 5235 / 33628, 244 ins, 575 del, 4416 sub ] exp/chain/tdnn_1a_sp/decode_test/wer_11_0.0
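Each line follows Kaldi's scoring convention: %WER = 100 * (ins + del + sub) / N, where N is the number of reference words. For the mono system, for example, 100 * (380 + 2208 + 12934) / 33628 = 100 * 15522 / 33628 ≈ 46.16.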
15 changes: 15 additions & 0 deletions egs/xbmu_amdo31/s5/cmd.sh
@@ -0,0 +1,15 @@
# you can change cmd.sh depending on what type of queue you are using.
# If you have no queueing system and want to run on a local machine, you
# can change all instances of 'queue.pl' to 'run.pl' (but be careful and run
# commands one by one: most recipes will exhaust the memory on your
# machine). queue.pl works with GridEngine (qsub). slurm.pl works
# with slurm. Different queues are configured differently, with different
# queue names and different ways of specifying things like memory;
# to account for these differences you can create and edit the file
# conf/queue.conf to match your queue's configuration. Search for
# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.

export train_cmd="queue.pl --mem 2G"
export decode_cmd="queue.pl --mem 4G"
export mkgraph_cmd="queue.pl --mem 8G"
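On a single machine without a queueing system, the comments above suggest switching to run.pl; a minimal local setup would be:

export train_cmd=run.pl
export decode_cmd=run.pl
export mkgraph_cmd=run.pl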
5 changes: 5 additions & 0 deletions egs/xbmu_amdo31/s5/conf/decode.config
@@ -0,0 +1,5 @@
beam=11.0 # beam for decoding. Was 13.0 in the scripts.
first_beam=8.0 # beam for 1st-pass decoding in SAT.



2 changes: 2 additions & 0 deletions egs/xbmu_amdo31/s5/conf/mfcc.conf
@@ -0,0 +1,2 @@
--use-energy=false # only non-default option.
--sample-frequency=16000
10 changes: 10 additions & 0 deletions egs/xbmu_amdo31/s5/conf/mfcc_hires.conf
@@ -0,0 +1,10 @@
# config for high-resolution MFCC features, intended for neural network training.
# Note: we keep all cepstra, so it has the same info as filterbank features,
# but MFCC is more easily compressible (because less correlated) which is why
# we prefer this method.
--use-energy=false # use average of log energy, not energy.
--sample-frequency=16000 # the corpus is sampled at 16kHz
--num-mel-bins=40 # similar to Google's setup.
--num-ceps=40 # there is no dimensionality reduction.
--low-freq=40 # low cutoff frequency for mel bins
--high-freq=-200 # high cutoff frequency, relative to Nyquist of 8000 (=7800)
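To extract features with this configuration, a typical invocation of Kaldi's standard script (data and output directories assumed) looks like:

steps/make_mfcc.sh --nj 10 --mfcc-config conf/mfcc_hires.conf \
  data/train_hires exp/make_mfcc/train_hires mfcc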
1 change: 1 addition & 0 deletions egs/xbmu_amdo31/s5/conf/online_cmvn.conf
@@ -0,0 +1 @@
# configuration file for apply-cmvn-online, used when invoking online2-wav-nnet3-latgen-faster.
4 changes: 4 additions & 0 deletions egs/xbmu_amdo31/s5/conf/online_pitch.conf
@@ -0,0 +1,4 @@
--sample-frequency=16000
--simulate-first-pass-online=true
--normalization-right-context=25
--frames-per-chunk=10
1 change: 1 addition & 0 deletions egs/xbmu_amdo31/s5/conf/pitch.conf
@@ -0,0 +1 @@
--sample-frequency=16000
1 change: 1 addition & 0 deletions egs/xbmu_amdo31/s5/local/chain/run_tdnn.sh
