Some work with CUDNN #2797

Open · wants to merge 27 commits into base: master

Commits (27)
2d76ce9
WIP: Add CUDNN as an optional dependency to cu-device.{cc,h}
galv Sep 8, 2018
0a00e73
Download CUDNN, get this to build on CLSP
galv Sep 9, 2018
224d4da
Initial draft of CUDNN 2d convolution implementation.
galv Sep 9, 2018
c9806e0
Minor
galv Sep 9, 2018
5b1855a
[egs] mini librispeech fix for CLSP.
galv Sep 15, 2018
8e00e4f
[src] Fix Singleton implementation of CuDevice.
galv Sep 15, 2018
5e4d270
Small CUDNN fixes
galv Sep 15, 2018
0387a0c
Fix implementation's height-width switching.
galv Sep 30, 2018
be6fc2a
Make CUDNN mandatory if building with CUDA.
galv Oct 8, 2018
e729495
Merge branch 'cudnnv7' of https://github.com/galv/kaldi into galv-cud…
danpovey Oct 13, 2018
5bb72e5
[src] Updates to docs in convolution branch
danpovey Oct 13, 2018
9647494
Automatically download CUDNN as part of configure.
galv Oct 13, 2018
6427e79
Merge pull request #52 from galv/cudnn
danpovey Oct 15, 2018
53d62af
[src,build] Get it to compile; some structural changes.
danpovey Oct 15, 2018
7000850
[src] Some refactoring; start adding tests.
danpovey Oct 16, 2018
b5d2022
[src] Fix some bugs, stuck again.
danpovey Oct 23, 2018
22669f6
Change filter type back to NCHW, since it supports more algos.
galv Oct 23, 2018
c958143
Workaround cudnnSetTensor4dDescriptor's striding bug.
galv Oct 24, 2018
e4d3383
Merge pull request #53 from galv/cudnn-povey
danpovey Oct 24, 2018
74114d0
[src] Fix various bugs.
danpovey Oct 25, 2018
0bed8aa
Don't use cudnnConvolutionBiasActivationForward.
galv Oct 26, 2018
c546716
Explain bias dimensions.
galv Oct 26, 2018
450f491
Make bias optional.
galv Oct 27, 2018
464db5c
Merge pull request #54 from galv/cudnn-povey-2
danpovey Oct 27, 2018
bfea6c8
[egs] Remove unnecessary alignment from mini_librispeech run.sh, than…
danpovey Oct 28, 2018
a2de7b9
[src] Small cosmetic changes
danpovey Oct 31, 2018
0457a61
Merge master into cudnn branch
danpovey Nov 20, 2018
1 change: 1 addition & 0 deletions .gitignore
@@ -144,6 +144,7 @@ GSYMS
/tools/mmseg-1.3.0.tar.gz
/tools/mmseg-1.3.0/
/kaldiwin_vs*
/tools/cudnn/
/tools/cub-1.8.0.zip
/tools/cub-1.8.0/
/tools/cub
5 changes: 1 addition & 4 deletions egs/mini_librispeech/s5/run.sh
@@ -48,7 +48,7 @@ if [ $stage -le 2 ]; then
# spread the mfccs over various machines, as this data-set is quite large.
if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then
mfcc=$(basename mfccdir) # in case was absolute pathname (unlikely), get basename.
-utils/create_split_dir.pl /export/b{07,14,16,17}/$USER/kaldi-data/egs/librispeech/s5/$mfcc/storage \
+utils/create_split_dir.pl /export/b{07,14,16,18}/$USER/kaldi-data/egs/librispeech/s5/$mfcc/storage \
$mfccdir/storage
fi

@@ -170,9 +170,6 @@ if [ $stage -le 7 ]; then

utils/build_const_arpa_lm.sh \
data/local/lm/lm_tglarge.arpa.gz data/lang data/lang_test_tglarge

-steps/align_fmllr.sh --nj 5 --cmd "$train_cmd" \
-  data/train_clean_5 data/lang exp/tri3b exp/tri3b_ali_train_clean_5
fi


52 changes: 48 additions & 4 deletions src/configure
@@ -67,6 +67,7 @@ Configuration options:
--shared Build and link against shared libraries [default=no]
--use-cuda Build with CUDA [default=yes]
--cudatk-dir=DIR CUDA toolkit directory
--cudnn-dir=DIR CUDNN installation directory
--cuda-arch=FLAGS Override the default CUDA_ARCH flags. See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#nvcc-examples.
--double-precision Build with BaseFloat set to double if yes [default=no],
mostly useful for testing purposes.
@@ -431,9 +432,6 @@ function configure_cuda {

if [ -z "$CUDA_ARCH" ]; then
case $CUDA_VERSION in
-5_5) CUDA_ARCH="-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35" ;;
-6_*) CUDA_ARCH="-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50" ;;
-7_*) CUDA_ARCH="-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_53,code=sm_53" ;;
8_*) CUDA_ARCH="-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_53,code=sm_53 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_62,code=sm_62" ;;
9_*) CUDA_ARCH="-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_53,code=sm_53 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_62,code=sm_62 -gencode arch=compute_70,code=sm_70" ;;
10_*) CUDA_ARCH="-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_53,code=sm_53 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_62,code=sm_62 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_72,code=sm_72 -gencode arch=compute_75,code=sm_75" ;;
@@ -450,6 +448,8 @@ function configure_cuda {
echo "CUDA_ARCH = $CUDA_ARCH" >> kaldi.mk
echo >> kaldi.mk

configure_cudnn

# 64bit/32bit? We do not support cross compilation with CUDA so, use direct calls to uname -m here
if [ "`uname -m`" == "x86_64" ]; then
if [ "`uname`" == "Darwin" ]; then
@@ -462,7 +462,7 @@
elif [ "`uname -m`" == "ppc64le" ]; then
cat makefiles/cuda_64bit.mk >> kaldi.mk
else
-cat makefiles/cuda_32bit.mk >> kaldi.mk
+echo "Unexpected architecture `uname -m`"; exit 1
fi

else
@@ -472,6 +472,47 @@ fi
fi
}

function configure_cudnn {
if [ -z "$CUDNNDIR" ]; then
download_appropriate_cudnn
fi

echo CUDNNDIR = $CUDNNDIR >> kaldi.mk
echo >> kaldi.mk

if [ ! -f $CUDNNDIR/lib64/libcudnn.so ] ||
   [ ! -f $CUDNNDIR/include/cudnn.h ]; then
  echo "CUDNNDIR(=$CUDNNDIR) invalid!"
  exit 1
fi
}

function download_appropriate_cudnn {
local tools=`rel2abs ../tools`
install_dir=$tools/cudnn
CUDNNDIR=$tools/cudnn/cuda

if [ -f $CUDNNDIR/include/cudnn.h ]; then
echo -n "CUDNN has been downloaded already. If you'd like to redownload it "
echo -n "(e.g., because you changed CUDA version), please delete $CUDNNDIR "
echo "and rerun configure"
return
fi

local cudnn_url
case $CUDA_VERSION in
8_0) cudnn_url="http://developer.download.nvidia.com/compute/redist/cudnn/v7.1.2/cudnn-8.0-linux-x64-v7.1.tgz" ;;
9_0) cudnn_url="http://developer.download.nvidia.com/compute/redist/cudnn/v7.3.1/cudnn-9.0-linux-x64-v7.3.1.20.tgz" ;;
9_1) cudnn_url="http://developer.download.nvidia.com/compute/redist/cudnn/v7.1.2/cudnn-9.1-linux-x64-v7.1.tgz" ;;
9_2) cudnn_url="http://developer.download.nvidia.com/compute/redist/cudnn/v7.2.1/cudnn-9.2-linux-x64-v7.2.1.38.tgz" ;;
10_0) cudnn_url="http://developer.download.nvidia.com/compute/redist/cudnn/v7.3.1/cudnn-10.0-linux-x64-v7.3.1.20.tgz" ;;
*) echo "No known CUDNN download for provided CUDA_VERSION. Try checking here to see if your CUDA version supports a reasonably new version of CUDNN: https://gitlab.com/nvidia/cuda/tree/centos7"; exit 1 ;;
esac

mkdir -p $install_dir
wget -T 10 -t 3 $cudnn_url -O $install_dir/cudnn.tgz
tar --no-same-owner -xzf $install_dir/cudnn.tgz -C $install_dir
}

function linux_configure_speex {
# Check whether the user has called tools/extras/install_speex.sh or not
[ ! -z "$SPEEXROOT" ] || SPEEXROOT=`pwd`/../tools/speex
@@ -989,6 +1030,9 @@ do
--cudatk-dir=*)
CUDATKDIR=`read_dirname $1`;
shift ;; #CUDA is used in src/cudamatrix and src/nnet{,bin} only
--cudnn-dir=*)
CUDNNDIR=`read_dirname $1`;
shift ;;
--cuda-arch=*)
CUDA_ARCH=`read_value $1`;
shift;;
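For context, the configure flow above resolves cuDNN in one of two ways: if --cudnn-dir is not given, download_appropriate_cudnn fetches a cuDNN build matching the detected CUDA_VERSION into tools/cudnn/cuda; otherwise the user-supplied directory is used, e.g. ./configure --shared --cudnn-dir=/opt/cudnn/cuda (the path here is illustrative only).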
9 changes: 9 additions & 0 deletions src/cudamatrix/cu-common.h
@@ -74,6 +74,15 @@
} \
}

#define CUDNN_SAFE_CALL(fun) \
do { \
cudnnStatus_t ret; \
if ((ret = (fun)) != CUDNN_STATUS_SUCCESS) { \
KALDI_ERR << "cudnnStatus_t " << ret << " : \"" << cudnnGetErrorString(ret) \
<< "\" returned from '" << #fun << "'"; \
} \
} while(0)


namespace kaldi {

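To show how the new macro is intended to be used (a minimal sketch, not code from this PR; the tensor dimensions are made-up values):

    // Each cuDNN call is wrapped so that any status other than
    // CUDNN_STATUS_SUCCESS aborts via KALDI_ERR with the string from
    // cudnnGetErrorString(), mirroring CUBLAS_SAFE_CALL/CUSPARSE_SAFE_CALL.
    cudnnTensorDescriptor_t desc;
    CUDNN_SAFE_CALL(cudnnCreateTensorDescriptor(&desc));
    CUDNN_SAFE_CALL(cudnnSetTensor4dDescriptor(
        desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT,
        /*n=*/1, /*c=*/3, /*h=*/8, /*w=*/8));
    CUDNN_SAFE_CALL(cudnnDestroyTensorDescriptor(desc));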
15 changes: 12 additions & 3 deletions src/cudamatrix/cu-device.cc
@@ -4,6 +4,7 @@
// 2013 Lucas Ondel
// 2013-2015 Johns Hopkins University (author: Daniel Povey)
// 2015 Guoguo Chen
// 2018 Daniel Galvez

// See ../../COPYING for clarification regarding multiple authors
//
@@ -92,7 +93,8 @@ void CuDevice::Initialize() {
//
// (2) in threads created by the user, as soon as someone calls something that
// might potentially use the GPU, via CuDevice()::Instantiate().
-// If device_id_ is >= 0, this will create the cuBLAS and cuSparse handles.
+// If device_id_ is >= 0, this will create the cuBLAS, cuSparse and cuDNN
+// handles.
KALDI_ASSERT(!initialized_);
initialized_ = true;
if (device_id_ == -1) {
@@ -113,6 +115,8 @@
// Initialize the cuSPARSE library
CUSPARSE_SAFE_CALL(cusparseCreate(&cusparse_handle_));
CUSPARSE_SAFE_CALL(cusparseSetStream(cusparse_handle_, cudaStreamPerThread));
CUDNN_SAFE_CALL(cudnnCreate(&cudnn_handle_));
CUDNN_SAFE_CALL(cudnnSetStream(cudnn_handle_, cudaStreamPerThread));
}
}

@@ -248,8 +252,10 @@ void CuDevice::FinalizeActiveGpu() {
// Initialize the cuSPARSE library
CUSPARSE_SAFE_CALL(cusparseCreate(&cusparse_handle_));
CUSPARSE_SAFE_CALL(cusparseSetStream(cusparse_handle_, cudaStreamPerThread));
+CUDNN_SAFE_CALL(cudnnCreate(&cudnn_handle_));
+CUDNN_SAFE_CALL(cudnnSetStream(cudnn_handle_, cudaStreamPerThread));

-// Notify the user which GPU is being userd.
+// Notify the user which GPU is being used.
char name[128];
DeviceGetName(name,128, device_id);

@@ -511,14 +517,17 @@ CuDevice::CuDevice():
initialized_(false),
device_id_copy_(-1),
cublas_handle_(NULL),
-cusparse_handle_(NULL) {
+cusparse_handle_(NULL),
+cudnn_handle_(NULL) {
}

CuDevice::~CuDevice() {
if (cublas_handle_)
CUBLAS_SAFE_CALL(cublasDestroy(cublas_handle_));
if (cusparse_handle_)
CUSPARSE_SAFE_CALL(cusparseDestroy(cusparse_handle_));
+if (cudnn_handle_)
+  CUDNN_SAFE_CALL(cudnnDestroy(cudnn_handle_));
}


7 changes: 7 additions & 0 deletions src/cudamatrix/cu-device.h
@@ -2,6 +2,7 @@

// Copyright 2009-2012 Karel Vesely
// 2012-2015 Johns Hopkins University (author: Daniel Povey)
// 2018 Daniel Galvez

// See ../../COPYING for clarification regarding multiple authors
//
@@ -30,6 +31,7 @@
#include <string>
#include <iostream>
#include <cuda.h>
#include <cudnn.h>
#include <cuda_runtime_api.h>
#include "base/kaldi-common.h"
#include "base/timer.h"
@@ -80,6 +82,7 @@ class CuDevice {

inline cublasHandle_t GetCublasHandle() { return cublas_handle_; }
inline cusparseHandle_t GetCusparseHandle() { return cusparse_handle_; }
inline cudnnHandle_t GetCudnnHandle() { return cudnn_handle_; }

// We provide functions Malloc(), MallocPitch() and Free() which replace
// cudaMalloc(), cudaMallocPitch() and cudaFree(). Their function is to cache
@@ -271,6 +274,8 @@ class CuDevice {

cusparseHandle_t cusparse_handle_;

cudnnHandle_t cudnn_handle_;

}; // class CuDevice


@@ -289,6 +294,8 @@ inline cublasHandle_t GetCublasHandle() { return CuDevice::Instantiate().GetCubl
// A more convenient way to get the handle to use cuSPARSE APIs.
inline cusparseHandle_t GetCusparseHandle() { return CuDevice::Instantiate().GetCusparseHandle(); }

inline cudnnHandle_t GetCudnnHandle() { return CuDevice::Instantiate().GetCudnnHandle(); }


} // namespace kaldi

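As a sketch of how the new accessor is expected to be used (illustrative only; ExampleCudnnCall is a made-up name, and CUDNN_SAFE_CALL comes from cu-common.h above):

    #include "cudamatrix/cu-device.h"

    // Fetch the per-thread cuDNN handle the same way existing code fetches
    // the cuBLAS/cuSparse handles; Initialize() has already bound it to
    // cudaStreamPerThread.
    void ExampleCudnnCall() {
      cudnnHandle_t handle = kaldi::GetCudnnHandle();
      // ... create descriptors, then pass `handle` to cuDNN operations,
      // e.g. cudnnConvolutionForward(handle, ...).
    }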
2 changes: 1 addition & 1 deletion src/ivector/ivector-extractor.h
@@ -468,7 +468,7 @@ struct IvectorExtractorEstimationOptions {
"update any associated parameters.");
opts->Register("diagonalize", &diagonalize,
"If true, diagonalize the quadratic term in the "
"objective function. This reorders the ivector dimensions"
"objective function. This reorders the ivector dimensions "
"from most to least important.");
}
};
15 changes: 0 additions & 15 deletions src/makefiles/cuda_32bit.mk

This file was deleted.

17 changes: 15 additions & 2 deletions src/makefiles/cuda_64bit.mk
@@ -4,12 +4,25 @@ endif
ifndef CUDATKDIR
$(error CUDATKDIR not defined.)
endif
+ifndef CUDNNDIR
+$(error CUDNNDIR not defined.)
+endif
+
+# Order matters here. We must tell the compiler to search
+# $(CUDNNDIR)/lib64 before $(CUDATKDIR)/lib64 because the CUDNN .deb
+# files install cudnn to /usr/local/cuda/lib64, which would overshadow
+# the user-specified $(CUDNNDIR)
+CUDA_INCLUDE += -I$(CUDNNDIR)/include
+CXXFLAGS += -I$(CUDNNDIR)/include
+CUDA_LDFLAGS += -L$(CUDNNDIR)/lib64 -Wl,-rpath,$(CUDNNDIR)/lib64
+CUDA_LDLIBS += -lcudnn

CUDA_INCLUDE= -I$(CUDATKDIR)/include -I$(CUBROOT)
-CUDA_FLAGS = -Xcompiler "-fPIC -pthread -isystem $(OPENFSTINC)" --verbose --machine 64 -DHAVE_CUDA \
+CUDA_FLAGS = -Xcompiler "-fPIC -pthread -isystem $(OPENFSTINC)" --verbose --machine 64 -DHAVE_CUDA=1 \
      -ccbin $(CXX) -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \
-     -std=c++11 -DCUDA_API_PER_THREAD_DEFAULT_STREAM
+     -std=c++11 -DCUDA_API_PER_THREAD_DEFAULT_STREAM -I$(CUDATKDIR)/include

CXXFLAGS += -DHAVE_CUDA -I$(CUDATKDIR)/include

CUDA_LDFLAGS += -L$(CUDATKDIR)/lib64 -Wl,-rpath,$(CUDATKDIR)/lib64
CUDA_LDLIBS += -lcublas -lcusparse -lcudart -lcurand -lnvToolsExt #LDLIBS : The libs are loaded later than static libs in implicit rule
12 changes: 7 additions & 5 deletions src/matrix/kaldi-matrix.cc
@@ -28,7 +28,7 @@
#include "matrix/compressed-matrix.h"
#include "matrix/sparse-matrix.h"

-static_assert(int(kaldi::kNoTrans) == int(CblasNoTrans) && int(kaldi::kTrans) == int(CblasTrans),
+static_assert(int(kaldi::kNoTrans) == int(CblasNoTrans) && int(kaldi::kTrans) == int(CblasTrans),
"kaldi::kNoTrans and kaldi::kTrans must be equal to the appropriate CBLAS library constants!");

namespace kaldi {
@@ -538,7 +538,7 @@ void MatrixBase<Real>::AddMatSmat(Real alpha, const MatrixBase<Real> &A,
// pass stride to write a column as matrices are stored in row major order.
cblas_Xaxpy(this_num_rows, alpha_B_jk, a_col_k, A.stride_,
this_col_j, this->stride_);
-//for (MatrixIndexT i = 0; i < this_num_rows; ++i)
+//for (MatrixIndexT i = 0; i < this_num_rows; ++i)
// this_col_j[i*this->stride_] += alpha_B_jk * a_col_k[i*A.stride_];
}
}
@@ -1656,11 +1656,12 @@ SubMatrix<Real>::SubMatrix(const MatrixBase<Real> &M,


template<typename Real>
-SubMatrix<Real>::SubMatrix(Real *data,
+SubMatrix<Real>::SubMatrix(const Real *data,
MatrixIndexT num_rows,
MatrixIndexT num_cols,
MatrixIndexT stride):
-MatrixBase<Real>(data, num_cols, num_rows, stride) { // caution: reversed order!
+MatrixBase<Real>(const_cast<Real*>(data),
+                 num_cols, num_rows, stride) { // caution: reversed order!
if (data == NULL) {
KALDI_ASSERT(num_rows * num_cols == 0);
this->num_rows_ = 0;
@@ -1839,7 +1840,8 @@ void MatrixBase<Real>::Svd(VectorBase<Real> *s, MatrixBase<Real> *U, MatrixBase<
KALDI_ERR << "Error doing Svd (did not converge), first part of matrix is\n"
<< SubMatrix<Real>(*this, 0, std::min((MatrixIndexT)10, num_rows_),
0, std::min((MatrixIndexT)10, num_cols_))
<< ", min and max are: " << Min() << ", " << Max();
<< ", min, max and sum are: " << Min() << ", " << Max()
<< ", " << Sum();
}
}

7 changes: 4 additions & 3 deletions src/matrix/kaldi-matrix.h
@@ -952,9 +952,10 @@ class SubMatrix : public MatrixBase<Real> {
const MatrixIndexT co, // column offset, 0 < co < NumCols()
const MatrixIndexT c); // number of columns, c > 0

-// This initializer is mostly intended for use in CuMatrix and related
-// classes. Be careful!
-SubMatrix(Real *data,
+// This initializer does not take ownership of the pointer, and to use it you
+// need to have some understanding of how this library works. Caution:
+// it can be used to get around const limitations, so be careful.
+SubMatrix(const Real *data,
MatrixIndexT num_rows,
MatrixIndexT num_cols,
MatrixIndexT stride);
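A small sketch of what the relaxed const signature permits (illustrative values, not code from the PR):

    #include "matrix/kaldi-matrix.h"

    // Wrap an existing read-only buffer in a SubMatrix "view" without
    // copying. The constructor const_casts internally, so the caller must
    // keep the buffer alive and must not write through the view.
    void ExampleView() {
      const float data[6] = {1, 2, 3, 4, 5, 6};
      kaldi::SubMatrix<float> view(data, /*num_rows=*/2, /*num_cols=*/3,
                                   /*stride=*/3);
    }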
12 changes: 9 additions & 3 deletions src/nnet3/Makefile
@@ -12,7 +12,8 @@ TESTFILES = natural-gradient-online-test nnet-graph-test \
nnet-compile-utils-test nnet-nnet-test nnet-utils-test \
nnet-compile-test nnet-analyze-test nnet-compute-test \
nnet-optimize-test nnet-derivative-test nnet-example-test \
-nnet-common-test convolution-test attention-test
+nnet-common-test convolution-test attention-test \
+convolution-cudnn-test

OBJFILES = nnet-common.o nnet-compile.o nnet-component-itf.o \
nnet-simple-component.o nnet-normalize-component.o \
@@ -31,16 +32,21 @@ OBJFILES = nnet-common.o nnet-compile.o nnet-component-itf.o \
nnet-compile-looped.o decodable-simple-looped.o \
decodable-online-looped.o convolution.o \
nnet-convolutional-component.o attention.o \
-nnet-attention-component.o nnet-tdnn-component.o nnet-batch-compute.o
+nnet-attention-component.o nnet-tdnn-component.o \
+nnet-batch-compute.o convolution-cudnn.o


+ifeq ($(CUDA), true)
+OBJFILES += convolution-cudnn.o
+endif

LIBNAME = kaldi-nnet3

ADDLIBS = ../chain/kaldi-chain.a ../cudamatrix/kaldi-cudamatrix.a \
../decoder/kaldi-decoder.a ../lat/kaldi-lat.a \
../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a \
../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \
../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \
-../base/kaldi-base.a
+../base/kaldi-base.a

include ../makefiles/default_rules.mk