From 7fc1ca5a701855765d1a8c387d4b797a966fbd61 Mon Sep 17 00:00:00 2001 From: jluitjens Date: Fri, 1 Mar 2019 11:00:46 -0800 Subject: [PATCH 1/7] Add device options to enable tensor core math mode. --- src/cudamatrix/cu-device.cc | 9 +++++++++ src/cudamatrix/cu-device.h | 17 +++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index 49c179b3673..bf17a579165 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -110,6 +110,13 @@ void CuDevice::Initialize() { // Initialize CUBLAS. CUBLAS_SAFE_CALL(cublasCreate(&cublas_handle_)); CUBLAS_SAFE_CALL(cublasSetStream(cublas_handle_, cudaStreamPerThread)); + + if(device_options_.use_tensor_cores_) { + //Enable tensor cores in CUBLAS + //Note if the device does not support tensor cores this will fall back to normal math mode + CUBLAS_SAFE_CALL(cublasSetMathMode(cublas_handle_, CUBLAS_TENSOR_OP_MATH)); + } + // Initialize the cuSPARSE library CUSPARSE_SAFE_CALL(cusparseCreate(&cusparse_handle_)); CUSPARSE_SAFE_CALL(cusparseSetStream(cusparse_handle_, cudaStreamPerThread)); @@ -525,6 +532,8 @@ CuDevice::~CuDevice() { // Each thread has its own copy of the CuDevice object. // Note: this was declared "static". thread_local CuDevice CuDevice::this_thread_device_; + +CuDevice::CuDeviceOptions_t CuDevice::device_options_; // define and initialize the static members of the CuDevice object. int32 CuDevice::device_id_ = -1; diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h index dc3df7e347d..83fb0f352d3 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -184,8 +184,25 @@ class CuDevice { /// (i.e. from outside the class), call this only if Enabled() returns true. bool IsComputeExclusive(); + //Register command line options for CUDA device. + //This must be done before calling CuDevice::Initialize() + static void RegisterDeviceOptions(OptionsItf *po) { + CuDevice::device_options_.Register(po); + } ~CuDevice(); private: + + struct CuDeviceOptions_t { + bool use_tensor_cores_; //Enable tensor cores + CuDeviceOptions_t () : use_tensor_cores_(false) {}; + void Register(OptionsItf *po) { + po->Register("cuda-use-tensor-cores",&use_tensor_cores_, "Enable FP16 tensor math. " + "This is higher performance but less accuracy."); + } + }; + + static CuDeviceOptions_t device_options_; + // Default constructor used to initialize this_thread_device_ CuDevice(); CuDevice(CuDevice&); // Disallow. From 7982a207816e9a975fd474339ccdef9956c3269b Mon Sep 17 00:00:00 2001 From: jluitjens Date: Sat, 2 Mar 2019 09:55:32 -0800 Subject: [PATCH 2/7] fixed typo in comment --- src/cudamatrix/cu-device.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h index 83fb0f352d3..a65f6b09e78 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -184,8 +184,11 @@ class CuDevice { /// (i.e. from outside the class), call this only if Enabled() returns true. bool IsComputeExclusive(); - //Register command line options for CUDA device. - //This must be done before calling CuDevice::Initialize() + // Register command line options for CUDA device. + // This must be done before calling CuDevice::Initialize() + // Example: + // CuDevice::RegisterDeviceOptions(&po); + // CuDevice::Initialize(); static void RegisterDeviceOptions(OptionsItf *po) { CuDevice::device_options_.Register(po); } From 3d111ec9c8d31f6ad6b1291b199efd61af1f84ae Mon Sep 17 00:00:00 2001 From: jluitjens Date: Sat, 2 Mar 2019 12:59:52 -0800 Subject: [PATCH 3/7] Style fixes --- src/cudamatrix/cu-device.cc | 11 ++++++----- src/cudamatrix/cu-device.h | 14 ++++++++------ 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index bf17a579165..19625e4c715 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -111,10 +111,11 @@ void CuDevice::Initialize() { CUBLAS_SAFE_CALL(cublasCreate(&cublas_handle_)); CUBLAS_SAFE_CALL(cublasSetStream(cublas_handle_, cudaStreamPerThread)); - if(device_options_.use_tensor_cores_) { - //Enable tensor cores in CUBLAS - //Note if the device does not support tensor cores this will fall back to normal math mode - CUBLAS_SAFE_CALL(cublasSetMathMode(cublas_handle_, CUBLAS_TENSOR_OP_MATH)); + if (device_options_.use_tensor_cores_) { + // Enable tensor cores in CUBLAS + // Note if the device does not support tensor cores this will fall back to normal math mode + CUBLAS_SAFE_CALL(cublasSetMathMode(cublas_handle_, + CUBLAS_TENSOR_OP_MATH)); } // Initialize the cuSPARSE library @@ -533,7 +534,7 @@ CuDevice::~CuDevice() { // Note: this was declared "static". thread_local CuDevice CuDevice::this_thread_device_; -CuDevice::CuDeviceOptions_t CuDevice::device_options_; +CuDevice::CuDeviceOptions CuDevice::device_options_; // define and initialize the static members of the CuDevice object. int32 CuDevice::device_id_ = -1; diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h index a65f6b09e78..74a0319665c 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -195,16 +195,18 @@ class CuDevice { ~CuDevice(); private: - struct CuDeviceOptions_t { - bool use_tensor_cores_; //Enable tensor cores - CuDeviceOptions_t () : use_tensor_cores_(false) {}; + struct CuDeviceOptions { + bool use_tensor_cores_; // Enable tensor cores + CuDeviceOptions () : use_tensor_cores_(false) {}; void Register(OptionsItf *po) { - po->Register("cuda-use-tensor-cores",&use_tensor_cores_, "Enable FP16 tensor math. " - "This is higher performance but less accuracy."); + po->Register("cuda-use-tensor-cores",&use_tensor_cores_, + "Enable FP16 tensor math. " + "This is higher performance but less accuracy. " + "This is only recommended for inference."); } }; - static CuDeviceOptions_t device_options_; + static CuDeviceOptions device_options_; // Default constructor used to initialize this_thread_device_ CuDevice(); From e5469b784595c2c7aad049847b5a0ac012643b16 Mon Sep 17 00:00:00 2001 From: jluitjens Date: Sat, 2 Mar 2019 13:01:45 -0800 Subject: [PATCH 4/7] Fixed one more style issue --- src/cudamatrix/cu-device.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h index 74a0319665c..837441ca600 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -199,7 +199,7 @@ class CuDevice { bool use_tensor_cores_; // Enable tensor cores CuDeviceOptions () : use_tensor_cores_(false) {}; void Register(OptionsItf *po) { - po->Register("cuda-use-tensor-cores",&use_tensor_cores_, + po->Register("cuda-use-tensor-cores", &use_tensor_cores_, "Enable FP16 tensor math. " "This is higher performance but less accuracy. " "This is only recommended for inference."); From 919f32c1b89cf87ca7f9a6396fbdbb566acd9d6c Mon Sep 17 00:00:00 2001 From: jluitjens Date: Sat, 2 Mar 2019 13:06:25 -0800 Subject: [PATCH 5/7] Remove trailing _ from struct --- src/cudamatrix/cu-device.cc | 2 +- src/cudamatrix/cu-device.h | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index 19625e4c715..140275d3b6e 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -111,7 +111,7 @@ void CuDevice::Initialize() { CUBLAS_SAFE_CALL(cublasCreate(&cublas_handle_)); CUBLAS_SAFE_CALL(cublasSetStream(cublas_handle_, cudaStreamPerThread)); - if (device_options_.use_tensor_cores_) { + if (device_options_.use_tensor_cores) { // Enable tensor cores in CUBLAS // Note if the device does not support tensor cores this will fall back to normal math mode CUBLAS_SAFE_CALL(cublasSetMathMode(cublas_handle_, diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h index 837441ca600..2826cbc382f 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -196,10 +196,10 @@ class CuDevice { private: struct CuDeviceOptions { - bool use_tensor_cores_; // Enable tensor cores - CuDeviceOptions () : use_tensor_cores_(false) {}; + bool use_tensor_cores; // Enable tensor cores + CuDeviceOptions () : use_tensor_cores(false) {}; void Register(OptionsItf *po) { - po->Register("cuda-use-tensor-cores", &use_tensor_cores_, + po->Register("cuda-use-tensor-cores", &use_tensor_cores, "Enable FP16 tensor math. " "This is higher performance but less accuracy. " "This is only recommended for inference."); From 742ec8850da1efc124d9073978dcb3386e617c2e Mon Sep 17 00:00:00 2001 From: jluitjens Date: Sat, 2 Mar 2019 13:12:43 -0800 Subject: [PATCH 6/7] Register device options in a few of the nnet3 binaries. --- src/nnet3bin/nnet3-compute-batch.cc | 1 + src/nnet3bin/nnet3-compute.cc | 1 + src/nnet3bin/nnet3-latgen-faster-batch.cc | 1 + src/nnet3bin/nnet3-xvector-compute.cc | 1 + 4 files changed, 4 insertions(+) diff --git a/src/nnet3bin/nnet3-compute-batch.cc b/src/nnet3bin/nnet3-compute-batch.cc index b0001c96f57..9fd6eee4d58 100644 --- a/src/nnet3bin/nnet3-compute-batch.cc +++ b/src/nnet3bin/nnet3-compute-batch.cc @@ -88,6 +88,7 @@ int main(int argc, char *argv[]) { } #if HAVE_CUDA==1 + CuDevice::RegisterDeviceOptions(&po); CuDevice::Instantiate().AllowMultithreading(); CuDevice::Instantiate().SelectGpuId(use_gpu); #endif diff --git a/src/nnet3bin/nnet3-compute.cc b/src/nnet3bin/nnet3-compute.cc index 45fde99a4f5..53803f970b2 100644 --- a/src/nnet3bin/nnet3-compute.cc +++ b/src/nnet3bin/nnet3-compute.cc @@ -86,6 +86,7 @@ int main(int argc, char *argv[]) { } #if HAVE_CUDA==1 + CuDevice::RegisterDeviceOptions(&po); CuDevice::Instantiate().SelectGpuId(use_gpu); #endif diff --git a/src/nnet3bin/nnet3-latgen-faster-batch.cc b/src/nnet3bin/nnet3-latgen-faster-batch.cc index fad2d5ed356..520d7010eaa 100644 --- a/src/nnet3bin/nnet3-latgen-faster-batch.cc +++ b/src/nnet3bin/nnet3-latgen-faster-batch.cc @@ -116,6 +116,7 @@ int main(int argc, char *argv[]) { } #if HAVE_CUDA==1 + CuDevice::RegisterDeviceOptions(&po); CuDevice::Instantiate().AllowMultithreading(); CuDevice::Instantiate().SelectGpuId(use_gpu); #endif diff --git a/src/nnet3bin/nnet3-xvector-compute.cc b/src/nnet3bin/nnet3-xvector-compute.cc index a4bc89a7def..4b56dbe4390 100644 --- a/src/nnet3bin/nnet3-xvector-compute.cc +++ b/src/nnet3bin/nnet3-xvector-compute.cc @@ -121,6 +121,7 @@ int main(int argc, char *argv[]) { } #if HAVE_CUDA==1 + CuDevice::RegisterDeviceOptions(&po); CuDevice::Instantiate().SelectGpuId(use_gpu); #endif From cd812a00378b21c17bf2d11d536890af53c4a606 Mon Sep 17 00:00:00 2001 From: jluitjens Date: Sat, 2 Mar 2019 14:42:34 -0800 Subject: [PATCH 7/7] Register before read. Updated example. --- src/cudamatrix/cu-device.h | 1 + src/nnet3bin/nnet3-compute-batch.cc | 5 ++++- src/nnet3bin/nnet3-compute.cc | 5 ++++- src/nnet3bin/nnet3-latgen-faster-batch.cc | 5 ++++- src/nnet3bin/nnet3-xvector-compute.cc | 5 ++++- 5 files changed, 17 insertions(+), 4 deletions(-) diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h index 2826cbc382f..8816f9d223b 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -188,6 +188,7 @@ class CuDevice { // This must be done before calling CuDevice::Initialize() // Example: // CuDevice::RegisterDeviceOptions(&po); + // po.Read(argc, argv); // CuDevice::Initialize(); static void RegisterDeviceOptions(OptionsItf *po) { CuDevice::device_options_.Register(po); diff --git a/src/nnet3bin/nnet3-compute-batch.cc b/src/nnet3bin/nnet3-compute-batch.cc index 9fd6eee4d58..5d4b9b1db48 100644 --- a/src/nnet3bin/nnet3-compute-batch.cc +++ b/src/nnet3bin/nnet3-compute-batch.cc @@ -80,6 +80,10 @@ int main(int argc, char *argv[]) { "priors stored with the model (in this case, " "a .mdl file is expected as input)."); +#if HAVE_CUDA==1 + CuDevice::RegisterDeviceOptions(&po); +#endif + po.Read(argc, argv); if (po.NumArgs() != 3) { @@ -88,7 +92,6 @@ int main(int argc, char *argv[]) { } #if HAVE_CUDA==1 - CuDevice::RegisterDeviceOptions(&po); CuDevice::Instantiate().AllowMultithreading(); CuDevice::Instantiate().SelectGpuId(use_gpu); #endif diff --git a/src/nnet3bin/nnet3-compute.cc b/src/nnet3bin/nnet3-compute.cc index 53803f970b2..cf133025aae 100644 --- a/src/nnet3bin/nnet3-compute.cc +++ b/src/nnet3bin/nnet3-compute.cc @@ -78,6 +78,10 @@ int main(int argc, char *argv[]) { "priors stored with the model (in this case, " "a .mdl file is expected as input)."); +#if HAVE_CUDA==1 + CuDevice::RegisterDeviceOptions(&po); +#endif + po.Read(argc, argv); if (po.NumArgs() != 3) { @@ -86,7 +90,6 @@ int main(int argc, char *argv[]) { } #if HAVE_CUDA==1 - CuDevice::RegisterDeviceOptions(&po); CuDevice::Instantiate().SelectGpuId(use_gpu); #endif diff --git a/src/nnet3bin/nnet3-latgen-faster-batch.cc b/src/nnet3bin/nnet3-latgen-faster-batch.cc index 520d7010eaa..ec52cff9776 100644 --- a/src/nnet3bin/nnet3-latgen-faster-batch.cc +++ b/src/nnet3bin/nnet3-latgen-faster-batch.cc @@ -108,6 +108,10 @@ int main(int argc, char *argv[]) { po.Register("use-gpu", &use_gpu, "yes|no|optional|wait, only has effect if compiled with CUDA"); +#if HAVE_CUDA==1 + CuDevice::RegisterDeviceOptions(&po); +#endif + po.Read(argc, argv); if (po.NumArgs() != 4) { @@ -116,7 +120,6 @@ int main(int argc, char *argv[]) { } #if HAVE_CUDA==1 - CuDevice::RegisterDeviceOptions(&po); CuDevice::Instantiate().AllowMultithreading(); CuDevice::Instantiate().SelectGpuId(use_gpu); #endif diff --git a/src/nnet3bin/nnet3-xvector-compute.cc b/src/nnet3bin/nnet3-xvector-compute.cc index 4b56dbe4390..e327681cf9b 100644 --- a/src/nnet3bin/nnet3-xvector-compute.cc +++ b/src/nnet3bin/nnet3-xvector-compute.cc @@ -113,6 +113,10 @@ int main(int argc, char *argv[]) { po.Register("pad-input", &pad_input, "If true, duplicate the first and " "last frames of the input features as required to equal min-chunk-size."); +#if HAVE_CUDA==1 + CuDevice::RegisterDeviceOptions(&po); +#endif + po.Read(argc, argv); if (po.NumArgs() != 3) { @@ -121,7 +125,6 @@ int main(int argc, char *argv[]) { } #if HAVE_CUDA==1 - CuDevice::RegisterDeviceOptions(&po); CuDevice::Instantiate().SelectGpuId(use_gpu); #endif