From 7fc1ca5a701855765d1a8c387d4b797a966fbd61 Mon Sep 17 00:00:00 2001
From: jluitjens <jluitjens@nvidia.com>
Date: Fri, 1 Mar 2019 11:00:46 -0800
Subject: [PATCH 1/7] Add device options to enable tensor core math mode.

---
 src/cudamatrix/cu-device.cc |  9 +++++++++
 src/cudamatrix/cu-device.h  | 17 +++++++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc
index 49c179b3673..bf17a579165 100644
--- a/src/cudamatrix/cu-device.cc
+++ b/src/cudamatrix/cu-device.cc
@@ -110,6 +110,13 @@ void CuDevice::Initialize() {
     // Initialize CUBLAS.
     CUBLAS_SAFE_CALL(cublasCreate(&cublas_handle_));
     CUBLAS_SAFE_CALL(cublasSetStream(cublas_handle_, cudaStreamPerThread));
+    
+    if(device_options_.use_tensor_cores_) {
+      //Enable tensor cores in CUBLAS
+      //Note if the device does not support tensor cores this will fall back to normal math mode
+      CUBLAS_SAFE_CALL(cublasSetMathMode(cublas_handle_, CUBLAS_TENSOR_OP_MATH));
+    }
+
     // Initialize the cuSPARSE library
     CUSPARSE_SAFE_CALL(cusparseCreate(&cusparse_handle_));
     CUSPARSE_SAFE_CALL(cusparseSetStream(cusparse_handle_, cudaStreamPerThread));
@@ -525,6 +532,8 @@ CuDevice::~CuDevice() {
 // Each thread has its own copy of the CuDevice object.
 // Note: this was declared "static".
 thread_local CuDevice CuDevice::this_thread_device_;
+  
+CuDevice::CuDeviceOptions_t CuDevice::device_options_;
 
 // define and initialize the static members of the CuDevice object.
 int32 CuDevice::device_id_ = -1;
diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h
index dc3df7e347d..83fb0f352d3 100644
--- a/src/cudamatrix/cu-device.h
+++ b/src/cudamatrix/cu-device.h
@@ -184,8 +184,25 @@ class CuDevice {
   /// (i.e. from outside the class), call this only if Enabled() returns true.
   bool IsComputeExclusive();
 
+  //Register command line options for CUDA device.  
+  //This must be done before calling CuDevice::Initialize()
+  static void RegisterDeviceOptions(OptionsItf *po) {
+    CuDevice::device_options_.Register(po);  
+  }
   ~CuDevice();
  private:
+
+  struct CuDeviceOptions_t {
+    bool use_tensor_cores_; //Enable tensor cores
+    CuDeviceOptions_t () : use_tensor_cores_(false) {};
+    void Register(OptionsItf *po) {
+      po->Register("cuda-use-tensor-cores",&use_tensor_cores_, "Enable FP16 tensor math. "
+          "This is higher performance but less accuracy.");
+    }
+  };
+
+  static CuDeviceOptions_t device_options_;
+
   // Default constructor used to initialize this_thread_device_
   CuDevice();
   CuDevice(CuDevice&); // Disallow.

From 7982a207816e9a975fd474339ccdef9956c3269b Mon Sep 17 00:00:00 2001
From: jluitjens <jluitjens@nvidia.com>
Date: Sat, 2 Mar 2019 09:55:32 -0800
Subject: [PATCH 2/7] fixed typo in comment

---
 src/cudamatrix/cu-device.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h
index 83fb0f352d3..a65f6b09e78 100644
--- a/src/cudamatrix/cu-device.h
+++ b/src/cudamatrix/cu-device.h
@@ -184,8 +184,11 @@ class CuDevice {
   /// (i.e. from outside the class), call this only if Enabled() returns true.
   bool IsComputeExclusive();
 
-  //Register command line options for CUDA device.  
-  //This must be done before calling CuDevice::Initialize()
+  // Register command line options for CUDA device.  
+  // This must be done before calling CuDevice::Initialize()
+  // Example:
+  //  CuDevice::RegisterDeviceOptions(&po);
+  //  CuDevice::Initialize();
   static void RegisterDeviceOptions(OptionsItf *po) {
     CuDevice::device_options_.Register(po);  
   }

From 3d111ec9c8d31f6ad6b1291b199efd61af1f84ae Mon Sep 17 00:00:00 2001
From: jluitjens <jluitjens@nvidia.com>
Date: Sat, 2 Mar 2019 12:59:52 -0800
Subject: [PATCH 3/7] Style fixes

---
 src/cudamatrix/cu-device.cc | 11 ++++++-----
 src/cudamatrix/cu-device.h  | 14 ++++++++------
 2 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc
index bf17a579165..19625e4c715 100644
--- a/src/cudamatrix/cu-device.cc
+++ b/src/cudamatrix/cu-device.cc
@@ -111,10 +111,11 @@ void CuDevice::Initialize() {
     CUBLAS_SAFE_CALL(cublasCreate(&cublas_handle_));
     CUBLAS_SAFE_CALL(cublasSetStream(cublas_handle_, cudaStreamPerThread));
     
-    if(device_options_.use_tensor_cores_) {
-      //Enable tensor cores in CUBLAS
-      //Note if the device does not support tensor cores this will fall back to normal math mode
-      CUBLAS_SAFE_CALL(cublasSetMathMode(cublas_handle_, CUBLAS_TENSOR_OP_MATH));
+    if (device_options_.use_tensor_cores_) {
+      // Enable tensor cores in CUBLAS
+      // Note if the device does not support tensor cores this will fall back to normal math mode
+      CUBLAS_SAFE_CALL(cublasSetMathMode(cublas_handle_, 
+            CUBLAS_TENSOR_OP_MATH));
     }
 
     // Initialize the cuSPARSE library
@@ -533,7 +534,7 @@ CuDevice::~CuDevice() {
 // Note: this was declared "static".
 thread_local CuDevice CuDevice::this_thread_device_;
   
-CuDevice::CuDeviceOptions_t CuDevice::device_options_;
+CuDevice::CuDeviceOptions CuDevice::device_options_;
 
 // define and initialize the static members of the CuDevice object.
 int32 CuDevice::device_id_ = -1;
diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h
index a65f6b09e78..74a0319665c 100644
--- a/src/cudamatrix/cu-device.h
+++ b/src/cudamatrix/cu-device.h
@@ -195,16 +195,18 @@ class CuDevice {
   ~CuDevice();
  private:
 
-  struct CuDeviceOptions_t {
-    bool use_tensor_cores_; //Enable tensor cores
-    CuDeviceOptions_t () : use_tensor_cores_(false) {};
+  struct CuDeviceOptions {
+    bool use_tensor_cores_; // Enable tensor cores
+    CuDeviceOptions () : use_tensor_cores_(false) {};
     void Register(OptionsItf *po) {
-      po->Register("cuda-use-tensor-cores",&use_tensor_cores_, "Enable FP16 tensor math. "
-          "This is higher performance but less accuracy.");
+      po->Register("cuda-use-tensor-cores",&use_tensor_cores_, 
+          "Enable FP16 tensor math. "
+          "This is higher performance but less accuracy. "
+          "This is only recommended for inference.");
     }
   };
 
-  static CuDeviceOptions_t device_options_;
+  static CuDeviceOptions device_options_;
 
   // Default constructor used to initialize this_thread_device_
   CuDevice();

From e5469b784595c2c7aad049847b5a0ac012643b16 Mon Sep 17 00:00:00 2001
From: jluitjens <jluitjens@nvidia.com>
Date: Sat, 2 Mar 2019 13:01:45 -0800
Subject: [PATCH 4/7] Fixed one more style issue

---
 src/cudamatrix/cu-device.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h
index 74a0319665c..837441ca600 100644
--- a/src/cudamatrix/cu-device.h
+++ b/src/cudamatrix/cu-device.h
@@ -199,7 +199,7 @@ class CuDevice {
     bool use_tensor_cores_; // Enable tensor cores
     CuDeviceOptions () : use_tensor_cores_(false) {};
     void Register(OptionsItf *po) {
-      po->Register("cuda-use-tensor-cores",&use_tensor_cores_, 
+      po->Register("cuda-use-tensor-cores", &use_tensor_cores_, 
           "Enable FP16 tensor math. "
           "This is higher performance but less accuracy. "
           "This is only recommended for inference.");

From 919f32c1b89cf87ca7f9a6396fbdbb566acd9d6c Mon Sep 17 00:00:00 2001
From: jluitjens <jluitjens@nvidia.com>
Date: Sat, 2 Mar 2019 13:06:25 -0800
Subject: [PATCH 5/7] Remove trailing _ from struct

---
 src/cudamatrix/cu-device.cc | 2 +-
 src/cudamatrix/cu-device.h  | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc
index 19625e4c715..140275d3b6e 100644
--- a/src/cudamatrix/cu-device.cc
+++ b/src/cudamatrix/cu-device.cc
@@ -111,7 +111,7 @@ void CuDevice::Initialize() {
     CUBLAS_SAFE_CALL(cublasCreate(&cublas_handle_));
     CUBLAS_SAFE_CALL(cublasSetStream(cublas_handle_, cudaStreamPerThread));
     
-    if (device_options_.use_tensor_cores_) {
+    if (device_options_.use_tensor_cores) {
       // Enable tensor cores in CUBLAS
       // Note if the device does not support tensor cores this will fall back to normal math mode
       CUBLAS_SAFE_CALL(cublasSetMathMode(cublas_handle_, 
diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h
index 837441ca600..2826cbc382f 100644
--- a/src/cudamatrix/cu-device.h
+++ b/src/cudamatrix/cu-device.h
@@ -196,10 +196,10 @@ class CuDevice {
  private:
 
   struct CuDeviceOptions {
-    bool use_tensor_cores_; // Enable tensor cores
-    CuDeviceOptions () : use_tensor_cores_(false) {};
+    bool use_tensor_cores; // Enable tensor cores
+    CuDeviceOptions () : use_tensor_cores(false) {};
     void Register(OptionsItf *po) {
-      po->Register("cuda-use-tensor-cores", &use_tensor_cores_, 
+      po->Register("cuda-use-tensor-cores", &use_tensor_cores, 
           "Enable FP16 tensor math. "
           "This is higher performance but less accuracy. "
           "This is only recommended for inference.");

From 742ec8850da1efc124d9073978dcb3386e617c2e Mon Sep 17 00:00:00 2001
From: jluitjens <jluitjens@nvidia.com>
Date: Sat, 2 Mar 2019 13:12:43 -0800
Subject: [PATCH 6/7] Register device options in a few of the nnet3 binaries.

---
 src/nnet3bin/nnet3-compute-batch.cc       | 1 +
 src/nnet3bin/nnet3-compute.cc             | 1 +
 src/nnet3bin/nnet3-latgen-faster-batch.cc | 1 +
 src/nnet3bin/nnet3-xvector-compute.cc     | 1 +
 4 files changed, 4 insertions(+)

diff --git a/src/nnet3bin/nnet3-compute-batch.cc b/src/nnet3bin/nnet3-compute-batch.cc
index b0001c96f57..9fd6eee4d58 100644
--- a/src/nnet3bin/nnet3-compute-batch.cc
+++ b/src/nnet3bin/nnet3-compute-batch.cc
@@ -88,6 +88,7 @@ int main(int argc, char *argv[]) {
     }
 
 #if HAVE_CUDA==1
+    CuDevice::RegisterDeviceOptions(&po);
     CuDevice::Instantiate().AllowMultithreading();
     CuDevice::Instantiate().SelectGpuId(use_gpu);
 #endif
diff --git a/src/nnet3bin/nnet3-compute.cc b/src/nnet3bin/nnet3-compute.cc
index 45fde99a4f5..53803f970b2 100644
--- a/src/nnet3bin/nnet3-compute.cc
+++ b/src/nnet3bin/nnet3-compute.cc
@@ -86,6 +86,7 @@ int main(int argc, char *argv[]) {
     }
 
 #if HAVE_CUDA==1
+    CuDevice::RegisterDeviceOptions(&po);
     CuDevice::Instantiate().SelectGpuId(use_gpu);
 #endif
 
diff --git a/src/nnet3bin/nnet3-latgen-faster-batch.cc b/src/nnet3bin/nnet3-latgen-faster-batch.cc
index fad2d5ed356..520d7010eaa 100644
--- a/src/nnet3bin/nnet3-latgen-faster-batch.cc
+++ b/src/nnet3bin/nnet3-latgen-faster-batch.cc
@@ -116,6 +116,7 @@ int main(int argc, char *argv[]) {
     }
 
 #if HAVE_CUDA==1
+    CuDevice::RegisterDeviceOptions(&po);
     CuDevice::Instantiate().AllowMultithreading();
     CuDevice::Instantiate().SelectGpuId(use_gpu);
 #endif
diff --git a/src/nnet3bin/nnet3-xvector-compute.cc b/src/nnet3bin/nnet3-xvector-compute.cc
index a4bc89a7def..4b56dbe4390 100644
--- a/src/nnet3bin/nnet3-xvector-compute.cc
+++ b/src/nnet3bin/nnet3-xvector-compute.cc
@@ -121,6 +121,7 @@ int main(int argc, char *argv[]) {
     }
 
 #if HAVE_CUDA==1
+    CuDevice::RegisterDeviceOptions(&po);
     CuDevice::Instantiate().SelectGpuId(use_gpu);
 #endif
 

From cd812a00378b21c17bf2d11d536890af53c4a606 Mon Sep 17 00:00:00 2001
From: jluitjens <jluitjens@nvidia.com>
Date: Sat, 2 Mar 2019 14:42:34 -0800
Subject: [PATCH 7/7] Register before read.  Updated example.

---
 src/cudamatrix/cu-device.h                | 1 +
 src/nnet3bin/nnet3-compute-batch.cc       | 5 ++++-
 src/nnet3bin/nnet3-compute.cc             | 5 ++++-
 src/nnet3bin/nnet3-latgen-faster-batch.cc | 5 ++++-
 src/nnet3bin/nnet3-xvector-compute.cc     | 5 ++++-
 5 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h
index 2826cbc382f..8816f9d223b 100644
--- a/src/cudamatrix/cu-device.h
+++ b/src/cudamatrix/cu-device.h
@@ -188,6 +188,7 @@ class CuDevice {
   // This must be done before calling CuDevice::Initialize()
   // Example:
   //  CuDevice::RegisterDeviceOptions(&po);
+  //  po.Read(argc, argv);
   //  CuDevice::Initialize();
   static void RegisterDeviceOptions(OptionsItf *po) {
     CuDevice::device_options_.Register(po);  
diff --git a/src/nnet3bin/nnet3-compute-batch.cc b/src/nnet3bin/nnet3-compute-batch.cc
index 9fd6eee4d58..5d4b9b1db48 100644
--- a/src/nnet3bin/nnet3-compute-batch.cc
+++ b/src/nnet3bin/nnet3-compute-batch.cc
@@ -80,6 +80,10 @@ int main(int argc, char *argv[]) {
                 "priors stored with the model (in this case, "
                 "a .mdl file is expected as input).");
 
+#if HAVE_CUDA==1
+    CuDevice::RegisterDeviceOptions(&po);
+#endif
+
     po.Read(argc, argv);
 
     if (po.NumArgs() != 3) {
@@ -88,7 +92,6 @@ int main(int argc, char *argv[]) {
     }
 
 #if HAVE_CUDA==1
-    CuDevice::RegisterDeviceOptions(&po);
     CuDevice::Instantiate().AllowMultithreading();
     CuDevice::Instantiate().SelectGpuId(use_gpu);
 #endif
diff --git a/src/nnet3bin/nnet3-compute.cc b/src/nnet3bin/nnet3-compute.cc
index 53803f970b2..cf133025aae 100644
--- a/src/nnet3bin/nnet3-compute.cc
+++ b/src/nnet3bin/nnet3-compute.cc
@@ -78,6 +78,10 @@ int main(int argc, char *argv[]) {
                 "priors stored with the model (in this case, "
                 "a .mdl file is expected as input).");
 
+#if HAVE_CUDA==1
+    CuDevice::RegisterDeviceOptions(&po);
+#endif
+
     po.Read(argc, argv);
 
     if (po.NumArgs() != 3) {
@@ -86,7 +90,6 @@ int main(int argc, char *argv[]) {
     }
 
 #if HAVE_CUDA==1
-    CuDevice::RegisterDeviceOptions(&po);
     CuDevice::Instantiate().SelectGpuId(use_gpu);
 #endif
 
diff --git a/src/nnet3bin/nnet3-latgen-faster-batch.cc b/src/nnet3bin/nnet3-latgen-faster-batch.cc
index 520d7010eaa..ec52cff9776 100644
--- a/src/nnet3bin/nnet3-latgen-faster-batch.cc
+++ b/src/nnet3bin/nnet3-latgen-faster-batch.cc
@@ -108,6 +108,10 @@ int main(int argc, char *argv[]) {
     po.Register("use-gpu", &use_gpu,
                 "yes|no|optional|wait, only has effect if compiled with CUDA");
 
+#if HAVE_CUDA==1
+    CuDevice::RegisterDeviceOptions(&po);
+#endif
+    
     po.Read(argc, argv);
 
     if (po.NumArgs() != 4) {
@@ -116,7 +120,6 @@ int main(int argc, char *argv[]) {
     }
 
 #if HAVE_CUDA==1
-    CuDevice::RegisterDeviceOptions(&po);
     CuDevice::Instantiate().AllowMultithreading();
     CuDevice::Instantiate().SelectGpuId(use_gpu);
 #endif
diff --git a/src/nnet3bin/nnet3-xvector-compute.cc b/src/nnet3bin/nnet3-xvector-compute.cc
index 4b56dbe4390..e327681cf9b 100644
--- a/src/nnet3bin/nnet3-xvector-compute.cc
+++ b/src/nnet3bin/nnet3-xvector-compute.cc
@@ -113,6 +113,10 @@ int main(int argc, char *argv[]) {
     po.Register("pad-input", &pad_input, "If true, duplicate the first and "
       "last frames of the input features as required to equal min-chunk-size.");
 
+#if HAVE_CUDA==1
+    CuDevice::RegisterDeviceOptions(&po);
+#endif
+
     po.Read(argc, argv);
 
     if (po.NumArgs() != 3) {
@@ -121,7 +125,6 @@ int main(int argc, char *argv[]) {
     }
 
 #if HAVE_CUDA==1
-    CuDevice::RegisterDeviceOptions(&po);
     CuDevice::Instantiate().SelectGpuId(use_gpu);
 #endif