38 changes: 19 additions & 19 deletions src/cudamatrix/cu-matrix.cc
@@ -428,7 +428,7 @@ void CuMatrixBase<Real>:: CopyRangeFromMatClamped(const CuMatrixBase<Real> & src
cuda_mat_copy_range_clamped(start_range, end_range, NumCols(),
src.Data(), src.Stride(), clamp_low, clamp_high,
Data(), Stride());
- } else
+ } else
#endif
{
for (int32 t = start_range; t < end_range; t++) {
@@ -459,8 +459,8 @@ void CuMatrixBase<Real>::CopyToMat(MatrixBase<OtherReal> *dst,
MatrixIndexT src_pitch = stride_*sizeof(Real);
MatrixIndexT dst_pitch = dst->Stride()*sizeof(Real);
MatrixIndexT width = NumCols()*sizeof(Real);
- CU_SAFE_CALL(cudaMemcpy2DAsync(dst->Data(), dst_pitch, this->data_,
- src_pitch, width, this->num_rows_,
+ CU_SAFE_CALL(cudaMemcpy2DAsync(dst->Data(), dst_pitch, this->data_,
+ src_pitch, width, this->num_rows_,
cudaMemcpyDeviceToHost, cudaStreamPerThread));
CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
CuDevice::Instantiate().AccuProfile("CuMatrix::CopyToMatD2H", tim);
@@ -511,7 +511,7 @@ void CuMatrixBase<Real>::SetZero() {
if (CuDevice::Instantiate().Enabled()) {
CuTimer tim;
CU_SAFE_CALL(cudaMemset2DAsync(data_, stride_ * sizeof(Real), 0,
- num_cols_ * sizeof(Real), num_rows_ ,
+ num_cols_ * sizeof(Real), num_rows_ ,
cudaStreamPerThread));
CuDevice::Instantiate().AccuProfile("CuMatrix::SetZero", tim);
} else
@@ -1679,9 +1679,9 @@ void CuMatrix<Real>::CompObjfAndDeriv(const std::vector<MatrixElement<Real> >& s
return;
}
void *addr = CuDevice::Instantiate().Malloc(sv_labels.size() * sizeof(MatrixElement<Real>));
- CU_SAFE_CALL(cudaMemcpyAsync(addr, sv_labels.data(), sv_labels.size() *
- sizeof(MatrixElement<Real>),
- cudaMemcpyHostToDevice,
+ CU_SAFE_CALL(cudaMemcpyAsync(addr, sv_labels.data(), sv_labels.size() *
+ sizeof(MatrixElement<Real>),
+ cudaMemcpyHostToDevice,
cudaStreamPerThread));
CuTimer tim;
CuVector<Real> tmp(2, kUndefined);
@@ -1706,7 +1706,7 @@ void CuMatrix<Real>::CompObjfAndDeriv(const std::vector<MatrixElement<Real> >& s
//KALDI_ASSERT(label >= 0 && label < nnet_.OutputDim());
Real this_prob = output(m, label);
KALDI_ASSERT(this_prob >= 0.99e-20); // we floored to 1.0e-20 in SoftmaxLayer.
- *tot_objf += weight * Log(this_prob);
+ *tot_objf += weight * kaldi::Log(this_prob);
*tot_weight += weight;
(*this)(m, label) += weight / this_prob;
}
@@ -2258,7 +2258,7 @@ void AddMatMatBatched(const Real alpha, std::vector<CuSubMatrix<Real>* > &C,
host_c_array[i] = C[i]->data_;
}

- CU_SAFE_CALL(cudaMemcpyAsync(device_abc_array, host_abc_array,
+ CU_SAFE_CALL(cudaMemcpyAsync(device_abc_array, host_abc_array,
3*size*sizeof(Real*), cudaMemcpyHostToDevice,
cudaStreamPerThread));

@@ -2340,16 +2340,16 @@ void CuMatrixBase<Real>::CopyRowsFromVec(const VectorBase<Real> &v) {
if (v.Dim() == num_rows_*num_cols_) {
if (stride_ == num_cols_) {
const Real* v_data = v.Data();
- CU_SAFE_CALL(cudaMemcpyAsync(data_, v_data,
- sizeof(Real)*num_rows_*num_cols_,
- cudaMemcpyHostToDevice,
+ CU_SAFE_CALL(cudaMemcpyAsync(data_, v_data,
+ sizeof(Real)*num_rows_*num_cols_,
+ cudaMemcpyHostToDevice,
cudaStreamPerThread));
} else {
const Real *v_data = v.Data();
for (MatrixIndexT r = 0; r < num_rows_; r++) {
Real *row_data = RowData(r);
- CU_SAFE_CALL(cudaMemcpyAsync(row_data, v_data, sizeof(Real)*num_cols_,
- cudaMemcpyHostToDevice,
+ CU_SAFE_CALL(cudaMemcpyAsync(row_data, v_data, sizeof(Real)*num_cols_,
+ cudaMemcpyHostToDevice,
cudaStreamPerThread));
v_data += num_cols_;
}
@@ -2536,7 +2536,7 @@ void CuMatrixBase<Real>::PowAbs(const CuMatrixBase<Real> &src, Real power, bool
Mat().PowAbs(src.Mat(), power, include_sign);
}
}

template<typename Real>
void CuMatrixBase<Real>::ExpLimited(const CuMatrixBase<Real> &src, Real lower_limit, Real upper_limit) {
KALDI_ASSERT(SameDim(*this, src));
@@ -2624,14 +2624,14 @@ void VectorBase<Real>::CopyRowsFromMat(const CuMatrixBase<Real> &mat) {
if (CuDevice::Instantiate().Enabled()) {
CuTimer tim;
if (mat.Stride() == mat.NumCols()) {
- CU_SAFE_CALL(cudaMemcpyAsync(data_, mat.Data(), sizeof(Real)*dim_,
+ CU_SAFE_CALL(cudaMemcpyAsync(data_, mat.Data(), sizeof(Real)*dim_,
cudaMemcpyDeviceToHost, cudaStreamPerThread));
} else {
// we could definitely do better than the following.
Real* vec_data = data_;
for (MatrixIndexT r = 0; r < mat.NumRows(); r++) {
- CU_SAFE_CALL(cudaMemcpyAsync(vec_data, mat.RowData(r),
- sizeof(Real) * mat.NumCols(), cudaMemcpyDeviceToHost,
+ CU_SAFE_CALL(cudaMemcpyAsync(vec_data, mat.RowData(r),
+ sizeof(Real) * mat.NumCols(), cudaMemcpyDeviceToHost,
cudaStreamPerThread));
vec_data += mat.NumCols();
}
@@ -3317,7 +3317,7 @@ void CuMatrixBase<Real>::AddElements(Real alpha, const CuArrayBase<Int32Pair> &i
if (CuDevice::Instantiate().Enabled()) {
CuTimer tim;
CuVector<Real> tmp_vec(indexes.Dim(), kUndefined);
- CU_SAFE_CALL(cudaMemcpyAsync(tmp_vec.Data(), input,
+ CU_SAFE_CALL(cudaMemcpyAsync(tmp_vec.Data(), input,
indexes.Dim() * sizeof(Real),
cudaMemcpyHostToDevice, cudaStreamPerThread));

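The paired lines above appear to differ only in whitespace, apart from the Log -> kaldi::Log change, and the calls they touch all follow one CUDA idiom: issue a (possibly pitched) asynchronous memset or copy on the per-thread default stream, then synchronize that stream before the host reads the result. Below is a minimal standalone sketch of that idiom, not Kaldi code: CheckCuda() is a hypothetical stand-in for CU_SAFE_CALL, and the matrix dimensions and main() scaffolding are invented for illustration.

// Minimal sketch (assumptions noted above) of the per-thread-stream pattern.
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>
#include <vector>

static void CheckCuda(cudaError_t e) {   // hypothetical stand-in for CU_SAFE_CALL
  if (e != cudaSuccess) {
    std::fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(e));
    std::exit(1);
  }
}

int main() {
  const size_t rows = 4, cols = 3;

  // Pitched device allocation: each row is padded to 'pitch' bytes,
  // analogous to a CuMatrix whose stride exceeds its number of columns.
  void *d_raw = nullptr;
  size_t pitch = 0;
  CheckCuda(cudaMallocPitch(&d_raw, &pitch, cols * sizeof(float), rows));
  float *d_mat = static_cast<float *>(d_raw);

  // Zero the matrix row by row, respecting the pitch (cf. SetZero()).
  CheckCuda(cudaMemset2DAsync(d_mat, pitch, 0, cols * sizeof(float), rows,
                              cudaStreamPerThread));

  // Copy device -> host into a densely packed host matrix (stride == cols),
  // as CopyToMat() does when filling a MatrixBase.
  std::vector<float> h_mat(rows * cols);
  CheckCuda(cudaMemcpy2DAsync(h_mat.data(), cols * sizeof(float),  // dst, dst pitch
                              d_mat, pitch,                        // src, src pitch
                              cols * sizeof(float), rows,          // width in bytes, height
                              cudaMemcpyDeviceToHost, cudaStreamPerThread));

  // The host buffer is pageable, so synchronize the stream before reading it.
  CheckCuda(cudaStreamSynchronize(cudaStreamPerThread));
  std::printf("h_mat[0] = %f\n", h_mat[0]);

  CheckCuda(cudaFree(d_mat));
  return 0;
}

Using cudaStreamPerThread rather than the legacy default stream keeps copies issued from different host threads from serializing against each other, which is also why the wait is a per-stream cudaStreamSynchronize rather than a device-wide cudaDeviceSynchronize.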
2 changes: 1 addition & 1 deletion src/fstext/pre-determinize-inl.h
@@ -710,7 +710,7 @@ typename Arc::StateId CreateSuperFinal(MutableFst<Arc> *fst) {

StateId final_state = fst->AddState();
fst->SetFinal(final_state, Weight::One());
- for (size_t idx = 0;idx < final_states.size(); idx++) {
+ for (size_t idx = 0; idx < final_states.size(); idx++) {
StateId s = final_states[idx];
Weight weight = fst->Final(s);
fst->SetFinal(s, Weight::Zero());
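For context on the loop touched above: CreateSuperFinal redirects every final state of the FST through an epsilon arc to a single new final state, moving each old final weight onto that arc. The sketch below is a simplified, hypothetical restatement of that construction using only the public OpenFst MutableFst API; the name MakeSuperFinal is invented, and unlike Kaldi's CreateSuperFinal it unconditionally adds a new state and skips the special cases the real helper handles.

// Simplified sketch (not the Kaldi source) of a super-final construction.
#include <fst/fstlib.h>
#include <vector>

template <class Arc>
typename Arc::StateId MakeSuperFinal(fst::MutableFst<Arc> *fst) {
  typedef typename Arc::StateId StateId;
  typedef typename Arc::Weight Weight;

  // Collect the states that are currently final.
  std::vector<StateId> final_states;
  for (fst::StateIterator<fst::MutableFst<Arc> > siter(*fst);
       !siter.Done(); siter.Next()) {
    StateId s = siter.Value();
    if (fst->Final(s) != Weight::Zero()) final_states.push_back(s);
  }

  // New final state with weight One; each old final state loses its final
  // weight and instead gets an epsilon arc (label 0) carrying that weight.
  StateId final_state = fst->AddState();
  fst->SetFinal(final_state, Weight::One());
  for (size_t idx = 0; idx < final_states.size(); idx++) {
    StateId s = final_states[idx];
    Weight weight = fst->Final(s);
    fst->SetFinal(s, Weight::Zero());
    fst->AddArc(s, Arc(0, 0, weight, final_state));
  }
  return final_state;
}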
15 changes: 9 additions & 6 deletions src/lat/phone-align-lattice.cc
@@ -176,6 +176,8 @@ class LatticePhoneAligner {
// have returned false or we wouldn't have been called, so we have to
// force it out.
CompactLatticeArc lat_arc;
+ // Note: the next call will change the computation-state of the tuple,
+ // so it becomes a different tuple.
tuple.comp_state.OutputArcForce(tmodel_, opts_, &lat_arc, &error_);
lat_arc.nextstate = GetStateForTuple(tuple, true); // true == add to queue.
// The final-prob stuff will get called again from ProcessQueueElement().
@@ -201,12 +203,13 @@
// epsilon-sequencing rules encoded by the filters in
// composition.
CompactLatticeArc lat_arc;
- Tuple tuple2(tuple); // temp
if (tuple.comp_state.OutputPhoneArc(tmodel_, opts_, &lat_arc, &error_) ||
tuple.comp_state.OutputWordArc(tmodel_, opts_, &lat_arc, &error_)) {
- // note: this function changes the tuple (when it returns true).
- lat_arc.nextstate = GetStateForTuple(tuple, true); // true == add to queue,
- // if not already present.
+ // note: the functions OutputPhoneArc() and OutputWordArc() change the
+ // tuple (when they return true).
+ lat_arc.nextstate = GetStateForTuple(tuple, true); // true == add to
+ // queue, if not
+ // already present.
KALDI_ASSERT(output_state != lat_arc.nextstate);
lat_out_->AddArc(output_state, lat_arc);
} else {
@@ -220,7 +223,7 @@
// ... since we did CreateSuperFinal.
ProcessFinal(tuple, output_state);
}
- // Now process the arcs. Note: final-state shouldn't have any arcs.
+ // Now process the arcs. Note: final-states shouldn't have any arcs.
for(fst::ArcIterator<CompactLattice> aiter(lat_, tuple.input_state);
!aiter.Done(); aiter.Next()) {
const CompactLatticeArc &arc = aiter.Value();
@@ -369,7 +372,7 @@ void LatticePhoneAligner::ComputationState::OutputArcForce(
// although it might not be obvious from superficially checking
// the code. IsEmpty() would be true if we had transition_ids_.empty()
// and opts.replace_output_symbols, so we would already die by assertion;
- // in fact, this function would neve be called.
+ // in fact, this function would never be called.

if (!transition_ids_.empty()) { // Do some checking here.
int32 tid = transition_ids_[0];
4 changes: 2 additions & 2 deletions src/nnet3bin/nnet3-latgen-faster-batch.cc
@@ -73,7 +73,7 @@ int main(int argc, char *argv[]) {
const char *usage =
"Generate lattices using nnet3 neural net model. This version is optimized\n"
"for GPU-based inference.\n"
- "Usage: nnet3-latgen-faster-parallel [options] <nnet-in> <fst-in> <features-rspecifier>"
+ "Usage: nnet3-latgen-faster-batch [options] <nnet-in> <fst-in> <features-rspecifier>"
" <lattice-wspecifier>\n";
ParseOptions po(usage);

@@ -111,7 +111,7 @@ int main(int argc, char *argv[]) {
#if HAVE_CUDA==1
CuDevice::RegisterDeviceOptions(&po);
#endif

po.Read(argc, argv);

if (po.NumArgs() != 4) {