38 changes: 19 additions & 19 deletions src/cudamatrix/cu-matrix.cc
@@ -428,7 +428,7 @@ void CuMatrixBase<Real>:: CopyRangeFromMatClamped(const CuMatrixBase<Real> & src
cuda_mat_copy_range_clamped(start_range, end_range, NumCols(),
src.Data(), src.Stride(), clamp_low, clamp_high,
Data(), Stride());
- } else
+ } else
#endif
{
for (int32 t = start_range; t < end_range; t++) {
@@ -459,8 +459,8 @@ void CuMatrixBase<Real>::CopyToMat(MatrixBase<OtherReal> *dst,
MatrixIndexT src_pitch = stride_*sizeof(Real);
MatrixIndexT dst_pitch = dst->Stride()*sizeof(Real);
MatrixIndexT width = NumCols()*sizeof(Real);
- CU_SAFE_CALL(cudaMemcpy2DAsync(dst->Data(), dst_pitch, this->data_,
- src_pitch, width, this->num_rows_,
+ CU_SAFE_CALL(cudaMemcpy2DAsync(dst->Data(), dst_pitch, this->data_,
+ src_pitch, width, this->num_rows_,
cudaMemcpyDeviceToHost, cudaStreamPerThread));
CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
CuDevice::Instantiate().AccuProfile("CuMatrix::CopyToMatD2H", tim);
@@ -511,7 +511,7 @@ void CuMatrixBase<Real>::SetZero() {
if (CuDevice::Instantiate().Enabled()) {
CuTimer tim;
CU_SAFE_CALL(cudaMemset2DAsync(data_, stride_ * sizeof(Real), 0,
- num_cols_ * sizeof(Real), num_rows_ ,
+ num_cols_ * sizeof(Real), num_rows_ ,
cudaStreamPerThread));
CuDevice::Instantiate().AccuProfile("CuMatrix::SetZero", tim);
} else
@@ -1679,9 +1679,9 @@ void CuMatrix<Real>::CompObjfAndDeriv(const std::vector<MatrixElement<Real> >& s
return;
}
void *addr = CuDevice::Instantiate().Malloc(sv_labels.size() * sizeof(MatrixElement<Real>));
- CU_SAFE_CALL(cudaMemcpyAsync(addr, sv_labels.data(), sv_labels.size() *
- sizeof(MatrixElement<Real>),
- cudaMemcpyHostToDevice,
+ CU_SAFE_CALL(cudaMemcpyAsync(addr, sv_labels.data(), sv_labels.size() *
+ sizeof(MatrixElement<Real>),
+ cudaMemcpyHostToDevice,
cudaStreamPerThread));
CuTimer tim;
CuVector<Real> tmp(2, kUndefined);
@@ -1706,7 +1706,7 @@ void CuMatrix<Real>::CompObjfAndDeriv(const std::vector<MatrixElement<Real> >& s
//KALDI_ASSERT(label >= 0 && label < nnet_.OutputDim());
Real this_prob = output(m, label);
KALDI_ASSERT(this_prob >= 0.99e-20); // we floored to 1.0e-20 in SoftmaxLayer.
- *tot_objf += weight * Log(this_prob);
+ *tot_objf += weight * kaldi::Log(this_prob);
*tot_weight += weight;
(*this)(m, label) += weight / this_prob;
}
@@ -2258,7 +2258,7 @@ void AddMatMatBatched(const Real alpha, std::vector<CuSubMatrix<Real>* > &C,
host_c_array[i] = C[i]->data_;
}

- CU_SAFE_CALL(cudaMemcpyAsync(device_abc_array, host_abc_array,
+ CU_SAFE_CALL(cudaMemcpyAsync(device_abc_array, host_abc_array,
3*size*sizeof(Real*), cudaMemcpyHostToDevice,
cudaStreamPerThread));

@@ -2340,16 +2340,16 @@ void CuMatrixBase<Real>::CopyRowsFromVec(const VectorBase<Real> &v) {
if (v.Dim() == num_rows_*num_cols_) {
if (stride_ == num_cols_) {
const Real* v_data = v.Data();
- CU_SAFE_CALL(cudaMemcpyAsync(data_, v_data,
- sizeof(Real)*num_rows_*num_cols_,
- cudaMemcpyHostToDevice,
+ CU_SAFE_CALL(cudaMemcpyAsync(data_, v_data,
+ sizeof(Real)*num_rows_*num_cols_,
+ cudaMemcpyHostToDevice,
cudaStreamPerThread));
} else {
const Real *v_data = v.Data();
for (MatrixIndexT r = 0; r < num_rows_; r++) {
Real *row_data = RowData(r);
- CU_SAFE_CALL(cudaMemcpyAsync(row_data, v_data, sizeof(Real)*num_cols_,
- cudaMemcpyHostToDevice,
+ CU_SAFE_CALL(cudaMemcpyAsync(row_data, v_data, sizeof(Real)*num_cols_,
+ cudaMemcpyHostToDevice,
cudaStreamPerThread));
v_data += num_cols_;
}
@@ -2536,7 +2536,7 @@ void CuMatrixBase<Real>::PowAbs(const CuMatrixBase<Real> &src, Real power, bool
Mat().PowAbs(src.Mat(), power, include_sign);
}
}

template<typename Real>
void CuMatrixBase<Real>::ExpLimited(const CuMatrixBase<Real> &src, Real lower_limit, Real upper_limit) {
KALDI_ASSERT(SameDim(*this, src));
@@ -2624,14 +2624,14 @@ void VectorBase<Real>::CopyRowsFromMat(const CuMatrixBase<Real> &mat) {
if (CuDevice::Instantiate().Enabled()) {
CuTimer tim;
if (mat.Stride() == mat.NumCols()) {
- CU_SAFE_CALL(cudaMemcpyAsync(data_, mat.Data(), sizeof(Real)*dim_,
+ CU_SAFE_CALL(cudaMemcpyAsync(data_, mat.Data(), sizeof(Real)*dim_,
cudaMemcpyDeviceToHost, cudaStreamPerThread));
} else {
// we could definitely do better than the following.
Real* vec_data = data_;
for (MatrixIndexT r = 0; r < mat.NumRows(); r++) {
- CU_SAFE_CALL(cudaMemcpyAsync(vec_data, mat.RowData(r),
- sizeof(Real) * mat.NumCols(), cudaMemcpyDeviceToHost,
+ CU_SAFE_CALL(cudaMemcpyAsync(vec_data, mat.RowData(r),
+ sizeof(Real) * mat.NumCols(), cudaMemcpyDeviceToHost,
cudaStreamPerThread));
vec_data += mat.NumCols();
}
@@ -3317,7 +3317,7 @@ void CuMatrixBase<Real>::AddElements(Real alpha, const CuArrayBase<Int32Pair> &i
if (CuDevice::Instantiate().Enabled()) {
CuTimer tim;
CuVector<Real> tmp_vec(indexes.Dim(), kUndefined);
- CU_SAFE_CALL(cudaMemcpyAsync(tmp_vec.Data(), input,
+ CU_SAFE_CALL(cudaMemcpyAsync(tmp_vec.Data(), input,
indexes.Dim() * sizeof(Real),
cudaMemcpyHostToDevice, cudaStreamPerThread));

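The paired lines above appear to differ only in whitespace, apart from the Log -> kaldi::Log change, and the calls they touch all follow one CUDA idiom: issue a (possibly pitched) asynchronous memset or copy on the per-thread default stream, then synchronize that stream before the host reads the result. Below is a minimal standalone sketch of that idiom, not Kaldi code: CheckCuda() is a hypothetical stand-in for CU_SAFE_CALL, and the matrix dimensions and main() scaffolding are invented for illustration.

// Minimal sketch (assumptions noted above) of the per-thread-stream pattern.
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>
#include <vector>

static void CheckCuda(cudaError_t e) {   // hypothetical stand-in for CU_SAFE_CALL
  if (e != cudaSuccess) {
    std::fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(e));
    std::exit(1);
  }
}

int main() {
  const size_t rows = 4, cols = 3;

  // Pitched device allocation: each row is padded to 'pitch' bytes,
  // analogous to a CuMatrix whose stride exceeds its number of columns.
  void *d_raw = nullptr;
  size_t pitch = 0;
  CheckCuda(cudaMallocPitch(&d_raw, &pitch, cols * sizeof(float), rows));
  float *d_mat = static_cast<float *>(d_raw);

  // Zero the matrix row by row, respecting the pitch (cf. SetZero()).
  CheckCuda(cudaMemset2DAsync(d_mat, pitch, 0, cols * sizeof(float), rows,
                              cudaStreamPerThread));

  // Copy device -> host into a densely packed host matrix (stride == cols),
  // as CopyToMat() does when filling a MatrixBase.
  std::vector<float> h_mat(rows * cols);
  CheckCuda(cudaMemcpy2DAsync(h_mat.data(), cols * sizeof(float),  // dst, dst pitch
                              d_mat, pitch,                        // src, src pitch
                              cols * sizeof(float), rows,          // width in bytes, height
                              cudaMemcpyDeviceToHost, cudaStreamPerThread));

  // The host buffer is pageable, so synchronize the stream before reading it.
  CheckCuda(cudaStreamSynchronize(cudaStreamPerThread));
  std::printf("h_mat[0] = %f\n", h_mat[0]);

  CheckCuda(cudaFree(d_mat));
  return 0;
}

Using cudaStreamPerThread rather than the legacy default stream keeps copies issued from different host threads from serializing against each other, which is also why the wait is a per-stream cudaStreamSynchronize rather than a device-wide cudaDeviceSynchronize.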
2 changes: 1 addition & 1 deletion src/fstext/pre-determinize-inl.h
@@ -710,7 +710,7 @@ typename Arc::StateId CreateSuperFinal(MutableFst<Arc> *fst) {

StateId final_state = fst->AddState();
fst->SetFinal(final_state, Weight::One());
- for (size_t idx = 0;idx < final_states.size(); idx++) {
+ for (size_t idx = 0; idx < final_states.size(); idx++) {
StateId s = final_states[idx];
Weight weight = fst->Final(s);
fst->SetFinal(s, Weight::Zero());
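For context on the loop touched above: CreateSuperFinal redirects every final state of the FST through an epsilon arc to a single new final state, moving each old final weight onto that arc. The sketch below is a simplified, hypothetical restatement of that construction using only the public OpenFst MutableFst API; the name MakeSuperFinal is invented, and unlike Kaldi's CreateSuperFinal it unconditionally adds a new state and skips the special cases the real helper handles.

// Simplified sketch (not the Kaldi source) of a super-final construction.
#include <fst/fstlib.h>
#include <vector>

template <class Arc>
typename Arc::StateId MakeSuperFinal(fst::MutableFst<Arc> *fst) {
  typedef typename Arc::StateId StateId;
  typedef typename Arc::Weight Weight;

  // Collect the states that are currently final.
  std::vector<StateId> final_states;
  for (fst::StateIterator<fst::MutableFst<Arc> > siter(*fst);
       !siter.Done(); siter.Next()) {
    StateId s = siter.Value();
    if (fst->Final(s) != Weight::Zero()) final_states.push_back(s);
  }

  // New final state with weight One; each old final state loses its final
  // weight and instead gets an epsilon arc (label 0) carrying that weight.
  StateId final_state = fst->AddState();
  fst->SetFinal(final_state, Weight::One());
  for (size_t idx = 0; idx < final_states.size(); idx++) {
    StateId s = final_states[idx];
    Weight weight = fst->Final(s);
    fst->SetFinal(s, Weight::Zero());
    fst->AddArc(s, Arc(0, 0, weight, final_state));
  }
  return final_state;
}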
15 changes: 9 additions & 6 deletions src/lat/phone-align-lattice.cc
@@ -176,6 +176,8 @@ class LatticePhoneAligner {
// have returned false or we wouldn't have been called, so we have to
// force it out.
CompactLatticeArc lat_arc;
+ // Note: the next call will change the computation-state of the tuple,
+ // so it becomes a different tuple.
tuple.comp_state.OutputArcForce(tmodel_, opts_, &lat_arc, &error_);
lat_arc.nextstate = GetStateForTuple(tuple, true); // true == add to queue.
// The final-prob stuff will get called again from ProcessQueueElement().
@@ -201,12 +203,13 @@
// epsilon-sequencing rules encoded by the filters in
// composition.
CompactLatticeArc lat_arc;
- Tuple tuple2(tuple); // temp
if (tuple.comp_state.OutputPhoneArc(tmodel_, opts_, &lat_arc, &error_) ||
tuple.comp_state.OutputWordArc(tmodel_, opts_, &lat_arc, &error_)) {
- // note: this function changes the tuple (when it returns true).
- lat_arc.nextstate = GetStateForTuple(tuple, true); // true == add to queue,
- // if not already present.
+ // note: the functions OutputPhoneArc() and OutputWordArc() change the
+ // tuple (when they return true).
+ lat_arc.nextstate = GetStateForTuple(tuple, true); // true == add to
+ // queue, if not
+ // already present.
KALDI_ASSERT(output_state != lat_arc.nextstate);
lat_out_->AddArc(output_state, lat_arc);
} else {
@@ -220,7 +223,7 @@
// ... since we did CreateSuperFinal.
ProcessFinal(tuple, output_state);
}
- // Now process the arcs. Note: final-state shouldn't have any arcs.
+ // Now process the arcs. Note: final-states shouldn't have any arcs.
for(fst::ArcIterator<CompactLattice> aiter(lat_, tuple.input_state);
!aiter.Done(); aiter.Next()) {
const CompactLatticeArc &arc = aiter.Value();
@@ -369,7 +372,7 @@ void LatticePhoneAligner::ComputationState::OutputArcForce(
// although it might not be obvious from superficially checking
// the code. IsEmpty() would be true if we had transition_ids_.empty()
// and opts.replace_output_symbols, so we would already die by assertion;
- // in fact, this function would neve be called.
+ // in fact, this function would never be called.

if (!transition_ids_.empty()) { // Do some checking here.
int32 tid = transition_ids_[0];
4 changes: 2 additions & 2 deletions src/nnet3bin/nnet3-latgen-faster-batch.cc
@@ -73,7 +73,7 @@ int main(int argc, char *argv[]) {
const char *usage =
"Generate lattices using nnet3 neural net model. This version is optimized\n"
"for GPU-based inference.\n"
- "Usage: nnet3-latgen-faster-parallel [options] <nnet-in> <fst-in> <features-rspecifier>"
+ "Usage: nnet3-latgen-faster-batch [options] <nnet-in> <fst-in> <features-rspecifier>"
" <lattice-wspecifier>\n";
ParseOptions po(usage);

@@ -111,7 +111,7 @@ int main(int argc, char *argv[]) {
#if HAVE_CUDA==1
CuDevice::RegisterDeviceOptions(&po);
#endif

po.Read(argc, argv);

if (po.NumArgs() != 4) {