[not-for-merge] Thread compiler xvector example and few optimizations #2303

Closed · wants to merge 6 commits
40 changes: 40 additions & 0 deletions egs/sre16/v2/thread-test.sh
@@ -0,0 +1,40 @@
# Simple script comparing standard and parallel xvector extractors

# conventional xvector computation with master kaldi

time ~/src/kaldi/kaldi-master/src/nnet3bin/nnet3-xvector-compute --min-chunk-size=25 --chunk-size=10000 '~/src/kaldi/kaldi-master/src/nnet3bin/nnet3-copy --nnet-config=0003_sre16_v2_1a/exp/xvector_nnet_1a//extract.config 0003_sre16_v2_1a/exp/xvector_nnet_1a/final.raw - |' 'ark:apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 scp:dev/split2/1/feats_50.scp ark:- | select-voiced-frames ark:- scp,s,cs:dev/split2/1/vad.scp ark:- |' ark,t:xvec_conv.txt

# real 1m35.620s
# user 1m35.406s
# sys 0m0.194s

# parallel single-thread

time ~/src/kaldi/kaldi-my/src/nnet3bin/nnet3-xvector-compute-parallel --min-chunk-size=25 --chunk-size=10000 'nnet3-copy --nnet-config=0003_sre16_v2_1a/exp/xvector_nnet_1a//extract.config 0003_sre16_v2_1a/exp/xvector_nnet_1a/final.raw - |' 'ark:apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 scp:dev/split2/1/feats_50.scp ark:- | select-voiced-frames ark:- scp,s,cs:dev/split2/1/vad.scp ark:- |' ark,t:xvec_para_1t.txt

# real 1m34.160s
# user 1m33.855s
# sys 0m0.291s

# parallel 4 threads
time ~/src/kaldi/kaldi-my/src/nnet3bin/nnet3-xvector-compute-parallel --num-threads=4 --min-chunk-size=25 --chunk-size=10000 'nnet3-copy --nnet-config=0003_sre16_v2_1a/exp/xvector_nnet_1a//extract.config 0003_sre16_v2_1a/exp/xvector_nnet_1a/final.raw - |' 'ark:apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 scp:dev/split2/1/feats_50.scp ark:- | select-voiced-frames ark:- scp,s,cs:dev/split2/1/vad.scp ark:- |' ark,t:xvec_para_4t.txt

# real 0m40.513s
# user 2m13.843s
# sys 0m0.620s


# parallel 10 threads
time ~/src/kaldi/kaldi-my/src/nnet3bin/nnet3-xvector-compute-parallel --num-threads=10 --min-chunk-size=25 --chunk-size=10000 'nnet3-copy --nnet-config=0003_sre16_v2_1a/exp/xvector_nnet_1a//extract.config 0003_sre16_v2_1a/exp/xvector_nnet_1a/final.raw - |' 'ark:apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 scp:dev/split2/1/feats_50.scp ark:- | select-voiced-frames ark:- scp,s,cs:dev/split2/1/vad.scp ark:- |' ark,t:xvec_para_10t.txt

# real 0m42.263s
# user 2m17.649s
# sys 0m1.136s

# parallel 10 threads with limited cache capacity 3
time ~/src/kaldi/kaldi-my/src/nnet3bin/nnet3-xvector-compute-parallel --cache-capacity=3 --num-threads=10 --min-chunk-size=25 --chunk-size=10000 'nnet3-copy --nnet-config=0003_sre16_v2_1a/exp/xvector_nnet_1a/extract.config 0003_sre16_v2_1a/exp/xvector_nnet_1a//final.raw - |' 'ark:apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 scp:dev/split2/1/feats_50.scp ark:- | select-voiced-frames ark:- scp,s,cs:dev/split2/1/vad.scp ark:- |' ark,t:xvec_para_10t_c3.txt

# real 0m43.296s
# user 2m16.898s
# sys 0m1.451s
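
# Summary of the wall-clock times reported above:
#
#   nnet3-xvector-compute (master)                                  1m35.6s
#   nnet3-xvector-compute-parallel, 1 thread                        1m34.2s
#   nnet3-xvector-compute-parallel, 4 threads                       0m40.5s
#   nnet3-xvector-compute-parallel, 10 threads                      0m42.3s
#   nnet3-xvector-compute-parallel, 10 threads, --cache-capacity=3  0m43.3s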

2 changes: 1 addition & 1 deletion src/nnet3/Makefile
@@ -31,7 +31,7 @@ OBJFILES = nnet-common.o nnet-compile.o nnet-component-itf.o \
           nnet-compile-looped.o decodable-simple-looped.o \
           decodable-online-looped.o convolution.o \
           nnet-convolutional-component.o attention.o \
-          nnet-attention-component.o
+          nnet-attention-component.o nnet-xvector-threaded.o


LIBNAME = kaldi-nnet3
9 changes: 4 additions & 5 deletions src/nnet3/nnet-optimize.h
@@ -232,11 +232,10 @@ class CachingOptimizingCompiler {

  /// Does the compilation and returns a const pointer to the result, which is
  /// owned by this class, not the caller. It calls ComputeCudaIndexes() for
-  /// you, because you wouldn't be able to do this on a const object.
-  ///
-  /// Note: this used to return 'const NnetComputation*'. If you get a
-  /// compilation failure, just replace 'const NnetComputation*' with
-  /// 'std::shared_ptr<const NnetComputation>' in the calling code.
+  /// you, because you wouldn't be able to do this on a const object. If you
+  /// want to preserve thread safety, keep the result alive as a
+  /// std::shared_ptr<const NnetComputation> for as long as you use it;
+  /// otherwise you can just use the raw pointer obtained via .get().
  std::shared_ptr<const NnetComputation> Compile(
      const ComputationRequest &request);
  void ReadCache(std::istream &is, bool binary);
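For reference, a minimal sketch of the usage pattern this comment describes; it mirrors RunNnetComputation() in nnet-xvector-threaded.cc below, and assumes `request`, `nnet` and `compiler` have already been set up by the caller:

    // Keep the shared_ptr alive for as long as the computation is in use, so
    // that a cache update in another thread cannot free it underneath us.
    std::shared_ptr<const NnetComputation> computation = compiler->Compile(request);
    NnetComputer computer(NnetComputeOptions(), *computation, nnet, NULL);
    // ... AcceptInput(), Run(), GetOutputDestructive() as usual; only drop the
    // shared_ptr once the computer is no longer needed.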
130 changes: 130 additions & 0 deletions src/nnet3/nnet-xvector-threaded.cc
@@ -0,0 +1,130 @@
// nnet3/nnet-xvector-threaded.cc

// Copyright 2017 Johns Hopkins University (author: Daniel Povey)
// 2017 Johns Hopkins University (author: Daniel Garcia-Romero)
// 2017 David Snyder
// 2018 Behavox Limited (author: Arseniy Gorin)

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.


#include "nnet3/nnet-xvector-threaded.h"

namespace kaldi {
namespace nnet3 {

XVectorExtractorParallelClass::XVectorExtractorParallelClass(
    const NnetSimpleComputationOptions &opts,
    const Nnet &nnet,
    CachingOptimizingCompiler *compiler,
    std::string utt,
    const int chunk_size,
    const int min_chunk_size,
    const Matrix<BaseFloat> &feats,
    BaseFloatVectorWriter *xvector_writer):
    opts_(opts),
    nnet_(&nnet),
    compiler_(*compiler),
    utt_(utt),
    chunk_size_(chunk_size),
    min_chunk_size_(min_chunk_size),
    feats_(feats),
    xvector_writer_(xvector_writer) {
  tot_weight_ = 0.0;
  xvector_avg_.Resize(nnet_->OutputDim("output"), kSetZero);
}

void XVectorExtractorParallelClass::operator () () {
  int32 num_rows = feats_.NumRows(),
      feat_dim = feats_.NumCols(),
      this_chunk_size = chunk_size_;

  if (num_rows < min_chunk_size_) {
    KALDI_WARN << "Minimum chunk size of " << min_chunk_size_
               << " is greater than the number of rows "
               << "in utterance: " << utt_;
    // The caller is expected to have filtered out such utterances; in this
    // case no chunk is processed and the destructor will not write an xvector.
    // TODO: exit more gracefully.
  } else if (num_rows < chunk_size_) {
    // KALDI_LOG << "Chunk size of " << chunk_size_ << " is greater than "
    //           << "the number of rows in utterance: " << utt_
    //           << ", using chunk size of " << num_rows;
    this_chunk_size = num_rows;
  } else if (chunk_size_ == -1) {
    this_chunk_size = num_rows;
  }

  int32 num_chunks = ceil(
      num_rows / static_cast<BaseFloat>(this_chunk_size));

  // Iterate over the feature chunks.
  for (int32 chunk_indx = 0; chunk_indx < num_chunks; chunk_indx++) {
    // 'offset' is the number of frames in this chunk; the last chunk may be
    // shorter than this_chunk_size, and if it is shorter than min_chunk_size
    // it is skipped entirely.
    int32 offset = std::min(
        this_chunk_size, num_rows - chunk_indx * this_chunk_size);
    if (offset < min_chunk_size_)
      continue;
    SubMatrix<BaseFloat> sub_features(
        feats_, chunk_indx * this_chunk_size, offset, 0, feat_dim);
    Vector<BaseFloat> xvector;
    tot_weight_ += offset;

    RunNnetComputation(sub_features, *nnet_, &compiler_, &xvector);

    xvector_avg_.AddVec(offset, xvector);
  }
}


XVectorExtractorParallelClass::~XVectorExtractorParallelClass () {
  if (tot_weight_ > 0.0) {  // avoid dividing by zero if no chunk was processed.
    xvector_avg_.Scale(1.0 / tot_weight_);
    xvector_writer_->Write(utt_, xvector_avg_);
  }
}


void XVectorExtractorParallelClass::RunNnetComputation(const MatrixBase<BaseFloat> &features,
    const Nnet &nnet, CachingOptimizingCompiler *compiler,
    Vector<BaseFloat> *xvector) {
  ComputationRequest request;
  request.need_model_derivative = false;
  request.store_component_stats = false;
  request.inputs.push_back(
      IoSpecification("input", 0, features.NumRows()));
  IoSpecification output_spec;
  output_spec.name = "output";
  output_spec.has_deriv = false;
  output_spec.indexes.resize(1);
  request.outputs.resize(1);
  request.outputs[0].Swap(&output_spec);
  std::shared_ptr<const NnetComputation> computation = compiler->Compile(request);
  Nnet *nnet_to_update = NULL;  // we're not doing any update.
  NnetComputer computer(NnetComputeOptions(), *computation,
                        nnet, nnet_to_update);
  CuMatrix<BaseFloat> input_feats_cu(features);
  computer.AcceptInput("input", &input_feats_cu);
  computer.Run();
  CuMatrix<BaseFloat> cu_output;
  computer.GetOutputDestructive("output", &cu_output);
  xvector->Resize(cu_output.NumCols());
  xvector->CopyFromVec(cu_output.Row(0));
}
}  // namespace nnet3
}  // namespace kaldi
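As a worked example of the chunking arithmetic in operator() above, assume chunk_size = 10000 and min_chunk_size = 25 (the settings used in thread-test.sh) and a hypothetical utterance of 25000 frames:

    this_chunk_size = 10000
    num_chunks      = ceil(25000 / 10000.0) = 3
    chunk lengths   = 10000, 10000, 5000   (all >= 25, so none is skipped)
    tot_weight_     = 25000
    xvector_avg_    = (10000*x1 + 10000*x2 + 5000*x3) / 25000

where x1, x2, x3 are the per-chunk xvectors returned by RunNnetComputation().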
83 changes: 83 additions & 0 deletions src/nnet3/nnet-xvector-threaded.h
@@ -0,0 +1,83 @@
// nnet3/nnet-xvector-threaded.h

// Copyright 2017 Johns Hopkins University (author: Daniel Povey)
// 2017 Johns Hopkins University (author: Daniel Garcia-Romero)
// 2017 David Snyder
// 2018 Behavox Limited (author: Arseniy Gorin)

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.


#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "nnet3/nnet-am-decodable-simple.h"
#include "base/timer.h"
#include "nnet3/nnet-utils.h"

namespace kaldi {
namespace nnet3 {

class XVectorExtractorParallelClass {

  /*
     This class is intended for multi-threaded xvector extraction.
     The compiler is passed in as a pointer so that a single
     CachingOptimizingCompiler (and its computation cache) can be shared
     by all threads.

     IMPORTANT NOTE:

     CachingOptimizingCompiler is not thread safe with respect to graph
     compilation.  To use this class from multiple threads without run-time
     errors, the caller must make sure the computation cache is pre-compiled
     for the requests the threads will make, before the threads are started.
  */

 public:
  XVectorExtractorParallelClass(
      const NnetSimpleComputationOptions &opts,
      const Nnet &nnet,
      CachingOptimizingCompiler *compiler,
      std::string utt,
      const int chunk_size,
      const int min_chunk_size,
      const Matrix<BaseFloat> &feats,
      BaseFloatVectorWriter *xvector_writer);

  void operator () ();

  ~XVectorExtractorParallelClass ();

 private:
  void DeletePointers();
  KALDI_DISALLOW_COPY_AND_ASSIGN(XVectorExtractorParallelClass);

  static void RunNnetComputation(const MatrixBase<BaseFloat> &features,
      const Nnet &nnet, CachingOptimizingCompiler *compiler,
      Vector<BaseFloat> *xvector);

  const NnetSimpleComputationOptions opts_;
  const Nnet *nnet_;
  CachingOptimizingCompiler &compiler_;
  std::string utt_;
  int chunk_size_;
  int min_chunk_size_;
  Matrix<BaseFloat> feats_;
  BaseFloatVectorWriter *xvector_writer_;

  BaseFloat tot_weight_;
  Vector<BaseFloat> xvector_avg_;  // weighted sum of per-chunk xvectors; resized
                                   // in the constructor to nnet_->OutputDim("output").
};

}  // namespace nnet3
}  // namespace kaldi

#endif  // KALDI_NNET3_NNET_XVECTOR_THREADED_H_
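For context, here is a sketch of how this class might be driven by a multi-threaded binary such as nnet3-xvector-compute-parallel (whose source is not part of this diff). TaskSequencer comes from util/kaldi-thread.h; the variable names (opts, nnet, compiler, feature_reader, vector_writer, chunk_size, min_chunk_size) and the cache warm-up strategy are illustrative assumptions, not code from this PR:

    TaskSequencerConfig sequencer_config;
    sequencer_config.num_threads = 4;  // e.g. matching --num-threads=4 above
    TaskSequencer<XVectorExtractorParallelClass> sequencer(sequencer_config);

    // Per the IMPORTANT NOTE above, warm up the compiler's computation cache
    // on the main thread (e.g. compile once for each chunk length that will
    // be requested) before any worker thread calls Compile() concurrently.

    for (; !feature_reader.Done(); feature_reader.Next()) {
      std::string utt = feature_reader.Key();
      const Matrix<BaseFloat> &feats = feature_reader.Value();
      // The sequencer takes ownership of the task: operator()() runs in a
      // worker thread, and the destructors (which write the xvectors) are
      // called sequentially in submission order.
      sequencer.Run(new XVectorExtractorParallelClass(
          opts, nnet, &compiler, utt, chunk_size, min_chunk_size,
          feats, &vector_writer));
    }
    sequencer.Wait();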
3 changes: 2 additions & 1 deletion src/nnet3bin/Makefile
@@ -18,7 +18,8 @@ BINFILES = nnet3-init nnet3-info nnet3-get-egs nnet3-copy-egs nnet3-subset-egs \
           nnet3-discriminative-compute-objf nnet3-discriminative-train \
           nnet3-discriminative-subset-egs nnet3-get-egs-simple \
           nnet3-discriminative-compute-from-egs nnet3-latgen-faster-looped \
-          nnet3-egs-augment-image nnet3-xvector-get-egs nnet3-xvector-compute
+          nnet3-egs-augment-image nnet3-xvector-get-egs nnet3-xvector-compute \
+          nnet3-xvector-compute-parallel

OBJFILES =
