Skip to content

Commit

Permalink
Merged with 'master' (#3156)
Browse files Browse the repository at this point in the history
* [scripts] Add fix regarding num-jobs for segment_long_utterances*.sh(#3130)

* [src] Enable allow_{upsample,downsample} with online features (#3139)

* [src] Fix bad assert in fstmakecontextsyms (#3142)

* [src] Fix to "Fixes to grammar-fst & LM-disambig symbols" (#3000) (#3143)

* [build] Make sure PaUtils exported from portaudio (#3144)

* [src] cudamatrix: fixing a synchronization bug in 'normalize-per-row' (#3145)

was only apparent using large matrices

* [src] Fix typo in comment (#3147)

* [src] Add binary that functions as a TCP server (#2938)

* [scripts] Fix bug in comment (#3152)
  • Loading branch information
desh2608 authored and danpovey committed Mar 21, 2019
1 parent c4a326e commit 4cab3db
Show file tree
Hide file tree
Showing 24 changed files with 771 additions and 64 deletions.
9 changes: 8 additions & 1 deletion egs/wsj/s5/steps/cleanup/segment_long_utterances.sh
Original file line number Diff line number Diff line change
Expand Up @@ -174,10 +174,17 @@ if [ $stage -le 3 ]; then
cp $srcdir/phones.txt $dir 2>/dev/null || true

mkdir -p $graph_dir

n_reco=$(cat $text | wc -l) || exit 1
nj_reco=$nj

if [ $nj -gt $n_reco ]; then
nj_reco=$n_reco
fi

# Make graphs w.r.t. the original text (usually recording-level)
steps/cleanup/make_biased_lm_graphs.sh $graph_opts \
--nj $nj --cmd "$cmd" $text \
--nj $nj_reco --cmd "$cmd" $text \
$lang $dir $dir/graphs
if [ -z "$utt2text" ]; then
# and then copy it to the sub-segments.
Expand Down
9 changes: 8 additions & 1 deletion egs/wsj/s5/steps/cleanup/segment_long_utterances_nnet3.sh
Original file line number Diff line number Diff line change
Expand Up @@ -235,10 +235,17 @@ if [ $stage -le 3 ]; then

mkdir -p $graph_dir

n_reco=$(cat $text | wc -l) || exit 1
nj_reco=$nj

if [ $nj -gt $n_reco ]; then
nj_reco=$n_reco
fi

# Make graphs w.r.t. the original text (usually recording-level)
steps/cleanup/make_biased_lm_graphs.sh $graph_opts \
--scale-opts "$scale_opts" \
--nj $nj --cmd "$cmd" $text \
--nj $nj_reco --cmd "$cmd" $text \
$lang $dir $dir/graphs
if [ -z "$utt2text" ]; then
# and then copy it to the sub-segments.
Expand Down
2 changes: 1 addition & 1 deletion egs/wsj/s5/utils/parse_options.sh
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ done


###
### No we process the command line options
### Now we process the command line options
###
while true; do
[ -z "${1:-}" ] && break; # break if there are no arguments
Expand Down
19 changes: 11 additions & 8 deletions src/cudamatrix/cu-kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2552,9 +2552,12 @@ static void _normalize_per_row(Real *y, int y_stride, const Real *x,
const int i = blockIdx.x;
const int tid = threadIdx.x;
const Real* x_row = x + i * x_d.stride;

typedef cub::BlockReduce<Real, CU1DBLOCK> BlockReduceT;
__shared__ typename BlockReduceT::TempStorage temp_storage;
__shared__ Real ssum[CU1DBLOCK];

__shared__ Real stddev_div_target_rms;
__shared__ Real scale;

// Reduce x_j^2 to CU1DBLOCK elements per row
Real tsum = Real(0);
Expand All @@ -2563,14 +2566,14 @@ static void _normalize_per_row(Real *y, int y_stride, const Real *x,
}
tsum = BlockReduceT(temp_storage).Sum(tsum);
__syncthreads();


const Real kSquaredNormFloor = 1.3552527156068805425e-20; // 2^-66
ssum[tid] = sqrt(
fmax(tsum / (target_rms * target_rms * x_d.cols), kSquaredNormFloor));

const Real stddev_div_target_rms = ssum[0];
const Real scale = Real(1) / stddev_div_target_rms;
if (tid == 0) {
const Real kSquaredNormFloor = 1.3552527156068805425e-20; // 2^-66
stddev_div_target_rms = sqrt(
fmax(tsum / (target_rms * target_rms * x_d.cols), kSquaredNormFloor));
scale = Real(1) / stddev_div_target_rms;
}
__syncthreads();

// Store normalized input to output
Real* y_row = y + i * y_stride;
Expand Down
49 changes: 47 additions & 2 deletions src/cudamatrix/cu-math-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -545,6 +545,50 @@ static void UnitTestCuMathNormalizePerRow() {
}
}


template<typename Real>
static void UnitTestCuMathNormalizePerRow_v2() {

int row = 128;
int col = 1024;

Matrix<Real> Hi(row,col);
Matrix<Real> Ho(row,col);
Hi.SetRandn();
Hi.Scale(5.0);
Hi.ApplyFloor(0.0); // like ReLU,

CuMatrix<Real> Di(row, col);
CuMatrix<Real> Do(row, col);
Di.CopyFromMat(Hi);

Real target_rms = 0.3456;
bool add_log_stddev = false;
const Real kSquaredNormFloor = 1.35525271560688e-20; // 2^-66

//gpu
cu::NormalizePerRow(Di, target_rms, add_log_stddev, &Do);

//cpu
{
MatrixBase<Real>& in(Hi);
MatrixBase<Real>& out(Ho);
Real target_rms=0.3456;
Vector<Real> in_norm(in.NumRows());
Real d_scaled = in.NumCols() * target_rms * target_rms;
in_norm.AddDiagMat2(1.0 / d_scaled, in, kNoTrans, 0.0);
in_norm.ApplyFloor(kSquaredNormFloor);
in_norm.ApplyPow(-0.5);
out.CopyFromMat(in);
out.MulRowsVec(in_norm);
}

Matrix<Real> Ho2(Do);
// here the BUG was detected (by processing big-enough matrix),
AssertEqual(Ho,Ho2,0.00001);
}


template<typename Real>
static void UnitTestCuDiffNormalizePerRow() {
for (int32 i = 0; i < 2; i++) {
Expand Down Expand Up @@ -660,6 +704,7 @@ template<typename Real> void CudaMathUnitTest() {
UnitTestEnsureNonzero<Real>();
UnitTestBackpropLstmNonlinearity<Real>();
UnitTestCuMathNormalizePerRow<Real>();
UnitTestCuMathNormalizePerRow_v2<Real>();
UnitTestCuDiffNormalizePerRow<Real>();
}

Expand All @@ -673,9 +718,9 @@ int main() {
for (; loop < 2; loop++) {
CuDevice::Instantiate().SetDebugStrideMode(true);
if (loop == 0)
CuDevice::Instantiate().SelectGpuId("no"); // -1 means no GPU
CuDevice::Instantiate().SelectGpuId("no"); // 0 means no GPU
else
CuDevice::Instantiate().SelectGpuId("yes"); // -2 .. automatic selection
CuDevice::Instantiate().SelectGpuId("yes"); // 1 .. automatic selection
#endif
srand(time(NULL));
kaldi::CudaMathUnitTest<float>();
Expand Down
2 changes: 1 addition & 1 deletion src/decoder/grammar-fst.cc
Original file line number Diff line number Diff line change
Expand Up @@ -706,7 +706,7 @@ bool GrammarFstPreparer::IsEntryState(StateId s) const {
// we check that at least one has label with nonterminal equal to #nonterm_begin...
// in fact they will all have this value if at least one does, and this was checked
// in NeedEpsilons().
if (nonterminal == kNontermBegin)
if (nonterminal == GetPhoneSymbolFor(kNontermBegin))
return true;
}
return false;
Expand Down
64 changes: 64 additions & 0 deletions src/doc/online_decoding.dox
Original file line number Diff line number Diff line change
Expand Up @@ -438,6 +438,70 @@ and downloadable models that can be used with online nnet3 decoding, please
see http://kaldi-asr.org/models.html (the first model there, the ASPIRE model,
includes instructions in a README file).

\subsection online_decoding_nnet3_tcp TCP server for nnet3 online decoding

The program to run the TCP server is online2-tcp-nnet3-decode-faster located in the
~/src/online2bin folder. The usage is as follows:

\verbatim
online2-tcp-nnet3-decode-faster <nnet3-in> <fst-in> <word-symbol-table> <listen-port>
\endverbatim

For example:

\verbatim
online2-tcp-nnet3-decode-faster model/final.mdl graph/HCLG.fst graph/words.txt 5050
\endverbatim

The word symbol table is mandatory (unlike other nnet3 online decoding programs) because
the server outputs word strings. Endpointing is mandatory to make the operation of the
program reasonable. Other, non-standard options include:
- samp-freq - sampling frequency of audio (usually 8000 for telephony and 16000 for other uses)
- chunk-length - length of signal being processed by decoder at each step
- output-period - how often we check for changes in the decoding (i.e. output refresh rate, default 1s)
- num-threads-startup - number of threads used when initializing iVector extractor

The TCP protocol simply takes RAW signal on input (16-bit signed integer
encoding at chosen sampling frequency) and outputs simple text using the following
logic:
- each refresh period (output-period option) the current state of decoding is output
- each line is terminated by '\r'
- once an utterance boundary is detected due to endpointing a '\n' char is output

Each output string (delimited by '\r') should be treated as uncertain and can change
entirely until the utterance delimiter ('\n') is sent. The delimiter chars are chosen
specifically in order to make the output look neat in the terminal. It is possible to
use it with other interfaces and a web demo (HTML/JS AudioAPI+WebSockets) exists.

To run the program from the terminal you can use one of the following commands. First,
make sure the server is running and accepting connections. Using the Aspire models, the
command should look like this:
\verbatim
online2-tcp-nnet3-decode-faster --samp-freq=8000 --frames-per-chunk=20 --extra-left-context-initial=0
--frame-subsampling-factor=3 --config=model/conf/online.conf --min-active=200 --max-active=7000
--beam=15.0 --lattice-beam=6.0 --acoustic-scale=1.0 model/final.mdl graph/HCLG.fst graph/words.txt 5050
\endverbatim

To send a WAV file into the server, it first needs to be decoded into raw audio, then it can be
sent to the socket:
\verbatim
sox audio.wav -t raw -c 1 -b 16 -r 8k -e signed-integer - | nc -N localhost 5050
\endverbatim

It is possible to play the audio (almost) simultaneously with decoding. It may require installing the
'pv' program (used to throttle the signal into Kaldi at the same speed as the playback):

\verbatim
sox audio.wav -t raw -c 1 -b 16 -r 8k -e signed-integer - | \
tee >(play -t raw -r 8k -e signed-integer -b 16 -c 1 -q -) | \
pv -L 16000 -q | nc -N localhost 5050
\endverbatim

Finally, it is possible to send audio from the microphone directly into the server:

\verbatim
rec -r 8k -e signed-integer -c 1 -b 16 -t raw -q - | nc -N localhost 5050
\endverbatim


*/
Expand Down
Loading

0 comments on commit 4cab3db

Please sign in to comment.