| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| ../../sre08/v1/sid/ |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| ../../wsj/s5/steps/ |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| ../../wsj/s5/utils |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,194 @@ | ||
| #!/usr/bin/env python3 | ||
| # Copyright 2017 David Snyder | ||
| # Apache 2.0 | ||
| # | ||
| # This script generates augmented data. It is based on | ||
| # steps/data/reverberate_data_dir.py but doesn't handle reverberation. | ||
| # It is designed to be somewhat simpler and more flexible for augmenting with | ||
| # additive noise. | ||
| from __future__ import print_function | ||
| import sys, random, argparse, os, imp | ||
| sys.path.append("steps/data/") | ||
| from reverberate_data_dir import ParseFileToDict | ||
| from reverberate_data_dir import WriteDictToFile | ||
| data_lib = imp.load_source('dml', 'steps/data/data_dir_manipulation_lib.py') | ||
|
|
||
def GetArgs():
    """Build the argument parser, parse sys.argv, and return validated args.

    Echoes the full command line to stdout (useful in Kaldi logs), then
    delegates validation to CheckArgs().
    """
    arg_parser = argparse.ArgumentParser(
        description="Augment the data directory with additive noises. "
        "Noises are separated into background and foreground noises which are added together or "
        "separately. Background noises are added to the entire recording, and repeated as necessary "
        "to cover the full length. Multiple overlapping background noises can be added, to simulate "
        "babble, for example. Foreground noises are added sequentially, according to a specified "
        "interval. See also steps/data/reverberate_data_dir.py "
        "Usage: augment_data_dir.py [options...] <in-data-dir> <out-data-dir> "
        "E.g., steps/data/augment_data_dir.py --utt-suffix aug --fg-snrs 20:10:5:0 --bg-snrs 20:15:10 "
        "--num-bg-noise 1:2:3 --fg-interval 3 --fg-noise-dir data/musan_noise --bg-noise-dir "
        "data/musan_music data/train data/train_aug",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Colon-separated SNR lists; the augmentation code draws from these.
    arg_parser.add_argument('--fg-snrs', type=str, dest="fg_snr_str",
                            default='20:10:0',
                            help='When foreground noises are being added, the script will iterate through these SNRs.')
    arg_parser.add_argument('--bg-snrs', type=str, dest="bg_snr_str",
                            default='20:10:0',
                            help='When background noises are being added, the script will iterate through these SNRs.')
    arg_parser.add_argument('--num-bg-noises', type=str, dest="num_bg_noises",
                            default='1',
                            help='Number of overlapping background noises that we iterate over. For example, if the input is "1:2:3" then the output wavs will have either 1, 2, or 3 randomly chosen background noises overlapping the entire recording')
    arg_parser.add_argument('--fg-interval', type=int, dest="fg_interval",
                            default=0,
                            help='Number of seconds between the end of one foreground noise and the beginning of the next.')
    arg_parser.add_argument('--utt-suffix', type=str, dest="utt_suffix",
                            default="aug",
                            help='Suffix added to utterance IDs.')
    arg_parser.add_argument('--random-seed', type=int, dest="random_seed",
                            default=123,
                            help='Random seed.')
    arg_parser.add_argument("--bg-noise-dir", type=str, dest="bg_noise_dir",
                            help="Background noise data directory")
    arg_parser.add_argument("--fg-noise-dir", type=str, dest="fg_noise_dir",
                            help="Foreground noise data directory")
    arg_parser.add_argument("input_dir", help="Input data directory")
    arg_parser.add_argument("output_dir", help="Output data directory")

    # Echo the invocation so it appears in logs.
    print(' '.join(sys.argv))
    return CheckArgs(arg_parser.parse_args())
|
|
||
def CheckArgs(args):
    """Validate parsed options and ensure the output directory exists.

    Raises:
        Exception: if --fg-interval is negative, or if neither
            --fg-noise-dir nor --bg-noise-dir was supplied.

    Returns:
        The same args namespace, unmodified.
    """
    if args.fg_interval < 0:
        raise Exception("--fg-interval must be 0 or greater")
    if args.bg_noise_dir is None and args.fg_noise_dir is None:
        raise Exception("Either --fg-noise-dir or --bg-noise-dir must be specified")
    # Create the output directory lazily so callers need not pre-create it.
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    return args
|
|
||
def GetNoiseList(noise_wav_scp_filename):
    """Read a Kaldi wav.scp file describing noise recordings.

    Each line has the form "<utt-id> <wav-command-or-path...>"; everything
    after the first token (which may be a multi-token shell pipeline) is
    kept as the wav string.

    Args:
        noise_wav_scp_filename: path to the wav.scp file.

    Returns:
        (noise_utts, noise_wavs): the utterance IDs in file order, and a
        dict mapping each ID to its (rstripped) wav string.
    """
    noise_wavs = {}
    noise_utts = []
    # Use a context manager so the file handle is closed deterministically
    # (the original relied on the garbage collector to close it).
    with open(noise_wav_scp_filename, 'r') as noise_wav_scp_file:
        for line in noise_wav_scp_file:
            toks = line.split(" ")
            wav = " ".join(toks[1:])
            noise_utts.append(toks[0])
            noise_wavs[toks[0]] = wav.rstrip()
    return noise_utts, noise_wavs
|
|
||
def AugmentWav(utt, wav, dur, fg_snr_opts, bg_snr_opts, fg_noise_utts, \
        bg_noise_utts, noise_wavs, noise2dur, interval, num_opts):
    """Build a wav-reverberate command that adds noises to one recording.

    Args:
        utt: utterance ID (currently unused inside this function).
        wav: original wav.scp entry -- a single filename, or a shell
            pipeline whose last token is '|'.
        dur: duration of the recording in seconds.
        fg_snr_opts: SNR values to draw from for foreground noises.
        bg_snr_opts: SNR values to draw from for background noises.
        fg_noise_utts: candidate foreground noise utterance IDs.
        bg_noise_utts: candidate background noise utterance IDs.
        noise_wavs: dict mapping noise utterance ID -> wav command/path.
        noise2dur: dict mapping noise utterance ID -> duration in seconds.
        interval: seconds between the end of one foreground noise and the
            start of the next.
        num_opts: options for how many overlapping background noises to
            add; one element is chosen at random.

    Returns:
        A new wav.scp command string that pipes the original audio through
        wav-reverberate with the chosen additive noises.

    NOTE(review): the *_opts / *_utts arguments must support indexing
    (i.e. be lists), since random.choice() cannot index an iterator.
    """
    # This section is common to both foreground and background noises
    new_wav = ""
    dur_str = str(dur)
    noise_dur = 0
    tot_noise_dur = 0
    snrs=[]
    noises=[]
    start_times=[]

    # Now handle the background noises
    if len(bg_noise_utts) > 0:
        num = random.choice(num_opts)
        for i in range(0, num):
            noise_utt = random.choice(bg_noise_utts)
            # Pipe the noise through wav-reverberate --duration so it is
            # repeated/truncated to cover the full recording length.
            noise = noise_wavs[noise_utt] + " wav-reverberate --duration=" \
                + dur_str + " - - |"
            snr = random.choice(bg_snr_opts)
            snrs.append(snr)
            # Background noises always start at the beginning.
            start_times.append(0)
            noises.append(noise)

    # Now handle the foreground noises
    if len(fg_noise_utts) > 0:
        # Lay foreground noises end-to-end (with 'interval' seconds of
        # gap) until the recording duration is covered.
        while tot_noise_dur < dur:
            noise_utt = random.choice(fg_noise_utts)
            noise = noise_wavs[noise_utt]
            snr = random.choice(fg_snr_opts)
            snrs.append(snr)
            noise_dur = noise2dur[noise_utt]
            start_times.append(tot_noise_dur)
            tot_noise_dur += noise_dur + interval
            noises.append(noise)

    # Assemble the comma-separated option strings for wav-reverberate.
    start_times_str = "--start-times='" + ",".join(map(str,start_times)) + "'"
    snrs_str = "--snrs='" + ",".join(map(str,snrs)) + "'"
    noises_str = "--additive-signals='" + ",".join(noises) + "'"

    # If the wav is just a file
    if len(wav.split()) == 1:
        new_wav = "wav-reverberate --shift-output=true " + noises_str + " " \
            + start_times_str + " " + snrs_str + " " + wav + " - |"
    # Else if the wav is in a pipe
    else:
        # The pipe already ends in '|', so wav-reverberate reads from stdin.
        new_wav = wav + "wav-reverberate --shift-output=true " + noises_str + " " \
            + start_times_str + " " + snrs_str + " - - |"
    return new_wav
|
|
||
def CopyFileIfExists(utt_suffix, filename, input_dir, output_dir):
    """Copy a Kaldi data-dir table file, optionally suffixing its keys.

    If <input_dir>/<filename> exists, it is parsed as "<key> <value...>",
    every key gets "-<utt_suffix>" appended (when the suffix is non-empty),
    and the result is written to <output_dir>/<filename>. Missing files
    are silently skipped.

    Args:
        utt_suffix: suffix for the keys; "" leaves keys unchanged
            (used for speaker-keyed files such as spk2gender).
        filename: basename of the table file, e.g. "utt2spk".
        input_dir: source data directory.
        output_dir: destination data directory.
    """
    src = input_dir + "/" + filename
    if os.path.isfile(src):
        # The value_processor re-joins the remaining fields into one string.
        # (Renamed from 'dict', which shadowed the builtin.)
        table = ParseFileToDict(src, value_processor=lambda x: " ".join(x))
        if len(utt_suffix) > 0:
            table = {key + "-" + utt_suffix: value
                     for key, value in table.items()}
        WriteDictToFile(table, output_dir + "/" + filename)
|
|
||
def main():
    """Create a noise-augmented copy of a Kaldi data directory.

    Reads <input-dir>/wav.scp and reco2dur, builds an augmented wav
    command for each recording via AugmentWav(), and writes a new data
    directory whose utterance IDs carry the --utt-suffix.
    """
    args = GetArgs()
    # list() is essential: in Python 3, map() returns an iterator, and
    # random.choice() (called downstream in AugmentWav) requires an
    # indexable sequence -- without list() this raises TypeError.
    fg_snrs = list(map(int, args.fg_snr_str.split(":")))
    bg_snrs = list(map(int, args.bg_snr_str.split(":")))
    input_dir = args.input_dir
    output_dir = args.output_dir
    num_bg_noises = list(map(int, args.num_bg_noises.split(":")))
    reco2dur = ParseFileToDict(input_dir + "/reco2dur",
                               value_processor=lambda x: float(x[0]))
    with open(input_dir + "/wav.scp", 'r') as f:
        wav_scp_file = f.readlines()

    noise_wavs = {}
    noise_reco2dur = {}
    bg_noise_utts = []
    fg_noise_utts = []

    # Load background noises
    if args.bg_noise_dir:
        bg_noise_wav_filename = args.bg_noise_dir + "/wav.scp"
        bg_noise_utts, bg_noise_wavs = GetNoiseList(bg_noise_wav_filename)
        bg_noise_reco2dur = ParseFileToDict(args.bg_noise_dir + "/reco2dur",
                                            value_processor=lambda x: float(x[0]))
        noise_wavs.update(bg_noise_wavs)
        noise_reco2dur.update(bg_noise_reco2dur)

    # Load foreground noises (the original comment wrongly said "background")
    if args.fg_noise_dir:
        fg_noise_wav_filename = args.fg_noise_dir + "/wav.scp"
        fg_noise_utts, fg_noise_wavs = GetNoiseList(fg_noise_wav_filename)
        fg_noise_reco2dur = ParseFileToDict(args.fg_noise_dir + "/reco2dur",
                                            value_processor=lambda x: float(x[0]))
        noise_wavs.update(fg_noise_wavs)
        noise_reco2dur.update(fg_noise_reco2dur)

    # Seed once so the augmentation is reproducible.
    random.seed(args.random_seed)
    new_utt2wav = {}

    # Augment each line in the wav file
    for line in wav_scp_file:
        toks = line.rstrip().split(" ")
        utt = toks[0]
        wav = " ".join(toks[1:])
        dur = reco2dur[utt]
        new_wav = AugmentWav(utt, wav, dur, fg_snrs, bg_snrs, fg_noise_utts,
                             bg_noise_utts, noise_wavs, noise_reco2dur,
                             args.fg_interval, num_bg_noises)
        new_utt = utt + "-" + args.utt_suffix
        new_utt2wav[new_utt] = new_wav

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    WriteDictToFile(new_utt2wav, output_dir + "/wav.scp")
    # Copy the per-utterance tables, suffixing their keys.
    # (The original called CopyFileIfExists for "utt2spk" twice; once
    # is sufficient.)
    CopyFileIfExists(args.utt_suffix, "utt2spk", input_dir, output_dir)
    CopyFileIfExists(args.utt_suffix, "utt2lang", input_dir, output_dir)
    CopyFileIfExists(args.utt_suffix, "text", input_dir, output_dir)
    CopyFileIfExists(args.utt_suffix, "vad.scp", input_dir, output_dir)
    # spk2gender is keyed by speaker, so its keys get no suffix.
    CopyFileIfExists("", "spk2gender", input_dir, output_dir)
    data_lib.RunKaldiCommand("utils/fix_data_dir.sh {output_dir}".format(
        output_dir=output_dir))


if __name__ == "__main__":
    main()
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,211 @@ | ||
| // nnet3bin/nnet3-xvector-compute.cc | ||
|
|
||
| // Copyright 2017 Johns Hopkins University (author: Daniel Povey) | ||
| // 2017 Johns Hopkins University (author: Daniel Garcia-Romero) | ||
| // 2017 David Snyder | ||
|
|
||
| // See ../../COPYING for clarification regarding multiple authors | ||
| // | ||
| // Licensed under the Apache License, Version 2.0 (the "License"); | ||
| // you may not use this file except in compliance with the License. | ||
| // You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED | ||
| // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, | ||
| // MERCHANTABLITY OR NON-INFRINGEMENT. | ||
| // See the Apache 2 License for the specific language governing permissions and | ||
| // limitations under the License. | ||
|
|
||
|
|
||
| #include "base/kaldi-common.h" | ||
| #include "util/common-utils.h" | ||
| #include "nnet3/nnet-am-decodable-simple.h" | ||
| #include "base/timer.h" | ||
| #include "nnet3/nnet-utils.h" | ||
|
|
||
| namespace kaldi { | ||
| namespace nnet3 { | ||
|
|
||
// Computes an xvector from a chunk of speech features.
//
// The whole 'features' matrix is fed to the network input named "input",
// a single frame of the output named "output" is requested, and its first
// row is copied into 'xvector'.  'compiler' caches compiled computations,
// so repeated calls with the same number of feature rows reuse the
// compilation.
static void RunNnetComputation(const MatrixBase<BaseFloat> &features,
    const Nnet &nnet, CachingOptimizingCompiler *compiler,
    Vector<BaseFloat> *xvector) {
  ComputationRequest request;
  request.need_model_derivative = false;  // inference only; no backprop.
  request.store_component_stats = false;
  // One input spec covering all features.NumRows() frames.
  request.inputs.push_back(
    IoSpecification("input", 0, features.NumRows()));
  IoSpecification output_spec;
  output_spec.name = "output";
  output_spec.has_deriv = false;
  // Request exactly one output index -- the segment-level xvector.
  output_spec.indexes.resize(1);
  request.outputs.resize(1);
  request.outputs[0].Swap(&output_spec);
  const NnetComputation *computation = compiler->Compile(request);
  Nnet *nnet_to_update = NULL;  // we're not doing any update.
  NnetComputer computer(NnetComputeOptions(), *computation,
      nnet, nnet_to_update);
  CuMatrix<BaseFloat> input_feats_cu(features);
  computer.AcceptInput("input", &input_feats_cu);
  computer.Run();
  CuMatrix<BaseFloat> cu_output;
  // Destructive fetch avoids copying the output matrix.
  computer.GetOutputDestructive("output", &cu_output);
  xvector->Resize(cu_output.NumCols());
  xvector->CopyFromVec(cu_output.Row(0));
}
|
|
||
| } // namespace nnet3 | ||
| } // namespace kaldi | ||
|
|
||
int main(int argc, char *argv[]) {
  try {
    using namespace kaldi;
    using namespace kaldi::nnet3;
    typedef kaldi::int32 int32;
    typedef kaldi::int64 int64;

    const char *usage =
        "Propagate features through an xvector neural network model and write\n"
        "the output vectors. \"Xvector\" is our term for a vector or\n"
        "embedding which is the output of a particular type of neural network\n"
        "architecture found in speaker recognition. This architecture\n"
        "consists of several layers that operate on frames, a statistics\n"
        "pooling layer that aggregates over the frame-level representations\n"
        "and possibly additional layers that operate on segment-level\n"
        "representations. The xvectors are generally extracted from an\n"
        "output layer after the statistics pooling layer. By default, one\n"
        "xvector is extracted directly from the set of features for each\n"
        "utterance. Optionally, xvectors are extracted from chunks of input\n"
        "features and averaged, to produce a single vector.\n"
        "\n"
        "Usage: nnet3-xvector-compute [options] <raw-nnet-in> "
        "<features-rspecifier> <vector-wspecifier>\n"
        "e.g.: nnet3-xvector-compute final.raw scp:feats.scp "
        "ark:nnet_prediction.ark\n"
        "See also: nnet3-compute\n";

    ParseOptions po(usage);
    Timer timer;

    NnetSimpleComputationOptions opts;
    opts.acoustic_scale = 1.0; // by default do no scaling in this recipe.

    std::string use_gpu = "no";
    // chunk_size == -1 means "use all frames of the utterance as one chunk".
    int32 chunk_size = -1,
      min_chunk_size = 100;

    opts.Register(&po);
    po.Register("use-gpu", &use_gpu,
      "yes|no|optional|wait, only has effect if compiled with CUDA");
    po.Register("chunk-size", &chunk_size,
      "If set, extracts xectors from specified chunk-size, and averages. "
      "If not set, extracts an xvector from all available features.");
    po.Register("min-chunk-size", &min_chunk_size,
      "Minimum chunk-size allowed when extracting xvectors.");

    po.Read(argc, argv);

    if (po.NumArgs() != 3) {
      po.PrintUsage();
      exit(1);
    }

#if HAVE_CUDA==1
    CuDevice::Instantiate().SelectGpuId(use_gpu);
#endif

    std::string nnet_rxfilename = po.GetArg(1),
        feature_rspecifier = po.GetArg(2),
        vector_wspecifier = po.GetArg(3);

    Nnet nnet;
    ReadKaldiObject(nnet_rxfilename, &nnet);
    // Put the network into inference mode and simplify it for evaluation.
    SetBatchnormTestMode(true, &nnet);
    SetDropoutTestMode(true, &nnet);
    CollapseModel(CollapseModelConfig(), &nnet);

    CachingOptimizingCompiler compiler(nnet, opts.optimize_config);

    BaseFloatVectorWriter vector_writer(vector_wspecifier);

    int32 num_success = 0, num_fail = 0;
    int64 frame_count = 0;
    int32 xvector_dim = nnet.OutputDim("output");

    SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);

    for (; !feature_reader.Done(); feature_reader.Next()) {
      std::string utt = feature_reader.Key();
      const Matrix<BaseFloat> &features (feature_reader.Value());
      if (features.NumRows() == 0) {
        KALDI_WARN << "Zero-length utterance: " << utt;
        num_fail++;
        continue;
      }
      int32 num_rows = features.NumRows(),
        feat_dim = features.NumCols(),
        this_chunk_size = chunk_size;

      if (num_rows < min_chunk_size) {
        // Too short to extract any xvector at all; skip the utterance.
        KALDI_WARN << "Minimum chunk size of " << min_chunk_size
                   << " is greater than the number of rows "
                   << "in utterance: " << utt;
        num_fail++;
        continue;
      } else if (num_rows < chunk_size) {
        // Utterance is shorter than the requested chunk; fall back to one
        // chunk covering the whole utterance.
        KALDI_LOG << "Chunk size of " << chunk_size << " is greater than "
                  << "the number of rows in utterance: " << utt
                  << ", using chunk size  of " << num_rows;
        this_chunk_size = num_rows;
      } else if (chunk_size == -1) {
        this_chunk_size = num_rows;
      }

      int32 num_chunks = ceil(
        num_rows / static_cast<BaseFloat>(this_chunk_size));
      // Frame-count-weighted average of the per-chunk xvectors.
      Vector<BaseFloat> xvector_avg(xvector_dim, kSetZero);
      BaseFloat tot_weight = 0.0;

      // Iterate over the feature chunks.
      for (int32 chunk_indx = 0; chunk_indx < num_chunks; chunk_indx++) {
        // The final chunk may have fewer than this_chunk_size frames;
        // 'offset' is the number of frames actually available to it.
        // NOTE(review): chunks shorter than min_chunk_size are simply
        // skipped, so those trailing frames do not contribute to the
        // average (the offset is never "shifted back").
        int32 offset = std::min(
          this_chunk_size, num_rows - chunk_indx * this_chunk_size);
        if (offset < min_chunk_size)
          continue;
        SubMatrix<BaseFloat> sub_features(
          features, chunk_indx * this_chunk_size, offset, 0, feat_dim);
        Vector<BaseFloat> xvector;
        tot_weight += offset;
        RunNnetComputation(sub_features, nnet, &compiler, &xvector);
        // Weight each chunk's xvector by its number of frames.
        xvector_avg.AddVec(offset, xvector);
      }
      // tot_weight > 0 is guaranteed: the first chunk always has
      // offset == this_chunk_size >= min_chunk_size.
      xvector_avg.Scale(1.0 / tot_weight);
      vector_writer.Write(utt, xvector_avg);

      frame_count += features.NumRows();
      num_success++;
    }

#if HAVE_CUDA==1
    CuDevice::Instantiate().PrintProfile();
#endif
    double elapsed = timer.Elapsed();
    KALDI_LOG << "Time taken "<< elapsed
              << "s: real-time factor assuming 100 frames/sec is "
              << (elapsed*100.0/frame_count);
    KALDI_LOG << "Done " << num_success << " utterances, failed for "
              << num_fail;

    if (num_success != 0) return 0;
    else return 1;
  } catch(const std::exception &e) {
    std::cerr << e.what();
    return -1;
  }
}
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,229 @@ | ||
| // nnet3bin/nnet3-xvector-get-egs.cc | ||
|
|
||
| // Copyright 2016-2017 Johns Hopkins University (author: Daniel Povey) | ||
| // 2016-2017 Johns Hopkins University (author: Daniel Garcia-Romero) | ||
| // 2016-2017 David Snyder | ||
|
|
||
| // See ../../COPYING for clarification regarding multiple authors | ||
| // | ||
| // Licensed under the Apache License, Version 2.0 (the "License"); | ||
| // you may not use this file except in compliance with the License. | ||
| // You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED | ||
| // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, | ||
| // MERCHANTABLITY OR NON-INFRINGEMENT. | ||
| // See the Apache 2 License for the specific language governing permissions and | ||
| // limitations under the License. | ||
|
|
||
| #include <sstream> | ||
| #include "util/common-utils.h" | ||
| #include "nnet3/nnet-example.h" | ||
|
|
||
| namespace kaldi { | ||
| namespace nnet3 { | ||
|
|
||
// A struct for holding information about the position and
// duration of each chunk.
struct ChunkInfo {
  std::string name;         // example key: "<utt>-<start>-<num>-<label>"
  int32 output_archive_id;  // zero-based index into the output writers
  int32 start_frame;        // first frame of the chunk within the utterance
  int32 num_frames;         // number of frames in the chunk
  int32 label;              // integer speaker label for this chunk
};
|
|
||
| // Process the range input file and store it as a map from utterance | ||
| // name to vector of ChunkInfo structs. | ||
| static void ProcessRangeFile(const std::string &range_rxfilename, | ||
| unordered_map<std::string, std::vector<ChunkInfo *> > *utt_to_chunks) { | ||
| Input range_input(range_rxfilename); | ||
| if (!range_rxfilename.empty()) { | ||
| std::string line; | ||
| while (std::getline(range_input.Stream(), line)) { | ||
| ChunkInfo *chunk_info = new ChunkInfo(); | ||
| std::vector<std::string> fields; | ||
| SplitStringToVector(line, " \t\n\r", true, &fields); | ||
| if (fields.size() != 6) | ||
| KALDI_ERR << "Expected 6 fields in line of range file, got " | ||
| << fields.size() << " instead."; | ||
|
|
||
| std::string utt = fields[0], | ||
| start_frame_str = fields[3], | ||
| num_frames_str = fields[4], | ||
| label_str = fields[5]; | ||
|
|
||
| if (!ConvertStringToInteger(fields[1], &(chunk_info->output_archive_id)) | ||
| || !ConvertStringToInteger(start_frame_str, &(chunk_info->start_frame)) | ||
| || !ConvertStringToInteger(num_frames_str, &(chunk_info->num_frames)) | ||
| || !ConvertStringToInteger(label_str, &(chunk_info->label))) | ||
| KALDI_ERR << "Expected integer for output archive in range file."; | ||
|
|
||
| chunk_info->name = utt + "-" + start_frame_str + "-" + num_frames_str | ||
| + "-" + label_str; | ||
| unordered_map<std::string, std::vector<ChunkInfo*> >::iterator | ||
| got = utt_to_chunks->find(utt); | ||
|
|
||
| if (got == utt_to_chunks->end()) { | ||
| std::vector<ChunkInfo* > chunk_infos; | ||
| chunk_infos.push_back(chunk_info); | ||
| utt_to_chunks->insert(std::pair<std::string, | ||
| std::vector<ChunkInfo* > > (utt, chunk_infos)); | ||
| } else { | ||
| got->second.push_back(chunk_info); | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| static void WriteExamples(const MatrixBase<BaseFloat> &feats, | ||
| const std::vector<ChunkInfo *> &chunks, const std::string &utt, | ||
| bool compress, int32 num_pdfs, int32 *num_egs_written, | ||
| std::vector<NnetExampleWriter *> *example_writers) { | ||
| for (std::vector<ChunkInfo *>::const_iterator it = chunks.begin(); | ||
| it != chunks.end(); ++it) { | ||
| ChunkInfo *chunk = *it; | ||
| NnetExample eg; | ||
| int32 num_rows = feats.NumRows(), | ||
| feat_dim = feats.NumCols(); | ||
| if (num_rows < chunk->num_frames) { | ||
| KALDI_WARN << "Unable to create examples for utterance " << utt | ||
| << ". Requested chunk size of " | ||
| << chunk->num_frames | ||
| << " but utterance has only " << num_rows << " frames."; | ||
| } else { | ||
| // The requested chunk positions are approximate. It's possible | ||
| // that they slightly exceed the number of frames in the utterance. | ||
| // If that occurs, we can shift the chunks location back slightly. | ||
| int32 shift = std::min(0, num_rows - chunk->start_frame | ||
| - chunk->num_frames); | ||
| SubMatrix<BaseFloat> chunk_mat(feats, chunk->start_frame + shift, | ||
| chunk->num_frames, 0, feat_dim); | ||
| NnetIo nnet_input = NnetIo("input", 0, chunk_mat); | ||
| for (std::vector<Index>::iterator indx_it = nnet_input.indexes.begin(); | ||
| indx_it != nnet_input.indexes.end(); ++indx_it) | ||
| indx_it->n = 0; | ||
|
|
||
| Posterior label; | ||
| std::vector<std::pair<int32, BaseFloat> > post; | ||
| post.push_back(std::pair<int32, BaseFloat>(chunk->label, 1.0)); | ||
| label.push_back(post); | ||
| NnetExample eg; | ||
| eg.io.push_back(nnet_input); | ||
| eg.io.push_back(NnetIo("output", num_pdfs, 0, label)); | ||
| if (compress) | ||
| eg.Compress(); | ||
|
|
||
| if (chunk->output_archive_id >= example_writers->size()) | ||
| KALDI_ERR << "Requested output index exceeds number of specified " | ||
| << "output files."; | ||
| (*example_writers)[chunk->output_archive_id]->Write( | ||
| chunk->name, eg); | ||
| (*num_egs_written) += 1; | ||
| } | ||
| } | ||
| } | ||
|
|
||
| } // namespace nnet3 | ||
| } // namespace kaldi | ||
|
|
||
int main(int argc, char *argv[]) {
  try {
    using namespace kaldi;
    using namespace kaldi::nnet3;
    typedef kaldi::int32 int32;

    const char *usage =
        "Get examples for training an nnet3 neural network for the xvector\n"
        "system.  Each output example contains a chunk of features from some\n"
        "utterance along with a speaker label.  The location and length of\n"
        "the feature chunks are specified in the 'ranges' file.  Each line\n"
        "is interpreted as follows:\n"
        "  <source-utterance> <relative-output-archive-index> "
        "<absolute-archive-index> <start-frame-index> <num-frames> "
        "<speaker-label>\n"
        "where <relative-output-archive-index> is interpreted as a zero-based\n"
        "index into the wspecifiers provided on the command line (<egs-0-out>\n"
        "and so on), and <absolute-archive-index> is ignored by this program.\n"
        "For example:\n"
        "  utt1  3  13  65  300  3\n"
        "  utt1  0  10  50  400  3\n"
        "  utt2  ...\n"
        "\n"
        "Usage:  nnet3-xvector-get-egs [options] <ranges-filename> "
        "<features-rspecifier> <egs-0-out> <egs-1-out> ... <egs-N-1-out>\n"
        "\n"
        "For example:\n"
        "nnet3-xvector-get-egs ranges.1 \"$feats\" ark:egs_temp.1.ark"
        "  ark:egs_temp.2.ark ark:egs_temp.3.ark\n";

    bool compress = true;
    int32 num_pdfs = -1;

    ParseOptions po(usage);
    po.Register("compress", &compress, "If true, write egs in "
                "compressed format.");
    po.Register("num-pdfs", &num_pdfs, "Number of speakers in the training "
                "list.");

    po.Read(argc, argv);

    if (po.NumArgs() < 3) {
      po.PrintUsage();
      exit(1);
    }

    std::string range_rspecifier = po.GetArg(1),
        feature_rspecifier = po.GetArg(2);
    // One example writer per output archive; positional args 3..N are
    // the wspecifiers.  The raw pointers are freed below.
    std::vector<NnetExampleWriter *> example_writers;

    for (int32 i = 3; i <= po.NumArgs(); i++)
      example_writers.push_back(new NnetExampleWriter(po.GetArg(i)));

    // Map from utterance ID to the chunks requested from that utterance.
    unordered_map<std::string, std::vector<ChunkInfo *> > utt_to_chunks;
    ProcessRangeFile(range_rspecifier, &utt_to_chunks);
    SequentialBaseFloatMatrixReader feat_reader(feature_rspecifier);

    int32 num_done = 0,
        num_err = 0,
        num_egs_written = 0;

    for (; !feat_reader.Done(); feat_reader.Next()) {
      std::string key = feat_reader.Key();
      const Matrix<BaseFloat> &feats = feat_reader.Value();
      unordered_map<std::string, std::vector<ChunkInfo*> >::iterator
          got = utt_to_chunks.find(key);
      if (got == utt_to_chunks.end()) {
        KALDI_WARN << "Could not create examples from utterance "
                   << key << " because it has no entry in the ranges "
                   << "input file.";
        num_err++;
      } else {
        // Note: this copies only the vector of pointers, not the chunks.
        std::vector<ChunkInfo *> chunks = got->second;
        WriteExamples(feats, chunks, key, compress, num_pdfs,
                      &num_egs_written, &example_writers);
        num_done++;
      }
    }

    // Free memory
    for (unordered_map<std::string, std::vector<ChunkInfo*> >::iterator
        map_it = utt_to_chunks.begin();
        map_it != utt_to_chunks.end(); ++map_it) {
      DeletePointers(&map_it->second);
    }
    DeletePointers(&example_writers);

    KALDI_LOG << "Finished generating examples, "
              << "successfully processed " << num_done
              << " feature files, wrote " << num_egs_written << " examples; "
              << num_err << " files had errors.";
    return (num_egs_written == 0 || num_err > num_done ? 1 : 0);
  } catch(const std::exception &e) {
    std::cerr << e.what() << '\n';
    return -1;
  }
}