320 changes: 320 additions & 0 deletions egs/sre16/v2/run.sh

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions egs/sre16/v2/sid
1 change: 1 addition & 0 deletions egs/sre16/v2/steps
1 change: 1 addition & 0 deletions egs/sre16/v2/utils
194 changes: 194 additions & 0 deletions egs/wsj/s5/steps/data/augment_data_dir.py
@@ -0,0 +1,194 @@
#!/usr/bin/env python3
# Copyright 2017 David Snyder
# Apache 2.0
#
# This script generates augmented data. It is based on
# steps/data/reverberate_data_dir.py but doesn't handle reverberation.
# It is designed to be somewhat simpler and more flexible for augmenting with
# additive noise.
from __future__ import print_function
import sys, random, argparse, os, imp
sys.path.append("steps/data/")
from reverberate_data_dir import ParseFileToDict
from reverberate_data_dir import WriteDictToFile
data_lib = imp.load_source('dml', 'steps/data/data_dir_manipulation_lib.py')

def GetArgs():
parser = argparse.ArgumentParser(description="Augment the data directory with additive noises. "
"Noises are separated into background and foreground noises which are added together or "
"separately. Background noises are added to the entire recording, and repeated as necessary "
"to cover the full length. Multiple overlapping background noises can be added, to simulate "
"babble, for example. Foreground noises are added sequentially, according to a specified "
"interval. See also steps/data/reverberate_data_dir.py "
"Usage: augment_data_dir.py [options...] <in-data-dir> <out-data-dir> "
"E.g., steps/data/augment_data_dir.py --utt-suffix aug --fg-snrs 20:10:5:0 --bg-snrs 20:15:10 "
"--num-bg-noise 1:2:3 --fg-interval 3 --fg-noise-dir data/musan_noise --bg-noise-dir "
"data/musan_music data/train data/train_aug", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--fg-snrs', type=str, dest = "fg_snr_str", default = '20:10:0',
help='When foreground noises are being added, the script will iterate through these SNRs.')
parser.add_argument('--bg-snrs', type=str, dest = "bg_snr_str", default = '20:10:0',
help='When background noises are being added, the script will iterate through these SNRs.')
parser.add_argument('--num-bg-noises', type=str, dest = "num_bg_noises", default = '1',
help='Number of overlapping background noises that we iterate over. For example, if the input is "1:2:3" then the output wavs will have either 1, 2, or 3 randomly chosen background noises overlapping the entire recording')
parser.add_argument('--fg-interval', type=int, dest = "fg_interval", default = 0,
help='Number of seconds between the end of one foreground noise and the beginning of the next.')
parser.add_argument('--utt-suffix', type=str, dest = "utt_suffix", default = "aug", help='Suffix added to utterance IDs.')
parser.add_argument('--random-seed', type=int, dest = "random_seed", default = 123, help='Random seed.')

parser.add_argument("--bg-noise-dir", type=str, dest="bg_noise_dir",
help="Background noise data directory")
parser.add_argument("--fg-noise-dir", type=str, dest="fg_noise_dir",
help="Foreground noise data directory")
parser.add_argument("input_dir", help="Input data directory")
parser.add_argument("output_dir", help="Output data directory")

print(' '.join(sys.argv))
args = parser.parse_args()
args = CheckArgs(args)
return args

def CheckArgs(args):
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
if args.fg_interval < 0:
raise Exception("--fg-interval must be 0 or greater")
if args.bg_noise_dir is None and args.fg_noise_dir is None:
raise Exception("Either --fg-noise-dir or --bg-noise-dir must be specified")
return args

def GetNoiseList(noise_wav_scp_filename):
noise_wav_scp_file = open(noise_wav_scp_filename, 'r').readlines()
noise_wavs = {}
noise_utts = []
for line in noise_wav_scp_file:
toks=line.split(" ")
wav = " ".join(toks[1:])
noise_utts.append(toks[0])
noise_wavs[toks[0]] = wav.rstrip()
return noise_utts, noise_wavs

def AugmentWav(utt, wav, dur, fg_snr_opts, bg_snr_opts, fg_noise_utts, \
bg_noise_utts, noise_wavs, noise2dur, interval, num_opts):
# This section is common to both foreground and background noises
new_wav = ""
dur_str = str(dur)
noise_dur = 0
tot_noise_dur = 0
snrs=[]
noises=[]
start_times=[]

# Now handle the background noises
if len(bg_noise_utts) > 0:
num = random.choice(num_opts)
for i in range(0, num):
noise_utt = random.choice(bg_noise_utts)
noise = noise_wavs[noise_utt] + " wav-reverberate --duration=" \
+ dur_str + " - - |"
snr = random.choice(bg_snr_opts)
snrs.append(snr)
start_times.append(0)
noises.append(noise)

# Now handle the foreground noises
if len(fg_noise_utts) > 0:
while tot_noise_dur < dur:
noise_utt = random.choice(fg_noise_utts)
noise = noise_wavs[noise_utt]
snr = random.choice(fg_snr_opts)
snrs.append(snr)
noise_dur = noise2dur[noise_utt]
start_times.append(tot_noise_dur)
tot_noise_dur += noise_dur + interval
noises.append(noise)

start_times_str = "--start-times='" + ",".join(map(str,start_times)) + "'"
snrs_str = "--snrs='" + ",".join(map(str,snrs)) + "'"
noises_str = "--additive-signals='" + ",".join(noises) + "'"

# If the wav is just a file
if len(wav.split()) == 1:
new_wav = "wav-reverberate --shift-output=true " + noises_str + " " \
+ start_times_str + " " + snrs_str + " " + wav + " - |"
# Else if the wav is in a pipe
else:
new_wav = wav + "wav-reverberate --shift-output=true " + noises_str + " " \
+ start_times_str + " " + snrs_str + " - - |"
return new_wav

def CopyFileIfExists(utt_suffix, filename, input_dir, output_dir):
if os.path.isfile(input_dir + "/" + filename):
dict = ParseFileToDict(input_dir + "/" + filename,
value_processor = lambda x: " ".join(x))
if len(utt_suffix) > 0:
new_dict = {}
for key in dict.keys():
new_dict[key + "-" + utt_suffix] = dict[key]
dict = new_dict
WriteDictToFile(dict, output_dir + "/" + filename)

def main():
args = GetArgs()
# Wrap map(...) in list(...) so random.choice can index these under Python 3.
fg_snrs = list(map(int, args.fg_snr_str.split(":")))
bg_snrs = list(map(int, args.bg_snr_str.split(":")))
input_dir = args.input_dir
output_dir = args.output_dir
num_bg_noises = list(map(int, args.num_bg_noises.split(":")))
reco2dur = ParseFileToDict(input_dir + "/reco2dur",
value_processor = lambda x: float(x[0]))
wav_scp_file = open(input_dir + "/wav.scp", 'r').readlines()

noise_wavs = {}
noise_reco2dur = {}
bg_noise_utts = []
fg_noise_utts = []

# Load background noises
if args.bg_noise_dir:
bg_noise_wav_filename = args.bg_noise_dir + "/wav.scp"
bg_noise_utts, bg_noise_wavs = GetNoiseList(bg_noise_wav_filename)
bg_noise_reco2dur = ParseFileToDict(args.bg_noise_dir + "/reco2dur",
value_processor = lambda x: float(x[0]))
noise_wavs.update(bg_noise_wavs)
noise_reco2dur.update(bg_noise_reco2dur)

# Load foreground noises
if args.fg_noise_dir:
fg_noise_wav_filename = args.fg_noise_dir + "/wav.scp"
fg_noise_utts, fg_noise_wavs = GetNoiseList(fg_noise_wav_filename)
fg_noise_reco2dur = ParseFileToDict(args.fg_noise_dir + "/reco2dur",
value_processor = lambda x: float(x[0]))
noise_wavs.update(fg_noise_wavs)
noise_reco2dur.update(fg_noise_reco2dur)

random.seed(args.random_seed)
new_utt2wav = {}
new_utt2spk = {}

# Augment each line in wav.scp
for line in wav_scp_file:
toks = line.rstrip().split(" ")
utt = toks[0]
wav = " ".join(toks[1:])
dur = reco2dur[utt]
new_wav = AugmentWav(utt, wav, dur, fg_snrs, bg_snrs, fg_noise_utts,
bg_noise_utts, noise_wavs, noise_reco2dur, args.fg_interval,
num_bg_noises)
new_utt = utt + "-" + args.utt_suffix
new_utt2wav[new_utt] = new_wav

if not os.path.exists(output_dir):
os.makedirs(output_dir)

WriteDictToFile(new_utt2wav, output_dir + "/wav.scp")
CopyFileIfExists(args.utt_suffix, "utt2spk", input_dir, output_dir)
CopyFileIfExists(args.utt_suffix, "utt2lang", input_dir, output_dir)
CopyFileIfExists(args.utt_suffix, "text", input_dir, output_dir)
CopyFileIfExists(args.utt_suffix, "utt2spk", input_dir, output_dir)
CopyFileIfExists(args.utt_suffix, "vad.scp", input_dir, output_dir)
CopyFileIfExists("", "spk2gender", input_dir, output_dir)
data_lib.RunKaldiCommand("utils/fix_data_dir.sh {output_dir}".format(output_dir = output_dir))

if __name__ == "__main__":
main()
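To make the pipeline construction in AugmentWav concrete, here is a minimal sketch of the command string it builds for the single-file branch with one background noise. The file names, duration, and SNR are hypothetical; the real script draws noises and SNRs at random from the supplied options.

# Sketch only: hypothetical paths and SNR; mirrors the single-file branch
# of AugmentWav above.
wav = "data/train/utt1.wav"
dur_str = "8.2"
# Background noises are looped/trimmed to the utterance duration via a
# nested wav-reverberate pipe.
noise = "data/musan_music/music-001.wav wav-reverberate --duration=" \
    + dur_str + " - - |"
noises_str = "--additive-signals='" + noise + "'"
start_times_str = "--start-times='0'"
snrs_str = "--snrs='15'"
new_wav = "wav-reverberate --shift-output=true " + noises_str + " " \
    + start_times_str + " " + snrs_str + " " + wav + " - |"
print(new_wav)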
43 changes: 21 additions & 22 deletions egs/wsj/s5/steps/data/reverberate_data_dir.py
@@ -20,7 +20,7 @@ def GetArgs():
"--random-seed 1 data/train data/train_rvb",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)

parser.add_argument("--rir-set-parameters", type=str, action='append', required = True, dest = "rir_set_para_array",
parser.add_argument("--rir-set-parameters", type=str, action='append', required = True, dest = "rir_set_para_array",
help="Specifies the parameters of an RIR set. "
"Supports the specification of mixture_weight and rir_list_file_name. The mixture weight is optional. "
"The default mixture weight is the probability mass remaining after adding the mixture weights "
@@ -104,7 +104,7 @@ def CheckArgs(args):

if args.isotropic_noise_addition_probability < 0 or args.isotropic_noise_addition_probability > 1:
raise Exception("--isotropic-noise-addition-probability must be between 0 and 1")

if args.rir_smoothing_weight < 0 or args.rir_smoothing_weight > 1:
raise Exception("--rir-smoothing-weight must be between 0 and 1")

@@ -113,7 +113,7 @@ def CheckArgs(args):

if args.max_noises_per_minute < 0:
raise Exception("--max-noises-per-minute cannot be negative")

if args.source_sampling_rate is not None and args.source_sampling_rate <= 0:
raise Exception("--source-sampling-rate cannot be non-positive")

@@ -133,7 +133,7 @@ def next(self):


# This functions picks an item from the collection according to the associated probability distribution.
# The probability estimate of each item in the collection is stored in the "probability" field of
# the particular item. x : a collection (list or dictionary) where the values contain a field called probability
def PickItemWithProbability(x):
if isinstance(x, dict):
@@ -155,7 +155,6 @@ def PickItemWithProbability(x):
def ParseFileToDict(file, assert2fields = False, value_processor = None):
if value_processor is None:
value_processor = lambda x: x[0]

dict = {}
for line in open(file, 'r'):
parts = line.split()
@@ -236,7 +235,7 @@ def AddPointSourceNoise(noise_addition_descriptor, # descriptor to store the in


# This function randomly decides whether to reverberate, and sample a RIR if it does
# It also decides whether to add the appropriate noises
# This function returns the string of options to the binary wav-reverberate
def GenerateReverberationOpts(room_dict, # the room dictionary, please refer to MakeRoomDict() for the format
pointsource_noise_list, # the point source noise list
@@ -306,15 +305,15 @@ def GetNewId(id, prefix=None, copy=0):
new_id = id

return new_id


# This is the main function to generate pipeline command for the corruption
# The generic command of wav-reverberate will be like:
# wav-reverberate --duration=t --impulse-response=rir.wav
# --additive-signals='noise1.wav,noise2.wav' --snrs='snr1,snr2' --start-times='s1,s2' input.wav output.wav
def GenerateReverberatedWavScp(wav_scp, # a dictionary whose values are the Kaldi-IO strings of the speech recordings
durations, # a dictionary whose values are the duration (in sec) of the speech recordings
output_dir, # output directory to write the corrupted wav.scp
room_dict, # the room dictionary, please refer to MakeRoomDict() for the format
pointsource_noise_list, # the point source noise list
iso_noise_dict, # the isotropic noise dictionary
@@ -358,11 +357,11 @@ def GenerateReverberatedWavScp(wav_scp, # a dictionary whose values are the Kal
pointsource_noise_addition_probability, # Probability of adding point-source noises
speech_dur, # duration of the recording
max_noises_recording # Maximum number of point-source noises that can be added
)

# prefix using index 0 is reserved for original data e.g. rvb0_swb0035 corresponds to the swb0035 recording in original data
if reverberate_opts == "" or i == 0:
wav_corrupted_pipe = "{0}".format(wav_original_pipe)
else:
wav_corrupted_pipe = "{0} wav-reverberate --shift-output={1} {2} - - |".format(wav_original_pipe, shift_output, reverberate_opts)

@@ -380,7 +379,7 @@ def AddPrefixToFields(input_file, output_file, num_replicas, include_original, p
start_index = 0
else:
start_index = 1

for i in range(start_index, num_replicas+1):
for line in list:
if len(line) > 0 and line[0] != ';':
@@ -410,7 +409,7 @@ def CreateReverberatedCopy(input_dir,
pointsource_noise_addition_probability, # Probability of adding point-source noises
max_noises_per_minute # maximum number of point-source noises that can be added to a recording according to its duration
):

wav_scp = ParseFileToDict(input_dir + "/wav.scp", value_processor = lambda x: " ".join(x))
if not os.path.isfile(input_dir + "/reco2dur"):
print("Getting the duration of the recordings...");
@@ -426,8 +425,8 @@ def CreateReverberatedCopy(input_dir,
background_snr_array = map(lambda x: float(x), background_snr_string.split(':'))

GenerateReverberatedWavScp(wav_scp, durations, output_dir, room_dict, pointsource_noise_list, iso_noise_dict,
foreground_snr_array, background_snr_array, num_replicas, include_original, prefix,
speech_rvb_probability, shift_output, isotropic_noise_addition_probability,
pointsource_noise_addition_probability, max_noises_per_minute)

AddPrefixToFields(input_dir + "/utt2spk", output_dir + "/utt2spk", num_replicas, include_original, prefix, field = [0,1])
@@ -447,7 +446,7 @@ def CreateReverberatedCopy(input_dir,
if os.path.isfile(input_dir + "/reco2file_and_channel"):
AddPrefixToFields(input_dir + "/reco2file_and_channel", output_dir + "/reco2file_and_channel", num_replicas, include_original, prefix, field = [0,1])

data_lib.RunKaldiCommand("utils/validate_data_dir.sh --no-feats {output_dir}"
data_lib.RunKaldiCommand("utils/validate_data_dir.sh --no-feats --no-text {output_dir}"
.format(output_dir = output_dir))


@@ -507,7 +506,7 @@ def ParseSetParameterStrings(set_para_array):
return SmoothProbabilityDistribution(set_list)


# This function creates the RIR list
# Each rir object in the list contains the following attributes:
# rir_id, room_id, receiver_position_id, source_position_id, rt60, drr, probability
# Please refer to the help messages in the parser for the meaning of these attributes
@@ -521,7 +520,7 @@ def ParseRirList(rir_set_para_array, smoothing_weight, sampling_rate = None):
rir_parser.add_argument('--drr', type=float, default=None, help='Direct-to-reverberant-ratio of the impulse response.')
rir_parser.add_argument('--cte', type=float, default=None, help='Early-to-late index of the impulse response.')
rir_parser.add_argument('--probability', type=float, default=None, help='probability of the impulse response.')
rir_parser.add_argument('rir_rspecifier', type=str, help="""rir rspecifier, it can be either a filename or a piped command.
E.g. data/impulses/Room001-00001.wav or "sox data/impulses/Room001-00001.wav -t wav - |" """)

set_list = ParseSetParameterStrings(rir_set_para_array)
@@ -569,7 +568,7 @@ def MakeRoomDict(rir_list):
return room_dict


# This function creates the point-source noise list
# and the isotropic noise dictionary from the noise information file
# The isotropic noise dictionary is indexed by the room
# and its value is the corresponding isotropic noise list
@@ -596,7 +595,7 @@ def ParseNoiseList(noise_set_para_array, smoothing_weight, sampling_rate = None)
current_noise_list = map(lambda x: noise_parser.parse_args(shlex.split(x.strip())),open(noise_set.filename))
current_pointsource_noise_list = []
for noise in current_noise_list:
if sampling_rate is not None:
# check if the rspecifier is a pipe or not
if len(noise.noise_rspecifier.split()) == 1:
noise.noise_rspecifier = "sox {0} -r {1} -t wav - |".format(noise.noise_rspecifier, sampling_rate)
@@ -615,11 +614,11 @@ def ParseNoiseList(noise_set_para_array, smoothing_weight, sampling_rate = None)

pointsource_noise_list += SmoothProbabilityDistribution(current_pointsource_noise_list, smoothing_weight, noise_set.probability)

# ensure the point-source noise probabilities sum to 1
pointsource_noise_list = SmoothProbabilityDistribution(pointsource_noise_list, smoothing_weight, 1.0)
if len(pointsource_noise_list) > 0:
assert almost_equal(sum(noise.probability for noise in pointsource_noise_list), 1.0)

# ensure the isotropic noise source probabilities for a given room sum to 1
for key in iso_noise_dict.keys():
iso_noise_dict[key] = SmoothProbabilityDistribution(iso_noise_dict[key])
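augment_data_dir.py above imports ParseFileToDict and WriteDictToFile from this script. As a reference for the expected table format, here is a minimal re-implementation of the reader, under the assumption of standard whitespace-separated Kaldi tables whose first column is the key (the default value_processor takes the first value field):

def parse_file_to_dict(path, value_processor=lambda x: x[0]):
    # Each line is "<key> <value> [<value> ...]"; mirrors ParseFileToDict.
    table = {}
    with open(path) as f:
        for line in f:
            parts = line.split()
            table[parts[0]] = value_processor(parts[1:])
    return table

# e.g. durations = parse_file_to_dict("data/train/reco2dur",
#                                     value_processor=lambda x: float(x[0]))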
11 changes: 7 additions & 4 deletions egs/wsj/s5/steps/libs/nnet3/xconfig/stats_layer.py
@@ -64,9 +64,12 @@ def set_derived_configs(self):
self._stats_period = int(m.group(4))
self._right_context = int(m.group(5))

output_dim = (self.descriptors['input']['dim']
* (2 if self._output_stddev else 1)
+ 1 if self._output_log_counts else 0)
if self._output_stddev:
output_dim = 2 * self.descriptors['input']['dim']
else:
output_dim = self.descriptors['input']['dim']
if self._output_log_counts:
output_dim = output_dim + 1

if self.config['dim'] > 0 and self.config['dim'] != output_dim:
raise RuntimeError(
@@ -76,7 +79,7 @@ def check_configs(self):
self.config['dim'] = output_dim

def check_configs(self):
if not (self._left_context > 0 and self._right_context > 0
if not (self._left_context >= 0 and self._right_context >= 0
and self._input_period > 0 and self._stats_period > 0
and self._left_context % self._stats_period == 0
and self._right_context % self._stats_period == 0
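The rewritten block above fixes an operator-precedence bug: in Python, a conditional expression binds more loosely than +, so the old one-expression form evaluated to 0 whenever log-counts were disabled. A short sketch of the intended arithmetic, with a hypothetical input dimension:

input_dim = 512            # hypothetical input dim
output_stddev = True       # appending stddevs doubles the dim
output_log_counts = False  # appending the log-count adds one

# Old: output_dim = (input_dim * (2 if output_stddev else 1)
#                    + 1 if output_log_counts else 0)
# parses as (... + 1) if output_log_counts else 0, i.e. 0 here.
output_dim = input_dim * (2 if output_stddev else 1)
if output_log_counts:
    output_dim += 1
assert output_dim == 1024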
2 changes: 1 addition & 1 deletion egs/wsj/s5/utils/combine_data.sh
@@ -94,7 +94,7 @@ else
echo "$0 [info]: not combining segments as it does not exist"
fi

for file in utt2spk utt2lang utt2dur feats.scp text cmvn.scp reco2file_and_channel wav.scp spk2gender $extra_files; do
for file in utt2spk utt2lang utt2dur feats.scp text cmvn.scp vad.scp reco2file_and_channel wav.scp spk2gender $extra_files; do
exists_somewhere=false
absent_somewhere=false
for d in $*; do
4 changes: 4 additions & 0 deletions egs/wsj/s5/utils/copy_data_dir.sh
@@ -7,6 +7,7 @@
# that contains some subset of the following files:
# feats.scp
# wav.scp
# vad.scp
# spk2utt
# utt2spk
# text
@@ -79,6 +80,9 @@ if [ -f $srcdir/feats.scp ]; then
utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/feats.scp >$destdir/feats.scp
fi

if [ -f $srcdir/vad.scp ]; then
utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/vad.scp >$destdir/vad.scp
fi

if [ -f $srcdir/segments ]; then
utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/segments >$destdir/segments
4 changes: 2 additions & 2 deletions src/makefiles/default_rules.mk
@@ -33,11 +33,11 @@ $(LIBFILE): $(OBJFILES)
ifeq ($(KALDI_FLAVOR), dynamic)
ifeq ($(shell uname), Darwin)
$(CXX) -dynamiclib -o $@ -install_name @rpath/$@ $(LDFLAGS) $(OBJFILES) $(LDLIBS)
rm -f $(KALDILIBDIR)/$@; ln -s $(shell pwd)/$@ $(KALDILIBDIR)/$@
ln -sf $(shell pwd)/$@ $(KALDILIBDIR)/$@
else ifeq ($(shell uname), Linux)
# Building shared library from static (static was compiled with -fPIC)
$(CXX) -shared -o $@ -Wl,--no-undefined -Wl,--as-needed -Wl,-soname=$@,--whole-archive $(LIBNAME).a -Wl,--no-whole-archive $(LDFLAGS) $(LDLIBS)
rm -f $(KALDILIBDIR)/$@; ln -s $(shell pwd)/$@ $(KALDILIBDIR)/$@
ln -sf $(shell pwd)/$@ $(KALDILIBDIR)/$@
else # Platform not supported
$(error Dynamic libraries not supported on this platform. Run configure with --static flag.)
endif
2 changes: 1 addition & 1 deletion src/nnet3bin/Makefile
@@ -18,7 +18,7 @@ BINFILES = nnet3-init nnet3-info nnet3-get-egs nnet3-copy-egs nnet3-subset-egs \
nnet3-discriminative-compute-objf nnet3-discriminative-train \
nnet3-discriminative-subset-egs nnet3-get-egs-simple \
nnet3-discriminative-compute-from-egs nnet3-latgen-faster-looped \
nnet3-egs-augment-image
nnet3-egs-augment-image nnet3-xvector-get-egs nnet3-xvector-compute

OBJFILES =

211 changes: 211 additions & 0 deletions src/nnet3bin/nnet3-xvector-compute.cc
@@ -0,0 +1,211 @@
// nnet3bin/nnet3-xvector-compute.cc

// Copyright 2017 Johns Hopkins University (author: Daniel Povey)
// 2017 Johns Hopkins University (author: Daniel Garcia-Romero)
// 2017 David Snyder

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.


#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "nnet3/nnet-am-decodable-simple.h"
#include "base/timer.h"
#include "nnet3/nnet-utils.h"

namespace kaldi {
namespace nnet3 {

// Computes an xvector from a chunk of speech features.
static void RunNnetComputation(const MatrixBase<BaseFloat> &features,
const Nnet &nnet, CachingOptimizingCompiler *compiler,
Vector<BaseFloat> *xvector) {
ComputationRequest request;
request.need_model_derivative = false;
request.store_component_stats = false;
request.inputs.push_back(
IoSpecification("input", 0, features.NumRows()));
IoSpecification output_spec;
output_spec.name = "output";
output_spec.has_deriv = false;
output_spec.indexes.resize(1);
request.outputs.resize(1);
request.outputs[0].Swap(&output_spec);
const NnetComputation *computation = compiler->Compile(request);
Nnet *nnet_to_update = NULL; // we're not doing any update.
NnetComputer computer(NnetComputeOptions(), *computation,
nnet, nnet_to_update);
CuMatrix<BaseFloat> input_feats_cu(features);
computer.AcceptInput("input", &input_feats_cu);
computer.Run();
CuMatrix<BaseFloat> cu_output;
computer.GetOutputDestructive("output", &cu_output);
xvector->Resize(cu_output.NumCols());
xvector->CopyFromVec(cu_output.Row(0));
}

} // namespace nnet3
} // namespace kaldi

int main(int argc, char *argv[]) {
try {
using namespace kaldi;
using namespace kaldi::nnet3;
typedef kaldi::int32 int32;
typedef kaldi::int64 int64;

const char *usage =
"Propagate features through an xvector neural network model and write\n"
"the output vectors. \"Xvector\" is our term for a vector or\n"
"embedding which is the output of a particular type of neural network\n"
"architecture found in speaker recognition. This architecture\n"
"consists of several layers that operate on frames, a statistics\n"
"pooling layer that aggregates over the frame-level representations\n"
"and possibly additional layers that operate on segment-level\n"
"representations. The xvectors are generally extracted from an\n"
"output layer after the statistics pooling layer. By default, one\n"
"xvector is extracted directly from the set of features for each\n"
"utterance. Optionally, xvectors are extracted from chunks of input\n"
"features and averaged, to produce a single vector.\n"
"\n"
"Usage: nnet3-xvector-compute [options] <raw-nnet-in> "
"<features-rspecifier> <vector-wspecifier>\n"
"e.g.: nnet3-xvector-compute final.raw scp:feats.scp "
"ark:nnet_prediction.ark\n"
"See also: nnet3-compute\n";

ParseOptions po(usage);
Timer timer;

NnetSimpleComputationOptions opts;
opts.acoustic_scale = 1.0; // by default do no scaling in this recipe.

std::string use_gpu = "no";
int32 chunk_size = -1,
min_chunk_size = 100;

opts.Register(&po);
po.Register("use-gpu", &use_gpu,
"yes|no|optional|wait, only has effect if compiled with CUDA");
po.Register("chunk-size", &chunk_size,
"If set, extracts xectors from specified chunk-size, and averages. "
"If not set, extracts an xvector from all available features.");
po.Register("min-chunk-size", &min_chunk_size,
"Minimum chunk-size allowed when extracting xvectors.");

po.Read(argc, argv);

if (po.NumArgs() != 3) {
po.PrintUsage();
exit(1);
}

#if HAVE_CUDA==1
CuDevice::Instantiate().SelectGpuId(use_gpu);
#endif

std::string nnet_rxfilename = po.GetArg(1),
feature_rspecifier = po.GetArg(2),
vector_wspecifier = po.GetArg(3);

Nnet nnet;
ReadKaldiObject(nnet_rxfilename, &nnet);
SetBatchnormTestMode(true, &nnet);
SetDropoutTestMode(true, &nnet);
CollapseModel(CollapseModelConfig(), &nnet);

CachingOptimizingCompiler compiler(nnet, opts.optimize_config);

BaseFloatVectorWriter vector_writer(vector_wspecifier);

int32 num_success = 0, num_fail = 0;
int64 frame_count = 0;
int32 xvector_dim = nnet.OutputDim("output");

SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);

for (; !feature_reader.Done(); feature_reader.Next()) {
std::string utt = feature_reader.Key();
const Matrix<BaseFloat> &features (feature_reader.Value());
if (features.NumRows() == 0) {
KALDI_WARN << "Zero-length utterance: " << utt;
num_fail++;
continue;
}
int32 num_rows = features.NumRows(),
feat_dim = features.NumCols(),
this_chunk_size = chunk_size;

if (num_rows < min_chunk_size) {
KALDI_WARN << "Minimum chunk size of " << min_chunk_size
<< " is greater than the number of rows "
<< "in utterance: " << utt;
num_fail++;
continue;
} else if (num_rows < chunk_size) {
KALDI_LOG << "Chunk size of " << chunk_size << " is greater than "
<< "the number of rows in utterance: " << utt
<< ", using chunk size of " << num_rows;
this_chunk_size = num_rows;
} else if (chunk_size == -1) {
this_chunk_size = num_rows;
}

int32 num_chunks = ceil(
num_rows / static_cast<BaseFloat>(this_chunk_size));
Vector<BaseFloat> xvector_avg(xvector_dim, kSetZero);
BaseFloat tot_weight = 0.0;

// Iterate over the feature chunks.
for (int32 chunk_indx = 0; chunk_indx < num_chunks; chunk_indx++) {
// The final chunk may contain fewer than this_chunk_size frames;
// use however many frames remain, and skip the chunk entirely if
// it is shorter than min_chunk_size.
int32 offset = std::min(
this_chunk_size, num_rows - chunk_indx * this_chunk_size);
if (offset < min_chunk_size)
continue;
SubMatrix<BaseFloat> sub_features(
features, chunk_indx * this_chunk_size, offset, 0, feat_dim);
Vector<BaseFloat> xvector;
tot_weight += offset;
RunNnetComputation(sub_features, nnet, &compiler, &xvector);
xvector_avg.AddVec(offset, xvector);
}
xvector_avg.Scale(1.0 / tot_weight);
vector_writer.Write(utt, xvector_avg);

frame_count += features.NumRows();
num_success++;
}

#if HAVE_CUDA==1
CuDevice::Instantiate().PrintProfile();
#endif
double elapsed = timer.Elapsed();
KALDI_LOG << "Time taken "<< elapsed
<< "s: real-time factor assuming 100 frames/sec is "
<< (elapsed*100.0/frame_count);
KALDI_LOG << "Done " << num_success << " utterances, failed for "
<< num_fail;

if (num_success != 0) return 0;
else return 1;
} catch(const std::exception &e) {
std::cerr << e.what();
return -1;
}
}
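As a cross-check of the chunking logic in the binary above, here is a small Python sketch of the same bookkeeping: fixed-size chunks, a possibly shorter final chunk that is dropped if it falls below the minimum, and a frame-count-weighted average. extract_xvector is a hypothetical stand-in for RunNnetComputation.

def average_chunk_xvectors(num_rows, chunk_size, min_chunk_size,
                           extract_xvector):
    # extract_xvector(start, length) stands in for RunNnetComputation;
    # it returns the xvector as a list of floats.
    num_chunks = (num_rows + chunk_size - 1) // chunk_size  # ceil
    avg, tot_weight = None, 0.0
    for i in range(num_chunks):
        # The final chunk may be shorter than chunk_size.
        offset = min(chunk_size, num_rows - i * chunk_size)
        if offset < min_chunk_size:  # too short to be reliable; skip
            continue
        xvec = extract_xvector(i * chunk_size, offset)
        weighted = [offset * v for v in xvec]
        avg = weighted if avg is None else \
            [a + w for a, w in zip(avg, weighted)]
        tot_weight += offset
    # Assumes at least one chunk met the minimum size.
    return [a / tot_weight for a in avg]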
229 changes: 229 additions & 0 deletions src/nnet3bin/nnet3-xvector-get-egs.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
// nnet3bin/nnet3-xvector-get-egs.cc

// Copyright 2016-2017 Johns Hopkins University (author: Daniel Povey)
// 2016-2017 Johns Hopkins University (author: Daniel Garcia-Romero)
// 2016-2017 David Snyder

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

#include <sstream>
#include "util/common-utils.h"
#include "nnet3/nnet-example.h"

namespace kaldi {
namespace nnet3 {

// A struct for holding information about the position and
// duration of each chunk.
struct ChunkInfo {
std::string name;
int32 output_archive_id;
int32 start_frame;
int32 num_frames;
int32 label;
};

// Process the range input file and store it as a map from utterance
// name to vector of ChunkInfo structs.
static void ProcessRangeFile(const std::string &range_rxfilename,
unordered_map<std::string, std::vector<ChunkInfo *> > *utt_to_chunks) {
Input range_input(range_rxfilename);
if (!range_rxfilename.empty()) {
std::string line;
while (std::getline(range_input.Stream(), line)) {
ChunkInfo *chunk_info = new ChunkInfo();
std::vector<std::string> fields;
SplitStringToVector(line, " \t\n\r", true, &fields);
if (fields.size() != 6)
KALDI_ERR << "Expected 6 fields in line of range file, got "
<< fields.size() << " instead.";

std::string utt = fields[0],
start_frame_str = fields[3],
num_frames_str = fields[4],
label_str = fields[5];

if (!ConvertStringToInteger(fields[1], &(chunk_info->output_archive_id))
|| !ConvertStringToInteger(start_frame_str, &(chunk_info->start_frame))
|| !ConvertStringToInteger(num_frames_str, &(chunk_info->num_frames))
|| !ConvertStringToInteger(label_str, &(chunk_info->label)))
KALDI_ERR << "Expected integer for output archive in range file.";

chunk_info->name = utt + "-" + start_frame_str + "-" + num_frames_str
+ "-" + label_str;
unordered_map<std::string, std::vector<ChunkInfo*> >::iterator
got = utt_to_chunks->find(utt);

if (got == utt_to_chunks->end()) {
std::vector<ChunkInfo* > chunk_infos;
chunk_infos.push_back(chunk_info);
utt_to_chunks->insert(std::pair<std::string,
std::vector<ChunkInfo* > > (utt, chunk_infos));
} else {
got->second.push_back(chunk_info);
}
}
}
}

static void WriteExamples(const MatrixBase<BaseFloat> &feats,
const std::vector<ChunkInfo *> &chunks, const std::string &utt,
bool compress, int32 num_pdfs, int32 *num_egs_written,
std::vector<NnetExampleWriter *> *example_writers) {
for (std::vector<ChunkInfo *>::const_iterator it = chunks.begin();
it != chunks.end(); ++it) {
ChunkInfo *chunk = *it;
int32 num_rows = feats.NumRows(),
feat_dim = feats.NumCols();
if (num_rows < chunk->num_frames) {
KALDI_WARN << "Unable to create examples for utterance " << utt
<< ". Requested chunk size of "
<< chunk->num_frames
<< " but utterance has only " << num_rows << " frames.";
} else {
// The requested chunk positions are approximate. It's possible
// that they slightly exceed the number of frames in the utterance.
// If that occurs, we shift the chunk's location back slightly.
int32 shift = std::min(0, num_rows - chunk->start_frame
- chunk->num_frames);
SubMatrix<BaseFloat> chunk_mat(feats, chunk->start_frame + shift,
chunk->num_frames, 0, feat_dim);
NnetIo nnet_input = NnetIo("input", 0, chunk_mat);
for (std::vector<Index>::iterator indx_it = nnet_input.indexes.begin();
indx_it != nnet_input.indexes.end(); ++indx_it)
indx_it->n = 0;

Posterior label;
std::vector<std::pair<int32, BaseFloat> > post;
post.push_back(std::pair<int32, BaseFloat>(chunk->label, 1.0));
label.push_back(post);
NnetExample eg;
eg.io.push_back(nnet_input);
eg.io.push_back(NnetIo("output", num_pdfs, 0, label));
if (compress)
eg.Compress();

if (chunk->output_archive_id >= example_writers->size())
KALDI_ERR << "Requested output index exceeds number of specified "
<< "output files.";
(*example_writers)[chunk->output_archive_id]->Write(
chunk->name, eg);
(*num_egs_written) += 1;
}
}
}

} // namespace nnet3
} // namespace kaldi

int main(int argc, char *argv[]) {
try {
using namespace kaldi;
using namespace kaldi::nnet3;
typedef kaldi::int32 int32;

const char *usage =
"Get examples for training an nnet3 neural network for the xvector\n"
"system. Each output example contains a chunk of features from some\n"
"utterance along with a speaker label. The location and length of\n"
"the feature chunks are specified in the 'ranges' file. Each line\n"
"is interpreted as follows:\n"
" <source-utterance> <relative-output-archive-index> "
"<absolute-archive-index> <start-frame-index> <num-frames> "
"<speaker-label>\n"
"where <relative-output-archive-index> is interpreted as a zero-based\n"
"index into the wspecifiers provided on the command line (<egs-0-out>\n"
"and so on), and <absolute-archive-index> is ignored by this program.\n"
"For example:\n"
" utt1 3 13 65 300 3\n"
" utt1 0 10 50 400 3\n"
" utt2 ...\n"
"\n"
"Usage: nnet3-xvector-get-egs [options] <ranges-filename> "
"<features-rspecifier> <egs-0-out> <egs-1-out> ... <egs-N-1-out>\n"
"\n"
"For example:\n"
"nnet3-xvector-get-egs ranges.1 \"$feats\" ark:egs_temp.1.ark"
" ark:egs_temp.2.ark ark:egs_temp.3.ark\n";

bool compress = true;
int32 num_pdfs = -1;

ParseOptions po(usage);
po.Register("compress", &compress, "If true, write egs in "
"compressed format.");
po.Register("num-pdfs", &num_pdfs, "Number of speakers in the training "
"list.");

po.Read(argc, argv);

if (po.NumArgs() < 3) {
po.PrintUsage();
exit(1);
}

std::string range_rspecifier = po.GetArg(1),
feature_rspecifier = po.GetArg(2);
std::vector<NnetExampleWriter *> example_writers;

for (int32 i = 3; i <= po.NumArgs(); i++)
example_writers.push_back(new NnetExampleWriter(po.GetArg(i)));

unordered_map<std::string, std::vector<ChunkInfo *> > utt_to_chunks;
ProcessRangeFile(range_rspecifier, &utt_to_chunks);
SequentialBaseFloatMatrixReader feat_reader(feature_rspecifier);

int32 num_done = 0,
num_err = 0,
num_egs_written = 0;

for (; !feat_reader.Done(); feat_reader.Next()) {
std::string key = feat_reader.Key();
const Matrix<BaseFloat> &feats = feat_reader.Value();
unordered_map<std::string, std::vector<ChunkInfo*> >::iterator
got = utt_to_chunks.find(key);
if (got == utt_to_chunks.end()) {
KALDI_WARN << "Could not create examples from utterance "
<< key << " because it has no entry in the ranges "
<< "input file.";
num_err++;
} else {
std::vector<ChunkInfo *> chunks = got->second;
WriteExamples(feats, chunks, key, compress, num_pdfs,
&num_egs_written, &example_writers);
num_done++;
}
}

// Free memory
for (unordered_map<std::string, std::vector<ChunkInfo*> >::iterator
map_it = utt_to_chunks.begin();
map_it != utt_to_chunks.end(); ++map_it) {
DeletePointers(&map_it->second);
}
DeletePointers(&example_writers);

KALDI_LOG << "Finished generating examples, "
<< "successfully processed " << num_done
<< " feature files, wrote " << num_egs_written << " examples; "
<< num_err << " files had errors.";
return (num_egs_written == 0 || num_err > num_done ? 1 : 0);
} catch(const std::exception &e) {
std::cerr << e.what() << '\n';
return -1;
}
}
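For reference, here is a minimal parser for one line of the ranges file consumed above, using the field layout from the usage message. This is illustrative only and not part of the PR:

def parse_range_line(line):
    # "<utt> <rel-archive-idx> <abs-archive-idx> <start> <num-frames> <label>"
    fields = line.split()
    assert len(fields) == 6, "expected 6 fields in a ranges line"
    return {
        "utt": fields[0],
        "output_archive_id": int(fields[1]),  # zero-based wspecifier index
        # fields[2] (absolute archive index) is ignored by the binary
        "start_frame": int(fields[3]),
        "num_frames": int(fields[4]),
        "label": int(fields[5]),
    }

print(parse_range_line("utt1 3 13 65 300 3"))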