Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: New self repair in Sigmoid/Tanh Component #1236

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
16 changes: 14 additions & 2 deletions src/nnet3/nnet-component-itf.cc
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,8 @@ std::string NonlinearComponent::Info() const {
stream << ", self-repair-upper-threshold=" << self_repair_upper_threshold_;
if (self_repair_scale_ != 0.0)
stream << ", self-repair-scale=" << self_repair_scale_;
if (self_repair_margin_threshold_ != BaseFloat(kUnsetThreshold))
stream << ", self-repair-margin-threshold=" << self_repair_margin_threshold_;
if (count_ > 0 && value_sum_.Dim() == dim_ && deriv_sum_.Dim() == dim_) {
stream << ", count=" << std::setprecision(3) << count_
<< std::setprecision(6);
Expand Down Expand Up @@ -399,6 +401,10 @@ void NonlinearComponent::Read(std::istream &is, bool binary) {
ReadBasicType(is, binary, &self_repair_scale_);
ReadToken(is, binary, &token);
}
if (token == "<SelfRepairMarginThreshold>") {
ReadBasicType(is, binary, &self_repair_margin_threshold_);
ReadToken(is, binary, &token);
}
if (token != ostr_end.str()) {
KALDI_ERR << "Expected token " << ostr_end.str()
<< ", got " << token;
Expand Down Expand Up @@ -442,6 +448,10 @@ void NonlinearComponent::Write(std::ostream &os, bool binary) const {
WriteToken(os, binary, "<SelfRepairScale>");
WriteBasicType(os, binary, self_repair_scale_);
}
if (self_repair_margin_threshold_ != kUnsetThreshold) {
WriteToken(os, binary, "<SelfRepairMarginThreshold>");
WriteBasicType(os, binary, self_repair_margin_threshold_);
}
WriteToken(os, binary, ostr_end.str());
}

Expand All @@ -450,7 +460,7 @@ NonlinearComponent::NonlinearComponent():
num_dims_self_repaired_(0.0), num_dims_processed_(0.0),
self_repair_lower_threshold_(kUnsetThreshold),
self_repair_upper_threshold_(kUnsetThreshold),
self_repair_scale_(0.0) { }
self_repair_scale_(0.0), self_repair_margin_threshold_(kUnsetThreshold) { }

NonlinearComponent::NonlinearComponent(const NonlinearComponent &other):
dim_(other.dim_), value_sum_(other.value_sum_), deriv_sum_(other.deriv_sum_),
Expand All @@ -459,13 +469,15 @@ NonlinearComponent::NonlinearComponent(const NonlinearComponent &other):
num_dims_processed_(other.num_dims_processed_),
self_repair_lower_threshold_(other.self_repair_lower_threshold_),
self_repair_upper_threshold_(other.self_repair_upper_threshold_),
self_repair_scale_(other.self_repair_scale_) { }
self_repair_scale_(other.self_repair_scale_),
self_repair_margin_threshold_(other.self_repair_margin_threshold_) { }

void NonlinearComponent::InitFromConfig(ConfigLine *cfl) {
bool ok = cfl->GetValue("dim", &dim_);
cfl->GetValue("self-repair-lower-threshold", &self_repair_lower_threshold_);
cfl->GetValue("self-repair-upper-threshold", &self_repair_upper_threshold_);
cfl->GetValue("self-repair-scale", &self_repair_scale_);
cfl->GetValue("self-repair-margin-threshold", &self_repair_margin_threshold_);
if (!ok || cfl->HasUnusedValues() || dim_ <= 0)
KALDI_ERR << "Invalid initializer for layer of type "
<< Type() << ": \"" << cfl->WholeLine() << "\"";
Expand Down
1 change: 1 addition & 0 deletions src/nnet3/nnet-component-itf.h
Original file line number Diff line number Diff line change
Expand Up @@ -544,6 +544,7 @@ class NonlinearComponent: public Component {
BaseFloat self_repair_lower_threshold_;
BaseFloat self_repair_upper_threshold_;
BaseFloat self_repair_scale_;
BaseFloat self_repair_margin_threshold_;

// The mutex is used in UpdateStats, only for resizing vectors.
Mutex mutex_;
Expand Down
134 changes: 62 additions & 72 deletions src/nnet3/nnet-simple-component.cc
Original file line number Diff line number Diff line change
Expand Up @@ -532,18 +532,16 @@ void SigmoidComponent::RepairGradients(
CuMatrixBase<BaseFloat> *in_deriv,
SigmoidComponent *to_update) const {
KALDI_ASSERT(to_update != NULL);
// maximum possible derivative of SigmoidComponent is 0.25.
// the default lower-threshold on the derivative, below which we
// add a term to the derivative to encourage the inputs to the sigmoid
// to be closer to zero, is 0.05, which means the derivative is on average
// 5 times smaller than its maximum possible value.
BaseFloat default_lower_threshold = 0.05;
// minimum margin between the actual output value and the asymptotic
// output value of sigmoid (1 and 0), below which we add a term to the
// derivative to encourage the inputs to the sigmoid to get closer to zero.
BaseFloat default_margin_threshold = 0.05;

// we use this 'repair_probability' (hardcoded for now) to limit
// this code to running on about half of the minibatches.
BaseFloat repair_probability = 0.5;

to_update->num_dims_processed_ += dim_;
to_update->num_dims_processed_ += out_value.NumRows() * dim_;

if (self_repair_scale_ == 0.0 || count_ == 0.0 || deriv_sum_.Dim() != dim_ ||
RandUniform() > repair_probability)
Expand All @@ -552,28 +550,31 @@ void SigmoidComponent::RepairGradients(
// check that the self-repair scale is in a reasonable range.
KALDI_ASSERT(self_repair_scale_ > 0.0 && self_repair_scale_ < 0.1);
BaseFloat unset = kUnsetThreshold; // -1000.0
BaseFloat lower_threshold = (self_repair_lower_threshold_ == unset ?
default_lower_threshold :
self_repair_lower_threshold_) *
count_;
if (self_repair_upper_threshold_ != unset) {
KALDI_ERR << "Do not set the self-repair-upper-threshold for sigmoid "
<< "components, it does nothing.";
}

// thresholds_vec is actually a 1-row matrix. (the ApplyHeaviside
// function isn't defined for vectors).
CuMatrix<BaseFloat> thresholds(1, dim_);
CuSubVector<BaseFloat> thresholds_vec(thresholds, 0);
thresholds_vec.AddVec(-1.0, deriv_sum_);
thresholds_vec.Add(lower_threshold);
thresholds.ApplyHeaviside();
to_update->num_dims_self_repaired_ += thresholds_vec.Sum();
BaseFloat margin_threshold = (self_repair_margin_threshold_ == unset ?
default_margin_threshold :
self_repair_margin_threshold_);

// repair_mat = out_value * 2 - 1
// sign_mat = sign(repair_mat), i.e., an element in sign_mat is 1
// if its corresponding element in repair_mat > 0, or -1 otherwise
CuMatrix<BaseFloat> repair_mat(out_value);
repair_mat.Scale(2.0);
repair_mat.Add(-1.0);
CuMatrix<BaseFloat> sign_mat(repair_mat);
sign_mat.ApplyHeaviside();
sign_mat.Scale(2.0);
sign_mat.Add(-1.0);

// At this point, 'thresholds_vec' contains a 1 for each dimension of
// the output that is 'problematic', i.e. for which the avg-deriv
// is less than the self-repair lower threshold, and a 0 for
// each dimension that is not problematic.
repair_mat.ApplyPowAbs(1.0);
repair_mat.Add(-(1.0 - margin_threshold * 2.0)); // as out_value was also scaled by 2
CuMatrix<BaseFloat> mask(out_value.NumRows(), out_value.NumCols());
mask.Heaviside(repair_mat);
to_update->num_dims_self_repaired_ += mask.Sum();
repair_mat.ApplyFloor(0.0);
repair_mat.MulElements(sign_mat);
// rescale repair_mat so that the absolute values of its elements are in the
// range [0.0, 1.0]
repair_mat.Scale(1.0 / (margin_threshold * 2.0));

// what we want to do is to add
// -self_repair_scale_ / repair_probability times (2 * output-value - 1.0)
Expand All @@ -587,16 +588,8 @@ void SigmoidComponent::RepairGradients(
// for inputs < 0 and negative for inputs > 0.

// We can rearrange the above as: for only the problematic columns,
// input-deriv -= 2 * self-repair-scale / repair-probabilty * output
// input-deriv += self-repair-scale / repair-probabilty
// which we can write as:
// input-deriv -= 2 * self-repair-scale / repair-probabilty * output * thresholds-vec
// input-deriv += self-repair-scale / repair-probabilty * thresholds-vec

in_deriv->AddMatDiagVec(-2.0 * self_repair_scale_ / repair_probability,
out_value, kNoTrans, thresholds_vec);
in_deriv->AddVecToRows(self_repair_scale_ / repair_probability,
thresholds_vec);
// input-deriv -= self-repair-scale / repair-probability * repair_mat
in_deriv->AddMat(-self_repair_scale_ / repair_probability, repair_mat);
}


Expand Down Expand Up @@ -950,18 +943,16 @@ void TanhComponent::RepairGradients(
CuMatrixBase<BaseFloat> *in_deriv,
TanhComponent *to_update) const {
KALDI_ASSERT(to_update != NULL);
// maximum possible derivative of SigmoidComponent is 1.0
// the default lower-threshold on the derivative, below which we
// add a term to the derivative to encourage the inputs to the sigmoid
// to be closer to zero, is 0.2, which means the derivative is on average
// 5 times smaller than its maximum possible value.
BaseFloat default_lower_threshold = 0.2;
// minimum margin between the actual output value and the asymptotic
// output value of tanh (1 and -1), below which we add a term to the
// derivative to encourage the inputs to the tanh to get closer to zero.
BaseFloat default_margin_threshold = 0.1;

// we use this 'repair_probability' (hardcoded for now) to limit
// this code to running on about half of the minibatches.
BaseFloat repair_probability = 0.5;

to_update->num_dims_processed_ += dim_;
to_update->num_dims_processed_ += out_value.NumRows() * dim_;

if (self_repair_scale_ == 0.0 || count_ == 0.0 || deriv_sum_.Dim() != dim_ ||
RandUniform() > repair_probability)
Expand All @@ -970,28 +961,31 @@ void TanhComponent::RepairGradients(
// check that the self-repair scale is in a reasonable range.
KALDI_ASSERT(self_repair_scale_ > 0.0 && self_repair_scale_ < 0.1);
BaseFloat unset = kUnsetThreshold; // -1000.0
BaseFloat lower_threshold = (self_repair_lower_threshold_ == unset ?
default_lower_threshold :
self_repair_lower_threshold_) *
count_;
if (self_repair_upper_threshold_ != unset) {
KALDI_ERR << "Do not set the self-repair-upper-threshold for sigmoid "
<< "components, it does nothing.";
}
BaseFloat margin_threshold = (self_repair_margin_threshold_ == unset ?
default_margin_threshold :
self_repair_margin_threshold_);

// thresholds_vec is actually a 1-row matrix. (the ApplyHeaviside
// function isn't defined for vectors).
CuMatrix<BaseFloat> thresholds(1, dim_);
CuSubVector<BaseFloat> thresholds_vec(thresholds, 0);
thresholds_vec.AddVec(-1.0, deriv_sum_);
thresholds_vec.Add(lower_threshold);
thresholds.ApplyHeaviside();
to_update->num_dims_self_repaired_ += thresholds_vec.Sum();
// sign_mat = sign(out_value), i.e.,
// An element in sign_mat is 1 if its corresponding element in out_value > 0,
// or -1 otherwise
CuMatrix<BaseFloat> sign_mat(out_value);
sign_mat.ApplyHeaviside();
sign_mat.Scale(2.0);
sign_mat.Add(-1.0);

// At this point, 'thresholds_vec' contains a 1 for each dimension of
// the output that is 'problematic', i.e. for which the avg-deriv
// is less than the self-repair lower threshold, and a 0 for
// each dimension that is not problematic.
// repair_mat =
// floor(abs(out_value) - (1 - margin_threshold), 0) .* sign(out_value)
CuMatrix<BaseFloat> repair_mat(out_value);
repair_mat.ApplyPowAbs(1.0);
repair_mat.Add(-(1.0 - margin_threshold));
CuMatrix<BaseFloat> mask(out_value.NumRows(), out_value.NumCols());
mask.Heaviside(repair_mat);
to_update->num_dims_self_repaired_ += mask.Sum();
repair_mat.ApplyFloor(0.0);
repair_mat.MulElements(sign_mat);
// rescales repair_mat so that the absolute values of its elements are in the
// range [0.0,1.0]
repair_mat.Scale(1.0 / margin_threshold);

// what we want to do is to add (-self_repair_scale_ / repair_probability times
// output-value) to the input derivative for each problematic dimension.
Expand All @@ -1002,13 +996,9 @@ void TanhComponent::RepairGradients(
// available. We could use just about any function that is positive for
// inputs < 0 and negative for inputs > 0.

// We can rearrange the above as: for only the problematic columns,
// input-deriv -= self-repair-scale / repair-probabilty * output
// which we can write as:
// input-deriv -= self-repair-scale / repair-probabilty * output * thresholds-vec

in_deriv->AddMatDiagVec(-self_repair_scale_ / repair_probability,
out_value, kNoTrans, thresholds_vec);
// We can rearrange the above as:
// input-deriv -= self-repair-scale / repair-probability * repair_mat
in_deriv->AddMat(-self_repair_scale_ / repair_probability, repair_mat);
}

void TanhComponent::Backprop(const std::string &debug_info,
Expand Down