kaldi-asr · freewym · Nov 21, 2016 · Nov 26, 2016 · Nov 30, 2016
diff --git a/src/nnet3/nnet-component-itf.cc b/src/nnet3/nnet-component-itf.cc
@@ -319,6 +319,8 @@ std::string NonlinearComponent::Info() const {
     stream << ", self-repair-upper-threshold=" << self_repair_upper_threshold_;
   if (self_repair_scale_ != 0.0)
     stream << ", self-repair-scale=" << self_repair_scale_;
+  if (self_repair_margin_threshold_ != BaseFloat(kUnsetThreshold))
+    stream << ", self-repair-margin-threshold=" << self_repair_margin_threshold_;
   if (count_ > 0 && value_sum_.Dim() == dim_ &&  deriv_sum_.Dim() == dim_) {
     stream << ", count=" << std::setprecision(3) << count_
            << std::setprecision(6);
@@ -399,6 +401,10 @@ void NonlinearComponent::Read(std::istream &is, bool binary) {
     ReadBasicType(is, binary, &self_repair_scale_);
     ReadToken(is, binary, &token);
   }
+  if (token == "<SelfRepairMarginThreshold>") {
+    ReadBasicType(is, binary, &self_repair_margin_threshold_);
+    ReadToken(is, binary, &token);
+  }
   if (token != ostr_end.str()) {
     KALDI_ERR << "Expected token " << ostr_end.str()
               << ", got " << token;
@@ -442,6 +448,10 @@ void NonlinearComponent::Write(std::ostream &os, bool binary) const {
     WriteToken(os, binary, "<SelfRepairScale>");
     WriteBasicType(os, binary, self_repair_scale_);
   }
+  if (self_repair_margin_threshold_ != kUnsetThreshold) {
+    WriteToken(os, binary, "<SelfRepairMarginThreshold>");
+    WriteBasicType(os, binary, self_repair_margin_threshold_);
+  }
   WriteToken(os, binary, ostr_end.str());
 }
 
@@ -450,7 +460,7 @@ NonlinearComponent::NonlinearComponent():
     num_dims_self_repaired_(0.0), num_dims_processed_(0.0),
     self_repair_lower_threshold_(kUnsetThreshold),
     self_repair_upper_threshold_(kUnsetThreshold),
-    self_repair_scale_(0.0) { }
+    self_repair_scale_(0.0), self_repair_margin_threshold_(kUnsetThreshold) { }
 
 NonlinearComponent::NonlinearComponent(const NonlinearComponent &other):
     dim_(other.dim_), value_sum_(other.value_sum_), deriv_sum_(other.deriv_sum_),
@@ -459,13 +469,15 @@ NonlinearComponent::NonlinearComponent(const NonlinearComponent &other):
     num_dims_processed_(other.num_dims_processed_),
     self_repair_lower_threshold_(other.self_repair_lower_threshold_),
     self_repair_upper_threshold_(other.self_repair_upper_threshold_),
-    self_repair_scale_(other.self_repair_scale_) { }
+    self_repair_scale_(other.self_repair_scale_),
+    self_repair_margin_threshold_(other.self_repair_margin_threshold_) { }
 
 void NonlinearComponent::InitFromConfig(ConfigLine *cfl) {
   bool ok = cfl->GetValue("dim", &dim_);
   cfl->GetValue("self-repair-lower-threshold", &self_repair_lower_threshold_);
   cfl->GetValue("self-repair-upper-threshold", &self_repair_upper_threshold_);
   cfl->GetValue("self-repair-scale", &self_repair_scale_);
+  cfl->GetValue("self-repair-margin-threshold", &self_repair_margin_threshold_);
   if (!ok || cfl->HasUnusedValues() || dim_ <= 0)
     KALDI_ERR << "Invalid initializer for layer of type "
               << Type() << ": \"" << cfl->WholeLine() << "\"";

diff --git a/src/nnet3/nnet-component-itf.h b/src/nnet3/nnet-component-itf.h
@@ -544,6 +544,7 @@ class NonlinearComponent: public Component {
   BaseFloat self_repair_lower_threshold_;
   BaseFloat self_repair_upper_threshold_;
   BaseFloat self_repair_scale_;
+  BaseFloat self_repair_margin_threshold_;
 
   // The mutex is used in UpdateStats, only for resizing vectors.
   Mutex mutex_;

diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc
@@ -532,18 +532,16 @@ void SigmoidComponent::RepairGradients(
     CuMatrixBase<BaseFloat> *in_deriv,
     SigmoidComponent *to_update) const {
   KALDI_ASSERT(to_update != NULL);
-  // maximum possible derivative of SigmoidComponent is 0.25.
-  // the default lower-threshold on the derivative, below which we
-  // add a term to the derivative to encourage the inputs to the sigmoid
-  // to be closer to zero, is 0.05, which means the derivative is on average
-  // 5 times smaller than its maximum possible value.
-  BaseFloat default_lower_threshold = 0.05;
+  // minimum margin between the actual output value and the asymptotic
+  // output value of sigmoid (1 and 0), below which we add a term to the
+  // derivative to encourage the inputs to the sigmoid to get closer to zero.
+  BaseFloat default_margin_threshold = 0.05;
 
   // we use this 'repair_probability' (hardcoded for now) to limit
   // this code to running on about half of the minibatches.
   BaseFloat repair_probability = 0.5;
 
-  to_update->num_dims_processed_ += dim_;
+  to_update->num_dims_processed_ += out_value.NumRows() * dim_;
 
   if (self_repair_scale_ == 0.0 || count_ == 0.0 || deriv_sum_.Dim() != dim_ ||
       RandUniform() > repair_probability)
@@ -552,28 +550,31 @@ void SigmoidComponent::RepairGradients(
   // check that the self-repair scale is in a reasonable range.
   KALDI_ASSERT(self_repair_scale_ > 0.0 && self_repair_scale_ < 0.1);
   BaseFloat unset = kUnsetThreshold; // -1000.0
-  BaseFloat lower_threshold = (self_repair_lower_threshold_ == unset ?
-                               default_lower_threshold :
-                               self_repair_lower_threshold_) *
-      count_;
-  if (self_repair_upper_threshold_ != unset) {
-    KALDI_ERR << "Do not set the self-repair-upper-threshold for sigmoid "
-              << "components, it does nothing.";
-  }
-
-  // thresholds_vec is actually a 1-row matrix.  (the ApplyHeaviside
-  // function isn't defined for vectors).
-  CuMatrix<BaseFloat> thresholds(1, dim_);
-  CuSubVector<BaseFloat> thresholds_vec(thresholds, 0);
-  thresholds_vec.AddVec(-1.0, deriv_sum_);
-  thresholds_vec.Add(lower_threshold);
-  thresholds.ApplyHeaviside();
-  to_update->num_dims_self_repaired_ += thresholds_vec.Sum();
+  BaseFloat margin_threshold = (self_repair_margin_threshold_ == unset ?
+                               default_margin_threshold :
+                               self_repair_margin_threshold_);
+
+  // repair_mat = out_value * 2 - 1
+  // sign_mat = sign(repair_mat), i.e., an element in sign_mat is 1
+  // if its corresponding element in repair_mat > 0, or -1 otherwise
+  CuMatrix<BaseFloat> repair_mat(out_value);
+  repair_mat.Scale(2.0);
+  repair_mat.Add(-1.0);
+  CuMatrix<BaseFloat> sign_mat(repair_mat);
+  sign_mat.ApplyHeaviside();
+  sign_mat.Scale(2.0);
+  sign_mat.Add(-1.0);
 
-  // At this point, 'thresholds_vec' contains a 1 for each dimension of
-  // the output that is 'problematic', i.e. for which the avg-deriv
-  // is less than the self-repair lower threshold, and a 0 for
-  // each dimension that is not problematic.
+  repair_mat.ApplyPowAbs(1.0);
+  repair_mat.Add(-(1.0 - margin_threshold * 2.0)); // as out_value was also scaled by 2
+  CuMatrix<BaseFloat> mask(out_value.NumRows(), out_value.NumCols());
+  mask.Heaviside(repair_mat);
+  to_update->num_dims_self_repaired_ += mask.Sum();
+  repair_mat.ApplyFloor(0.0);
+  repair_mat.MulElements(sign_mat);
+  // rescales repair_mat so that the absolute values of its elements are in the
+  // range [0.0,1.0]
+  repair_mat.Scale(1.0 / (margin_threshold * 2.0));
 
   // what we want to do is to add
   // -self_repair_scale_ / repair_probability times (2 * output-valiue - 1.0)
@@ -587,16 +588,8 @@ void SigmoidComponent::RepairGradients(
   // for inputs < 0 and negative for inputs > 0.
 
   // We can rearrange the above as: for only the problematic columns,
-  //   input-deriv -= 2 * self-repair-scale / repair-probabilty * output
-  //   input-deriv +=  self-repair-scale / repair-probabilty
-  // which we can write as:
-  //   input-deriv -= 2 * self-repair-scale / repair-probabilty * output * thresholds-vec
-  //   input-deriv +=  self-repair-scale / repair-probabilty * thresholds-vec
-
-  in_deriv->AddMatDiagVec(-2.0 * self_repair_scale_ / repair_probability,
-                          out_value, kNoTrans, thresholds_vec);
-  in_deriv->AddVecToRows(self_repair_scale_ / repair_probability,
-                         thresholds_vec);
+  // input-deriv -= self-repair-scale / repair-probability * repair_mat
+  in_deriv->AddMat(-self_repair_scale_ / repair_probability, repair_mat);
 }
 
 
@@ -950,18 +943,16 @@ void TanhComponent::RepairGradients(
     CuMatrixBase<BaseFloat> *in_deriv,
     TanhComponent *to_update) const {
   KALDI_ASSERT(to_update != NULL);
-  // maximum possible derivative of SigmoidComponent is 1.0
-  // the default lower-threshold on the derivative, below which we
-  // add a term to the derivative to encourage the inputs to the sigmoid
-  // to be closer to zero, is 0.2, which means the derivative is on average
-  // 5 times smaller than its maximum possible value.
-  BaseFloat default_lower_threshold = 0.2;
+  // minimum margin between the actual output value and the asymptotic
+  // output value of tanh (1 and -1), below which we add a term to the
+  // derivative to encourage the inputs to the tanh to get closer to zero.
+  BaseFloat default_margin_threshold = 0.1;
 
   // we use this 'repair_probability' (hardcoded for now) to limit
   // this code to running on about half of the minibatches.
   BaseFloat repair_probability = 0.5;
 
-  to_update->num_dims_processed_ += dim_;
+  to_update->num_dims_processed_ += out_value.NumRows() * dim_;
 
   if (self_repair_scale_ == 0.0 || count_ == 0.0 || deriv_sum_.Dim() != dim_ ||
       RandUniform() > repair_probability)
@@ -970,28 +961,31 @@ void TanhComponent::RepairGradients(
   // check that the self-repair scale is in a reasonable range.
   KALDI_ASSERT(self_repair_scale_ > 0.0 && self_repair_scale_ < 0.1);
   BaseFloat unset = kUnsetThreshold; // -1000.0
-  BaseFloat lower_threshold = (self_repair_lower_threshold_ == unset ?
-                               default_lower_threshold :
-                               self_repair_lower_threshold_) *
-      count_;
-  if (self_repair_upper_threshold_ != unset) {
-    KALDI_ERR << "Do not set the self-repair-upper-threshold for sigmoid "
-              << "components, it does nothing.";
-  }
+  BaseFloat margin_threshold = (self_repair_margin_threshold_ == unset ?
+                                default_margin_threshold :
+                                self_repair_margin_threshold_);
 
-  // thresholds_vec is actually a 1-row matrix.  (the ApplyHeaviside
-  // function isn't defined for vectors).
-  CuMatrix<BaseFloat> thresholds(1, dim_);
-  CuSubVector<BaseFloat> thresholds_vec(thresholds, 0);
-  thresholds_vec.AddVec(-1.0, deriv_sum_);
-  thresholds_vec.Add(lower_threshold);
-  thresholds.ApplyHeaviside();
-  to_update->num_dims_self_repaired_ += thresholds_vec.Sum();
+  // sign_mat = sign(out_value), i.e.,
+  // An element in sign_mat is 1 if its corresponding element in out_value > 0,
+  // or -1 otherwise
+  CuMatrix<BaseFloat> sign_mat(out_value);
+  sign_mat.ApplyHeaviside();
+  sign_mat.Scale(2.0);
+  sign_mat.Add(-1.0);
 
-  // At this point, 'thresholds_vec' contains a 1 for each dimension of
-  // the output that is 'problematic', i.e. for which the avg-deriv
-  // is less than the self-repair lower threshold, and a 0 for
-  // each dimension that is not problematic.
+  // repair_mat =
+  // floor(abs(out_value) - (1 - margin_threshold), 0) .* sign(out_value)
+  CuMatrix<BaseFloat> repair_mat(out_value);
+  repair_mat.ApplyPowAbs(1.0);
+  repair_mat.Add(-(1.0 - margin_threshold));
+  CuMatrix<BaseFloat> mask(out_value.NumRows(), out_value.NumCols());
+  mask.Heaviside(repair_mat);
+  to_update->num_dims_self_repaired_ += mask.Sum();
+  repair_mat.ApplyFloor(0.0);
+  repair_mat.MulElements(sign_mat);
+  // rescales repair_mat so that the absolute values of its elements are in the
+  // range [0.0,1.0]
+  repair_mat.Scale(1.0 / margin_threshold);
 
   // what we want to do is to add -self_repair_scale_ / repair_probability times
   // output-valiue) to the input derivative for each problematic dimension.
@@ -1002,13 +996,9 @@ void TanhComponent::RepairGradients(
   // available.  We could use just about any function that is positive for
   // inputs < 0 and negative for inputs > 0.
 
-  // We can rearrange the above as: for only the problematic columns,
-  //   input-deriv -= self-repair-scale / repair-probabilty * output
-  // which we can write as:
-  //   input-deriv -=  self-repair-scale / repair-probabilty * output * thresholds-vec
-
-  in_deriv->AddMatDiagVec(-self_repair_scale_ / repair_probability,
-                          out_value, kNoTrans, thresholds_vec);
+  // We can rearrange the above as:
+  // input-deriv -= self-repair-scale / repair-probability * repair_mat
+  in_deriv->AddMat(-self_repair_scale_ / repair_probability, repair_mat);
 }
 
 void TanhComponent::Backprop(const std::string &debug_info,