From 1cd073152d6f12c5476f471015f58c4f21d916f4 Mon Sep 17 00:00:00 2001
From: Kenneth Heafield
Date: Sun, 28 Feb 2016 22:28:26 +0000
Subject: [PATCH] Move some normalization to reduce math in inner loop

---
 lm/interpolate/tune_derivatives.cc | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/lm/interpolate/tune_derivatives.cc b/lm/interpolate/tune_derivatives.cc
index 1e6ac128..7cd915a6 100644
--- a/lm/interpolate/tune_derivatives.cc
+++ b/lm/interpolate/tune_derivatives.cc
@@ -14,14 +14,13 @@ Accum Derivatives(Instances &in, const Vector &weights, Vector &gradient, Matrix
   hessian = Matrix::Zero(weights.rows(), weights.rows());
 
   // TODO: loop instead to force low-memory evaluation?
-  // Compute p_I(x).
-  Vector interp_uni((in.LNUnigrams() * weights).array().exp());
+  // Compute p_I(x)*Z_{\epsilon} i.e. the unnormalized probabilities
+  Vector weighted_uni((in.LNUnigrams() * weights).array().exp());
   // Even -inf doesn't work for <s> because weights can be negative.  Manually set it to zero.
-  interp_uni(in.BOS()) = 0.0;
-  Accum Z_epsilon = interp_uni.sum();
-  interp_uni /= Z_epsilon;
+  weighted_uni(in.BOS()) = 0.0;
+  Accum Z_epsilon = weighted_uni.sum();
   // unigram_cross(i) = \sum_{all x} p_I(x) ln p_i(x)
-  Vector unigram_cross(in.LNUnigrams().transpose() * interp_uni);
+  Vector unigram_cross(in.LNUnigrams().transpose() * weighted_uni / Z_epsilon);
 
   Accum sum_B_I = 0.0;
   Accum sum_ln_Z_context = 0.0;
@@ -47,8 +46,8 @@ Accum Derivatives(Instances &in, const Vector &weights, Vector &gradient, Matrix
     assert(extensions);
 
     Accum weighted_backoffs = exp(in.LNBackoffs(n).dot(weights));
-    // Compute \sum_{x: model does not back off to unigram} p_I(x)
-    Accum sum_x_p_I = 0.0;
+    // Compute \sum_{x: model does not back off to unigram} p_I(x)Z(epsilon)
+    Accum unnormalized_sum_x_p_I = 0.0;
     // Compute \sum_{x: model does not back off to unigram} p_I(x | context)Z(context)
     Accum unnormalized_sum_x_p_I_full = 0.0;
 
@@ -60,7 +59,7 @@ Accum Derivatives(Instances &in, const Vector &weights, Vector &gradient, Matrix
     // Loop over words within an instance for which extension exists.  An extension happens when any model matches more than a unigram in the tuning instance.
     while (extensions && extensions->instance == n) {
       const WordIndex word = extensions->word;
-      sum_x_p_I += interp_uni(word);
+      unnormalized_sum_x_p_I += weighted_uni(word);
 
       ln_p_i_backed = in.LNUnigrams().row(word) + in.LNBackoffs(n);
 
@@ -78,18 +77,18 @@ Accum Derivatives(Instances &in, const Vector &weights, Vector &gradient, Matrix
       // These aren't normalized by Z_context (happens later)
       full_cross.noalias() +=
         weighted * ln_p_i_full
-        - interp_uni(word) * Z_epsilon * weighted_backoffs /* we'll divide by Z_context later to form B_I */ * in.LNUnigrams().row(word).transpose();
+        - weighted_uni(word) * weighted_backoffs /* we'll divide by Z_context later to form B_I */ * in.LNUnigrams().row(word).transpose();
 
       // This will get multiplied by Z_context then added to the Hessian.
       hessian_missing_Z_context.noalias() +=
         // Replacement terms.
         weighted * ln_p_i_full * ln_p_i_full.transpose()
         // Presumed unigrams.  Z_epsilon * weighted_backoffs will turn into B_I once all of this is divided by Z_context.
-        - interp_uni(word) * Z_epsilon * weighted_backoffs * ln_p_i_backed * ln_p_i_backed.transpose();
+        - weighted_uni(word) * weighted_backoffs * ln_p_i_backed * ln_p_i_backed.transpose();
     }
 
     Accum Z_context =
-      weighted_backoffs * Z_epsilon * (1.0 - sum_x_p_I) // Back off and unnormalize the unigrams for which there is no extension.
+      weighted_backoffs * (Z_epsilon - unnormalized_sum_x_p_I) // Back off and unnormalize the unigrams for which there is no extension.
      + unnormalized_sum_x_p_I_full; // Add the extensions.
     sum_ln_Z_context += log(Z_context);
     Accum B_I = Z_epsilon / Z_context * weighted_backoffs;
@@ -103,7 +102,7 @@ Accum Derivatives(Instances &in, const Vector &weights, Vector &gradient, Matrix
       // Uncorrected term
       B_I * (in.LNBackoffs(n).transpose() + unigram_cross)
       // Subtract values that should not have been charged.
-      - sum_x_p_I * B_I * in.LNBackoffs(n).transpose();
+      - unnormalized_sum_x_p_I / Z_epsilon * B_I * in.LNBackoffs(n).transpose();
     gradient += full_cross;
 
     convolve = unigram_cross * in.LNBackoffs(n);
@@ -117,10 +116,10 @@ Accum Derivatives(Instances &in, const Vector &weights, Vector &gradient, Matrix
       - full_cross * full_cross.transpose();
   }
 
-  for (Matrix::Index x = 0; x < interp_uni.rows(); ++x) {
+  for (Matrix::Index x = 0; x < weighted_uni.rows(); ++x) {
     // \sum_{contexts} B_I(context) \sum_x p_I(x) log p_i(x) log p_j(x)
     // TODO can this be optimized?  It's summing over the entire vocab which should be a matrix operation.
-    hessian.noalias() += sum_B_I * interp_uni(x) * in.LNUnigrams().row(x).transpose() * in.LNUnigrams().row(x);
+    hessian.noalias() += sum_B_I * weighted_uni(x) / Z_epsilon * in.LNUnigrams().row(x).transpose() * in.LNUnigrams().row(x);
   }
   return exp((in.CorrectGradientTerm().dot(weights) + sum_ln_Z_context) / static_cast<double>(in.NumInstances()));
 }
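
Note: the program below is not part of the patch or the kenlm sources. It is a minimal, self-contained Eigen sketch of the refactoring the commit performs: keep the unnormalized weighted unigram scores (exp of the log-linear combination) and divide by Z_epsilon only where a normalized probability is actually needed, instead of normalizing the whole vector up front and multiplying Z_epsilon back in inside the inner loop. The names (ln_unigrams, vocab, models) and the toy data are invented for illustration; compiling it needs only the Eigen headers.

// Standalone illustration only -- not kenlm code.  Both formulations compute
// the same sum over "extension" words; the second avoids the per-element
// division up front and the re-multiplication by Z_epsilon in the loop.
#include <Eigen/Dense>
#include <cassert>
#include <cmath>
#include <iostream>

int main() {
  typedef double Accum;
  typedef Eigen::MatrixXd Matrix;
  typedef Eigen::VectorXd Vector;

  // Toy stand-ins for in.LNUnigrams() (log unigram probabilities per model)
  // and the interpolation weights; the real code reads these from Instances.
  const int vocab = 5, models = 2;
  Matrix ln_unigrams = Matrix::Random(vocab, models);
  Vector weights(models);
  weights << 0.7, 0.3;

  // Old formulation: normalize immediately, then multiply Z_epsilon back in
  // whenever an unnormalized value is needed inside the loop.
  Vector interp_uni = (ln_unigrams * weights).array().exp().matrix();
  Accum Z_epsilon = interp_uni.sum();
  interp_uni /= Z_epsilon;

  // New formulation: keep the unnormalized scores and divide by Z_epsilon
  // only where a normalized probability is actually required.
  Vector weighted_uni = (ln_unigrams * weights).array().exp().matrix();
  Accum Z = weighted_uni.sum();  // same value as Z_epsilon

  Accum old_style = 0.0, new_style = 0.0;
  for (int word = 0; word < 3; ++word) {        // pretend these words have extensions
    old_style += interp_uni(word) * Z_epsilon;  // divide, then undo the division
    new_style += weighted_uni(word);            // no division in the inner loop
  }
  assert(std::fabs(old_style - new_style) < 1e-9);
  std::cout << "sums agree: " << old_style << " == " << new_style
            << " (Z = " << Z << ")\n";
  return 0;
}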