Do not keep around default quantization matrices

This saves ~3Mb of static memory. The speed impact with multiple repetitions is negligible, and there is a significant speed improvement for one-off decodes. There is a slight variation in the 6th digit of en/decoding results due to a more approximate quant matrix computation.

```
Before:
Encoding   kPixels    Bytes        BPP  E MP/s  D MP/s    Max norm       pnorm       BPP*pnorm  Bugs
------------------------------------------------------------------------------------------------------------
jxl:d0.7     13270  3400483  2.0499581   5.466  47.067  1.50473666  0.47348297  0.970620226409     0
jxl:d1       13270  2678725  1.6148512   5.750  50.382  1.68153656  0.60257544  0.973069654737     0
jxl:d2       13270  1683788  1.0150602   6.102  53.015  3.00312042  0.95243347  0.966777268691     0
jxl:d4       13270  1004956  0.6058309   5.979  37.066  4.85261440  1.50554379  0.912104941537     0
jxl:d6       13270   718865  0.4333629   5.772  36.305  7.34840679  1.97809701  0.857233817426     0
jxl:d8       13270   576902  0.3477815   5.852  35.697  8.16245556  2.36179685  0.821389132427     0
Aggregate:   13270  1362308  0.8212584   5.817  42.674  3.60956523  1.11400975  0.914889813753     0
2268 x 1512, geomean: 90.43 MP/s [54.13, 91.49], 100 reps, 0 threads.
2268 x 1512, 56.57 MP/s [56.57, 56.57], 1 reps, 0 threads.
444 x 258, geomean: 62.33 MP/s [6.49, 63.54], 100 reps, 0 threads.
444 x 258, 7.06 MP/s [7.06, 7.06], 1 reps, 0 threads.
After:
Encoding   kPixels    Bytes        BPP  E MP/s  D MP/s    Max norm       pnorm       BPP*pnorm  Bugs
------------------------------------------------------------------------------------------------------------
jxl:d0.7     13270  3400473  2.0499520   5.424  46.559  1.50473595  0.47348321  0.970617862124     0
jxl:d1       13270  2678723  1.6148500   5.710  49.788  1.68153679  0.60257545  0.973068940575     0
jxl:d2       13270  1683788  1.0150602   6.064  52.348  3.00312042  0.95243348  0.966777275451     0
jxl:d4       13270  1004957  0.6058315   5.942  36.558  4.85261631  1.50554828  0.912108572859     0
jxl:d6       13270   718865  0.4333629   5.736  35.850  7.34841013  1.97809768  0.857234107317     0
jxl:d8       13270   576891  0.3477748   5.811  35.189  8.16245461  2.36179583  0.821373116628     0
Aggregate:   13270  1362303  0.8212554   5.777  42.136  3.60956547  1.11401039  0.914887016940     0
2268 x 1512, geomean: 90.40 MP/s [71.12, 91.11], 100 reps, 0 threads.
2268 x 1512, 68.69 MP/s [68.69, 68.69], 1 reps, 0 threads.
444 x 258, geomean: 63.93 MP/s [37.00, 64.71], 100 reps, 0 threads.
444 x 258, 37.56 MP/s [37.56, 37.56], 1 reps, 0 threads.
```
libjxl · Feb 16, 2022 · a67a570 · a67a570
1 parent 40cf87c
commit a67a570
Show file tree

Hide file tree

Showing 9 changed files with 293 additions and 257 deletions.
diff --git a/lib/jxl/compressed_image_test.cc b/lib/jxl/compressed_image_test.cc
@@ -77,6 +77,8 @@ void RunRGBRoundTrip(float distance, bool fast) {
   PassesEncoderState enc_state;
   JXL_CHECK(InitializePassesSharedState(frame_header, &enc_state.shared));
 
+  JXL_CHECK(enc_state.shared.matrices.EnsureComputed(~0u));
+
   enc_state.shared.quantizer.SetQuant(4.0f, 4.0f,
                                       &enc_state.shared.raw_quant_field);
   enc_state.shared.ac_strategy.FillDCT8();

diff --git a/lib/jxl/dec_frame.cc b/lib/jxl/dec_frame.cc
@@ -545,6 +545,8 @@ Status FrameDecoder::ProcessACGlobal(BitReader* br) {
   if (frame_header_.encoding == FrameEncoding::kVarDCT) {
     JXL_RETURN_IF_ERROR(dec_state_->shared_storage.matrices.Decode(
         br, &modular_frame_decoder_));
+    JXL_RETURN_IF_ERROR(dec_state_->shared_storage.matrices.EnsureComputed(
+        dec_state_->used_acs));
 
     size_t num_histo_bits =
         CeilLog2Nonzero(dec_state_->shared->frame_dim.num_groups);

diff --git a/lib/jxl/dec_group.cc b/lib/jxl/dec_group.cc
@@ -97,15 +97,14 @@ void Transpose8x8InPlace(int32_t* JXL_RESTRICT block) {
 template <ACType ac_type>
 void DequantLane(Vec<D> scaled_dequant_x, Vec<D> scaled_dequant_y,
                  Vec<D> scaled_dequant_b,
-                 const float* JXL_RESTRICT dequant_matrices, size_t dq_ofs,
-                 size_t size, size_t k, Vec<D> x_cc_mul, Vec<D> b_cc_mul,
+                 const float* JXL_RESTRICT dequant_matrices, size_t size,
+                 size_t k, Vec<D> x_cc_mul, Vec<D> b_cc_mul,
                  const float* JXL_RESTRICT biases, ACPtr qblock[3],
                  float* JXL_RESTRICT block) {
-  const auto x_mul = Load(d, dequant_matrices + dq_ofs + k) * scaled_dequant_x;
-  const auto y_mul =
-      Load(d, dequant_matrices + dq_ofs + size + k) * scaled_dequant_y;
+  const auto x_mul = Load(d, dequant_matrices + k) * scaled_dequant_x;
+  const auto y_mul = Load(d, dequant_matrices + size + k) * scaled_dequant_y;
   const auto b_mul =
-      Load(d, dequant_matrices + dq_ofs + 2 * size + k) * scaled_dequant_b;
+      Load(d, dequant_matrices + 2 * size + k) * scaled_dequant_b;
 
   Vec<DI> quantized_x_int;
   Vec<DI> quantized_y_int;
@@ -139,9 +138,8 @@ template <ACType ac_type>
 void DequantBlock(const AcStrategy& acs, float inv_global_scale, int quant,
                   float x_dm_multiplier, float b_dm_multiplier, Vec<D> x_cc_mul,
                   Vec<D> b_cc_mul, size_t kind, size_t size,
-                  const Quantizer& quantizer,
-                  const float* JXL_RESTRICT dequant_matrices,
-                  size_t covered_blocks, const size_t* sbx,
+                  const Quantizer& quantizer, size_t covered_blocks,
+                  const size_t* sbx,
                   const float* JXL_RESTRICT* JXL_RESTRICT dc_row,
                   size_t dc_stride, const float* JXL_RESTRICT biases,
                   ACPtr qblock[3], float* JXL_RESTRICT block) {
@@ -153,12 +151,12 @@ void DequantBlock(const AcStrategy& acs, float inv_global_scale, int quant,
   const auto scaled_dequant_y = Set(d, scaled_dequant_s);
   const auto scaled_dequant_b = Set(d, scaled_dequant_s * b_dm_multiplier);
 
-  const size_t dq_ofs = quantizer.DequantMatrixOffset(kind, 0);
+  const float* dequant_matrices = quantizer.DequantMatrix(kind, 0);
 
   for (size_t k = 0; k < covered_blocks * kDCTBlockSize; k += Lanes(d)) {
     DequantLane<ac_type>(scaled_dequant_x, scaled_dequant_y, scaled_dequant_b,
-                         dequant_matrices, dq_ofs, size, k, x_cc_mul, b_cc_mul,
-                         biases, qblock, block);
+                         dequant_matrices, size, k, x_cc_mul, b_cc_mul, biases,
+                         qblock, block);
   }
   for (size_t c = 0; c < 3; c++) {
     LowestFrequenciesFromDC(acs.Strategy(), dc_row[c] + sbx[c], dc_stride,
@@ -186,8 +184,6 @@ Status DecodeGroupImpl(GetBlock* JXL_RESTRICT get_block,
   const size_t dc_stride = dec_state->shared->dc->PixelsPerRow();
 
   const float inv_global_scale = dec_state->shared->quantizer.InvGlobalScale();
-  const float* JXL_RESTRICT dequant_matrices =
-      dec_state->shared->quantizer.DequantMatrix(0, 0);
 
   const YCbCrChromaSubsampling& cs =
       dec_state->shared->frame_header.chroma_subsampling;
@@ -428,7 +424,7 @@ Status DecodeGroupImpl(GetBlock* JXL_RESTRICT get_block,
           dequant_block(
               acs, inv_global_scale, row_quant[bx], dec_state->x_dm_multiplier,
               dec_state->b_dm_multiplier, x_cc_mul, b_cc_mul, acs.RawStrategy(),
-              size, dec_state->shared->quantizer, dequant_matrices,
+              size, dec_state->shared->quantizer,
               acs.covered_blocks_y() * acs.covered_blocks_x(), sbx, dc_rows,
               dc_stride,
               dec_state->output_encoding_info.opsin_params.quant_biases, qblock,

diff --git a/lib/jxl/enc_ac_strategy.cc b/lib/jxl/enc_ac_strategy.cc
@@ -1008,6 +1008,17 @@ void AcStrategyHeuristics::Init(const Image3F& src,
   const CompressParams& cparams = enc_state->cparams;
   const float butteraugli_target = cparams.butteraugli_distance;
 
+  if (cparams.speed_tier >= SpeedTier::kCheetah) {
+    JXL_CHECK(enc_state->shared.matrices.EnsureComputed(1));  // DCT8 only
+  } else {
+    uint32_t acs_mask = 0;
+    // All transforms up to 64x64.
+    for (size_t i = 0; i < AcStrategy::DCT128X128; i++) {
+      acs_mask |= (1 << i);
+    }
+    JXL_CHECK(enc_state->shared.matrices.EnsureComputed(acs_mask));
+  }
+
   // Image row pointers and strides.
   config.quant_field_row = enc_state->initial_quant_field.Row(0);
   config.quant_field_stride = enc_state->initial_quant_field.PixelsPerRow();

diff --git a/lib/jxl/enc_heuristics.cc b/lib/jxl/enc_heuristics.cc
@@ -866,6 +866,9 @@ Status DefaultEncoderHeuristics::LossyFrameHeuristics(
     GaborishInverse(opsin, 0.9908511000000001f, pool);
   }
 
+  FindBestDequantMatrices(cparams, *opsin, modular_frame_encoder,
+                          &enc_state->shared.matrices);
+
   cfl_heuristics.Init(*opsin);
   acs_heuristics.Init(*opsin, enc_state);
 
@@ -934,9 +937,6 @@ Status DefaultEncoderHeuristics::LossyFrameHeuristics(
                              &enc_state->shared.cmap);
   }
 
-  FindBestDequantMatrices(cparams, *opsin, modular_frame_encoder,
-                          &enc_state->shared.matrices);
-
   // Refine quantization levels.
   FindBestQuantizer(original_pixels, *opsin, enc_state, cms, pool, aux_out);