Skip to content

Commit

Permalink
Do not keep around default quantization matrices
Browse files Browse the repository at this point in the history
This saves ~3 MB of static memory. The speed impact with multiple repetitions
is negligible; there is a significant speed improvement for one-off decodes.

Slight variation in the 6th digit of encoding/decoding results due to a more
approximate quant matrix computation.

```
Before:
Encoding      kPixels    Bytes          BPP  E MP/s  D MP/s     Max norm        pnorm       BPP*pnorm   Bugs
------------------------------------------------------------------------------------------------------------
jxl:d0.7        13270  3400483    2.0499581   5.466  47.067   1.50473666   0.47348297  0.970620226409      0
jxl:d1          13270  2678725    1.6148512   5.750  50.382   1.68153656   0.60257544  0.973069654737      0
jxl:d2          13270  1683788    1.0150602   6.102  53.015   3.00312042   0.95243347  0.966777268691      0
jxl:d4          13270  1004956    0.6058309   5.979  37.066   4.85261440   1.50554379  0.912104941537      0
jxl:d6          13270   718865    0.4333629   5.772  36.305   7.34840679   1.97809701  0.857233817426      0
jxl:d8          13270   576902    0.3477815   5.852  35.697   8.16245556   2.36179685  0.821389132427      0
Aggregate:      13270  1362308    0.8212584   5.817  42.674   3.60956523   1.11400975  0.914889813753      0

2268 x 1512, geomean: 90.43 MP/s [54.13, 91.49], 100 reps, 0 threads.
2268 x 1512, 56.57 MP/s [56.57, 56.57], 1 reps, 0 threads.

444 x 258, geomean: 62.33 MP/s [6.49, 63.54], 100 reps, 0 threads.
444 x 258, 7.06 MP/s [7.06, 7.06], 1 reps, 0 threads.

After:
Encoding      kPixels    Bytes          BPP  E MP/s  D MP/s     Max norm        pnorm       BPP*pnorm   Bugs
------------------------------------------------------------------------------------------------------------
jxl:d0.7        13270  3400473    2.0499520   5.424  46.559   1.50473595   0.47348321  0.970617862124      0
jxl:d1          13270  2678723    1.6148500   5.710  49.788   1.68153679   0.60257545  0.973068940575      0
jxl:d2          13270  1683788    1.0150602   6.064  52.348   3.00312042   0.95243348  0.966777275451      0
jxl:d4          13270  1004957    0.6058315   5.942  36.558   4.85261631   1.50554828  0.912108572859      0
jxl:d6          13270   718865    0.4333629   5.736  35.850   7.34841013   1.97809768  0.857234107317      0
jxl:d8          13270   576891    0.3477748   5.811  35.189   8.16245461   2.36179583  0.821373116628      0
Aggregate:      13270  1362303    0.8212554   5.777  42.136   3.60956547   1.11401039  0.914887016940      0

2268 x 1512, geomean: 90.40 MP/s [71.12, 91.11], 100 reps, 0 threads.
2268 x 1512, 68.69 MP/s [68.69, 68.69], 1 reps, 0 threads.

444 x 258, geomean: 63.93 MP/s [37.00, 64.71], 100 reps, 0 threads.
444 x 258, 37.56 MP/s [37.56, 37.56], 1 reps, 0 threads.
```
  • Loading branch information
veluca93 committed Feb 16, 2022
1 parent 40cf87c commit a67a570
Show file tree
Hide file tree
Showing 9 changed files with 293 additions and 257 deletions.
2 changes: 2 additions & 0 deletions lib/jxl/compressed_image_test.cc
Expand Up @@ -77,6 +77,8 @@ void RunRGBRoundTrip(float distance, bool fast) {
PassesEncoderState enc_state;
JXL_CHECK(InitializePassesSharedState(frame_header, &enc_state.shared));

JXL_CHECK(enc_state.shared.matrices.EnsureComputed(~0u));

enc_state.shared.quantizer.SetQuant(4.0f, 4.0f,
&enc_state.shared.raw_quant_field);
enc_state.shared.ac_strategy.FillDCT8();
Expand Down
2 changes: 2 additions & 0 deletions lib/jxl/dec_frame.cc
Expand Up @@ -545,6 +545,8 @@ Status FrameDecoder::ProcessACGlobal(BitReader* br) {
if (frame_header_.encoding == FrameEncoding::kVarDCT) {
JXL_RETURN_IF_ERROR(dec_state_->shared_storage.matrices.Decode(
br, &modular_frame_decoder_));
JXL_RETURN_IF_ERROR(dec_state_->shared_storage.matrices.EnsureComputed(
dec_state_->used_acs));

size_t num_histo_bits =
CeilLog2Nonzero(dec_state_->shared->frame_dim.num_groups);
Expand Down
26 changes: 11 additions & 15 deletions lib/jxl/dec_group.cc
Expand Up @@ -97,15 +97,14 @@ void Transpose8x8InPlace(int32_t* JXL_RESTRICT block) {
template <ACType ac_type>
void DequantLane(Vec<D> scaled_dequant_x, Vec<D> scaled_dequant_y,
Vec<D> scaled_dequant_b,
const float* JXL_RESTRICT dequant_matrices, size_t dq_ofs,
size_t size, size_t k, Vec<D> x_cc_mul, Vec<D> b_cc_mul,
const float* JXL_RESTRICT dequant_matrices, size_t size,
size_t k, Vec<D> x_cc_mul, Vec<D> b_cc_mul,
const float* JXL_RESTRICT biases, ACPtr qblock[3],
float* JXL_RESTRICT block) {
const auto x_mul = Load(d, dequant_matrices + dq_ofs + k) * scaled_dequant_x;
const auto y_mul =
Load(d, dequant_matrices + dq_ofs + size + k) * scaled_dequant_y;
const auto x_mul = Load(d, dequant_matrices + k) * scaled_dequant_x;
const auto y_mul = Load(d, dequant_matrices + size + k) * scaled_dequant_y;
const auto b_mul =
Load(d, dequant_matrices + dq_ofs + 2 * size + k) * scaled_dequant_b;
Load(d, dequant_matrices + 2 * size + k) * scaled_dequant_b;

Vec<DI> quantized_x_int;
Vec<DI> quantized_y_int;
Expand Down Expand Up @@ -139,9 +138,8 @@ template <ACType ac_type>
void DequantBlock(const AcStrategy& acs, float inv_global_scale, int quant,
float x_dm_multiplier, float b_dm_multiplier, Vec<D> x_cc_mul,
Vec<D> b_cc_mul, size_t kind, size_t size,
const Quantizer& quantizer,
const float* JXL_RESTRICT dequant_matrices,
size_t covered_blocks, const size_t* sbx,
const Quantizer& quantizer, size_t covered_blocks,
const size_t* sbx,
const float* JXL_RESTRICT* JXL_RESTRICT dc_row,
size_t dc_stride, const float* JXL_RESTRICT biases,
ACPtr qblock[3], float* JXL_RESTRICT block) {
Expand All @@ -153,12 +151,12 @@ void DequantBlock(const AcStrategy& acs, float inv_global_scale, int quant,
const auto scaled_dequant_y = Set(d, scaled_dequant_s);
const auto scaled_dequant_b = Set(d, scaled_dequant_s * b_dm_multiplier);

const size_t dq_ofs = quantizer.DequantMatrixOffset(kind, 0);
const float* dequant_matrices = quantizer.DequantMatrix(kind, 0);

for (size_t k = 0; k < covered_blocks * kDCTBlockSize; k += Lanes(d)) {
DequantLane<ac_type>(scaled_dequant_x, scaled_dequant_y, scaled_dequant_b,
dequant_matrices, dq_ofs, size, k, x_cc_mul, b_cc_mul,
biases, qblock, block);
dequant_matrices, size, k, x_cc_mul, b_cc_mul, biases,
qblock, block);
}
for (size_t c = 0; c < 3; c++) {
LowestFrequenciesFromDC(acs.Strategy(), dc_row[c] + sbx[c], dc_stride,
Expand Down Expand Up @@ -186,8 +184,6 @@ Status DecodeGroupImpl(GetBlock* JXL_RESTRICT get_block,
const size_t dc_stride = dec_state->shared->dc->PixelsPerRow();

const float inv_global_scale = dec_state->shared->quantizer.InvGlobalScale();
const float* JXL_RESTRICT dequant_matrices =
dec_state->shared->quantizer.DequantMatrix(0, 0);

const YCbCrChromaSubsampling& cs =
dec_state->shared->frame_header.chroma_subsampling;
Expand Down Expand Up @@ -428,7 +424,7 @@ Status DecodeGroupImpl(GetBlock* JXL_RESTRICT get_block,
dequant_block(
acs, inv_global_scale, row_quant[bx], dec_state->x_dm_multiplier,
dec_state->b_dm_multiplier, x_cc_mul, b_cc_mul, acs.RawStrategy(),
size, dec_state->shared->quantizer, dequant_matrices,
size, dec_state->shared->quantizer,
acs.covered_blocks_y() * acs.covered_blocks_x(), sbx, dc_rows,
dc_stride,
dec_state->output_encoding_info.opsin_params.quant_biases, qblock,
Expand Down
11 changes: 11 additions & 0 deletions lib/jxl/enc_ac_strategy.cc
Expand Up @@ -1008,6 +1008,17 @@ void AcStrategyHeuristics::Init(const Image3F& src,
const CompressParams& cparams = enc_state->cparams;
const float butteraugli_target = cparams.butteraugli_distance;

if (cparams.speed_tier >= SpeedTier::kCheetah) {
JXL_CHECK(enc_state->shared.matrices.EnsureComputed(1)); // DCT8 only
} else {
uint32_t acs_mask = 0;
// All transforms up to 64x64.
for (size_t i = 0; i < AcStrategy::DCT128X128; i++) {
acs_mask |= (1 << i);
}
JXL_CHECK(enc_state->shared.matrices.EnsureComputed(acs_mask));
}

// Image row pointers and strides.
config.quant_field_row = enc_state->initial_quant_field.Row(0);
config.quant_field_stride = enc_state->initial_quant_field.PixelsPerRow();
Expand Down
6 changes: 3 additions & 3 deletions lib/jxl/enc_heuristics.cc
Expand Up @@ -866,6 +866,9 @@ Status DefaultEncoderHeuristics::LossyFrameHeuristics(
GaborishInverse(opsin, 0.9908511000000001f, pool);
}

FindBestDequantMatrices(cparams, *opsin, modular_frame_encoder,
&enc_state->shared.matrices);

cfl_heuristics.Init(*opsin);
acs_heuristics.Init(*opsin, enc_state);

Expand Down Expand Up @@ -934,9 +937,6 @@ Status DefaultEncoderHeuristics::LossyFrameHeuristics(
&enc_state->shared.cmap);
}

FindBestDequantMatrices(cparams, *opsin, modular_frame_encoder,
&enc_state->shared.matrices);

// Refine quantization levels.
FindBestQuantizer(original_pixels, *opsin, enc_state, cms, pool, aux_out);

Expand Down

0 comments on commit a67a570

Please sign in to comment.