Skip to content

Commit

Permalink
Speedup 14+ bit e1 using AVX2 by ~6.6%. (libjxl#3944)
Browse files Browse the repository at this point in the history
```
2000 x 3008,  geomean: 123.866 MP/s [108.75, 129.53], , 100 reps, 1 threads.
3072 x 2048,  geomean: 211.124 MP/s [188.20, 221.33], , 100 reps, 1 threads.

->

2000 x 3008,  geomean: 132.063 MP/s [111.09, 137.27], , 100 reps, 1 threads.
3072 x 2048,  geomean: 225.402 MP/s [183.71, 236.48], , 100 reps, 1 threads.
```
  • Loading branch information
veluca93 authored Nov 22, 2024
1 parent bf4781a commit 8a39b30
Showing 1 changed file with 7 additions and 43 deletions.
50 changes: 7 additions & 43 deletions lib/jxl/enc_fast_lossless.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@
#include "lib/jxl/base/status.h"
#ifndef FJXL_SELF_INCLUDE

#include "lib/jxl/enc_fast_lossless.h"

#include <assert.h>
#include <stdint.h>
#include <string.h>
Expand All @@ -18,6 +16,8 @@
#include <memory>
#include <vector>

#include "lib/jxl/enc_fast_lossless.h"

#if FJXL_STANDALONE
#if defined(_MSC_VER)
using ssize_t = intptr_t;
Expand Down Expand Up @@ -1468,46 +1468,10 @@ struct SIMDVec32 {
return SIMDVec32{_mm256_set1_epi32(v)};
}
FJXL_INLINE SIMDVec32 ValToToken() const {
  // Maps every 32-bit lane to its bit length: 0 for a zero lane, otherwise
  // floor(log2(v)) + 1.
  //
  // Rather than classifying the value nibble by nibble with byte shuffles, the
  // lanes are converted to float and the IEEE-754 exponent field is read back:
  // for v > 0 the biased exponent is 127 + floor(log2(v)), so subtracting 126
  // gives the bit length directly.
  //
  // NOTE(review): assumes lanes are non-negative and small enough (< 2^24, the
  // encoder is expected to pass at most 20 significant bits) for the
  // int -> float conversion to be exact; a negative lane would leak the sign
  // bit into the shifted exponent, and larger values could round up to the
  // next power of two. Confirm against callers.
  const auto as_float_bits = _mm256_castps_si256(_mm256_cvtepi32_ps(vec));
  // Dropping the 23 mantissa bits leaves the biased exponent in the low bits.
  const auto biased_exponent = _mm256_srli_epi32(as_float_bits, 23);
  const auto bit_length =
      _mm256_sub_epi32(biased_exponent, _mm256_set1_epi32(126));
  // A zero lane converts to +0.0f (exponent field 0), i.e. -126 after the
  // subtraction; clamp it to token 0.
  return SIMDVec32{_mm256_max_epi32(_mm256_setzero_si256(), bit_length)};
}
FJXL_INLINE SIMDVec32 SatSubU(const SIMDVec32& to_subtract) const {
return SIMDVec32{_mm256_sub_epi32(_mm256_max_epu32(vec, to_subtract.vec),
Expand Down Expand Up @@ -4171,7 +4135,7 @@ namespace default_implementation {
#undef FJXL_NEON
} // namespace default_implementation

#else // FJXL_ENABLE_NEON
#else // FJXL_ENABLE_NEON

namespace default_implementation {
#include "lib/jxl/enc_fast_lossless.cc" // NOLINT
Expand Down

0 comments on commit 8a39b30

Please sign in to comment.