Skip to content

Commit

Permalink
Merge/SplitRGB fix -mcmodel=large x86 and InterpolateRow_16To8_NEON
Browse files Browse the repository at this point in the history
MergeRGB and SplitRGB use a register to point to 9 shuffle tables.

- fixes an out of registers error with -mcmodel=large

InterpolateRow_16To8_NEON improves performance for I210ToI420:

On Pixel 4 for 720p x1000 images
Was I210ToI420_Opt (608 ms)
Now I210ToI420_Opt (336 ms)

On Skylake Xeon
Was I210ToI420_Opt (259 ms)
Now I210ToI420_Opt (209 ms)


Bug: libyuv:931, libyuv:930
Change-Id: I20f8244803f06da511299bf1a2ffc7945eb35221
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3717054
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Justin Green <greenjustin@google.com>
  • Loading branch information
fbarchard authored and libyuv LUCI CQ committed Jun 29, 2022
1 parent fe4a50d commit 6900494
Show file tree
Hide file tree
Showing 8 changed files with 339 additions and 181 deletions.
2 changes: 1 addition & 1 deletion README.chromium
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1831
Version: 1832
License: BSD
License File: LICENSE

Expand Down
26 changes: 26 additions & 0 deletions include/libyuv/row.h
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,7 @@ extern "C" {
#define HAS_AR64TOARGBROW_AVX2
#define HAS_AB64TOARGBROW_AVX2
#define HAS_CONVERT16TO8ROW_AVX2
#define HAS_INTERPOLATEROW_16TO8_AVX2
#define HAS_CONVERT8TO16ROW_AVX2
#define HAS_DIVIDEROW_16_AVX2
#define HAS_HALFMERGEUVROW_AVX2
Expand Down Expand Up @@ -539,6 +540,7 @@ extern "C" {

// The following are available on AArch64 platforms:
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#define HAS_INTERPOLATEROW_16TO8_NEON
#define HAS_SCALESUMSAMPLES_NEON
#define HAS_GAUSSROW_F32_NEON
#define HAS_GAUSSCOL_F32_NEON
Expand Down Expand Up @@ -5221,6 +5223,30 @@ void InterpolateRow_16To8_C(uint8_t* dst_ptr,
int scale,
int width,
int source_y_fraction);
// SIMD variants of InterpolateRow_16To8_C. All share the same signature:
//   dst_ptr           - destination row of 8 bit samples.
//   src_ptr           - first source row of 16 bit samples.
//   src_stride        - offset, in uint16_t elements, from src_ptr to the
//                       second source row.
//   scale             - multiplier used to reduce a 16 bit sample to 8 bits.
//   width             - number of samples to produce.
//   source_y_fraction - weight of the second row (the C version asserts
//                       0 <= source_y_fraction < 256).
// The _Any_ variants are generated by the ANY11IS macro in row_any.cc and
// accept widths that are not a multiple of the SIMD step.
void InterpolateRow_16To8_NEON(uint8_t* dst_ptr,
const uint16_t* src_ptr,
ptrdiff_t src_stride,
int scale,
int width,
int source_y_fraction);
void InterpolateRow_16To8_Any_NEON(uint8_t* dst_ptr,
const uint16_t* src_ptr,
ptrdiff_t src_stride,
int scale,
int width,
int source_y_fraction);
void InterpolateRow_16To8_AVX2(uint8_t* dst_ptr,
const uint16_t* src_ptr,
ptrdiff_t src_stride,
int scale,
int width,
int source_y_fraction);
void InterpolateRow_16To8_Any_AVX2(uint8_t* dst_ptr,
const uint16_t* src_ptr,
ptrdiff_t src_stride,
int scale,
int width,
int source_y_fraction);

// Sobel images.
void SobelXRow_C(const uint8_t* src_y0,
Expand Down
2 changes: 1 addition & 1 deletion include/libyuv/version.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_

#define LIBYUV_VERSION 1831
#define LIBYUV_VERSION 1832

#endif // INCLUDE_LIBYUV_VERSION_H_
102 changes: 78 additions & 24 deletions source/row_any.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1625,47 +1625,101 @@ ANY11C(UYVYToARGBRow_Any_LSX, UYVYToARGBRow_LSX, 1, 4, 4, 7)
#undef ANY11C

// Any 1 to 1 interpolate. Takes 2 rows of source via stride.
#define ANY11I(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \
void NAMEANY(T* dst_ptr, const T* src_ptr, ptrdiff_t src_stride, int width, \
int source_y_fraction) { \
SIMD_ALIGNED(T temp[64 * 3]); \
memset(temp, 0, 64 * 2 * sizeof(T)); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(dst_ptr, src_ptr, src_stride, n, source_y_fraction); \
} \
memcpy(temp, src_ptr + n * SBPP, r * SBPP * sizeof(T)); \
if (source_y_fraction) { \
memcpy(temp + 64, src_ptr + src_stride + n * SBPP, \
r * SBPP * sizeof(T)); \
} \
ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \
memcpy(dst_ptr + n * BPP, temp + 128, r * BPP * sizeof(T)); \
#define ANY11I(NAMEANY, ANY_SIMD, TD, TS, SBPP, BPP, MASK) \
void NAMEANY(TD* dst_ptr, const TS* src_ptr, ptrdiff_t src_stride, \
int width, int source_y_fraction) { \
SIMD_ALIGNED(TS temps[64 * 2]); \
SIMD_ALIGNED(TD tempd[64]); \
memset(temps, 0, sizeof(temps)); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(dst_ptr, src_ptr, src_stride, n, source_y_fraction); \
} \
memcpy(temps, src_ptr + n * SBPP, r * SBPP * sizeof(TS)); \
if (source_y_fraction) { \
memcpy(temps + 64, src_ptr + src_stride + n * SBPP, \
r * SBPP * sizeof(TS)); \
} \
ANY_SIMD(tempd, temps, 64, MASK + 1, source_y_fraction); \
memcpy(dst_ptr + n * BPP, tempd, r * BPP * sizeof(TD)); \
}

#ifdef HAS_INTERPOLATEROW_AVX2
ANY11I(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, uint8_t, 1, 1, 31)
ANY11I(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, uint8_t, uint8_t, 1, 1, 31)
#endif
#ifdef HAS_INTERPOLATEROW_SSSE3
ANY11I(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, uint8_t, 1, 1, 15)
ANY11I(InterpolateRow_Any_SSSE3,
InterpolateRow_SSSE3,
uint8_t,
uint8_t,
1,
1,
15)
#endif
#ifdef HAS_INTERPOLATEROW_NEON
ANY11I(InterpolateRow_Any_NEON, InterpolateRow_NEON, uint8_t, 1, 1, 15)
ANY11I(InterpolateRow_Any_NEON, InterpolateRow_NEON, uint8_t, uint8_t, 1, 1, 15)
#endif
#ifdef HAS_INTERPOLATEROW_MSA
ANY11I(InterpolateRow_Any_MSA, InterpolateRow_MSA, uint8_t, 1, 1, 31)
ANY11I(InterpolateRow_Any_MSA, InterpolateRow_MSA, uint8_t, uint8_t, 1, 1, 31)
#endif
#ifdef HAS_INTERPOLATEROW_LSX
ANY11I(InterpolateRow_Any_LSX, InterpolateRow_LSX, uint8_t, 1, 1, 31)
ANY11I(InterpolateRow_Any_LSX, InterpolateRow_LSX, uint8_t, uint8_t, 1, 1, 31)
#endif

#ifdef HAS_INTERPOLATEROW_16_NEON
ANY11I(InterpolateRow_16_Any_NEON, InterpolateRow_16_NEON, uint16_t, 1, 1, 7)
ANY11I(InterpolateRow_16_Any_NEON,
InterpolateRow_16_NEON,
uint16_t,
uint16_t,
1,
1,
7)
#endif

#undef ANY11I

// Any 1 to 1 interpolate with scale param
#define ANY11IS(NAMEANY, ANY_SIMD, TD, TS, SBPP, BPP, MASK) \
void NAMEANY(TD* dst_ptr, const TS* src_ptr, ptrdiff_t src_stride, \
int scale, int width, int source_y_fraction) { \
SIMD_ALIGNED(TS temps[64 * 2]); \
SIMD_ALIGNED(TD tempd[64]); \
memset(temps, 0, sizeof(temps)); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(dst_ptr, src_ptr, src_stride, scale, n, source_y_fraction); \
} \
memcpy(temps, src_ptr + n * SBPP, r * SBPP * sizeof(TS)); \
if (source_y_fraction) { \
memcpy(temps + 64, src_ptr + src_stride + n * SBPP, \
r * SBPP * sizeof(TS)); \
} \
ANY_SIMD(tempd, temps, 64, scale, MASK + 1, source_y_fraction); \
memcpy(dst_ptr + n * BPP, tempd, r * BPP * sizeof(TD)); \
}

#ifdef HAS_INTERPOLATEROW_16TO8_NEON
ANY11IS(InterpolateRow_16To8_Any_NEON,
InterpolateRow_16To8_NEON,
uint8_t,
uint16_t,
1,
1,
7)
#endif
#ifdef HAS_INTERPOLATEROW_16TO8_AVX2
ANY11IS(InterpolateRow_16To8_Any_AVX2,
InterpolateRow_16To8_AVX2,
uint8_t,
uint16_t,
1,
1,
31)
#endif

#undef ANY11IS

// Any 1 to 1 mirror.
#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \
void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \
Expand Down
128 changes: 71 additions & 57 deletions source/row_common.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2985,6 +2985,9 @@ void DivideRow_16_C(const uint16_t* src_y,
// 16384 = 10 bits
// 4096 = 12 bits
// 256 = 16 bits
// TODO(fbarchard): change scale to bits
// C16TO8 multiplies sample v by scale, shifts the product right by 16 bits
// and passes the result to clamp255 (defined elsewhere; presumably saturates
// to the 0..255 range - confirm against its definition).
#define C16TO8(v, scale) clamp255(((v) * (scale)) >> 16)

void Convert16To8Row_C(const uint16_t* src_y,
uint8_t* dst_y,
int scale,
Expand All @@ -2994,7 +2997,7 @@ void Convert16To8Row_C(const uint16_t* src_y,
assert(scale <= 32768);

for (x = 0; x < width; ++x) {
dst_y[x] = clamp255((src_y[x] * scale) >> 16);
dst_y[x] = C16TO8(src_y[x], scale);
}
}

Expand Down Expand Up @@ -3411,8 +3414,7 @@ static void HalfRow_16To8_C(const uint16_t* src_uv,
int width) {
int x;
for (x = 0; x < width; ++x) {
dst_uv[x] = clamp255(
(((src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1) * scale) >> 16);
dst_uv[x] = C16TO8((src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1, scale);
}
}

Expand All @@ -3426,6 +3428,9 @@ void InterpolateRow_C(uint8_t* dst_ptr,
int y0_fraction = 256 - y1_fraction;
const uint8_t* src_ptr1 = src_ptr + src_stride;
int x;
assert(source_y_fraction >= 0);
assert(source_y_fraction < 256);

if (y1_fraction == 0) {
memcpy(dst_ptr, src_ptr, width);
return;
Expand All @@ -3434,18 +3439,42 @@ void InterpolateRow_C(uint8_t* dst_ptr,
HalfRow_C(src_ptr, src_stride, dst_ptr, width);
return;
}
for (x = 0; x < width - 1; x += 2) {
for (x = 0; x < width; ++x) {
dst_ptr[0] =
(src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8;
dst_ptr[1] =
(src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction + 128) >> 8;
src_ptr += 2;
src_ptr1 += 2;
dst_ptr += 2;
++src_ptr;
++src_ptr1;
++dst_ptr;
}
if (width & 1) {
}

// Vertically blends two rows of 16 bit samples into one row (C reference).
//   dst_ptr           - output row, width samples.
//   src_ptr           - first source row; the second row starts src_stride
//                       elements later.
//   source_y_fraction - weight of the second row out of 256; must be in
//                       [0, 256). The first row gets 256 - source_y_fraction.
// A fraction of 0 copies the first row and a fraction of 128 is delegated to
// HalfRow_16_C; every other fraction uses a rounded weighted sum.
void InterpolateRow_16_C(uint16_t* dst_ptr,
                         const uint16_t* src_ptr,
                         ptrdiff_t src_stride,
                         int width,
                         int source_y_fraction) {
  const int weight_row1 = source_y_fraction;
  const int weight_row0 = 256 - weight_row1;
  const uint16_t* row0 = src_ptr;
  const uint16_t* row1 = src_ptr + src_stride;
  int i;
  assert(source_y_fraction >= 0);
  assert(source_y_fraction < 256);

  switch (weight_row1) {
    case 0:
      // Only the first row contributes: plain copy of width 16 bit samples.
      memcpy(dst_ptr, row0, width * 2);
      return;
    case 128:
      // Equal weights: use the dedicated averaging helper.
      HalfRow_16_C(src_ptr, src_stride, dst_ptr, width);
      return;
    default:
      break;
  }

  for (i = 0; i < width; ++i) {
    // + 128 rounds to nearest before dropping the 8 fractional bits.
    dst_ptr[i] = (row0[i] * weight_row0 + row1[i] * weight_row1 + 128) >> 8;
  }
}

Expand All @@ -3455,6 +3484,8 @@ void InterpolateRow_C(uint8_t* dst_ptr,
// 16384 = 10 bits
// 4096 = 12 bits
// 256 = 16 bits
// TODO(fbarchard): change scale to bits

void InterpolateRow_16To8_C(uint8_t* dst_ptr,
const uint16_t* src_ptr,
ptrdiff_t src_stride,
Expand All @@ -3465,6 +3496,9 @@ void InterpolateRow_16To8_C(uint8_t* dst_ptr,
int y0_fraction = 256 - y1_fraction;
const uint16_t* src_ptr1 = src_ptr + src_stride;
int x;
assert(source_y_fraction >= 0);
assert(source_y_fraction < 256);

if (source_y_fraction == 0) {
Convert16To8Row_C(src_ptr, dst_ptr, scale, width);
return;
Expand All @@ -3473,53 +3507,13 @@ void InterpolateRow_16To8_C(uint8_t* dst_ptr,
HalfRow_16To8_C(src_ptr, src_stride, dst_ptr, scale, width);
return;
}
for (x = 0; x < width - 1; x += 2) {
dst_ptr[0] = clamp255(
(((src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8) *
scale) >>
16);
dst_ptr[1] = clamp255(
(((src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8) *
scale) >>
16);
src_ptr += 2;
src_ptr1 += 2;
dst_ptr += 2;
}
if (width & 1) {
dst_ptr[0] = clamp255(
(((src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8) *
scale) >>
16);
}
}

void InterpolateRow_16_C(uint16_t* dst_ptr,
const uint16_t* src_ptr,
ptrdiff_t src_stride,
int width,
int source_y_fraction) {
int y1_fraction = source_y_fraction;
int y0_fraction = 256 - y1_fraction;
const uint16_t* src_ptr1 = src_ptr + src_stride;
int x;
if (source_y_fraction == 0) {
memcpy(dst_ptr, src_ptr, width * 2);
return;
}
if (source_y_fraction == 128) {
HalfRow_16_C(src_ptr, src_stride, dst_ptr, width);
return;
}
for (x = 0; x < width - 1; x += 2) {
dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
src_ptr += 2;
src_ptr1 += 2;
dst_ptr += 2;
}
if (width & 1) {
dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
for (x = 0; x < width; ++x) {
dst_ptr[0] = C16TO8(
(src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8,
scale);
src_ptr += 1;
src_ptr1 += 1;
dst_ptr += 1;
}
}

Expand Down Expand Up @@ -4124,6 +4118,26 @@ void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
}
#endif // HAS_RAWTOYJROW_SSSE3

#ifdef HAS_INTERPOLATEROW_16TO8_AVX2
// Interpolates two rows of 16 bit samples and narrows the result to 8 bits.
// The blend is done by InterpolateRow_16_C into a temporary 16 bit row, which
// Convert16To8Row_AVX2 then scales down to 8 bits. The row is processed in
// pieces of at most MAXTWIDTH samples so the temporary buffer has fixed size.
void InterpolateRow_16To8_AVX2(uint8_t* dst_ptr,
                               const uint16_t* src_ptr,
                               ptrdiff_t src_stride,
                               int scale,
                               int width,
                               int source_y_fraction) {
  // Holds the interpolated 16 bit samples for the current piece.
  SIMD_ALIGNED(uint16_t row[MAXTWIDTH]);
  int remaining;
  for (remaining = width; remaining > 0;) {
    const int chunk = remaining < MAXTWIDTH ? remaining : MAXTWIDTH;
    InterpolateRow_16_C(row, src_ptr, src_stride, chunk, source_y_fraction);
    Convert16To8Row_AVX2(row, dst_ptr, scale, chunk);
    remaining -= chunk;
    dst_ptr += chunk;
    src_ptr += chunk;
  }
}
#endif // HAS_INTERPOLATEROW_16TO8_AVX2

float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) {
float fsum = 0.f;
int i;
Expand Down
Loading

0 comments on commit 6900494

Please sign in to comment.