Skip to content

Commit ed81dcc

Browse files
committed
x86/vp9: add AVX and AVX2 MC
Roughly 25% faster MC than ssse3 for blocksizes 32 and 64.

Reviewed-by: Ronald S. Bultje <rsbultje@gmail.com>
Signed-off-by: James Almer <jamrial@gmail.com>
1 parent 2284413 commit ed81dcc

File tree

3 files changed

+150
-88
lines changed

3 files changed

+150
-88
lines changed

libavcodec/x86/constants.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,8 @@ DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64) = { 0x0040004000400040ULL, 0x004
4141
DECLARE_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL;
4242
DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
4343
DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
44-
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_256) = { 0x0100010001000100ULL, 0x0100010001000100ULL };
44+
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_256) = { 0x0100010001000100ULL, 0x0100010001000100ULL,
45+
0x0100010001000100ULL, 0x0100010001000100ULL };
4546
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL };
4647
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
4748
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1024) = { 0x0400040004000400ULL, 0x0400040004000400ULL };

libavcodec/x86/vp9dsp_init.c

Lines changed: 99 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -43,22 +43,27 @@ fpel_func(avg, 8, mmxext);
4343
fpel_func(avg, 16, sse2);
4444
fpel_func(avg, 32, sse2);
4545
fpel_func(avg, 64, sse2);
46+
fpel_func(put, 32, avx);
47+
fpel_func(put, 64, avx);
48+
fpel_func(avg, 32, avx2);
49+
fpel_func(avg, 64, avx2);
4650
#undef fpel_func
4751

4852
#define mc_func(avg, sz, dir, opt) \
4953
void ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
5054
const uint8_t *src, ptrdiff_t src_stride, \
51-
int h, const int8_t (*filter)[16])
52-
#define mc_funcs(sz) \
53-
mc_func(put, sz, h, ssse3); \
54-
mc_func(avg, sz, h, ssse3); \
55-
mc_func(put, sz, v, ssse3); \
56-
mc_func(avg, sz, v, ssse3)
57-
58-
mc_funcs(4);
59-
mc_funcs(8);
55+
int h, const int8_t (*filter)[32])
56+
#define mc_funcs(sz, opt) \
57+
mc_func(put, sz, h, opt); \
58+
mc_func(avg, sz, h, opt); \
59+
mc_func(put, sz, v, opt); \
60+
mc_func(avg, sz, v, opt)
61+
62+
mc_funcs(4, ssse3);
63+
mc_funcs(8, ssse3);
6064
#if ARCH_X86_64
61-
mc_funcs(16);
65+
mc_funcs(16, ssse3);
66+
mc_funcs(32, avx2);
6267
#endif
6368

6469
#undef mc_funcs
@@ -68,89 +73,104 @@ mc_funcs(16);
6873
static av_always_inline void \
6974
ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
7075
const uint8_t *src, ptrdiff_t src_stride, \
71-
int h, const int8_t (*filter)[16]) \
76+
int h, const int8_t (*filter)[32]) \
7277
{ \
7378
ff_vp9_##avg##_8tap_1d_##dir##_##hsz##_##opt(dst, dst_stride, src, \
7479
src_stride, h, filter); \
7580
ff_vp9_##avg##_8tap_1d_##dir##_##hsz##_##opt(dst + hsz, dst_stride, src + hsz, \
7681
src_stride, h, filter); \
7782
}
7883

79-
#define mc_rep_funcs(sz, hsz) \
80-
mc_rep_func(put, sz, hsz, h, ssse3); \
81-
mc_rep_func(avg, sz, hsz, h, ssse3); \
82-
mc_rep_func(put, sz, hsz, v, ssse3); \
83-
mc_rep_func(avg, sz, hsz, v, ssse3)
84+
#define mc_rep_funcs(sz, hsz, opt) \
85+
mc_rep_func(put, sz, hsz, h, opt); \
86+
mc_rep_func(avg, sz, hsz, h, opt); \
87+
mc_rep_func(put, sz, hsz, v, opt); \
88+
mc_rep_func(avg, sz, hsz, v, opt)
8489

8590
#if ARCH_X86_32
86-
mc_rep_funcs(16, 8);
91+
mc_rep_funcs(16, 8, ssse3);
92+
#endif
93+
mc_rep_funcs(32, 16, ssse3);
94+
mc_rep_funcs(64, 32, ssse3);
95+
#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
96+
mc_rep_funcs(64, 32, avx2);
8797
#endif
88-
mc_rep_funcs(32, 16);
89-
mc_rep_funcs(64, 32);
9098

9199
#undef mc_rep_funcs
92100
#undef mc_rep_func
93101

94-
extern const int8_t ff_filters_ssse3[3][15][4][16];
102+
extern const int8_t ff_filters_ssse3[3][15][4][32];
95103

96-
#define filter_8tap_2d_fn(op, sz, f, fname) \
97-
static void op##_8tap_##fname##_##sz##hv_ssse3(uint8_t *dst, ptrdiff_t dst_stride, \
104+
#define filter_8tap_2d_fn(op, sz, f, fname, align, opt) \
105+
static void op##_8tap_##fname##_##sz##hv_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
98106
const uint8_t *src, ptrdiff_t src_stride, \
99107
int h, int mx, int my) \
100108
{ \
101-
LOCAL_ALIGNED_16(uint8_t, temp, [71 * 64]); \
102-
ff_vp9_put_8tap_1d_h_##sz##_ssse3(temp, 64, src - 3 * src_stride, src_stride, \
109+
LOCAL_ALIGNED_##align(uint8_t, temp, [71 * 64]); \
110+
ff_vp9_put_8tap_1d_h_##sz##_##opt(temp, 64, src - 3 * src_stride, src_stride, \
103111
h + 7, ff_filters_ssse3[f][mx - 1]); \
104-
ff_vp9_##op##_8tap_1d_v_##sz##_ssse3(dst, dst_stride, temp + 3 * 64, 64, \
112+
ff_vp9_##op##_8tap_1d_v_##sz##_##opt(dst, dst_stride, temp + 3 * 64, 64, \
105113
h, ff_filters_ssse3[f][my - 1]); \
106114
}
107115

108-
#define filters_8tap_2d_fn(op, sz) \
109-
filter_8tap_2d_fn(op, sz, FILTER_8TAP_REGULAR, regular) \
110-
filter_8tap_2d_fn(op, sz, FILTER_8TAP_SHARP, sharp) \
111-
filter_8tap_2d_fn(op, sz, FILTER_8TAP_SMOOTH, smooth)
112-
113-
#define filters_8tap_2d_fn2(op) \
114-
filters_8tap_2d_fn(op, 64) \
115-
filters_8tap_2d_fn(op, 32) \
116-
filters_8tap_2d_fn(op, 16) \
117-
filters_8tap_2d_fn(op, 8) \
118-
filters_8tap_2d_fn(op, 4)
119-
120-
filters_8tap_2d_fn2(put)
121-
filters_8tap_2d_fn2(avg)
116+
#define filters_8tap_2d_fn(op, sz, align, opt) \
117+
filter_8tap_2d_fn(op, sz, FILTER_8TAP_REGULAR, regular, align, opt) \
118+
filter_8tap_2d_fn(op, sz, FILTER_8TAP_SHARP, sharp, align, opt) \
119+
filter_8tap_2d_fn(op, sz, FILTER_8TAP_SMOOTH, smooth, align, opt)
120+
121+
#define filters_8tap_2d_fn2(op, align, opt) \
122+
filters_8tap_2d_fn(op, 64, align, opt) \
123+
filters_8tap_2d_fn(op, 32, align, opt) \
124+
filters_8tap_2d_fn(op, 16, align, opt) \
125+
filters_8tap_2d_fn(op, 8, align, opt) \
126+
filters_8tap_2d_fn(op, 4, align, opt)
127+
128+
filters_8tap_2d_fn2(put, 16, ssse3)
129+
filters_8tap_2d_fn2(avg, 16, ssse3)
130+
#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
131+
filters_8tap_2d_fn(put, 64, 32, avx2)
132+
filters_8tap_2d_fn(put, 32, 32, avx2)
133+
filters_8tap_2d_fn(avg, 64, 32, avx2)
134+
filters_8tap_2d_fn(avg, 32, 32, avx2)
135+
#endif
122136

123137
#undef filters_8tap_2d_fn2
124138
#undef filters_8tap_2d_fn
125139
#undef filter_8tap_2d_fn
126140

127-
#define filter_8tap_1d_fn(op, sz, f, fname, dir, dvar) \
128-
static void op##_8tap_##fname##_##sz##dir##_ssse3(uint8_t *dst, ptrdiff_t dst_stride, \
141+
#define filter_8tap_1d_fn(op, sz, f, fname, dir, dvar, opt) \
142+
static void op##_8tap_##fname##_##sz##dir##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
129143
const uint8_t *src, ptrdiff_t src_stride, \
130144
int h, int mx, int my) \
131145
{ \
132-
ff_vp9_##op##_8tap_1d_##dir##_##sz##_ssse3(dst, dst_stride, src, src_stride, \
146+
ff_vp9_##op##_8tap_1d_##dir##_##sz##_##opt(dst, dst_stride, src, src_stride, \
133147
h, ff_filters_ssse3[f][dvar - 1]); \
134148
}
135149

136-
#define filters_8tap_1d_fn(op, sz, dir, dvar) \
137-
filter_8tap_1d_fn(op, sz, FILTER_8TAP_REGULAR, regular, dir, dvar) \
138-
filter_8tap_1d_fn(op, sz, FILTER_8TAP_SHARP, sharp, dir, dvar) \
139-
filter_8tap_1d_fn(op, sz, FILTER_8TAP_SMOOTH, smooth, dir, dvar)
140-
141-
#define filters_8tap_1d_fn2(op, sz) \
142-
filters_8tap_1d_fn(op, sz, h, mx) \
143-
filters_8tap_1d_fn(op, sz, v, my)
144-
145-
#define filters_8tap_1d_fn3(op) \
146-
filters_8tap_1d_fn2(op, 64) \
147-
filters_8tap_1d_fn2(op, 32) \
148-
filters_8tap_1d_fn2(op, 16) \
149-
filters_8tap_1d_fn2(op, 8) \
150-
filters_8tap_1d_fn2(op, 4)
151-
152-
filters_8tap_1d_fn3(put)
153-
filters_8tap_1d_fn3(avg)
150+
#define filters_8tap_1d_fn(op, sz, dir, dvar, opt) \
151+
filter_8tap_1d_fn(op, sz, FILTER_8TAP_REGULAR, regular, dir, dvar, opt) \
152+
filter_8tap_1d_fn(op, sz, FILTER_8TAP_SHARP, sharp, dir, dvar, opt) \
153+
filter_8tap_1d_fn(op, sz, FILTER_8TAP_SMOOTH, smooth, dir, dvar, opt)
154+
155+
#define filters_8tap_1d_fn2(op, sz, opt) \
156+
filters_8tap_1d_fn(op, sz, h, mx, opt) \
157+
filters_8tap_1d_fn(op, sz, v, my, opt)
158+
159+
#define filters_8tap_1d_fn3(op, opt) \
160+
filters_8tap_1d_fn2(op, 64, opt) \
161+
filters_8tap_1d_fn2(op, 32, opt) \
162+
filters_8tap_1d_fn2(op, 16, opt) \
163+
filters_8tap_1d_fn2(op, 8, opt) \
164+
filters_8tap_1d_fn2(op, 4, opt)
165+
166+
filters_8tap_1d_fn3(put, ssse3)
167+
filters_8tap_1d_fn3(avg, ssse3)
168+
#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
169+
filters_8tap_1d_fn2(put, 64, avx2)
170+
filters_8tap_1d_fn2(put, 32, avx2)
171+
filters_8tap_1d_fn2(avg, 64, avx2)
172+
filters_8tap_1d_fn2(avg, 32, avx2)
173+
#endif
154174

155175
#undef filters_8tap_1d_fn
156176
#undef filters_8tap_1d_fn2
@@ -270,9 +290,12 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
270290
dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = type##_8tap_regular_##sz##dir##_##opt; \
271291
dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][idxh][idxv] = type##_8tap_sharp_##sz##dir##_##opt
272292

273-
#define init_subpel2(idx, idxh, idxv, dir, type, opt) \
293+
#define init_subpel2_32_64(idx, idxh, idxv, dir, type, opt) \
274294
init_subpel1(0, idx, idxh, idxv, 64, dir, type, opt); \
275-
init_subpel1(1, idx, idxh, idxv, 32, dir, type, opt); \
295+
init_subpel1(1, idx, idxh, idxv, 32, dir, type, opt)
296+
297+
#define init_subpel2(idx, idxh, idxv, dir, type, opt) \
298+
init_subpel2_32_64(idx, idxh, idxv, dir, type, opt); \
276299
init_subpel1(2, idx, idxh, idxv, 16, dir, type, opt); \
277300
init_subpel1(3, idx, idxh, idxv, 8, dir, type, opt); \
278301
init_subpel1(4, idx, idxh, idxv, 4, dir, type, opt)
@@ -389,13 +412,27 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
389412
dsp->itxfm_add[TX_32X32][DCT_ADST] =
390413
dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx;
391414
}
415+
init_fpel(1, 0, 32, put, avx);
416+
init_fpel(0, 0, 64, put, avx);
392417
init_lpf(avx);
393418
init_ipred(TX_8X8, 8, avx);
394419
init_ipred(TX_16X16, 16, avx);
395420
init_ipred(TX_32X32, 32, avx);
396421
}
397422

398423
if (EXTERNAL_AVX2(cpu_flags)) {
424+
init_fpel(1, 1, 32, avg, avx2);
425+
init_fpel(0, 1, 64, avg, avx2);
426+
if (ARCH_X86_64) {
427+
#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
428+
init_subpel2_32_64(0, 1, 1, hv, put, avx2);
429+
init_subpel2_32_64(0, 0, 1, v, put, avx2);
430+
init_subpel2_32_64(0, 1, 0, h, put, avx2);
431+
init_subpel2_32_64(1, 1, 1, hv, avg, avx2);
432+
init_subpel2_32_64(1, 0, 1, v, avg, avx2);
433+
init_subpel2_32_64(1, 1, 0, h, avg, avx2);
434+
#endif
435+
}
399436
dsp->intra_pred[TX_32X32][DC_PRED] = ff_vp9_ipred_dc_32x32_avx2;
400437
dsp->intra_pred[TX_32X32][LEFT_DC_PRED] = ff_vp9_ipred_dc_left_32x32_avx2;
401438
dsp->intra_pred[TX_32X32][TOP_DC_PRED] = ff_vp9_ipred_dc_top_32x32_avx2;

0 commit comments

Comments
 (0)