@@ -43,22 +43,27 @@ fpel_func(avg, 8, mmxext);
4343fpel_func (avg , 16 , sse2 );
4444fpel_func (avg , 32 , sse2 );
4545fpel_func (avg , 64 , sse2 );
46+ fpel_func (put , 32 , avx );
47+ fpel_func (put , 64 , avx );
48+ fpel_func (avg , 32 , avx2 );
49+ fpel_func (avg , 64 , avx2 );
4650#undef fpel_func
4751
4852#define mc_func (avg , sz , dir , opt ) \
4953void ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
5054 const uint8_t *src, ptrdiff_t src_stride, \
51- int h, const int8_t (*filter)[16 ])
52- #define mc_funcs (sz ) \
53- mc_func(put, sz, h, ssse3 ); \
54- mc_func(avg, sz, h, ssse3 ); \
55- mc_func(put, sz, v, ssse3 ); \
56- mc_func(avg, sz, v, ssse3 )
57-
58- mc_funcs (4 );
59- mc_funcs (8 );
55+ int h, const int8_t (*filter)[32 ])
56+ #define mc_funcs (sz , opt ) \
57+ mc_func(put, sz, h, opt ); \
58+ mc_func(avg, sz, h, opt ); \
59+ mc_func(put, sz, v, opt ); \
60+ mc_func(avg, sz, v, opt )
61+
62+ mc_funcs (4 , ssse3 );
63+ mc_funcs (8 , ssse3 );
6064#if ARCH_X86_64
61- mc_funcs (16 );
65+ mc_funcs (16 , ssse3 );
66+ mc_funcs (32 , avx2 );
6267#endif
6368
6469#undef mc_funcs
@@ -68,89 +73,104 @@ mc_funcs(16);
6873static av_always_inline void \
6974ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
7075 const uint8_t *src, ptrdiff_t src_stride, \
71- int h, const int8_t (*filter)[16 ]) \
76+ int h, const int8_t (*filter)[32 ]) \
7277{ \
7378 ff_vp9_##avg##_8tap_1d_##dir##_##hsz##_##opt(dst, dst_stride, src, \
7479 src_stride, h, filter); \
7580 ff_vp9_##avg##_8tap_1d_##dir##_##hsz##_##opt(dst + hsz, dst_stride, src + hsz, \
7681 src_stride, h, filter); \
7782}
7883
79- #define mc_rep_funcs (sz , hsz ) \
80- mc_rep_func(put, sz, hsz, h, ssse3 ); \
81- mc_rep_func(avg, sz, hsz, h, ssse3 ); \
82- mc_rep_func(put, sz, hsz, v, ssse3 ); \
83- mc_rep_func(avg, sz, hsz, v, ssse3 )
84+ #define mc_rep_funcs (sz , hsz , opt ) \
85+ mc_rep_func(put, sz, hsz, h, opt ); \
86+ mc_rep_func(avg, sz, hsz, h, opt ); \
87+ mc_rep_func(put, sz, hsz, v, opt ); \
88+ mc_rep_func(avg, sz, hsz, v, opt )
8489
8590#if ARCH_X86_32
86- mc_rep_funcs (16 , 8 );
91+ mc_rep_funcs (16 , 8 , ssse3 );
92+ #endif
93+ mc_rep_funcs (32 , 16 , ssse3 );
94+ mc_rep_funcs (64 , 32 , ssse3 );
95+ #if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
96+ mc_rep_funcs (64 , 32 , avx2 );
8797#endif
88- mc_rep_funcs (32 , 16 );
89- mc_rep_funcs (64 , 32 );
9098
9199#undef mc_rep_funcs
92100#undef mc_rep_func
93101
94- extern const int8_t ff_filters_ssse3 [3 ][15 ][4 ][16 ];
102+ extern const int8_t ff_filters_ssse3 [3 ][15 ][4 ][32 ];
95103
96- #define filter_8tap_2d_fn (op , sz , f , fname ) \
97- static void op##_8tap_##fname##_##sz##hv_ssse3 (uint8_t *dst, ptrdiff_t dst_stride, \
104+ #define filter_8tap_2d_fn (op , sz , f , fname , align , opt ) \
105+ static void op##_8tap_##fname##_##sz##hv_##opt (uint8_t *dst, ptrdiff_t dst_stride, \
98106 const uint8_t *src, ptrdiff_t src_stride, \
99107 int h, int mx, int my) \
100108{ \
101- LOCAL_ALIGNED_16 (uint8_t, temp, [71 * 64]); \
102- ff_vp9_put_8tap_1d_h_##sz##_ssse3 (temp, 64, src - 3 * src_stride, src_stride, \
109+ LOCAL_ALIGNED_##align (uint8_t, temp, [71 * 64]); \
110+ ff_vp9_put_8tap_1d_h_##sz##_##opt (temp, 64, src - 3 * src_stride, src_stride, \
103111 h + 7, ff_filters_ssse3[f][mx - 1]); \
104- ff_vp9_##op##_8tap_1d_v_##sz##_ssse3 (dst, dst_stride, temp + 3 * 64, 64, \
112+ ff_vp9_##op##_8tap_1d_v_##sz##_##opt (dst, dst_stride, temp + 3 * 64, 64, \
105113 h, ff_filters_ssse3[f][my - 1]); \
106114}
107115
108- #define filters_8tap_2d_fn (op , sz ) \
109- filter_8tap_2d_fn(op, sz, FILTER_8TAP_REGULAR, regular) \
110- filter_8tap_2d_fn(op, sz, FILTER_8TAP_SHARP, sharp) \
111- filter_8tap_2d_fn(op, sz, FILTER_8TAP_SMOOTH, smooth)
112-
113- #define filters_8tap_2d_fn2 (op ) \
114- filters_8tap_2d_fn(op, 64) \
115- filters_8tap_2d_fn(op, 32) \
116- filters_8tap_2d_fn(op, 16) \
117- filters_8tap_2d_fn(op, 8) \
118- filters_8tap_2d_fn(op, 4)
119-
120- filters_8tap_2d_fn2 (put )
121- filters_8tap_2d_fn2 (avg )
116+ #define filters_8tap_2d_fn (op , sz , align , opt ) \
117+ filter_8tap_2d_fn(op, sz, FILTER_8TAP_REGULAR, regular, align, opt) \
118+ filter_8tap_2d_fn(op, sz, FILTER_8TAP_SHARP, sharp, align, opt) \
119+ filter_8tap_2d_fn(op, sz, FILTER_8TAP_SMOOTH, smooth, align, opt)
120+
121+ #define filters_8tap_2d_fn2 (op , align , opt ) \
122+ filters_8tap_2d_fn(op, 64, align, opt) \
123+ filters_8tap_2d_fn(op, 32, align, opt) \
124+ filters_8tap_2d_fn(op, 16, align, opt) \
125+ filters_8tap_2d_fn(op, 8, align, opt) \
126+ filters_8tap_2d_fn(op, 4, align, opt)
127+
128+ filters_8tap_2d_fn2 (put , 16 , ssse3 )
129+ filters_8tap_2d_fn2 (avg , 16 , ssse3 )
130+ #if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
131+ filters_8tap_2d_fn (put , 64 , 32 , avx2 )
132+ filters_8tap_2d_fn (put , 32 , 32 , avx2 )
133+ filters_8tap_2d_fn (avg , 64 , 32 , avx2 )
134+ filters_8tap_2d_fn (avg , 32 , 32 , avx2 )
135+ #endif
122136
123137#undef filters_8tap_2d_fn2
124138#undef filters_8tap_2d_fn
125139#undef filter_8tap_2d_fn
126140
127- #define filter_8tap_1d_fn (op , sz , f , fname , dir , dvar ) \
128- static void op##_8tap_##fname##_##sz##dir##_ssse3 (uint8_t *dst, ptrdiff_t dst_stride, \
141+ #define filter_8tap_1d_fn (op , sz , f , fname , dir , dvar , opt ) \
142+ static void op##_8tap_##fname##_##sz##dir##_##opt (uint8_t *dst, ptrdiff_t dst_stride, \
129143 const uint8_t *src, ptrdiff_t src_stride, \
130144 int h, int mx, int my) \
131145{ \
132- ff_vp9_##op##_8tap_1d_##dir##_##sz##_ssse3 (dst, dst_stride, src, src_stride, \
146+ ff_vp9_##op##_8tap_1d_##dir##_##sz##_##opt (dst, dst_stride, src, src_stride, \
133147 h, ff_filters_ssse3[f][dvar - 1]); \
134148}
135149
136- #define filters_8tap_1d_fn (op , sz , dir , dvar ) \
137- filter_8tap_1d_fn(op, sz, FILTER_8TAP_REGULAR, regular, dir, dvar) \
138- filter_8tap_1d_fn(op, sz, FILTER_8TAP_SHARP, sharp, dir, dvar) \
139- filter_8tap_1d_fn(op, sz, FILTER_8TAP_SMOOTH, smooth, dir, dvar)
140-
141- #define filters_8tap_1d_fn2 (op , sz ) \
142- filters_8tap_1d_fn(op, sz, h, mx) \
143- filters_8tap_1d_fn(op, sz, v, my)
144-
145- #define filters_8tap_1d_fn3 (op ) \
146- filters_8tap_1d_fn2(op, 64) \
147- filters_8tap_1d_fn2(op, 32) \
148- filters_8tap_1d_fn2(op, 16) \
149- filters_8tap_1d_fn2(op, 8) \
150- filters_8tap_1d_fn2(op, 4)
151-
152- filters_8tap_1d_fn3 (put )
153- filters_8tap_1d_fn3 (avg )
150+ #define filters_8tap_1d_fn (op , sz , dir , dvar , opt ) \
151+ filter_8tap_1d_fn(op, sz, FILTER_8TAP_REGULAR, regular, dir, dvar, opt) \
152+ filter_8tap_1d_fn(op, sz, FILTER_8TAP_SHARP, sharp, dir, dvar, opt) \
153+ filter_8tap_1d_fn(op, sz, FILTER_8TAP_SMOOTH, smooth, dir, dvar, opt)
154+
155+ #define filters_8tap_1d_fn2 (op , sz , opt ) \
156+ filters_8tap_1d_fn(op, sz, h, mx, opt) \
157+ filters_8tap_1d_fn(op, sz, v, my, opt)
158+
159+ #define filters_8tap_1d_fn3 (op , opt ) \
160+ filters_8tap_1d_fn2(op, 64, opt) \
161+ filters_8tap_1d_fn2(op, 32, opt) \
162+ filters_8tap_1d_fn2(op, 16, opt) \
163+ filters_8tap_1d_fn2(op, 8, opt) \
164+ filters_8tap_1d_fn2(op, 4, opt)
165+
166+ filters_8tap_1d_fn3 (put , ssse3 )
167+ filters_8tap_1d_fn3 (avg , ssse3 )
168+ #if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
169+ filters_8tap_1d_fn2 (put , 64 , avx2 )
170+ filters_8tap_1d_fn2 (put , 32 , avx2 )
171+ filters_8tap_1d_fn2 (avg , 64 , avx2 )
172+ filters_8tap_1d_fn2 (avg , 32 , avx2 )
173+ #endif
154174
155175#undef filters_8tap_1d_fn
156176#undef filters_8tap_1d_fn2
@@ -270,9 +290,12 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
270290 dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = type##_8tap_regular_##sz##dir##_##opt; \
271291 dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][idxh][idxv] = type##_8tap_sharp_##sz##dir##_##opt
272292
273- #define init_subpel2 (idx , idxh , idxv , dir , type , opt ) \
293+ #define init_subpel2_32_64 (idx , idxh , idxv , dir , type , opt ) \
274294 init_subpel1(0, idx, idxh, idxv, 64, dir, type, opt); \
275- init_subpel1(1, idx, idxh, idxv, 32, dir, type, opt); \
295+ init_subpel1(1, idx, idxh, idxv, 32, dir, type, opt)
296+
297+ #define init_subpel2 (idx , idxh , idxv , dir , type , opt ) \
298+ init_subpel2_32_64(idx, idxh, idxv, dir, type, opt); \
276299 init_subpel1(2, idx, idxh, idxv, 16, dir, type, opt); \
277300 init_subpel1(3, idx, idxh, idxv, 8, dir, type, opt); \
278301 init_subpel1(4, idx, idxh, idxv, 4, dir, type, opt)
@@ -389,13 +412,27 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
389412 dsp -> itxfm_add [TX_32X32 ][DCT_ADST ] =
390413 dsp -> itxfm_add [TX_32X32 ][DCT_DCT ] = ff_vp9_idct_idct_32x32_add_avx ;
391414 }
415+ init_fpel (1 , 0 , 32 , put , avx );
416+ init_fpel (0 , 0 , 64 , put , avx );
392417 init_lpf (avx );
393418 init_ipred (TX_8X8 , 8 , avx );
394419 init_ipred (TX_16X16 , 16 , avx );
395420 init_ipred (TX_32X32 , 32 , avx );
396421 }
397422
398423 if (EXTERNAL_AVX2 (cpu_flags )) {
424+ init_fpel (1 , 1 , 32 , avg , avx2 );
425+ init_fpel (0 , 1 , 64 , avg , avx2 );
426+ if (ARCH_X86_64 ) {
427+ #if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
428+ init_subpel2_32_64 (0 , 1 , 1 , hv , put , avx2 );
429+ init_subpel2_32_64 (0 , 0 , 1 , v , put , avx2 );
430+ init_subpel2_32_64 (0 , 1 , 0 , h , put , avx2 );
431+ init_subpel2_32_64 (1 , 1 , 1 , hv , avg , avx2 );
432+ init_subpel2_32_64 (1 , 0 , 1 , v , avg , avx2 );
433+ init_subpel2_32_64 (1 , 1 , 0 , h , avg , avx2 );
434+ #endif
435+ }
399436 dsp -> intra_pred [TX_32X32 ][DC_PRED ] = ff_vp9_ipred_dc_32x32_avx2 ;
400437 dsp -> intra_pred [TX_32X32 ][LEFT_DC_PRED ] = ff_vp9_ipred_dc_left_32x32_avx2 ;
401438 dsp -> intra_pred [TX_32X32 ][TOP_DC_PRED ] = ff_vp9_ipred_dc_top_32x32_avx2 ;
0 commit comments