Skip to content

Commit 400445b

Browse files
committed
vp9: intra_pred avx2 WIP
1 parent c7b0890 commit 400445b

File tree

2 files changed

+161
-7
lines changed

2 files changed

+161
-7
lines changed

libavcodec/x86/vp9dsp_init.c

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,13 @@ ipred_funcs(hd, ssse3, avx);
241241
ipred_funcs(vl, ssse3, avx);
242242
ipred_funcs(vr, ssse3, avx);
243243

244+
/* AVX2 entries for the 32x32 block size only (dc, dc_left, dc_top, v, h, tm).
 * NOTE(review): the ipred_func macro body is not visible in this hunk;
 * presumably each line declares the matching ff_vp9_ipred_<type>_32x32_avx2
 * prototype that ff_vp9dsp_init_x86() assigns below -- confirm. */
ipred_func(32, dc, avx2);
245+
ipred_func(32, dc_left, avx2);
246+
ipred_func(32, dc_top, avx2);
247+
ipred_func(32, v, avx2);
248+
ipred_func(32, h, avx2);
249+
ipred_func(32, tm, avx2);
250+
244251
#undef ipred_funcs
245252
#undef ipred_func_set
246253
#undef ipred_func
@@ -388,6 +395,15 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
388395
init_ipred(TX_32X32, 32, avx);
389396
}
390397

398+
if (EXTERNAL_AVX2(cpu_flags)) {
399+
dsp->intra_pred[TX_32X32][DC_PRED] = ff_vp9_ipred_dc_32x32_avx2;
400+
dsp->intra_pred[TX_32X32][LEFT_DC_PRED] = ff_vp9_ipred_dc_left_32x32_avx2;
401+
dsp->intra_pred[TX_32X32][TOP_DC_PRED] = ff_vp9_ipred_dc_top_32x32_avx2;
402+
dsp->intra_pred[TX_32X32][VERT_PRED] = ff_vp9_ipred_v_32x32_avx2;
403+
dsp->intra_pred[TX_32X32][HOR_PRED] = ff_vp9_ipred_h_32x32_avx2;
404+
dsp->intra_pred[TX_32X32][TM_VP8_PRED] = ff_vp9_ipred_tm_32x32_avx2;
405+
}
406+
391407
#undef init_fpel
392408
#undef init_subpel1
393409
#undef init_subpel2

libavcodec/x86/vp9intrapred.asm

Lines changed: 145 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,10 @@
2929

3030
%include "libavutil/x86/x86util.asm"
3131

32-
SECTION_RODATA
32+
SECTION_RODATA 32
3333

34-
pw_m256: times 8 dw -256
35-
pw_m255: times 8 dw -255
34+
; 16 words (32 bytes) each so a full ymm register can be loaded from them,
; as the AVX2 tm predictor does. They serve as pshufb masks:
;   -256 = bytes {0x00, 0xFF}: every word becomes zero-extended source byte 0
;   -255 = bytes {0x01, 0xFF}: every word becomes zero-extended source byte 1
; (a mask byte with its top bit set makes pshufb write zero).
pw_m256: times 16 dw -256
35+
pw_m255: times 16 dw -255
3636
pw_512: times 8 dw 512
3737
pw_1024: times 8 dw 1024
3838
pw_2048: times 8 dw 2048
@@ -72,12 +72,11 @@ pb_3to1_5x0: db 3, 2, 1
7272
times 9 db 0
7373
pb_Fto0: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
7474

75-
pb_2: times 16 db 2
75+
; 32-byte byte-index constants; the AVX2 h predictor loads them into ymm
; registers and uses them as pshufb masks to broadcast byte 1, 2 or 3 of
; each 128-bit lane.
pb_1: times 32 db 1
76+
pb_2: times 32 db 2
77+
pb_3: times 32 db 3
7678
pb_15: times 16 db 15
7779

78-
cextern pb_1
79-
cextern pb_3
80-
8180
SECTION .text
8281

8382
; dc_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)
@@ -180,6 +179,38 @@ cglobal vp9_ipred_dc_32x32, 4, 4, 5, dst, stride, l, a
180179
jg .loop
181180
RET
182181

182+
; AVX2 DC predictor, 32x32: fills all 32 rows of dst with one byte value,
; the rounded average of the 32 bytes at l and the 32 bytes at a.
INIT_YMM avx2
183+
cglobal vp9_ipred_dc_32x32, 4, 4, 3, dst, stride, l, a
184+
; m0 = 32 bytes from l (aligned load), m1 = 32 bytes from a (unaligned load)
mova m0, [lq]
185+
movu m1, [aq]
186+
; l and a are dead from here on; their registers become stride3 and cnt
DEFINE_ARGS dst, stride, stride3, cnt
187+
lea stride3q, [strideq*3]
188+
pxor m2, m2
189+
; psadbw against zero sums each group of 8 bytes into one qword
psadbw m0, m2
190+
psadbw m1, m2
191+
paddw m0, m1
192+
; fold the four partial sums: upper 128-bit lane onto lower, then high qword
; onto low qword, leaving the sum of all 64 input bytes in the low word
vextracti128 xm1, m0, 1
193+
paddw xm0, xm1
194+
movhlps xm1, xm0
195+
paddw xm0, xm1
196+
; pmulhrsw by 512 computes (sum*512 + 0x4000) >> 15 = (sum + 32) >> 6
pmulhrsw xm0, [pw_512]
197+
; replicate the resulting byte across all 32 lanes
vpbroadcastb m0, xm0
198+
; 4 iterations x 8 rows per iteration = 32 rows
mov cntd, 4
199+
.loop:
200+
movu [dstq+strideq*0], m0
201+
movu [dstq+strideq*1], m0
202+
movu [dstq+strideq*2], m0
203+
movu [dstq+stride3q ], m0
204+
lea dstq, [dstq+strideq*4]
205+
movu [dstq+strideq*0], m0
206+
movu [dstq+strideq*1], m0
207+
movu [dstq+strideq*2], m0
208+
movu [dstq+stride3q ], m0
209+
lea dstq, [dstq+strideq*4]
210+
dec cntd
211+
jg .loop
212+
RET
213+
183214
; dc_top/left_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)
184215

185216
%macro DC_1D_FUNCS 2 ; dir (top or left), arg (a or l)
@@ -267,6 +298,35 @@ cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
267298
dec cntd
268299
jg .loop
269300
RET
301+
302+
; AVX2 one-sided DC predictor, 32x32 (instantiated by DC_1D_FUNCS with
; %1 = top/left and %2 = a/l): fills all 32 rows of dst with the rounded
; average of the 32 bytes at the %2 pointer.
INIT_YMM avx2
303+
cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
304+
; m0 = the 32 edge bytes selected by the macro argument
movu m0, [%2q]
305+
; l and a are dead from here on; their registers become stride3 and cnt
DEFINE_ARGS dst, stride, stride3, cnt
306+
lea stride3q, [strideq*3]
307+
pxor m2, m2
308+
; psadbw against zero sums each group of 8 bytes into one qword
psadbw m0, m2
309+
; fold upper lane onto lower, then high qword onto low qword: low word of
; xm0 ends up holding the sum of all 32 input bytes
vextracti128 xm1, m0, 1
310+
paddw xm0, xm1
311+
movhlps xm1, xm0
312+
paddw xm0, xm1
313+
; pmulhrsw by 1024 computes (sum*1024 + 0x4000) >> 15 = (sum + 16) >> 5
pmulhrsw xm0, [pw_1024]
314+
; replicate the resulting byte across all 32 lanes
vpbroadcastb m0, xm0
315+
; 4 iterations x 8 rows per iteration = 32 rows
mov cntd, 4
316+
.loop:
317+
movu [dstq+strideq*0], m0
318+
movu [dstq+strideq*1], m0
319+
movu [dstq+strideq*2], m0
320+
movu [dstq+stride3q ], m0
321+
lea dstq, [dstq+strideq*4]
322+
movu [dstq+strideq*0], m0
323+
movu [dstq+strideq*1], m0
324+
movu [dstq+strideq*2], m0
325+
movu [dstq+stride3q ], m0
326+
lea dstq, [dstq+strideq*4]
327+
dec cntd
328+
jg .loop
329+
RET
270330
%endmacro
271331

272332
DC_1D_FUNCS top, a
@@ -327,6 +387,27 @@ cglobal vp9_ipred_v_32x32, 4, 4, 2, dst, stride, l, a
327387
jg .loop
328388
RET
329389

390+
; AVX2 vertical predictor, 32x32: copies the 32 bytes at a into each of the
; 32 rows of dst. The l argument is not read.
INIT_YMM avx2
391+
cglobal vp9_ipred_v_32x32, 4, 4, 1, dst, stride, l, a
392+
; m0 = the 32 bytes to replicate down the block
movu m0, [aq]
393+
; the l and a registers are reused as stride3 and cnt
DEFINE_ARGS dst, stride, stride3, cnt
394+
lea stride3q, [strideq*3]
395+
; 4 iterations x 8 rows per iteration = 32 rows
mov cntd, 4
396+
.loop:
397+
movu [dstq+strideq*0], m0
398+
movu [dstq+strideq*1], m0
399+
movu [dstq+strideq*2], m0
400+
movu [dstq+stride3q ], m0
401+
lea dstq, [dstq+strideq*4]
402+
movu [dstq+strideq*0], m0
403+
movu [dstq+strideq*1], m0
404+
movu [dstq+strideq*2], m0
405+
movu [dstq+stride3q ], m0
406+
lea dstq, [dstq+strideq*4]
407+
dec cntd
408+
jg .loop
409+
RET
410+
330411
; h
331412

332413
INIT_XMM ssse3
@@ -417,6 +498,30 @@ cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt
417498
H_XMM_FUNCS ssse3
418499
H_XMM_FUNCS avx
419500

501+
; AVX2 horizontal predictor, 32x32: each row of dst is filled with a single
; byte taken from l. The l array is consumed from its highest index downwards:
; the first row written uses l[31], the last row written uses l[0].
INIT_YMM avx2
502+
cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt
503+
; pshufb masks that broadcast byte 1 / 2 / 3 of each 128-bit lane;
; m4 (all zero) broadcasts byte 0
mova m5, [pb_1]
504+
mova m6, [pb_2]
505+
mova m7, [pb_3]
506+
pxor m4, m4
507+
lea stride3q, [strideq*3]
508+
; cnt runs 7..0: 8 iterations x 4 rows per iteration = 32 rows
mov cntq, 7
509+
.loop:
510+
; load l[cnt*4 .. cnt*4+3] and mirror it into both 128-bit lanes, since
; pshufb only shuffles within a lane
movd xm3, [lq+cntq*4]
511+
vinserti128 m3, m3, xm3, 1
512+
; row 0 <- l[cnt*4+3], row 1 <- l[cnt*4+2]
pshufb m0, m3, m7
513+
pshufb m1, m3, m6
514+
movu [dstq+strideq*0], m0
515+
movu [dstq+strideq*1], m1
516+
; row 2 <- l[cnt*4+1], row 3 <- l[cnt*4+0]
pshufb m2, m3, m5
517+
pshufb m3, m4
518+
movu [dstq+strideq*2], m2
519+
movu [dstq+stride3q ], m3
520+
lea dstq, [dstq+strideq*4]
521+
dec cntq
522+
jge .loop
523+
RET
524+
420525
; tm
421526

422527
INIT_MMX ssse3
@@ -554,6 +659,39 @@ cglobal vp9_ipred_tm_32x32, 4, 4, 14, dst, stride, l, a
554659
TM_XMM_FUNCS ssse3
555660
TM_XMM_FUNCS avx
556661

662+
; AVX2 TM predictor, 32x32: every output byte is
;   clip_u8(left + a[x] - a[-1])
; where left is one byte of l per row. The l array is consumed from its
; highest index downwards (first row written uses l[31]).
INIT_YMM avx2
663+
cglobal vp9_ipred_tm_32x32, 4, 4, 8, dst, stride, l, a
664+
pxor m3, m3
665+
; word 0 of xm2 = bytes a[-1], a[0]; only byte 0 (a[-1]) is used below.
; Mirror into the upper lane because pshufb works per 128-bit lane.
pinsrw xm2, [aq-1], 0
666+
vinserti128 m2, m2, xm2, 1
667+
; m0 = a[0..31]
movu m0, [aq]
668+
; a is dead from here on; its register becomes cnt
DEFINE_ARGS dst, stride, l, cnt
669+
; m4 mask: every word <- zero-extended byte 0; m5 mask: every word <- byte 1
mova m4, [pw_m256]
670+
mova m5, [pw_m255]
671+
; m2 = a[-1] replicated as 16 words
pshufb m2, m4
672+
; widen a[] to words: m0 = low 8 bytes of each lane, m1 = high 8 bytes of
; each lane (m3 is zero here), then subtract a[-1] from both halves
punpckhbw m1, m0, m3
673+
punpcklbw m0, m3
674+
psubw m1, m2
675+
psubw m0, m2
676+
; cnt runs 15..0: 16 iterations x 2 rows per iteration = 32 rows
mov cntq, 15
677+
.loop:
678+
; word 0 of xm7 = bytes l[cnt*2], l[cnt*2+1], mirrored into both lanes
pinsrw xm7, [lq+cntq*2], 0
679+
vinserti128 m7, m7, xm7, 1
680+
; m3 = l[cnt*2+1] as words (first row), m7 = l[cnt*2] as words (second row)
pshufb m3, m7, m5
681+
pshufb m7, m4
682+
; add the per-row left value to both halves of (a[x] - a[-1])
paddw m2, m3, m0
683+
paddw m3, m1
684+
paddw m6, m7, m0
685+
paddw m7, m1
686+
; packuswb saturates to 0..255 and, being per-lane like the unpacks above,
; puts the bytes back in their original order
packuswb m2, m3
687+
packuswb m6, m7
688+
movu [dstq+strideq*0], m2
689+
movu [dstq+strideq*1], m6
690+
lea dstq, [dstq+strideq*2]
691+
dec cntq
692+
jge .loop
693+
RET
694+
557695
; dl
558696

559697
%macro LOWPASS 4 ; left [dst], center, right, tmp

0 commit comments

Comments
 (0)