|
29 | 29 |
|
30 | 30 | %include "libavutil/x86/x86util.asm" |
31 | 31 |
|
32 | | -SECTION_RODATA |
| 32 | +SECTION_RODATA 32 |
33 | 33 |
|
; Word constants. pw_m256 and pw_m255 are used below as pshufb controls:
; -256 (0xFF00) is the byte pair (0x00, 0xFF) and -255 (0xFF01) is (0x01, 0xFF),
; so each output word becomes source byte 0 (resp. byte 1) zero-extended.
; The added (+) definitions hold 16 words (32 bytes) so the AVX2 tm predictor
; can load them into a full ymm register.
34 | | -pw_m256: times 8 dw -256 |
35 | | -pw_m255: times 8 dw -255 |
| 34 | +pw_m256: times 16 dw -256 |
| 35 | +pw_m255: times 16 dw -255 |
; pmulhrsw by 512 / 1024 computes (x + 32) >> 6 / (x + 16) >> 5.
36 | 36 | pw_512: times 8 dw 512 |
37 | 37 | pw_1024: times 8 dw 1024 |
38 | 38 | pw_2048: times 8 dw 2048 |
@@ -72,12 +72,11 @@ pb_3to1_5x0: db 3, 2, 1 |
72 | 72 | times 9 db 0 |
73 | 73 | pb_Fto0: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 |
74 | 74 |
|
; Constant byte vectors. pb_1, pb_2 and pb_3 are defined here at 32 bytes each
; (pb_1 and pb_3 were previously imported via cextern); the AVX2 h predictor
; loads them into ymm registers and uses them as pshufb byte indices.
75 | | -pb_2: times 16 db 2 |
| 75 | +pb_1: times 32 db 1 |
| 76 | +pb_2: times 32 db 2 |
| 77 | +pb_3: times 32 db 3 |
76 | 78 | pb_15: times 16 db 15 |
77 | 79 |
|
78 | | -cextern pb_1 |
79 | | -cextern pb_3 |
80 | | - |
81 | 80 | SECTION .text |
82 | 81 |
|
83 | 82 | ; dc_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a) |
@@ -180,6 +179,38 @@ cglobal vp9_ipred_dc_32x32, 4, 4, 5, dst, stride, l, a |
180 | 179 | jg .loop |
181 | 180 | RET |
182 | 181 |
|
; AVX2 dc_32x32: fill a 32x32 block with the rounded average of the 32 bytes
; at l and the 32 bytes at a, i.e. (sum + 32) >> 6.
| 182 | +INIT_YMM avx2 |
| 183 | +cglobal vp9_ipred_dc_32x32, 4, 4, 3, dst, stride, l, a |
; Aligned 32-byte load from l, unaligned 32-byte load from a.
| 184 | + mova m0, [lq] |
| 185 | + movu m1, [aq] |
; l and a have been read; their registers are renamed for reuse.
| 186 | + DEFINE_ARGS dst, stride, stride3, cnt |
| 187 | + lea stride3q, [strideq*3] |
; psadbw against zero sums each group of 8 bytes into one word per qword.
| 188 | + pxor m2, m2 |
| 189 | + psadbw m0, m2 |
| 190 | + psadbw m1, m2 |
| 191 | + paddw m0, m1 |
; Horizontal reduction: fold the upper 128-bit lane, then the upper qword,
; leaving the total of all 64 bytes in the low word of xm0.
| 192 | + vextracti128 xm1, m0, 1 |
| 193 | + paddw xm0, xm1 |
| 194 | + movhlps xm1, xm0 |
| 195 | + paddw xm0, xm1 |
; pmulhrsw by 512 is (x*512 + 0x4000) >> 15 == (x + 32) >> 6.
| 196 | + pmulhrsw xm0, [pw_512] |
; Replicate the resulting byte across all 32 bytes of m0.
| 197 | + vpbroadcastb m0, xm0 |
; 4 iterations x 8 rows = 32 rows, one 32-byte store per row.
| 198 | + mov cntd, 4 |
| 199 | +.loop: |
| 200 | + movu [dstq+strideq*0], m0 |
| 201 | + movu [dstq+strideq*1], m0 |
| 202 | + movu [dstq+strideq*2], m0 |
| 203 | + movu [dstq+stride3q ], m0 |
| 204 | + lea dstq, [dstq+strideq*4] |
| 205 | + movu [dstq+strideq*0], m0 |
| 206 | + movu [dstq+strideq*1], m0 |
| 207 | + movu [dstq+strideq*2], m0 |
| 208 | + movu [dstq+stride3q ], m0 |
| 209 | + lea dstq, [dstq+strideq*4] |
| 210 | + dec cntd |
| 211 | + jg .loop |
| 212 | + RET |
| 213 | + |
183 | 214 | ; dc_top/left_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a) |
184 | 215 |
|
185 | 216 | %macro DC_1D_FUNCS 2 ; dir (top or left), arg (a or l) |
@@ -267,6 +298,35 @@ cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a |
267 | 298 | dec cntd |
268 | 299 | jg .loop |
269 | 300 | RET |
| 301 | + |
; AVX2 dc_top/dc_left 32x32: fill a 32x32 block with the rounded average of
; the 32 bytes at the edge selected by macro argument %2 (a or l),
; i.e. (sum + 16) >> 5.
| 302 | +INIT_YMM avx2 |
| 303 | +cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a |
| 304 | + movu m0, [%2q] |
; The edge has been read; the l/a registers are renamed for reuse.
| 305 | + DEFINE_ARGS dst, stride, stride3, cnt |
| 306 | + lea stride3q, [strideq*3] |
; psadbw against zero sums each group of 8 bytes into one word per qword.
| 307 | + pxor m2, m2 |
| 308 | + psadbw m0, m2 |
; Fold the upper lane and then the upper qword into the low word of xm0.
| 309 | + vextracti128 xm1, m0, 1 |
| 310 | + paddw xm0, xm1 |
| 311 | + movhlps xm1, xm0 |
| 312 | + paddw xm0, xm1 |
; pmulhrsw by 1024 is (x*1024 + 0x4000) >> 15 == (x + 16) >> 5.
| 313 | + pmulhrsw xm0, [pw_1024] |
; Replicate the resulting byte across all 32 bytes of m0.
| 314 | + vpbroadcastb m0, xm0 |
; 4 iterations x 8 rows = 32 rows, one 32-byte store per row.
| 315 | + mov cntd, 4 |
| 316 | +.loop: |
| 317 | + movu [dstq+strideq*0], m0 |
| 318 | + movu [dstq+strideq*1], m0 |
| 319 | + movu [dstq+strideq*2], m0 |
| 320 | + movu [dstq+stride3q ], m0 |
| 321 | + lea dstq, [dstq+strideq*4] |
| 322 | + movu [dstq+strideq*0], m0 |
| 323 | + movu [dstq+strideq*1], m0 |
| 324 | + movu [dstq+strideq*2], m0 |
| 325 | + movu [dstq+stride3q ], m0 |
| 326 | + lea dstq, [dstq+strideq*4] |
| 327 | + dec cntd |
| 328 | + jg .loop |
| 329 | + RET |
270 | 330 | %endmacro |
271 | 331 |
|
272 | 332 | DC_1D_FUNCS top, a |
@@ -327,6 +387,27 @@ cglobal vp9_ipred_v_32x32, 4, 4, 2, dst, stride, l, a |
327 | 387 | jg .loop |
328 | 388 | RET |
329 | 389 |
|
; AVX2 v_32x32: copy the 32 bytes at a into each of 32 destination rows.
| 390 | +INIT_YMM avx2 |
| 391 | +cglobal vp9_ipred_v_32x32, 4, 4, 1, dst, stride, l, a |
; Unaligned 32-byte load of the row that will be replicated.
| 392 | + movu m0, [aq] |
; l is never read here; the l/a registers are renamed for reuse.
| 393 | + DEFINE_ARGS dst, stride, stride3, cnt |
| 394 | + lea stride3q, [strideq*3] |
; 4 iterations x 8 rows = 32 rows.
| 395 | + mov cntd, 4 |
| 396 | +.loop: |
| 397 | + movu [dstq+strideq*0], m0 |
| 398 | + movu [dstq+strideq*1], m0 |
| 399 | + movu [dstq+strideq*2], m0 |
| 400 | + movu [dstq+stride3q ], m0 |
| 401 | + lea dstq, [dstq+strideq*4] |
| 402 | + movu [dstq+strideq*0], m0 |
| 403 | + movu [dstq+strideq*1], m0 |
| 404 | + movu [dstq+strideq*2], m0 |
| 405 | + movu [dstq+stride3q ], m0 |
| 406 | + lea dstq, [dstq+strideq*4] |
| 407 | + dec cntd |
| 408 | + jg .loop |
| 409 | + RET |
| 410 | + |
330 | 411 | ; h |
331 | 412 |
|
332 | 413 | INIT_XMM ssse3 |
@@ -417,6 +498,30 @@ cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt |
417 | 498 | H_XMM_FUNCS ssse3 |
418 | 499 | H_XMM_FUNCS avx |
419 | 500 |
|
; AVX2 h_32x32: each destination row is a single byte from l replicated
; 32 times. l is consumed from its highest offset downwards: the first row
; written uses l[31] and the last uses l[0].
| 501 | +INIT_YMM avx2 |
| 502 | +cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt |
; pshufb index vectors: all-1, all-2, all-3 and (m4) all-0.
| 503 | + mova m5, [pb_1] |
| 504 | + mova m6, [pb_2] |
| 505 | + mova m7, [pb_3] |
| 506 | + pxor m4, m4 |
| 507 | + lea stride3q, [strideq*3] |
; cnt runs 7..0 (dec + jge), so 8 iterations x 4 rows = 32 rows.
| 508 | + mov cntq, 7 |
| 509 | +.loop: |
; Load 4 bytes of l and copy them into both 128-bit lanes, because pshufb
; only selects bytes within its own lane.
| 510 | + movd xm3, [lq+cntq*4] |
| 511 | + vinserti128 m3, m3, xm3, 1 |
; Broadcast byte 3, then 2, 1 and 0 of the group into four consecutive rows.
| 512 | + pshufb m0, m3, m7 |
| 513 | + pshufb m1, m3, m6 |
| 514 | + movu [dstq+strideq*0], m0 |
| 515 | + movu [dstq+strideq*1], m1 |
| 516 | + pshufb m2, m3, m5 |
| 517 | + pshufb m3, m4 |
| 518 | + movu [dstq+strideq*2], m2 |
| 519 | + movu [dstq+stride3q ], m3 |
| 520 | + lea dstq, [dstq+strideq*4] |
| 521 | + dec cntq |
| 522 | + jge .loop |
| 523 | + RET |
| 524 | + |
420 | 525 | ; tm |
421 | 526 |
|
422 | 527 | INIT_MMX ssse3 |
@@ -554,6 +659,39 @@ cglobal vp9_ipred_tm_32x32, 4, 4, 14, dst, stride, l, a |
554 | 659 | TM_XMM_FUNCS ssse3 |
555 | 660 | TM_XMM_FUNCS avx |
556 | 661 |
|
; AVX2 tm_32x32: for each row, dst[x] = clip_u8(l_byte + a[x] - a[-1]).
; The arithmetic is done on 16-bit words; packuswb supplies the unsigned
; saturation. l is consumed from its highest offset downwards (first row
; written uses l[31]).
| 662 | +INIT_YMM avx2 |
| 663 | +cglobal vp9_ipred_tm_32x32, 4, 4, 8, dst, stride, l, a |
| 664 | + pxor m3, m3 |
; Load the word at a-1 (its low byte is a[-1]) and copy it to both lanes.
; The other bytes of xm2 are never selected by the shuffle below.
| 665 | + pinsrw xm2, [aq-1], 0 |
| 666 | + vinserti128 m2, m2, xm2, 1 |
| 667 | + movu m0, [aq] |
; a has been read; its register is renamed to cnt.
| 668 | + DEFINE_ARGS dst, stride, l, cnt |
; Shuffle controls: m4 picks byte 0, m5 picks byte 1, each zero-extended
; into every word.
| 669 | + mova m4, [pw_m256] |
| 670 | + mova m5, [pw_m255] |
; m2 = a[-1] replicated as words.
| 671 | + pshufb m2, m4 |
; Widen a to words; per 128-bit lane m0 takes the low 8 bytes and m1 the
; high 8 bytes.
| 672 | + punpckhbw m1, m0, m3 |
| 673 | + punpcklbw m0, m3 |
; m0/m1 = a[x] - a[-1], constant for every row.
| 674 | + psubw m1, m2 |
| 675 | + psubw m0, m2 |
; cnt runs 15..0 (dec + jge), so 16 iterations x 2 rows = 32 rows.
| 676 | + mov cntq, 15 |
| 677 | +.loop: |
; Load two bytes of l and copy them into both lanes.
| 678 | + pinsrw xm7, [lq+cntq*2], 0 |
| 679 | + vinserti128 m7, m7, xm7, 1 |
; m3 = l[2*cnt+1] as words (first row), m7 = l[2*cnt] as words (second row).
| 680 | + pshufb m3, m7, m5 |
| 681 | + pshufb m7, m4 |
| 682 | + paddw m2, m3, m0 |
| 683 | + paddw m3, m1 |
| 684 | + paddw m6, m7, m0 |
| 685 | + paddw m7, m1 |
; Saturate to unsigned bytes; the in-lane pack restores the original byte
; order that was split by the in-lane unpack above.
| 686 | + packuswb m2, m3 |
| 687 | + packuswb m6, m7 |
| 688 | + movu [dstq+strideq*0], m2 |
| 689 | + movu [dstq+strideq*1], m6 |
| 690 | + lea dstq, [dstq+strideq*2] |
| 691 | + dec cntq |
| 692 | + jge .loop |
| 693 | + RET |
| 694 | + |
557 | 695 | ; dl |
558 | 696 |
|
559 | 697 | %macro LOWPASS 4 ; left [dst], center, right, tmp |
|
0 commit comments