Skip to content

Commit

Permalink
Implement Mathias Benthrup's suggestion for x86 ASM snapvector implementation which reduces cache misses.
Browse files Browse the repository at this point in the history
  • Loading branch information
Thilo Schulz committed Sep 19, 2011
1 parent 98af5f4 commit c927fab
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 18 deletions.
27 changes: 15 additions & 12 deletions code/asm/snapvector.asm
Expand Up @@ -48,14 +48,15 @@ IFDEF idx64
stmxcsr [rsp] ; save SSE control word
ldmxcsr ssecw ; set to round nearest

push rdi
mov rdi, rcx ; maskmovdqu uses rdi as implicit memory operand
movaps xmm1, ssemask ; initialize the mask register for maskmovdqu
movups xmm0, [rdi] ; here is stored our vector. Read 4 values in one go
movaps xmm1, ssemask ; initialize the mask register
movups xmm0, [rcx] ; here is stored our vector. Read 4 values in one go
movaps xmm2, xmm0 ; keep a copy of the original data
andps xmm0, xmm1 ; set the fourth value to zero in xmm0
andnps xmm1, xmm2 ; copy fourth value to xmm1 and set rest to zero
cvtps2dq xmm0, xmm0 ; convert 4 single fp to int
cvtdq2ps xmm0, xmm0 ; convert 4 int to single fp
maskmovdqu xmm0, xmm1 ; write 3 values back to memory
pop rdi
orps xmm0, xmm1 ; combine all 4 values again
movups [rcx], xmm0 ; write 3 rounded and 1 unchanged values back to memory

ldmxcsr [rsp] ; restore sse control word to old value
add rsp, 8
Expand All @@ -69,14 +70,16 @@ ELSE
stmxcsr [esp] ; save SSE control word
ldmxcsr ssecw ; set to round nearest

push edi
mov edi, dword ptr 16[esp] ; maskmovdqu uses edi as implicit memory operand
movaps xmm1, ssemask ; initialize the mask register for maskmovdqu
movups xmm0, [edi] ; here is stored our vector. Read 4 values in one go
mov eax, dword ptr 16[esp] ; store address of vector in eax
movaps xmm1, ssemask ; initialize the mask register
movups xmm0, [eax] ; here is stored our vector. Read 4 values in one go
movaps xmm2, xmm0 ; keep a copy of the original data
andps xmm0, xmm1 ; set the fourth value to zero in xmm0
andnps xmm1, xmm2 ; copy fourth value to xmm1 and set rest to zero
cvtps2dq xmm0, xmm0 ; convert 4 single fp to int
cvtdq2ps xmm0, xmm0 ; convert 4 int to single fp
maskmovdqu xmm0, xmm1 ; write 3 values back to memory
pop edi
orps xmm0, xmm1 ; combine all 4 values again
movups [eax], xmm0 ; write 3 rounded and 1 unchanged values back to memory

ldmxcsr [esp] ; restore sse control word to old value
add esp, 8
Expand Down
13 changes: 7 additions & 6 deletions code/asm/snapvector.c
Expand Up @@ -47,17 +47,18 @@ void qsnapvectorsse(vec3_t vec)

"movaps (%0), %%xmm1\n"
"movups (%2), %%xmm0\n"
"movaps %%xmm0, %%xmm2\n"
"andps %%xmm1, %%xmm0\n"
"andnps %%xmm2, %%xmm1\n"
"cvtps2dq %%xmm0, %%xmm0\n"
"cvtdq2ps %%xmm0, %%xmm0\n"
// vec MUST reside in register rdi as maskmovdqu uses
// it as an implicit operand. The "D" constraint makes
// sure of that.
"maskmovdqu %%xmm1, %%xmm0\n"
"orps %%xmm1, %%xmm0\n"
"movups %%xmm0, (%2)\n"

"ldmxcsr %3\n"
:
: "r" (ssemask), "m" (ssecw), "D" (vec), "m" (oldcw)
: "memory", "%xmm0", "%xmm1"
: "r" (ssemask), "m" (ssecw), "r" (vec), "m" (oldcw)
: "memory", "%xmm0", "%xmm1", "%xmm2"
);

}
Expand Down

0 comments on commit c927fab

Please sign in to comment.