Skip to content

Commit

Permalink
x86: More optimal 4444 in vertexjit.
Browse files Browse the repository at this point in the history
This path was previously commented out, but it works fine and raises throughput from 320% to 450%
of the non-JIT speed for simple position/color vertices.
  • Loading branch information
unknownbrackets committed May 7, 2017
1 parent 7699fa5 commit b06e271
Showing 1 changed file with 7 additions and 59 deletions.
66 changes: 7 additions & 59 deletions GPU/Common/VertexDecoderX86.cpp
Expand Up @@ -890,71 +890,23 @@ static const u32 MEMORY_ALIGNED16(nibbles[4]) = { 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f
// Mask for the SSE path of Jit_Color4444: within each 16-bit word, 0xf00f keeps
// only the highest and lowest nibble. It is applied with PAND right after
// PUNPCKLBW has interleaved the register with itself (duplicating each source
// byte), so that the following PSRLW/PSLLW by 4 plus POR can fill the cleared
// middle nibbles and produce the expanded RRGGBBAA color.
static const u32 MEMORY_ALIGNED16(color4444mask[4]) = { 0xf00ff00f, 0xf00ff00f, 0xf00ff00f, 0xf00ff00f, };

void VertexDecoderJitCache::Jit_Color4444() {
// Needs benchmarking. A bit wasteful by only using 1 SSE lane.
#if 0
// This over-reads slightly, but we assume pos or another component follows anyway.
MOVD_xmm(fpScratchReg, MDisp(srcReg, dec_->coloff));
// Spread to RGBA -> R00GB00A.
PUNPCKLBW(fpScratchReg, R(fpScratchReg));
PAND(fpScratchReg, M(color4444mask));
MOVSS(fpScratchReg2, R(fpScratchReg));
MOVSS(fpScratchReg3, R(fpScratchReg));
// Create 0R000B00 and 00G000A0.
PSRLW(fpScratchReg2, 4);
PSLLW(fpScratchReg3, 4);
// Combine for the complete set: RRGGBBAA.
POR(fpScratchReg, R(fpScratchReg2));
POR(fpScratchReg, R(fpScratchReg3));
MOVD_xmm(MDisp(dstReg, dec_->decFmt.c0off), fpScratchReg);
return;
#elif 0
// Alternate approach
MOVD_xmm(XMM3, MDisp(srcReg, dec_->coloff));
MOVAPS(XMM2, R(XMM3));
MOVAPS(XMM1, M(nibbles));
PSLLD(XMM2, 4);
PAND(XMM3, R(XMM1));
PAND(XMM2, R(XMM1));
PSRLD(XMM2, 4);
PXOR(XMM1, R(XMM1));
PUNPCKLBW(XMM2, R(XMM1));
PUNPCKLBW(XMM3, R(XMM1));
PSLLD(XMM2, 4);
POR(XMM3, R(XMM2));
MOVAPS(XMM2, R(XMM3));
PSLLD(XMM2, 4);
POR(XMM3, R(XMM2));
MOVD_xmm(MDisp(dstReg, dec_->decFmt.c0off), XMM3);
return;
#endif

MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->coloff));

// Pick out A and B, and space them out by a nibble.
MOV(32, R(tempReg2), R(tempReg1));
MOV(32, R(tempReg3), R(tempReg1));
AND(32, R(tempReg2), Imm32(0x0000F000));
AND(32, R(tempReg3), Imm32(0x00000F00));
SHL(32, R(tempReg2), Imm8(4));
OR(32, R(tempReg2), R(tempReg3));

// Now grab R and G.
MOV(32, R(tempReg3), R(tempReg1));
AND(32, R(tempReg1), Imm32(0x0000000F));
AND(32, R(tempReg3), Imm32(0x000000F0));

// Currently: 000A0B00, so let's shift once so G is spaced out.
SHL(32, R(tempReg2), Imm8(4));
OR(32, R(tempReg2), R(tempReg3));

// Now: 00A0B0G0, so shift it once more to add R at the bottom.
SHL(32, R(tempReg2), Imm8(4));
OR(32, R(tempReg2), R(tempReg1));

// Now we just need to duplicate the nibbles.
MOV(32, R(tempReg3), R(tempReg2));
SHL(32, R(tempReg3), Imm8(4));
OR(32, R(tempReg2), R(tempReg3));

MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg2));
MOVD_xmm(R(tempReg1), fpScratchReg);
MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg1));

CMP(32, R(tempReg2), Imm32(0xFF000000));
CMP(32, R(tempReg1), Imm32(0xFF000000));
FixupBranch skip = J_CC(CC_AE, false);
MOV(8, M(&gstate_c.vertexFullAlpha), Imm8(0));
SetJumpTarget(skip);
Expand Down Expand Up @@ -1364,7 +1316,6 @@ void VertexDecoderJitCache::Jit_PosS16Skin() {
Jit_WriteMatrixMul(dec_->decFmt.posoff, true);
}

// Just copy 12 bytes.
void VertexDecoderJitCache::Jit_PosFloatSkin() {
MOVUPS(XMM3, MDisp(srcReg, dec_->posoff));
Jit_WriteMatrixMul(dec_->decFmt.posoff, true);
Expand Down Expand Up @@ -1493,7 +1444,6 @@ void VertexDecoderJitCache::Jit_AnyS8Morph(int srcoff, int dstoff) {
}
}

// TODO: Is it okay that we're over-writing by 4 bytes? Probably...
MOVUPS(MDisp(dstReg, dstoff), fpScratchReg);
}

Expand Down Expand Up @@ -1532,7 +1482,6 @@ void VertexDecoderJitCache::Jit_AnyS16Morph(int srcoff, int dstoff) {
}
}

// TODO: Is it okay that we're over-writing by 4 bytes? Probably...
MOVUPS(MDisp(dstReg, dstoff), fpScratchReg);
}

Expand All @@ -1554,7 +1503,6 @@ void VertexDecoderJitCache::Jit_AnyFloatMorph(int srcoff, int dstoff) {
}
}

// TODO: Is it okay that we're over-writing by 4 bytes? Probably...
MOVUPS(MDisp(dstReg, dstoff), fpScratchReg);
}

Expand Down

0 comments on commit b06e271

Please sign in to comment.