From f1f3e6fba29fffe1bf56bc903df914642515bf4d Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sun, 10 Sep 2023 13:08:33 -0700 Subject: [PATCH 1/5] arm64jit: Optimize vertex full alpha tracking. --- GPU/Common/VertexDecoderArm64.cpp | 51 ++++++++++++------------------- 1 file changed, 20 insertions(+), 31 deletions(-) diff --git a/GPU/Common/VertexDecoderArm64.cpp b/GPU/Common/VertexDecoderArm64.cpp index 83948a16adee..fa0c7856dc8a 100644 --- a/GPU/Common/VertexDecoderArm64.cpp +++ b/GPU/Common/VertexDecoderArm64.cpp @@ -50,7 +50,7 @@ static const ARM64Reg scratchReg = W6; static const ARM64Reg scratchReg64 = X6; static const ARM64Reg scratchReg2 = W7; static const ARM64Reg scratchReg3 = W8; -static const ARM64Reg fullAlphaReg = W12; +static const ARM64Reg alphaNonFullReg = W12; static const ARM64Reg boundsMinUReg = W13; static const ARM64Reg boundsMinVReg = W14; static const ARM64Reg boundsMaxUReg = W15; @@ -172,6 +172,10 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int // if (skinning) log = true; + bool updateFullAlpha = dec.col; + if (updateFullAlpha && (dec.VertexType() & GE_VTYPE_COL_MASK) == GE_VTYPE_COL_565) + updateFullAlpha = false; + // GPRs 0-15 do not need to be saved. // We don't use any higher GPRs than 16. So: uint64_t regs_to_save = 1 << 16; // Arm64Gen::ALL_CALLEE_SAVED; @@ -227,9 +231,10 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int } } - if (dec.col) { - // Or LDB and skip the conditional? This is probably cheaper. - MOVI2R(fullAlphaReg, 0xFF); + if (updateFullAlpha) { + // This ends up non-zero if alpha is not full. + // Often we just ORN into it. + MOVI2R(alphaNonFullReg, 0); } if (dec.tc && dec.throughmode) { @@ -259,11 +264,10 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int SUBS(counterReg, counterReg, 1); B(CC_NEQ, loopStart); - if (dec.col) { + if (updateFullAlpha) { + FixupBranch skip = CBZ(alphaNonFullReg); MOVP2R(tempRegPtr, &gstate_c.vertexFullAlpha); - CMP(fullAlphaReg, 0); - FixupBranch skip = B(CC_NEQ); - STRB(INDEX_UNSIGNED, fullAlphaReg, tempRegPtr, 0); + STRB(INDEX_UNSIGNED, WZR, tempRegPtr, 0); SetJumpTarget(skip); } @@ -482,13 +486,8 @@ void VertexDecoderJitCache::Jit_WeightsFloatSkin() { void VertexDecoderJitCache::Jit_Color8888() { LDR(INDEX_UNSIGNED, tempReg1, srcReg, dec_->coloff); - // Set flags to determine if alpha != 0xFF. - ORN(tempReg2, WZR, tempReg1, ArithOption(tempReg1, ST_ASR, 24)); - CMP(tempReg2, 0); - - // Clear fullAlphaReg when the inverse was not 0. - // fullAlphaReg = tempReg2 == 0 ? fullAlphaReg : 0 + 1; - CSEL(fullAlphaReg, fullAlphaReg, WZR, CC_EQ); + // Or any non-set bits into alphaNonFullReg. This way it's non-zero if not full. + ORN(alphaNonFullReg, alphaNonFullReg, tempReg1, ArithOption(tempReg1, ST_ASR, 24)); STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.c0off); } @@ -508,15 +507,10 @@ void VertexDecoderJitCache::Jit_Color4444() { // And expand to 8 bits. ORR(tempReg1, tempReg2, tempReg2, ArithOption(tempReg2, ST_LSL, 4)); - STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.c0off); - - // Set flags to determine if alpha != 0xFF. - ORN(tempReg2, WZR, tempReg1, ArithOption(tempReg1, ST_ASR, 24)); - CMP(tempReg2, 0); + // Or any non-set bits into alphaNonFullReg. This way it's non-zero if not full. + ORN(alphaNonFullReg, alphaNonFullReg, tempReg1, ArithOption(tempReg1, ST_ASR, 24)); - // Clear fullAlphaReg when the inverse was not 0. - // fullAlphaReg = tempReg2 == 0 ? 
fullAlphaReg : 0 + 1; - CSEL(fullAlphaReg, fullAlphaReg, WZR, CC_EQ); + STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.c0off); } void VertexDecoderJitCache::Jit_Color565() { @@ -540,7 +534,7 @@ void VertexDecoderJitCache::Jit_Color565() { ORR(tempReg3, tempReg3, tempReg1, ArithOption(tempReg1, ST_LSR, 4)); ORR(tempReg2, tempReg2, tempReg3, ArithOption(tempReg3, ST_LSL, 8)); - // Add in full alpha. No need to update fullAlphaReg. + // Add in full alpha. No need to update alphaNonFullReg. ORRI2R(tempReg1, tempReg2, 0xFF000000, scratchReg); STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.c0off); @@ -566,15 +560,10 @@ void VertexDecoderJitCache::Jit_Color5551() { ANDI2R(tempReg1, tempReg1, 0xFF000000, scratchReg); ORR(tempReg2, tempReg2, tempReg1); - // Set flags to determine if alpha != 0xFF. - ORN(tempReg3, WZR, tempReg1, ArithOption(tempReg1, ST_ASR, 24)); - CMP(tempReg3, 0); + // Or any non-set bits into alphaNonFullReg. This way it's non-zero if not full. + ORN(alphaNonFullReg, alphaNonFullReg, tempReg1, ArithOption(tempReg1, ST_ASR, 24)); STR(INDEX_UNSIGNED, tempReg2, dstReg, dec_->decFmt.c0off); - - // Clear fullAlphaReg when the inverse was not 0. - // fullAlphaReg = tempReg3 == 0 ? fullAlphaReg : 0 + 1; - CSEL(fullAlphaReg, fullAlphaReg, WZR, CC_EQ); } void VertexDecoderJitCache::Jit_TcU16ThroughToFloat() { From a8493c0e19a8b2d4fe99a9b6318b6e1a3f775822 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sun, 10 Sep 2023 14:33:04 -0700 Subject: [PATCH 2/5] arm64jit: Optimize weight loading a bit. --- GPU/Common/VertexDecoderArm64.cpp | 59 ++++++++++++++++--------------- 1 file changed, 30 insertions(+), 29 deletions(-) diff --git a/GPU/Common/VertexDecoderArm64.cpp b/GPU/Common/VertexDecoderArm64.cpp index fa0c7856dc8a..586d1a6118be 100644 --- a/GPU/Common/VertexDecoderArm64.cpp +++ b/GPU/Common/VertexDecoderArm64.cpp @@ -27,7 +27,6 @@ #include "GPU/Common/VertexDecoderCommon.h" alignas(16) static float bones[16 * 8]; // First four are kept in registers -alignas(16) static float boneMask[4] = {1.0f, 1.0f, 1.0f, 0.0f}; static const float by128 = 1.0f / 128.0f; static const float by32768 = 1.0f / 32768.0f; @@ -185,8 +184,7 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int // Keep the scale/offset in a few fp registers if we need it. if (prescaleStep) { - fp.LDR(64, INDEX_UNSIGNED, neonUVScaleReg, X3, 0); - fp.LDR(64, INDEX_UNSIGNED, neonUVOffsetReg, X3, 8); + fp.LDP(64, INDEX_SIGNED, neonUVScaleReg, neonUVOffsetReg, X3, 0); if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_8BIT) { fp.MOVI2FDUP(neonScratchRegD, by128, scratchReg); fp.FMUL(32, neonUVScaleReg, neonUVScaleReg, neonScratchRegD); @@ -201,33 +199,38 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int if (dec.skinInDecode) { // Copying from R3 to R4 MOVP2R(X3, gstate.boneMatrix); - MOVP2R(X4, bones); - MOVP2R(X5, boneMask); - fp.LDR(128, INDEX_UNSIGNED, Q3, X5, 0); + // This is only used with more than 4 weights, and points to the first of them. + if (dec.nweights > 4) + MOVP2R(X4, &bones[16 * 4]); + + // Construct a mask to zero out the top lane with. + fp.MVNI(32, Q3, 0); + fp.MOVI(32, Q4, 0); + fp.EXT(Q3, Q3, Q4, 4); + for (int i = 0; i < dec.nweights; i++) { - // Note that INDEX_UNSIGNED does not support offsets not aligned to the data size so we must use POST. 
- fp.LDR(128, INDEX_POST, Q4, X3, 12); // Load 128 bits even though we just want 96 - fp.LDR(128, INDEX_POST, Q5, X3, 12); - fp.LDR(128, INDEX_POST, Q6, X3, 12); - fp.LDR(128, INDEX_POST, Q7, X3, 12); + // This loads Q4,Q5,Q6 with 12 floats and increases X3, all in one go. + fp.LD1(32, 3, INDEX_POST, Q4, X3); + // Now sort those floats into 4 regs: ABCD EFGH IJKL -> ABC0 DEF0 GHI0 JKL0. + // Go backwards to avoid overwriting. + fp.EXT(Q7, Q6, Q6, 4); // I[JKLI]JKL + fp.EXT(Q6, Q5, Q6, 8); // EF[GHIJ]KL + fp.EXT(Q5, Q4, Q5, 12); // ABC[DEFG]H + + ARM64Reg matrixRow[4]{ Q4, Q5, Q6, Q7 }; // First four matrices are in registers Q16+. if (i < 4) { - fp.FMUL(32, (ARM64Reg)(Q16 + i * 4), Q4, Q3); - fp.FMUL(32, (ARM64Reg)(Q17 + i * 4), Q5, Q3); - fp.FMUL(32, (ARM64Reg)(Q18 + i * 4), Q6, Q3); - fp.FMUL(32, (ARM64Reg)(Q19 + i * 4), Q7, Q3); - ADDI2R(X4, X4, 16 * 4); - } else { - fp.FMUL(32, Q4, Q4, Q3); - fp.FMUL(32, Q5, Q5, Q3); - fp.FMUL(32, Q6, Q6, Q3); - fp.FMUL(32, Q7, Q7, Q3); - fp.STR(128, INDEX_UNSIGNED, Q4, X4, 0); - fp.STR(128, INDEX_UNSIGNED, Q5, X4, 16); - fp.STR(128, INDEX_UNSIGNED, Q6, X4, 32); - fp.STR(128, INDEX_UNSIGNED, Q7, X4, 48); - ADDI2R(X4, X4, 16 * 4); + for (int w = 0; w < 4; ++w) + matrixRow[w] = (ARM64Reg)(Q16 + i * 4 + w); } + // Zero out the top lane of each one with the mask created above. + fp.AND(matrixRow[0], Q4, Q3); + fp.AND(matrixRow[1], Q5, Q3); + fp.AND(matrixRow[2], Q6, Q3); + fp.AND(matrixRow[3], Q7, Q3); + + if (i >= 4) + fp.ST1(32, 4, INDEX_POST, matrixRow[0], X4); } } @@ -346,13 +349,11 @@ void VertexDecoderJitCache::Jit_ApplyWeights() { break; default: // Matrices 4+ need to be loaded from memory. - fp.LDP(128, INDEX_SIGNED, Q8, Q9, scratchReg64, 0); - fp.LDP(128, INDEX_SIGNED, Q10, Q11, scratchReg64, 2 * 16); + fp.LD1(32, 4, INDEX_POST, Q8, scratchReg64); fp.FMLA(32, Q4, Q8, neonWeightRegsQ[i >> 2], i & 3); fp.FMLA(32, Q5, Q9, neonWeightRegsQ[i >> 2], i & 3); fp.FMLA(32, Q6, Q10, neonWeightRegsQ[i >> 2], i & 3); fp.FMLA(32, Q7, Q11, neonWeightRegsQ[i >> 2], i & 3); - ADDI2R(scratchReg64, scratchReg64, 4 * 16); break; } } From 00e691d6339b035b089f63004801ee92bbfe9c80 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sun, 10 Sep 2023 15:18:44 -0700 Subject: [PATCH 3/5] arm64jit: Try shifted MOVI in MOVI2FDUP(). Any penalty from int/float or size change should be less than GPR load. --- Common/Arm64Emitter.cpp | 98 +++++++++++++++++++++++++++++++++++++++++ Common/Arm64Emitter.h | 4 ++ 2 files changed, 102 insertions(+) diff --git a/Common/Arm64Emitter.cpp b/Common/Arm64Emitter.cpp index a5d87c5a11fc..1d2c8b0438b6 100644 --- a/Common/Arm64Emitter.cpp +++ b/Common/Arm64Emitter.cpp @@ -4204,6 +4204,14 @@ void ARM64FloatEmitter::MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch, bo if (negate) { FNEG(32, Rd, Rd); } + } else if (TryAnyMOVI(32, Rd, ival)) { + if (negate) { + FNEG(32, Rd, Rd); + } + } else if (TryAnyMOVI(32, Rd, ival ^ 0x80000000)) { + if (!negate) { + FNEG(32, Rd, Rd); + } } else { _assert_msg_(scratch != INVALID_REG, "Failed to find a way to generate FP immediate %f without scratch", value); if (negate) { @@ -4214,6 +4222,96 @@ void ARM64FloatEmitter::MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch, bo } } +bool ARM64FloatEmitter::TryMOVI(u8 size, ARM64Reg Rd, uint64_t elementValue) { + if (size == 8) { + // Can always do 8. 
+ MOVI(size, Rd, elementValue & 0xFF); + return true; + } else if (size == 16) { + if ((elementValue & 0xFF00) == 0) { + MOVI(size, Rd, elementValue & 0xFF, 0); + return true; + } else if ((elementValue & 0x00FF) == 0) { + MOVI(size, Rd, (elementValue >> 8) & 0xFF, 8); + return true; + } else if ((elementValue & 0xFF00) == 0xFF00) { + MVNI(size, Rd, ~elementValue & 0xFF, 0); + return true; + } else if ((elementValue & 0x00FF) == 0x00FF) { + MVNI(size, Rd, (~elementValue >> 8) & 0xFF, 8); + return true; + } + + return false; + } else if (size == 32) { + for (int shift = 0; shift < 32; shift += 8) { + uint32_t mask = 0xFFFFFFFF &~ (0xFF << shift); + if ((elementValue & mask) == 0) { + MOVI(size, Rd, (elementValue >> shift) & 0xFF, shift); + return true; + } else if ((elementValue & mask) == mask) { + MVNI(size, Rd, (~elementValue >> shift) & 0xFF, shift); + return true; + } + } + + // Maybe an MSL shift will work? + for (int shift = 8; shift <= 16; shift += 8) { + uint32_t mask = 0xFFFFFFFF & ~(0xFF << shift); + uint32_t ones = (1 << shift) - 1; + uint32_t notOnes = 0xFFFFFF00 << shift; + if ((elementValue & mask) == ones) { + MOVI(size, Rd, (elementValue >> shift) & 0xFF, shift, true); + return true; + } else if ((elementValue & mask) == notOnes) { + MVNI(size, Rd, (elementValue >> shift) & 0xFF, shift, true); + return true; + } + } + + return false; + } else if (size == 64) { + uint8_t imm8 = 0; + for (int i = 0; i < 8; ++i) { + uint8_t byte = (elementValue >> (i * 8)) & 0xFF; + if (byte != 0 && byte != 0xFF) + return false; + + if (byte == 0xFF) + imm8 |= 1 << i; + } + + // Didn't run into any partial bytes, so size 64 is doable. + MOVI(size, Rd, imm8); + return true; + } + return false; +} + +bool ARM64FloatEmitter::TryAnyMOVI(u8 size, ARM64Reg Rd, uint64_t elementValue) { + // Try the original size first in case that's more optimal. + if (TryMOVI(size, Rd, elementValue)) + return true; + + uint64_t value = elementValue; + if (size != 64) { + uint64_t masked = elementValue & ((1 << size) - 1); + for (int i = size; i < 64; ++i) { + value |= masked << i; + } + } + + for (int attempt = 8; attempt <= 64; attempt += attempt) { + // Original size was already attempted above. + if (attempt != size) { + if (TryMOVI(attempt, Rd, value)) + return true; + } + } + + return false; +} + void ARM64XEmitter::SUBSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) { u32 val; bool shift; diff --git a/Common/Arm64Emitter.h b/Common/Arm64Emitter.h index cd4a54cb73e9..0c3603d1bf9e 100644 --- a/Common/Arm64Emitter.h +++ b/Common/Arm64Emitter.h @@ -925,6 +925,10 @@ class ARM64FloatEmitter void ORR(u8 size, ARM64Reg Rd, u8 imm8, u8 shift = 0); void BIC(u8 size, ARM64Reg Rd, u8 imm8, u8 shift = 0); + bool TryMOVI(u8 size, ARM64Reg Rd, uint64_t value); + // Allow using a different size. Unclear if there's a penalty. + bool TryAnyMOVI(u8 size, ARM64Reg Rd, uint64_t value); + // One source void FCVT(u8 size_to, u8 size_from, ARM64Reg Rd, ARM64Reg Rn); From 646e3b269df7fae7d4e2edddfce245b7bfb299f9 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sun, 10 Sep 2023 22:37:33 -0700 Subject: [PATCH 4/5] arm64jit: Skip vertexjit prolog/epilog if possible. 
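
The save and restore around the decode loop only exists for two things: the one GPR worth saving (X16) is only used while tracking UV bounds for through-mode texcoords, and Q8-Q15 are only clobbered when skinning runs in the decoder. When neither applies, the push/pop and the stack adjustment can be skipped entirely. This also keys the bounds tracking off the actual Step_TcU16ThroughToFloat step rather than dec.tc && dec.throughmode, which is what the old TODO asked for.

In short, as in the diff below:

	uint64_t regs_to_save = updateTexBounds ? 1 << 16 : 0;
	uint64_t regs_to_save_fp = dec.skinInDecode ? Arm64Gen::ALL_CALLEE_SAVED_FP : 0;
	if (regs_to_save || regs_to_save_fp)
		fp.ABI_PushRegisters(regs_to_save, regs_to_save_fp);
	// ... decode loop ...
	if (regs_to_save || regs_to_save_fp)
		fp.ABI_PopRegisters(regs_to_save, regs_to_save_fp);
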
--- GPU/Common/VertexDecoderArm64.cpp | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/GPU/Common/VertexDecoderArm64.cpp b/GPU/Common/VertexDecoderArm64.cpp index 586d1a6118be..58df3b02e8cf 100644 --- a/GPU/Common/VertexDecoderArm64.cpp +++ b/GPU/Common/VertexDecoderArm64.cpp @@ -149,6 +149,7 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int bool prescaleStep = false; bool skinning = false; + bool updateTexBounds = false; bool log = false; @@ -164,6 +165,9 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int dec.steps_[i] == &VertexDecoder::Step_WeightsFloatSkin) { skinning = true; } + if (dec.steps_[i] == &VertexDecoder::Step_TcU16ThroughToFloat) { + updateTexBounds = true; + } } // Not used below, but useful for logging. @@ -177,10 +181,12 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int // GPRs 0-15 do not need to be saved. // We don't use any higher GPRs than 16. So: - uint64_t regs_to_save = 1 << 16; // Arm64Gen::ALL_CALLEE_SAVED; + uint64_t regs_to_save = updateTexBounds ? 1 << 16 : 0; // We only need to save Q8-Q15 if skinning is used. uint64_t regs_to_save_fp = dec.skinInDecode ? Arm64Gen::ALL_CALLEE_SAVED_FP : 0; - fp.ABI_PushRegisters(regs_to_save, regs_to_save_fp); + // Only bother making stack space and setting up FP if there are saved regs. + if (regs_to_save || regs_to_save_fp) + fp.ABI_PushRegisters(regs_to_save, regs_to_save_fp); // Keep the scale/offset in a few fp registers if we need it. if (prescaleStep) { @@ -240,8 +246,7 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int MOVI2R(alphaNonFullReg, 0); } - if (dec.tc && dec.throughmode) { - // TODO: Smarter, only when doing bounds. + if (updateTexBounds) { MOVP2R(scratchReg64, &gstate_c.vertBounds.minU); LDRH(INDEX_UNSIGNED, boundsMinUReg, scratchReg64, offsetof(KnownVertexBounds, minU)); LDRH(INDEX_UNSIGNED, boundsMaxUReg, scratchReg64, offsetof(KnownVertexBounds, maxU)); @@ -274,8 +279,7 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int SetJumpTarget(skip); } - if (dec.tc && dec.throughmode) { - // TODO: Smarter, only when doing bounds. + if (updateTexBounds) { MOVP2R(scratchReg64, &gstate_c.vertBounds.minU); STRH(INDEX_UNSIGNED, boundsMinUReg, scratchReg64, offsetof(KnownVertexBounds, minU)); STRH(INDEX_UNSIGNED, boundsMaxUReg, scratchReg64, offsetof(KnownVertexBounds, maxU)); @@ -283,7 +287,8 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int STRH(INDEX_UNSIGNED, boundsMaxVReg, scratchReg64, offsetof(KnownVertexBounds, maxV)); } - fp.ABI_PopRegisters(regs_to_save, regs_to_save_fp); + if (regs_to_save || regs_to_save_fp) + fp.ABI_PopRegisters(regs_to_save, regs_to_save_fp); RET(); From 5c4e08fe191acf7075518b225e137d00da1815ae Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sun, 10 Sep 2023 22:55:15 -0700 Subject: [PATCH 5/5] arm64jit: Use FMLA for TC precale. 
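
A single FMLA folds the offset add into the multiply: the source UVs now go through the second scratch register, the UV offset is copied into the destination first, and FMLA accumulates uv * scale on top of it. For the 8-bit and 16-bit formats, the fixed-point form of UCVTF (7 or 15 fractional bits) also absorbs the old 1/128 and 1/32768 factors during the int-to-float conversion, so the prolog no longer pre-multiplies the UV scale and the by128/by32768 constants go away.

For the 8-bit case, the per-vertex sequence goes from

	fp.UCVTF(32, neonScratchRegD, neonScratchRegD);
	fp.FMUL(32, neonScratchRegD, neonScratchRegD, neonUVScaleReg);  // scale pre-multiplied by 1/128
	fp.FADD(32, neonScratchRegD, neonScratchRegD, neonUVOffsetReg);

to

	fp.UCVTF(32, neonScratchReg2D, neonScratchReg2D, 7);  // convert and divide by 128 in one step
	fp.MOV(neonScratchRegD, neonUVOffsetReg);
	fp.FMLA(32, neonScratchRegD, neonScratchReg2D, neonUVScaleReg);
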
--- GPU/Common/VertexDecoderArm64.cpp | 40 +++++++++++++------------------ 1 file changed, 16 insertions(+), 24 deletions(-) diff --git a/GPU/Common/VertexDecoderArm64.cpp b/GPU/Common/VertexDecoderArm64.cpp index 58df3b02e8cf..719dfa329d7f 100644 --- a/GPU/Common/VertexDecoderArm64.cpp +++ b/GPU/Common/VertexDecoderArm64.cpp @@ -28,9 +28,6 @@ alignas(16) static float bones[16 * 8]; // First four are kept in registers -static const float by128 = 1.0f / 128.0f; -static const float by32768 = 1.0f / 32768.0f; - using namespace Arm64Gen; // Pointers, X regs (X0 - X17 safe to use.) @@ -62,6 +59,8 @@ static const ARM64Reg fpScratchReg4 = S7; static const ARM64Reg neonScratchRegD = D2; static const ARM64Reg neonScratchRegQ = Q2; +static const ARM64Reg neonScratchReg2D = D3; +static const ARM64Reg neonScratchReg2Q = Q3; static const ARM64Reg neonUVScaleReg = D0; static const ARM64Reg neonUVOffsetReg = D1; @@ -191,13 +190,6 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int // Keep the scale/offset in a few fp registers if we need it. if (prescaleStep) { fp.LDP(64, INDEX_SIGNED, neonUVScaleReg, neonUVOffsetReg, X3, 0); - if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_8BIT) { - fp.MOVI2FDUP(neonScratchRegD, by128, scratchReg); - fp.FMUL(32, neonUVScaleReg, neonUVScaleReg, neonScratchRegD); - } else if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_16BIT) { - fp.MOVI2FDUP(neonScratchRegD, by32768, scratchReg); - fp.FMUL(32, neonUVScaleReg, neonUVScaleReg, neonScratchRegD); - } } // Add code to convert matrices to 4x4. @@ -603,12 +595,12 @@ void VertexDecoderJitCache::Jit_TcFloat() { } void VertexDecoderJitCache::Jit_TcU8Prescale() { - fp.LDUR(16, neonScratchRegD, srcReg, dec_->tcoff); - fp.UXTL(8, neonScratchRegQ, neonScratchRegD); // Widen to 16-bit - fp.UXTL(16, neonScratchRegQ, neonScratchRegD); // Widen to 32-bit - fp.UCVTF(32, neonScratchRegD, neonScratchRegD); - fp.FMUL(32, neonScratchRegD, neonScratchRegD, neonUVScaleReg); // TODO: FMLA - fp.FADD(32, neonScratchRegD, neonScratchRegD, neonUVOffsetReg); + fp.LDUR(16, neonScratchReg2D, srcReg, dec_->tcoff); + fp.UXTL(8, neonScratchReg2Q, neonScratchReg2D); // Widen to 16-bit + fp.UXTL(16, neonScratchReg2Q, neonScratchReg2D); // Widen to 32-bit + fp.UCVTF(32, neonScratchReg2D, neonScratchReg2D, 7); + fp.MOV(neonScratchRegD, neonUVOffsetReg); + fp.FMLA(32, neonScratchRegD, neonScratchReg2D, neonUVScaleReg); fp.STUR(64, neonScratchRegD, dstReg, dec_->decFmt.uvoff); } @@ -621,11 +613,11 @@ void VertexDecoderJitCache::Jit_TcU8ToFloat() { } void VertexDecoderJitCache::Jit_TcU16Prescale() { - fp.LDUR(32, neonScratchRegD, srcReg, dec_->tcoff); - fp.UXTL(16, neonScratchRegQ, neonScratchRegD); // Widen to 32-bit - fp.UCVTF(32, neonScratchRegD, neonScratchRegD); - fp.FMUL(32, neonScratchRegD, neonScratchRegD, neonUVScaleReg); // TODO: FMLA - fp.FADD(32, neonScratchRegD, neonScratchRegD, neonUVOffsetReg); + fp.LDUR(32, neonScratchReg2D, srcReg, dec_->tcoff); + fp.UXTL(16, neonScratchReg2Q, neonScratchReg2D); // Widen to 32-bit + fp.UCVTF(32, neonScratchReg2D, neonScratchReg2D, 15); + fp.MOV(neonScratchRegD, neonUVOffsetReg); + fp.FMLA(32, neonScratchRegD, neonScratchReg2D, neonUVScaleReg); fp.STUR(64, neonScratchRegD, dstReg, dec_->decFmt.uvoff); } @@ -637,9 +629,9 @@ void VertexDecoderJitCache::Jit_TcU16ToFloat() { } void VertexDecoderJitCache::Jit_TcFloatPrescale() { - fp.LDUR(64, neonScratchRegD, srcReg, dec_->tcoff); - fp.FMUL(32, neonScratchRegD, neonScratchRegD, neonUVScaleReg); // TODO: FMLA - 
-	fp.FADD(32, neonScratchRegD, neonScratchRegD, neonUVOffsetReg);
+	fp.LDUR(64, neonScratchReg2D, srcReg, dec_->tcoff);
+	fp.MOV(neonScratchRegD, neonUVOffsetReg);
+	fp.FMLA(32, neonScratchRegD, neonScratchReg2D, neonUVScaleReg);
 	fp.STUR(64, neonScratchRegD, dstReg, dec_->decFmt.uvoff);
 }
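
For reference, a scalar model of what the new 8-bit prescale path computes per component, written as a hypothetical helper rather than code from this series. The old path computed u * (scale * 1/128) + offset with the scale pre-multiplied in the prolog, which matches this up to ordinary float rounding differences.

	#include <cmath>
	#include <cstdint>

	// Reference for Jit_TcU8Prescale after this change: UCVTF #7 converts the byte and
	// divides it by 128 in one step, then FMLA computes offset + uv * scale with a single
	// rounding, which std::fmaf mirrors here.
	static inline float PrescaleU8(uint8_t u, float scale, float offset) {
		float uv = (float)u * (1.0f / 128.0f);  // UCVTF(32, ..., 7)
		return std::fmaf(uv, scale, offset);    // MOV offset into dst, then FMLA accumulate
	}
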