diff --git a/Common/Arm64Emitter.h b/Common/Arm64Emitter.h index aaeb24fb1dde..01901af9c074 100644 --- a/Common/Arm64Emitter.h +++ b/Common/Arm64Emitter.h @@ -94,7 +94,7 @@ enum ARM64Reg // R19-R28. R29 (FP), R30 (LR) are always saved and FP updated appropriately. const u32 ALL_CALLEE_SAVED = 0x1FF80000; -const u32 ALL_CALLEE_SAVED_FP = 0x0000FF00; // d8-d15 +const u32 ALL_CALLEE_SAVED_FP = 0x0000FF00; // q8-q15 inline bool Is64Bit(ARM64Reg reg) { return (reg & 0x20) != 0; } inline bool IsSingle(ARM64Reg reg) { return (reg & 0xC0) == 0x40; } diff --git a/GPU/Common/VertexDecoderArm64.cpp b/GPU/Common/VertexDecoderArm64.cpp index 837e0ec8301a..445d86879c08 100644 --- a/GPU/Common/VertexDecoderArm64.cpp +++ b/GPU/Common/VertexDecoderArm64.cpp @@ -64,9 +64,9 @@ static const ARM64Reg neonScratchRegQ = Q2; static const ARM64Reg neonUVScaleReg = D0; static const ARM64Reg neonUVOffsetReg = D1; -static const ARM64Reg src[3] = {S2, S3, S8}; -static const ARM64Reg srcD[3] = {D2, D3, D8}; -static const ARM64Reg srcQ[3] = {Q2, Q3, Q8}; +static const ARM64Reg src[2] = {S2, S3}; +static const ARM64Reg srcD = D2; +static const ARM64Reg srcQ = Q2; static const ARM64Reg srcNEON = Q8; static const ARM64Reg accNEON = Q9; @@ -169,8 +169,11 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int // if (skinning) log = true; - uint64_t regs_to_save = Arm64Gen::ALL_CALLEE_SAVED; - uint64_t regs_to_save_fp = Arm64Gen::ALL_CALLEE_SAVED_FP; + // GPRs 0-15 do not need to be saved. + // We don't use any higher GPRs than 16. So: + uint64_t regs_to_save = 1 << 16; // Arm64Gen::ALL_CALLEE_SAVED; + // We only need to save Q8-Q15 if skinning is used. + uint64_t regs_to_save_fp = dec.skinInDecode ? Arm64Gen::ALL_CALLEE_SAVED_FP : 0; fp.ABI_PushRegisters(regs_to_save, regs_to_save_fp); // Keep the scale/offset in a few fp registers if we need it. @@ -645,12 +648,12 @@ void VertexDecoderJitCache::Jit_TcFloatPrescale() { void VertexDecoderJitCache::Jit_PosS8() { Jit_AnyS8ToFloat(dec_->posoff); - fp.STUR(128, srcQ[0], dstReg, dec_->decFmt.posoff); + fp.STUR(128, srcQ, dstReg, dec_->decFmt.posoff); } void VertexDecoderJitCache::Jit_PosS16() { Jit_AnyS16ToFloat(dec_->posoff); - fp.STUR(128, srcQ[0], dstReg, dec_->decFmt.posoff); + fp.STUR(128, srcQ, dstReg, dec_->decFmt.posoff); } void VertexDecoderJitCache::Jit_PosFloat() { @@ -677,8 +680,8 @@ void VertexDecoderJitCache::Jit_PosS8Through() { void VertexDecoderJitCache::Jit_PosS16Through() { // Start with X and Y (which is signed.) fp.LDUR(32, src[0], srcReg, dec_->posoff); - fp.SXTL(16, srcD[0], src[0]); - fp.SCVTF(32, srcD[0], srcD[0]); + fp.SXTL(16, srcD, src[0]); + fp.SCVTF(32, srcD, srcD); fp.STUR(64, src[0], dstReg, dec_->decFmt.posoff); // Now load in Z (which is unsigned.) LDRH(INDEX_UNSIGNED, tempReg3, srcReg, dec_->posoff + 4); @@ -744,7 +747,7 @@ void VertexDecoderJitCache::Jit_NormalS16Skin() { } void VertexDecoderJitCache::Jit_NormalFloatSkin() { - fp.LDUR(128, srcQ[0], srcReg, dec_->nrmoff); + fp.LDUR(128, srcQ, srcReg, dec_->nrmoff); Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false); } @@ -759,28 +762,28 @@ void VertexDecoderJitCache::Jit_PosS16Skin() { } void VertexDecoderJitCache::Jit_PosFloatSkin() { - fp.LDUR(128, srcQ[0], srcReg, dec_->posoff); + fp.LDUR(128, srcQ, srcReg, dec_->posoff); Jit_WriteMatrixMul(dec_->decFmt.posoff, true); } void VertexDecoderJitCache::Jit_AnyS8ToFloat(int srcoff) { fp.LDUR(32, src[0], srcReg, srcoff); - fp.SXTL(8, srcD[0], src[0]); - fp.SXTL(16, srcQ[0], srcD[0]); - fp.SCVTF(32, srcQ[0], srcQ[0], 7); + fp.SXTL(8, srcD, src[0]); + fp.SXTL(16, srcQ, srcD); + fp.SCVTF(32, srcQ, srcQ, 7); } void VertexDecoderJitCache::Jit_AnyS16ToFloat(int srcoff) { fp.LDUR(64, src[0], srcReg, srcoff); - fp.SXTL(16, srcQ[0], srcD[0]); - fp.SCVTF(32, srcQ[0], srcQ[0], 15); + fp.SXTL(16, srcQ, srcD); + fp.SCVTF(32, srcQ, srcQ, 15); } void VertexDecoderJitCache::Jit_WriteMatrixMul(int outOff, bool pos) { - // Multiply with the matrix sitting in Q4-Q7. - fp.FMUL(32, accNEON, Q4, srcQ[0], 0); - fp.FMLA(32, accNEON, Q5, srcQ[0], 1); - fp.FMLA(32, accNEON, Q6, srcQ[0], 2); + // Multiply srcQ with the matrix sitting in Q4-Q7. + fp.FMUL(32, accNEON, Q4, srcQ, 0); + fp.FMLA(32, accNEON, Q5, srcQ, 1); + fp.FMLA(32, accNEON, Q6, srcQ, 2); if (pos) { fp.FADD(32, accNEON, accNEON, Q7); }