Skip to content

Commit

Permalink
Merge pull request #17569 from hrydgard/arm64dec-optimize-saved-regs
Browse files Browse the repository at this point in the history
ARM64: Optimize saved registers in vertex decoder.
  • Loading branch information
hrydgard committed Jun 13, 2023
2 parents 10ae6f0 + cdcf3b2 commit 71a34d4
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 21 deletions.
2 changes: 1 addition & 1 deletion Common/Arm64Emitter.h
Expand Up @@ -94,7 +94,7 @@ enum ARM64Reg

// Bitmask of callee-saved general-purpose registers: 0x1FF80000 has bits 19-28 set, i.e. R19-R28.
// R29 (FP), R30 (LR) are always saved and FP updated appropriately.
const u32 ALL_CALLEE_SAVED = 0x1FF80000;
// Bitmask of callee-saved FP/SIMD registers: 0x0000FF00 has bits 8-15 set.
// The two lines below are the before/after sides of this diff hunk; only the
// trailing comment changed (d8-d15 -> q8-q15), the mask value is identical.
const u32 ALL_CALLEE_SAVED_FP = 0x0000FF00; // d8-d15
const u32 ALL_CALLEE_SAVED_FP = 0x0000FF00; // q8-q15

// Reports whether bit 0x20 of the register id is set.
inline bool Is64Bit(ARM64Reg reg) {
	const auto widthBit = reg & 0x20;
	return widthBit != 0;
}
// Reports whether the 0xC0 field of the register id equals 0x40.
inline bool IsSingle(ARM64Reg reg) {
	const auto kindBits = reg & 0xC0;
	return kindBits == 0x40;
}
Expand Down
43 changes: 23 additions & 20 deletions GPU/Common/VertexDecoderArm64.cpp
Expand Up @@ -64,9 +64,9 @@ static const ARM64Reg neonScratchRegQ = Q2;
// Registers holding the UV scale and offset (D0 / D1).
static const ARM64Reg neonUVScaleReg = D0;
static const ARM64Reg neonUVOffsetReg = D1;

// Source-value scratch registers. This hunk shows both sides of the diff:
// the first three lines are the old array definitions (which included S8/D8/Q8),
// the next three are the replacements, where srcD and srcQ become single
// registers (D2 / Q2) and src shrinks to {S2, S3}.
static const ARM64Reg src[3] = {S2, S3, S8};
static const ARM64Reg srcD[3] = {D2, D3, D8};
static const ARM64Reg srcQ[3] = {Q2, Q3, Q8};
static const ARM64Reg src[2] = {S2, S3};
static const ARM64Reg srcD = D2;
static const ARM64Reg srcQ = Q2;

// Q8 and Q9 fall inside the ALL_CALLEE_SAVED_FP mask (bits 8-15).
// NOTE(review): presumably only touched on the skinning path, which is when
// Compile() pushes the FP callee-saved set — confirm against all users.
static const ARM64Reg srcNEON = Q8;
static const ARM64Reg accNEON = Q9;
Expand Down Expand Up @@ -169,8 +169,11 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int

// if (skinning) log = true;

uint64_t regs_to_save = Arm64Gen::ALL_CALLEE_SAVED;
uint64_t regs_to_save_fp = Arm64Gen::ALL_CALLEE_SAVED_FP;
// GPRs 0-15 do not need to be saved.
// We don't use any higher GPRs than 16. So:
uint64_t regs_to_save = 1 << 16; // Arm64Gen::ALL_CALLEE_SAVED;
// We only need to save Q8-Q15 if skinning is used.
uint64_t regs_to_save_fp = dec.skinInDecode ? Arm64Gen::ALL_CALLEE_SAVED_FP : 0;
fp.ABI_PushRegisters(regs_to_save, regs_to_save_fp);

// Keep the scale/offset in a few fp registers if we need it.
Expand Down Expand Up @@ -645,12 +648,12 @@ void VertexDecoderJitCache::Jit_TcFloatPrescale() {

// Emits code for an S8 position: converts the components at dec_->posoff via
// Jit_AnyS8ToFloat, then stores 128 bits of the result to dstReg + decFmt.posoff.
void VertexDecoderJitCache::Jit_PosS8() {
Jit_AnyS8ToFloat(dec_->posoff);
// The two STUR lines are the old/new sides of the diff (srcQ[0] became srcQ).
fp.STUR(128, srcQ[0], dstReg, dec_->decFmt.posoff);
fp.STUR(128, srcQ, dstReg, dec_->decFmt.posoff);
}

// Emits code for an S16 position: converts the components at dec_->posoff via
// Jit_AnyS16ToFloat, then stores 128 bits of the result to dstReg + decFmt.posoff.
void VertexDecoderJitCache::Jit_PosS16() {
Jit_AnyS16ToFloat(dec_->posoff);
// The two STUR lines are the old/new sides of the diff (srcQ[0] became srcQ).
fp.STUR(128, srcQ[0], dstReg, dec_->decFmt.posoff);
fp.STUR(128, srcQ, dstReg, dec_->decFmt.posoff);
}

void VertexDecoderJitCache::Jit_PosFloat() {
Expand All @@ -677,8 +680,8 @@ void VertexDecoderJitCache::Jit_PosS8Through() {
void VertexDecoderJitCache::Jit_PosS16Through() {
// Start with X and Y (which is signed.)
fp.LDUR(32, src[0], srcReg, dec_->posoff);
fp.SXTL(16, srcD[0], src[0]);
fp.SCVTF(32, srcD[0], srcD[0]);
fp.SXTL(16, srcD, src[0]);
fp.SCVTF(32, srcD, srcD);
fp.STUR(64, src[0], dstReg, dec_->decFmt.posoff);
// Now load in Z (which is unsigned.)
LDRH(INDEX_UNSIGNED, tempReg3, srcReg, dec_->posoff + 4);
Expand Down Expand Up @@ -744,7 +747,7 @@ void VertexDecoderJitCache::Jit_NormalS16Skin() {
}

// Emits code for a skinned float normal: loads 128 bits from srcReg + nrmoff
// into srcQ, then runs the matrix multiply with pos=false (no Q7 add).
void VertexDecoderJitCache::Jit_NormalFloatSkin() {
// The two LDUR lines are the old/new sides of the diff (srcQ[0] became srcQ).
fp.LDUR(128, srcQ[0], srcReg, dec_->nrmoff);
fp.LDUR(128, srcQ, srcReg, dec_->nrmoff);
Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false);
}

Expand All @@ -759,28 +762,28 @@ void VertexDecoderJitCache::Jit_PosS16Skin() {
}

// Emits code for a skinned float position: loads 128 bits from srcReg + posoff
// into srcQ, then runs the matrix multiply with pos=true (Q7 is added to the result).
void VertexDecoderJitCache::Jit_PosFloatSkin() {
// The two LDUR lines are the old/new sides of the diff (srcQ[0] became srcQ).
fp.LDUR(128, srcQ[0], srcReg, dec_->posoff);
fp.LDUR(128, srcQ, srcReg, dec_->posoff);
Jit_WriteMatrixMul(dec_->decFmt.posoff, true);
}

// Emits code that loads 32 bits of signed 8-bit components from srcReg + srcoff
// and leaves them converted to floats in srcQ.
// srcoff: byte offset from srcReg to read from.
void VertexDecoderJitCache::Jit_AnyS8ToFloat(int srcoff) {
fp.LDUR(32, src[0], srcReg, srcoff);
// Old side of the diff (array-indexed srcD[0]/srcQ[0]):
fp.SXTL(8, srcD[0], src[0]);
fp.SXTL(16, srcQ[0], srcD[0]);
fp.SCVTF(32, srcQ[0], srcQ[0], 7);
// New side of the diff: widen 8 -> 16 -> 32 bits, then convert to float.
// NOTE(review): the trailing 7 is presumably the fixed-point fraction-bit
// count (i.e. scale by 1/128) — confirm against the emitter's SCVTF.
fp.SXTL(8, srcD, src[0]);
fp.SXTL(16, srcQ, srcD);
fp.SCVTF(32, srcQ, srcQ, 7);
}

// Emits code that loads 64 bits of signed 16-bit components from srcReg + srcoff
// and leaves them converted to floats in srcQ.
// srcoff: byte offset from srcReg to read from.
void VertexDecoderJitCache::Jit_AnyS16ToFloat(int srcoff) {
fp.LDUR(64, src[0], srcReg, srcoff);
// Old side of the diff (array-indexed srcQ[0]/srcD[0]):
fp.SXTL(16, srcQ[0], srcD[0]);
fp.SCVTF(32, srcQ[0], srcQ[0], 15);
// New side of the diff: widen 16 -> 32 bits, then convert to float.
// NOTE(review): the trailing 15 is presumably the fixed-point fraction-bit
// count (i.e. scale by 1/32768) — confirm against the emitter's SCVTF.
fp.SXTL(16, srcQ, srcD);
fp.SCVTF(32, srcQ, srcQ, 15);
}

void VertexDecoderJitCache::Jit_WriteMatrixMul(int outOff, bool pos) {
// Multiply with the matrix sitting in Q4-Q7.
fp.FMUL(32, accNEON, Q4, srcQ[0], 0);
fp.FMLA(32, accNEON, Q5, srcQ[0], 1);
fp.FMLA(32, accNEON, Q6, srcQ[0], 2);
// Multiply srcQ with the matrix sitting in Q4-Q7.
fp.FMUL(32, accNEON, Q4, srcQ, 0);
fp.FMLA(32, accNEON, Q5, srcQ, 1);
fp.FMLA(32, accNEON, Q6, srcQ, 2);
if (pos) {
fp.FADD(32, accNEON, accNEON, Q7);
}
Expand Down

0 comments on commit 71a34d4

Please sign in to comment.