From f1f3e6fba29fffe1bf56bc903df914642515bf4d Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sun, 10 Sep 2023 13:08:33 -0700 Subject: [PATCH 1/5] arm64jit: Optimize vertex full alpha tracking. --- GPU/Common/VertexDecoderArm64.cpp | 51 ++++++++++++------------------- 1 file changed, 20 insertions(+), 31 deletions(-) diff --git a/GPU/Common/VertexDecoderArm64.cpp b/GPU/Common/VertexDecoderArm64.cpp index 83948a16adee..fa0c7856dc8a 100644 --- a/GPU/Common/VertexDecoderArm64.cpp +++ b/GPU/Common/VertexDecoderArm64.cpp @@ -50,7 +50,7 @@ static const ARM64Reg scratchReg = W6; static const ARM64Reg scratchReg64 = X6; static const ARM64Reg scratchReg2 = W7; static const ARM64Reg scratchReg3 = W8; -static const ARM64Reg fullAlphaReg = W12; +static const ARM64Reg alphaNonFullReg = W12; static const ARM64Reg boundsMinUReg = W13; static const ARM64Reg boundsMinVReg = W14; static const ARM64Reg boundsMaxUReg = W15; @@ -172,6 +172,10 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int // if (skinning) log = true; + bool updateFullAlpha = dec.col; + if (updateFullAlpha && (dec.VertexType() & GE_VTYPE_COL_MASK) == GE_VTYPE_COL_565) + updateFullAlpha = false; + // GPRs 0-15 do not need to be saved. // We don't use any higher GPRs than 16. So: uint64_t regs_to_save = 1 << 16; // Arm64Gen::ALL_CALLEE_SAVED; @@ -227,9 +231,10 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int } } - if (dec.col) { - // Or LDB and skip the conditional? This is probably cheaper. - MOVI2R(fullAlphaReg, 0xFF); + if (updateFullAlpha) { + // This ends up non-zero if alpha is not full. + // Often we just ORN into it. + MOVI2R(alphaNonFullReg, 0); } if (dec.tc && dec.throughmode) { @@ -259,11 +264,10 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int SUBS(counterReg, counterReg, 1); B(CC_NEQ, loopStart); - if (dec.col) { + if (updateFullAlpha) { + FixupBranch skip = CBZ(alphaNonFullReg); MOVP2R(tempRegPtr, &gstate_c.vertexFullAlpha); - CMP(fullAlphaReg, 0); - FixupBranch skip = B(CC_NEQ); - STRB(INDEX_UNSIGNED, fullAlphaReg, tempRegPtr, 0); + STRB(INDEX_UNSIGNED, WZR, tempRegPtr, 0); SetJumpTarget(skip); } @@ -482,13 +486,8 @@ void VertexDecoderJitCache::Jit_WeightsFloatSkin() { void VertexDecoderJitCache::Jit_Color8888() { LDR(INDEX_UNSIGNED, tempReg1, srcReg, dec_->coloff); - // Set flags to determine if alpha != 0xFF. - ORN(tempReg2, WZR, tempReg1, ArithOption(tempReg1, ST_ASR, 24)); - CMP(tempReg2, 0); - - // Clear fullAlphaReg when the inverse was not 0. - // fullAlphaReg = tempReg2 == 0 ? fullAlphaReg : 0 + 1; - CSEL(fullAlphaReg, fullAlphaReg, WZR, CC_EQ); + // Or any non-set bits into alphaNonFullReg. This way it's non-zero if not full. + ORN(alphaNonFullReg, alphaNonFullReg, tempReg1, ArithOption(tempReg1, ST_ASR, 24)); STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.c0off); } @@ -508,15 +507,10 @@ void VertexDecoderJitCache::Jit_Color4444() { // And expand to 8 bits. ORR(tempReg1, tempReg2, tempReg2, ArithOption(tempReg2, ST_LSL, 4)); - STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.c0off); - - // Set flags to determine if alpha != 0xFF. - ORN(tempReg2, WZR, tempReg1, ArithOption(tempReg1, ST_ASR, 24)); - CMP(tempReg2, 0); + // Or any non-set bits into alphaNonFullReg. This way it's non-zero if not full. + ORN(alphaNonFullReg, alphaNonFullReg, tempReg1, ArithOption(tempReg1, ST_ASR, 24)); - // Clear fullAlphaReg when the inverse was not 0. - // fullAlphaReg = tempReg2 == 0 ? 
fullAlphaReg : 0 + 1; - CSEL(fullAlphaReg, fullAlphaReg, WZR, CC_EQ); + STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.c0off); } void VertexDecoderJitCache::Jit_Color565() { @@ -540,7 +534,7 @@ void VertexDecoderJitCache::Jit_Color565() { ORR(tempReg3, tempReg3, tempReg1, ArithOption(tempReg1, ST_LSR, 4)); ORR(tempReg2, tempReg2, tempReg3, ArithOption(tempReg3, ST_LSL, 8)); - // Add in full alpha. No need to update fullAlphaReg. + // Add in full alpha. No need to update alphaNonFullReg. ORRI2R(tempReg1, tempReg2, 0xFF000000, scratchReg); STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.c0off); @@ -566,15 +560,10 @@ void VertexDecoderJitCache::Jit_Color5551() { ANDI2R(tempReg1, tempReg1, 0xFF000000, scratchReg); ORR(tempReg2, tempReg2, tempReg1); - // Set flags to determine if alpha != 0xFF. - ORN(tempReg3, WZR, tempReg1, ArithOption(tempReg1, ST_ASR, 24)); - CMP(tempReg3, 0); + // Or any non-set bits into alphaNonFullReg. This way it's non-zero if not full. + ORN(alphaNonFullReg, alphaNonFullReg, tempReg1, ArithOption(tempReg1, ST_ASR, 24)); STR(INDEX_UNSIGNED, tempReg2, dstReg, dec_->decFmt.c0off); - - // Clear fullAlphaReg when the inverse was not 0. - // fullAlphaReg = tempReg3 == 0 ? fullAlphaReg : 0 + 1; - CSEL(fullAlphaReg, fullAlphaReg, WZR, CC_EQ); } void VertexDecoderJitCache::Jit_TcU16ThroughToFloat() { From a8493c0e19a8b2d4fe99a9b6318b6e1a3f775822 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sun, 10 Sep 2023 14:33:04 -0700 Subject: [PATCH 2/5] arm64jit: Optimize weight loading a bit. --- GPU/Common/VertexDecoderArm64.cpp | 59 ++++++++++++++++--------------- 1 file changed, 30 insertions(+), 29 deletions(-) diff --git a/GPU/Common/VertexDecoderArm64.cpp b/GPU/Common/VertexDecoderArm64.cpp index fa0c7856dc8a..586d1a6118be 100644 --- a/GPU/Common/VertexDecoderArm64.cpp +++ b/GPU/Common/VertexDecoderArm64.cpp @@ -27,7 +27,6 @@ #include "GPU/Common/VertexDecoderCommon.h" alignas(16) static float bones[16 * 8]; // First four are kept in registers -alignas(16) static float boneMask[4] = {1.0f, 1.0f, 1.0f, 0.0f}; static const float by128 = 1.0f / 128.0f; static const float by32768 = 1.0f / 32768.0f; @@ -185,8 +184,7 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int // Keep the scale/offset in a few fp registers if we need it. if (prescaleStep) { - fp.LDR(64, INDEX_UNSIGNED, neonUVScaleReg, X3, 0); - fp.LDR(64, INDEX_UNSIGNED, neonUVOffsetReg, X3, 8); + fp.LDP(64, INDEX_SIGNED, neonUVScaleReg, neonUVOffsetReg, X3, 0); if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_8BIT) { fp.MOVI2FDUP(neonScratchRegD, by128, scratchReg); fp.FMUL(32, neonUVScaleReg, neonUVScaleReg, neonScratchRegD); @@ -201,33 +199,38 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int if (dec.skinInDecode) { // Copying from R3 to R4 MOVP2R(X3, gstate.boneMatrix); - MOVP2R(X4, bones); - MOVP2R(X5, boneMask); - fp.LDR(128, INDEX_UNSIGNED, Q3, X5, 0); + // This is only used with more than 4 weights, and points to the first of them. + if (dec.nweights > 4) + MOVP2R(X4, &bones[16 * 4]); + + // Construct a mask to zero out the top lane with. + fp.MVNI(32, Q3, 0); + fp.MOVI(32, Q4, 0); + fp.EXT(Q3, Q3, Q4, 4); + for (int i = 0; i < dec.nweights; i++) { - // Note that INDEX_UNSIGNED does not support offsets not aligned to the data size so we must use POST. 
- fp.LDR(128, INDEX_POST, Q4, X3, 12); // Load 128 bits even though we just want 96 - fp.LDR(128, INDEX_POST, Q5, X3, 12); - fp.LDR(128, INDEX_POST, Q6, X3, 12); - fp.LDR(128, INDEX_POST, Q7, X3, 12); + // This loads Q4,Q5,Q6 with 12 floats and increases X3, all in one go. + fp.LD1(32, 3, INDEX_POST, Q4, X3); + // Now sort those floats into 4 regs: ABCD EFGH IJKL -> ABC0 DEF0 GHI0 JKL0. + // Go backwards to avoid overwriting. + fp.EXT(Q7, Q6, Q6, 4); // I[JKLI]JKL + fp.EXT(Q6, Q5, Q6, 8); // EF[GHIJ]KL + fp.EXT(Q5, Q4, Q5, 12); // ABC[DEFG]H + + ARM64Reg matrixRow[4]{ Q4, Q5, Q6, Q7 }; // First four matrices are in registers Q16+. if (i < 4) { - fp.FMUL(32, (ARM64Reg)(Q16 + i * 4), Q4, Q3); - fp.FMUL(32, (ARM64Reg)(Q17 + i * 4), Q5, Q3); - fp.FMUL(32, (ARM64Reg)(Q18 + i * 4), Q6, Q3); - fp.FMUL(32, (ARM64Reg)(Q19 + i * 4), Q7, Q3); - ADDI2R(X4, X4, 16 * 4); - } else { - fp.FMUL(32, Q4, Q4, Q3); - fp.FMUL(32, Q5, Q5, Q3); - fp.FMUL(32, Q6, Q6, Q3); - fp.FMUL(32, Q7, Q7, Q3); - fp.STR(128, INDEX_UNSIGNED, Q4, X4, 0); - fp.STR(128, INDEX_UNSIGNED, Q5, X4, 16); - fp.STR(128, INDEX_UNSIGNED, Q6, X4, 32); - fp.STR(128, INDEX_UNSIGNED, Q7, X4, 48); - ADDI2R(X4, X4, 16 * 4); + for (int w = 0; w < 4; ++w) + matrixRow[w] = (ARM64Reg)(Q16 + i * 4 + w); } + // Zero out the top lane of each one with the mask created above. + fp.AND(matrixRow[0], Q4, Q3); + fp.AND(matrixRow[1], Q5, Q3); + fp.AND(matrixRow[2], Q6, Q3); + fp.AND(matrixRow[3], Q7, Q3); + + if (i >= 4) + fp.ST1(32, 4, INDEX_POST, matrixRow[0], X4); } } @@ -346,13 +349,11 @@ void VertexDecoderJitCache::Jit_ApplyWeights() { break; default: // Matrices 4+ need to be loaded from memory. - fp.LDP(128, INDEX_SIGNED, Q8, Q9, scratchReg64, 0); - fp.LDP(128, INDEX_SIGNED, Q10, Q11, scratchReg64, 2 * 16); + fp.LD1(32, 4, INDEX_POST, Q8, scratchReg64); fp.FMLA(32, Q4, Q8, neonWeightRegsQ[i >> 2], i & 3); fp.FMLA(32, Q5, Q9, neonWeightRegsQ[i >> 2], i & 3); fp.FMLA(32, Q6, Q10, neonWeightRegsQ[i >> 2], i & 3); fp.FMLA(32, Q7, Q11, neonWeightRegsQ[i >> 2], i & 3); - ADDI2R(scratchReg64, scratchReg64, 4 * 16); break; } } From 00e691d6339b035b089f63004801ee92bbfe9c80 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sun, 10 Sep 2023 15:18:44 -0700 Subject: [PATCH 3/5] arm64jit: Try shifted MOVI in MOVI2FDUP(). Any penalty from int/float or size change should be less than GPR load. --- Common/Arm64Emitter.cpp | 98 +++++++++++++++++++++++++++++++++++++++++ Common/Arm64Emitter.h | 4 ++ 2 files changed, 102 insertions(+) diff --git a/Common/Arm64Emitter.cpp b/Common/Arm64Emitter.cpp index a5d87c5a11fc..1d2c8b0438b6 100644 --- a/Common/Arm64Emitter.cpp +++ b/Common/Arm64Emitter.cpp @@ -4204,6 +4204,14 @@ void ARM64FloatEmitter::MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch, bo if (negate) { FNEG(32, Rd, Rd); } + } else if (TryAnyMOVI(32, Rd, ival)) { + if (negate) { + FNEG(32, Rd, Rd); + } + } else if (TryAnyMOVI(32, Rd, ival ^ 0x80000000)) { + if (!negate) { + FNEG(32, Rd, Rd); + } } else { _assert_msg_(scratch != INVALID_REG, "Failed to find a way to generate FP immediate %f without scratch", value); if (negate) { @@ -4214,6 +4222,96 @@ void ARM64FloatEmitter::MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch, bo } } +bool ARM64FloatEmitter::TryMOVI(u8 size, ARM64Reg Rd, uint64_t elementValue) { + if (size == 8) { + // Can always do 8. 
+ MOVI(size, Rd, elementValue & 0xFF); + return true; + } else if (size == 16) { + if ((elementValue & 0xFF00) == 0) { + MOVI(size, Rd, elementValue & 0xFF, 0); + return true; + } else if ((elementValue & 0x00FF) == 0) { + MOVI(size, Rd, (elementValue >> 8) & 0xFF, 8); + return true; + } else if ((elementValue & 0xFF00) == 0xFF00) { + MVNI(size, Rd, ~elementValue & 0xFF, 0); + return true; + } else if ((elementValue & 0x00FF) == 0x00FF) { + MVNI(size, Rd, (~elementValue >> 8) & 0xFF, 8); + return true; + } + + return false; + } else if (size == 32) { + for (int shift = 0; shift < 32; shift += 8) { + uint32_t mask = 0xFFFFFFFF &~ (0xFF << shift); + if ((elementValue & mask) == 0) { + MOVI(size, Rd, (elementValue >> shift) & 0xFF, shift); + return true; + } else if ((elementValue & mask) == mask) { + MVNI(size, Rd, (~elementValue >> shift) & 0xFF, shift); + return true; + } + } + + // Maybe an MSL shift will work? + for (int shift = 8; shift <= 16; shift += 8) { + uint32_t mask = 0xFFFFFFFF & ~(0xFF << shift); + uint32_t ones = (1 << shift) - 1; + uint32_t notOnes = 0xFFFFFF00 << shift; + if ((elementValue & mask) == ones) { + MOVI(size, Rd, (elementValue >> shift) & 0xFF, shift, true); + return true; + } else if ((elementValue & mask) == notOnes) { + MVNI(size, Rd, (elementValue >> shift) & 0xFF, shift, true); + return true; + } + } + + return false; + } else if (size == 64) { + uint8_t imm8 = 0; + for (int i = 0; i < 8; ++i) { + uint8_t byte = (elementValue >> (i * 8)) & 0xFF; + if (byte != 0 && byte != 0xFF) + return false; + + if (byte == 0xFF) + imm8 |= 1 << i; + } + + // Didn't run into any partial bytes, so size 64 is doable. + MOVI(size, Rd, imm8); + return true; + } + return false; +} + +bool ARM64FloatEmitter::TryAnyMOVI(u8 size, ARM64Reg Rd, uint64_t elementValue) { + // Try the original size first in case that's more optimal. + if (TryMOVI(size, Rd, elementValue)) + return true; + + uint64_t value = elementValue; + if (size != 64) { + uint64_t masked = elementValue & ((1 << size) - 1); + for (int i = size; i < 64; ++i) { + value |= masked << i; + } + } + + for (int attempt = 8; attempt <= 64; attempt += attempt) { + // Original size was already attempted above. + if (attempt != size) { + if (TryMOVI(attempt, Rd, value)) + return true; + } + } + + return false; +} + void ARM64XEmitter::SUBSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) { u32 val; bool shift; diff --git a/Common/Arm64Emitter.h b/Common/Arm64Emitter.h index cd4a54cb73e9..0c3603d1bf9e 100644 --- a/Common/Arm64Emitter.h +++ b/Common/Arm64Emitter.h @@ -925,6 +925,10 @@ class ARM64FloatEmitter void ORR(u8 size, ARM64Reg Rd, u8 imm8, u8 shift = 0); void BIC(u8 size, ARM64Reg Rd, u8 imm8, u8 shift = 0); + bool TryMOVI(u8 size, ARM64Reg Rd, uint64_t value); + // Allow using a different size. Unclear if there's a penalty. + bool TryAnyMOVI(u8 size, ARM64Reg Rd, uint64_t value); + // One source void FCVT(u8 size_to, u8 size_from, ARM64Reg Rd, ARM64Reg Rn); From 646e3b269df7fae7d4e2edddfce245b7bfb299f9 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sun, 10 Sep 2023 22:37:33 -0700 Subject: [PATCH 4/5] arm64jit: Skip vertexjit prolog/epilog if possible. 
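
The save and restore around the decode loop only exists for two things: the one GPR worth saving (X16) is only used while tracking UV bounds for through-mode texcoords, and Q8-Q15 are only clobbered when skinning runs in the decoder. When neither applies, the push/pop and the stack adjustment can be skipped entirely. This also keys the bounds tracking off the actual Step_TcU16ThroughToFloat step rather than dec.tc && dec.throughmode, which is what the old TODO asked for.

In short, as in the diff below:

	uint64_t regs_to_save = updateTexBounds ? 1 << 16 : 0;
	uint64_t regs_to_save_fp = dec.skinInDecode ? Arm64Gen::ALL_CALLEE_SAVED_FP : 0;
	if (regs_to_save || regs_to_save_fp)
		fp.ABI_PushRegisters(regs_to_save, regs_to_save_fp);
	// ... decode loop ...
	if (regs_to_save || regs_to_save_fp)
		fp.ABI_PopRegisters(regs_to_save, regs_to_save_fp);
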
--- GPU/Common/VertexDecoderArm64.cpp | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/GPU/Common/VertexDecoderArm64.cpp b/GPU/Common/VertexDecoderArm64.cpp index 586d1a6118be..58df3b02e8cf 100644 --- a/GPU/Common/VertexDecoderArm64.cpp +++ b/GPU/Common/VertexDecoderArm64.cpp @@ -149,6 +149,7 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int bool prescaleStep = false; bool skinning = false; + bool updateTexBounds = false; bool log = false; @@ -164,6 +165,9 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int dec.steps_[i] == &VertexDecoder::Step_WeightsFloatSkin) { skinning = true; } + if (dec.steps_[i] == &VertexDecoder::Step_TcU16ThroughToFloat) { + updateTexBounds = true; + } } // Not used below, but useful for logging. @@ -177,10 +181,12 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int // GPRs 0-15 do not need to be saved. // We don't use any higher GPRs than 16. So: - uint64_t regs_to_save = 1 << 16; // Arm64Gen::ALL_CALLEE_SAVED; + uint64_t regs_to_save = updateTexBounds ? 1 << 16 : 0; // We only need to save Q8-Q15 if skinning is used. uint64_t regs_to_save_fp = dec.skinInDecode ? Arm64Gen::ALL_CALLEE_SAVED_FP : 0; - fp.ABI_PushRegisters(regs_to_save, regs_to_save_fp); + // Only bother making stack space and setting up FP if there are saved regs. + if (regs_to_save || regs_to_save_fp) + fp.ABI_PushRegisters(regs_to_save, regs_to_save_fp); // Keep the scale/offset in a few fp registers if we need it. if (prescaleStep) { @@ -240,8 +246,7 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int MOVI2R(alphaNonFullReg, 0); } - if (dec.tc && dec.throughmode) { - // TODO: Smarter, only when doing bounds. + if (updateTexBounds) { MOVP2R(scratchReg64, &gstate_c.vertBounds.minU); LDRH(INDEX_UNSIGNED, boundsMinUReg, scratchReg64, offsetof(KnownVertexBounds, minU)); LDRH(INDEX_UNSIGNED, boundsMaxUReg, scratchReg64, offsetof(KnownVertexBounds, maxU)); @@ -274,8 +279,7 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int SetJumpTarget(skip); } - if (dec.tc && dec.throughmode) { - // TODO: Smarter, only when doing bounds. + if (updateTexBounds) { MOVP2R(scratchReg64, &gstate_c.vertBounds.minU); STRH(INDEX_UNSIGNED, boundsMinUReg, scratchReg64, offsetof(KnownVertexBounds, minU)); STRH(INDEX_UNSIGNED, boundsMaxUReg, scratchReg64, offsetof(KnownVertexBounds, maxU)); @@ -283,7 +287,8 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int STRH(INDEX_UNSIGNED, boundsMaxVReg, scratchReg64, offsetof(KnownVertexBounds, maxV)); } - fp.ABI_PopRegisters(regs_to_save, regs_to_save_fp); + if (regs_to_save || regs_to_save_fp) + fp.ABI_PopRegisters(regs_to_save, regs_to_save_fp); RET(); From 5c4e08fe191acf7075518b225e137d00da1815ae Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sun, 10 Sep 2023 22:55:15 -0700 Subject: [PATCH 5/5] arm64jit: Use FMLA for TC precale. 
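
A single FMLA folds the offset add into the multiply: the source UVs now go through the second scratch register, the UV offset is copied into the destination first, and FMLA accumulates uv * scale on top of it. For the 8-bit and 16-bit formats, the fixed-point form of UCVTF (7 or 15 fractional bits) also absorbs the old 1/128 and 1/32768 factors during the int-to-float conversion, so the prolog no longer pre-multiplies the UV scale and the by128/by32768 constants go away.

For the 8-bit case, the per-vertex sequence goes from

	fp.UCVTF(32, neonScratchRegD, neonScratchRegD);
	fp.FMUL(32, neonScratchRegD, neonScratchRegD, neonUVScaleReg);  // scale pre-multiplied by 1/128
	fp.FADD(32, neonScratchRegD, neonScratchRegD, neonUVOffsetReg);

to

	fp.UCVTF(32, neonScratchReg2D, neonScratchReg2D, 7);  // convert and divide by 128 in one step
	fp.MOV(neonScratchRegD, neonUVOffsetReg);
	fp.FMLA(32, neonScratchRegD, neonScratchReg2D, neonUVScaleReg);
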
--- GPU/Common/VertexDecoderArm64.cpp | 40 +++++++++++++------------------ 1 file changed, 16 insertions(+), 24 deletions(-) diff --git a/GPU/Common/VertexDecoderArm64.cpp b/GPU/Common/VertexDecoderArm64.cpp index 58df3b02e8cf..719dfa329d7f 100644 --- a/GPU/Common/VertexDecoderArm64.cpp +++ b/GPU/Common/VertexDecoderArm64.cpp @@ -28,9 +28,6 @@ alignas(16) static float bones[16 * 8]; // First four are kept in registers -static const float by128 = 1.0f / 128.0f; -static const float by32768 = 1.0f / 32768.0f; - using namespace Arm64Gen; // Pointers, X regs (X0 - X17 safe to use.) @@ -62,6 +59,8 @@ static const ARM64Reg fpScratchReg4 = S7; static const ARM64Reg neonScratchRegD = D2; static const ARM64Reg neonScratchRegQ = Q2; +static const ARM64Reg neonScratchReg2D = D3; +static const ARM64Reg neonScratchReg2Q = Q3; static const ARM64Reg neonUVScaleReg = D0; static const ARM64Reg neonUVOffsetReg = D1; @@ -191,13 +190,6 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int // Keep the scale/offset in a few fp registers if we need it. if (prescaleStep) { fp.LDP(64, INDEX_SIGNED, neonUVScaleReg, neonUVOffsetReg, X3, 0); - if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_8BIT) { - fp.MOVI2FDUP(neonScratchRegD, by128, scratchReg); - fp.FMUL(32, neonUVScaleReg, neonUVScaleReg, neonScratchRegD); - } else if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_16BIT) { - fp.MOVI2FDUP(neonScratchRegD, by32768, scratchReg); - fp.FMUL(32, neonUVScaleReg, neonUVScaleReg, neonScratchRegD); - } } // Add code to convert matrices to 4x4. @@ -603,12 +595,12 @@ void VertexDecoderJitCache::Jit_TcFloat() { } void VertexDecoderJitCache::Jit_TcU8Prescale() { - fp.LDUR(16, neonScratchRegD, srcReg, dec_->tcoff); - fp.UXTL(8, neonScratchRegQ, neonScratchRegD); // Widen to 16-bit - fp.UXTL(16, neonScratchRegQ, neonScratchRegD); // Widen to 32-bit - fp.UCVTF(32, neonScratchRegD, neonScratchRegD); - fp.FMUL(32, neonScratchRegD, neonScratchRegD, neonUVScaleReg); // TODO: FMLA - fp.FADD(32, neonScratchRegD, neonScratchRegD, neonUVOffsetReg); + fp.LDUR(16, neonScratchReg2D, srcReg, dec_->tcoff); + fp.UXTL(8, neonScratchReg2Q, neonScratchReg2D); // Widen to 16-bit + fp.UXTL(16, neonScratchReg2Q, neonScratchReg2D); // Widen to 32-bit + fp.UCVTF(32, neonScratchReg2D, neonScratchReg2D, 7); + fp.MOV(neonScratchRegD, neonUVOffsetReg); + fp.FMLA(32, neonScratchRegD, neonScratchReg2D, neonUVScaleReg); fp.STUR(64, neonScratchRegD, dstReg, dec_->decFmt.uvoff); } @@ -621,11 +613,11 @@ void VertexDecoderJitCache::Jit_TcU8ToFloat() { } void VertexDecoderJitCache::Jit_TcU16Prescale() { - fp.LDUR(32, neonScratchRegD, srcReg, dec_->tcoff); - fp.UXTL(16, neonScratchRegQ, neonScratchRegD); // Widen to 32-bit - fp.UCVTF(32, neonScratchRegD, neonScratchRegD); - fp.FMUL(32, neonScratchRegD, neonScratchRegD, neonUVScaleReg); // TODO: FMLA - fp.FADD(32, neonScratchRegD, neonScratchRegD, neonUVOffsetReg); + fp.LDUR(32, neonScratchReg2D, srcReg, dec_->tcoff); + fp.UXTL(16, neonScratchReg2Q, neonScratchReg2D); // Widen to 32-bit + fp.UCVTF(32, neonScratchReg2D, neonScratchReg2D, 15); + fp.MOV(neonScratchRegD, neonUVOffsetReg); + fp.FMLA(32, neonScratchRegD, neonScratchReg2D, neonUVScaleReg); fp.STUR(64, neonScratchRegD, dstReg, dec_->decFmt.uvoff); } @@ -637,9 +629,9 @@ void VertexDecoderJitCache::Jit_TcU16ToFloat() { } void VertexDecoderJitCache::Jit_TcFloatPrescale() { - fp.LDUR(64, neonScratchRegD, srcReg, dec_->tcoff); - fp.FMUL(32, neonScratchRegD, neonScratchRegD, neonUVScaleReg); // TODO: FMLA - 
-	fp.FADD(32, neonScratchRegD, neonScratchRegD, neonUVOffsetReg);
+	fp.LDUR(64, neonScratchReg2D, srcReg, dec_->tcoff);
+	fp.MOV(neonScratchRegD, neonUVOffsetReg);
+	fp.FMLA(32, neonScratchRegD, neonScratchReg2D, neonUVScaleReg);
 	fp.STUR(64, neonScratchRegD, dstReg, dec_->decFmt.uvoff);
 }
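
For reference, a scalar model of what the new 8-bit prescale path computes per component, written as a hypothetical helper rather than code from this series. The old path computed u * (scale * 1/128) + offset with the scale pre-multiplied in the prolog, which matches this up to ordinary float rounding differences.

	#include <cmath>
	#include <cstdint>

	// Reference for Jit_TcU8Prescale after this change: UCVTF #7 converts the byte and
	// divides it by 128 in one step, then FMLA computes offset + uv * scale with a single
	// rounding, which std::fmaf mirrors here.
	static inline float PrescaleU8(uint8_t u, float scale, float offset) {
		float uv = (float)u * (1.0f / 128.0f);  // UCVTF(32, ..., 7)
		return std::fmaf(uv, scale, offset);    // MOV offset into dst, then FMLA accumulate
	}
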