Nop-align the ARM and ARM64 loops too. Many CPUs benefit somewhat from hot loops being 16-byte aligned.
hrydgard · Jun 12, 2023 · 4af6fac · 4af6fac
1 parent c4e44d6
commit 4af6fac
Show file tree

Hide file tree

Showing 6 changed files with 21 additions and 2 deletions.
diff --git a/Common/Arm64Emitter.cpp b/Common/Arm64Emitter.cpp
@@ -315,6 +315,14 @@ const u8* ARM64XEmitter::AlignCodePage()
 	return m_code;
 }
 
+const u8 *ARM64XEmitter::NopAlignCode16() { // Emits nop words toward the next 16-byte boundary of m_code, then returns m_code.
+	int bytes = ((-(intptr_t)m_code) & 15); // Distance in bytes from m_code up to the next multiple of 16; 0 when already aligned.
+	for (int i = 0; i < bytes / 4; i++) { // One 32-bit word per iteration. NOTE(review): assumes m_code is 4-byte aligned and Write32 advances it by 4 - confirm.
+		Write32(0xD503201F); // official nop instruction
+	}
+	return m_code; // Callers in this change use the returned pointer as the loop start target.
+}
+
 void ARM64XEmitter::FlushIcache()
 {
 	FlushIcacheSection(m_lastCacheFlushEnd, m_code);

diff --git a/Common/Arm64Emitter.h b/Common/Arm64Emitter.h
@@ -401,6 +401,7 @@ class ARM64XEmitter
 	void ReserveCodeSpace(u32 bytes);
 	const u8* AlignCode16();
 	const u8* AlignCodePage();
+	const u8 *NopAlignCode16();
 	void FlushIcache();
 	void FlushIcacheSection(const u8* start, const u8* end);
 	u8* GetWritableCodePtr();

diff --git a/Common/ArmEmitter.cpp b/Common/ArmEmitter.cpp
@@ -613,6 +613,14 @@ const u8 *ARMXEmitter::AlignCode16()
 	return code;
 }
 
+const u8 *ARMXEmitter::NopAlignCode16() { // Emits nop words toward the next 16-byte boundary of code, then returns code.
+	int bytes = ((-(intptr_t)code) & 15); // Distance in bytes from code up to the next multiple of 16; 0 when already aligned.
+	for (int i = 0; i < bytes / 4; i++) { // One 32-bit word per iteration. NOTE(review): assumes code is 4-byte aligned and Write32 advances it by 4 - confirm.
+		Write32(0xE320F000); // one of many possible nops
+	}
+	return code; // Callers in this change use the returned pointer as the loop start target.
+}
+
 const u8 *ARMXEmitter::AlignCodePage()
 {
 	ReserveCodeSpace((-(intptr_t)code) & 4095);

diff --git a/Common/ArmEmitter.h b/Common/ArmEmitter.h
@@ -446,6 +446,8 @@ class ARMXEmitter
 	void ReserveCodeSpace(u32 bytes);
 	const u8 *AlignCode16();
 	const u8 *AlignCodePage();
+	const u8 *NopAlignCode16();
+
 	void FlushIcache();
 	void FlushIcacheSection(u8 *start, u8 *end);
 	u8 *GetWritableCodePtr();

diff --git a/GPU/Common/VertexDecoderArm.cpp b/GPU/Common/VertexDecoderArm.cpp
@@ -248,7 +248,7 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
 		MOV(fullAlphaReg, 0xFF);
 	}
 
-	JumpTarget loopStart = GetCodePtr();
+	JumpTarget loopStart = NopAlignCode16();
 	// Preload data cache ahead of reading. This offset seems pretty good.
 	PLD(srcReg, 64);
 	for (int i = 0; i < dec.numSteps_; i++) {

diff --git a/GPU/Common/VertexDecoderArm64.cpp b/GPU/Common/VertexDecoderArm64.cpp
@@ -238,7 +238,7 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
 		LDRH(INDEX_UNSIGNED, boundsMaxVReg, scratchReg64, offsetof(KnownVertexBounds, maxV));
 	}
 
-	const u8 *loopStart = GetCodePtr();
+	const u8 *loopStart = NopAlignCode16();
 	for (int i = 0; i < dec.numSteps_; i++) {
 		if (!CompileStep(dec, i)) {
 			EndWrite();