Skip to content

Commit

Permalink
Nop-align the ARM and ARM64 loops too. Many CPUs benefit somewhat from hot loops being 16-byte aligned.
Browse files Browse the repository at this point in the history
  • Loading branch information
hrydgard committed Jun 12, 2023
1 parent c4e44d6 commit 4af6fac
Show file tree
Hide file tree
Showing 6 changed files with 21 additions and 2 deletions.
8 changes: 8 additions & 0 deletions Common/Arm64Emitter.cpp
Expand Up @@ -315,6 +315,14 @@ const u8* ARM64XEmitter::AlignCodePage()
return m_code;
}

// Emits NOP instructions to pad the code stream toward the next 16-byte
// boundary, then returns m_code.
// The number of NOPs is the byte distance to the boundary divided by 4 (one
// 32-bit word per NOP); presumably Write32 advances m_code by 4 each time.
const u8 *ARM64XEmitter::NopAlignCode16() {
	// Byte distance from m_code up to the next multiple of 16 (0 if already there).
	const int padBytes = ((-(intptr_t)m_code) & 15);
	int nopsLeft = padBytes / 4;
	while (nopsLeft-- > 0) {
		Write32(0xD503201F);  // official nop instruction
	}
	return m_code;
}

void ARM64XEmitter::FlushIcache()
{
FlushIcacheSection(m_lastCacheFlushEnd, m_code);
Expand Down
1 change: 1 addition & 0 deletions Common/Arm64Emitter.h
Expand Up @@ -401,6 +401,7 @@ class ARM64XEmitter
void ReserveCodeSpace(u32 bytes);
const u8* AlignCode16();
const u8* AlignCodePage();
const u8 *NopAlignCode16();
void FlushIcache();
void FlushIcacheSection(const u8* start, const u8* end);
u8* GetWritableCodePtr();
Expand Down
8 changes: 8 additions & 0 deletions Common/ArmEmitter.cpp
Expand Up @@ -613,6 +613,14 @@ const u8 *ARMXEmitter::AlignCode16()
return code;
}

// Emits NOP instructions to pad the code stream toward the next 16-byte
// boundary, then returns code.
// The number of NOPs is the byte distance to the boundary divided by 4 (one
// 32-bit word per NOP); presumably Write32 advances code by 4 each time.
const u8 *ARMXEmitter::NopAlignCode16() {
	// Byte distance from code up to the next multiple of 16 (0 if already there).
	const int padBytes = ((-(intptr_t)code) & 15);
	int nopsLeft = padBytes / 4;
	while (nopsLeft-- > 0) {
		Write32(0xE320F000);  // one of many possible nops
	}
	return code;
}

const u8 *ARMXEmitter::AlignCodePage()
{
ReserveCodeSpace((-(intptr_t)code) & 4095);
Expand Down
2 changes: 2 additions & 0 deletions Common/ArmEmitter.h
Expand Up @@ -446,6 +446,8 @@ class ARMXEmitter
void ReserveCodeSpace(u32 bytes);
const u8 *AlignCode16();
const u8 *AlignCodePage();
const u8 *NopAlignCode16();

void FlushIcache();
void FlushIcacheSection(u8 *start, u8 *end);
u8 *GetWritableCodePtr();
Expand Down
2 changes: 1 addition & 1 deletion GPU/Common/VertexDecoderArm.cpp
Expand Up @@ -248,7 +248,7 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
MOV(fullAlphaReg, 0xFF);
}

JumpTarget loopStart = GetCodePtr();
JumpTarget loopStart = NopAlignCode16();
// Preload data cache ahead of reading. This offset seems pretty good.
PLD(srcReg, 64);
for (int i = 0; i < dec.numSteps_; i++) {
Expand Down
2 changes: 1 addition & 1 deletion GPU/Common/VertexDecoderArm64.cpp
Expand Up @@ -238,7 +238,7 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
LDRH(INDEX_UNSIGNED, boundsMaxVReg, scratchReg64, offsetof(KnownVertexBounds, maxV));
}

const u8 *loopStart = GetCodePtr();
const u8 *loopStart = NopAlignCode16();
for (int i = 0; i < dec.numSteps_; i++) {
if (!CompileStep(dec, i)) {
EndWrite();
Expand Down

0 comments on commit 4af6fac

Please sign in to comment.