Skip to content

Commit

Permalink
Merge pull request #18560 from unknownbrackets/replacement-slice
Browse files Browse the repository at this point in the history
HLE: Slice the very slow memset/memcpy variants
  • Loading branch information
hrydgard committed Dec 17, 2023
2 parents b09e5f4 + e1eecb4 commit e5af1f8
Show file tree
Hide file tree
Showing 12 changed files with 132 additions and 23 deletions.
70 changes: 54 additions & 16 deletions Core/HLE/ReplaceTables.cpp
Expand Up @@ -182,37 +182,54 @@ static int Replace_memcpy_jak() {
u32 destPtr = PARAM(0);
u32 srcPtr = PARAM(1);
u32 bytes = PARAM(2);
bool skip = false;

if (bytes == 0) {
RETURN(destPtr);
return 5;
}

bool skip = false;
bool sliced = false;
static constexpr uint32_t SLICE_SIZE = 32768;

currentMIPS->InvalidateICache(srcPtr, bytes);
if ((skipGPUReplacements & (int)GPUReplacementSkip::MEMCPY) == 0) {
if (Memory::IsVRAMAddress(destPtr) || Memory::IsVRAMAddress(srcPtr)) {
skip = gpu->PerformMemoryCopy(destPtr, srcPtr, bytes);
}
}
if (!skip && bytes > SLICE_SIZE && bytes != 512 * 272 * 4) {
// This is a very slow func. To avoid thread blocking, do a slice at a time.
// Avoiding exactly 512 * 272 * 4 to detect videos, though.
bytes = SLICE_SIZE;
sliced = true;
}
if (!skip && bytes != 0) {
u8 *dst = Memory::GetPointerWriteRange(destPtr, bytes);
const u8 *src = Memory::GetPointerRange(srcPtr, bytes);

if (!dst || !src) {
} else {
if (dst && src) {
// Jak style overlap.
for (u32 i = 0; i < bytes; i++) {
dst[i] = src[i];
}
}
}

// Jak relies on more registers coming out right than the ABI specifies.
// See the disassembly of the function for the explanations for these...
currentMIPS->r[MIPS_REG_T0] = 0;
currentMIPS->r[MIPS_REG_A0] = -1;
currentMIPS->r[MIPS_REG_A2] = 0;
currentMIPS->r[MIPS_REG_A3] = destPtr + bytes;
RETURN(destPtr);
if (sliced) {
currentMIPS->r[MIPS_REG_A0] += SLICE_SIZE;
currentMIPS->r[MIPS_REG_A1] += SLICE_SIZE;
currentMIPS->r[MIPS_REG_A2] -= SLICE_SIZE;
} else {
// Jak relies on more registers coming out right than the ABI specifies.
// See the disassembly of the function for the explanations for these...
currentMIPS->r[MIPS_REG_T0] = 0;
currentMIPS->r[MIPS_REG_A0] = -1;
currentMIPS->r[MIPS_REG_A2] = 0;
// Even after slicing, this ends up correct.
currentMIPS->r[MIPS_REG_A3] = destPtr + bytes;
RETURN(destPtr);
}

if (MemBlockInfoDetailed(bytes)) {
// It's pretty common that games will copy video data.
Expand All @@ -231,6 +248,10 @@ static int Replace_memcpy_jak() {
}
}

if (sliced) {
// Negative causes the function to be run again for the next slice.
return 5 + bytes * -8 + 2;
}
return 5 + bytes * 8 + 2; // approximation. This is a slow memcpy - a byte copy loop..
}

Expand Down Expand Up @@ -364,24 +385,41 @@ static int Replace_memset_jak() {
}

bool skip = false;
bool sliced = false;
static constexpr uint32_t SLICE_SIZE = 32768;
if (Memory::IsVRAMAddress(destPtr) && (skipGPUReplacements & (int)GPUReplacementSkip::MEMSET) == 0) {
skip = gpu->PerformMemorySet(destPtr, value, bytes);
}
if (!skip && bytes > SLICE_SIZE) {
// This is a very slow func. To avoid thread blocking, do a slice at a time.
bytes = SLICE_SIZE;
sliced = true;
}
if (!skip && bytes != 0) {
u8 *dst = Memory::GetPointerWriteRange(destPtr, bytes);
if (dst) {
memset(dst, value, bytes);
}
}

NotifyMemInfo(MemBlockFlags::WRITE, destPtr, bytes, "ReplaceMemset");

if (sliced) {
currentMIPS->r[MIPS_REG_A0] += SLICE_SIZE;
currentMIPS->r[MIPS_REG_A2] -= SLICE_SIZE;

// This is approximate, and must be a negative value.
// Negative causes the function to be run again for the next slice.
return 5 + (int)SLICE_SIZE * -6 + 2;
}

// Even after slicing, this ends up correct.
currentMIPS->r[MIPS_REG_T0] = destPtr + bytes;
currentMIPS->r[MIPS_REG_A2] = -1;
currentMIPS->r[MIPS_REG_A3] = -1;
RETURN(destPtr);

NotifyMemInfo(MemBlockFlags::WRITE, destPtr, bytes, "ReplaceMemset");

return 5 + bytes * 6 + 2; // approximation (hm, inspecting the disasm this should be 5 + 6 * bytes + 2, but this is what works..)
return 5 + bytes * 6 + 2; // approximation
}

static uint32_t SafeStringLen(const uint32_t ptr, uint32_t maxLen = 0x07FFFFFF) {
Expand Down Expand Up @@ -1449,12 +1487,12 @@ static const ReplacementTableEntry entries[] = {
{ "ceilf", &Replace_ceilf, 0, REPFLAG_DISABLED },

{ "memcpy", &Replace_memcpy, 0, 0 },
{ "memcpy_jak", &Replace_memcpy_jak, 0, 0 },
{ "memcpy_jak", &Replace_memcpy_jak, 0, REPFLAG_SLICED },
{ "memcpy16", &Replace_memcpy16, 0, 0 },
{ "memcpy_swizzled", &Replace_memcpy_swizzled, 0, 0 },
{ "memmove", &Replace_memmove, 0, 0 },
{ "memset", &Replace_memset, 0, 0 },
{ "memset_jak", &Replace_memset_jak, 0, 0 },
{ "memset_jak", &Replace_memset_jak, 0, REPFLAG_SLICED },
{ "strlen", &Replace_strlen, 0, REPFLAG_DISABLED },
{ "strcpy", &Replace_strcpy, 0, REPFLAG_DISABLED },
{ "strncpy", &Replace_strncpy, 0, REPFLAG_DISABLED },
Expand Down Expand Up @@ -1738,7 +1776,7 @@ bool CanReplaceJalTo(u32 dest, const ReplacementTableEntry **entry, u32 *funcSiz
return false;
}

if ((*entry)->flags & (REPFLAG_HOOKENTER | REPFLAG_HOOKEXIT | REPFLAG_DISABLED)) {
if ((*entry)->flags & (REPFLAG_HOOKENTER | REPFLAG_HOOKEXIT | REPFLAG_DISABLED | REPFLAG_SLICED)) {
// If it's a hook, we can't replace the jal, we have to go inside the func.
return false;
}
Expand Down
2 changes: 2 additions & 0 deletions Core/HLE/ReplaceTables.h
Expand Up @@ -48,6 +48,8 @@ enum {
REPFLAG_HOOKENTER = 0x04,
// Only hooks jr ra, so only use on funcs that have that.
REPFLAG_HOOKEXIT = 0x08,
// Function may take a lot of time and execute in slices (executed multiple times.)
REPFLAG_SLICED = 0x10,
};

// Kind of similar to HLE functions but with different data.
Expand Down
11 changes: 11 additions & 0 deletions Core/MIPS/ARM/ArmJit.cpp
Expand Up @@ -617,7 +617,18 @@ void ArmJit::Comp_ReplacementFunc(MIPSOpcode op)
} else {
ApplyRoundingMode();
RestoreDowncount();

CMPI2R(R0, 0, SCRATCHREG2);
FixupBranch positive = B_CC(CC_GE);

RSB(R0, R0, Operand2(0));
MovFromPC(R1);
FixupBranch done = B();

SetJumpTarget(positive);
LDR(R1, CTXREG, MIPS_REG_RA * 4);

SetJumpTarget(done);
WriteDownCountR(R0);
WriteExitDestInR(R1);
js.compiling = false;
Expand Down
11 changes: 10 additions & 1 deletion Core/MIPS/ARM64/Arm64IRCompSystem.cpp
Expand Up @@ -242,7 +242,16 @@ void Arm64JitBackend::CompIR_System(IRInst inst) {
QuickCallFunction(SCRATCH2_64, GetReplacementFunc(inst.constant)->replaceFunc);
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
LoadStaticRegisters();
SUB(DOWNCOUNTREG, DOWNCOUNTREG, W0);

// Absolute value the result and subtract.
CMP(W0, 0);
CSNEG(SCRATCH1, W0, W0, CC_PL);
SUB(DOWNCOUNTREG, DOWNCOUNTREG, SCRATCH1);

// W0 might be the mapped reg, but there's only one.
// Set dest reg to the sign of the result.
regs_.Map(inst);
ASR(regs_.R(inst.dest), W0, 31);
break;

case IROp::Break:
Expand Down
11 changes: 11 additions & 0 deletions Core/MIPS/ARM64/Arm64Jit.cpp
Expand Up @@ -614,7 +614,18 @@ void Arm64Jit::Comp_ReplacementFunc(MIPSOpcode op)
} else {
ApplyRoundingMode();
LoadStaticRegisters();

CMPI2R(W0, 0);
FixupBranch positive = B(CC_GE);

NEG(W0, W0);
MovFromPC(W1);
FixupBranch done = B();

SetJumpTarget(positive);
LDR(INDEX_UNSIGNED, W1, CTXREG, MIPS_REG_RA * 4);

SetJumpTarget(done);
WriteDownCountR(W0);
WriteExitDestInR(W1);
js.compiling = false;
Expand Down
5 changes: 4 additions & 1 deletion Core/MIPS/IR/IRFrontend.cpp
Expand Up @@ -164,15 +164,18 @@ void IRFrontend::Comp_ReplacementFunc(MIPSOpcode op) {
FlushAll();
RestoreRoundingMode();
ir.Write(IROp::SetPCConst, 0, ir.AddConstant(GetCompilerPC()));
ir.Write(IROp::CallReplacement, 0, ir.AddConstant(index));
ir.Write(IROp::CallReplacement, IRTEMP_0, ir.AddConstant(index));

if (entry->flags & (REPFLAG_HOOKENTER | REPFLAG_HOOKEXIT)) {
// Compile the original instruction at this address. We ignore cycles for hooks.
ApplyRoundingMode();
MIPSCompileOp(Memory::Read_Instruction(GetCompilerPC(), true), this);
} else {
ApplyRoundingMode();
// If IRTEMP_0 was set non-zero (-1, from a negative cycle count), the replacement needs to run again (sliced.)
// This is necessary for replacements that take a lot of cycles.
ir.Write(IROp::Downcount, 0, ir.AddConstant(js.downcountAmount));
ir.Write(IROp::ExitToConstIfNeq, ir.AddConstant(GetCompilerPC()), IRTEMP_0, MIPS_REG_ZERO);
ir.Write(IROp::ExitToReg, 0, MIPS_REG_RA, 0);
js.compiling = false;
}
Expand Down
2 changes: 1 addition & 1 deletion Core/MIPS/IR/IRInst.cpp
Expand Up @@ -165,7 +165,7 @@ static const IRMeta irMeta[] = {
{ IROp::Break, "Break", "", IRFLAG_EXIT },
{ IROp::SetPC, "SetPC", "_G" },
{ IROp::SetPCConst, "SetPC", "_C" },
{ IROp::CallReplacement, "CallRepl", "_C", IRFLAG_BARRIER },
{ IROp::CallReplacement, "CallRepl", "GC", IRFLAG_BARRIER },
{ IROp::Breakpoint, "Breakpoint", "_C", IRFLAG_BARRIER },
{ IROp::MemoryCheck, "MemoryCheck", "IGC", IRFLAG_BARRIER },

Expand Down
3 changes: 2 additions & 1 deletion Core/MIPS/IR/IRInterpreter.cpp
Expand Up @@ -1089,7 +1089,8 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
int funcIndex = inst->constant;
const ReplacementTableEntry *f = GetReplacementFunc(funcIndex);
int cycles = f->replaceFunc();
mips->downcount -= cycles;
mips->r[inst->dest] = cycles < 0 ? -1 : 0;
mips->downcount -= cycles < 0 ? -cycles : cycles;
break;
}

Expand Down
6 changes: 5 additions & 1 deletion Core/MIPS/MIPSInt.cpp
Expand Up @@ -1038,13 +1038,17 @@ namespace MIPSInt
int index = op.encoding & 0xFFFFFF;
const ReplacementTableEntry *entry = GetReplacementFunc(index);
if (entry && entry->replaceFunc && (entry->flags & REPFLAG_DISABLED) == 0) {
entry->replaceFunc();
int cycles = entry->replaceFunc();

if (entry->flags & (REPFLAG_HOOKENTER | REPFLAG_HOOKEXIT)) {
// Interpret the original instruction under the hook.
MIPSInterpret(Memory::Read_Instruction(PC, true));
} else if (cycles < 0) {
// Leave PC unchanged, call the replacement again (assumes args are modified.)
currentMIPS->downcount += cycles;
} else {
PC = currentMIPS->r[MIPS_REG_RA];
currentMIPS->downcount -= cycles;
}
} else {
if (!entry || !entry->replaceFunc) {
Expand Down
7 changes: 7 additions & 0 deletions Core/MIPS/RiscV/RiscVCompSystem.cpp
Expand Up @@ -220,6 +220,13 @@ void RiscVJitBackend::CompIR_System(IRInst inst) {
QuickCallFunction(GetReplacementFunc(inst.constant)->replaceFunc, SCRATCH2);
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
LoadStaticRegisters();

regs_.Map(inst);
SRAIW(regs_.R(inst.dest), X10, 31);

// Absolute value trick: if neg, abs(x) == (x ^ -1) + 1.
XOR(X10, X10, regs_.R(inst.dest));
SUBW(X10, X10, regs_.R(inst.dest));
SUB(DOWNCOUNTREG, DOWNCOUNTREG, X10);
break;

Expand Down
10 changes: 10 additions & 0 deletions Core/MIPS/x86/Jit.cpp
Expand Up @@ -658,8 +658,18 @@ void Jit::Comp_ReplacementFunc(MIPSOpcode op) {
ApplyRoundingMode();
MIPSCompileOp(Memory::Read_Instruction(GetCompilerPC(), true), this);
} else {
CMP(32, R(EAX), Imm32(0));
FixupBranch positive = J_CC(CC_GE);

MOV(32, R(ECX), MIPSSTATE_VAR(pc));
ADD(32, MIPSSTATE_VAR(downcount), R(EAX));
FixupBranch done = J();

SetJumpTarget(positive);
MOV(32, R(ECX), MIPSSTATE_VAR(r[MIPS_REG_RA]));
SUB(32, MIPSSTATE_VAR(downcount), R(EAX));

SetJumpTarget(done);
ApplyRoundingMode();
// Need to set flags again, ApplyRoundingMode destroyed them (and EAX.)
SUB(32, MIPSSTATE_VAR(downcount), Imm8(0));
Expand Down
17 changes: 15 additions & 2 deletions Core/MIPS/x86/X64IRCompSystem.cpp
Expand Up @@ -233,8 +233,21 @@ void X64JitBackend::CompIR_System(IRInst inst) {
ABI_CallFunction(GetReplacementFunc(inst.constant)->replaceFunc);
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
LoadStaticRegisters();
//SUB(32, R(DOWNCOUNTREG), R(DOWNCOUNTREG), R(EAX));
SUB(32, MDisp(CTXREG, downcountOffset), R(EAX));

// Since we flushed above, and we're mapping write, EAX should be safe.
regs_.Map(inst);
MOV(32, regs_.R(inst.dest), R(EAX));
NEG(32, R(EAX));
// Set it back if the negate made it negative. That's the absolute value.
CMOVcc(32, EAX, regs_.R(inst.dest), CC_S);

// Now set the dest to the sign bit status.
SAR(32, regs_.R(inst.dest), Imm8(31));

if (jo.downcountInRegister)
SUB(32, R(DOWNCOUNTREG), R(EAX));
else
SUB(32, MDisp(CTXREG, downcountOffset), R(EAX));
break;

case IROp::Break:
Expand Down

0 comments on commit e5af1f8

Please sign in to comment.