Skip to content

Commit

Permalink
Merge pull request #18213 from unknownbrackets/x86-ir-fcmp
Browse files (browse the repository at this point in the history)
IR: Improve fcmp/vfpu compare jit
  • Loading branch information
hrydgard committed Sep 24, 2023
2 parents 87feeeb + 14e2e1e commit ac3139b
Show file tree
Hide file tree
Showing 3 changed files with 117 additions and 56 deletions.
28 changes: 17 additions & 11 deletions Core/MIPS/ARM64/Arm64IRCompFPU.cpp
Expand Up @@ -298,17 +298,23 @@ void Arm64JitBackend::CompIR_FCompare(IRInst inst) {

case IROp::FCmpVfpuAggregate:
regs_.MapGPR(IRREG_VFPU_CC, MIPSMap::DIRTY);
MOVI2R(SCRATCH1, inst.dest);
// Grab the any bit.
TST(regs_.R(IRREG_VFPU_CC), SCRATCH1);
CSET(SCRATCH2, CC_NEQ);
// Now the all bit, by clearing our mask to zero.
BICS(WZR, SCRATCH1, regs_.R(IRREG_VFPU_CC));
CSET(SCRATCH1, CC_EQ);

// Insert the bits into place.
BFI(regs_.R(IRREG_VFPU_CC), SCRATCH2, 4, 1);
BFI(regs_.R(IRREG_VFPU_CC), SCRATCH1, 5, 1);
if (inst.dest == 1) {
// Just replicate the lowest bit to the others.
BFI(regs_.R(IRREG_VFPU_CC), regs_.R(IRREG_VFPU_CC), 4, 1);
BFI(regs_.R(IRREG_VFPU_CC), regs_.R(IRREG_VFPU_CC), 5, 1);
} else {
MOVI2R(SCRATCH1, inst.dest);
// Grab the any bit.
TST(regs_.R(IRREG_VFPU_CC), SCRATCH1);
CSET(SCRATCH2, CC_NEQ);
// Now the all bit, by clearing our mask to zero.
BICS(WZR, SCRATCH1, regs_.R(IRREG_VFPU_CC));
CSET(SCRATCH1, CC_EQ);

// Insert the bits into place.
BFI(regs_.R(IRREG_VFPU_CC), SCRATCH2, 4, 1);
BFI(regs_.R(IRREG_VFPU_CC), SCRATCH1, 5, 1);
}
break;

default:
Expand Down
38 changes: 25 additions & 13 deletions Core/MIPS/RiscV/RiscVCompFPU.cpp
Expand Up @@ -520,20 +520,32 @@ void RiscVJitBackend::CompIR_FCompare(IRInst inst) {

case IROp::FCmpVfpuAggregate:
regs_.MapGPR(IRREG_VFPU_CC, MIPSMap::DIRTY);
ANDI(SCRATCH1, regs_.R(IRREG_VFPU_CC), inst.dest);
// This is the "any bit", easy.
SNEZ(SCRATCH2, SCRATCH1);
// To compare to inst.dest for "all", let's simply subtract it and compare to zero.
ADDI(SCRATCH1, SCRATCH1, -inst.dest);
SEQZ(SCRATCH1, SCRATCH1);
// Now we combine those together.
SLLI(SCRATCH1, SCRATCH1, 5);
SLLI(SCRATCH2, SCRATCH2, 4);
OR(SCRATCH1, SCRATCH1, SCRATCH2);
if (inst.dest == 1) {
ANDI(SCRATCH1, regs_.R(IRREG_VFPU_CC), inst.dest);
// Negate so 1 becomes all bits set and zero stays zero, then mask to 0x30.
NEG(SCRATCH1, SCRATCH1);
ANDI(SCRATCH1, SCRATCH1, 0x30);

// Reject the old any/all bits and replace them with our own.
ANDI(regs_.R(IRREG_VFPU_CC), regs_.R(IRREG_VFPU_CC), ~0x30);
OR(regs_.R(IRREG_VFPU_CC), regs_.R(IRREG_VFPU_CC), SCRATCH1);
} else {
ANDI(SCRATCH1, regs_.R(IRREG_VFPU_CC), inst.dest);
FixupBranch skipZero = BEQ(SCRATCH1, R_ZERO);

// Reject those any/all bits and replace them with our own.
ANDI(regs_.R(IRREG_VFPU_CC), regs_.R(IRREG_VFPU_CC), ~0x30);
OR(regs_.R(IRREG_VFPU_CC), regs_.R(IRREG_VFPU_CC), SCRATCH1);
// To compare to inst.dest for "all", let's simply subtract it and compare to zero.
ADDI(SCRATCH1, SCRATCH1, -inst.dest);
SEQZ(SCRATCH1, SCRATCH1);
// Now we combine with the "any" bit.
SLLI(SCRATCH1, SCRATCH1, 5);
ORI(SCRATCH1, SCRATCH1, 0x10);

SetJumpTarget(skipZero);

// Reject the old any/all bits and replace them with our own.
ANDI(regs_.R(IRREG_VFPU_CC), regs_.R(IRREG_VFPU_CC), ~0x30);
OR(regs_.R(IRREG_VFPU_CC), regs_.R(IRREG_VFPU_CC), SCRATCH1);
}
break;

default:
Expand Down
107 changes: 75 additions & 32 deletions Core/MIPS/x86/X64IRCompFPU.cpp
Expand Up @@ -296,25 +296,22 @@ void X64JitBackend::CompIR_FCompare(IRInst inst) {
break;

case IRFpCompareMode::EqualOrdered:
{
// Since UCOMISS doesn't give us ordered == directly, CMPSS is better.
regs_.SpillLockFPR(inst.src1, inst.src2);
X64Reg tempReg = regs_.GetAndLockTempFPR();
regs_.MapWithExtra(inst, { { 'G', IRREG_FPCOND, 1, MIPSMap::NOINIT } });
// Clear the upper bits of SCRATCH1 so we can AND later.
// We don't have a single flag we can check, unfortunately.
XOR(32, R(SCRATCH1), R(SCRATCH1));
UCOMISS(regs_.FX(inst.src1), regs_.F(inst.src2));
// E/ZF = EQUAL or UNORDERED (not exactly what we want.)
SETcc(CC_E, R(SCRATCH1));
if (regs_.HasLowSubregister(regs_.RX(IRREG_FPCOND))) {
// NP/!PF = ORDERED.
SETcc(CC_NP, regs_.R(IRREG_FPCOND));
AND(32, regs_.R(IRREG_FPCOND), R(SCRATCH1));

if (cpu_info.bAVX) {
VCMPSS(tempReg, regs_.FX(inst.src1), regs_.F(inst.src2), CMP_EQ);
} else {
MOVZX(32, 8, regs_.RX(IRREG_FPCOND), R(SCRATCH1));
// Neither of those affected flags, luckily.
// NP/!PF = ORDERED.
SETcc(CC_NP, R(SCRATCH1));
AND(32, regs_.R(IRREG_FPCOND), R(SCRATCH1));
MOVAPS(tempReg, regs_.F(inst.src1));
CMPSS(tempReg, regs_.F(inst.src2), CMP_EQ);
}
MOVD_xmm(regs_.R(IRREG_FPCOND), tempReg);
AND(32, regs_.R(IRREG_FPCOND), Imm32(1));
break;
}

case IRFpCompareMode::EqualUnordered:
regs_.MapWithExtra(inst, { { 'G', IRREG_FPCOND, 1, MIPSMap::NOINIT } });
Expand Down Expand Up @@ -481,23 +478,69 @@ void X64JitBackend::CompIR_FCompare(IRInst inst) {

case IROp::FCmpVfpuAggregate:
regs_.MapGPR(IRREG_VFPU_CC, MIPSMap::DIRTY);
// First, clear out the bits we're aggregating.
// The register refuses writes to bits outside 0x3F, and we're setting 0x30.
AND(32, regs_.R(IRREG_VFPU_CC), Imm8(0xF));

// Set the any bit.
TEST(32, regs_.R(IRREG_VFPU_CC), Imm32(inst.dest));
SETcc(CC_NZ, R(SCRATCH1));
SHL(32, R(SCRATCH1), Imm8(4));
OR(32, regs_.R(IRREG_VFPU_CC), R(SCRATCH1));

// Next up, the "all" bit. A bit annoying...
MOV(32, R(SCRATCH1), regs_.R(IRREG_VFPU_CC));
AND(32, R(SCRATCH1), Imm8(inst.dest));
CMP(32, R(SCRATCH1), Imm8(inst.dest));
SETcc(CC_E, R(SCRATCH1));
SHL(32, R(SCRATCH1), Imm8(5));
OR(32, regs_.R(IRREG_VFPU_CC), R(SCRATCH1));
if (inst.dest == 1) {
// Special case 1, which is not uncommon.
AND(32, regs_.R(IRREG_VFPU_CC), Imm8(0xF));
BT(32, regs_.R(IRREG_VFPU_CC), Imm8(0));
FixupBranch skip = J_CC(CC_NC);
OR(32, regs_.R(IRREG_VFPU_CC), Imm8(0x30));
SetJumpTarget(skip);
} else if (inst.dest == 3) {
AND(32, regs_.R(IRREG_VFPU_CC), Imm8(0xF));
MOV(32, R(SCRATCH1), regs_.R(IRREG_VFPU_CC));
AND(32, R(SCRATCH1), Imm8(3));
// 0, 1, and 3 are already correct for the any and all bits.
CMP(32, R(SCRATCH1), Imm8(2));

FixupBranch skip = J_CC(CC_NE);
SUB(32, R(SCRATCH1), Imm8(1));
SetJumpTarget(skip);

SHL(32, R(SCRATCH1), Imm8(4));
OR(32, regs_.R(IRREG_VFPU_CC), R(SCRATCH1));
} else if (inst.dest == 0xF) {
XOR(32, R(SCRATCH1), R(SCRATCH1));

// Clear out the bits we're aggregating.
// The register refuses writes to bits outside 0x3F, and we're setting 0x30.
AND(32, regs_.R(IRREG_VFPU_CC), Imm8(0xF));

// Set the any bit, just using the AND above.
FixupBranch noneSet = J_CC(CC_Z);
OR(32, regs_.R(IRREG_VFPU_CC), Imm8(0x10));

// Next up, the "all" bit.
CMP(32, regs_.R(IRREG_VFPU_CC), Imm8(0xF));
SETcc(CC_E, R(SCRATCH1));
SHL(32, R(SCRATCH1), Imm8(5));
OR(32, regs_.R(IRREG_VFPU_CC), R(SCRATCH1));

SetJumpTarget(noneSet);
} else {
XOR(32, R(SCRATCH1), R(SCRATCH1));

// Clear out the bits we're aggregating.
// The register refuses writes to bits outside 0x3F, and we're setting 0x30.
AND(32, regs_.R(IRREG_VFPU_CC), Imm8(0xF));

// Set the any bit.
if (regs_.HasLowSubregister(regs_.RX(IRREG_VFPU_CC)))
TEST(8, regs_.R(IRREG_VFPU_CC), Imm8(inst.dest));
else
TEST(32, regs_.R(IRREG_VFPU_CC), Imm32(inst.dest));
FixupBranch noneSet = J_CC(CC_Z);
OR(32, regs_.R(IRREG_VFPU_CC), Imm8(0x10));

// Next up, the "all" bit. A bit annoying...
MOV(32, R(SCRATCH1), regs_.R(IRREG_VFPU_CC));
AND(32, R(SCRATCH1), Imm8(inst.dest));
CMP(32, R(SCRATCH1), Imm8(inst.dest));
SETcc(CC_E, R(SCRATCH1));
SHL(32, R(SCRATCH1), Imm8(5));
OR(32, regs_.R(IRREG_VFPU_CC), R(SCRATCH1));

SetJumpTarget(noneSet);
}
break;

default:
Expand Down

0 comments on commit ac3139b

Please sign in to comment.