Skip to content

Commit

Permalink
Optimize STORE_BLK
Browse files Browse the repository at this point in the history
  • Loading branch information
jakobbotsch committed Apr 28, 2024
1 parent 33cd392 commit fca965b
Show file tree
Hide file tree
Showing 6 changed files with 114 additions and 18 deletions.
9 changes: 5 additions & 4 deletions src/coreclr/jit/codegenarm64.cpp
Expand Up @@ -3668,7 +3668,8 @@ void CodeGen::genCodeForCpObj(GenTreeBlk* cpObjNode)
unsigned slots = layout->GetSlotCount();

// Temp register(s) used to perform the sequence of loads and stores.
regNumber tmpReg = internalRegisters.Extract(cpObjNode, RBM_ALLINT);
regMaskTP internalRegs = internalRegisters.ExtractAll(cpObjNode);
regNumber tmpReg = genFirstRegNumFromMaskAndToggle(internalRegs, RBM_ALLINT);
regNumber tmpReg2 = REG_NA;

assert(genIsValidIntReg(tmpReg));
Expand All @@ -3677,7 +3678,7 @@ void CodeGen::genCodeForCpObj(GenTreeBlk* cpObjNode)

if (slots > 1)
{
tmpReg2 = internalRegisters.Extract(cpObjNode, RBM_ALLINT);
tmpReg2 = genFirstRegNumFromMaskAndToggle(internalRegs, RBM_ALLINT);
assert(tmpReg2 != tmpReg);
assert(genIsValidIntReg(tmpReg2));
assert(tmpReg2 != REG_WRITE_BARRIER_DST_BYREF);
Expand Down Expand Up @@ -3730,8 +3731,8 @@ void CodeGen::genCodeForCpObj(GenTreeBlk* cpObjNode)
regNumber tmpSimdReg2 = REG_NA;
if ((slots >= 4) && compiler->IsBaselineSimdIsaSupported())
{
tmpSimdReg1 = internalRegisters.Extract(cpObjNode, RBM_ALLFLOAT);
tmpSimdReg2 = internalRegisters.Extract(cpObjNode, RBM_ALLFLOAT);
tmpSimdReg1 = genFirstRegNumFromMaskAndToggle(internalRegs, RBM_ALLFLOAT);
tmpSimdReg2 = genFirstRegNumFromMaskAndToggle(internalRegs, RBM_ALLFLOAT);
}

unsigned i = 0;
Expand Down
32 changes: 18 additions & 14 deletions src/coreclr/jit/codegenarmarch.cpp
Expand Up @@ -2965,42 +2965,44 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
GetEmitter()->emitDisableGC();
}

regMaskTP internalRegs = internalRegisters.ExtractAll(node);

if ((srcOffsetAdjustment != 0) && (dstOffsetAdjustment != 0))
{
const regNumber tempReg1 = internalRegisters.Extract(node, RBM_ALLINT);
const regNumber tempReg1 = genFirstRegNumFromMaskAndToggle(internalRegs, RBM_ALLINT);
genInstrWithConstant(INS_add, EA_PTRSIZE, tempReg1, srcReg, srcOffsetAdjustment, tempReg1);
srcReg = tempReg1;

const regNumber tempReg2 = internalRegisters.Extract(node, RBM_ALLINT);
const regNumber tempReg2 = genFirstRegNumFromMaskAndToggle(internalRegs, RBM_ALLINT);
genInstrWithConstant(INS_add, EA_PTRSIZE, tempReg2, dstReg, dstOffsetAdjustment, tempReg2);
dstReg = tempReg2;
}
else if (srcOffsetAdjustment != 0)
{
const regNumber tempReg = internalRegisters.Extract(node, RBM_ALLINT);
const regNumber tempReg = genFirstRegNumFromMaskAndToggle(internalRegs, RBM_ALLINT);
genInstrWithConstant(INS_add, EA_PTRSIZE, tempReg, srcReg, srcOffsetAdjustment, tempReg);
srcReg = tempReg;
}
else if (dstOffsetAdjustment != 0)
{
const regNumber tempReg = internalRegisters.Extract(node, RBM_ALLINT);
const regNumber tempReg = genFirstRegNumFromMaskAndToggle(internalRegs, RBM_ALLINT);
genInstrWithConstant(INS_add, EA_PTRSIZE, tempReg, dstReg, dstOffsetAdjustment, tempReg);
dstReg = tempReg;
}

regNumber intReg1 = REG_NA;
regNumber intReg2 = REG_NA;

const unsigned intRegCount = internalRegisters.Count(node, RBM_ALLINT);
const unsigned intRegCount = genCountBits(internalRegs & RBM_ALLINT);

if (intRegCount >= 2)
{
intReg1 = internalRegisters.Extract(node, RBM_ALLINT);
intReg2 = internalRegisters.Extract(node, RBM_ALLINT);
intReg1 = genFirstRegNumFromMaskAndToggle(internalRegs, RBM_ALLINT);
intReg2 = genFirstRegNumFromMaskAndToggle(internalRegs, RBM_ALLINT);
}
else if (intRegCount == 1)
{
intReg1 = internalRegisters.GetSingle(node, RBM_ALLINT);
intReg1 = genFirstRegNumFromMaskAndToggle(internalRegs, RBM_ALLINT);
intReg2 = rsGetRsvdReg();
}
else
Expand All @@ -3010,8 +3012,8 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)

if (shouldUse16ByteWideInstrs)
{
const regNumber simdReg1 = internalRegisters.Extract(node, RBM_ALLFLOAT);
const regNumber simdReg2 = internalRegisters.GetSingle(node, RBM_ALLFLOAT);
const regNumber simdReg1 = genFirstRegNumFromMaskAndToggle(internalRegs, RBM_ALLFLOAT);
const regNumber simdReg2 = genFirstRegNumFromMaskAndToggle(internalRegs, RBM_ALLFLOAT);

helper.Unroll(FP_REGSIZE_BYTES, intReg1, simdReg1, simdReg2, srcReg, dstReg, GetEmitter());
}
Expand Down Expand Up @@ -3146,14 +3148,15 @@ void CodeGen::genCodeForMemmove(GenTreeBlk* tree)
unsigned simdSize = FP_REGSIZE_BYTES;
if (size >= simdSize)
{
regMaskTP internalRegs = internalRegisters.ExtractAll(tree);
// Number of SIMD regs needed to save the whole src to regs.
const unsigned numberOfSimdRegs = internalRegisters.Count(tree, RBM_ALLFLOAT);
const unsigned numberOfSimdRegs = genCountBits(internalRegs & RBM_ALLFLOAT);

// Pop all temp regs into a local array; currently this implementation is limited by LSRA's MaxInternalCount
regNumber tempRegs[LinearScan::MaxInternalCount] = {};
for (unsigned i = 0; i < numberOfSimdRegs; i++)
{
tempRegs[i] = internalRegisters.Extract(tree, RBM_ALLFLOAT);
tempRegs[i] = genFirstRegNumFromMaskAndToggle(internalRegs, RBM_ALLFLOAT);
}

auto emitSimdLoadStore = [&](bool load) {
Expand Down Expand Up @@ -3197,8 +3200,9 @@ void CodeGen::genCodeForMemmove(GenTreeBlk* tree)
else
{
assert(internalRegisters.Count(tree) == 2);
const regNumber tmpReg1 = internalRegisters.Extract(tree, RBM_ALLINT);
const regNumber tmpReg2 = internalRegisters.Extract(tree, RBM_ALLINT);
regMaskTP internalRegs = internalRegisters.ExtractAll(tree);
const regNumber tmpReg1 = genFirstRegNumFromMaskAndToggle(internalRegs, RBM_ALLINT);
const regNumber tmpReg2 = genFirstRegNumFromMaskAndToggle(internalRegs, RBM_ALLINT);
emitLoadStore(/* load */ true, loadStoreSize, tmpReg1, 0);
emitLoadStore(/* load */ true, loadStoreSize, tmpReg2, size - loadStoreSize);
emitLoadStore(/* load */ false, loadStoreSize, tmpReg1, 0);
Expand Down
50 changes: 50 additions & 0 deletions src/coreclr/jit/codegencommon.cpp
Expand Up @@ -67,6 +67,8 @@ NodeInternalRegisters::NodeInternalRegisters(Compiler* comp)
{
}

//static NodeCounts s_nodeCounts;
//static DumpOnShutdown s_d("Node types with internal regs", &s_nodeCounts);
//------------------------------------------------------------------------
// Add: Add internal allocated registers for the specified node.
//
Expand All @@ -78,6 +80,7 @@ void NodeInternalRegisters::Add(GenTree* tree, regMaskTP regs)
{
assert(regs != RBM_NONE);

//s_nodeCounts.record(tree->gtOper);
regMaskTP* result = m_table.LookupPointerOrAdd(tree, RBM_NONE);
*result |= regs;
}
Expand Down Expand Up @@ -151,6 +154,28 @@ regMaskTP NodeInternalRegisters::GetAll(GenTree* tree)
return m_table.Lookup(tree, &regs) ? regs : RBM_NONE;
}

//------------------------------------------------------------------------
// ExtractAll: Take every internal register recorded for the specified IR node,
//             leaving the node's recorded set empty.
//
// Parameters:
//   tree - IR node whose internal registers are taken
//
// Returns:
//   Mask of the registers that were recorded for the node, or RBM_NONE if the
//   table has no entry for it.
//
regMaskTP NodeInternalRegisters::ExtractAll(GenTree* tree)
{
    regMaskTP        extracted = RBM_NONE;
    regMaskTP* const slot      = m_table.LookupPointer(tree);

    if (slot != nullptr)
    {
        // Hand the whole set to the caller and clear the table entry so that
        // the same registers are not handed out again for this node.
        extracted = *slot;
        *slot     = RBM_NONE;
    }

    return extracted;
}

//------------------------------------------------------------------------
// Count: return the number of available temporary registers in the (optional)
// given set (typically, RBM_ALLINT or RBM_ALLFLOAT).
Expand Down Expand Up @@ -1983,6 +2008,31 @@ void CodeGen::genGenerateMachineCode()
/* Now generate code for the function */
genCodeForBBlist();

//for (BasicBlock* block : compiler->Blocks())
//{
// for (GenTree* node : LIR::AsRange(block))
// {
// static const unsigned buckets[] = { 0,1,2,3,4,5,6,7,8,0 };
// static Histogram s_numInternalRegs(buckets);
// static DumpOnShutdown s_d("Internal registers after codegen", &s_numInternalRegs);
// if (!node->IsCall() || node->AsCall()->IsFastTailCall())
// {
// s_numInternalRegs.record(0);
// }
// else
// {
// unsigned count = internalRegisters.Count(node);
// s_numInternalRegs.record(count);

// static NodeCounts s_internalRegsByNode;
// static DumpOnShutdown s_d2("Opers with internal registers after codegen", &s_internalRegsByNode);
// if (count > 0)
// s_internalRegsByNode.record(node->gtOper);
// }

// }
//}

#ifdef DEBUG
// After code generation, dump the frame layout again. It should be the same as before code generation, if code
// generation hasn't touched it (it shouldn't!).
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/jit/codegeninterface.h
Expand Up @@ -58,6 +58,7 @@ class NodeInternalRegisters
regNumber Extract(GenTree* tree, regMaskTP mask = static_cast<regMaskTP>(-1));
regNumber GetSingle(GenTree* tree, regMaskTP mask = static_cast<regMaskTP>(-1));
regMaskTP GetAll(GenTree* tree);
regMaskTP ExtractAll(GenTree* tree);
unsigned Count(GenTree* tree, regMaskTP mask = static_cast<regMaskTP>(-1));
};

Expand Down
23 changes: 23 additions & 0 deletions src/coreclr/jit/compiler.hpp
Expand Up @@ -933,6 +933,29 @@ inline regNumber genFirstRegNumFromMaskAndToggle(regMaskTP& mask)
return regNum;
}

//------------------------------------------------------------------------------
// genFirstRegNumFromMaskAndToggle : Maps the first bit set in both `mask` and
//                                   `otherMask` to a register number and toggles
//                                   that bit in `mask`.
// Arguments:
//    mask      - the register mask to pick from and to modify
//    otherMask - filter selecting which bits of `mask` are eligible to be picked
//
// Return Value:
//    The number of the first register contained in `mask & otherMask`. The
//    corresponding bit in `mask` is toggled; bits outside the pick are untouched.
//
inline regNumber genFirstRegNumFromMaskAndToggle(regMaskTP& mask, regMaskTP otherMask)
{
    assert((mask & otherMask) != 0); // Must have one bit set, so can't have a mask of zero

    // Only registers present in both masks are candidates.
    const regMaskTP candidates = mask & otherMask;

    // The position of the lowest set candidate bit is the register number.
    const regNumber picked = static_cast<regNumber>(BitOperations::BitScanForward(candidates));

    // The picked bit is known to be set in `mask`, so the XOR removes it.
    mask ^= genRegMask(picked);

    return picked;
}

//------------------------------------------------------------------------------
// genFirstRegNumFromMask : Maps first bit set in the register mask to a register number.
//
Expand Down
17 changes: 17 additions & 0 deletions src/coreclr/jit/lsra.cpp
Expand Up @@ -1428,6 +1428,23 @@ PhaseStatus LinearScan::doLinearScan()
}
compiler->EndPhase(PHASE_LINEAR_SCAN_RESOLVE);

//for (BasicBlock* block : compiler->Blocks())
//{
// for (GenTree* node : LIR::AsRange(block))
// {
// static const unsigned buckets[] = { 0,1,2,3,4,5,6,7,8,0 };
// static Histogram s_numInternalRegs(buckets);
// static DumpOnShutdown s_d("Internal registers after LSRA", &s_numInternalRegs);
// unsigned count = compiler->codeGen->internalRegisters.Count(node);
// s_numInternalRegs.record(count);

// static NodeCounts s_internalRegsByNode;
// static DumpOnShutdown s_d2("Opers with internal registers after LSRA", &s_internalRegsByNode);
// if (count > 0)
// s_internalRegsByNode.record(node->gtOper);
// }
//}

assert(blockSequencingDone); // Should do at least one traversal.
assert(blockEpoch == compiler->GetCurBasicBlockEpoch());

Expand Down

0 comments on commit fca965b

Please sign in to comment.