Skip to content

Commit

Permalink
Add VC support for Arrow Lake and Lunar Lake platforms
Browse files Browse the repository at this point in the history
.
  • Loading branch information
vmustya authored and igcbot committed Jan 6, 2024
1 parent 0fa85c9 commit e3333c9
Show file tree
Hide file tree
Showing 29 changed files with 1,460 additions and 8 deletions.
2 changes: 2 additions & 0 deletions IGC/VectorCompiler/cmake/supported_platforms_list.cmake
Expand Up @@ -15,6 +15,8 @@ set(SUPPORTED_VC_PLATFORMS
"XeHP"
"XeHPG"
"XeLPG"
"XeLPGPlus"
"XeHPC"
"XeHPCVG"
"Xe2"
)
9 changes: 9 additions & 0 deletions IGC/VectorCompiler/igcdeps/src/TranslationInterface.cpp
Expand Up @@ -170,6 +170,11 @@ getPlatformName(const PLATFORM &Platform) {
return {"XeHPG", RevId};
if (Product == IGFX_METEORLAKE)
return {"XeLPG", RevId};
if (Product == IGFX_ARROWLAKE) {
if (GFX_IS_ARL_S(DevId))
return {"XeLPG", RevId};
return {"XeLPGPlus", RevId};
}
break;
case IGFX_XE_HPC_CORE:
if (Product == IGFX_PVC) {
Expand All @@ -178,6 +183,10 @@ getPlatformName(const PLATFORM &Platform) {
return {"XeHPC", RevId & ComputeTileMaskPVC};
}
break;
case IGFX_XE2_LPG_CORE:
if (Product == IGFX_LUNARLAKE)
return {"Xe2", RevId};
break;
default:
break;
}
Expand Down
Expand Up @@ -724,6 +724,63 @@
],
"attributes": "WriteMem", },

## ``llvm.vc.internal.lsc.*.quad.tgm`` : Typed LSC quad load/store/prefetch intrinsics
## ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
## * arg0: vNi1 Predicate (overloaded)
## * arg1: i8 L1 cache controls [MBC]
## * arg2: i8 L3 cache controls [MBC]
## * arg3: i8 Channel mask [MBC]
## * arg4: i32 BTI
## * arg5: vNi32 U pixel indices (overloaded)
## * arg6: vNi32 V pixel indices
## * arg7: vNi32 R pixel indices
## * arg8: vNi32 LOD pixel indices
## * arg9: vector to take values for masked simd lanes from (load)
## vector to take values to write (store)
##
## * Return value: the value read from memory (load) or void (store, prefetch)
##
# Typed LSC quad load entry: the result is an overloaded vector and the entry
# is tagged "ReadMem".
# NOTE(review): the bare integers in the argument list look like
# back-references to an earlier overloaded ("any*") type by index -- here 2
# would be the U index type and 0 the result type. Confirm against the
# intrinsic generator.
"lsc_load_quad_tgm": { "result": "anyvector",
"arguments": [
"anyint", # vNxi1, predicate
"char", # L1 cache control
"char", # L3 cache control
"char", # channel mask
"int", # i32 BTI
"anyint", # vNi32 U pixel index
2, # vNi32 V pixel index
2, # vNi32 R pixel index
2, # vNi32 LOD pixel index
0, # passthru value
],
"attributes": "ReadMem", },
# Typed LSC quad store entry: no result ("void"), the data to write is the
# last argument, and the entry is tagged "WriteMem".
# NOTE(review): the bare integer 1 looks like a back-reference to the second
# overloaded ("any*") type, i.e. the U index type, because the result is not
# overloaded here. Confirm against the intrinsic generator.
"lsc_store_quad_tgm": { "result": "void",
"arguments": [
"anyint", # vNxi1, predicate
"char", # L1 cache control
"char", # L3 cache control
"char", # channel mask
"int", # i32 BTI
"anyint", # vNi32 U pixel index
1, # vNi32 V pixel index
1, # vNi32 R pixel index
1, # vNi32 LOD pixel index
"anyvector", # data to write
],
"attributes": "WriteMem", },
# Typed LSC quad prefetch entry: same address operands as the load/store
# entries, but no result and no data/passthru operand. It is tagged
# "SideEffects" rather than "ReadMem"/"WriteMem".
# NOTE(review): the bare integer 1 looks like a back-reference to the second
# overloaded ("any*") type, i.e. the U index type. Confirm against the
# intrinsic generator.
"lsc_prefetch_quad_tgm": { "result": "void",
"arguments": [
"anyint", # vNxi1, predicate
"char", # L1 cache control
"char", # L3 cache control
"char", # channel mask
"int", # i32 BTI
"anyint", # vNi32 U pixel index
1, # vNi32 V pixel index
1, # vNi32 R pixel index
1, # vNi32 LOD pixel index
],
"attributes": "SideEffects", },

### --------------------
### Thread ID intrinsics
Expand Down
66 changes: 61 additions & 5 deletions IGC/VectorCompiler/lib/GenXCodeGen/GenX.td
Expand Up @@ -47,12 +47,14 @@ def FeaturePartialI64Emulation : SubtargetFeature<"lightweight_i64_emulation",
"PartialI64Emulation",
"true",
"emulate subset of 64-bit operations">;

// Subtarget feature "no_legacy_dataport": sets the NoLegacyDataport field to
// "true". Among the processor definitions in this file it is listed for "Xe2".
def FeatureNoLegacyDataport : SubtargetFeature<"no_legacy_dataport",
"NoLegacyDataport",
"true",
"true if platform has no legacy dataport">;
// Subtarget feature "mul_ddq": sets the UseMulDDQ field to "true", selecting
// the native dword x dword -> qword multiply described in the help string.
def FeatureUseMulDDQ : SubtargetFeature<"mul_ddq",
"UseMulDDQ",
"true",
"use native support for mul [U]Dx[U]D->Q">;

def FeatureLongLongEmulation : SubtargetFeature<"emulate_i64",
"EmulateLongLong",
"true",
Expand All @@ -74,6 +76,9 @@ def FeatureSwitchjmp : SubtargetFeature<"switchjmp", "HasSwitchjmp", "true",

// Subtarget feature "preemption": sets the HasPreemption field to "true".
def FeaturePreemption : SubtargetFeature<"preemption", "HasPreemption", "true",
"supports preemption">;
// Subtarget feature "systolic_denorm_control": sets the
// HasSystolicDenormControl field to "true".
// NOTE(review): presumably this backs the subtarget's
// hasSystolicDenormControl() query used by the CISA builder when it composes
// its control-register mask -- confirm in GenXSubtarget.
def FeatureSystolicDenormControl : SubtargetFeature<
"systolic_denorm_control", "HasSystolicDenormControl", "true",
"supports control for systolic pipeline types denormal values">;

def FeatureWAFusedEUNoMask : SubtargetFeature<
"wa_nomask_fusedEU", "WaNoMaskFusedEU", "true",
Expand Down Expand Up @@ -152,7 +157,10 @@ def FeatureThreadPayloadInMemory : SubtargetFeature<"thread_payload_in_memory",
// Subtarget feature "feature_has_lsc": sets the HasLSCMessages field to
// "true". Note that the field name (HasLSCMessages) differs from the record
// name (FeatureHasLSC).
def FeatureHasLSC : SubtargetFeature<"feature_has_lsc",
"HasLSCMessages", "true",
"Target supports LSC messages">;

// Subtarget feature "feature_has_lsc_offset": sets the HasLSCOffset field to
// "true". The trailing list is SubtargetFeature's implied-features argument,
// so enabling this feature also enables FeatureHasLSC.
def FeatureHasLSCOffset : SubtargetFeature<"feature_has_lsc_offset",
"HasLSCOffset", "true",
"Target supports constant offset for LSC messages",
[FeatureHasLSC]>;
// Subtarget feature "feature_has_add3": sets the HasAdd3 field to "true"
// (three-operand addition, per the help string).
def FeatureHasAdd3 : SubtargetFeature<"feature_has_add3",
"HasAdd3", "true",
"Target supports 3-way addition">;
Expand Down Expand Up @@ -196,8 +204,6 @@ def FeatureSLM64K : SubtargetFeature<"slm_64k",
// Subtarget feature "slm_128k": unlike the boolean features, this one assigns
// the value "128" to the MaxSLMSize field (help string: up to 128k of SLM).
def FeatureSLM128K : SubtargetFeature<"slm_128k",
"MaxSLMSize", "128",
"Target supports up to 128k of SLM">;


// Subtarget feature "feature_has_sad2": sets the HasSad2 field to "true"
// (availability of the sad2/sad2a instructions, per the help string).
def FeatureHasSad2 : SubtargetFeature<"feature_has_sad2",
"HasSad2", "true",
"Target supports sad2/sad2a instructions">;
Expand Down Expand Up @@ -395,6 +401,27 @@ def : Proc<"XeLPG", [
FeatureThreadPayloadInMemory,
]>;

// Processor model "XeLPGPlus".
// getPlatformName() in igcdeps/src/TranslationInterface.cpp returns this name
// for IGFX_ARROWLAKE devices that are not ARL-S (ARL-S is mapped to "XeLPG").
// The list enables LSC messages (FeatureHasLSC) but not FeatureHasLSCOffset
// or FeatureNoLegacyDataport, and it selects FeatureLongLongEmulation rather
// than FeatureLongLong.
def : Proc<"XeLPGPlus", [
FeatureFDivFSqrt64Emulation,
FeatureFP64,
FeatureFusedEU,
FeatureHas8ThreadsPerEU,
FeatureHasAdd3,
FeatureHasBfn,
FeatureHasLSC,
FeatureHasLargeGRF,
FeatureHasMadSimd32,
FeatureHasOWordSLM,
FeatureHasPackedFloat,
FeatureIndirectByteGRFCrossing,
FeatureIndirectGRFCrossing,
FeatureInstrBitRotate,
FeatureLongLongEmulation,
FeatureMultiIndirectByteRegioning,
FeatureSLM128K,
FeatureThreadPayloadInMemory,
]>;

def : Proc<"XeHPC", [
FeatureFP64,
FeatureGRFByteSize64,
Expand Down Expand Up @@ -447,6 +474,35 @@ def : Proc<"XeHPCVG", [
FeatureThreadPayloadInMemory,
]>;

// Processor model "Xe2".
// getPlatformName() in igcdeps/src/TranslationInterface.cpp returns this name
// for IGFX_LUNARLAKE under the IGFX_XE2_LPG_CORE render core family.
// Compared with "XeLPGPlus" this list adds, among others, FeatureHasLSCOffset,
// FeatureNoLegacyDataport, FeatureSystolicDenormControl and
// FeatureGRFByteSize64, and uses FeatureLongLong together with
// FeaturePartialI64Emulation instead of FeatureLongLongEmulation.
def : Proc<"Xe2", [
FeatureFP64,
FeatureGRFByteSize64,
FeatureHas8ThreadsPerEU,
FeatureHasAdd3,
FeatureHasBfn,
FeatureHasLSC,
FeatureHasLSCOffset,
FeatureHasLargeGRF,
FeatureHasMadSimd32,
FeatureHasNamedBarriers,
FeatureHasOWordSLM,
FeatureIEEEDivSqrt,
FeatureIndirectGRFCrossing,
FeatureInstr64BitRotate,
FeatureInstrAdd64,
FeatureInstrGlobalAtomicAddF64,
FeatureInstrLocalIntegerCas64,
FeatureLSCMaxWidth32,
FeatureLongLong,
FeatureNoLegacyDataport,
FeaturePartialI64Emulation,
FeaturePreemption,
FeatureSLM128K,
FeatureSwitchjmp,
FeatureSystolicDenormControl,
FeatureThreadPayloadInMemory,
]>;

// Top-level Target record for the GenX backend. No fields are overridden, so
// the record body is intentionally empty for now.
def GenX : Target {
// Nothing here (yet?)
}
71 changes: 71 additions & 0 deletions IGC/VectorCompiler/lib/GenXCodeGen/GenXCisaBuilder.cpp
Expand Up @@ -397,6 +397,7 @@ class GenXKernelBuilder {
DoublePrecisionDenorm = 1 << 6,
SinglePrecisionDenorm = 1 << 7,
HalfPrecisionDenorm = 1 << 10,
SystolicDenorm = 1 << 30,
};

uint32_t CRMask = 0;
Expand Down Expand Up @@ -1129,6 +1130,8 @@ bool GenXKernelBuilder::run() {
CRBits::DoublePrecisionDenorm | CRBits::SinglePrecisionDenorm |
CRBits::HalfPrecisionDenorm;

if (Subtarget->hasSystolicDenormControl())
CRMask |= CRBits::SystolicDenorm;

StackCallExecSize =
getExecSizeFromValue(BackendConfig->getInteropSubgroupSize());
Expand Down Expand Up @@ -3763,6 +3766,74 @@ void GenXKernelBuilder::buildIntrinsic(CallInst *CI, unsigned IntrinID,
UNSIGNED, Mod, true /* Dst */);
};

auto CreateLscTypedLoadQuad =
[&](VISA_PredOpnd *Pred, VISA_Exec_Size ExecSize,
VISA_EMask_Ctrl ExecMask, LSC_CACHE_OPTS CacheOpts,
LSC_DATA_CHMASK ChMask, VISA_VectorOpnd *Surface, VISA_RawOpnd *Dst,
VISA_RawOpnd *AddrsU, VISA_RawOpnd *AddrsV, VISA_RawOpnd *AddrsR,
VISA_RawOpnd *AddrsLOD) {
LLVM_DEBUG(dbgs() << "CreateLscTypedLoadQuad:\n");
LLVM_DEBUG(CI->dump());
LLVM_DEBUG(dbgs() << "\n");
LSC_DATA_SHAPE Shape = {LSC_DATA_SIZE_32b, LSC_DATA_ORDER_NONTRANSPOSE};
Shape.chmask = ChMask;
CISA_CALL(Kernel->AppendVISALscTypedLoad(
LSC_OP::LSC_LOAD_QUAD, Pred, ExecSize, ExecMask, CacheOpts,
LSC_ADDR_TYPE_BTI, LSC_ADDR_SIZE_32b, Shape, Surface, 0, Dst,
AddrsU, 0, AddrsV, 0, AddrsR, 0, AddrsLOD));
};
auto CreateLscTypedStoreQuad =
[&](VISA_PredOpnd *Pred, VISA_Exec_Size ExecSize,
VISA_EMask_Ctrl ExecMask, LSC_CACHE_OPTS CacheOpts,
LSC_DATA_CHMASK ChMask, VISA_VectorOpnd *Surface,
VISA_RawOpnd *AddrsU, VISA_RawOpnd *AddrsV, VISA_RawOpnd *AddrsR,
VISA_RawOpnd *AddrsLOD, VISA_RawOpnd *Data) {
LLVM_DEBUG(dbgs() << "CreateLscTypedStoreQuad:\n");
LLVM_DEBUG(CI->dump());
LLVM_DEBUG(dbgs() << "\n");
LSC_DATA_SHAPE Shape = {LSC_DATA_SIZE_32b, LSC_DATA_ORDER_NONTRANSPOSE};
Shape.chmask = ChMask;
CISA_CALL(Kernel->AppendVISALscTypedStore(
LSC_OP::LSC_STORE_QUAD, Pred, ExecSize, ExecMask, CacheOpts,
LSC_ADDR_TYPE_BTI, LSC_ADDR_SIZE_32b, Shape, Surface, 0,
AddrsU, 0, AddrsV, 0, AddrsR, 0, AddrsLOD, Data));
};

// Appends a typed LSC 2D block load or store to the kernel being built.
//
// SubOpcode must be LSC_LOAD_BLOCK2D or LSC_STORE_BLOCK2D; any other value is
// reported through vc::fatal. DataShape is taken by value because its width
// is rescaled below before being handed to the vISA builder. Dst is the
// destination operand and Src the source operand passed to the builder;
// XOff/YOff are the block coordinates.
auto CreateLscTyped2D = [&](LSC_OP SubOpcode, LSC_CACHE_OPTS CacheOpts,
                            LSC_ADDR_TYPE AddrType, VISA_VectorOpnd *Surface,
                            LSC_DATA_SHAPE_TYPED_BLOCK2D DataShape,
                            VISA_RawOpnd *Dst, VISA_RawOpnd *Src,
                            VISA_VectorOpnd *XOff, VISA_VectorOpnd *YOff) {
  LLVM_DEBUG(dbgs() << "CreateLscTyped2D:\n");
  LLVM_DEBUG(CI->dump());
  LLVM_DEBUG(dbgs() << "\n");

  // Work around a VISA spec peculiarity: for typed messages the width is
  // given in bytes, not in elements.
  //
  // The vector type that determines the element size is the call's result
  // type for loads and the type of the data operand for stores. Initialized
  // to nullptr so the pointer never holds an indeterminate value.
  VectorType *VT = nullptr;
  constexpr int SrcOperandNum = 7; // to be in sync with json
  switch (SubOpcode) {
  case LSC_LOAD_BLOCK2D:
    VT = cast<VectorType>(CI->getType());
    break;
  case LSC_STORE_BLOCK2D:
    VT = cast<VectorType>(CI->getArgOperand(SrcOperandNum)->getType());
    break;
  default:
    vc::fatal(getContext(), "GenXCisaBuilder",
              "Unsupported typed 2D operation", CI);
  }

  auto *ElementType = VT->getElementType();
  unsigned EltSize = DL.getTypeSizeInBits(ElementType) / genx::ByteBits;

  // Convert the element-based width into a byte-based width.
  LLVM_DEBUG(dbgs() << "Multiplying by: " << EltSize << "\n");
  DataShape.width *= EltSize;

  CISA_CALL(Kernel->AppendVISALscTypedBlock2DInst(
      SubOpcode, CacheOpts, AddrType, DataShape, Surface, 0, Dst, XOff, YOff,
      0, 0, Src));
};

auto CheckLscOp = [&](LSC_SFID LscSfid, LSC_ADDR_TYPE AddressType,
LSC_ADDR_SIZE AddressSize, LSC_DATA_SIZE ElementSize) {
Expand Down
74 changes: 74 additions & 0 deletions IGC/VectorCompiler/lib/GenXCodeGen/GenXLegacyToLscTranslator.cpp
Expand Up @@ -50,6 +50,7 @@ class GenXLegacyToLscTranslator
Value *translateSVMGatherScatter(CallInst &CI) const;
Value *translateQuadGatherScatter(CallInst &CI) const;
Value *translateAtomic(CallInst &CI) const;
Value *translateMediaLoadStore(CallInst &CI) const;

const GenXSubtarget *ST = nullptr;
};
Expand Down Expand Up @@ -212,6 +213,10 @@ void GenXLegacyToLscTranslator::visitCallInst(CallInst &CI) {
case GenXIntrinsic::genx_svm_block_st:
NewCI = translateOWordLoadStore(CI);
break;
case GenXIntrinsic::genx_media_ld:
case GenXIntrinsic::genx_media_st:
NewCI = translateMediaLoadStore(CI);
break;
}

if (!NewCI) {
Expand Down Expand Up @@ -791,6 +796,75 @@ Value *GenXLegacyToLscTranslator::translateAtomic(CallInst &CI) const {
return I;
}

/// Translates a legacy media block load/store (genx_media_ld / genx_media_st)
/// into the typed LSC 2D block intrinsic (genx_lsc_load2d_typed_bti /
/// genx_lsc_store2d_typed_bti).
///
/// Operands read from \p CI: 0 - modifier (constant), 1 - BTI, 2 - plane
/// (constant), 3 - block width (constant), 4 - X address, 5 - Y address,
/// 6 - data to write (store only).
///
/// \returns the newly created call, or nullptr when the intrinsic uses a
/// non-zero modifier or a non-zero plane, which this translation does not
/// support.
Value *GenXLegacyToLscTranslator::translateMediaLoadStore(CallInst &CI) const {
  LLVM_DEBUG(dbgs() << "Translate intrinsic: " << CI);
  IRBuilder<> Builder(&CI);
  auto IID = vc::getAnyIntrinsicID(&CI);

  IGC_ASSERT(IID == GenXIntrinsic::genx_media_ld ||
             IID == GenXIntrinsic::genx_media_st);
  const bool IsLoad = IID == GenXIntrinsic::genx_media_ld;
  auto NewIID = IsLoad ? GenXIntrinsic::genx_lsc_load2d_typed_bti
                       : GenXIntrinsic::genx_lsc_store2d_typed_bti;

  auto *Modifier = cast<ConstantInt>(CI.getArgOperand(0));
  auto *BTI = CI.getArgOperand(1);
  auto *Plane = cast<ConstantInt>(CI.getArgOperand(2));
  auto *BlockWidth = cast<ConstantInt>(CI.getArgOperand(3));
  auto *AddrX = CI.getArgOperand(4);
  auto *AddrY = CI.getArgOperand(5);
  Value *Data = nullptr;
  IGCLLVM::FixedVectorType *VTy = nullptr;

  // The block's vector type is the result type for a load and the type of
  // the data operand for a store.
  if (IsLoad) {
    VTy = cast<IGCLLVM::FixedVectorType>(CI.getType());
  } else {
    Data = CI.getArgOperand(6);
    VTy = cast<IGCLLVM::FixedVectorType>(Data->getType());
  }

  if (Modifier->getZExtValue() != 0) {
    LLVM_DEBUG(dbgs() << "Modifiers are not supported for media block "
                         "intrinsic translations: "
                      << CI);
    return nullptr;
  }
  if (Plane->getZExtValue() != 0) {
    LLVM_DEBUG(dbgs() << "Non-zero plane is not supported for media block "
                         "intrinsic translations: "
                      << CI);
    return nullptr;
  }

  auto *ETy = VTy->getElementType();
  unsigned ESize = ETy->getScalarSizeInBits() / ByteBits;
  auto DataSize = ESize * VTy->getNumElements();

  // Validate the block width before anything is derived from it, so that an
  // out-of-range value is reported by the assertion rather than first being
  // fed to roundedVal() or used as a divisor.
  // NOTE(review): the width appears to be expressed in bytes, since it is
  // divided by the element size before being passed on -- confirm against the
  // media block intrinsic definition.
  unsigned Width = BlockWidth->getZExtValue();
  IGC_ASSERT(Width > 0 && Width <= 64);
  IGC_ASSERT(ESize > 0 && Width % ESize == 0);

  // Rows are laid out with the width rounded by roundedVal(Width, 4u); the
  // total data size must be a whole number of such rows.
  unsigned RoundedWidth = roundedVal(Width, 4u);
  IGC_ASSERT(DataSize % RoundedWidth == 0);
  unsigned Height = DataSize / RoundedWidth;

  SmallVector<Value *, 8> Args = {
      Builder.getInt8(0), // L1 cache control (default)
      Builder.getInt8(0), // L3 cache control (default)
      BTI,
      Builder.getInt32(Height),
      Builder.getInt32(Width / ESize), // width in elements
      AddrX,
      AddrY,
  };
  if (!IsLoad)
    Args.push_back(Data);

  auto *Func = GenXIntrinsic::getGenXDeclaration(CI.getModule(), NewIID, {VTy});
  auto *I = Builder.CreateCall(Func, Args);
  LLVM_DEBUG(dbgs() << "New intrinsic generated: " << *I);
  return I;
}

bool GenXLegacyToLscTranslator::isLocal(Value *BTI) const {
if (auto *C = dyn_cast<ConstantInt>(BTI))
return C->getZExtValue() == visa::ReservedSurfaceIndex::RSI_Slm;
Expand Down

0 comments on commit e3333c9

Please sign in to comment.