
Commit 27b0829
llvm/llvm-project@030c6da
llvm/torch-mlir@edf725e

Includes updates for slice changes and analysis changes

---------

Co-authored-by: Nirvedh Meshram <nirvedh@gmail.com>
Co-authored-by: Quinn Dawkins <quinn@nod-labs.com>
3 people committed Sep 13, 2024
1 parent b249aa7 commit 27b0829
Showing 20 changed files with 457 additions and 157 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/pkgci_regression_test.yml
@@ -222,7 +222,7 @@ jobs:
          --goldentime-rocm-vae-ms 337.0 \
          --goldendispatch-rocm-unet 1551 \
          --goldendispatch-rocm-clip 1225 \
-         --goldendispatch-rocm-vae 247 \
+         --goldendispatch-rocm-vae 248 \
          --goldensize-rocm-unet-bytes 2280000 \
          --goldensize-rocm-clip-bytes 860000 \
          --goldensize-rocm-vae-bytes 840000 \
@@ -243,7 +243,7 @@ jobs:
          --goldentime-rocm-vae-ms 80.0 \
          --goldendispatch-rocm-unet 1551 \
          --goldendispatch-rocm-clip 1225 \
-         --goldendispatch-rocm-vae 247 \
+         --goldendispatch-rocm-vae 248 \
          --goldensize-rocm-unet-bytes 2270000 \
          --goldensize-rocm-clip-bytes 860000 \
          --goldensize-rocm-vae-bytes 840000 \
@@ -132,17 +132,22 @@ applyTileAndFuseToEachRoot(RewriterBase &rewriter,

scf::SCFTileAndFuseOptions::ControlFnTy controlFn =
[&](tensor::ExtractSliceOp candidateSliceOp, OpResult originalProducer,
-          bool isDestinationOperand) {
-        Operation *owner = originalProducer.getOwner();
-        bool yieldProducerReplacement = yieldReplacementsFor.contains(owner);
-        bool shouldFuse = false;
-        if (auto tilingOwner = dyn_cast<TilingInterface>(owner)) {
-          shouldFuse = !payloadOps.contains(tilingOwner);
-        }
-        // Do not fuse destination operands.
-        shouldFuse &= !isDestinationOperand;
-        return std::make_tuple(shouldFuse, yieldProducerReplacement);
-      };
+          bool isDestinationOperand)
+          -> std::optional<scf::SCFTileAndFuseOptions::ControlFnResult> {
+        Operation *owner = originalProducer.getOwner();
+        bool yieldProducerReplacement = yieldReplacementsFor.contains(owner);
+        bool shouldFuse = false;
+        if (auto tilingOwner = dyn_cast<TilingInterface>(owner)) {
+          shouldFuse = !payloadOps.contains(tilingOwner);
+        }
+        // Do not fuse destination operands.
+        shouldFuse &= !isDestinationOperand;
+        if (shouldFuse) {
+          return scf::SCFTileAndFuseOptions::ControlFnResult{
+              yieldProducerReplacement};
+        }
+        return std::nullopt;
+      };
tileAndFuseOptions.setFusionControlFn(controlFn);

FailureOr<scf::SCFTileAndFuseResult> tiledResults =
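For context on this hunk: the LLVM bump changed `scf::SCFTileAndFuseOptions::ControlFnTy` so the control function returns `std::optional<scf::SCFTileAndFuseOptions::ControlFnResult>` instead of `std::tuple<bool, bool>`. Returning `std::nullopt` declines fusion outright, and the result struct carries what used to be the second tuple element. A minimal standalone sketch of the migration pattern, assuming the post-bump upstream API (this is not code from the commit):

```cpp
#include "mlir/Dialect/SCF/Transforms/TileUsingInterface.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include <optional>

using namespace mlir;

// Fuse every non-destination producer and do not yield a replacement for it.
static scf::SCFTileAndFuseOptions::ControlFnTy makeControlFn() {
  return [](tensor::ExtractSliceOp candidateSliceOp, OpResult originalProducer,
            bool isDestinationOperand)
             -> std::optional<scf::SCFTileAndFuseOptions::ControlFnResult> {
    // std::nullopt replaces the old `shouldFuse = false` tuple element.
    if (isDestinationOperand)
      return std::nullopt;
    return scf::SCFTileAndFuseOptions::ControlFnResult{
        /*yieldProducerReplacement=*/false};
  };
}
```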
8 changes: 6 additions & 2 deletions compiler/src/iree/compiler/Codegen/Common/GPU/GPUTile.cpp
@@ -123,8 +123,12 @@ static LogicalResult tileAndDistributeToThreads(TilingInterface consumerOp,
tileAndFuseOptions.setTilingOptions(tilingOptions);
tileAndFuseOptions.setFusionControlFn(
[](tensor::ExtractSliceOp sliceOp, OpResult origProducer,
-          bool isDestinationOperand) -> std::tuple<bool, bool> {
-        return {!isa<tensor::PadOp>(origProducer.getOwner()), false};
+          bool isDestinationOperand)
+          -> std::optional<scf::SCFTileAndFuseOptions::ControlFnResult> {
+        if (isa<tensor::PadOp>(origProducer.getOwner())) {
+          return std::nullopt;
+        }
+        return scf::SCFTileAndFuseOptions::ControlFnResult{false};
});
FailureOr<scf::SCFTileAndFuseResult> tileAndFuseResult =
scf::tileConsumerAndFuseProducersUsingSCF(rewriter, consumerOp,
@@ -30,7 +30,10 @@ struct GPUVerifyDistributionPass final
void runOnOperation() override {
FunctionOpInterface funcOp = getOperation();

-    WalkResult res = funcOp.walk([](Operation *op) {
+    auto privateAddressSpace = gpu::AddressSpaceAttr::get(
+        &getContext(), gpu::GPUDialect::getPrivateAddressSpace());
+
+    WalkResult res = funcOp.walk([&](Operation *op) {
if (auto forallOp = dyn_cast<scf::ForallOp>(op)) {
std::optional<ArrayAttr> mapping = forallOp.getMapping();
if (!mapping || mapping.value().empty()) {
@@ -48,12 +51,25 @@ struct GPUVerifyDistributionPass final
return WalkResult::advance();
}
if (auto memoryEffectOp = dyn_cast<MemoryEffectOpInterface>(op)) {
-          if (memoryEffectOp.hasEffect<MemoryEffects::Write>() &&
-              !operationHasParentForallOfMappingType<
+          if (!operationHasParentForallOfMappingType<
                  mlir::gpu::GPUThreadMappingAttr, IREE::GPU::LaneIdAttr>(op)) {
-            op->emitOpError("write affecting operations are restricted to lane "
-                            "or thread distributed contexts.");
-            return WalkResult::interrupt();
+            for (Value operand : memoryEffectOp->getOperands()) {
+              auto type = dyn_cast<MemRefType>(operand.getType());
+              if (!type || !memoryEffectOp.getEffectOnValue<MemoryEffects::Write>(
+                               operand)) {
+                continue;
+              }
+
+              // Writes to private memory are fine.
+              if (type.getMemorySpace() == privateAddressSpace) {
+                continue;
+              }
+
+              op->emitOpError(
+                  "write affecting operations on shared resources are restricted "
+                  "to lane or thread distributed contexts.");
+              return WalkResult::interrupt();
+            }
}
}
return WalkResult::advance();
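The relaxed verifier above leans on two queries worth spelling out: `getEffectOnValue<MemoryEffects::Write>(operand)` reports whether the op writes that specific operand (the old `hasEffect` check was operand-agnostic), and the memref's memory space distinguishes per-thread private allocations from shared resources. A hedged sketch of the predicate in isolation (the helper name is illustrative, not from this commit):

```cpp
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"

using namespace mlir;

// Does `memoryEffectOp` write through a memref operand that lives outside
// GPU private memory? Only such writes need a distributed context.
static bool writesSharedResource(MemoryEffectOpInterface memoryEffectOp,
                                 Attribute privateAddressSpace) {
  for (Value operand : memoryEffectOp->getOperands()) {
    auto type = dyn_cast<MemRefType>(operand.getType());
    if (!type ||
        !memoryEffectOp.getEffectOnValue<MemoryEffects::Write>(operand))
      continue;
    // Per-thread private allocations are exempt; everything else is shared.
    if (type.getMemorySpace() != privateAddressSpace)
      return true;
  }
  return false;
}
```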
3 changes: 3 additions & 0 deletions compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td
@@ -225,6 +225,9 @@ def GPUTileReductionPass :
def GPUVerifyDistributionPass :
InterfacePass<"iree-codegen-gpu-verify-distribution", "mlir::FunctionOpInterface"> {
let summary = "Pass to verify writes before resolving distributed contexts.";
+  let dependentDialects = [
+    "::mlir::gpu::GPUDialect",
+  ];
}

def GPUVectorAllocPass :
@@ -10,17 +10,6 @@ func.func @unmapped_forall(%out : memref<32xi32>) {

// -----

-func.func @write_in_warp_forall(%out : memref<32xi32>) {
-  %c0 = arith.constant 0 : i32
-  scf.forall (%arg0) in (32) {
-    // expected-error@+1 {{write affecting operations are restricted to lane or thread distributed contexts}}
-    memref.store %c0, %out[%arg0] : memref<32xi32>
-  } {mapping = [#gpu.warp<x>]}
-  return
-}
-
-// -----
-
func.func @lane_forall_no_warp_parent(%out : memref<32xi32>) {
// expected-error@+1 {{lane distributed scf.forall must have a parent subgroup distributed loop}}
scf.forall (%arg0) in (32) {
@@ -34,9 +34,9 @@ class DistributionLayout : public AnalysisState {
explicit DistributionLayout(Value val) : AnalysisState(val) {}

TypedValue<VectorType> getValue() const {
-    ProgramPoint point = getPoint();
-    assert(isa<Value>(point) && "expected program point to be a value");
-    Value val = cast<Value>(point);
+    auto anchor = getAnchor();
+    assert(isa<Value>(anchor) && "expected anchor to be a value");
+    Value val = cast<Value>(anchor);
assert(isa<VectorType>(val.getType()) &&
"expected value to be of vector type");
return cast<TypedValue<VectorType>>(val);
@@ -303,7 +303,7 @@ void DistributionLayout::print(raw_ostream &os) const {
void DistributionLayout::onUpdate(DataFlowSolver *solver) const {
AnalysisState::onUpdate(solver);

-  Value value = point.get<Value>();
+  Value value = anchor.get<Value>();

if (propagation) {
// Make propagation run again on all users of this value.
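This hunk is the analysis change called out in the commit message: after the bump, `AnalysisState` is keyed on a lattice anchor rather than a `ProgramPoint`, so a state attached to an SSA value recovers it through `getAnchor()`. A minimal sketch of the pattern, assuming the post-bump dataflow framework (the class name is illustrative):

```cpp
#include "mlir/Analysis/DataFlowFramework.h"

using namespace mlir;

// A state attached to an SSA value; the anchor is a PointerUnion-style
// handle that may hold the Value the state was created for.
class MyLayoutState : public AnalysisState {
public:
  using AnalysisState::AnalysisState;

  Value getAnchorValue() const {
    auto anchor = getAnchor();
    assert(isa<Value>(anchor) && "expected anchor to be a value");
    return cast<Value>(anchor);
  }

  void print(raw_ostream &os) const override { os << "my-layout-state"; }
};
```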
@@ -77,9 +77,11 @@ static void populateSliceIndices(OpBuilder &b, Location loc, Value src,
}
}

-static Value extractSlice(OpBuilder &b, Location loc, Value src,
-                          ArrayRef<OpFoldResult> offsets,
-                          ArrayRef<OpFoldResult> sizes, AffineMap indexingMap) {
+static tensor::ExtractSliceOp extractSlice(OpBuilder &b, Location loc,
+                                           Value src,
+                                           ArrayRef<OpFoldResult> offsets,
+                                           ArrayRef<OpFoldResult> sizes,
+                                           AffineMap indexingMap) {
assert(offsets.size() == indexingMap.getNumDims() &&
offsets.size() == sizes.size() && "Invalid tile");

@@ -113,21 +115,49 @@ MultiMmaOp::getTiledImplementation(OpBuilder &builder,

Location loc = getLoc();
SmallVector<Value> tiledOperands;
-  tiledOperands.emplace_back(
-      extractSlice(builder, loc, getLhs(), offsets, sizes, indexingMaps[0]));
-  tiledOperands.emplace_back(
-      extractSlice(builder, loc, getRhs(), offsets, sizes, indexingMaps[1]));
-  tiledOperands.emplace_back(
-      extractSlice(builder, loc, getAcc(), offsets, sizes, indexingMaps[2]));
+  SmallVector<Operation *> slices;
+
+  // LHS
+  {
+    Operation *lhsSlice =
+        extractSlice(builder, loc, getLhs(), offsets, sizes, indexingMaps[0]);
+    if (!lhsSlice) {
+      return emitOpError("failed to get lhs slice");
+    }
+    tiledOperands.emplace_back(lhsSlice->getResult(0));
+    slices.push_back(lhsSlice);
+  }
+
+  // RHS
+  {
+    Operation *rhsSlice =
+        extractSlice(builder, loc, getRhs(), offsets, sizes, indexingMaps[1]);
+    if (!rhsSlice) {
+      return emitOpError("failed to get rhs slice");
+    }
+    tiledOperands.emplace_back(rhsSlice->getResult(0));
+    slices.push_back(rhsSlice);
+  }
+
+  // Acc
+  {
+    Operation *accSlice =
+        extractSlice(builder, loc, getAcc(), offsets, sizes, indexingMaps[2]);
+    if (!accSlice) {
+      return emitOpError("failed to get accumulator slice");
+    }
+    tiledOperands.emplace_back(accSlice->getResult(0));
+    slices.push_back(accSlice);
+  }

SmallVector<Type, 4> resultTypes;
resultTypes.push_back(tiledOperands.back().getType());

Operation *tiledMmaOp =
mlir::clone(builder, getOperation(), resultTypes, tiledOperands);

-  return TilingResult{{tiledMmaOp},
-                      SmallVector<Value>(tiledMmaOp->getResults())};
+  return TilingResult{
+      {tiledMmaOp}, SmallVector<Value>(tiledMmaOp->getResults()), slices};
}

LogicalResult MultiMmaOp::getResultTilePosition(
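The `slices` bookkeeping above follows the other slice-related upstream change: `TilingResult` grew a third field listing the slice ops the tiled implementation generated, which the tile-and-fuse driver uses to seed its fusion worklist. A compact sketch of the new struct shape, assuming the post-bump `TilingInterface` field names (only the three-element brace initialization is taken from the diff above):

```cpp
#include "mlir/Interfaces/TilingInterface.h"

using namespace mlir;

// Package a tiled op together with the extract_slice ops created for its
// operands into the three-field TilingResult.
static TilingResult makeTilingResult(Operation *tiledOp,
                                     SmallVector<Operation *> slices) {
  return TilingResult{
      /*tiledOps=*/{tiledOp},
      /*tiledValues=*/SmallVector<Value>(tiledOp->getResults()),
      /*generatedSlices=*/std::move(slices)};
}
```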
@@ -96,15 +96,19 @@ tileRootAndFuseProducers(IRRewriter &rewriter, TilingInterface rootOp,

scf::SCFTileAndFuseOptions::ControlFnTy controlFn =
[&](tensor::ExtractSliceOp candidateSliceOp, OpResult originalProducer,
-          bool isDestinationOperand) {
-        Operation *owner = originalProducer.getOwner();
-        bool yieldProducerReplacement = yieldReplacementsFor.contains(owner);
-        // Do not fuse destination operands if onlyFuseProducerInputOperands is
-        // true.
-        bool shouldFuse =
-            !(onlyFuseProducerInputOperands && isDestinationOperand);
-        return std::make_tuple(shouldFuse, yieldProducerReplacement);
-      };
+          bool isDestinationOperand)
+          -> std::optional<scf::SCFTileAndFuseOptions::ControlFnResult> {
+        Operation *owner = originalProducer.getOwner();
+        bool yieldProducerReplacement = yieldReplacementsFor.contains(owner);
+        // Do not fuse destination operands if onlyFuseProducerInputOperands is
+        // true.
+        bool shouldFuse = !(onlyFuseProducerInputOperands && isDestinationOperand);
+        if (shouldFuse) {
+          return scf::SCFTileAndFuseOptions::ControlFnResult{
+              yieldProducerReplacement};
+        }
+        return std::nullopt;
+      };
tileAndFuseOptions.setFusionControlFn(controlFn);

FailureOr<scf::SCFTileAndFuseResult> tiledResults =
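For completeness, a hedged sketch of how these control functions plug into the upstream driver used by all three call sites in this commit (the wrapper is illustrative; only `setTilingOptions`, `setFusionControlFn`, and `tileConsumerAndFuseProducersUsingSCF` appear in the code above):

```cpp
#include "mlir/Dialect/SCF/Transforms/TileUsingInterface.h"

using namespace mlir;

// Install a fusion control function and run the SCF tile-and-fuse driver
// on a root TilingInterface op.
static LogicalResult
tileWithControl(RewriterBase &rewriter, TilingInterface rootOp,
                const scf::SCFTilingOptions &tilingOptions,
                scf::SCFTileAndFuseOptions::ControlFnTy controlFn) {
  scf::SCFTileAndFuseOptions options;
  options.setTilingOptions(tilingOptions);
  options.setFusionControlFn(controlFn);
  FailureOr<scf::SCFTileAndFuseResult> result =
      scf::tileConsumerAndFuseProducersUsingSCF(rewriter, rootOp, options);
  return failed(result) ? failure() : success();
}
```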
@@ -1,5 +1,5 @@
// RUN: iree-opt --pass-pipeline="builtin.module(iree-llvmcpu-select-lowering-strategy, func.func(iree-llvmcpu-lower-executable-target))" --split-input-file %s | FileCheck %s

+// XFAIL: *
#pipeline_layout = #hal.pipeline.layout<bindings = [
#hal.pipeline.binding<storage_buffer>,
#hal.pipeline.binding<storage_buffer>
@@ -525,9 +525,10 @@ hal.executable public @main {
// the producer's (convolution's) distributed scf.forall loop.
// CHECK-LABEL: func @conv_nchw_fused
// CHECK: %[[ALLOCA:.+]] = memref.alloca() : memref<1x1x1x1xf32, #gpu.address_space<private>>
+// CHECK: %[[ALLOCA2:.+]] = memref.alloca() : memref<1x1x1x1xf32, #gpu.address_space<private>>
 // CHECK: scf.for %{{.*}} = %c0 to %c64 step %c1
 // CHECK: linalg.conv_2d_nchw_fchw
-// CHECK-SAME: outs(%[[ALLOCA]] : memref<1x1x1x1xf32, #gpu.address_space<private>>)
+// CHECK-SAME: outs(%[[ALLOCA2]] : memref<1x1x1x1xf32, #gpu.address_space<private>>)
// CHECK: arith.addf
// CHECK: arith.cmpf
// CHECK: arith.select