Skip to content

Commit

Permalink
Fold standalone linalg.fill ops into flow.tensor.splat ops
Browse files Browse the repository at this point in the history
This allows us to use DMA instead of kernels for pure data
fills. This is another step towards better performance: it
further decreases the number of dispatches for MobileNetV2
from 94 to 76, and reduces the latency by 2ms on a Galaxy
S20 (Mali G77).
  • Loading branch information
antiagainst committed Apr 25, 2021
1 parent c49dd96 commit 0cc653f
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 15 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,21 @@ namespace Flow {
static unsigned kNumMaxParallelDims = 3;

namespace {

/// Collects SSA values for each dynamic dimension of `value`.
///
/// `value` must have a shaped (ranked) type; for every dimension whose static
/// extent is unknown (ShapedType::kDynamicSize), a `memref.dim` op querying
/// that dimension is created (or folded) at `loc` and appended to the result.
/// Static dimensions contribute nothing, so the returned vector's size equals
/// the number of dynamic dimensions, in dimension order.
SmallVector<Value, 4> getDynamicDims(OpBuilder &builder, Location loc,
                                     Value value) {
  auto shapedType = value.getType().cast<ShapedType>();
  ArrayRef<int64_t> shape = shapedType.getShape();
  SmallVector<Value, 4> dynamicDims;
  for (int64_t dim = 0, rank = shape.size(); dim < rank; ++dim) {
    if (shape[dim] != ShapedType::kDynamicSize) continue;
    dynamicDims.push_back(builder.createOrFold<memref::DimOp>(loc, value, dim));
  }
  return dynamicDims;
}

/// PatternRewriter that allows replacing only a subset of uses.
/// Since this only adds a method, it can just be static_cast'ed to when
/// applying a rewrite.
Expand Down Expand Up @@ -866,6 +881,18 @@ struct MakeDispatchWorkgroupsOp : public RewritePattern {
return failure();
}

// If this is a standalone fill op, we don't need to create a dispatch
// region for it; just use flow.tensor.splat so we can leverage DMA
// functionalities.
Location loc = op->getLoc();
if (auto fillOp = dyn_cast<linalg::FillOp>(op)) {
SmallVector<Value, 4> dynamicDims =
getDynamicDims(rewriter, loc, fillOp.output());
rewriter.replaceOpWithNewOp<TensorSplatOp>(op, fillOp.output().getType(),
fillOp.value(), dynamicDims);
return success();
}

// The workgroup count is based on the result shape.
if (op->getNumResults() != 1) return failure();
Optional<SmallVector<Value>> resultShapeOpt = getResultShape(rewriter, op);
Expand All @@ -877,7 +904,6 @@ struct MakeDispatchWorkgroupsOp : public RewritePattern {
// the flow has three elements of workload size (x, y, z) by linearizing the
// workloads for all higher dimensions greater than or equal to
// kNumMaxParallelDims.
Location loc = op->getLoc();
SmallVector<Value, 4> count(resultShape.begin(), resultShape.end());
if (count.size() > kNumMaxParallelDims) {
unsigned numSymbols = 0;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -347,18 +347,12 @@ func @fuse_tensor_update_with_fill(%arg0: tensor<?x?xf32>, %arg1: tensor<f32>, %
// CHECK-SAME: %[[ARG5:[a-zA-Z0-9]+]]: index
// CHECK-DAG: %[[C0:.+]] = constant 0 : index
// CHECK-DAG: %[[C1:.+]] = constant 1 : index
// CHECK: %[[VAL:.+]] = tensor.extract %[[ARG1]][]
// CHECK-DAG: %[[D0:.+]] = memref.dim %[[ARG0]], %[[C0]]
// CHECK-DAG: %[[D1:.+]] = memref.dim %[[ARG0]], %[[C1]]
// CHECK-DAG: %[[RD0:.+]] = affine.apply #[[MAP]]()[%[[ARG2]], %[[ARG4]], %[[D0]]]
// CHECK-DAG: %[[RD1:.+]] = affine.apply #[[MAP]]()[%[[ARG3]], %[[ARG5]], %[[D1]]]
// CHECK: %[[RESULT:.+]] = flow.dispatch.workgroups
// CHECK-SAME: [%[[RD1]], %[[RD0]], %[[C1]]]
// CHECK-SAME: (%[[ARG1]], %[[RD0]], %[[RD1]])
// CHECK-DAG: %[[VAL:.+]] = tensor.extract
// CHECK-DAG: %[[INIT:.+]] = linalg.init_tensor
// CHECK: %[[RETURN:.+]] = linalg.fill(%[[INIT]], %[[VAL]])
// CHECK: flow.dispatch.tensor.store %[[RETURN]], {{.*}}
// CHECK-NEXT: flow.return
// CHECK: %[[RESULT:.+]] = flow.tensor.splat %[[VAL]] : tensor<?x?xf32>{%[[RD0]], %[[RD1]]}
// CHECK: flow.tensor.update %[[ARG0]], %[[RESULT]]

// -----
Expand Down Expand Up @@ -493,12 +487,7 @@ func @subtensor_insert(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x225x225x3xf32
// CHECK: func @subtensor_insert
// CHECK-SAME: (%[[INPUT:.+]]: tensor<1x224x224x3xf32>)
//
// CHECK: %[[FILL:.+]] = flow.dispatch.workgroups[{{.+}}]() : () -> tensor<1x225x225x3xf32> =
// CHECK-NEXT: (%[[OUTPUT:.+]]: !flow.dispatch.tensor<writeonly:1x225x225x3xf32>) {
// CHECK: linalg.init_tensor
// CHECK-NEXT: %[[TENSOR:.+]] = linalg.fill
// CHECK-NEXT: flow.dispatch.tensor.store %[[TENSOR]], %[[OUTPUT]], {{.*}}
// CHECK-NEXT: flow.return
// CHECK: %[[FILL:.+]] = constant dense<0.000000e+00> : tensor<1x225x225x3xf32>
//
// CHECK: %[[PAD:.+]] = flow.dispatch.workgroups[{{.+}}](%[[INPUT]], %[[FILL]]) : (tensor<1x224x224x3xf32>, tensor<1x225x225x3xf32>) -> %[[FILL]] =
// CHECK-NEXT: (%[[SRC:.+]]: !flow.dispatch.tensor<readonly:1x224x224x3xf32>, %[[DST:.+]]: !flow.dispatch.tensor<readwrite:1x225x225x3xf32>) {
Expand Down

0 comments on commit 0cc653f

Please sign in to comment.