Fix vectorization with inner broadcast axes (pytorch#1022)
Vectorization was disabled when broadcast inner axes exist.
Fixes pytorch#1021

Patched to address a CI failure.

Co-authored-by: jjsjann123 <alex.jann2012@gmail.com>
naoyam and jjsjann123 committed Jul 27, 2021
1 parent 9d26419 commit 5bbddfd
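
Why the bug happened, as the codegen.cpp hunk below suggests: CudaKernelGenerator::visit handled broadcast and vectorized for-loops in a single branch, so a broadcast loop nested inside a vectorized loop reassigned vectorize_scope_ to its own vectorize() value — false for a broadcast domain — and the body was emitted unvectorized. A minimal standalone sketch of that state-clobbering pattern (Loop, Gen, and emit are hypothetical stand-ins, not the real kir:: API):

    #include <iostream>
    #include <vector>

    // Hypothetical stand-ins for kir::ForLoop and CudaKernelGenerator.
    struct Loop {
      bool broadcast = false;
      bool vectorize = false;
      std::vector<const Loop*> body;
    };

    struct Gen {
      bool vectorize_scope_ = false;

      // Pre-fix dispatch: broadcast and vectorized loops share one branch.
      void visit(const Loop* l) {
        if (l->broadcast || l->vectorize) {
          vectorize_scope_ = l->vectorize; // broadcast loop resets this to false
          emit(l);
          vectorize_scope_ = false;        // and clobbers the outer loop's state
          return;
        }
        emit(l);
      }

      void emit(const Loop* l) {
        if (l->body.empty()) { // innermost statement
          std::cout << (vectorize_scope_ ? "vectorized access\n"
                                         : "scalar access\n");
        }
        for (const Loop* c : l->body) {
          visit(c);
        }
      }
    };

    int main() {
      Loop stmt;                        // innermost statement
      Loop bcast{true, false, {&stmt}}; // inner broadcast axis
      Loop vec{false, true, {&bcast}};  // outer vectorized axis
      Gen g;
      g.visit(&vec); // prints "scalar access": vectorization was lost
      return 0;
    }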
Showing 6 changed files with 40 additions and 50 deletions.
48 changes: 0 additions & 48 deletions .github/workflows/clang_format.yml

This file was deleted.

2 changes: 1 addition & 1 deletion .github/workflows/lint.yml
@@ -313,7 +313,7 @@ jobs:
       fi
 
   clang-tidy:
-    runs-on: linux.2xlarge
+    runs-on: ubuntu-18.04 # linux.2xlarge doesn't run on our repo CI?
     container:
       # ubuntu20.04-cuda11.2-py3.8-tidy11
       image: ghcr.io/pytorch/cilint-clang-tidy:d8f0c777964d0dd8a147360de80aed1a13eb613a
1 change: 1 addition & 0 deletions .github/workflows/pytorch-linux-xenial-py3.6-gcc5.4.yml

(Generated file; diff not rendered.)

1 change: 1 addition & 0 deletions .github/workflows/pytorch-win-vs2019-cpu-py3.yml

(Generated file; diff not rendered.)

33 changes: 33 additions & 0 deletions test/cpp/jit/test_gpu.cpp
@@ -15579,6 +15579,39 @@ TEST(NVFuserTest, FusionIssue1016_CUDA) {
   testValidate(&fusion, outputs, {t0}, {ref}, __LINE__, __FILE__);
 }
 
+// Reproducer of #1021
+TEST(NVFuserTest, FusionIssue1021_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+  auto tv1 = add(tv0, new Double(1));
+  auto tv2 = broadcast(tv1, {false, true});
+  fusion.addOutput(tv2);
+
+  auto tv3 = tv2->cache_before();
+
+  tv2->split(0, 2);
+
+  tv1->computeAt(tv2, 1);
+
+  tv2->axis(0)->parallelize(ParallelType::TIDx);
+  tv2->axis(1)->parallelize(ParallelType::Vectorize);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({10}, options);
+  std::vector<IValue> inputs = {t0};
+  auto outputs = fe.runFusion(inputs);
+
+  auto ref = (t0 + 1).unsqueeze(-1);
+
+  testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
+}
+
 } // namespace jit
 } // namespace torch
 #endif // #if defined(USE_CUDA)
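
A rough trace of the schedule the test builds, to show where the vectorized/broadcast nesting arises (my reading of the scheduling calls; the axis layouts are illustrative, not taken from the commit):

    // tv2 = broadcast(tv1, {false, true})  -> tv2 domain: [I0, B1]
    // tv2->split(0, 2)                     -> [I0/2, 2, B1]
    // tv1->computeAt(tv2, 1)               -> tv1 produced inside tv2's outer loop
    // tv2->axis(0)->parallelize(TIDx)      -> I0/2 bound to threadIdx.x
    // tv2->axis(1)->parallelize(Vectorize) -> the inner split factor (2) is
    //                                         vectorized, leaving broadcast B1
    //                                         as an inner axis inside the
    //                                         vectorized loop -- the case that
    //                                         previously failed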
5 changes: 4 additions & 1 deletion torch/csrc/jit/codegen/cuda/codegen.cpp
@@ -921,7 +921,10 @@ class CudaKernelGenerator : private kir::IrVisitor {
 
   void visit(const kir::ForLoop* node) final {
     // TODO(kir): handle this during lowering
-    if (node->iter_domain()->isBroadcast() || node->vectorize()) {
+    if (node->iter_domain()->isBroadcast()) {
+      handleScope(node->body());
+      return;
+    } else if (node->vectorize()) {
       vectorize_scope_ = node->vectorize();
       handleScope(node->body());
       vectorize_scope_ = false;
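
The shape of the fix is worth noting: broadcast loops are now handled first and made fully transparent — they emit their body and return without touching vectorize_scope_ — so the state set by an enclosing vectorized loop survives across inner broadcast axes, avoiding the failure mode sketched above.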
