Fix vectorization with inner broadcast axes (pytorch#1022)
Vectorization was disabled when broadcast inner axes exist.
Fixes pytorch#1021

Patched to address a CI failure.

Co-authored-by: jjsjann123 <alex.jann2012@gmail.com>
naoyam and jjsjann123 committed Jul 27, 2021
1 parent 9d26419 commit 5bbddfd
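
Why the bug happened, as the codegen.cpp hunk below suggests: CudaKernelGenerator::visit handled broadcast and vectorized for-loops in a single branch, so a broadcast loop nested inside a vectorized loop reassigned vectorize_scope_ to its own vectorize() value — false for a broadcast domain — and the body was emitted unvectorized. A minimal standalone sketch of that state-clobbering pattern (Loop, Gen, and emit are hypothetical stand-ins, not the real kir:: API):

    #include <iostream>
    #include <vector>

    // Hypothetical stand-ins for kir::ForLoop and CudaKernelGenerator.
    struct Loop {
      bool broadcast = false;
      bool vectorize = false;
      std::vector<const Loop*> body;
    };

    struct Gen {
      bool vectorize_scope_ = false;

      // Pre-fix dispatch: broadcast and vectorized loops share one branch.
      void visit(const Loop* l) {
        if (l->broadcast || l->vectorize) {
          vectorize_scope_ = l->vectorize; // broadcast loop resets this to false
          emit(l);
          vectorize_scope_ = false;        // and clobbers the outer loop's state
          return;
        }
        emit(l);
      }

      void emit(const Loop* l) {
        if (l->body.empty()) { // innermost statement
          std::cout << (vectorize_scope_ ? "vectorized access\n"
                                         : "scalar access\n");
        }
        for (const Loop* c : l->body) {
          visit(c);
        }
      }
    };

    int main() {
      Loop stmt;                        // innermost statement
      Loop bcast{true, false, {&stmt}}; // inner broadcast axis
      Loop vec{false, true, {&bcast}};  // outer vectorized axis
      Gen g;
      g.visit(&vec); // prints "scalar access": vectorization was lost
      return 0;
    }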
Showing 6 changed files with 40 additions and 50 deletions.
48 changes: 0 additions & 48 deletions .github/workflows/clang_format.yml

This file was deleted.

2 changes: 1 addition & 1 deletion .github/workflows/lint.yml
@@ -313,7 +313,7 @@ jobs:
       fi
 
   clang-tidy:
-    runs-on: linux.2xlarge
+    runs-on: ubuntu-18.04 # linux.2xlarge doesn't run on our repo CI?
     container:
       # ubuntu20.04-cuda11.2-py3.8-tidy11
       image: ghcr.io/pytorch/cilint-clang-tidy:d8f0c777964d0dd8a147360de80aed1a13eb613a
1 change: 1 addition & 0 deletions .github/workflows/pytorch-linux-xenial-py3.6-gcc5.4.yml

(Generated file; diff not rendered.)

1 change: 1 addition & 0 deletions .github/workflows/pytorch-win-vs2019-cpu-py3.yml

(Generated file; diff not rendered.)

33 changes: 33 additions & 0 deletions test/cpp/jit/test_gpu.cpp
@@ -15579,6 +15579,39 @@ TEST(NVFuserTest, FusionIssue1016_CUDA) {
   testValidate(&fusion, outputs, {t0}, {ref}, __LINE__, __FILE__);
 }
 
+// Reproducer of #1021
+TEST(NVFuserTest, FusionIssue1021_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+  auto tv1 = add(tv0, new Double(1));
+  auto tv2 = broadcast(tv1, {false, true});
+  fusion.addOutput(tv2);
+
+  auto tv3 = tv2->cache_before();
+
+  tv2->split(0, 2);
+
+  tv1->computeAt(tv2, 1);
+
+  tv2->axis(0)->parallelize(ParallelType::TIDx);
+  tv2->axis(1)->parallelize(ParallelType::Vectorize);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({10}, options);
+  std::vector<IValue> inputs = {t0};
+  auto outputs = fe.runFusion(inputs);
+
+  auto ref = (t0 + 1).unsqueeze(-1);
+
+  testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
+}
+
 } // namespace jit
 } // namespace torch
 #endif // #if defined(USE_CUDA)
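
A rough trace of the schedule the test builds, to show where the vectorized/broadcast nesting arises (my reading of the scheduling calls; the axis layouts are illustrative, not taken from the commit):

    // tv2 = broadcast(tv1, {false, true})  -> tv2 domain: [I0, B1]
    // tv2->split(0, 2)                     -> [I0/2, 2, B1]
    // tv1->computeAt(tv2, 1)               -> tv1 produced inside tv2's outer loop
    // tv2->axis(0)->parallelize(TIDx)      -> I0/2 bound to threadIdx.x
    // tv2->axis(1)->parallelize(Vectorize) -> the inner split factor (2) is
    //                                         vectorized, leaving broadcast B1
    //                                         as an inner axis inside the
    //                                         vectorized loop -- the case that
    //                                         previously failed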
5 changes: 4 additions & 1 deletion torch/csrc/jit/codegen/cuda/codegen.cpp
@@ -921,7 +921,10 @@ class CudaKernelGenerator : private kir::IrVisitor {
 
   void visit(const kir::ForLoop* node) final {
     // TODO(kir): handle this during lowering
-    if (node->iter_domain()->isBroadcast() || node->vectorize()) {
+    if (node->iter_domain()->isBroadcast()) {
+      handleScope(node->body());
+      return;
+    } else if (node->vectorize()) {
       vectorize_scope_ = node->vectorize();
       handleScope(node->body());
       vectorize_scope_ = false;
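
The shape of the fix is worth noting: broadcast loops are now handled first and made fully transparent — they emit their body and return without touching vectorize_scope_ — so the state set by an enclosing vectorized loop survives across inner broadcast axes, avoiding the failure mode sketched above.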
