From 5bbddfd46c0944429cd0df97b6374d7ccbd8b871 Mon Sep 17 00:00:00 2001
From: Naoya Maruyama
Date: Tue, 27 Jul 2021 16:20:11 -0700
Subject: [PATCH] Fix vectorization with inner broadcast axes (#1022)

Vectorization was incorrectly disabled when inner broadcast axes were
present.

Fixes #1021. Patched to address a CI failure.

Co-authored-by: jjsjann123
---
 .github/workflows/clang_format.yml            | 48 ------------------
 .github/workflows/lint.yml                    |  2 +-
 .../pytorch-linux-xenial-py3.6-gcc5.4.yml     |  1 +
 .../workflows/pytorch-win-vs2019-cpu-py3.yml  |  1 +
 test/cpp/jit/test_gpu.cpp                     | 33 +++++++++++++
 torch/csrc/jit/codegen/cuda/codegen.cpp       |  5 +-
 6 files changed, 40 insertions(+), 50 deletions(-)
 delete mode 100644 .github/workflows/clang_format.yml

diff --git a/.github/workflows/clang_format.yml b/.github/workflows/clang_format.yml
deleted file mode 100644
index 33841222495d9..0000000000000
--- a/.github/workflows/clang_format.yml
+++ /dev/null
@@ -1,48 +0,0 @@
-name: clang-format
-
-on:
-  pull_request:
-
-jobs:
-  clang-format:
-    runs-on: ubuntu-18.04
-    steps:
-      - name: Setup Python
-        uses: actions/setup-python@v2
-        with:
-          python-version: 3.x
-          architecture: x64
-      - name: Fetch PyTorch
-        uses: actions/checkout@v2
-        with:
-          fetch-depth: 0 # deep clone, to allow us to use git merge-base
-      - name: Run clang-format
-        env:
-          BASE_SHA: ${{ github.event.pull_request.base.sha }}
-        run: |
-          set -eu
-          # This is necessary to get the same results regardless of whether the
-          # PR was opened directly or from a forked repo. See: `9f890a92` for more info.
-          git remote add upstream https://github.com/csarofeen/pytorch
-          git fetch upstream "$GITHUB_BASE_REF"
-
-          # only run clang-format on allowlisted files
-          echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
-          echo "| clang-format failures found! Run: "
-          echo "| tools/clang_format_ci.sh ${BASE_SHA} "
-          echo "| to fix this error. "
-          echo "| For more info, see: https://github.com/pytorch/pytorch/wiki/clang-format "
-          echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
-
-          tools/clang_format_ci.sh "${BASE_SHA}"
-
-          GIT_DIFF=$(git diff)
-          if [[ -z $GIT_DIFF ]]; then
-            exit 0
-          fi
-          echo "$GIT_DIFF"
-          exit 1
-
-concurrency:
-  group: clang-format-${{ github.event.pull_request.number || github.sha }}
-  cancel-in-progress: true
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 9b8007a240771..eb3b519b683a7 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -313,7 +313,7 @@ jobs:
       fi
 
   clang-tidy:
-    runs-on: linux.2xlarge
+    runs-on: ubuntu-18.04 # linux.2xlarge doesn't run on our repo CI?
     container:
       # ubuntu20.04-cuda11.2-py3.8-tidy11
       image: ghcr.io/pytorch/cilint-clang-tidy:d8f0c777964d0dd8a147360de80aed1a13eb613a
diff --git a/.github/workflows/pytorch-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/pytorch-linux-xenial-py3.6-gcc5.4.yml
index 4cb288530d2b5..63bbed9da5508 100644
--- a/.github/workflows/pytorch-linux-xenial-py3.6-gcc5.4.yml
+++ b/.github/workflows/pytorch-linux-xenial-py3.6-gcc5.4.yml
@@ -5,6 +5,7 @@ name: Linux CI (pytorch-linux-xenial-py3.6-gcc5.4)
 
 on:
   # TODO: Enable pull_request builds when we can verify capacity can be met by auto-scalers
+  pull_request:
   push:
     branches:
       - master
diff --git a/.github/workflows/pytorch-win-vs2019-cpu-py3.yml b/.github/workflows/pytorch-win-vs2019-cpu-py3.yml
index 1fa4851fdf1f0..213f90de74870 100644
--- a/.github/workflows/pytorch-win-vs2019-cpu-py3.yml
+++ b/.github/workflows/pytorch-win-vs2019-cpu-py3.yml
@@ -4,6 +4,7 @@ name: Windows CI (pytorch-win-vs2019-cpu-py3)
 
 on:
+  pull_request:
   push:
     branches:
       - master
diff --git a/test/cpp/jit/test_gpu.cpp b/test/cpp/jit/test_gpu.cpp
index 2dd41df1fff6a..aad7bb5b05d0c 100644
--- a/test/cpp/jit/test_gpu.cpp
+++ b/test/cpp/jit/test_gpu.cpp
@@ -15579,6 +15579,39 @@ TEST(NVFuserTest, FusionIssue1016_CUDA) {
   testValidate(&fusion, outputs, {t0}, {ref}, __LINE__, __FILE__);
 }
 
+// Reproducer of #1021
+TEST(NVFuserTest, FusionIssue1021_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+  auto tv1 = add(tv0, new Double(1));
+  auto tv2 = broadcast(tv1, {false, true});
+  fusion.addOutput(tv2);
+
+  auto tv3 = tv2->cache_before();
+
+  tv2->split(0, 2);
+
+  tv1->computeAt(tv2, 1);
+
+  tv2->axis(0)->parallelize(ParallelType::TIDx);
+  tv2->axis(1)->parallelize(ParallelType::Vectorize);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({10}, options);
+  std::vector<IValue> inputs = {t0};
+  auto outputs = fe.runFusion(inputs);
+
+  auto ref = (t0 + 1).unsqueeze(-1);
+
+  testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
+}
+
 } // namespace jit
 } // namespace torch
 
 #endif // #if defined(USE_CUDA)
diff --git a/torch/csrc/jit/codegen/cuda/codegen.cpp b/torch/csrc/jit/codegen/cuda/codegen.cpp
index 2c2c041f6842e..0cc1986203587 100644
--- a/torch/csrc/jit/codegen/cuda/codegen.cpp
+++ b/torch/csrc/jit/codegen/cuda/codegen.cpp
@@ -921,7 +921,10 @@ class CudaKernelGenerator : private kir::IrVisitor {
 
   void visit(const kir::ForLoop* node) final {
     // TODO(kir): handle this during lowering
-    if (node->iter_domain()->isBroadcast() || node->vectorize()) {
+    if (node->iter_domain()->isBroadcast()) {
+      handleScope(node->body());
+      return;
+    } else if (node->vectorize()) {
       vectorize_scope_ = node->vectorize();
       handleScope(node->body());
       vectorize_scope_ = false;
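
Note on the codegen.cpp change: before this patch, broadcast and vectorized
for-loops shared a single branch in visit(const kir::ForLoop*), so visiting a
broadcast loop nested inside a vectorized loop overwrote vectorize_scope_ with
its own vectorize() value (false), silently turning off vectorized code
generation for the loop body. The patch gives broadcast loops their own branch
that elides the loop without touching the flag. Below is a minimal,
self-contained C++ sketch of that scope-flag pattern; the Loop struct and the
visitBuggy/visitFixed/emitBody functions are hypothetical stand-ins, not the
real kir::ForLoop / CudaKernelGenerator API.

#include <iostream>
#include <vector>

// Hypothetical stand-ins for the kernel IR (not the real nvFuser classes):
// a for-loop is a broadcast (extent-1) loop, a vectorized loop, or a plain
// serial loop; an empty body stands for the innermost expression.
struct Loop {
  bool is_broadcast = false;
  bool is_vectorized = false;
  std::vector<Loop> body;
};

using Visitor = void (*)(const Loop&, bool&);

// Emit the loop body: print the innermost access, or recurse into children.
void emitBody(const Loop& node, bool& scope, Visitor visit) {
  if (node.body.empty()) {
    std::cout << (scope ? "vectorized" : "scalar") << " access\n";
  }
  for (const Loop& child : node.body) {
    visit(child, scope);
  }
}

// Pre-patch logic: broadcast and vectorized loops shared one branch, so a
// broadcast loop nested under a vectorized loop overwrote the flag with its
// own vectorize() == false and disabled vectorization for its entire body.
void visitBuggy(const Loop& node, bool& scope) {
  if (node.is_broadcast || node.is_vectorized) {
    scope = node.is_vectorized; // false for a broadcast axis!
    emitBody(node, scope, visitBuggy);
    scope = false;
    return;
  }
  emitBody(node, scope, visitBuggy);
}

// Post-patch logic: a broadcast loop is elided (its body is emitted in
// place) without touching the vectorization flag at all.
void visitFixed(const Loop& node, bool& scope) {
  if (node.is_broadcast) {
    emitBody(node, scope, visitFixed);
    return;
  } else if (node.is_vectorized) {
    scope = true;
    emitBody(node, scope, visitFixed);
    scope = false;
    return;
  }
  emitBody(node, scope, visitFixed);
}

int main() {
  // A vectorized loop with an inner broadcast axis, as in issue #1021
  // (tv2 = broadcast(tv1, ...) with axis(1) parallelized as Vectorize).
  Loop leaf; // innermost expression
  Loop broadcast;
  broadcast.is_broadcast = true;
  broadcast.body.push_back(leaf);
  Loop vectorized;
  vectorized.is_vectorized = true;
  vectorized.body.push_back(broadcast);

  bool scope = false;
  visitBuggy(vectorized, scope); // prints "scalar access" -- the bug
  scope = false;
  visitFixed(vectorized, scope); // prints "vectorized access"
  return 0;
}

In the real codegen.cpp, handleScope(node->body()) plays the role of
emitBody(), and vectorize_scope_ is the flag the rest of the generator
consults when deciding how to emit loads and stores.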