From 5bbddfd46c0944429cd0df97b6374d7ccbd8b871 Mon Sep 17 00:00:00 2001
From: Naoya Maruyama
Date: Tue, 27 Jul 2021 16:20:11 -0700
Subject: [PATCH] Fix vectorization with inner broadcast axes (#1022)

Vectorization was incorrectly disabled when inner broadcast axes were
present.

Fixes #1021. Patched to address a CI failure.

Co-authored-by: jjsjann123
---
 .github/workflows/clang_format.yml            | 48 ------------------
 .github/workflows/lint.yml                    |  2 +-
 .../pytorch-linux-xenial-py3.6-gcc5.4.yml     |  1 +
 .../workflows/pytorch-win-vs2019-cpu-py3.yml  |  1 +
 test/cpp/jit/test_gpu.cpp                     | 33 +++++++++++++
 torch/csrc/jit/codegen/cuda/codegen.cpp       |  5 +-
 6 files changed, 40 insertions(+), 50 deletions(-)
 delete mode 100644 .github/workflows/clang_format.yml

diff --git a/.github/workflows/clang_format.yml b/.github/workflows/clang_format.yml
deleted file mode 100644
index 33841222495d9..0000000000000
--- a/.github/workflows/clang_format.yml
+++ /dev/null
@@ -1,48 +0,0 @@
-name: clang-format
-
-on:
-  pull_request:
-
-jobs:
-  clang-format:
-    runs-on: ubuntu-18.04
-    steps:
-      - name: Setup Python
-        uses: actions/setup-python@v2
-        with:
-          python-version: 3.x
-          architecture: x64
-      - name: Fetch PyTorch
-        uses: actions/checkout@v2
-        with:
-          fetch-depth: 0 # deep clone, to allow us to use git merge-base
-      - name: Run clang-format
-        env:
-          BASE_SHA: ${{ github.event.pull_request.base.sha }}
-        run: |
-          set -eu
-          # This is necessary to get the same results regardless of whether the
-          # PR was opened directly or from a forked repo. See: `9f890a92` for more info.
-          git remote add upstream https://github.com/csarofeen/pytorch
-          git fetch upstream "$GITHUB_BASE_REF"
-
-          # only run clang-format on allowlisted files
-          echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
-          echo "| clang-format failures found! Run: "
-          echo "| tools/clang_format_ci.sh ${BASE_SHA} "
-          echo "| to fix this error. "
-          echo "| For more info, see: https://github.com/pytorch/pytorch/wiki/clang-format "
-          echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
-
-          tools/clang_format_ci.sh "${BASE_SHA}"
-
-          GIT_DIFF=$(git diff)
-          if [[ -z $GIT_DIFF ]]; then
-            exit 0
-          fi
-          echo "$GIT_DIFF"
-          exit 1
-
-concurrency:
-  group: clang-format-${{ github.event.pull_request.number || github.sha }}
-  cancel-in-progress: true
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 9b8007a240771..eb3b519b683a7 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -313,7 +313,7 @@ jobs:
       fi
 
   clang-tidy:
-    runs-on: linux.2xlarge
+    runs-on: ubuntu-18.04 # linux.2xlarge doesn't run on our repo CI?
     container:
       # ubuntu20.04-cuda11.2-py3.8-tidy11
       image: ghcr.io/pytorch/cilint-clang-tidy:d8f0c777964d0dd8a147360de80aed1a13eb613a
diff --git a/.github/workflows/pytorch-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/pytorch-linux-xenial-py3.6-gcc5.4.yml
index 4cb288530d2b5..63bbed9da5508 100644
--- a/.github/workflows/pytorch-linux-xenial-py3.6-gcc5.4.yml
+++ b/.github/workflows/pytorch-linux-xenial-py3.6-gcc5.4.yml
@@ -5,6 +5,7 @@ name: Linux CI (pytorch-linux-xenial-py3.6-gcc5.4)
 
 on:
   # TODO: Enable pull_request builds when we can verify capacity can be met by auto-scalers
+  pull_request:
   push:
     branches:
       - master
diff --git a/.github/workflows/pytorch-win-vs2019-cpu-py3.yml b/.github/workflows/pytorch-win-vs2019-cpu-py3.yml
index 1fa4851fdf1f0..213f90de74870 100644
--- a/.github/workflows/pytorch-win-vs2019-cpu-py3.yml
+++ b/.github/workflows/pytorch-win-vs2019-cpu-py3.yml
@@ -4,6 +4,7 @@ name: Windows CI (pytorch-win-vs2019-cpu-py3)
 
 on:
+  pull_request:
   push:
     branches:
       - master
diff --git a/test/cpp/jit/test_gpu.cpp b/test/cpp/jit/test_gpu.cpp
index 2dd41df1fff6a..aad7bb5b05d0c 100644
--- a/test/cpp/jit/test_gpu.cpp
+++ b/test/cpp/jit/test_gpu.cpp
@@ -15579,6 +15579,39 @@ TEST(NVFuserTest, FusionIssue1016_CUDA) {
   testValidate(&fusion, outputs, {t0}, {ref}, __LINE__, __FILE__);
 }
 
+// Reproducer of #1021
+TEST(NVFuserTest, FusionIssue1021_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+  auto tv1 = add(tv0, new Double(1));
+  auto tv2 = broadcast(tv1, {false, true});
+  fusion.addOutput(tv2);
+
+  auto tv3 = tv2->cache_before();
+
+  tv2->split(0, 2);
+
+  tv1->computeAt(tv2, 1);
+
+  tv2->axis(0)->parallelize(ParallelType::TIDx);
+  tv2->axis(1)->parallelize(ParallelType::Vectorize);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({10}, options);
+  std::vector<IValue> inputs = {t0};
+  auto outputs = fe.runFusion(inputs);
+
+  auto ref = (t0 + 1).unsqueeze(-1);
+
+  testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
+}
+
 } // namespace jit
 } // namespace torch
 
 #endif // #if defined(USE_CUDA)
diff --git a/torch/csrc/jit/codegen/cuda/codegen.cpp b/torch/csrc/jit/codegen/cuda/codegen.cpp
index 2c2c041f6842e..0cc1986203587 100644
--- a/torch/csrc/jit/codegen/cuda/codegen.cpp
+++ b/torch/csrc/jit/codegen/cuda/codegen.cpp
@@ -921,7 +921,10 @@ class CudaKernelGenerator : private kir::IrVisitor {
 
   void visit(const kir::ForLoop* node) final {
     // TODO(kir): handle this during lowering
-    if (node->iter_domain()->isBroadcast() || node->vectorize()) {
+    if (node->iter_domain()->isBroadcast()) {
+      handleScope(node->body());
+      return;
+    } else if (node->vectorize()) {
       vectorize_scope_ = node->vectorize();
       handleScope(node->body());
       vectorize_scope_ = false;
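
Note on the codegen.cpp change: before this patch, broadcast and vectorized
for-loops shared a single branch in visit(const kir::ForLoop*), so visiting a
broadcast loop nested inside a vectorized loop overwrote vectorize_scope_ with
its own vectorize() value (false), silently turning off vectorized code
generation for the loop body. The patch gives broadcast loops their own branch
that elides the loop without touching the flag. Below is a minimal,
self-contained C++ sketch of that scope-flag pattern; the Loop struct and the
visitBuggy/visitFixed/emitBody functions are hypothetical stand-ins, not the
real kir::ForLoop / CudaKernelGenerator API.

#include <iostream>
#include <vector>

// Hypothetical stand-ins for the kernel IR (not the real nvFuser classes):
// a for-loop is a broadcast (extent-1) loop, a vectorized loop, or a plain
// serial loop; an empty body stands for the innermost expression.
struct Loop {
  bool is_broadcast = false;
  bool is_vectorized = false;
  std::vector<Loop> body;
};

using Visitor = void (*)(const Loop&, bool&);

// Emit the loop body: print the innermost access, or recurse into children.
void emitBody(const Loop& node, bool& scope, Visitor visit) {
  if (node.body.empty()) {
    std::cout << (scope ? "vectorized" : "scalar") << " access\n";
  }
  for (const Loop& child : node.body) {
    visit(child, scope);
  }
}

// Pre-patch logic: broadcast and vectorized loops shared one branch, so a
// broadcast loop nested under a vectorized loop overwrote the flag with its
// own vectorize() == false and disabled vectorization for its entire body.
void visitBuggy(const Loop& node, bool& scope) {
  if (node.is_broadcast || node.is_vectorized) {
    scope = node.is_vectorized; // false for a broadcast axis!
    emitBody(node, scope, visitBuggy);
    scope = false;
    return;
  }
  emitBody(node, scope, visitBuggy);
}

// Post-patch logic: a broadcast loop is elided (its body is emitted in
// place) without touching the vectorization flag at all.
void visitFixed(const Loop& node, bool& scope) {
  if (node.is_broadcast) {
    emitBody(node, scope, visitFixed);
    return;
  } else if (node.is_vectorized) {
    scope = true;
    emitBody(node, scope, visitFixed);
    scope = false;
    return;
  }
  emitBody(node, scope, visitFixed);
}

int main() {
  // A vectorized loop with an inner broadcast axis, as in issue #1021
  // (tv2 = broadcast(tv1, ...) with axis(1) parallelized as Vectorize).
  Loop leaf; // innermost expression
  Loop broadcast;
  broadcast.is_broadcast = true;
  broadcast.body.push_back(leaf);
  Loop vectorized;
  vectorized.is_vectorized = true;
  vectorized.body.push_back(broadcast);

  bool scope = false;
  visitBuggy(vectorized, scope); // prints "scalar access" -- the bug
  scope = false;
  visitFixed(vectorized, scope); // prints "vectorized access"
  return 0;
}

In the real codegen.cpp, handleScope(node->body()) plays the role of
emitBody(), and vectorize_scope_ is the flag the rest of the generator
consults when deciding how to emit loads and stores.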