diff --git a/.ci/pytorch/macos-build.sh b/.ci/pytorch/macos-build.sh index dbba68081d3eb..0b0b1e3599b30 100755 --- a/.ci/pytorch/macos-build.sh +++ b/.ci/pytorch/macos-build.sh @@ -37,7 +37,7 @@ cross_compile_arm64() { # Cross compilation for arm64 # Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests # that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448 - USE_DISTRIBUTED=0 CMAKE_OSX_ARCHITECTURES=arm64 MACOSX_DEPLOYMENT_TARGET=11.0 USE_MKLDNN=OFF USE_QNNPACK=OFF WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel + USE_DISTRIBUTED=1 CMAKE_OSX_ARCHITECTURES=arm64 MACOSX_DEPLOYMENT_TARGET=11.0 USE_OPENMP=OFF USE_MKLDNN=OFF USE_QNNPACK=OFF WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel } compile_x86_64() { diff --git a/.github/auto_request_review.yml b/.github/auto_request_review.yml deleted file mode 100644 index 765fd1715e891..0000000000000 --- a/.github/auto_request_review.yml +++ /dev/null @@ -1,29 +0,0 @@ -# Documented at https://github.com/necojackarc/auto-request-review -reviewers: - groups: - symbolic-shapes: - - ezyang - - Chillee - - albanD - - miladm - - bdhirsh - - voznesenskym - - jbschlosser - - per_author: - symbolic-shapes: - - symbolic-shapes - - antoniojkim - - wconstab - - SherlockNoMad - -files: - # none yet, TODO: migrate CODEOWNERS here - -options: - ignore_draft: true - ignored_keywords: - - DO NOT REVIEW - # Just manually setup a self-referential per_author rule if you - # want group assignment - enable_group_assignment: false diff --git a/.github/workflows/_mac-build.yml b/.github/workflows/_mac-build.yml index f5f66ae5129bf..5a6483ad54b3e 100644 --- a/.github/workflows/_mac-build.yml +++ b/.github/workflows/_mac-build.yml @@ -63,8 +63,8 @@ on: jobs: build: - # Don't run on forked repos. - if: github.repository_owner == 'pytorch' + # # Don't run on forked repos. + # if: github.repository_owner == 'pytorch' runs-on: ${{ inputs.runner-type }} env: # For sccache access (only on non-forked PRs) @@ -106,6 +106,7 @@ jobs: environment-file: ${{ inputs.environment-file }} - name: Install macOS homebrew dependencies + if: ${{ runner.arch == 'X64' }} run: | # Install dependencies brew install libomp diff --git a/.github/workflows/_mac-test-mps.yml b/.github/workflows/_mac-test-mps.yml index 1fcafb6db66ff..f9c402a772ac7 100644 --- a/.github/workflows/_mac-test-mps.yml +++ b/.github/workflows/_mac-test-mps.yml @@ -83,6 +83,20 @@ jobs: set -ex ${CONDA_RUN} python3 test/run_test.py --mps --verbose + - name: Run MPS Test Modules + id: test_2 + env: + ENV_NAME: conda-test-env-${{ github.run_id }} + shell: arch -arch arm64 bash {0} + # During bring up of test_modules don't show this as an error. + continue-on-error: true + run: | + # shellcheck disable=SC1090 + set -ex + # TODO(https://github.com/pytorch/pytorch/issues/79293) + + ${CONDA_RUN} python3 test/test_modules.py -k mps --verbose + - name: Print remaining test logs shell: bash if: always() diff --git a/.github/workflows/_mac-test.yml b/.github/workflows/_mac-test.yml index d8ede95f2958d..fb4ceaad40be9 100644 --- a/.github/workflows/_mac-test.yml +++ b/.github/workflows/_mac-test.yml @@ -128,6 +128,7 @@ jobs: echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}" - name: Install macOS homebrew dependencies + if: ${{ runner.arch == 'X64' }} run: | # Install dependencies brew install libomp @@ -182,6 +183,12 @@ jobs: run: | cat test/**/*.log || true + - name: Print remaining test logs + shell: bash + if: always() + run: | + cat test/**/*.log || true + - name: Get workflow job id id: get-job-id uses: ./.github/actions/get-workflow-job-id diff --git a/.github/workflows/auto_request_review.yml b/.github/workflows/auto_request_review.yml deleted file mode 100644 index 7c98c2990fba7..0000000000000 --- a/.github/workflows/auto_request_review.yml +++ /dev/null @@ -1,22 +0,0 @@ -name: Auto Request Review - -on: - pull_request: - types: [opened, ready_for_review, reopened] - -jobs: - auto-request-review: - # Don't run on forked repos - if: ${{ !github.event.pull_request.head.repo.fork }} - name: Auto Request Review - runs-on: ubuntu-latest - steps: - - name: Request review based on files changes and/or groups the author belongs to - # v0.7.0 - uses: necojackarc/auto-request-review@e08cdffa277d50854744de3f76230260e61c67f4 - with: - token: ${{ secrets.GITHUB_TOKEN }} - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true diff --git a/.github/workflows/check-labels.yml b/.github/workflows/check-labels.yml deleted file mode 100644 index 5fa5fed16daf8..0000000000000 --- a/.github/workflows/check-labels.yml +++ /dev/null @@ -1,44 +0,0 @@ -name: Check Labels - -on: - pull_request: - types: [opened, synchronize, reopened, labeled, unlabeled] - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - check-labels: - name: Check labels - runs-on: linux.20_04.4x - steps: - - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@master - with: - submodules: false - fetch-depth: 1 - - - name: Setup Python - uses: actions/setup-python@v4 - with: - python-version: '3.8' - architecture: x64 - check-latest: false - cache: pip - cache-dependency-path: | - **/.github/requirements-gha-cache.txt - - - name: Install requirements - id: requirements - run: | - pip install -r .github/requirements-gha-cache.txt --user - - - name: Check labels - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - PR_NUM: ${{ github.event.number }} - run: | - set -ex - python3 .github/scripts/check_labels.py "${PR_NUM}" diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 5dc152286e503..58566ebc37465 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -5,251 +5,77 @@ on: push: branches: - master - - main - - release/* - - landchecks/* workflow_dispatch: # The names of steps that actually test the code should be suffixed with `(nonretryable)`. # When any other step fails, it's job will be retried once by retryBot. jobs: - docker-image: - name: docker-image - uses: ./.github/workflows/_calculate-docker-image.yml - with: - docker-image-name: pytorch-linux-focal-linter - lintrunner: - needs: docker-image - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main - with: - runner: linux.2xlarge - docker-image: ${{ needs.docker-image.outputs.docker-image }} - script: | - # The generic Linux job chooses to use base env, not the one setup by the image - CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") - conda activate "${CONDA_ENV}" - - CACHE_DIRECTORY="/tmp/.lintbin" - # Try to recover the cached binaries - if [[ -d "${CACHE_DIRECTORY}" ]]; then - # It's ok to fail this as lintrunner init would download these binaries - # again if they do not exist - cp -r "${CACHE_DIRECTORY}" . || true - fi - - # This has already been cached in the docker image - lintrunner init 2> /dev/null - - # Do build steps necessary for linters - python3 -m tools.linter.clang_tidy.generate_build_files - python3 -m tools.generate_torch_version --is_debug=false - python3 -m tools.pyi.gen_pyi \ - --native-functions-path aten/src/ATen/native/native_functions.yaml \ - --tags-path aten/src/ATen/native/tags.yaml \ - --deprecated-functions-path "tools/autograd/deprecated.yaml" - - RC=0 - # Run lintrunner on all files - if ! lintrunner --force-color --all-files --tee-json=lint.json 2> /dev/null; then - echo "" - echo -e "\e[1m\e[36mYou can reproduce these results locally by using \`lintrunner\`.\e[0m" - echo -e "\e[1m\e[36mSee https://github.com/pytorch/pytorch/wiki/lintrunner for setup instructions.\e[0m" - RC=1 - fi - - # Use jq to massage the JSON lint output into GitHub Actions workflow commands. - jq --raw-output \ - '"::\(if .severity == "advice" or .severity == "disabled" then "warning" else .severity end) file=\(.path),line=\(.line),col=\(.char),title=\(.code) \(.name)::" + (.description | gsub("\\n"; "%0A"))' \ - lint.json || true - - exit $RC - - quick-checks: - needs: docker-image - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main - with: - runner: linux.2xlarge - docker-image: ${{ needs.docker-image.outputs.docker-image }} - script: | - # The generic Linux job chooses to use base env, not the one setup by the image - CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") - conda activate "${CONDA_ENV}" - - # Ensure no non-breaking spaces - # NB: We use 'printf' below rather than '\u000a' since bash pre-4.2 - # does not support the '\u000a' syntax (which is relevant for local linters) - (! git --no-pager grep -In "$(printf '\xC2\xA0')" -- . || (echo "The above lines have non-breaking spaces (U+00A0); please convert them to spaces (U+0020)"; false)) - - # Ensure cross-OS compatible file names - (! git ls-files | grep -E '([<>:"|?*]|[ .]$)' || (echo "The above file names are not valid across all operating systems. Please ensure they don't contain the characters '<>:""|?*' and don't end with a white space or a '.' "; false)) - - # Ensure no versionless Python shebangs - (! git --no-pager grep -In '#!.*python$' -- . || (echo "The above lines have versionless Python shebangs; please specify either python2 or python3"; false)) - - # Ensure ciflow tags mentioned in config - python3 .github/scripts/collect_ciflow_labels.py --validate-tags - - # C++ docs check - pushd docs/cpp/source - ./check-doxygen.sh - popd - - # CUDA kernel launch check - set -eux - python3 torch/testing/_internal/check_kernel_launches.py |& tee cuda_kernel_launch_checks.txt - - pr-sanity-checks: - name: pr-sanity-checks - runs-on: [self-hosted, linux.large] - # Only run this on pull requests. This check is simple enough to be done without a Docker image - if: github.event_name == 'pull_request' && !contains(github.event.pull_request.labels.*.name, 'skip-pr-sanity-checks') + runs-on: macos-m1-12 steps: - name: Checkout PyTorch uses: pytorch/pytorch/.github/actions/checkout-pytorch@master with: submodules: false - fetch-depth: -1 + fetch-depth: 1 + + - name: Setup miniconda + uses: pytorch/test-infra/.github/actions/setup-miniconda@main + with: + python-version: 3.9 + environment-file: .github/requirements/conda-env-${{ runner.os }}-${{ runner.arch }} + # pip-requirements-file: .github/requirements/pip-requirements-${{ runner.os }}.txt - - name: PR size check (nonretryable) + - name: Install requirements env: - BASE: ${{ github.event.pull_request.base.sha }} - HEAD: ${{ github.event.pull_request.head.sha }} + ENV_NAME: conda-test-env-${{ github.run_id }} + PY_VERS: 3.9 + shell: arch -arch arm64 bash {0} run: | - bash .github/scripts/pr-sanity-check.sh - - workflow-checks: - needs: docker-image - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main - with: - runner: linux.2xlarge - docker-image: ${{ needs.docker-image.outputs.docker-image }} - script: | - # The generic Linux job chooses to use base env, not the one setup by the image - CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") - conda activate "${CONDA_ENV}" - - # Regenerate workflows - .github/scripts/generate_ci_workflows.py + # shellcheck disable=SC1090 + set -ex + ${CONDA_RUN} python3 -m pip install --force-reinstall -r .github/requirements-gha-cache.txt - RC=0 - # Assert that regenerating the workflows didn't change them - if ! .github/scripts/report_git_status.sh .github/workflows; then - echo - echo 'As shown by the above diff, the committed .github/workflows' - echo 'are not up to date according to .github/templates.' - echo 'Please run this command, commit, and push again to your PR:' - echo - echo ' .github/scripts/generate_ci_workflows.py' - echo - echo 'If running that command does nothing, you may need to rebase' - echo 'onto a more recent commit from the PyTorch master branch.' - RC=1 - fi - - # Check that jobs will be cancelled - .github/scripts/ensure_actions_will_cancel.py - - exit $RC - - toc: - needs: docker-image - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main - with: - runner: linux.2xlarge - docker-image: ${{ needs.docker-image.outputs.docker-image }} - script: | - # The generic Linux job chooses to use base env, not the one setup by the image - CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") - conda activate "${CONDA_ENV}" - - # Regenerate ToCs and check that they didn't change - set -eu - - export PATH=~/.npm-global/bin:"$PATH" - for FILE in $(git grep -Il '' -- '**.md'); do - markdown-toc --bullets='-' -i "$FILE" - done - - if ! .github/scripts/report_git_status.sh .; then - echo - echo 'As shown by the above diff, the table of contents in one or' - echo 'more Markdown files is not up to date with the file contents.' - echo 'You can either apply that Git diff directly to correct the' - echo 'table of contents, or if you have npm installed, you can' - echo 'install the npm package markdown-toc and run the following' - # shellcheck disable=SC2016 - echo 'command (replacing $FILE with the filename for which you want' - echo 'to regenerate the table of contents):' - echo - # shellcheck disable=SC2016 - echo " markdown-toc --bullets='-' -i \"\$FILE\"" - false - fi - - test-tools: - name: Test tools - if: ${{ github.repository == 'pytorch/pytorch' }} - needs: docker-image - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main - with: - runner: linux.2xlarge - docker-image: ${{ needs.docker-image.outputs.docker-image }} - fetch-depth: 0 - script: | - # The generic Linux job chooses to use base env, not the one setup by the image - CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") - conda activate "${CONDA_ENV}" - - # Test tools - python3 -m unittest discover -vs tools/test -p 'test_*.py' - python3 -m unittest discover -vs .github/scripts -p 'test_*.py' + - name: Initialize lint dependencies + env: + ENV_NAME: conda-test-env-${{ github.run_id }} + PY_VERS: 3.9 + shell: arch -arch arm64 bash {0} + run: | + # shellcheck disable=SC1090 + set -ex + ${CONDA_RUN} lintrunner init - test_collect_env: - if: ${{ github.repository == 'pytorch/pytorch' }} - name: Test collect_env - runs-on: linux.20_04.4x - strategy: - matrix: - test_type: [with_torch, without_torch, older_python_version] - steps: - # [see note: pytorch repo ref] - # deep clone (fetch-depth 0) required, to allow us to use git log - - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@master - with: - submodules: false - fetch-depth: 1 - - name: Setup Python 3.5 - if: matrix.test_type == 'older_python_version' - uses: actions/setup-python@v4 - with: - python-version: '3.5' - architecture: x64 - check-latest: false - cache: pip - cache-dependency-path: | - **/requirements.txt - - name: Setup Python 3.8 - if: matrix.test_type != 'older_python_version' - uses: actions/setup-python@v4 - with: - python-version: '3.8' - architecture: x64 - check-latest: false - cache: pip - cache-dependency-path: | - **/requirements.txt - - name: Install torch - if: matrix.test_type == 'with_torch' + - name: Do build steps necessary for linters + env: + ENV_NAME: conda-test-env-${{ github.run_id }} + PY_VERS: 3.9 + shell: arch -arch arm64 bash {0} run: | - pip install -r requirements.txt - # Doesn't really matter what torch version, we just need ANY torch installed - pip install 'torch==1.*' - - name: Run collect_env.py (nonretryable) + # shellcheck disable=SC1090 + set -ex + ${CONDA_RUN} python3 -m tools.linter.clang_tidy.generate_build_files + ${CONDA_RUN} python3 -m tools.generate_torch_version --is_debug=false + ${CONDA_RUN} python3 -m tools.pyi.gen_pyi \ + --native-functions-path aten/src/ATen/native/native_functions.yaml \ + --tags-path aten/src/ATen/native/tags.yaml \ + --deprecated-functions-path "tools/autograd/deprecated.yaml" + + - name: Run lintrunner on all MPS files (nonretryable) + env: + ENV_NAME: conda-test-env-${{ github.run_id }} + PY_VERS: 3.9 + shell: arch -arch arm64 bash {0} run: | - # All we need to see is that it passes - python3 torch/utils/collect_env.py + # shellcheck disable=SC1090 + set -ex + set +e + if ! ${CONDA_RUN} lintrunner --force-color aten/src/ATen/native/mps/operations/* test/test_mps.py test/test_modules.py; then + echo "" + echo -e "\e[1m\e[36mYou can reproduce these results locally by using \`lintrunner\`.\e[0m" + echo -e "\e[1m\e[36mSee https://github.com/pytorch/pytorch/wiki/lintrunner for setup instructions.\e[0m" + exit 1 + fi concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} diff --git a/.github/workflows/mac-mps.yml b/.github/workflows/mac-mps.yml index 663eac84514fe..a2ca4867fd76b 100644 --- a/.github/workflows/mac-mps.yml +++ b/.github/workflows/mac-mps.yml @@ -1,10 +1,11 @@ name: Mac MPS on: - push: - tags: - - ciflow/mps/* - workflow_dispatch: + # push: + # tags: + # - ciflow/mps/* + # workflow_dispatch: + pull_request: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} @@ -18,7 +19,7 @@ jobs: sync-tag: macos-12-py3-arm64-build build-environment: macos-12-py3-arm64 xcode-version: "13.3.1" - runner-type: macos-12-xl + runner-type: macos-m1-13 build-generates-artifacts: true # To match the one pre-installed in the m1 runners python_version: 3.9.12 diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml deleted file mode 100644 index 1c137084a97e9..0000000000000 --- a/.github/workflows/periodic.yml +++ /dev/null @@ -1,284 +0,0 @@ -name: periodic - -on: - schedule: - - cron: 45 0,4,8,12,16,20 * * * - - cron: 29 8 * * * # about 1:29am PDT, for mem leak check and rerun disabled tests - push: - tags: - - ciflow/periodic/* - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }} - cancel-in-progress: true - -jobs: - parallelnative-linux-focal-py3_8-gcc7-build: - name: parallelnative-linux-focal-py3.8-gcc7 - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: parallelnative-linux-focal-py3.8-gcc7 - docker-image-name: pytorch-linux-focal-py3.8-gcc7 - test-matrix: | - { include: [ - { config: "default", shard: 1, num_shards: 2, runner: "linux.2xlarge" }, - { config: "default", shard: 2, num_shards: 2, runner: "linux.2xlarge" }, - ]} - - parallelnative-linux-focal-py3_8-gcc7-test: - name: parallelnative-linux-focal-py3.8-gcc7 - uses: ./.github/workflows/_linux-test.yml - needs: parallelnative-linux-focal-py3_8-gcc7-build - with: - build-environment: parallelnative-linux-focal-py3.8-gcc7 - docker-image: ${{ needs.parallelnative-linux-focal-py3_8-gcc7-build.outputs.docker-image }} - test-matrix: ${{ needs.parallelnative-linux-focal-py3_8-gcc7-build.outputs.test-matrix }} - - linux-bionic-cuda11_7-py3-gcc7-slow-gradcheck-build: - name: linux-bionic-cuda11.7-py3-gcc7-slow-gradcheck - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: linux-bionic-cuda11.7-py3-gcc7-slow-gradcheck - docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7 - test-matrix: | - { include: [ - { config: "default", shard: 1, num_shards: 2, runner: "linux.4xlarge.nvidia.gpu" }, - { config: "default", shard: 2, num_shards: 2, runner: "linux.4xlarge.nvidia.gpu" }, - ]} - - linux-bionic-cuda11_7-py3-gcc7-slow-gradcheck-test: - name: linux-bionic-cuda11.7-py3-gcc7-slow-gradcheck - uses: ./.github/workflows/_linux-test.yml - needs: linux-bionic-cuda11_7-py3-gcc7-slow-gradcheck-build - with: - build-environment: linux-bionic-cuda11.7-py3-gcc7-slow-gradcheck - docker-image: ${{ needs.linux-bionic-cuda11_7-py3-gcc7-slow-gradcheck-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-bionic-cuda11_7-py3-gcc7-slow-gradcheck-build.outputs.test-matrix }} - timeout-minutes: 300 - - linux-bionic-cuda11_7-py3_10-gcc7-periodic-dynamo-benchmarks-build: - name: cuda11.7-py3.10-gcc7-sm86-periodic-dynamo-benchmarks - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: linux-bionic-cuda11.7-py3.10-gcc7-sm86 - docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7 - cuda-arch-list: '8.6' - test-matrix: | - { include: [ - { config: "aot_eager_all", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, - # These jobs run too slowly so they must be sharded, unfortunately - { config: "dynamic_aot_eager_torchbench", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - ]} - - linux-bionic-cuda11_7-py3_10-gcc7-periodic-dynamo-benchmarks-test: - name: cuda11.7-py3.10-gcc7-sm86-periodic-dynamo-benchmarks - uses: ./.github/workflows/_linux-test.yml - needs: linux-bionic-cuda11_7-py3_10-gcc7-periodic-dynamo-benchmarks-build - with: - build-environment: linux-bionic-cuda11.7-py3.10-gcc7-sm86 - docker-image: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-periodic-dynamo-benchmarks-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-periodic-dynamo-benchmarks-build.outputs.test-matrix }} - - linux-focal-rocm5_4_2-py3_8-build: - name: linux-focal-rocm5.4.2-py3.8 - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: linux-focal-rocm5.4.2-py3.8 - docker-image-name: pytorch-linux-focal-rocm-n-py3 - test-matrix: | - { include: [ - { config: "slow", shard: 1, num_shards: 1, runner: "linux.rocm.gpu" }, - { config: "distributed", shard: 1, num_shards: 2, runner: "linux.rocm.gpu" }, - { config: "distributed", shard: 2, num_shards: 2, runner: "linux.rocm.gpu" }, - ]} - - linux-focal-rocm5_4_2-py3_8-test: - name: linux-focal-rocm5.4.2-py3.8 - uses: ./.github/workflows/_rocm-test.yml - needs: linux-focal-rocm5_4_2-py3_8-build - with: - build-environment: linux-focal-rocm5.4.2-py3.8 - docker-image: ${{ needs.linux-focal-rocm5_4_2-py3_8-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-focal-rocm5_4_2-py3_8-build.outputs.test-matrix }} - secrets: - AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }} - AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }} - - linux-bionic-cuda11_7-py3_9-gcc7-build: - name: linux-bionic-cuda11.7-py3.9-gcc7 - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: linux-bionic-cuda11.7-py3.9-gcc7 - docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7 - test-matrix: | - { include: [ - { config: "multigpu", shard: 1, num_shards: 1, runner: "linux.16xlarge.nvidia.gpu" }, - ]} - build-with-debug: false - - linux-bionic-cuda11_7-py3_9-gcc7-test: - name: linux-bionic-cuda11.7-py3.9-gcc7 - uses: ./.github/workflows/_linux-test.yml - needs: linux-bionic-cuda11_7-py3_9-gcc7-build - with: - build-environment: linux-bionic-cuda11.7-py3.9-gcc7 - docker-image: ${{ needs.linux-bionic-cuda11_7-py3_9-gcc7-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-bionic-cuda11_7-py3_9-gcc7-build.outputs.test-matrix }} - - linux-bionic-cuda11_7-py3_10-gcc7-debug-build: - name: linux-bionic-cuda11.7-py3.10-gcc7-debug - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: linux-bionic-cuda11.7-py3.10-gcc7-debug - docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7 - build-with-debug: true - test-matrix: | - { include: [ - { config: "default", shard: 1, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" }, - { config: "default", shard: 2, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" }, - { config: "default", shard: 3, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" }, - { config: "default", shard: 4, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" }, - ]} - - linux-bionic-cuda11_7-py3_10-gcc7-debug-test: - name: linux-bionic-cuda11.7-py3.10-gcc7-debug - uses: ./.github/workflows/_linux-test.yml - needs: linux-bionic-cuda11_7-py3_10-gcc7-debug-build - with: - build-environment: linux-bionic-cuda11.7-py3.10-gcc7-debug - docker-image: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-debug-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-debug-build.outputs.test-matrix }} - - linux-bionic-cuda11_8-py3_8-gcc7-debug-build: - name: linux-bionic-cuda11.8-py3.8-gcc7-debug - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: linux-bionic-cuda11.8-py3.8-gcc7-debug - docker-image-name: pytorch-linux-bionic-cuda11.8-cudnn8-py3-gcc7 - build-with-debug: true - test-matrix: | - { include: [ - { config: "default", shard: 1, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" }, - { config: "default", shard: 2, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" }, - { config: "default", shard: 3, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" }, - { config: "default", shard: 4, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" }, - ]} - - linux-bionic-cuda11_8-py3_8-gcc7-debug-test: - name: linux-bionic-cuda11.8-py3.8-gcc7-debug - uses: ./.github/workflows/_linux-test.yml - needs: linux-bionic-cuda11_8-py3_8-gcc7-debug-build - with: - build-environment: linux-bionic-cuda11.8-py3.8-gcc7-debug - docker-image: ${{ needs.linux-bionic-cuda11_8-py3_8-gcc7-debug-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-bionic-cuda11_8-py3_8-gcc7-debug-build.outputs.test-matrix }} - - libtorch-linux-bionic-cuda11_8-gcc7-build: - name: libtorch-linux-bionic-cuda11.8-gcc7 - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: libtorch-linux-bionic-cuda11.8-gcc7 - docker-image-name: pytorch-linux-bionic-cuda11.8-cudnn8-py3-gcc7 - build-generates-artifacts: false - - win-vs2019-cuda11_8-py3-build: - name: win-vs2019-cuda11.8-py3 - uses: ./.github/workflows/_win-build.yml - with: - build-environment: win-vs2019-cuda11.8-py3 - cuda-version: "11.8" - test-matrix: | - { include: [ - { config: "default", shard: 1, num_shards: 3, runner: "windows.g5.4xlarge.nvidia.gpu" }, - { config: "default", shard: 2, num_shards: 3, runner: "windows.g5.4xlarge.nvidia.gpu" }, - { config: "default", shard: 3, num_shards: 3, runner: "windows.g5.4xlarge.nvidia.gpu" }, - { config: "force_on_cpu", shard: 1, num_shards: 1, runner: "windows.4xlarge" }, - ]} - - win-vs2019-cuda11_8-py3-test: - name: win-vs2019-cuda11.8-py3 - uses: ./.github/workflows/_win-test.yml - needs: win-vs2019-cuda11_8-py3-build - with: - build-environment: win-vs2019-cuda11.8-py3 - cuda-version: "11.8" - test-matrix: ${{ needs.win-vs2019-cuda11_8-py3-build.outputs.test-matrix }} - - libtorch-linux-bionic-cuda11_7-gcc7-build: - name: libtorch-linux-bionic-cuda11.7-gcc7 - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: libtorch-linux-bionic-cuda11.7-gcc7 - docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7 - build-generates-artifacts: false - - win-vs2019-cuda11_7-py3-build: - name: win-vs2019-cuda11.7-py3 - uses: ./.github/workflows/_win-build.yml - with: - build-environment: win-vs2019-cuda11.7-py3 - cuda-version: "11.7" - test-matrix: | - { include: [ - { config: "default", shard: 1, num_shards: 3, runner: "windows.g5.4xlarge.nvidia.gpu" }, - { config: "default", shard: 2, num_shards: 3, runner: "windows.g5.4xlarge.nvidia.gpu" }, - { config: "default", shard: 3, num_shards: 3, runner: "windows.g5.4xlarge.nvidia.gpu" }, - { config: "force_on_cpu", shard: 1, num_shards: 1, runner: "windows.4xlarge" }, - ]} - - win-vs2019-cuda11_7-py3-test: - name: win-vs2019-cuda11.7-py3 - uses: ./.github/workflows/_win-test.yml - needs: win-vs2019-cuda11_7-py3-build - with: - build-environment: win-vs2019-cuda11.7-py3 - cuda-version: "11.7" - test-matrix: ${{ needs.win-vs2019-cuda11_7-py3-build.outputs.test-matrix }} - - ios-12-5-1-x86-64-coreml: - name: ios-12-5-1-x86-64-coreml - uses: ./.github/workflows/_ios-build-test.yml - with: - build-environment: ios-12-5-1-x86-64-coreml - ios-platform: SIMULATOR - ios-arch: x86_64 - - ios-12-5-1-arm64: - name: ios-12-5-1-arm64 - uses: ./.github/workflows/_ios-build-test.yml - with: - build-environment: ios-12-5-1-arm64 - ios-platform: OS - ios-arch: arm64 - - ios-12-5-1-arm64-coreml: - name: ios-12-5-1-arm64-coreml - uses: ./.github/workflows/_ios-build-test.yml - with: - build-environment: ios-12-5-1-arm64-coreml - ios-platform: OS - ios-arch: arm64 - - ios-12-5-1-arm64-custom-ops: - name: ios-12-5-1-arm64-custom-ops - uses: ./.github/workflows/_ios-build-test.yml - with: - build-environment: ios-12-5-1-arm64-custom-ops - ios-platform: OS - ios-arch: arm64 - - ios-12-5-1-arm64-metal: - name: ios-12-5-1-arm64-metal - uses: ./.github/workflows/_ios-build-test.yml - with: - build-environment: ios-12-5-1-arm64-metal - ios-platform: OS - ios-arch: arm64 - - buck-build-test: - name: buck-build-test - uses: ./.github/workflows/_buck-build-test.yml diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml deleted file mode 100644 index 2c5493639e4e7..0000000000000 --- a/.github/workflows/pull.yml +++ /dev/null @@ -1,368 +0,0 @@ -name: pull - -on: - pull_request: - push: - branches: - - master - - main - - release/* - - landchecks/* - workflow_dispatch: - schedule: - - cron: 29 8 * * * # about 1:29am PDT - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} - cancel-in-progress: true - -jobs: - linux-focal-py3_8-gcc7-build: - name: linux-focal-py3.8-gcc7 - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: linux-focal-py3.8-gcc7 - docker-image-name: pytorch-linux-focal-py3.8-gcc7 - test-matrix: | - { include: [ - { config: "default", shard: 1, num_shards: 2, runner: "linux.2xlarge" }, - { config: "default", shard: 2, num_shards: 2, runner: "linux.2xlarge" }, - { config: "distributed", shard: 1, num_shards: 2, runner: "linux.2xlarge" }, - { config: "distributed", shard: 2, num_shards: 2, runner: "linux.2xlarge" }, - { config: "functorch", shard: 1, num_shards: 1, runner: "linux.2xlarge" }, - { config: "docs_test", shard: 1, num_shards: 1, runner: "linux.2xlarge" }, - { config: "jit_legacy", shard: 1, num_shards: 1, runner: "linux.2xlarge" }, - { config: "backwards_compat", shard: 1, num_shards: 1, runner: "linux.2xlarge" }, - ]} - - linux-focal-py3_8-gcc7-test: - name: linux-focal-py3.8-gcc7 - uses: ./.github/workflows/_linux-test.yml - needs: linux-focal-py3_8-gcc7-build - with: - build-environment: linux-focal-py3.8-gcc7 - docker-image: ${{ needs.linux-focal-py3_8-gcc7-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-focal-py3_8-gcc7-build.outputs.test-matrix }} - - linux-docs: - name: linux-docs - uses: ./.github/workflows/_docs.yml - needs: linux-focal-py3_8-gcc7-build - with: - build-environment: linux-focal-py3.8-gcc7 - docker-image: ${{ needs.linux-focal-py3_8-gcc7-build.outputs.docker-image }} - - linux-focal-py3_8-gcc7-no-ops: - name: linux-focal-py3.8-gcc7-no-ops - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: linux-focal-py3.8-gcc7-no-ops - docker-image-name: pytorch-linux-focal-py3.8-gcc7 - - linux-focal-py3_8-gcc7-pch: - name: linux-focal-py3.8-gcc7-pch - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: linux-focal-py3.8-gcc7-pch - docker-image-name: pytorch-linux-focal-py3.8-gcc7 - - linux-focal-py3_9-clang7-asan-build: - name: linux-focal-py3.9-clang7-asan - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: linux-focal-py3.9-clang7-asan - docker-image-name: pytorch-linux-focal-py3-clang7-asan - test-matrix: | - { include: [ - { config: "default", shard: 1, num_shards: 5, runner: "linux.4xlarge" }, - { config: "default", shard: 2, num_shards: 5, runner: "linux.4xlarge" }, - { config: "default", shard: 3, num_shards: 5, runner: "linux.4xlarge" }, - { config: "default", shard: 4, num_shards: 5, runner: "linux.4xlarge" }, - { config: "default", shard: 5, num_shards: 5, runner: "linux.4xlarge" }, - { config: "functorch", shard: 1, num_shards: 1, runner: "linux.2xlarge" }, - ]} - - linux-focal-py3_9-clang7-asan-test: - name: linux-focal-py3.9-clang7-asan - uses: ./.github/workflows/_linux-test.yml - needs: linux-focal-py3_9-clang7-asan-build - with: - build-environment: linux-focal-py3.9-clang7-asan - docker-image: ${{ needs.linux-focal-py3_9-clang7-asan-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-focal-py3_9-clang7-asan-build.outputs.test-matrix }} - - linux-focal-py3_8-clang10-onnx-build: - name: linux-focal-py3.8-clang10-onnx - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: linux-focal-py3.8-clang10-onnx - docker-image-name: pytorch-linux-focal-py3-clang10-onnx - test-matrix: | - { include: [ - { config: "default", shard: 1, num_shards: 2, runner: "linux.2xlarge" }, - { config: "default", shard: 2, num_shards: 2, runner: "linux.2xlarge" }, - ]} - - linux-focal-py3_8-clang10-onnx-test: - name: linux-focal-py3.8-clang10-onnx - uses: ./.github/workflows/_linux-test.yml - needs: linux-focal-py3_8-clang10-onnx-build - with: - build-environment: linux-focal-py3.8-clang10-onnx - docker-image: ${{ needs.linux-focal-py3_8-clang10-onnx-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-focal-py3_8-clang10-onnx-build.outputs.test-matrix }} - - linux-bionic-py3_8-clang9-build: - name: linux-bionic-py3.8-clang9 - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: linux-bionic-py3.8-clang9 - docker-image-name: pytorch-linux-bionic-py3.8-clang9 - test-matrix: | - { include: [ - { config: "default", shard: 1, num_shards: 2, runner: "linux.2xlarge" }, - { config: "default", shard: 2, num_shards: 2, runner: "linux.2xlarge" }, - { config: "crossref", shard: 1, num_shards: 2, runner: "linux.2xlarge" }, - { config: "crossref", shard: 2, num_shards: 2, runner: "linux.2xlarge" }, - { config: "dynamo", shard: 1, num_shards: 2, runner: "linux.2xlarge" }, - { config: "dynamo", shard: 2, num_shards: 2, runner: "linux.2xlarge" }, - { config: "functorch", shard: 1, num_shards: 1, runner: "linux.2xlarge" }, - ]} - - linux-bionic-py3_8-clang9-test: - name: linux-bionic-py3.8-clang9 - uses: ./.github/workflows/_linux-test.yml - needs: linux-bionic-py3_8-clang9-build - with: - build-environment: linux-bionic-py3.8-clang9 - docker-image: ${{ needs.linux-bionic-py3_8-clang9-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-bionic-py3_8-clang9-build.outputs.test-matrix }} - - linux-bionic-py3_11-clang9-build: - name: linux-bionic-py3.11-clang9 - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: linux-bionic-py3.11-clang9 - docker-image-name: pytorch-linux-bionic-py3.11-clang9 - test-matrix: | - { include: [ - { config: "default", shard: 1, num_shards: 2, runner: "linux.2xlarge" }, - { config: "default", shard: 2, num_shards: 2, runner: "linux.2xlarge" }, - { config: "crossref", shard: 1, num_shards: 2, runner: "linux.2xlarge" }, - { config: "crossref", shard: 2, num_shards: 2, runner: "linux.2xlarge" }, - { config: "dynamo", shard: 1, num_shards: 2, runner: "linux.2xlarge" }, - { config: "dynamo", shard: 2, num_shards: 2, runner: "linux.2xlarge" }, - { config: "functorch", shard: 1, num_shards: 1, runner: "linux.2xlarge" }, - ]} - - linux-bionic-py3_11-clang9-test: - name: linux-bionic-py3.11-clang9 - uses: ./.github/workflows/_linux-test.yml - needs: linux-bionic-py3_11-clang9-build - with: - build-environment: linux-bionic-py3.11-clang9 - docker-image: ${{ needs.linux-bionic-py3_11-clang9-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-bionic-py3_11-clang9-build.outputs.test-matrix }} - - linux-vulkan-bionic-py3_11-clang9-build: - name: linux-vulkan-bionic-py3.11-clang9 - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: linux-vulkan-bionic-py3.11-clang9 - docker-image-name: pytorch-linux-bionic-py3.11-clang9 - test-matrix: | - { include: [ - { config: "default", shard: 1, num_shards: 1, runner: "linux.2xlarge" }, - ]} - - linux-vulkan-bionic-py3_11-clang9-test: - name: linux-vulkan-bionic-py3.11-clang9 - uses: ./.github/workflows/_linux-test.yml - needs: linux-vulkan-bionic-py3_11-clang9-build - with: - build-environment: linux-vulkan-bionic-py3.11-clang9 - docker-image: ${{ needs.linux-vulkan-bionic-py3_11-clang9-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-vulkan-bionic-py3_11-clang9-build.outputs.test-matrix }} - - linux-bionic-cuda11_7-py3_10-gcc7-build: - name: linux-bionic-cuda11.7-py3.10-gcc7 - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: linux-bionic-cuda11.7-py3.10-gcc7 - docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7 - test-matrix: | - { include: [ - { config: "default", shard: 1, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" }, - { config: "default", shard: 2, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" }, - { config: "default", shard: 3, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" }, - { config: "default", shard: 4, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" }, - { config: "distributed", shard: 1, num_shards: 3, runner: "linux.8xlarge.nvidia.gpu" }, - { config: "distributed", shard: 2, num_shards: 3, runner: "linux.8xlarge.nvidia.gpu" }, - { config: "distributed", shard: 3, num_shards: 3, runner: "linux.8xlarge.nvidia.gpu" }, - { config: "functorch", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" }, - { config: "deploy", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" }, - ]} - - linux-bionic-cuda11_7-py3_10-gcc7-test: - name: linux-bionic-cuda11.7-py3.10-gcc7 - uses: ./.github/workflows/_linux-test.yml - needs: linux-bionic-cuda11_7-py3_10-gcc7-build - with: - build-environment: linux-bionic-cuda11.7-py3.10-gcc7 - docker-image: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-build.outputs.test-matrix }} - - linux-focal-py3-clang7-mobile-build: - name: linux-focal-py3-clang7-mobile-build - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: linux-focal-py3-clang7-mobile-build - docker-image-name: pytorch-linux-focal-py3-clang7-asan - build-generates-artifacts: false - - linux-jammy-cuda-11_7-cudnn8-py3_8-clang12-build: - name: linux-jammy-cuda11.7-cudnn8-py3.8-clang12 - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: linux-jammy-cuda11.7-cudnn8-py3.8-clang12 - docker-image-name: pytorch-linux-jammy-cuda11.7-cudnn8-py3.8-clang12 - - linux-focal-py3-clang7-mobile-custom-build-static: - name: linux-focal-py3-clang7-mobile-custom-build-static - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: linux-focal-py3-clang7-mobile-custom-build-static - docker-image-name: pytorch-linux-focal-py3-clang7-android-ndk-r19c - build-generates-artifacts: false - - linux-bionic-py3_8-clang8-xla-build: - name: linux-bionic-py3_8-clang8-xla - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: linux-bionic-py3_8-clang8-xla - docker-image-name: xla_base - test-matrix: | - { include: [ - { config: "xla", shard: 1, num_shards: 1, runner: "linux.4xlarge" }, - ]} - - linux-bionic-py3_8-clang8-xla-test: - name: linux-bionic-py3_8-clang8-xla - uses: ./.github/workflows/_linux-test.yml - needs: linux-bionic-py3_8-clang8-xla-build - with: - build-environment: linux-bionic-py3_8-clang8-xla - docker-image: ${{ needs.linux-bionic-py3_8-clang8-xla-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-bionic-py3_8-clang8-xla-build.outputs.test-matrix }} - - win-vs2019-cpu-py3-build: - name: win-vs2019-cpu-py3 - uses: ./.github/workflows/_win-build.yml - with: - build-environment: win-vs2019-cpu-py3 - cuda-version: cpu - test-matrix: | - { include: [ - { config: "default", shard: 1, num_shards: 2, runner: "windows.4xlarge" }, - { config: "default", shard: 2, num_shards: 2, runner: "windows.4xlarge" }, - { config: "functorch", shard: 1, num_shards: 1, runner: "windows.4xlarge" }, - ]} - - win-vs2019-cpu-py3-test: - name: win-vs2019-cpu-py3 - uses: ./.github/workflows/_win-test.yml - needs: win-vs2019-cpu-py3-build - with: - build-environment: win-vs2019-cpu-py3 - cuda-version: cpu - test-matrix: ${{ needs.win-vs2019-cpu-py3-build.outputs.test-matrix }} - - win-vs2019-cuda11_7-py3-build: - if: github.event_name == 'pull_request' - name: win-vs2019-cuda11.7-py3 - uses: ./.github/workflows/_win-build.yml - with: - build-environment: win-vs2019-cuda11.7-py3 - cuda-version: "11.7" - sync-tag: win-cuda-build - test-matrix: | - { include: [ - { config: "default", shard: 1, num_shards: 5, runner: "windows.g5.4xlarge.nvidia.gpu" }, - { config: "default", shard: 2, num_shards: 5, runner: "windows.g5.4xlarge.nvidia.gpu" }, - { config: "default", shard: 3, num_shards: 5, runner: "windows.g5.4xlarge.nvidia.gpu" }, - { config: "default", shard: 4, num_shards: 5, runner: "windows.g5.4xlarge.nvidia.gpu" }, - { config: "default", shard: 5, num_shards: 5, runner: "windows.g5.4xlarge.nvidia.gpu" }, - { config: "functorch", shard: 1, num_shards: 1, runner: "windows.g5.4xlarge.nvidia.gpu" }, - { config: "force_on_cpu", shard: 1, num_shards: 1, runner: "windows.4xlarge" }, - ]} - - linux-bionic-cuda11_7-py3_10-gcc7-bazel-test: - name: linux-bionic-cuda11.7-py3.10-gcc7-bazel-test - uses: ./.github/workflows/_bazel-build-test.yml - with: - build-environment: linux-bionic-cuda11.7-py3.10-gcc7-bazel-test - docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7 - - linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single: - name: linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single - uses: ./.github/workflows/_android-build-test.yml - with: - build-environment: linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single - docker-image-name: pytorch-linux-focal-py3-clang7-android-ndk-r19c - - linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single-full-jit: - name: linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single-full-jit - uses: ./.github/workflows/_android-build-test.yml - with: - build-environment: linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single-full-jit - docker-image-name: pytorch-linux-focal-py3-clang7-android-ndk-r19c - - linux-focal-py3_8-gcc7-mobile-lightweight-dispatch-build: - name: linux-focal-py3.8-gcc7-mobile-lightweight-dispatch-build - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: linux-focal-py3.8-gcc7-mobile-lightweight-dispatch-build - docker-image-name: pytorch-linux-focal-py3.8-gcc7 - build-generates-artifacts: false - - linux-focal-rocm5_4_2-py3_8-build: - # don't run build twice on master - if: github.event_name == 'pull_request' - name: linux-focal-rocm5.4.2-py3.8 - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: linux-focal-rocm5.4.2-py3.8 - docker-image-name: pytorch-linux-focal-rocm-n-py3 - sync-tag: rocm-build - test-matrix: | - { include: [ - { config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu" }, - { config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu" }, - ]} - - linux-bionic-cuda11_7-py3_10-gcc7-sm86-build: - name: linux-bionic-cuda11.7-py3.10-gcc7-sm86 - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: linux-bionic-cuda11.7-py3.10-gcc7-sm86 - docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7 - cuda-arch-list: 8.6 - test-matrix: | - { include: [ - { config: "default", shard: 1, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "default", shard: 2, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "default", shard: 3, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "default", shard: 4, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "slow", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "slow", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "functorch", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, - ]} - - linux-bionic-cuda11_7-py3_10-gcc7-sm86-test: - name: linux-bionic-cuda11.7-py3.10-gcc7-sm86 - uses: ./.github/workflows/_linux-test.yml - needs: linux-bionic-cuda11_7-py3_10-gcc7-sm86-build - with: - build-environment: linux-bionic-cuda11.7-py3.10-gcc7-sm86 - docker-image: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-sm86-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-sm86-build.outputs.test-matrix }} diff --git a/.github/workflows/run_torchbench.yml b/.github/workflows/run_torchbench.yml deleted file mode 100644 index 8d55f6a9479ca..0000000000000 --- a/.github/workflows/run_torchbench.yml +++ /dev/null @@ -1,103 +0,0 @@ -name: TorchBench CI (pytorch-linux-py3.8-cu116) -on: - pull_request: - -env: - PYTHON_VERSION: "3.8" - # must be consistent with https://github.com/pytorch/benchmark/blob/main/requirements.txt#L19 - NUMPY_VERSION: "1.21.2" - SETUP_SCRIPT: "/data/nvme/bin/setup_instance.sh" - PR_NUM: ${{ github.event.number }} - PR_BODY: ${{ github.event.pull_request.body }} - PR_BASE_SHA: ${{ github.event.pull_request.base.sha }} - PR_HEAD_SHA: ${{ github.event.pull_request.head.sha }} - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }} - -jobs: - run-torchbench: - # We don't accept running on non-pytorch repos because of security concerns - # Only run the job when the body contains magic word "RUN_TORCHBENCH:" - if: ${{ github.repository_owner == 'pytorch' && contains(github.event.pull_request.body, 'RUN_TORCHBENCH:') }} - runs-on: [self-hosted, bm-runner] - # Set to 12 hours - timeout-minutes: 720 - steps: - - name: Checkout PyTorch - uses: malfet/checkout@silent-checkout - with: - path: pytorch - - name: Update self-hosted PyTorch - run: | - pushd "${HOME}"/pytorch - git remote prune origin - git fetch - popd - - name: Create conda environment and install deps - run: | - conda create -y -n pr-ci python="${PYTHON_VERSION}" - # shellcheck source=/dev/null - . "${SETUP_SCRIPT}" - conda activate pr-ci - conda install -y numpy="${NUMPY_VERSION}" requests ninja pyyaml mkl mkl-include \ - setuptools cmake=3.22.* typing-extensions boto3 \ - pillow pytest tabulate gitpython git-lfs tqdm psutil - pip install --pre torch torchvision torchtext -f https://download.pytorch.org/whl/nightly/cu116/torch_nightly.html - - name: Setup TorchBench branch - run: | - # shellcheck source=/dev/null - . "${SETUP_SCRIPT}" - conda activate pr-ci - PR_BODY_FILE=/tmp/pr-body.txt - echo "$PR_BODY" > ${PR_BODY_FILE} - python pytorch/.github/scripts/run_torchbench.py --pr-body "${PR_BODY_FILE}" set-torchbench-branch - - name: Checkout TorchBench - uses: malfet/checkout@silent-checkout - with: - repository: pytorch/benchmark - path: benchmark - lfs: false - ref: ${{ env.TORCHBENCH_BRANCH }} - - name: GPU Info - run: | - nvidia-smi - - name: Run TorchBench - run: | - set -x - pushd "${HOME}"/pytorch - PR_MERGE_BASE=$(git merge-base "$PR_BASE_SHA" "$PR_HEAD_SHA") - popd - PR_BODY_FILE=/tmp/pr-body.txt - echo "$PR_BODY" > ${PR_BODY_FILE} - # shellcheck source=/dev/null - . "${SETUP_SCRIPT}" - conda activate pr-ci - python3 pytorch/.github/scripts/run_torchbench.py \ - --pr-body "$PR_BODY_FILE" \ - run \ - --pytorch-path "${HOME}"/pytorch \ - --torchbench-path "${PWD}"/benchmark \ - --pr-num "$PR_NUM" \ - --pr-base-sha "$PR_MERGE_BASE" \ - --pr-head-sha "$PR_HEAD_SHA" - - name: Upload result to S3 - run: | - # shellcheck source=/dev/null - . "${SETUP_SCRIPT}" - conda activate pr-ci - python3 pytorch/.github/scripts/run_torchbench.py \ - upload-s3 \ - --result-dir "${HOME}/.torchbench/bisection/pr${{ github.event.number }}" - - name: Remove conda environment and cleanup - run: | - conda env remove --name pr-ci - rm /tmp/pr-body.txt - - name: Upload artifact - uses: actions/upload-artifact@v3 - with: - name: TorchBench result - path: ~/.torchbench/bisection/pr${{ github.event.number }} - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true diff --git a/aten/src/ATen/native/mps/OperationUtils.mm b/aten/src/ATen/native/mps/OperationUtils.mm index 978162aed855a..4e76c172fb6e9 100644 --- a/aten/src/ATen/native/mps/OperationUtils.mm +++ b/aten/src/ATen/native/mps/OperationUtils.mm @@ -265,7 +265,7 @@ void printTensorNDArray(const Tensor& t) { id srcBuf = getMTLBufferStorage(src); bool sliceViewTensor = canSliceViewTensor(src, mpsShape); // a view tensor could be contiguous (e.g., slice ops) or non-contiguous (e.g., transpose()) - if ((!src.is_contiguous() || (src.is_view() && src.storage_offset() && !sliceViewTensor)) && gatherTensorData) { + if ((!src.is_contiguous() || (src.storage_offset() && !sliceViewTensor)) && gatherTensorData) { Tensor emptyShell = Tensor(); // use "_tensor" from Placeholder to retain view's output during its usage in other ops _tensor = gatherViewTensor(src, emptyShell); diff --git a/aten/src/ATen/native/mps/operations/Activation.mm b/aten/src/ATen/native/mps/operations/Activation.mm index 9e643ebf29390..440cde4140f45 100644 --- a/aten/src/ATen/native/mps/operations/Activation.mm +++ b/aten/src/ATen/native/mps/operations/Activation.mm @@ -1208,8 +1208,7 @@ void elu_variants_out_mps ( { CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} MPSGraphTensor *gradOutputTensor_ = nil; - MPSGraphTensor *inputTensor_ = nil; - MPSGraphTensor *resultTensor_ = nil; + MPSGraphTensor *selfOrResultTensor_ = nil; MPSGraphTensor *gradInputTensor_ = nil; }; @@ -1218,7 +1217,7 @@ void elu_variants_out_mps ( MPSStream* stream = getCurrentMPSStream(); @autoreleasepool { - string key = "elu_backward_out_mps:" + getTensorsStringKey({grad_output}) + ":" + + string key = "elu_backward_out_mps:" + getTensorsStringKey({grad_output, self_or_result}) + ":" + to_string(alpha.to()) + ":" + to_string(scale.to()) + ":" + to_string(input_scale.to()) + ":" + @@ -1235,18 +1234,14 @@ void elu_variants_out_mps ( newCachedGraph = new CachedGraph(mpsGraph); MPSGraphTensor* gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output); - - MPSGraphTensor* inputTensor = nil; - MPSGraphTensor* resultTensor = nil; - + MPSGraphTensor* selfOrResultTensor = mpsGraphRankedPlaceHolder(mpsGraph, self_or_result); MPSGraphTensor* lessThanZeroGradTensor = nil; if(is_result) { - resultTensor = mpsGraphRankedPlaceHolder(mpsGraph, self_or_result); MPSGraphTensor* alphaTensor = [mpsGraph constantWithScalar:alpha.to() shape:@[@1] dataType:getMPSDataType(grad_output.scalar_type())]; - MPSGraphTensor* resultPlusAlphaTensor = [mpsGraph additionWithPrimaryTensor:resultTensor + MPSGraphTensor* resultPlusAlphaTensor = [mpsGraph additionWithPrimaryTensor:selfOrResultTensor secondaryTensor:alphaTensor name:nil]; auto constMul = scale.to() * input_scale.to(); @@ -1258,11 +1253,10 @@ void elu_variants_out_mps ( name:nil]; } else { - inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self_or_result); MPSGraphTensor* inputScaleTensor = [mpsGraph constantWithScalar:input_scale.to() shape:@[@1] dataType:getMPSDataType(grad_output.scalar_type())]; - MPSGraphTensor* scaledInputTensor = [mpsGraph multiplicationWithPrimaryTensor:inputTensor + MPSGraphTensor* scaledInputTensor = [mpsGraph multiplicationWithPrimaryTensor:selfOrResultTensor secondaryTensor:inputScaleTensor name:nil]; MPSGraphTensor* expTensor = [mpsGraph exponentWithTensor:scaledInputTensor @@ -1282,7 +1276,7 @@ void elu_variants_out_mps ( MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0f shape:@[@1] dataType:getMPSDataType(grad_output.scalar_type())]; - MPSGraphTensor* predicateTensor = [mpsGraph greaterThanWithPrimaryTensor:inputTensor + MPSGraphTensor* predicateTensor = [mpsGraph greaterThanWithPrimaryTensor:selfOrResultTensor secondaryTensor:zeroTensor name:nil]; MPSGraphTensor* gradTensor = [mpsGraph selectWithPredicateTensor:predicateTensor @@ -1294,8 +1288,7 @@ void elu_variants_out_mps ( name:nil]; newCachedGraph->gradOutputTensor_ = gradOutputTensor; - newCachedGraph->inputTensor_ = inputTensor; - newCachedGraph->resultTensor_ = resultTensor; + newCachedGraph->selfOrResultTensor_ = selfOrResultTensor; newCachedGraph->gradInputTensor_ = gradInputTensor; } return newCachedGraph; @@ -1304,28 +1297,14 @@ void elu_variants_out_mps ( } Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output, nil, executeGatherOp); - Placeholder selfPlaceholder = Placeholder(); - Placeholder resultPlaceholder = Placeholder(); - if(is_result) - resultPlaceholder = Placeholder(cachedGraph->resultTensor_, self_or_result, nil, executeGatherOp); - else - selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self_or_result, nil, executeGatherOp); + Placeholder selfOrResultPlaceholder = Placeholder(cachedGraph->selfOrResultTensor_, self_or_result, nil, executeGatherOp); Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, out.has_storage() ? out : grad_input, nil, false); // Create dictionary of inputs and outputs - NSDictionary* feeds = nil; - - if(is_result) - feeds = @{ - gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), - resultPlaceholder.getMPSGraphTensor() : resultPlaceholder.getMPSGraphTensorData() - }; - else - feeds = @{ - gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() - }; - + NSDictionary* feeds = @{ + gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), + selfOrResultPlaceholder.getMPSGraphTensor() : selfOrResultPlaceholder.getMPSGraphTensorData() + }; NSDictionary* results = @{ gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData() }; @@ -1840,7 +1819,7 @@ Tensor prelu_mps(const Tensor& self, const Tensor& weight_) { using namespace mps; Tensor grad_input = at::empty_like(self, self.suggest_memory_format()); - Tensor weight_grad = at::empty_like(weight_, at::MemoryFormat::Contiguous); + Tensor weight_grad = at::empty_like(self, at::MemoryFormat::Contiguous); if (grad_output.numel() == 0) { return std::tuple{grad_input, weight_grad}; } diff --git a/aten/src/ATen/native/mps/operations/BinaryOps.mm b/aten/src/ATen/native/mps/operations/BinaryOps.mm index c730eccfe944e..6569e59086fc9 100644 --- a/aten/src/ATen/native/mps/operations/BinaryOps.mm +++ b/aten/src/ATen/native/mps/operations/BinaryOps.mm @@ -26,6 +26,8 @@ void binaryOpTensor(const Tensor& self, const Tensor& other, const Scalar& alpha, const Tensor& output_, std::string op_name, BinaryOpBlock binaryBlock) { + TORCH_CHECK(!(!is_macos_13_or_newer() && self.scalar_type() == ScalarType::Byte ), + "MPS support binary op with uint8 natively starting from macOS 13.0"); TORCH_CHECK(!(op_name == "power" && !is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_2_PLUS) && (self.scalar_type() == ScalarType::Long || (other.scalar_type() == ScalarType::Long && (self.scalar_type() != ScalarType::Half && self.scalar_type() != ScalarType::Float)))), diff --git a/aten/src/ATen/native/mps/operations/Convolution.mm b/aten/src/ATen/native/mps/operations/Convolution.mm index 3cd442099f5ca..66c6eac098d8f 100644 --- a/aten/src/ATen/native/mps/operations/Convolution.mm +++ b/aten/src/ATen/native/mps/operations/Convolution.mm @@ -56,14 +56,15 @@ void fill_conv_desc(MPSGraphConvolution2DOpDescriptor* descriptor_, descriptor_.groups = groups; } -Tensor _mps_convolution( +Tensor _mps_convolution_impl( const Tensor& input_t, const Tensor& weight_t, const c10::optional& bias_opt, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups) { + int64_t groups, + c10::optional input_shape) { TORCH_CHECK(input_t.dim() < 5, "Conv3D is not supported on MPS"); namespace native_mps = at::native::mps; @@ -82,7 +83,16 @@ Tensor _mps_convolution( auto memory_format = input_t.suggest_memory_format(); bool is_channels_last = (memory_format == at::MemoryFormat::ChannelsLast); + bool gather_input_data = true; + // Perform the convolution directly in NCHW if the tensor is already contiguous in memory + if (is_channels_last && input_t.is_contiguous(memory_format)) { + is_channels_last = false; + gather_input_data = false; + memory_format = MemoryFormat::Contiguous; + } auto output_t = at::empty( + input_shape.has_value() ? + input_shape.value() : conv_output_size(input->sizes(), weight->sizes(), padding, stride, dilation), input->scalar_type(), @@ -212,7 +222,7 @@ Tensor _mps_convolution( cachedGraph = static_cast(tmpCachedGraph); } - auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t, inputShape); + auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t, inputShape, gather_input_data); auto weightsPlaceholder = native_mps::Placeholder(cachedGraph->weightTensor_, weight_t); auto biasPlaceholder = native_mps::Placeholder(); // Reshape the bias to be broadcastable with output of conv2d @@ -237,21 +247,35 @@ Tensor _mps_convolution( return *output; } +Tensor _mps_convolution( + const Tensor& input_t, + const Tensor& weight_t, + const c10::optional& bias_opt, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups) { + return _mps_convolution_impl(input_t, weight_t, bias_opt, padding, stride, dilation, groups, c10::nullopt); +} + Tensor mps_convolution_backward_input( - IntArrayRef input_size, const Tensor& grad_output_, const Tensor& weight_, + IntArrayRef input_size, const Tensor& grad_output_t, const Tensor& weight_t, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool bias_defined) { namespace native_mps = at::native::mps; using namespace mps; CheckedFrom c = "mps_convolution_backward_input"; - TensorArg grad_output{ grad_output_, "grad_output", 1 }, - weight{ weight_, "weight", 2 }; + TensorArg grad_output{ grad_output_t, "grad_output", 1 }, + weight{ weight_t, "weight", 2 }; checkAllSameType(c, {grad_output, weight}); checkAllSameGPU(c, {grad_output, weight}); - auto memory_format = grad_output_.suggest_memory_format(); + auto memory_format = grad_output_t.suggest_memory_format(); bool is_channels_last = (memory_format == at::MemoryFormat::ChannelsLast); - Tensor grad_output_t = grad_output_.contiguous(memory_format); - Tensor weight_t = weight_.contiguous(memory_format); - MPSShape* weightShape = getMPSShape(weight_); + bool gather_input_data = true; + if (is_channels_last && grad_output_t.is_contiguous(memory_format)) { + is_channels_last = false; + gather_input_data = false; + memory_format = MemoryFormat::Contiguous; + } auto grad_input_t = at::empty( input_size, grad_output_t.options(), c10::nullopt); // Avoid "grad_input" when this is being used as transposed convolution @@ -327,10 +351,10 @@ Tensor mps_convolution_backward_input( } MPSGraphTensor* gradOutputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, native_mps::getMPSScalarType(grad_output_t.scalar_type()), gradOutputShape); - MPSGraphTensor* weightTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, native_mps::getMPSScalarType(weight_t.scalar_type()), weightShape); + MPSGraphTensor* weightTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, weight_t); MPSGraphTensor *gradOutputTensorTranspose = gradOutputTensor; - if (is_channels_last && grad_output_t.is_contiguous() && !grad_output_t.is_view()) { + if (is_channels_last) { gradOutputTensorTranspose = mps::convertNHWCtoNCHW(mpsGraph, gradOutputTensorTranspose); } MPSGraphTensor* gradInputTensor; @@ -358,8 +382,8 @@ Tensor mps_convolution_backward_input( cachedGraph = static_cast(tmpCachedGraph); } - auto gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output_t, gradOutputShape); - auto weightsPlaceholder = Placeholder(cachedGraph->weightTensor_, weight_t, weightShape); + auto gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output_t, gradOutputShape, gather_input_data); + auto weightsPlaceholder = Placeholder(cachedGraph->weightTensor_, weight_t); auto outputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, *grad_input); NSDictionary *feeds = @{ @@ -377,16 +401,19 @@ Tensor mps_convolution_backward_input( } Tensor mps_convolution_backward_weights( - IntArrayRef weight_size, const Tensor& grad_output_, const Tensor& input_, + IntArrayRef weight_size, const Tensor& grad_output_t, const Tensor& input_t, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool bias_defined) { namespace native_mps = at::native::mps; using namespace mps; CheckedFrom c = "mps_convolution_backward_weights"; - auto memory_format = input_.suggest_memory_format(); + auto memory_format = grad_output_t.suggest_memory_format(); bool is_channels_last = (memory_format == at::MemoryFormat::ChannelsLast); - - auto grad_output_t = grad_output_.to(memory_format); - auto input_t = input_.to(memory_format); + bool gather_input_data = true; + if (is_channels_last && input_t.is_contiguous(memory_format)) { + is_channels_last = false; + gather_input_data = false; + memory_format = MemoryFormat::Contiguous; + } MPSShape* gradOutputShape = mps::getMPSShape(grad_output_t, memory_format); @@ -475,7 +502,7 @@ Tensor mps_convolution_backward_weights( MPSGraphTensor* inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input_t); MPSGraphTensor *gradOutputTensorTranspose = gradOutputTensor; - if (is_channels_last && grad_output_t.is_contiguous() && !grad_output_t.is_view()) { + if (is_channels_last) { gradOutputTensorTranspose = mps::convertNHWCtoNCHW(mpsGraph, gradOutputTensorTranspose); } @@ -505,8 +532,8 @@ Tensor mps_convolution_backward_weights( cachedGraph = static_cast(tmpCachedGraph); } - auto gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output_t, gradOutputShape); - auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input_t); + auto gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output_t, gradOutputShape, gather_input_data); + auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input_t, nil, gather_input_data); auto outputPlaceholder = Placeholder(cachedGraph->gradWeightTensor_, grad_weight_t); NSDictionary *feeds = @{ @@ -525,12 +552,9 @@ Tensor mps_convolution_backward_weights( } std::tuple mps_convolution_backward( - const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight, + const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, std::array output_mask) { - - Tensor grad_output = grad_output_t.contiguous(input.suggest_memory_format()); - Tensor grad_input, grad_weight, grad_bias; if (input.numel() == 0) { if (output_mask[0]) { @@ -576,10 +600,10 @@ Tensor _mps_convolution_transpose( Tensor mps_convolution_transpose_backward_input( const Tensor& grad_output_t, const Tensor& weight_t, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups) + int64_t groups, IntArrayRef input_shape) { - return at::_mps_convolution( - grad_output_t, weight_t, c10::nullopt, padding, stride, dilation, groups); + return _mps_convolution_impl( + grad_output_t, weight_t, c10::nullopt, padding, stride, dilation, groups, input_shape); } Tensor mps_convolution_transpose_backward_weight( @@ -595,15 +619,12 @@ Tensor mps_convolution_transpose_backward_weight( std::tuple mps_convolution_transpose_backward( - const Tensor& input, const Tensor& grad_output_t, const Tensor& weight, + const Tensor& input, const Tensor& grad_output, const Tensor& weight, IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, std::array output_mask) { - - Tensor grad_output = grad_output_t.contiguous(input.suggest_memory_format()); - Tensor grad_input, grad_weight; if (output_mask[0]) { - grad_input = mps_convolution_transpose_backward_input(grad_output, weight, padding, stride, dilation, groups); + grad_input = mps_convolution_transpose_backward_input(grad_output, weight, padding, stride, dilation, groups, input.sizes()); } if (output_mask[1]) { grad_weight = mps_convolution_transpose_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups); diff --git a/aten/src/ATen/native/mps/operations/Copy.mm b/aten/src/ATen/native/mps/operations/Copy.mm index e4c673145adaa..16f5718dd29c0 100644 --- a/aten/src/ATen/native/mps/operations/Copy.mm +++ b/aten/src/ATen/native/mps/operations/Copy.mm @@ -251,8 +251,11 @@ void copy_blit_mps(void* dst, const void* src, size_t size) { bool returnGatherOutput = dst_.is_contiguous(); Tensor src; auto sameMemFormat = src_.is_contiguous(dst_.suggest_memory_format()) && dst_.is_contiguous(dst_.suggest_memory_format()); + const bool sameDataType = src_.dtype() == dst_.dtype(); - if (!src_.is_contiguous(MemoryFormat::Contiguous) && !sameMemFormat) { + if ((!src_.is_contiguous(MemoryFormat::Contiguous) && !sameMemFormat) || + // the copy_cast path requires storage_offset to be applied before casting + (src_.storage_offset() && !sameDataType)) { Tensor emptyShell = Tensor(); src = gatherViewTensor(src_, returnGatherOutput ? dst_ : emptyShell); @@ -282,7 +285,7 @@ void copy_blit_mps(void* dst, const void* src, size_t size) { src._set_neg(src_.is_neg()); const size_t src_size = src.nbytes(); - if (src.dtype() == dst_.dtype()) { + if (sameDataType) { MPSStream* stream = getCurrentMPSStream(); // for GPU to GPU copies we only encode to stream's command buffer (no flushing) stream->copy(sourceBuffer, destBuffer, src_size, src_byte_offset, dst_byte_offset); @@ -297,22 +300,27 @@ void copy_blit_mps(void* dst, const void* src, size_t size) { TORCH_CHECK(dst.defined(), "dst is undefined"); TORCH_CHECK(src.defined(), "src is undefined"); + bool needs_broadcasting = false; + if (src.numel() == 0 || dst.is_same(src)) { return dst; } if (dst.numel() == 0) { dst.resize_as_(src); } + if (dst.dim() > src.dim()) { + needs_broadcasting = true; + } if (src.device().type() == at::kMPS && dst.device().type() == at::kCPU) { - return copy_from_mps_(dst, src, non_blocking); + return copy_from_mps_(dst, needs_broadcasting ? src.expand_as(dst) : src, non_blocking); } if (src.device().type() == at::kCPU && dst.device().type() == at::kMPS) { - return copy_to_mps_(dst, src, non_blocking); + return copy_to_mps_(dst, needs_broadcasting ? src.expand_as(dst) : src, non_blocking); } if (src.device().type() == at::kMPS && dst.device().type() == at::kMPS) { - return copy_kernel_mps(dst, src, non_blocking); + return copy_kernel_mps(dst, needs_broadcasting ? src.expand_as(dst) : src, non_blocking); } TORCH_INTERNAL_ASSERT( src.device().type() == DeviceType::MPS, diff --git a/aten/src/ATen/native/mps/operations/Indexing.mm b/aten/src/ATen/native/mps/operations/Indexing.mm index 310cbb7bf9370..8522ac920275f 100644 --- a/aten/src/ATen/native/mps/operations/Indexing.mm +++ b/aten/src/ATen/native/mps/operations/Indexing.mm @@ -886,19 +886,31 @@ Tensor embedding_dense_backward_mps( MPSGraphTensor* reshapedIndicesTensor = indicesTensor; + MPSGraphTensor* castGradTensor = incomingGradTensor; + MPSDataType dataType = mps::getMPSDataType(grad_.scalar_type()); + // issue 105486100, scatterNDWithUpdatesTensor produces wrong result for float16 + if (dataType == MPSDataTypeFloat16) { + castGradTensor = [mpsGraph castTensor: incomingGradTensor + toType: MPSDataTypeFloat32 + name: @"castGradTensor"]; + } if (num_indices_dims != 0) { reshapedIndicesTensor = [mpsGraph expandDimsOfTensor: indicesTensor axes: @[@-1] name: nil]; } - auto outgoingGradTensor = [mpsGraph scatterNDWithUpdatesTensor: incomingGradTensor + auto outgoingGradTensor = [mpsGraph scatterNDWithUpdatesTensor: castGradTensor indicesTensor: reshapedIndicesTensor shape: native_mps::getMPSShape(IntArrayRef(outgoing_gradient_shape)) batchDimensions: 0 mode: MPSGraphScatterModeAdd name: @"edb"]; - + if (dataType == MPSDataTypeFloat16) { + outgoingGradTensor = [mpsGraph castTensor: outgoingGradTensor + toType: MPSDataTypeFloat16 + name: @"castGradTensor"]; + } newCachedGraph->incomingGradTensor_ = incomingGradTensor; newCachedGraph->indicesTensor_ = indicesTensor; newCachedGraph->outgoingGradTensor_ = outgoingGradTensor; diff --git a/aten/src/ATen/native/mps/operations/Pooling.mm b/aten/src/ATen/native/mps/operations/Pooling.mm index 2b9272d467595..08727fed8265c 100644 --- a/aten/src/ATen/native/mps/operations/Pooling.mm +++ b/aten/src/ATen/native/mps/operations/Pooling.mm @@ -83,6 +83,7 @@ static void pool2d_template(const Tensor& input, const Tensor& output, pool2d_shape_check(input, kH, kW, dH, dW, padH, padW, dilationH, dilationW, nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, memory_format); + auto output_memory_format = output.suggest_memory_format(); // the output and indices are 'empty', so we could avoid unnecessary gatherView on empty tensors // by simply restriding them (instead of calling the costly Contiguous()). if (indices.suggest_memory_format() == MemoryFormat::ChannelsLast) { @@ -94,8 +95,9 @@ static void pool2d_template(const Tensor& input, const Tensor& output, outputSizes.insert(outputSizes.begin(), nbatch); } output.resize_(outputSizes); - } else if (output.suggest_memory_format() == MemoryFormat::ChannelsLast) { + } else if (output_memory_format == MemoryFormat::ChannelsLast) { output.unsafeGetTensorImpl()->empty_tensor_restride(MemoryFormat::Contiguous); + output_memory_format = MemoryFormat::Contiguous; } if (output.numel() == 0 || (is_backward_pass && grad_output.numel() == 0)) { @@ -196,6 +198,10 @@ static void pool2d_template(const Tensor& input, const Tensor& output, } runMPSGraph(mpsStream, cachedGraph->graph(), feeds, results); + + if (output_memory_format != suggested_memory_format) { + const_cast(output) = output.to(suggested_memory_format); + } } } @@ -356,6 +362,8 @@ Tensor mps_max_pool2d_backward( const Tensor& output, const Tensor& indices) { + auto indices_memory_format = indices.suggest_memory_format(); + mps::PoolingOpBlock pooling_op_block = ^PoolingOpFn(cachedGraph, desc) { MPSGraph* mpsGraph = cachedGraph.graph(); NSArray* poolOutputs = [mpsGraph maxPooling2DReturnIndicesWithSourceTensor: cachedGraph.inputTensor @@ -366,6 +374,10 @@ Tensor mps_max_pool2d_backward( }; mps::pool2d_template(input, output, indices, c10::nullopt, kernel_size, stride, padding, dilation, ceil_mode, false, c10::nullopt, pooling_op_block, "max_pool2d_indices"); + + if (indices_memory_format == MemoryFormat::ChannelsLast) { + const_cast(indices) = indices.to(MemoryFormat::ChannelsLast); + } } TORCH_IMPL_FUNC(max_pool2d_with_indices_backward_out_mps)( diff --git a/aten/src/ATen/native/mps/operations/ReduceOps.mm b/aten/src/ATen/native/mps/operations/ReduceOps.mm index f858714fb82d5..f47dd910dc234 100644 --- a/aten/src/ATen/native/mps/operations/ReduceOps.mm +++ b/aten/src/ATen/native/mps/operations/ReduceOps.mm @@ -139,6 +139,8 @@ void reduction_out_mps( MPSReductionType reduction_type, const std::string& func_name) { + // issue 103641234, reduction ops does not have int64 support + TORCH_WARN_ONCE(input_t.scalar_type() != ScalarType::Long, "MPS: no support for int64 reduction ops, casting it to int32"); IntArrayRef input_shape = input_t.sizes(); if (opt_dim.has_value()) { @@ -163,6 +165,9 @@ void reduction_out_mps( if (reduction_type == MPSReductionType::PROD) { output_t.fill_(1); } + else if (reduction_type == MPSReductionType::SUM) { + output_t.zero_(); + } return; } @@ -197,7 +202,10 @@ void reduction_out_mps( (dtype.value() == kFloat || dtype.value() == kHalf || dtype.value() == kInt)) { inputCastDtype = getMPSDataType(dtype.value()); } else if (input_type != MPSDataTypeInt32 && - input_type != MPSDataTypeFloat32) { + input_type != MPSDataTypeFloat32 && + input_type != MPSDataTypeFloat16) { + inputCastDtype = MPSDataTypeFloat32; + } else if (!is_macos_13_or_newer() && input_type == MPSDataTypeFloat16) { inputCastDtype = MPSDataTypeFloat32; } @@ -241,7 +249,7 @@ void reduction_out_mps( axes:wrappedAxes name:nil]; } else if (reduction_type == MPSReductionType::TRACE) { - MPSGraphTensor *bandPartWithTensor = [mpsGraph bandPartWithTensor:inputTensor + MPSGraphTensor *bandPartWithTensor = [mpsGraph bandPartWithTensor:castInputTensor numLower:0 numUpper:0 name:nil]; diff --git a/aten/src/ATen/native/mps/operations/RnnOps.mm b/aten/src/ATen/native/mps/operations/RnnOps.mm index d46ce356318e2..bee82fcc24803 100644 --- a/aten/src/ATen/native/mps/operations/RnnOps.mm +++ b/aten/src/ATen/native/mps/operations/RnnOps.mm @@ -30,10 +30,15 @@ std::vector biases; std::vector recurrent_biases; for (size_t i = 0; i < num_layers; i+=1) { - kernel_weights.push_back(params[i*4]); - recurrent_kernel_weights.push_back(params[i*4+1]); - biases.push_back(params[i*4+2]); - recurrent_biases.push_back(params[i*4+3]); + if (has_biases) { + kernel_weights.push_back(params[i*4]); + recurrent_kernel_weights.push_back(params[i*4+1]); + biases.push_back(params[i*4+2]); + recurrent_biases.push_back(params[i*4+3]); + } else { + kernel_weights.push_back(params[i*2]); + recurrent_kernel_weights.push_back(params[i*2+1]); + } } struct CachedGraph : public MPSCachedGraph { @@ -71,8 +76,10 @@ for (size_t i = 0; i < num_layers; i += 1) { [kernelWeightsList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), getMPSShape(kernel_weights[i]))]; [recurrentKernelWeightsList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_kernel_weights[i]))]; - [kernelBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(biases[i]))]; - [recurrentBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_biases[i]))]; + if(has_biases) { + [kernelBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(biases[i]))]; + [recurrentBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_biases[i]))]; + } } MPSGraphLSTMDescriptor * opDesc = [MPSGraphLSTMDescriptor descriptor]; @@ -109,9 +116,12 @@ NSMutableArray* outputZStateArray = [[NSMutableArray alloc] initWithCapacity:num_layers]; NSMutableArray* outputCellStateFwdArray = [[NSMutableArray alloc] initWithCapacity:num_layers]; for(int i = 0; i < num_layers; i++) { - MPSGraphTensor* biasTensor = [mpsGraph additionWithPrimaryTensor:kernelBiasList[i] - secondaryTensor:recurrentBiasList[i] - name:nil]; + MPSGraphTensor* biasTensor = nil; + if(has_biases) { + biasTensor = [mpsGraph additionWithPrimaryTensor:kernelBiasList[i] + secondaryTensor:recurrentBiasList[i] + name:nil]; + } outputs = [mpsGraph LSTMWithSourceTensor:inputTensor_ recurrentWeight:recurrentKernelWeightsList[i] inputWeight:kernelWeightsList[i] @@ -121,7 +131,6 @@ descriptor:opDesc name:nil]; - stateTensor_ = [mpsGraph sliceTensor:stateTensor dimension:0 start:i @@ -196,12 +205,14 @@ for (size_t i = 0; i < num_layers; i+=1) { kernelWeight = Placeholder([kernelWeightsList objectAtIndex:i], kernel_weights[i]); recurrentKernelWeight = Placeholder([recurrentKernelWeightsList objectAtIndex:i], recurrent_kernel_weights[i]); - bias = Placeholder([biasList objectAtIndex:i], biases[i]); - recurrentBias = Placeholder([recurrentBiasList objectAtIndex:i], recurrent_biases[i]); [feeds setObject:kernelWeight.getMPSGraphTensorData() forKey:kernelWeight.getMPSGraphTensor()]; [feeds setObject:recurrentKernelWeight.getMPSGraphTensorData() forKey:recurrentKernelWeight.getMPSGraphTensor()]; - [feeds setObject:bias.getMPSGraphTensorData() forKey:bias.getMPSGraphTensor()]; - [feeds setObject:recurrentBias.getMPSGraphTensorData() forKey:recurrentBias.getMPSGraphTensor()]; + if(has_biases) { + bias = Placeholder([biasList objectAtIndex:i], biases[i]); + recurrentBias = Placeholder([recurrentBiasList objectAtIndex:i], recurrent_biases[i]); + [feeds setObject:bias.getMPSGraphTensorData() forKey:bias.getMPSGraphTensor()]; + [feeds setObject:recurrentBias.getMPSGraphTensorData() forKey:recurrentBias.getMPSGraphTensor()]; + } } Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensors_[0], input); @@ -250,10 +261,15 @@ std::vector biases; std::vector recurrent_biases; for (size_t i = 0; i < num_layers; i+=1) { - kernel_weights.push_back(params[i*4]); - recurrent_kernel_weights.push_back(params[i*4+1]); - biases.push_back(params[i*4+2]); - recurrent_biases.push_back(params[i*4+3]); + if(has_biases) { + kernel_weights.push_back(params[i*4]); + recurrent_kernel_weights.push_back(params[i*4+1]); + biases.push_back(params[i*4+2]); + recurrent_biases.push_back(params[i*4+3]); + } else { + kernel_weights.push_back(params[i*2]); + recurrent_kernel_weights.push_back(params[i*2+1]); + } } struct CachedGraph : public MPSCachedGraph { @@ -296,8 +312,10 @@ for (size_t i = 0; i < num_layers; i += 1) { [kernelWeightsList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), getMPSShape(kernel_weights[i]))]; [recurrentKernelWeightsList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_kernel_weights[i]))]; - [kernelBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(biases[i]))]; - [recurrentBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_biases[i]))]; + if(has_biases) { + [kernelBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(biases[i]))]; + [recurrentBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_biases[i]))]; + } } MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), getMPSShape(input)); @@ -349,9 +367,15 @@ cellStateFwd = [mpsGraph squeezeTensor:cellStateFwd axis:0 name:nil]; - MPSGraphTensor* biasTensor = [mpsGraph additionWithPrimaryTensor:kernelBiasList[i] - secondaryTensor:recurrentBiasList[i] - name:nil]; + MPSGraphTensor* biasTensor = nil; + if(has_biases) { + biasTensor = [mpsGraph additionWithPrimaryTensor:kernelBiasList[i] + secondaryTensor:recurrentBiasList[i] + name:nil]; + } else { + biasTensor = [mpsGraph constantWithScalar:0.0 + dataType:inputTensor.dataType]; + } MPSGraphTensor* stateTensor_ = [mpsGraph sliceTensor:stateTensor dimension:0 @@ -391,7 +415,6 @@ descriptor: opDesc name: nil]; - gradientTensor_ = [outputs objectAtIndex:0]; [gradOutputArray addObject:[outputs objectAtIndex:0]]; [gradRecWeightsArray addObject:[outputs objectAtIndex:1]]; @@ -445,18 +468,20 @@ for (size_t i = 0; i < num_layers; i+=1) { kernelWeight = Placeholder([kernelWeightsList objectAtIndex:i], kernel_weights[i]); recurrentKernelWeight = Placeholder([recurrentKernelWeightsList objectAtIndex:i], recurrent_kernel_weights[i]); - bias = Placeholder([biasList objectAtIndex:i], biases[i]); - recurrentBias = Placeholder([recurrentBiasList objectAtIndex:i], recurrent_biases[i]); [feeds setObject:kernelWeight.getMPSGraphTensorData() forKey:kernelWeight.getMPSGraphTensor()]; [feeds setObject:recurrentKernelWeight.getMPSGraphTensorData() forKey:recurrentKernelWeight.getMPSGraphTensor()]; - [feeds setObject:bias.getMPSGraphTensorData() forKey:bias.getMPSGraphTensor()]; - [feeds setObject:recurrentBias.getMPSGraphTensorData() forKey:recurrentBias.getMPSGraphTensor()]; + if(has_biases) { + bias = Placeholder([biasList objectAtIndex:i], biases[i]); + recurrentBias = Placeholder([recurrentBiasList objectAtIndex:i], recurrent_biases[i]); + [feeds setObject:bias.getMPSGraphTensorData() forKey:bias.getMPSGraphTensor()]; + [feeds setObject:recurrentBias.getMPSGraphTensorData() forKey:recurrentBias.getMPSGraphTensor()]; + } } Tensor output = at::empty_like(input); Tensor grad_rec_weights = at::empty_like(recurrent_kernel_weights[0]); Tensor grad_weights = at::empty_like(kernel_weights[0]); - Tensor grad_bias = at::empty_like(biases[0]); + Tensor grad_bias = at::empty((kernel_weights[0].size(0)), kernel_weights[0].options()); Tensor grad_state = at::empty_like(hx[0]); Tensor grad_cell_state = at::empty_like(hx[1]); Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensors_[0], output); @@ -482,13 +507,15 @@ Tensor output = at::empty_like(input); Tensor grad_rec_weights = at::empty_like(recurrent_kernel_weights[i]); Tensor grad_weights = at::empty_like(kernel_weights[i]); - Tensor grad_bias = at::empty_like(biases[i]); + Tensor grad_bias = at::empty((kernel_weights[0].size(0)), kernel_weights[0].options()); Tensor grad_state = at::empty_like(hx[0]); Tensor grad_cell_state = at::empty_like(hx[1]); weights.push_back(grad_weights); weights.push_back(grad_rec_weights); - weights.push_back(grad_bias); - weights.push_back(grad_bias); + if(has_biases) { + weights.push_back(grad_bias); + weights.push_back(grad_bias); + } gradOutPlaceholder = Placeholder([gradOutputArray objectAtIndex:i], output); gradRecWeightsPlaceholder = Placeholder([gradRecWeightsArray objectAtIndex:i], grad_rec_weights); gradWeightsPlaceholder = Placeholder([gradWeightsArray objectAtIndex:i], grad_weights); diff --git a/aten/src/ATen/native/mps/operations/UnaryOps.mm b/aten/src/ATen/native/mps/operations/UnaryOps.mm index a869ff3379aa8..0c6e5b06d0898 100644 --- a/aten/src/ATen/native/mps/operations/UnaryOps.mm +++ b/aten/src/ATen/native/mps/operations/UnaryOps.mm @@ -16,6 +16,8 @@ bool is_empty_tensor(const Tensor& self) { void unary_op(const Tensor& self, const Tensor& output, std::string op_name, UnaryOpBlock unaryBlock, is_noop_p is_noop = is_empty_tensor) { + TORCH_CHECK(!(!is_macos_13_or_newer() && self.scalar_type() == ScalarType::Byte ), + "MPS support unary op with uint8 natively starting from macOS 13.0"); if (!output.is_same_size(self)) { output.resize_(self.sizes()); } diff --git a/aten/src/ATen/native/mps/operations/UpSample.mm b/aten/src/ATen/native/mps/operations/UpSample.mm index 17895e19c7d76..3b781dea08f48 100644 --- a/aten/src/ATen/native/mps/operations/UpSample.mm +++ b/aten/src/ATen/native/mps/operations/UpSample.mm @@ -26,6 +26,11 @@ void upsample_out_template(const Tensor& input, } else { native::upsample_2d_common_check(input.sizes(), output_size); } + Tensor out; + if (!output.is_contiguous()) { + out = at::empty_like(output, MemoryFormat::Contiguous); + } + bool centerResults = false; MPSGraphResizeMode resizeMode = MPSGraphResizeNearest; MPSGraphResizeNearestRoundingMode nearestRoundingMode = MPSGraphResizeNearestRoundingModeFloor; @@ -199,7 +204,7 @@ void upsample_out_template(const Tensor& input, MPSGraphTensorData* sizeTensorData = [[[MPSGraphTensorData alloc] initWithMPSNDArray: sizeNDArray] autorelease]; Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor, input); - Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor, output); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor, out.has_storage() ? out : output, nil, false); NSDictionary* feeds = @{ inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), @@ -209,6 +214,10 @@ void upsample_out_template(const Tensor& input, outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() }; runMPSGraph(stream, cachedGraph->graph(), feeds, results); + + if (out.has_storage()) { + output.copy_(out); + } } } diff --git a/test/cuda_results.yaml b/test/cuda_results.yaml new file mode 100644 index 0000000000000..bc6e0948ae569 --- /dev/null +++ b/test/cuda_results.yaml @@ -0,0 +1,102 @@ +ConsistencyTest: { + nn.functional.conv_transpose2d: + [[[7.399066925048828, 4.4053635597229, -25.85348129272461, + 58.88909149169922, -88.75193786621094, -18.98126983642578, 9.437820434570312], + [-59.78305435180664, -65.34088134765625, -108.04747009277344, 196.6062469482422, + 71.39350891113281, 37.8786735534668, -69.55322265625], [92.78504943847656, + 91.24403381347656, -94.33301544189453, 9.261059761047363, -182.10206604003906, + 141.4270477294922, 146.89010620117188], [-14.363212585449219, 43.454036712646484, + -76.1098403930664, 242.9479522705078, 198.1458282470703, -49.77315139770508, + 5.891449451446533], [-43.56822967529297, 4.782844066619873, -29.526945114135742, + 65.15388488769531, 161.29757690429688, 118.60847473144531, 27.08570671081543], + [68.29853057861328, -11.507468223571777, 2.044086217880249, 11.003862380981445, + 34.993282318115234, -21.256723403930664, 91.49512481689453], [-70.4466781616211, + 69.04386138916016, 7.764842987060547, 7.61972713470459, -28.99899673461914, + 54.575748443603516, -5.762258052825928]], [[-36.238487243652344, 37.29551696777344, + -22.012331008911133, -30.1353702545166, 33.82851028442383, 33.00322341918945, + 2.7218000888824463], [-7.999058246612549, 122.72489929199219, -1.0639530420303345, + 2.9564287662506104, -143.1276092529297, -110.75650024414062, 48.0764274597168], + [-91.0599136352539, -11.656601905822754, 69.62447357177734, 88.12522888183594, + 337.3008728027344, -76.9416732788086, -110.24406433105469], [-108.1512451171875, + 98.42401123046875, 142.46144104003906, -127.48089599609375, -3.367496967315674, + 86.82833099365234, 86.29623413085938], [-14.339198112487793, -52.287410736083984, + 171.43614196777344, 200.14817810058594, 200.35476684570312, -189.4150390625, + -46.86980056762695], [30.196495056152344, 25.22877311706543, 95.29426574707031, + 4.455311298370361, 118.48747253417969, 87.11080932617188, -83.6124038696289], + [-2.5434072017669678, 91.8791732788086, -10.615175247192383, -12.58531379699707, + -49.3439826965332, 33.37324523925781, -5.983145713806152]], [[4.551003932952881, + 15.84842586517334, -46.354671478271484, 14.721636772155762, 39.01048278808594, + 49.70054244995117, -18.268564224243164], [16.728954315185547, 129.43505859375, + -4.6139116287231445, -3.382319688796997, -238.76353454589844, 13.42194938659668, + 40.393280029296875], [-2.335604429244995, -85.94283294677734, -142.2253875732422, + 135.27537536621094, 18.01512336730957, -26.331714630126953, -33.35443878173828], + [-79.17593383789062, -93.72674560546875, -110.94194030761719, -61.455223083496094, + 6.811624526977539, 129.06478881835938, 12.435402870178223], [10.859378814697266, + 41.3059196472168, 143.55824279785156, -41.754737854003906, -235.32406616210938, + -70.98460388183594, 130.46929931640625], [193.57574462890625, -142.5060272216797, + -102.45012664794922, 124.68048095703125, 136.05215454101562, -9.650590896606445, + -45.59521484375], [-37.829593658447266, 39.12519454956055, 9.293094635009766, + -18.8004093170166, -0.7294210195541382, 51.884910583496094, 36.15913391113281]], + [[-15.651233673095703, 16.31340980529785, -26.752052307128906, 6.281721115112305, + 43.765541076660156, -13.097319602966309, -30.443206787109375], [10.67841911315918, + 66.1829605102539, -9.394262313842773, -131.45101928710938, -38.621002197265625, + 65.9507064819336, 48.76960372924805], [-76.0918197631836, -9.108996391296387, + 13.64936637878418, 96.7411880493164, 124.2474365234375, -111.50318145751953, + -42.397071838378906], [-83.31562805175781, 32.27967071533203, 250.08163452148438, + 58.24131393432617, 129.95318603515625, -10.683560371398926, -123.84668731689453], + [-11.536887168884277, -15.220125198364258, 197.18821716308594, -31.680112838745117, + -81.35874938964844, 157.96974182128906, 105.61251831054688], [78.15926361083984, + -84.49744415283203, -73.91180419921875, 86.370361328125, 77.87918090820312, + 55.3555908203125, -7.273794651031494], [25.232547760009766, 30.352109909057617, + 53.722267150878906, 44.87421798706055, 44.618812561035156, 4.511796951293945, + 9.039834976196289]]] +} +UnitTest: { + norm: + [ + { + dtype: f16, + args: [[[ 8.9453, 4.0859, 0.1230, 2.1367, -5.0000], + [ 7.2773, -4.6953, -3.5586, 8.2812, -0.8789], + [ 0.7119, -1.4854, 6.8633, -7.9805, -3.6562], + [-1.0195, -7.2695, -0.0264, -3.5078, -0.2900], + [ 8.7656, 5.8984, -2.3125, -0.0352, 5.2812]],], + params: [0.5,], + res: [2000.] + }, + { + dtype: f16, + args: [[[[ 8.9219, 3.0508, -3.0234, -5.6250, -5.3516], + [-5.8906, 5.2109, -7.2500, 7.3047, -0.1846], + [-2.1367, -8.8047, -3.4727, -3.0859, 4.9062], + [ 2.1797, -8.5078, 6.1445, -5.0547, 2.8828], + [-2.6191, 4.6680, -4.1758, 8.7734, -5.4844]], + + [[-5.8984, 7.3281, -7.3672, -0.0879, 7.0039], + [ 2.0117, -6.4258, 8.6250, 2.5137, -2.2676], + [-7.2578, 1.6875, 7.8750, 7.5078, 0.8350], + [-4.8164, -3.6914, -3.9199, 4.9219, -4.6680], + [ 5.0547, -7.1289, 2.3633, 3.7793, -7.4375]], + + [[-8.6953, -3.8750, 0.8965, -4.4453, 6.1328], + [ 8.6719, 2.5586, -3.0664, -7.7891, 2.5234], + [ 5.8008, 0.5977, 4.9219, 3.0156, 3.6211], + [-6.0898, -3.4883, 2.6543, 7.1992, 5.9414], + [-3.6035, 8.3906, 2.2070, -1.1162, 7.2852]], + + [[-2.4531, -2.9180, 6.2422, -6.3711, -8.3516], + [ 3.3398, -8.5078, -8.9375, -2.0312, -4.3320], + [-1.4326, -4.5000, -0.3252, -6.8555, -8.2969], + [ 5.8438, 5.6094, -6.6797, -0.0439, 3.6035], + [ 4.5859, 7.1016, -0.8086, 5.6953, 0.5098]], + + [[ 3.0859, 4.4844, 0.6152, 7.9609, -7.6562], + [-0.7998, -3.4023, 5.7734, -2.4785, 5.9219], + [ 7.1094, 1.4502, -7.1289, 4.7188, -4.8359], + [ 2.7422, -1.9512, 5.6602, -3.6387, -8.6953], + [-4.6953, 0.2900, 2.7148, -0.0176, 7.6992]]],], + params: [1.5], + res: [125.2500] + }, + ], +} diff --git a/test/test_modules.py b/test/test_modules.py index 2ae17f5f8cf85..9c244fb65e60b 100644 --- a/test/test_modules.py +++ b/test/test_modules.py @@ -10,12 +10,23 @@ from torch.testing._internal.common_cuda import with_tf32_off from torch.testing._internal.common_device_type import ( instantiate_device_type_tests, onlyCUDA, toleranceOverride, tol, skipMeta) +from torch.testing._internal.common_dtype import get_all_dtypes from torch.testing._internal.common_modules import module_db, modules, TrainEvalMode from torch.testing._internal.common_utils import ( TestCase, run_tests, freeze_rng_state, mock_wrapper, get_tensors_from, gradcheck, - gradgradcheck, skipIfMps, skipIfTorchInductor) + gradgradcheck, skipIfTorchInductor) from unittest.mock import patch, call +MPS_DTYPES = get_all_dtypes() +for t in [torch.double, torch.cdouble, torch.cfloat, torch.int8, torch.bfloat16]: + del MPS_DTYPES[MPS_DTYPES.index(t)] + +def _get_mps_error_msg(device, dtype, op, mps_blocklist): + if torch.backends.mps.is_available() and device == "mps" and dtype not in MPS_DTYPES: + return f"MPS doesn't support {str(dtype)} datatype" + if op.name.startswith(tuple(mps_blocklist)): + return "MPS doesn't support op " + str(op.name) + return None class TestModule(TestCase): _do_cuda_memory_leak_check = True @@ -33,7 +44,8 @@ def _assert_module_parameters_and_buffer_are(self, module, device, dtype): def _check_module(items, name, device=device, dtype=dtype): for item_name, item in items: self.assertEqual( - item.device, device, + # workaround for the tests checking the device (mps:0 with mps) + item.device.type, device.type, f'{name} {item_name} is on device {item.device} instead of the expected device {device}') if item.dtype.is_floating_point: self.assertEqual( @@ -42,9 +54,16 @@ def _check_module(items, name, device=device, dtype=dtype): _check_module(module.named_parameters(), "Parameter") _check_module(module.named_buffers(), "Buffer") - @skipIfMps # the test doesn't work on MPS as double types are not supported @modules(module_db) def test_forward(self, device, dtype, module_info, training): + MPS_BLOCKLIST = [ + "nn.LSTM" # segfault + ] + + msg = _get_mps_error_msg(device, dtype, module_info, MPS_BLOCKLIST) + if msg is not None: + self.skipTest(msg) + module_cls = module_info.module_cls module_inputs = module_info.module_inputs_func(module_info, device=device, dtype=dtype, requires_grad=False, training=training) @@ -84,6 +103,10 @@ def test_forward(self, device, dtype, module_info, training): # They should be applied to any created parameters and buffers. @modules(module_db) def test_factory_kwargs(self, device, dtype, module_info, training): + msg = _get_mps_error_msg(device, dtype, module_info, []) + if msg is not None: + self.skipTest(msg) + module_cls = module_info.module_cls module_inputs = module_info.module_inputs_func(module_info, device=device, dtype=dtype, requires_grad=False, training=training) @@ -198,6 +221,11 @@ def _to_device1(objs): @modules(module_db) def test_repr(self, device, dtype, module_info, training): # Test module can be represented with repr and str without errors. + + msg = _get_mps_error_msg(device, dtype, module_info, []) + if msg is not None: + self.skipTest(msg) + module_cls = module_info.module_cls module_inputs = module_info.module_inputs_func(module_info, device=device, dtype=dtype, requires_grad=False, training=training) @@ -211,10 +239,19 @@ def test_repr(self, device, dtype, module_info, training): m.__repr__() str(m) - @skipIfMps @modules(module_db) def test_pickle(self, device, dtype, module_info, training): # Test that module can be pickled and unpickled. + + MPS_BLOCKLIST = [ + "nn.LSTM" # hard crash + ] + + msg = _get_mps_error_msg(device, dtype, module_info, MPS_BLOCKLIST) + if msg is not None: + self.skipTest(msg) + + module_cls = module_info.module_cls module_inputs = module_info.module_inputs_func(module_info, device=device, dtype=dtype, requires_grad=False, training=training) @@ -249,6 +286,15 @@ def test_pickle(self, device, dtype, module_info, training): def test_check_inplace(self, device, dtype, module_info, training): # Check if the inplace variant of the module gives the same result as the out of place # variant. + + MPS_BLOCKLIST = [ + "nn.ELU" # hard crash + ] + + msg = _get_mps_error_msg(device, dtype, module_info, MPS_BLOCKLIST) + if msg is not None: + self.skipTest(msg) + module_cls = module_info.module_cls module_inputs = module_info.module_inputs_func(module_info, device=device, dtype=dtype, requires_grad=True, training=training) @@ -326,11 +372,21 @@ def inner_zero_grad(obj): obj.grad = None self._traverse_obj(obj, inner_zero_grad) - @skipIfMps @modules(module_db) @skipIfTorchInductor("to be fixed") def test_non_contiguous_tensors(self, device, dtype, module_info, training): # Check modules work with non-contiguous tensors + MPS_BLOCKLIST = [ + # hard crashes + "nn.GRU", + "nn.LSTM", + "nn.RNN" + ] + + msg = _get_mps_error_msg(device, dtype, module_info, MPS_BLOCKLIST) + if msg is not None: + self.skipTest(msg) + module_cls = module_info.module_cls module_inputs = module_info.module_inputs_func(module_info, device=device, dtype=dtype, @@ -582,10 +638,18 @@ def check_backward(cpu_output, gpu_output): for cpu_output, gpu_output in zip(flatten_cpu_outputs, flatten_gpu_outputs): check_backward(cpu_output, gpu_output) - @skipIfMps @modules(module_db) @skipIfTorchInductor("to be fixed") def test_memory_format(self, device, dtype, module_info, training): + MPS_BLOCKLIST = [ + "nn.BatchNorm3d", # failed assert + "nn.LSTM", # segfault + ] + + msg = _get_mps_error_msg(device, dtype, module_info, MPS_BLOCKLIST) + if msg is not None: + self.skipTest(msg) + is_sm86 = device.startswith("cuda") and torch.cuda.get_device_capability(0) == (8, 6) # TODO tighten it to a specific module atol, rtol = (3e-3, 7e-3) if is_sm86 else (None, None) @@ -682,9 +746,12 @@ def inner_check_out_mem_format(output): # Test whether train and eval modes differ for each module. Use to verify # that the ModuleInfo entry flag is correct. - @skipIfMps # the test doesn't work on MPS as double types are not supported @modules(module_db, train_eval_mode=TrainEvalMode.train_only) def test_if_train_and_eval_modes_differ(self, device, dtype, module_info, training): + msg = _get_mps_error_msg(device, dtype, module_info, []) + if msg is not None: + self.skipTest(msg) + module_cls = module_info.module_cls module_inputs = module_info.module_inputs_func(module_info, device=device, dtype=dtype, requires_grad=False, training=training) diff --git a/test/test_mps.py b/test/test_mps.py index b3740b5cd1148..e983760d0951c 100644 --- a/test/test_mps.py +++ b/test/test_mps.py @@ -16,6 +16,8 @@ import torch.nn as nn import torch.nn.functional as F import itertools +import yaml +import platform from collections import defaultdict from torch import inf from torch.nn import Parameter @@ -26,9 +28,10 @@ from torch.testing import make_tensor from torch.testing._comparison import TensorLikePair from torch.testing._internal.common_dtype import get_all_dtypes, integral_types +import torch.mps import torch.backends.mps from torch.distributions import Uniform, Exponential -from functools import partial +from functools import partial, reduce from torch.testing._internal.common_methods_invocations import ( op_db, @@ -62,6 +65,8 @@ TestCase = object # noqa: F811 NNTestCase = object # noqa: F811 +product_version = float('.'.join(platform.mac_ver()[0].split('.')[:2])) + # Determine whether to enable MPS memory leak check (uses same code as CUDA). TEST_MPS_MEM_LEAK_CHECK = os.getenv('PYTORCH_TEST_MPS_MEM_LEAK_CHECK', '0') == '1' @@ -371,6 +376,15 @@ def test_avg_pool2d_ceil_mode(self): class TestMPS(TestCaseMPS): + def help_extra_unit(self, opname, op): + if opname not in OP_UNIT_TEST: + return + for test in OP_UNIT_TEST[opname]: + mps_args = test.sample() + mps_out = op(*mps_args) + mps_out = (mps_out, ) if isinstance(mps_out, torch.Tensor) else mps_out + self.assertEqual(test.expected(), mps_out) + def test_exp(self, device="mps", dtype=torch.float): for v in (2, -2) + ((1j, 1 + 1j) if dtype.is_complex else ()): b = torch.arange(18, device="cpu") / 3 * math.pi @@ -432,6 +446,53 @@ def helper(val, shape): helper(0, [1024]) helper(0.2, [2, 3]) + def test_mm(self): + B = torch.ones(5, 6).to("mps") + C = torch.ones(6, 5).to("mps") + D = torch.mm(B, C).cpu() + torch.testing.assert_close(D, torch.full((5, 5), 6.0)) + + def test_linalg_cross(self): + def helper(dtype): + device = "mps" + if dtype is torch.int32 or dtype is torch.int64: + x = torch.randint(0, 99999, (100, 3, 100), dtype=dtype, device=device) + y = torch.randint(0, 99999, (100, 3, 100), dtype=dtype, device=device) + else: + x = torch.rand(100, 3, 100, dtype=dtype, device=device) + y = torch.rand(100, 3, 100, dtype=dtype, device=device) + x_cpu = x.to("cpu") + y_cpu = y.to("cpu") + res1 = torch.linalg.cross(x, y, dim=1) + res2 = torch.tensor((), dtype=dtype, device=device) + res1_cpu = torch.linalg.cross(x_cpu, y_cpu, dim=1) + res2_cpu = torch.tensor((), dtype=dtype, device="cpu") + torch.linalg.cross(x, y, dim=1, out=res2) + torch.linalg.cross(x_cpu, y_cpu, dim=1, out=res2_cpu) + self.assertEqual(res1, res2) + self.assertEqual(res1, res1_cpu) + self.assertEqual(res2, res2_cpu) + + # test for broadcastable inputs + if dtype is torch.int32 or dtype is torch.int64: + x = torch.randint(0, 99999, (1, 3, 2), dtype=dtype, device=device) + y = torch.randint(0, 99999, (4, 3, 1), dtype=dtype, device=device) + else: + x = torch.rand(1, 3, 2, dtype=dtype, device=device) + y = torch.rand(4, 3, 1, dtype=dtype, device=device) + x_cpu = x.to("cpu") + y_cpu = y.to("cpu") + res1 = torch.linalg.cross(x, y, dim=1) + res2 = torch.tensor((), dtype=dtype, device=device) + res1_cpu = torch.linalg.cross(x_cpu, y_cpu, dim=1) + res2_cpu = torch.tensor((), dtype=dtype, device="cpu") + torch.linalg.cross(x, y, dim=1, out=res2) + torch.linalg.cross(x_cpu, y_cpu, dim=1, out=res2_cpu) + self.assertEqual(res1, res2) + self.assertEqual(res1, res1_cpu) + self.assertEqual(res2, res2_cpu) + [helper(dtype) for dtype in [torch.int32, torch.int64, torch.float32]] + def test_cdist_large(self, device="mps"): for cm in ['use_mm_for_euclid_dist_if_necessary', 'use_mm_for_euclid_dist', 'donot_use_mm_for_euclid_dist']: x = torch.randn(100, 10, device=device) @@ -577,53 +638,6 @@ def test_cdist_norm_batch(self, device="mps"): expected = self._brute_cdist(x, y, p=p) self.assertEqual(expected, actual) - def test_mm(self): - B = torch.ones(5, 6).to("mps") - C = torch.ones(6, 5).to("mps") - D = torch.mm(B, C).cpu() - torch.testing.assert_close(D, torch.full((5, 5), 6.0)) - - def test_linalg_cross(self): - def helper(dtype): - device = "mps" - if dtype is torch.int32 or dtype is torch.int64: - x = torch.randint(0, 99999, (100, 3, 100), dtype=dtype, device=device) - y = torch.randint(0, 99999, (100, 3, 100), dtype=dtype, device=device) - else: - x = torch.rand(100, 3, 100, dtype=dtype, device=device) - y = torch.rand(100, 3, 100, dtype=dtype, device=device) - x_cpu = x.to("cpu") - y_cpu = y.to("cpu") - res1 = torch.linalg.cross(x, y, dim=1) - res2 = torch.tensor((), dtype=dtype, device=device) - res1_cpu = torch.linalg.cross(x_cpu, y_cpu, dim=1) - res2_cpu = torch.tensor((), dtype=dtype, device="cpu") - torch.linalg.cross(x, y, dim=1, out=res2) - torch.linalg.cross(x_cpu, y_cpu, dim=1, out=res2_cpu) - self.assertEqual(res1, res2) - self.assertEqual(res1, res1_cpu) - self.assertEqual(res2, res2_cpu) - - # test for broadcastable inputs - if dtype is torch.int32 or dtype is torch.int64: - x = torch.randint(0, 99999, (1, 3, 2), dtype=dtype, device=device) - y = torch.randint(0, 99999, (4, 3, 1), dtype=dtype, device=device) - else: - x = torch.rand(1, 3, 2, dtype=dtype, device=device) - y = torch.rand(4, 3, 1, dtype=dtype, device=device) - x_cpu = x.to("cpu") - y_cpu = y.to("cpu") - res1 = torch.linalg.cross(x, y, dim=1) - res2 = torch.tensor((), dtype=dtype, device=device) - res1_cpu = torch.linalg.cross(x_cpu, y_cpu, dim=1) - res2_cpu = torch.tensor((), dtype=dtype, device="cpu") - torch.linalg.cross(x, y, dim=1, out=res2) - torch.linalg.cross(x_cpu, y_cpu, dim=1, out=res2_cpu) - self.assertEqual(res1, res2) - self.assertEqual(res1, res1_cpu) - self.assertEqual(res2, res2_cpu) - [helper(dtype) for dtype in [torch.int32, torch.int64, torch.float32]] - def test_cross(self): a = torch.randn(4, 3, device="mps") b = torch.randn(4, 3, device="mps") @@ -640,6 +654,13 @@ def test_addmm(self): D = torch.addmm(A, B, C).to("cpu") torch.testing.assert_close(D, torch.full((5, 5), 7.0)) + def test_addr(self): + A = torch.ones(5, 10).to("mps") + B = torch.ones(5).to("mps") + C = torch.ones(10).to("mps") + D = torch.addr(A, B, C).to("cpu") + torch.testing.assert_close(D, torch.full((5, 10), 2.0)) + def test_bmm(self): batch1_cpu = torch.randn(10, 3, 4) batch2_cpu = torch.randn(10, 4, 5) @@ -653,13 +674,6 @@ def test_bmm(self): self.assertEqual(output_cpu, output_mps) self.assertEqual(output_cpu.size(), output_mps.size()) - def test_addr(self): - A = torch.ones(5, 10).to("mps") - B = torch.ones(5).to("mps") - C = torch.ones(10).to("mps") - D = torch.addr(A, B, C).to("cpu") - torch.testing.assert_close(D, torch.full((5, 10), 2.0)) - def test_trace(self): M_cpu = torch.randn(3, 3) M_mps = M_cpu.detach().clone().to("mps") @@ -1212,7 +1226,7 @@ def test_norm(self): self.assertEqual(res, res_cpu) c = torch.tensor([[1, 2, 3], [-1, 1, 4]], dtype=torch.float, device="mps") - c_cpu = torch.tensor([[1, 2, 3], [-1, 1, 4]] , dtype=torch.float, device="cpu") + c_cpu = torch.tensor([[1, 2, 3], [-1, 1, 4]], dtype=torch.float, device="cpu") res = torch.norm(c, dim=0) res_cpu = torch.norm(c_cpu, dim=0) @@ -1237,6 +1251,8 @@ def test_norm(self): res_cpu = torch.norm(d_cpu[0, :, :]), torch.norm(d_cpu[1, :, :]) self.assertEqual(res, res_cpu) + self.help_extra_unit('norm', torch.norm) + def test_layer_norm(self): # TODO: Test non-contiguous def helper(input_shape, normalized_shape, eps=1e-05, elementwise_affine=True, dtype=torch.float32): @@ -1783,6 +1799,15 @@ def test_slice_reshape(self): x_cpu = x_cpu + 2 self.assertEqual(x, x_cpu) + def test_slice_casting(self): + # generate random binary numbers + cpu_in = torch.bernoulli(torch.empty(1, 1, 128, 128).uniform_(0, 1)).to(torch.uint8) + mps_in = cpu_in.detach().clone().to("mps") + # check copy_cast(unit8 -> bool) on tensors with storage offset + cpu_out = cpu_in[:, :, 11 : 12, :12].to(torch.bool) + mps_out = mps_in[:, :, 11 : 12, :12].to(torch.bool) + self.assertEqual(cpu_out, mps_out) + def test_slice_reshape_contg_view(self): import torch @@ -1818,12 +1843,6 @@ def test_view_slice(self): actual_pts[i, j] = X[pts[i, j], j] self.assertEqual(actual_pts[i, j], actual_pts_mps[i, j]) - def test_slice_scatter(self): - shape = (4, 4) - tensor = torch.randint(10, shape, device="mps") - tensor_before = tensor.clone() - torch.empty(shape[0], shape[1] * 2, device="mps")[:, ::2].copy_(tensor) - torch.testing.assert_close(tensor, tensor_before) def test_slice(self): values = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]] @@ -1983,99 +2002,6 @@ def helper(shape, repeats): helper((3, 4, 5), (2, 3, 4, 5)) helper((3, 4, 5), (2, 2, 2)) - def test_torch_repeat_interleave(self, device="mps"): - y = torch.tensor([[1, 2], [3, 4]], device=device) - # exercise single argument function signature - temp = y.repeat_interleave(2) - self.assertEqual(torch.Size([8]), temp.size()) - - for dtype in [torch.int, torch.long]: - lengths = torch.tensor([1, 2], dtype=dtype, device="mps") - output_size = torch.sum(lengths) - a = torch.repeat_interleave( - y, - lengths, - dim=0, - ) - self.assertEqual(a.dtype, y.dtype) - self.assertEqual(a.size(), torch.Size([3, 2])) - - a_with_output = torch.repeat_interleave( - y, - lengths, - dim=0, - output_size=output_size, - ) - self.assertEqual(a_with_output.dtype, y.dtype) - self.assertEqual(a_with_output.size(), torch.Size([3, 2])) - - def test_repeat_interleave(self, device="mps"): - x = torch.tensor([0, 1, 2, 3], device=device) - expected = torch.tensor([1, 2, 2, 3, 3, 3], dtype=torch.int32, device=device) - self.assertEqual(torch.repeat_interleave(x), expected) - - with self.assertRaises(RuntimeError): - torch.repeat_interleave(torch.arange(4, device=device).reshape(2, 2)) - - with self.assertRaises(RuntimeError): - torch.repeat_interleave(torch.arange(4.0, device=device)) - - with self.assertRaises(RuntimeError): - torch.repeat_interleave(torch.tensor([1, 2, -1, 3, 4], device=device)) - - y = torch.tensor([[1, 2], [3, 4]], device=device) - - y1_v1 = torch.repeat_interleave(y, 2) - y1_v2 = torch.repeat_interleave(y, torch.tensor(2, device=device)) - y1_v3 = torch.repeat_interleave(y, torch.tensor([2], device=device)) - y1_expect = torch.tensor([1, 1, 2, 2, 3, 3, 4, 4], device=device) - self.assertEqual(y1_v1, y1_expect) - self.assertEqual(y1_v2, y1_expect) - self.assertEqual(y1_v3, y1_expect) - - y2 = torch.repeat_interleave(y, 3, dim=1) - y2_expect = torch.tensor([[1, 1, 1, 2, 2, 2], - [3, 3, 3, 4, 4, 4]], device=device) - self.assertEqual(y2, y2_expect) - - y3 = torch.repeat_interleave(y, torch.tensor([1, 2], device=device), dim=0) - y3_expect = torch.tensor([[1, 2], - [3, 4], - [3, 4]], device=device) - self.assertEqual(y3, y3_expect) - - with self.assertRaises(RuntimeError): - torch.repeat_interleave(y, torch.tensor([1, 2, 3], device=device), dim=0) - - with self.assertRaises(RuntimeError): - torch.repeat_interleave(y, torch.arange(9, device=device).reshape(3, 3), dim=0) - - # test zero sized dimension - x = torch.zeros((5, 0), device=device) - y = torch.repeat_interleave(x, repeats=3, dim=1) - self.assertEqual(y, x.new_zeros(5, 0, device=device)) - - x = torch.tensor([], dtype=torch.int64, device=device) - y = torch.repeat_interleave(x, x) - self.assertEqual(y, x) - - def test_repeat_interleave_simple(self): - def helper(shape, dtype=torch.float32, num_repeats=torch.Tensor(), dim=None): - x = torch.randn(shape, dtype=dtype, device="mps") - x_cpu = x.detach().clone().cpu() - - num_repeats_cpu = num_repeats.detach().clone().cpu() - - repeats = torch.repeat_interleave(x, num_repeats, dim) - repeats_cpu = torch.repeat_interleave(x_cpu, num_repeats_cpu, dim) - - self.assertEqual(repeats, repeats_cpu) - helper(shape=3, num_repeats=torch.tensor([100], device="mps")) - helper(shape=(2, 2), num_repeats=torch.tensor([3, 3], device="mps"), dim=0) - helper(shape=(10, 15, 8), num_repeats=torch.arange(10, device="mps"), dim=0) - helper(shape=(10, 15, 8), num_repeats=torch.randint(0, 100, (15, ), device="mps"), dim=1) - helper(shape=(10, 15, 30), num_repeats=torch.randint(0, 100, (30, ), device="mps"), dim=2) - def test_count_nonzero(self): def helper(dtype): n = [ @@ -2151,6 +2077,15 @@ def test_to(self): x_mps = x_cpu.to('mps') self.assertEqual(x_mps.to(torch.float32), x_cpu.to(torch.float32)) + @unittest.skipIf(True, "non-contiguous tensor to mps is incorrect.") + def test_to_non_contiguous(self): + x = torch.arange(16, dtype=torch.float32).reshape(2, 2, 2, 2) + x1 = x[:, :, :1, :] + x2 = x[:, :, 1:, :] + self.assertFalse(x1.is_contiguous()) + self.assertFalse(x2.is_contiguous()) + self.assertEqual(x1, x1.detach().to("mps")) + self.assertEqual(x2, x2.detach().to("mps")) def test_setitem_scalar(self) -> None: device = 'mps' @@ -2224,9 +2159,9 @@ def test_storage_offset_greater_than_src_nbytes(self): tensor_list.append(t) for i in range(0, n_tensors - 1): - t = tensor_list[i].view(1, n_tensor_elems) + t = tensor_list[i].view(1, 784) t_mps = t.to("mps") - self.assertEqual(t, t_mps.cpu(), f"i={i}") + self.assertEqual(t, t_mps.cpu()) # See https://github.com/pytorch/pytorch/issues/82427 # and https://github.com/pytorch/pytorch/issues/83692 @@ -2238,6 +2173,7 @@ def test_full_bugs(self): y_cpu = torch.full((2, 2), 247, device='cpu', dtype=torch.uint8) self.assertEqual(y_mps, y_cpu) + @unittest.skipIf(product_version < 13.0, "Skipped on macOS 12") # See https://github.com/pytorch/pytorch/issues/84995 def test_div_bugs(self): for (dtype, mode) in itertools.product(integral_types(), ['trunc', 'floor']): @@ -2304,6 +2240,7 @@ def ensure_tuple(x): self.assertEqual(expected_inverse.view(additional_shape), y_inverse) self.assertEqual(expected_counts, y_counts) + @unittest.skipIf(product_version < 13.0, "Skipped on macOS 12") def test_unique_all_dtypes(self, device="mps"): def helper(dtype): def ensure_tuple(x): @@ -2359,7 +2296,7 @@ def ensure_tuple(x): if k == i: count += 1 self.assertEqual(j, count) - [helper(dtype) for dtype in [torch.float32, torch.int64, torch.int32, torch.int16, torch.uint8]] + [helper(dtype) for dtype in [torch.float32, torch.float16, torch.int64, torch.int32, torch.int16, torch.uint8]] def test_unique(self): def helper(x, return_inverse, return_counts): @@ -2371,12 +2308,12 @@ def helper(x, return_inverse, return_counts): self.assertEqual(result, result_cpu) helper(torch.tensor([1, 2, 4, 2, 1]), False, False) - helper(torch.randint(3, (10, )), False, False) - helper(torch.randint(3, (10, )), True, False) - helper(torch.randint(3, (10, )), False, True) - helper(torch.randint(3, (10, )), True, True) - helper(torch.randint(3, (1, )), True, True) - helper(torch.randint(3, (0, )), True, True) + helper(torch.randint(3, (10,)), False, False) + helper(torch.randint(3, (10,)), True, False) + helper(torch.randint(3, (10,)), False, True) + helper(torch.randint(3, (10,)), True, True) + helper(torch.randint(3, (1,)), True, True) + helper(torch.randint(3, (0,)), True, True) def test_unique_consecutive(self): def helper(x, dim, return_inverse, return_counts): @@ -2388,13 +2325,13 @@ def helper(x, dim, return_inverse, return_counts): self.assertEqual(result, result_cpu) helper(torch.tensor([1, 2, 4, 2, 1]), 0, False, False) - helper(torch.randint(3, (10, )), 0, False, False) - helper(torch.randint(3, (10, )), 0, True, False) - helper(torch.randint(3, (10, )), 0, False, True) - helper(torch.randint(3, (10, )), 0, True, True) - helper(torch.randint(3, (10, )), 0, True, True) - helper(torch.randint(3, (1, )), 0, True, True) - helper(torch.randint(3, (0, )), 0, True, True) + helper(torch.randint(3, (10,)), 0, False, False) + helper(torch.randint(3, (10,)), 0, True, False) + helper(torch.randint(3, (10,)), 0, False, True) + helper(torch.randint(3, (10,)), 0, True, True) + helper(torch.randint(3, (10,)), 0, True, True) + helper(torch.randint(3, (1,)), 0, True, True) + helper(torch.randint(3, (0,)), 0, True, True) helper(torch.tensor([[1, 1, 2, 3, 3, 2], [1, 1, 1, 2, 2, 1]]), 0, False, False) helper(torch.tensor([[1, 1, 2, 3, 3, 2], [1, 1, 1, 2, 2, 1]]), 0, True, True) @@ -2437,134 +2374,6 @@ def test_from_numpy_non_contiguous(self): t_mps = torch.tensor(a, device="mps") self.assertEqual(t_cpu, t_mps.to("cpu")) - # See https://github.com/pytorch/pytorch/issues/86954 - def test_copy_non_contiguous(self): - x = torch.arange(27).reshape(3, 3, 3).permute(2, 0, 1) - self.assertFalse(x.is_contiguous()) - y = x.to('mps') - self.assertFalse(y.is_contiguous()) - self.assertEqual(x, y.to('cpu')) - - x = torch.arange(4**3).reshape(4, 4, 4).permute((2, 0, 1))[1:, ::2] - y = x.to('mps') - self.assertEqual(x, y.to('cpu')) - - x = torch.full((4, 4, 4, 4), 13, device="cpu") - y = torch.full((4, 4, 4, 4), 13, device="mps") - z = torch.arange(4**4).reshape(4, 4, 4, 4).permute(3, 2, 0, 1)[1::, ::2] - x.permute(3, 2, 1, 0)[1::, ::2] = z - # As y is on MPS and z on CPU, this dispatches to a copy operator - y.permute(3, 2, 1, 0)[1::, ::2] = z - self.assertEqual(x, y.to('cpu')) - - # See https://github.com/pytorch/pytorch/pull/84742 - # and https://github.com/pytorch/pytorch/pull/78319 - def test_binops_dtype_precedence(self): - # Test dtype precedence (casting order) in binary operations by comparing to CPU result - # Example values for all dtypes supported on the MPS backend - sample_vals = { - torch.bool: [False, True], - torch.int16: [-15, 0, 1, 10], - torch.int32: [-376, 0, 1, 13], - torch.int64: [-8, 0, 1, 77], - torch.float16: [-234.5, 0.0, 1.0, 2.0], - torch.float32: [-1.0, 0.0, 0.1, 111.99], - } - # Test all combinations of dtypes, operations, dimensionality - for dtype1, dtype2, binop in itertools.product( - sample_vals.keys(), sample_vals.keys(), ['add', 'sub', 'mul', 'div']): - # bool minus bool is generally unsupported, so skip - if binop == 'sub' and (dtype1 == torch.bool or dtype2 == torch.bool): - continue - full_shape = (10,) - for val1, val2 in itertools.product(sample_vals[dtype1], sample_vals[dtype2]): - # print(f'{dtype1},{dtype2}: ({val1}).{binop}({val2})') - # print(getattr(torch.tensor(val1, dtype=dtype1, device='mps'), binop) - # (torch.tensor(val2, dtype=dtype2, device='mps'))) - # print(getattr(torch.tensor(val1, dtype=dtype1, device='cpu'), binop) - # (torch.tensor(val2, dtype=dtype2, device='cpu'))) - self.assertEqual( - getattr(torch.tensor(val1, dtype=dtype1, device='mps'), binop) - (torch.tensor(val2, dtype=dtype2, device='mps')), - getattr(torch.tensor(val1, dtype=dtype1, device='cpu'), binop) - (torch.tensor(val2, dtype=dtype2, device='cpu'))) - self.assertEqual( - getattr(torch.tensor([val1], dtype=dtype1, device='mps'), binop) - (torch.tensor([val2], dtype=dtype2, device='mps')), - getattr(torch.tensor([val1], dtype=dtype1, device='cpu'), binop) - (torch.tensor([val2], dtype=dtype2, device='cpu'))) - self.assertEqual( - getattr(torch.tensor(val1, dtype=dtype1, device='mps'), binop) - (torch.tensor([val2], dtype=dtype2, device='mps')), - getattr(torch.tensor(val1, dtype=dtype1, device='cpu'), binop) - (torch.tensor([val2], dtype=dtype2, device='cpu'))) - self.assertEqual( - getattr(torch.tensor([val1], dtype=dtype1, device='mps'), binop) - (torch.tensor(val2, dtype=dtype2, device='mps')), - getattr(torch.tensor([val1], dtype=dtype1, device='cpu'), binop) - (torch.tensor(val2, dtype=dtype2, device='cpu'))) - # Test tensors created with torch.full - x1 = torch.full(full_shape, val1, dtype=dtype1, device='mps') - y1 = torch.tensor(val2, dtype=dtype2, device='mps') - x2 = torch.full(full_shape, val1, dtype=dtype1, device='cpu') - y2 = torch.tensor(val2, dtype=dtype2, device='cpu') - self.assertEqual(getattr(x1, binop)(y1), getattr(x2, binop)(y2)) - x3 = torch.tensor(val1, dtype=dtype1, device='mps') - y3 = torch.full(full_shape, val2, dtype=dtype2, device='mps') - x4 = torch.tensor(val1, dtype=dtype1, device='cpu') - y4 = torch.full(full_shape, val2, dtype=dtype2, device='cpu') - self.assertEqual(getattr(x3, binop)(y3), getattr(x4, binop)(y4)) - self.assertEqual( - getattr(torch.tensor(val1, dtype=dtype1, device='mps'), binop) - (torch.full(full_shape, val2, dtype=dtype2, device='mps')), - getattr(torch.tensor(val1, dtype=dtype1, device='cpu'), binop) - (torch.full(full_shape, val2, dtype=dtype2, device='cpu'))) - - def test_nansum(self): - def helper(dtype, noncontiguous, dim): - zero_cpu = torch.zeros((), dtype=dtype) - - # Randomly scale the values - scale = random.randint(10, 100) - x_cpu: torch.Tensor = make_tensor( - (5, 5), dtype=dtype, device='cpu', - low=-scale, high=scale, noncontiguous=noncontiguous) - - if dtype.is_floating_point: - nan_mask_cpu = x_cpu < (0.2 * scale) - x_no_nan_cpu = torch.where(nan_mask_cpu, zero_cpu, x_cpu) - x_cpu[nan_mask_cpu] = np.nan - else: - x_no_nan_cpu = x_cpu - - x_mps = x_cpu.to('mps') - actual_out_mps = torch.empty(0, dtype=dtype, device='mps') - expect_out_cpu = torch.empty(0, dtype=dtype) - dim_kwargs = {"dim": dim} if dim is not None else {} - expect = torch.sum(x_no_nan_cpu, **dim_kwargs) - - actual_cpu = torch.nansum(x_cpu, **dim_kwargs) - # Sanity check on CPU - self.assertEqual(expect, actual_cpu) - - # Test MPS - actual_mps = torch.nansum(x_mps, **dim_kwargs) - # Test out= variant - torch.nansum(x_mps, out=actual_out_mps, **dim_kwargs) - torch.nansum(x_cpu, out=expect_out_cpu, **dim_kwargs) - self.assertEqual(expect, actual_mps) - self.assertEqual(expect_out_cpu, actual_out_mps) - - args = itertools.product( - (torch.float16, torch.float32, torch.int32, torch.int64), # dtype - (True, False), # noncontiguous - (0, 1, None), # dim - ) - - for dtype, noncontiguous, dim in args: - with self.subTest(dtype=dtype, noncontiguous=noncontiguous, dim=dim): - helper(dtype, noncontiguous, dim) - def test_cumsum_all_dtypes(self): def helper(dtype): t = torch.tensor([1, 1, 1, 1], device="mps", dtype=dtype) @@ -2582,22 +2391,32 @@ def helper(dtype): e_string = str(e) self.assertEqual(e_string, "MPS does not support cumsum op with int64 input") - def test_cumsum_minus_one_axis(self): - def helper(dtype): - # Test with axis -1 - cpu_x = None - if(dtype == torch.float32): - cpu_x = torch.randn(10, 3, device='cpu', dtype=torch.float32) - else: - cpu_x = torch.randint(0, 20, (10, 3), device='cpu', dtype=torch.float32) + def test_gelu_tanh(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float) x = cpu_x.detach().clone().to('mps') - cpu_y = cpu_x.cumsum(-1) - y = x.cumsum(-1) + gelu_tanh_result = torch.nn.functional.gelu(x, approximate='tanh') + gelu_tanh_result_cpu = torch.nn.functional.gelu(cpu_x, approximate='tanh') + self.assertEqual(gelu_tanh_result, gelu_tanh_result_cpu) - self.assertEqual(y, cpu_y) + helper((2, 8, 4, 5)) - [helper(dtype) for dtype in [torch.float32, torch.int16, torch.int32, torch.uint8]] + # # Failures due to precision issues, enable after resolving from mps + # def test_div_floor_int(self): + # def helper(shape, dtype): + # cpu_x = torch.randint(-9999, -1,shape, device='cpu', dtype=dtype) + # x = cpu_x.detach().clone().to('mps') + + # cpu_y = torch.randint(1, 9999, shape, device='cpu', dtype=dtype) + # y = cpu_y.detach().clone().to('mps') + + # div_result = torch.div(x, y,rounding_mode='floor') + # div_result_cpu = torch.div(cpu_x, cpu_y, rounding_mode='floor') + # self.assertEqual(div_result, div_result_cpu) + + # helper((2, 8, 4, 5), torch.int16) + # helper((2, 8, 4, 5), torch.int32) def test_median_int16(self): def helper(shape, dtype): @@ -2610,6 +2429,23 @@ def helper(shape, dtype): helper((2, 8, 4, 5), torch.int16) + def test_cumsum_minus_one_axis(self): + def helper(dtype): + # Test with axis -1 + cpu_x = None + if dtype == torch.float32: + cpu_x = torch.randn(10, 3, device='cpu', dtype=torch.float32) + else: + cpu_x = torch.randint(0, 20, (10, 3), device='cpu', dtype=torch.float32) + x = cpu_x.detach().clone().to('mps') + + cpu_y = cpu_x.cumsum(-1) + y = x.cumsum(-1) + + self.assertEqual(y, cpu_y) + + [helper(dtype) for dtype in [torch.float32, torch.int16, torch.int32, torch.uint8]] + class TestLogical(TestCaseMPS): def _wrap_tensor(self, x, device="cpu", dtype=None, requires_grad=False): return torch.tensor(x, device=device, dtype=dtype, requires_grad=requires_grad) @@ -2762,6 +2598,20 @@ def test_smooth_l1_loss_reduction_mean_sum_backward(self): class TestNLLLoss(TestCaseMPS): + def test_nll2d_loss_backward(self, device='mps'): + a = torch.randn(3, 5, requires_grad=True, device=device) + b = torch.tensor([1, 0, 4], device=device) + loss = nn.NLLLoss() + out = loss(a, b) + self.assertIsNone(out.grad_fn._saved_weight) + loss = nn.NLLLoss(weight=torch.ones((5,), device=device)) + out = loss(a, b) + self.assertEqual(out.grad_fn._saved_weight, torch.ones((5,))) + + out.sum().backward() + with self.assertRaisesRegex(RuntimeError, "after they have already been freed"): + out.grad_fn._saved_weight + def test_nll_loss_mismatched_batch(self, device='mps'): x = torch.randn((10, 3), requires_grad=True, device=device) # t should have size (10,) @@ -2823,13 +2673,13 @@ def _nll_loss_helper(self, input_size, reduction, expected): input = torch.rand(input_size, requires_grad=True, device='cpu') num_channels = input_size[1] target_size = (input_size[0], ) + tuple(input_size[2:]) - target = torch.randint(num_channels, target_size, device='cpu') weights = torch.randn(num_channels) + weights_mps = weights.to("mps") + target = torch.randint(num_channels, target_size, device='cpu') # MPS input_mps = input.detach().clone().to('mps').requires_grad_() target_mps = target.detach().clone().to('mps') - weights_mps = weights.to("mps") output_cpu = F.nll_loss(input, target, weight=weights, reduction=reduction) output_mps = F.nll_loss(input_mps, target_mps, weight=weights_mps, reduction=reduction) @@ -3366,6 +3216,7 @@ def test_eq(self): self.assertEqual(result_cpu, result_mps.to('cpu')) + @unittest.skipIf(product_version < 13.0, "Skipped on macOS 12") def test_signed_vs_unsigned_comparison(self): cpu_x = torch.tensor((-1, 2, 3), device='cpu', dtype=torch.uint8) mps_x = torch.tensor((-1, 2, 3), device='mps', dtype=torch.uint8) @@ -4519,30 +4370,10 @@ def helper(shape): helper((5, 9, 7, 4)) helper((50, 20, 7, 4)) - def test_sort(self): - for SIZE in (4, 2049): - device = 'mps' - x = torch.rand(4, SIZE, device=device) - res1val, res1ind = torch.sort(x) - - res2val = torch.tensor((), device=device) - res2ind = torch.tensor((), device=device, dtype=torch.long) - torch.sort(x, out=(res2val, res2ind)) - self.assertEqual(res1val, res2val, atol=0, rtol=0) - self.assertEqual(res1ind, res2ind, atol=0, rtol=0) - self.assertEqual(torch.argsort(x), res1ind) - self.assertEqual(x.argsort(), res1ind) - - self.assertEqual( - torch.sort(torch.tensor((50, 40, 30, 20, 10), device=device))[0], - torch.tensor((10, 20, 30, 40, 50), device=device), - atol=0, rtol=0 - ) - def test_upsample_nearest2d(self): - def helper(N, C, H, W): + def helper(N, C, H, W, memory_format): inputCPU = torch.arange(N * C * H * W, device='cpu', dtype=torch.float, - requires_grad=True).reshape(N, C, H, W) + requires_grad=True).reshape(N, C, H, W).to(memory_format=memory_format) inputCPU.retain_grad() inputMPS = inputCPU.detach().to('mps').requires_grad_() @@ -4568,8 +4399,9 @@ def helper(N, C, H, W): self.assertEqual(inputCPU.grad, inputMPS.grad) - helper(1, 1, 4, 4) - helper(7, 5, 3, 2) + for memory_format in [torch.channels_last, torch.contiguous_format]: + helper(1, 1, 4, 4, memory_format=memory_format) + helper(7, 5, 3, 2, memory_format=memory_format) def test_upsample_bilinear2d(self): def helper(N, C, H, W): @@ -4604,6 +4436,7 @@ def helper(N, C, H, W): helper(1, 1, 4, 4) helper(7, 5, 3, 2) + @unittest.skipIf(product_version < 13.0, "Skipped on macOS 12") def test_interpolate(self): def helper(shape, output_size, scales, mode, align_corners=False): inputCPU = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True) @@ -4753,6 +4586,8 @@ def helper(shape, padding, op, value=0): helper((2, 1, 6, 8), 2, nn.ReplicationPad2d) # verify if a change in shape of padding would cause problems with graph caching helper((2, 1, 6, 8), (2, 4, 3, 5), nn.ReplicationPad2d) + # negative padding + helper((1, 3, 4, 4), (-1, 1, -2, 1), nn.ReplicationPad2d) # Constant Pad 2D helper((2, 1, 6, 8), (2, 4, 3, 5), nn.ConstantPad2d) # input size < pad size @@ -4772,10 +4607,10 @@ def helper(shape, padding, op, value=0): helper((2, 4, 6, 8, 4), (1, 3, 3, 5, 3, 4), nn.ReplicationPad3d) # Constant Pad 3D helper((2, 4, 6, 8, 4), (1, 3, 3, 5, 3, 4), nn.ConstantPad3d) - # input size < pad size - helper((2, 4, 6), (1, 3, 3, 5, 3, 4), nn.ConstantPad3d) # check the workaround for the right padding bug in Monterey helper((1, 2, 2, 2, 2), (0, 1), nn.ConstantPad3d) + # input size < pad size + helper((2, 4, 6), (1, 3, 3, 5, 3, 4), nn.ConstantPad3d) # Test stack forward def test_stack(self): @@ -5288,17 +5123,6 @@ def _gelu_ref(X): finally: torch.set_num_threads(num_threads) - def test_gelu_tanh(self): - def helper(shape): - cpu_x = torch.randn(shape, device='cpu', dtype=torch.float) - x = cpu_x.detach().clone().to('mps') - - gelu_tanh_result = torch.nn.functional.gelu(x, approximate='tanh') - gelu_tanh_result_cpu = torch.nn.functional.gelu(cpu_x, approximate='tanh') - self.assertEqual(gelu_tanh_result, gelu_tanh_result_cpu) - - helper((2, 8, 4, 5)) - # Test hardtanh def test_hardtanh(self): def helper(shape, min_val, max_val, inplace=False): @@ -5475,14 +5299,14 @@ def helper(shape): # Test index add def test_index_add(self): - def helper(shape, dim, index, source_shape, alpha, x_dtype=torch.float32, idx_dtype=torch.int32): - cpu_x = torch.randn(shape, device='cpu', dtype=x_dtype, requires_grad=False) + def helper(shape, dim, index, source_shape, alpha, idx_dtype=torch.int32): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) x = cpu_x.detach().clone().to('mps') cpu_idx = torch.tensor(index, device='cpu', dtype=idx_dtype) idx = cpu_idx.detach().clone().to('mps') - cpu_source = torch.randn(source_shape, device='cpu', dtype=x_dtype, requires_grad=False) + cpu_source = torch.randn(source_shape, device='cpu', dtype=torch.float, requires_grad=False) source = cpu_source.detach().clone().to('mps') idx_result = torch.index_add(x, dim=dim, index=idx, source=source, alpha=alpha) @@ -5498,8 +5322,6 @@ def helper(shape, dim, index, source_shape, alpha, x_dtype=torch.float32, idx_dt # test result dim=1 helper((2,), 0, [1], (1,), 6.0) helper(2, 0, 1, 1, 6) - # test float16 - helper((2,), 0, [1], (1,), 6.0, x_dtype=torch.float16) # Test flip def test_flip(self): @@ -5543,23 +5365,6 @@ def helper(shape, dim, index, idx_dtype=torch.int32): helper((2, 8, 4, 5), 2, [3, 0, 1]) helper((2, 8, 4, 5), 3, [2, 3, 0]) helper((2, 3, 3), -1, [1, 2]) - helper((), 0, [0]) - helper((5), 0, []) - - def test_index_select_scalar(self): - def helper(value, dim, index, idx_dtype=torch.int32): - cpu_x = torch.tensor(value, device='cpu', dtype=torch.float, requires_grad=False) - x = cpu_x.detach().clone().to('mps') - - cpu_idx = torch.tensor(index, device='cpu', dtype=idx_dtype) - idx = cpu_idx.detach().clone().to('mps') - - idx_result = torch.index_select(x, dim=dim, index=idx) - idx_result_cpu = torch.index_select(cpu_x, dim=dim, index=cpu_idx) - - self.assertEqual(idx_result, idx_result_cpu) - - helper(22, 0, []) def test_embedding_dense_backward(self): def helper(n, d, m, idx): @@ -5938,13 +5743,6 @@ def test_arange_empty(self): y_cpu = torch.arange(0, 0, 1, out=out_cpu) self.assertEqual(y_mps, y_cpu) - # Test rgange - def test_range(self): - self.assertEqual(np.arange(11, dtype=np.float32), torch.range(0, 10, device='mps')) - self.assertEqual(np.arange(7, 0, -1, dtype=np.float32), torch.range(7, 1, -1, device='mps')) - self.assertEqual(np.array([1.0000, 1.3000, 1.6000, 1.9000], dtype=np.float32), torch.range(1, 2, .3, device='mps')) - self.assertEqual(np.arange(6.3, dtype=np.float32), torch.arange(0, 6.3, device='mps')) - # Test softmax def test_softmax(self): def helper(shape, dim, channels_last=False): @@ -6183,25 +5981,24 @@ def test_device_synchronize(self): torch.mps.synchronize() def test_mps_allocator_module(self): - # first garbage collect and empty the cached blocks + # limit memory allocations up to 1.5x of recommended maximum size from Metal API + torch.mps.set_per_process_memory_fraction(1.5) + + # just running some ops to allocate buffers + net1 = torch.nn.ConvTranspose2d(128, 64, kernel_size=3, stride=2, padding=1, output_padding=1)\ + .to(device='mps', dtype=torch.float) + + x = torch.rand(1, 128, 6, 6, device='mps', dtype=torch.float, requires_grad=True) + x = net1(x) + print(f"current_allocated: {torch.mps.current_allocated_memory() / 1024} KB, " + f"driver_allocated: {torch.mps.driver_allocated_memory() / 1024} KB\n") gc.collect() + # running this test alone will not release any buffers as they are in use. + # however, running along with other tests should release the cached allocations. torch.mps.empty_cache() - # measure memory allocations from MPSAllocator - current_alloc_before = torch.mps.current_allocated_memory() - # after garbage collection and emptying the cache the - # current_allocated_memory must be zero - self.assertTrue(current_alloc_before == 0) - # measure total memory allocations from Metal driver - driver_alloc_before = torch.mps.driver_allocated_memory() - # allocate a new 8 MB tensor to force allocation of a new Metal Heap - x = torch.ones(1024 * 1024 * 8, device="mps") - # get memory allocations after allocating tensor x - current_alloc_after = torch.mps.current_allocated_memory() - driver_alloc_after = torch.mps.driver_allocated_memory() - # current and driver memory allocations must have - # grown at this point - self.assertTrue(current_alloc_after > current_alloc_before) - self.assertTrue(driver_alloc_after > driver_alloc_before) + x.backward(torch.randn_like(x)) + print(f"current_allocated: {torch.mps.current_allocated_memory() / 1024} KB, " + f"driver_allocated: {torch.mps.driver_allocated_memory() / 1024} KB\n") # Test random_.to and random_.from def test_random(self): @@ -6369,65 +6166,18 @@ def helper(probs, compare_mean, compare_var, num_samples=5, replacement=True): helper(np.array([1, 1, 1, 1, 1]), (0 + 1 + 2 + 3 + 4) / 5, (6 - 2 * 2), 10000) helper(np.array([[1, 1, 1, 1, 1, 1, 1]]), 0, 0, 7, False) - def test_cumsum_dim_check(self): - x = torch.rand((3, 3), device="mps") - self.assertEqual(x.cumsum(1), x.cumsum(-1)) - self.assertEqual(x.cumsum(0), x.cumsum(-2)) - self.assertRaises(IndexError, lambda: x.cumsum(2)) - self.assertRaises(IndexError, lambda: x.cumsum(-3)) - - -class TestTopK(TestCase): - def _test_topk(self, shape, largest): - cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) - x = cpu_x.detach().clone().to('mps') - if isinstance(shape, tuple): - for curr_dim, dim_size in enumerate(shape): - for k in range(1, dim_size + 1): - topk_values, topk_indices = torch.topk(x, k, dim=curr_dim, largest=largest) - topk_values_cpu, topk_indices_cpu = torch.topk(cpu_x, k, dim=curr_dim, largest=largest) - self.assertEqual(topk_values, topk_values_cpu) - self.assertEqual(topk_indices, topk_indices_cpu) - else: - for k in range(1, shape): - topk_values, topk_indices = torch.topk(x, k, dim=0, largest=largest) - topk_values_cpu, topk_indices_cpu = torch.topk(cpu_x, k, dim=0, largest=largest) - self.assertEqual(topk_values, topk_values_cpu) - self.assertEqual(topk_indices, topk_indices_cpu) - - def test_topk(self): - largest_vals = [True, False] - shapes = [ - # Zero Element Tensors - 0, - (1, 0), - (0, 1), - (1, 0, 1), - # Multiple Element Tensors - 1, - 2, - (5, 1), - (1, 5), - (5, 9, 7, 4), - ] - - for shape in shapes: - for largest_val in largest_vals: - with self.subTest(shape=shape, largest_val=largest_val): - self._test_topk(shape, largest_val) - class TestNNMPS(NNTestCase): def _create_basic_net(self): class Layer(nn.Module): def __init__(self): - super().__init__() + super(Layer, self).__init__() self.layer_dummy_param = Parameter(torch.empty(3, 5)) self.register_buffer('layer_dummy_buf', torch.zeros(1, 3, 3, 7)) class Net(nn.Module): def __init__(self): - super().__init__() + super(Net, self).__init__() self.l1 = Layer() self.dummy_param = Parameter(torch.empty(3, 5)) self.register_buffer('dummy_buf', torch.zeros(7, 3, 3, 1)) @@ -6526,9 +6276,7 @@ def test_zero_grad(self): self.assertIsNotNone(module.bias.grad) self.assertGreater(module.weight.grad.data.abs().sum(), 0) self.assertGreater(module.bias.grad.data.abs().sum(), 0) - - # Force set to zeros. - module.zero_grad(set_to_none=False) + module.zero_grad(set_to_none=False) # Force set to zeros. self.assertEqual(module.weight.grad.data, module.weight.data.clone().zero_()) self.assertEqual(module.bias.grad.data, module.bias.data.clone().zero_()) @@ -6536,7 +6284,6 @@ def test_zero_grad(self): self.assertIsNone(module.weight.grad) self.assertIsNone(module.bias.grad) - def test_no_grad(self): for dtype in [torch.bfloat16, torch.float, torch.double]: module = nn.Conv2d(2, 5, kernel_size=3, padding=1).to(dtype) @@ -6650,33 +6397,6 @@ def attention2(key, *, workaround=False, device): r2_cpu = r2.to("cpu") self.assertEqual(r1, r2_cpu) - def test_group_norm_backward(self, device='mps'): - # See https://github.com/pytorch/pytorch/issues/88331 for more detail - shape = [1, 4, 16, 16] - x = torch.full(shape, 7.0, device=device) - - target = torch.ones((1, 3, 128, 128), device=device) - - conv_in = nn.Conv2d(4, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), device=device) - conv_out = nn.Conv2d(128, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), device=device) - norm = nn.GroupNorm(32, 128, eps=1e-6, affine=True, device=device) - - with torch.enable_grad(): - x = x.detach().requires_grad_() - out = 5.5 * x - out = conv_in(out) - out = out + norm(out) - out = out + norm(out) - out = out + norm(out) - out = F.interpolate(out, scale_factor=8.0, mode="nearest") - out = norm(out) - out = conv_out(out) - - loss = (out - target).norm(dim=-1).sum() - grad = -torch.autograd.grad(loss, x)[0] - self.assertFalse(grad.detach().isnan().any().item(), 'NaN gradients returned by autograd') - - # def test_conv2d_same_padding(self, device='mps'): # x = torch.rand(1, 1, 10, 11, device=device) # y = torch.rand(1, 1, 4, 5, device=device) @@ -7491,10 +7211,12 @@ def test_T(self, device="mps"): self.assertEqual(t2, t1) b = torch.randn(10, device=device) self.assertEqual(b, b.T) + scalar = torch.tensor(5, device=device) + self.assertEqual(scalar, scalar.T) def test_transposes(self, device="mps", dtype=torch.float32): for op in ("T", "H", "mT", "mH", "adjoint"): - shapes = ((2, 3), (2, 3, 4)) if op[0] == "m" or op == "adjoint" else ((2, 3),) + shapes = ((), (2, 3), (2, 3, 4)) if op[0] == "m" or op == "adjoint" else ((), (2, 3),) for shape in shapes: a = make_tensor(shape, device=device, dtype=dtype) t1 = getattr(a, op) @@ -7711,7 +7433,8 @@ def test_conv_transpose_1d_nn_functional(self): def test_conv_backward_1d_channels_last(self): def helper(shape, in_channels=1, out_channels=1, kernel_size=3, groups=1): # https://github.com/pytorch/pytorch/issues/84511 - conv_cpu = torch.nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, groups=groups) + conv_cpu = torch.nn.Conv1d( + in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, groups=groups).requires_grad_() conv_mps = torch.nn.Conv1d( in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, groups=groups).to("mps") conv_mps.weight.data = conv_cpu.weight.data.detach().clone().to("mps").requires_grad_(True) @@ -7751,15 +7474,89 @@ def test_conv1d_contiguous(self): def test_conv2d_all_strides_paddings(self): # https://github.com/pytorch/pytorch/issues/83180 - y_cpu = torch.randn(2, 2, 3, 6) - y_gpu = y_cpu.to(device='mps') - for strideX in range(1, 4): - for strideY in range(1, 4): - conv_cpu = torch.nn.Conv2d(in_channels=2, out_channels=2, kernel_size=3, stride=(strideX, strideY)) - conv_gpu = copy.deepcopy(conv_cpu).to(device='mps') - x_cpu = conv_cpu(y_cpu) - x_gpu = conv_gpu(y_gpu) - self.assertEqual(x_cpu, x_gpu.cpu(), rtol=1e-03, atol=1e-05) + def helper(N, C, H, W, groups, input_mem_format, weight_mem_format, permute_data): + x_cpu = torch.randn(N, C, H, W).to(memory_format=input_mem_format).requires_grad_() + x_mps = x_cpu.detach().clone().to(device='mps').requires_grad_() + + if permute_data: + x_cpu.permute(0, 2, 3, 1) + x_mps.permute(0, 2, 3, 1) + + for strideX in range(1, 4): + for strideY in range(1, 4): + conv_cpu = torch.nn.Conv2d( + in_channels=N, out_channels=C, kernel_size=H, groups=groups, stride=(strideX, strideY)).requires_grad_() + conv_cpu.weight.data = conv_cpu.weight.to(memory_format=weight_mem_format).requires_grad_() + + conv_mps = torch.nn.Conv2d( + in_channels=N, out_channels=C, kernel_size=H, groups=groups, stride=(strideX, strideY), device="mps") + conv_mps.weight.data = conv_cpu.weight.data.detach().clone().to("mps").requires_grad_() + conv_mps.bias.data = conv_cpu.bias.data.detach().clone().to("mps").requires_grad_() + + res_cpu = conv_cpu(x_cpu) + res_mps = conv_mps(x_mps) + self.assertEqual(res_cpu, res_mps.cpu(), rtol=1e-03, atol=1e-05) + + res_cpu = res_cpu.sum().backward() + res_mps = res_mps.sum().backward() + self.assertEqual(res_cpu, res_mps, rtol=2.6e-05, atol=2e-04) + self.assertEqual(conv_cpu.weight.grad, conv_mps.weight.grad, rtol=2.6e-05, atol=2e-04) + self.assertEqual(conv_cpu.bias.grad, conv_mps.bias.grad) + self.assertEqual(x_cpu.grad, x_mps.grad) + + for mem_format_input in [torch.contiguous_format, torch.channels_last]: + for mem_format_weight in [torch.contiguous_format, torch.channels_last]: + for permute_data in [True, False]: + helper(2, 2, 3, 6, 1, mem_format_input, mem_format_weight, permute_data) + helper(10, 10, 4, 6, 2, mem_format_input, mem_format_weight, permute_data) + helper(32, 32, 4, 6, 2, mem_format_input, mem_format_weight, permute_data) + + def test_conv_transpose_2d_strided(self): + def helper(m_cpu, memory_format): + m_mps = copy.deepcopy(m_cpu).requires_grad_() + m_mps.weight.data = m_cpu.weight.data.detach().clone().to("mps").requires_grad_() + m_mps.bias.data = m_cpu.bias.data.detach().clone().to("mps").requires_grad_() + + input_cpu = torch.randn(20, 16, 50, 100).to(memory_format=memory_format).requires_grad_() + input_mps = input_cpu.detach().clone().to("mps") + + output_cpu = m_cpu(input_cpu) + output_mps = m_mps(input_mps) + self.assertEqual(output_cpu, output_mps) + + for mem_format_input in [torch.contiguous_format, torch.channels_last]: + # With square kernels and equal stride + helper(nn.ConvTranspose2d(16, 33, 3, stride=2).requires_grad_(), mem_format_input) + + # non-square kernels and unequal stride and with padding + helper(nn.ConvTranspose2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2)).requires_grad_(), mem_format_input) + + def test_conv_transpose_2d_specified_output(self): + input_cpu = torch.randn(1, 16, 12, 12) + input_mps = input_cpu.detach().clone().to("mps") + + downsample_cpu = nn.Conv2d(16, 16, 3, stride=2, padding=1) + downsample_mps = nn.Conv2d(16, 16, 3, stride=2, padding=1, device="mps") + downsample_mps.weight.data = downsample_cpu.weight.data.detach().clone().to("mps").requires_grad_() + downsample_mps.bias.data = downsample_cpu.bias.data.detach().clone().to("mps").requires_grad_() + + upsample_cpu = nn.ConvTranspose2d(16, 16, 3, stride=2, padding=1) + upsample_mps = nn.ConvTranspose2d(16, 16, 3, stride=2, padding=1, device="mps") + upsample_mps.weight.data = upsample_cpu.weight.data.detach().clone().to("mps").requires_grad_() + upsample_mps.bias.data = upsample_cpu.bias.data.detach().clone().to("mps").requires_grad_() + + h_cpu = downsample_cpu(input_cpu) + h_mps = downsample_mps(input_mps) + self.assertEqual(h_cpu, h_mps) + + size_cpu = h_cpu.size() + size_mps = h_mps.size() + self.assertEqual(size_cpu, size_mps) + + output_cpu = upsample_cpu(h_cpu, output_size=input_cpu.size()) + output_mps = upsample_mps(h_mps, output_size=input_mps.size()) + self.assertEqual(output_cpu, output_mps) + self.assertEqual(output_cpu.size(), output_mps.size()) def test_conv2d_single_stride(self): y_cpu = torch.randn(2, 2, 3, 6) @@ -8351,6 +8148,7 @@ def test_bool_indices(self, device="mps"): self.assertEqual(v[boolIndices], torch.tensor([True], dtype=torch.bool, device=device)) self.assertEqual(len(w), 2) + @unittest.skipIf(product_version < 13.0, "Skipped on macOS 12") def test_bool_indices_accumulate(self, device="mps"): mask = torch.zeros(size=(10, ), dtype=torch.uint8, device=device) mask = mask > 0 @@ -8541,6 +8339,7 @@ def helper(device, dtype): self.assertEqual(res.shape, src.shape) [helper(device="mps", dtype=dtype) for dtype in [torch.float, torch.int32]] + @unittest.skipIf(product_version < 13.0, "Skipped on macOS 12") def test_index_src_datatype(self): def helper(device, dtype): orig_dtype = dtype @@ -8874,6 +8673,63 @@ def get_results(device): self.assertEqual(cpu_input_grad, mps_input_grad) self.assertEqual(cpu_weight_grad, mps_weight_grad) + def test_RNN_cell_no_broadcasting(self): + def test(cell_module, input, hx, input_size, hidden_size): + cell = cell_module(input_size, hidden_size, device='mps') + self.assertRaises(RuntimeError, lambda: cell(input, hx)) + + def test_all(hidden_size, bad_hx, good_hx, input_size, input): + test(nn.RNNCell, input, bad_hx, input_size, hidden_size) + test(nn.GRUCell, input, bad_hx, input_size, hidden_size) + test(nn.LSTMCell, input, (bad_hx, good_hx), input_size, hidden_size) + test(nn.LSTMCell, input, (good_hx, bad_hx), input_size, hidden_size) + + hidden_size = 20 + input_size = 10 + input = torch.randn(3, input_size, device='mps') + bad_hx = torch.randn(1, hidden_size, device='mps') + good_hx = torch.randn(3, hidden_size, device='mps') + + # Test hidden/input batch size broadcasting + test_all(hidden_size, bad_hx, good_hx, input_size, input) + + # Test hx's hidden_size vs module's hidden_size broadcasting + bad_hx = torch.randn(3, 1) + test_all(hidden_size, bad_hx, good_hx, input_size, input) + + # Test input's input_size vs module's input_size broadcasting + bad_input = torch.randn(3, 1) + test_all(hidden_size, good_hx, good_hx, input_size, bad_input) + + def test_LSTM_cell(self): + # this is just a smoke test; these modules are implemented through + # autograd so no Jacobian test is needed + for bias in (True, False): + input = torch.randn(3, 10, device='mps') + hx = torch.randn(3, 20, device='mps') + cx = torch.randn(3, 20, device='mps') + lstm = nn.LSTMCell(10, 20, bias=bias, device='mps') + for _ in range(6): + hx, cx = lstm(input, (hx, cx)) + + (hx + cx).sum().backward() + + def test_LSTM_cell_forward_input_size(self): + input = torch.randn(3, 11, device='mps') + hx = torch.randn(3, 20, device='mps') + cx = torch.randn(3, 20, device='mps') + lstm = nn.LSTMCell(10, 20, device='mps') + self.assertRaises(Exception, lambda: lstm(input, (hx, cx))) + + def test_LSTM_cell_forward_hidden_size(self): + input = torch.randn(3, 10, device='mps') + hx = torch.randn(3, 21, device='mps') + cx = torch.randn(3, 20, device='mps') + lstm = nn.LSTMCell(10, 20, device='mps') + self.assertRaises(Exception, lambda: lstm(input, (hx, cx))) + self.assertRaises(Exception, lambda: lstm(input, (cx, hx))) + + class TestFallbackWarning(TestCase): # TODO: Remove once test_testing.py is running on MPS devices def test_no_warning_on_import(self): @@ -9019,76 +8875,137 @@ def test_serialization_map_location(self): MPS_DTYPES = get_all_dtypes() -for t in [torch.double, torch.cdouble, torch.cfloat, torch.int8, torch.bfloat16]: +for t in [torch.double, torch.cdouble, torch.cfloat, torch.bfloat16]: del MPS_DTYPES[MPS_DTYPES.index(t)] +abbrs_to_torch_dtype_dict = {value : key for (key, value) in dtype_abbrs.items()} +class UnitTestSample: + def __init__(self, dtype, args, params, out): + requires_grad = (dtype.is_floating_point or dtype.is_complex) + self.args_ = [t.detach().to('mps').requires_grad_(requires_grad) for t in args] + self.params_ = params + self.out_ = out + + def sample(self): + return self.args_ + self.params_ + + def expected(self): + return tuple(self.out_) + +CUDA_RESULT = dict() +OP_UNIT_TEST = dict() +dirname = os.path.dirname(__file__) +filename = os.path.join(dirname, "cuda_results.yaml") +with open(filename) as f: + data = yaml.safe_load(f) + for key, value in data['ConsistencyTest'].items(): + CUDA_RESULT[key] = torch.as_tensor(value) + for key, samples in data['UnitTest'].items(): + unit_tests = [] + for sample in samples: + dtype = abbrs_to_torch_dtype_dict[sample['dtype']] + args = [torch.as_tensor(arg).to(dtype) for arg in sample['args']] + params = sample['params'] + out = [torch.as_tensor(res).to(dtype) for res in sample['res']] + unit_tests.append(UnitTestSample(dtype, args, params, out)) + OP_UNIT_TEST[key] = unit_tests class TestConsistency(TestCaseMPS): + # TODO: This is only used while some ops are being added. # This list should contain all ops and dtypes eventually # This can be generated automatically in the `new_mps_allowlist.txt` file # by doing `EXPECTTEST_ACCEPT=1 python test_mps.py TestConsistencyCPU` # You most likely do NOT want to modify this manually ALLOWLIST_OP = { + 'H': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'T': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], '__getitem__': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], '__radd__': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], '__rand__': ['b8', 'i16', 'i32', 'i64', 'u8'], - '__rdiv__': ['f16', 'f32', 'i16', 'i32', 'u8'], - '__rmatmul__': ['f32'], + '__rdiv__': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + '__rmatmul__': ['f32', 'i16', 'i32', 'i64', 'u8'], + '__rmod__': ['f16', 'f32'], '__rmul__': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], '__ror__': ['b8', 'i16', 'i32', 'i64', 'u8'], - '__rpow__': ['f16'], + '__rpow__': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + '__rsub__': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], '__rxor__': ['b8', 'i16', 'i32', 'i64', 'u8'], - 'masked.argmax': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'masked.argmin': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'masked.log_softmax': ['f32'], - 'masked.logaddexp': ['f32'], - 'masked.logsumexp': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'masked.norm': ['f16', 'f32'], - 'masked.normalize': ['f16', 'f32'], - 'masked.softmax': ['f32'], - 'masked.softmin': ['f32'], - 'masked.std': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'masked.var': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'abs': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'], - 'acos': ['b8', 'f32', 'i16', 'i32', 'u8'], - 'acosh': ['b8', 'f32', 'i16', 'i32', 'u8'], + '_native_batch_norm_legit': ['f32'], + '_softmax_backward_data': ['f32'], + 'abs': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'acos': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'acosh': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], 'add': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'addbmm': ['f32'], + 'addbmm': ['f32', 'i16', 'i32', 'i64', 'u8'], 'addcdiv': ['f32'], 'addcmul': ['f32', 'i16', 'i32', 'i64', 'u8'], - 'addmm': ['f32'], - 'addmv': ['f32'], - 'addr': ['f32'], + 'addmm': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'addmv': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'addr': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'all': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'allclose': ['f16', 'f32'], + 'amax': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'amin': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'aminmax': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'angle': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'any': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'arange': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'argmax': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'argmin': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'amix': ['f32'], - 'asin': ['b8', 'f32', 'i16', 'i32', 'u8'], - 'asinh': ['b8', 'f32', 'i16', 'i32', 'u8'], - 'atan': ['b8', 'f32', 'i16', 'i32', 'u8'], - 'atan2': ['f32'], - 'atanh': ['b8', 'f32', 'i16', 'i32', 'u8'], + 'argsort': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'argwhere': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'as_strided': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'as_strided_scatter': ['b8', + 'f16', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'asin': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'asinh': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'atan': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'atan2': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'atanh': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], 'atleast_1d': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'atleast_2d': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'atleast_3d': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'baddbmm': ['f32'], + 'baddbmm': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'bernoulli': ['f32'], + 'bfloat16': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'bincount': ['i16', 'i32', 'i64', 'u8'], 'bitwise_and': ['b8', 'i16', 'i32', 'i64', 'u8'], 'bitwise_left_shift': ['i16', 'i32', 'i64', 'u8'], 'bitwise_not': ['b8', 'i16', 'i32', 'i64', 'u8'], 'bitwise_or': ['b8', 'i16', 'i32', 'i64', 'u8'], 'bitwise_right_shift': ['i16', 'i32', 'i64', 'u8'], 'bitwise_xor': ['b8', 'i16', 'i32', 'i64', 'u8'], - 'block_diag': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64'], - 'bmm': ['f32'], + 'block_diag': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'bmm': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'bool': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'broadcast_shapes': ['f32'], + 'broadcast_tensors': ['b8', + 'f16', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'broadcast_to': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'bucketize': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'byte': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'cartesian_prod': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'cat': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'ceil': ['f32', 'int32', 'int64', 'f16'], + 'cdist': ['f32'], + 'cdouble': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'ceil': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'cfloat': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'chalf': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'char': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'cholesky': ['f32'], + 'cholesky_inverse': ['f32'], + 'cholesky_solve': ['f32'], 'chunk': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'clamp': ['f32', 'i16', 'i32', 'i64', 'u8'], 'clamp_max': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], @@ -9096,241 +9013,659 @@ class TestConsistency(TestCaseMPS): 'clone': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'column_stack': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'combinations': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'complex': ['f16', 'f32'], 'conj': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'conj_physical': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'constant_pad_nd': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'contiguous': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'corrcoef': ['f32'], - 'cos': ['b8', 'f32', 'i16', 'i32', 'u8', 'i64'], - 'cosh': ['b8', 'f32', 'i16', 'i32', 'u8', 'i64'], - 'cov': ['f32'], - 'cumsum': ['f16', 'f32', 'int16', 'int32'], + 'copysign': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'corrcoef': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'cos': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'cosh': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'count_nonzero': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'cov': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'cross': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'cummax': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'cummin': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'cumprod': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'cumsum': ['f32', 'i16', 'i32', 'i64', 'u8'], 'deg2rad': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'diag': ['f32', 'i32'], - 'diag_embed': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64'], - 'diagflat': ['f32', 'i32'], - 'diagonal_scatter': ['b8', 'u8', 'f16', 'f32', 'i16', 'i32', 'i64'], + 'diag': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'diag_embed': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'diagflat': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'diagonal': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'diagonal_copy': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'diagonal_scatter': ['b8', + 'f16', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], 'diff': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'dist': ['f32'], + 'digamma': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'dist': ['f16', 'f32'], + 'div': ['f16', 'f32', 'u8', 'b8', 'i16', 'i32', 'i64'], 'dot': ['f32', 'i16', 'i32', 'i64', 'u8'], - 'einsum': ['f32'], + 'double': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'dsplit': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'dstack': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'einsum': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'empty': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'empty_like': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'eq': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'equal': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'erf': ['b8', 'f32', 'i16', 'i32', 'u8'], - 'exp': ['b8', 'f32', 'i16', 'i32', 'u8'], - 'exp2': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'], + 'erf': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'erfc': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'erfinv': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'exp': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'exp2': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'expand': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'expand_as': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'expm1': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], 'eye': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fft.fft': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fft.fft2': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fft.fftn': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fft.fftshift': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fft.hfft': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fft.hfft2': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fft.hfftn': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fft.ifft': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fft.ifft2': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fft.ifftn': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fft.ifftshift': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fft.ihfft': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fft.ihfft2': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fft.ihfftn': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fft.irfft': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fft.irfft2': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fft.irfftn': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fft.rfft': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fft.rfft2': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fft.rfftn': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], 'fill': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'flatten': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'flip': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'fliplr': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'flipud': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'flip': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fliplr': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'flipud': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'float': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'floor': ['f32', 'f16', 'i16', 'i32', 'i64'], - 'floor_divide': ['f32', 'f16'], - 'fmod': ['f32', 'f16', 'i16', 'i32', 'i64', 'u8'], + 'float_power': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'floor': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'floor_divide': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fmax': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fmin': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fmod': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'frac': ['f16', 'f32'], + 'frexp': ['f16', 'f32'], + 'full': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'full_like': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'gather': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'gradient': ['f16', 'f32', 'i16'], + 'gcd': ['i16', 'i32', 'i64', 'u8'], 'ge': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'geqrf': ['f32'], + 'gradient': ['f16', 'f32', 'i16', 'i32', 'i64'], + 'grid_sampler_2d': ['f32'], 'gt': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'half': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'heaviside': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'histc': ['f32'], + 'histogram': ['f32'], + 'histogramdd': ['f32'], + 'hsplit': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'hstack': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'hypot': ['f32'], + 'i0': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'igamma': ['f16', 'f32'], + 'igammac': ['f16', 'f32'], + 'index_copy': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'index_fill': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'index_put': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'index_reduce': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'index_select': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'index_add': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'inner': ['f32', 'i16', 'i32', 'i64', 'u8'], 'int': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'isclose': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'isfinite': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'isin': ['f32', 'i16', 'i32', 'i64', 'u8'], 'isinf': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'isnan': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'isneginf': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'isposinf': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'isreal': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'kron': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'linalg.matrix_norm': ['f16'], + 'kthvalue': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'lcm': ['i16', 'i32', 'i64', 'u8'], + 'ldexp': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'le': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'lerp': ['f32'], + 'lgamma': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'linalg.cholesky': ['f32'], + 'linalg.cholesky_ex': ['f32'], + 'linalg.cond': ['f32'], + 'linalg.cross': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'linalg.det': ['f32'], + 'linalg.eig': ['f32'], + 'linalg.eigh': ['f32'], + 'linalg.eigvals': ['f32'], + 'linalg.eigvalsh': ['f32'], + 'linalg.householder_product': ['f32'], + 'linalg.inv': ['f32'], + 'linalg.inv_ex': ['f32'], + 'linalg.ldl_factor': ['f32'], + 'linalg.ldl_factor_ex': ['f32'], + 'linalg.ldl_solve': ['f32'], + 'linalg.lstsq': ['f32'], + 'linalg.lu': ['f32'], + 'linalg.lu_factor': ['f32'], + 'linalg.lu_factor_ex': ['f32'], + 'linalg.lu_solve': ['f32'], + 'linalg.matrix_norm': ['f16', 'f32'], + 'linalg.matrix_power': ['f32'], + 'linalg.matrix_rank': ['f32'], + 'linalg.multi_dot': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'linalg.norm': ['f16', 'f32'], + 'linalg.pinv': ['f32'], + 'linalg.qr': ['f32'], + 'linalg.slogdet': ['f32'], + 'linalg.solve': ['f32'], + 'linalg.solve_ex': ['f32'], + 'linalg.solve_triangular': ['f32'], 'linalg.svd': ['f32'], + 'linalg.svdvals': ['f32'], + 'linalg.tensorinv': ['f32'], + 'linalg.tensorsolve': ['f32'], + 'linalg.vander': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'linalg.vecdot': ['f32'], 'linalg.vector_norm': ['f16', 'f32'], 'linspace': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'log': ['b8', 'f32', 'i16', 'i32', 'u8'], - 'log10': ['b8', 'f32', 'i16', 'i32', 'u8'], - 'log1p': ['b8', 'f32', 'i16', 'i32', 'u8'], - 'log2': ['b8', 'f32', 'i16', 'i32', 'u8'], - 'log_softmax': ['f32'], - 'logaddexp': ['f16', 'f32'], - 'logaddexp2': ['f16', 'f32'], + 'log': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'log10': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'log1p': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'log2': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'log_softmax': ['f32', 'b8', 'f16', 'i16', 'i32', 'i64', 'u8'], + 'logaddexp': ['f32'], + 'logaddexp2': ['f32'], + 'logcumsumexp': ['f32'], + 'logdet': ['f32'], 'logical_and': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'logical_not': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'logical_or': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'logical_xor': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'logit': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], 'logspace': ['f32', 'i16', 'i32', 'i64', 'u8'], - 'logsumexp': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'logsumexp': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'long': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'lt': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'lu': ['f32'], + 'lu_solve': ['f32'], + 'lu_unpack': ['f32'], + 'mH': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'mT': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'masked.amax': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'masked.amin': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'masked.argmax': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'masked.argmin': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'masked.cumprod': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'masked.cumsum': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'masked.log_softmax': ['f32'], + 'masked.logaddexp': ['f32'], + 'masked.logsumexp': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'masked.mean': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'masked.median': ['f32'], + 'masked.norm': ['f16', 'f32'], + 'masked.normalize': ['f16', 'f32'], + 'masked.prod': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'masked.softmax': ['f32'], + 'masked.softmin': ['f32'], + 'masked.std': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'masked.sum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'masked.var': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'masked_fill': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'masked_scatter': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'masked_select': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'matmul': ['f32'], - 'mm': ['f32'], - 'mv': ['f32'], + 'matmul': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'matrix_exp': ['f32'], + 'max': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'max_pool2d_with_indices_backward': ['f32'], + 'maximum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'mean': ['f16', 'f32'], + 'median': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'meshgrid': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'min': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'minimum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'mm': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'mode': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'movedim': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'msort': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'mul': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'multinomial': ['f32'], + 'mv': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'mvlgamma': ['f32', 'i16', 'i32', 'i64', 'u8'], 'nan_to_num': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'neg': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64'], - 'nn.functional.adaptive_max_pool1d': ['f32'], - 'nn.functional.adaptive_max_pool2d': ['f32'], + 'nanmean': ['f16', 'f32'], + 'nanmedian': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'nanquantile': ['f32'], + 'nansum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'narrow': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'narrow_copy': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'native_batch_norm': ['f32'], + 'native_dropout_backward': ['b8', + 'f16', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'native_layer_norm': ['f32'], + 'ne': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'neg': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'new_empty': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'new_empty_strided': ['b8', + 'f16', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'new_full': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'new_ones': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'new_zeros': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'nextafter': ['f32'], + 'nn.functional._scaled_dot_product_attention': ['f32'], 'nn.functional.adaptive_avg_pool1d': ['f32'], 'nn.functional.adaptive_avg_pool2d': ['f32'], + 'nn.functional.adaptive_avg_pool3d': ['f16', 'f32'], + 'nn.functional.adaptive_max_pool1d': ['f32'], + 'nn.functional.adaptive_max_pool2d': ['f32'], + 'nn.functional.adaptive_max_pool3d': ['f32'], + 'nn.functional.alpha_dropout': ['f32'], 'nn.functional.avg_pool1d': ['f32', 'i64'], 'nn.functional.avg_pool2d': ['f32', 'i64'], + 'nn.functional.avg_pool3d': ['f32', 'i64'], + 'nn.functional.batch_norm': ['f32'], + 'nn.functional.bilinear': ['f32', 'i16', 'i32', 'i64', 'u8'], 'nn.functional.binary_cross_entropy': ['f32'], 'nn.functional.binary_cross_entropy_with_logits': ['f32'], 'nn.functional.celu': ['f32'], 'nn.functional.conv1d': ['f32'], 'nn.functional.conv2d': ['f32'], 'nn.functional.conv_transpose1d': ['f32'], - 'nn.functional.cosine_embedding_loss': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'nn.functional.conv_transpose2d': ['f32'], + 'nn.functional.cosine_embedding_loss': ['b8', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], 'nn.functional.cosine_similarity': ['f32'], + 'nn.functional.cross_entropy': ['f32'], + 'nn.functional.ctc_loss': ['f32'], + 'nn.functional.dropout': ['f32'], + 'nn.functional.dropout2d': ['f32'], + 'nn.functional.dropout3d': ['f32'], 'nn.functional.elu': ['f32'], - 'nn.functional.feature_alpha_dropout': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'nn.functional.embedding': ['f16', 'f32'], + 'nn.functional.embedding_bag': ['f16', 'f32'], + 'nn.functional.feature_alpha_dropout': ['b8', + 'f16', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'nn.functional.fractional_max_pool2d': ['f32'], + 'nn.functional.fractional_max_pool3d': ['f32'], 'nn.functional.gaussian_nll_loss': ['f32'], + 'nn.functional.gelu': ['f32'], 'nn.functional.glu': ['f32'], + 'nn.functional.grid_sample': ['f32'], 'nn.functional.group_norm': ['f32'], + 'nn.functional.hardshrink': ['f32'], + 'nn.functional.hardsigmoid': ['f32'], + 'nn.functional.hardswish': ['f32'], 'nn.functional.hardtanh': ['f32', 'i16', 'i32', 'i64'], 'nn.functional.hinge_embedding_loss': ['f32'], 'nn.functional.huber_loss': ['f16', 'f32'], 'nn.functional.instance_norm': ['f32'], - 'nn.functional.kl_div': ['f32', 'i16', 'i32', 'i64'], + 'nn.functional.interpolate': ['f32', 'u8'], + 'nn.functional.kl_div': ['f32'], 'nn.functional.l1_loss': ['f16', 'f32'], + 'nn.functional.layer_norm': ['f32'], 'nn.functional.leaky_relu': ['f32'], - 'nn.functional.linear': ['f32'], - 'nn.functional.local_response_norm': ['f32'], - 'nn.functional.margin_ranking_loss': ['f32', 'i16', 'i32'], + 'nn.functional.linear': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'nn.functional.local_response_norm': ['f32', 'i64'], + 'nn.functional.logsigmoid': ['f32'], + 'nn.functional.margin_ranking_loss': ['f32', + 'i16', + 'i32', + 'i64', + 'u8'], 'nn.functional.max_pool1d': ['f32'], 'nn.functional.max_pool2d': ['f32'], - 'max_pool2d_with_indices_backward': ['f32'], + 'nn.functional.max_pool3d': ['f32'], + 'nn.functional.max_unpool1d': ['f32'], + 'nn.functional.max_unpool2d': ['f32'], + 'nn.functional.max_unpool3d': ['f32'], + 'nn.functional.mish': ['f32'], 'nn.functional.mse_loss': ['f16', 'f32'], + 'nn.functional.multi_margin_loss': ['f32'], + 'nn.functional.multilabel_margin_loss': ['f32'], + 'nn.functional.multilabel_soft_margin_loss': ['f32'], 'nn.functional.nll_loss': ['f32'], - 'nn.functional.pad': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'nn.functional.padconstant': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'nn.functional.padreflect': ['f32'], - 'nn.functional.padreplicate': ['f32'], - 'nn.functional.pairwise_distance': ['f16', 'f32', 'i16', 'i32', 'i64'], - 'nn.functional.poisson_nll_loss': ['f32', 'i16', 'i32', 'u8'], + 'nn.functional.normalize': ['f32'], + 'nn.functional.one_hot': ['i64'], + 'nn.functional.pad': ['b8', + 'f16', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'nn.functional.pairwise_distance': ['f16', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'nn.functional.pdist': ['f32'], + 'nn.functional.pixel_shuffle': ['b8', + 'f16', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'nn.functional.pixel_unshuffle': ['b8', + 'f16', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'nn.functional.poisson_nll_loss': ['f32', + 'i16', + 'i32', + 'i64', + 'u8'], 'nn.functional.prelu': ['f32'], 'nn.functional.relu': ['f32', 'i16', 'i32', 'i64', 'u8'], 'nn.functional.relu6': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'nn.functional.rrelu': ['f32'], 'nn.functional.selu': ['f32'], 'nn.functional.silu': ['f32'], 'nn.functional.smooth_l1_loss': ['f16', 'f32'], 'nn.functional.soft_margin_loss': ['f32'], - 'nn.functional.softmin': ['f32'], - 'nn.functional.softplus': ['f32'], - 'nn.functional.softsign': ['f16', 'f32', 'i16', 'u8'], - 'nn.functional.tanhshrink': ['f32', 'i16', 'i32', 'u8'], + 'nn.functional.softmin': ['f32', 'f16', 'i16', 'i32', 'i64', 'u8'], + 'nn.functional.softshrink': ['f32'], + 'nn.functional.softsign': ['f16', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'nn.functional.tanhshrink': ['f32', 'i16', 'i32', 'i64', 'u8'], 'nn.functional.threshold': ['f32', 'i16', 'i32', 'i64', 'u8'], - 'nn.functional.triplet_margin_loss': ['f32', 'i16', 'i32', 'i64'], - 'nn.functional.triplet_margin_with_distance_loss': ['f32', 'i16', 'i32', 'i64'], + 'nn.functional.triplet_margin_loss': ['f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'nn.functional.triplet_margin_with_distance_loss': ['f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'nn.functional.unfold': ['f16', 'f32'], 'nn.functional.upsample_bilinear': ['f32'], - 'nn.functional.upsample_nearest': ['f32'], + 'nn.functional.upsample_nearest': ['f32', 'u8'], + 'nonzero': ['b8', 'u8', 'f16', 'f32', 'i16', 'i32', 'i64'], 'norm': ['f32', 'f16'], + 'normal': ['f16', 'f32'], + 'ones': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'ones_like': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'ormqr': ['f32'], + 'outer': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'pca_lowrank': ['f32'], + 'permute': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'pinverse': ['f32'], + 'polar': ['f32'], + 'polygamma': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], 'positive': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'pow': ['f16'], + 'pow': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'prod': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'put': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'qr': ['f32'], + 'quantile': ['f32'], 'rad2deg': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'rand_like': ['f16', 'f32'], + 'randint': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'randint_like': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'randn': ['f16', 'f32'], + 'randn_like': ['f16', 'f32'], + 'ravel': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'real': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'reciprocal': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'], - 'remainder' : ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'reciprocal': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'remainder': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'renorm': ['f16', 'f32'], 'repeat': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'repeat_interleave': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'resize_': ['b8', 'i16', 'i32', 'i64', 'u8'], - 'resize_as_': ['b8', 'i16', 'i32', 'i64', 'u8'], + 'repeat_interleave': ['b8', + 'f16', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'reshape': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'reshape_as': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'resize_': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'resize_as_': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'resolve_conj': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'resolve_neg': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'roll': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'rot90': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'round': ['f32', 'f16', 'i16', 'i32', 'i64'], - 'rsqrt': ['b8', 'f32', 'i16', 'i32', 'u8'], + 'round': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'rsqrt': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'rsub': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'scalar_tensor': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'scatter': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'scatter_add': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'select_scatter': ['b8', 'u8', 'f16', 'f32', 'i16', 'i32', 'i64'], + 'scatter_reduce': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'searchsorted': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'segment_reduce': ['f16', 'f32'], + 'select': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'select_scatter': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'sgn': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'short': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'sigmoid': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'], - 'sign': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8', 'i64'], - 'sin': ['b8', 'f32', 'i16', 'i32', 'u8'], - 'sinh': ['b8', 'f32', 'i16', 'i32', 'u8'], - 'slice_scatter': ['b8', 'u8', 'f16', 'f32', 'i16', 'i32', 'i64'], - 'softmax': ['f32'], + 'sigmoid': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'sign': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'signal.windows.bartlett': ['f16', 'f32'], + 'signal.windows.blackman': ['f16', 'f32'], + 'signal.windows.cosine': ['f16', 'f32'], + 'signal.windows.exponential': ['f16', 'f32'], + 'signal.windows.gaussian': ['f16', 'f32'], + 'signal.windows.general_cosine': ['f16', 'f32'], + 'signal.windows.general_hamming': ['f16', 'f32'], + 'signal.windows.hamming': ['f16', 'f32'], + 'signal.windows.hann': ['f16', 'f32'], + 'signal.windows.kaiser': ['f16', 'f32'], + 'signbit': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'sin': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'sinc': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'sinh': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'slice': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'slice_scatter': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'softmax': ['f32', 'b8', 'f16', 'i16', 'i32', 'i64', 'u8'], + 'sort': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'special.airy_ai': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'special.bessel_j0': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'special.bessel_j1': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'special.bessel_y0': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'special.bessel_y1': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'special.chebyshev_polynomial_t': ['b8', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'special.chebyshev_polynomial_u': ['b8', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'special.entr': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'special.erfcx': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'special.hermite_polynomial_h': ['b8', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'special.hermite_polynomial_he': ['b8', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'special.i0e': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'special.i1': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'special.i1e': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'special.laguerre_polynomial_l': ['b8', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'special.log_ndtr': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'special.modified_bessel_i0': ['b8', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'special.modified_bessel_i1': ['b8', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'special.modified_bessel_k0': ['b8', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'special.modified_bessel_k1': ['b8', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], 'special.ndtr': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'special.ndtri': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'special.polygamma': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'special.scaled_modified_bessel_k0': ['b8', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'special.scaled_modified_bessel_k1': ['b8', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'special.spherical_bessel_j0': ['b8', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'special.xlog1py': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'special.zeta': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], 'split': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'sqrt': ['b8', 'f32', 'i16', 'i32', 'u8'], + 'split_with_sizes': ['b8', + 'f16', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'sqrt': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], 'square': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'squeeze': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'stack': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'std': ['f16', 'f32'], + 'std_mean': ['f16', 'f32'], + 'stft': ['f32'], 'sub': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'sum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'sum_to_size': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'svd': ['f32'], + 'svd_lowrank': ['f32'], + 'symeig': ['f32'], 't': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'tan': ['b8', 'i16', 'i32', 'u8'], - 'tanh': ['b8', 'f32', 'i16', 'i32', 'u8'], - 'tensordot': ['f32'], + 'take': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'take_along_dim': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'tan': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'tanh': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], 'tensor_split': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'tile': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'topk': ['f32', 'f16'], - 'trapz': ['f16', 'f32', 'i16', 'i32', 'i64'], - 'sort': ['f32', 'i16', 'i32', 'i64'], - 'argsort': ['f32', 'i16', 'i32', 'i64'], + 'tensordot': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'tile': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'to_sparse': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'topk': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'trace': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'transpose': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'trapezoid': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'cumulative_trapezoid': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'trapz': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'triangular_solve': ['f32'], 'tril': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'tril_indices': ['i32', 'i64'], 'triu': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'triu_indices': ['i32', 'i64'], 'true_divide': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'trunc': ['f32'], + 'trunc': ['f32', 'i16', 'i32', 'i64', 'u8'], 'unbind': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'unflatten': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'unfold': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'unfold_copy': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'uniform': ['f16', 'f32'], + 'unique_consecutive': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], 'unsqueeze': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'var': ['f16', 'f32'], + 'var_mean': ['f16', 'f32'], + 'vdot': ['f32', 'i16', 'i32', 'i64', 'u8'], 'view': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'view_as': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'view_as_complex': ['f16', 'f32'], + 'view_copy': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'vsplit': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'vstack': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'zero_': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'where': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'nonzero': ['b8', 'u8', 'f16', 'f32', 'i16', 'i32', 'i64'], - 'cross': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'linalg.cross': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'unique_consecutive': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'std': ['f16', 'f32'], - 'var': ['f16', 'f32'], - 'amax': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'amin': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'sum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'prod': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'mean': ['f16', 'f32'], - 'count_nonzero': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'masked.amax': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'masked.amin': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'masked.mean': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'masked.prod': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'masked.sum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'native_layer_norm': ['torch.float32'], - 'nn.functional.layer_norm': ['torch.float32'], - 'nn.functional.bilinear': ['f32'], - 'linalg.solve_triangular': ['f32'], - 'triangular_solve': ['f32'], - '_native_batch_norm_legit': ['f32'], - 'native_batch_norm': ['f32'], - 'minreduction_with_dim': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'maxreduction_with_dim': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'linalg.inv': ['f32'], - 'linalg.inv_ex': ['f32'], - 'mH': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'mT': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'T': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'H': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'xlogy': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'zero_': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'zeros': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'zeros_like': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'index_add': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'nn.functional.softplus': ['f32'], } - ALLOWLIST_OP_GRAD = { + 'H': ['f16', 'f32'], + 'T': ['f16', 'f32'], + '__getitem__': ['f16', 'f32'], '__radd__': ['f16', 'f32'], '__rdiv__': ['f16', 'f32'], '__rmatmul__': ['f32'], + '__rmod__': ['f16', 'f32'], '__rmul__': ['f16', 'f32'], - 'masked.log_softmax': ['f32'], - 'masked.logaddexp': ['f32'], - 'masked.softmax': ['f32'], - 'masked.softmin': ['f32'], - 'masked.std': ['f32'], + '__rpow__': ['f32'], + '__rsub__': ['f16', 'f32'], + '_native_batch_norm_legit': ['f32'], + '_softmax_backward_data': ['f32'], 'abs': ['f16', 'f32'], 'acos': ['f32'], 'acosh': ['f32'], @@ -9342,168 +9677,521 @@ class TestConsistency(TestCaseMPS): 'addmv': ['f32'], 'addr': ['f32'], 'all': ['f16', 'f32'], + 'amax': ['f16', 'f32'], + 'amin': ['f16', 'f32'], + 'angle': ['f16', 'f32'], 'any': ['f16', 'f32'], 'arange': ['f16', 'f32'], 'argmax': ['f16', 'f32'], 'argmin': ['f16', 'f32'], + 'argsort': ['f16', 'f32'], + 'argwhere': ['f16', 'f32'], + 'as_strided': ['f16', 'f32'], + 'as_strided_scatter': ['f16', 'f32'], 'asin': ['f32'], 'asinh': ['f32'], 'atan': ['f32'], 'atan2': ['f32'], + 'atanh': ['f32'], 'atleast_1d': ['f16', 'f32'], 'atleast_2d': ['f16', 'f32'], 'atleast_3d': ['f16', 'f32'], 'baddbmm': ['f32'], + 'bernoulli': ['f32'], + 'bfloat16': ['f16', 'f32'], 'block_diag': ['f16', 'f32'], 'bmm': ['f32'], + 'bool': ['f16', 'f32'], 'broadcast_shapes': ['f32'], + 'broadcast_tensors': ['f16', 'f32'], + 'broadcast_to': ['f16', 'f32'], + 'bucketize': ['f16', 'f32'], + 'byte': ['f16', 'f32'], + 'cartesian_prod': ['f16', 'f32'], + 'cat': ['f16', 'f32'], + 'cdist': ['f32'], 'ceil': ['f32'], + 'char': ['f16', 'f32'], + 'cholesky': ['f32'], + 'cholesky_inverse': ['f32'], + 'cholesky_solve': ['f32'], 'chunk': ['f16', 'f32'], + 'clamp': ['f32'], + 'clamp_max': ['f16', 'f32'], + 'clamp_min': ['f16', 'f32'], 'clone': ['f16', 'f32'], 'column_stack': ['f16', 'f32'], + 'combinations': ['f16', 'f32'], 'conj': ['f16', 'f32'], 'conj_physical': ['f16', 'f32'], + 'constant_pad_nd': ['f16', 'f32'], 'contiguous': ['f16', 'f32'], + 'copysign': ['f16', 'f32'], 'corrcoef': ['f32'], 'cos': ['f32'], 'cosh': ['f32'], - 'cumsum': ['f16', 'f32'], + 'count_nonzero': ['f16', 'f32'], + 'cov': ['f32'], + 'cross': ['f32'], + 'cummax': ['f32'], + 'cummin': ['f32'], + 'cumprod': ['f32'], + 'cumsum': ['f32'], + 'cumulative_trapezoid': ['f32'], 'deg2rad': ['f16', 'f32'], - 'diag': ['f32'], + 'diag': ['f16', 'f32'], 'diag_embed': ['f16', 'f32'], - 'diagflat': ['f32'], + 'diagflat': ['f16', 'f32'], + 'diagonal': ['f16', 'f32'], + 'diagonal_copy': ['f16', 'f32'], 'diagonal_scatter': ['f16', 'f32'], 'diff': ['f16', 'f32'], - 'dist': ['f32'], + 'digamma': ['f32'], + 'dist': ['f16', 'f32'], + 'div': ['f16', 'f32'], 'dot': ['f32'], + 'double': ['f16', 'f32'], + 'dsplit': ['f16', 'f32'], + 'dstack': ['f16', 'f32'], 'einsum': ['f32'], + 'empty_like': ['f16', 'f32'], + 'eq': ['f16', 'f32'], 'erf': ['f32'], + 'erfc': ['f32'], + 'erfinv': ['f32'], 'exp': ['f32'], 'exp2': ['f16', 'f32'], + 'expand': ['f16', 'f32'], + 'expand_as': ['f16', 'f32'], + 'expm1': ['f32'], + 'fft.fftshift': ['f16', 'f32'], + 'fft.hfft': ['f32'], + 'fft.hfft2': ['f32'], + 'fft.hfftn': ['f32'], + 'fft.ifftshift': ['f16', 'f32'], + 'fft.irfft': ['f32'], + 'fft.irfft2': ['f32'], + 'fft.irfftn': ['f32'], 'fill': ['f16', 'f32'], 'flatten': ['f16', 'f32'], 'flip': ['f16', 'f32'], 'fliplr': ['f16', 'f32'], 'flipud': ['f16', 'f32'], - 'float': ['f32'], + 'float': ['f16', 'f32'], + 'float_power': ['f16', 'f32'], 'floor': ['f32'], - 'gradient': ['f32'], - 'half': ['f16'], + 'fmax': ['f16', 'f32'], + 'fmin': ['f16', 'f32'], + 'fmod': ['f16', 'f32'], + 'frac': ['f16', 'f32'], + 'frexp': ['f16', 'f32'], + 'full': ['f16', 'f32'], + 'full_like': ['f16', 'f32'], + 'gather': ['f16', 'f32'], + 'ge': ['f16', 'f32'], + 'gradient': ['f16', 'f32'], + 'grid_sampler_2d': ['f32'], + 'gt': ['f16', 'f32'], + 'half': ['f16', 'f32'], + 'histc': ['f32'], + 'hsplit': ['f16', 'f32'], 'hstack': ['f16', 'f32'], - 'index_select': ['f16', 'f32'], + 'hypot': ['f32'], + 'i0': ['f32'], 'index_add': ['f16', 'f32'], + 'index_copy': ['f16', 'f32'], + 'index_fill': ['f16', 'f32'], + 'index_put': ['f16', 'f32'], + 'index_reduce': ['f16', 'f32'], + 'index_select': ['f16', 'f32'], + 'inner': ['f32'], + 'int': ['f16', 'f32'], 'isclose': ['f16', 'f32'], 'isfinite': ['f16', 'f32'], + 'isin': ['f32'], 'isinf': ['f16', 'f32'], 'isnan': ['f16', 'f32'], + 'isneginf': ['f16', 'f32'], + 'isposinf': ['f16', 'f32'], 'isreal': ['f16', 'f32'], - 'kron': ['f32'], - 'linalg.matrix_norm': ['f16'], + 'kron': ['f16', 'f32'], + 'kthvalue': ['f32'], + 'ldexp': ['f16', 'f32'], + 'le': ['f16', 'f32'], + 'lerp': ['f32'], + 'lgamma': ['f32'], + 'linalg.cholesky': ['f32'], + 'linalg.cholesky_ex': ['f32'], + 'linalg.cond': ['f32'], + 'linalg.cross': ['f32'], + 'linalg.det': ['f32'], + 'linalg.eigh': ['f32'], + 'linalg.eigvalsh': ['f32'], + 'linalg.householder_product': ['f32'], + 'linalg.inv': ['f32'], + 'linalg.inv_ex': ['f32'], + 'linalg.ldl_factor': ['f32'], + 'linalg.ldl_factor_ex': ['f32'], + 'linalg.lstsq': ['f32'], + 'linalg.lu': ['f32'], + 'linalg.lu_factor': ['f32'], + 'linalg.lu_factor_ex': ['f32'], + 'linalg.lu_solve': ['f32'], + 'linalg.matrix_norm': ['f16', 'f32'], + 'linalg.matrix_power': ['f32'], + 'linalg.matrix_rank': ['f32'], + 'linalg.multi_dot': ['f32'], + 'linalg.norm': ['f16', 'f32'], + 'linalg.pinv': ['f32'], + 'linalg.qr': ['f32'], + 'linalg.slogdet': ['f32'], + 'linalg.solve': ['f32'], + 'linalg.solve_ex': ['f32'], + 'linalg.solve_triangular': ['f32'], 'linalg.svd': ['f32'], + 'linalg.svdvals': ['f32'], + 'linalg.tensorinv': ['f32'], + 'linalg.tensorsolve': ['f32'], + 'linalg.vander': ['f32'], + 'linalg.vecdot': ['f32'], + 'linalg.vector_norm': ['f16', 'f32'], 'linspace': ['f16', 'f32'], 'log': ['f32'], 'log10': ['f32'], 'log1p': ['f32'], 'log2': ['f32'], - 'log_softmax': ['f32'], + 'log_softmax': ['f32', 'f16'], 'logaddexp': ['f32'], + 'logaddexp2': ['f32'], + 'logcumsumexp': ['f32'], + 'logdet': ['f32'], + 'logical_and': ['f16', 'f32'], 'logical_not': ['f16', 'f32'], + 'logical_or': ['f16', 'f32'], + 'logical_xor': ['f16', 'f32'], + 'logit': ['f32'], 'logspace': ['f32'], + 'logsumexp': ['f32'], + 'long': ['f16', 'f32'], + 'lt': ['f16', 'f32'], + 'lu': ['f32'], + 'lu_solve': ['f32'], + 'lu_unpack': ['f32'], + 'mH': ['f16', 'f32'], + 'mT': ['f16', 'f32'], + 'masked.amax': ['f16', 'f32'], + 'masked.amin': ['f16', 'f32'], + 'masked.argmax': ['f16', 'f32'], + 'masked.argmin': ['f16', 'f32'], + 'masked.cumprod': ['f32'], + 'masked.cumsum': ['f32'], + 'masked.log_softmax': ['f32'], + 'masked.logaddexp': ['f32'], + 'masked.logsumexp': ['f32'], + 'masked.mean': ['f16', 'f32'], + 'masked.median': ['f32'], + 'masked.norm': ['f16', 'f32'], + 'masked.normalize': ['f16', 'f32'], + 'masked.prod': ['f32'], + 'masked.softmax': ['f32'], + 'masked.softmin': ['f32'], + 'masked.std': ['f32'], + 'masked.sum': ['f16', 'f32'], + 'masked.var': ['f16', 'f32'], + 'masked_fill': ['f16', 'f32'], + 'masked_scatter': ['f16', 'f32'], + 'masked_select': ['f16', 'f32'], 'matmul': ['f32'], + 'matrix_exp': ['f32'], + 'max': ['f16', 'f32'], + 'max_pool2d_with_indices_backward': ['f32'], + 'maximum': ['f16', 'f32'], + 'mean': ['f16', 'f32'], + 'median': ['f32'], + 'meshgrid': ['f16', 'f32'], + 'min': ['f16', 'f32'], + 'minimum': ['f16', 'f32'], 'mm': ['f32'], + 'mode': ['f16', 'f32'], + 'movedim': ['f16', 'f32'], + 'msort': ['f16', 'f32'], + 'mul': ['f16', 'f32'], + 'multinomial': ['f32'], 'mv': ['f32'], + 'mvlgamma': ['f32'], + 'nan_to_num': ['f16', 'f32'], + 'nanmean': ['f16', 'f32'], + 'nanmedian': ['f32'], + 'nanquantile': ['f32'], + 'nansum': ['f16', 'f32'], + 'narrow': ['f16', 'f32'], + 'native_batch_norm': ['f32'], + 'native_dropout_backward': ['f16', 'f32'], + 'native_layer_norm': ['f32'], + 'ne': ['f16', 'f32'], 'neg': ['f16', 'f32'], - 'nn.functional.adaptive_max_pool1d': ['f32'], - 'nn.functional.adaptive_max_pool2d': ['f32'], + 'new_empty': ['f16', 'f32'], + 'new_empty_strided': ['f16', 'f32'], + 'new_full': ['f16', 'f32'], + 'new_ones': ['f16', 'f32'], + 'new_zeros': ['f16', 'f32'], + 'nn.functional._scaled_dot_product_attention': ['f32'], 'nn.functional.adaptive_avg_pool1d': ['f32'], 'nn.functional.adaptive_avg_pool2d': ['f32'], + 'nn.functional.adaptive_avg_pool3d': ['f16', 'f32'], + 'nn.functional.adaptive_max_pool1d': ['f32'], + 'nn.functional.adaptive_max_pool2d': ['f32'], + 'nn.functional.adaptive_max_pool3d': ['f32'], + 'nn.functional.alpha_dropout': ['f32'], 'nn.functional.avg_pool1d': ['f32'], 'nn.functional.avg_pool2d': ['f32'], + 'nn.functional.avg_pool3d': ['f32'], + 'nn.functional.batch_norm': ['f32'], + 'nn.functional.bilinear': ['f32'], 'nn.functional.binary_cross_entropy': ['f32'], + 'nn.functional.binary_cross_entropy_with_logits': ['f32'], 'nn.functional.celu': ['f32'], 'nn.functional.conv1d': ['f32'], 'nn.functional.conv2d': ['f32'], 'nn.functional.conv_transpose1d': ['f32'], + 'nn.functional.conv_transpose2d': ['f32'], + 'nn.functional.conv_transpose3d': ['f32'], 'nn.functional.cosine_embedding_loss': ['f32'], + 'nn.functional.cosine_similarity': ['f32'], + 'nn.functional.cross_entropy': ['f32'], + 'nn.functional.ctc_loss': ['f32'], + 'nn.functional.dropout': ['f32'], + 'nn.functional.dropout2d': ['f32'], + 'nn.functional.dropout3d': ['f32'], 'nn.functional.elu': ['f32'], - 'nn.functional.feature_alpha_dropout': ['f16', 'f32'], + 'nn.functional.embedding': ['f16', 'f32'], + 'nn.functional.embedding_bag': ['f16', 'f32'], + 'nn.functional.feature_alpha_dropout': ['f32', 'f16'], + 'nn.functional.fractional_max_pool2d': ['f32'], + 'nn.functional.fractional_max_pool3d': ['f32'], + 'nn.functional.gaussian_nll_loss': ['f32'], + 'nn.functional.gelu': ['f32'], 'nn.functional.glu': ['f32'], + 'nn.functional.grid_sample': ['f32'], + 'nn.functional.group_norm': ['f32'], + 'nn.functional.hardshrink': ['f32'], + 'nn.functional.hardsigmoid': ['f32'], + 'nn.functional.hardswish': ['f32'], 'nn.functional.hardtanh': ['f32'], 'nn.functional.hinge_embedding_loss': ['f32'], 'nn.functional.huber_loss': ['f16', 'f32'], 'nn.functional.instance_norm': ['f32'], + 'nn.functional.interpolate': ['f32'], 'nn.functional.kl_div': ['f32'], 'nn.functional.l1_loss': ['f16', 'f32'], + 'nn.functional.layer_norm': ['f32'], 'nn.functional.leaky_relu': ['f32'], + 'nn.functional.linear': ['f32'], 'nn.functional.local_response_norm': ['f32'], + 'nn.functional.logsigmoid': ['f32'], 'nn.functional.margin_ranking_loss': ['f32'], 'nn.functional.max_pool1d': ['f32'], 'nn.functional.max_pool2d': ['f32'], + 'nn.functional.max_pool3d': ['f32'], + 'nn.functional.max_unpool1d': ['f32'], + 'nn.functional.max_unpool2d': ['f32'], + 'nn.functional.max_unpool3d': ['f32'], + 'nn.functional.mish': ['f32'], 'nn.functional.mse_loss': ['f32'], + 'nn.functional.multi_margin_loss': ['f32'], + 'nn.functional.multilabel_margin_loss': ['f32'], + 'nn.functional.multilabel_soft_margin_loss': ['f32'], 'nn.functional.nll_loss': ['f32'], - 'nn.functional.pad': ['f16', 'f32', 'i16', 'i32', 'i64'], + 'nn.functional.normalize': ['f32'], + 'nn.functional.pad': ['f16', 'f32'], 'nn.functional.pairwise_distance': ['f16', 'f32'], + 'nn.functional.pdist': ['f32'], + 'nn.functional.pixel_shuffle': ['f16', 'f32'], + 'nn.functional.pixel_unshuffle': ['f16', 'f32'], 'nn.functional.poisson_nll_loss': ['f32'], + 'nn.functional.prelu': ['f32'], 'nn.functional.relu': ['f32'], 'nn.functional.relu6': ['f32'], + 'nn.functional.rrelu': ['f32'], 'nn.functional.selu': ['f32'], 'nn.functional.silu': ['f32'], + 'nn.functional.smooth_l1_loss': ['f32'], 'nn.functional.soft_margin_loss': ['f32'], - 'nn.functional.softmin': ['f32'], + 'nn.functional.softmin': ['f32', 'f16'], 'nn.functional.softplus': ['f32'], + 'nn.functional.softshrink': ['f32'], 'nn.functional.softsign': ['f16', 'f32'], - 'nn.functional.smooth_l1_loss': ['f32'], + 'nn.functional.tanhshrink': ['f32'], 'nn.functional.threshold': ['f32'], 'nn.functional.triplet_margin_loss': ['f32'], 'nn.functional.triplet_margin_with_distance_loss': ['f32'], + 'nn.functional.unfold': ['f16', 'f32'], 'nn.functional.upsample_bilinear': ['f32'], - 'norm': ['f32', 'f16'], + 'nn.functional.upsample_nearest': ['f32'], + 'nonzero': ['f16', 'f32'], + 'norm': ['f16', 'f32'], + 'normal': ['f16', 'f32'], + 'ones': ['f16', 'f32'], + 'ones_like': ['f16', 'f32'], + 'ormqr': ['f32'], + 'outer': ['f16', 'f32'], + 'pca_lowrank': ['f32'], + 'permute': ['f16', 'f32'], + 'pinverse': ['f32'], + 'polygamma': ['f32'], 'positive': ['f16', 'f32'], + 'pow': ['f32'], + 'prod': ['f32'], + 'put': ['f16', 'f32'], + 'qr': ['f32'], + 'quantile': ['f32'], 'rad2deg': ['f16', 'f32'], + 'rand_like': ['f16', 'f32'], + 'randint': ['f16', 'f32'], + 'randint_like': ['f16', 'f32'], + 'randn_like': ['f16', 'f32'], + 'ravel': ['f16', 'f32'], 'real': ['f16', 'f32'], 'reciprocal': ['f16', 'f32'], + 'remainder': ['f16', 'f32'], + 'renorm': ['f16', 'f32'], 'repeat': ['f16', 'f32'], 'repeat_interleave': ['f16', 'f32'], + 'reshape': ['f16', 'f32'], + 'reshape_as': ['f16', 'f32'], 'resolve_conj': ['f16', 'f32'], 'resolve_neg': ['f16', 'f32'], + 'roll': ['f16', 'f32'], + 'rot90': ['f16', 'f32'], 'round': ['f32'], 'rsqrt': ['f32'], + 'rsub': ['f16', 'f32'], + 'scatter': ['f16', 'f32'], + 'scatter_add': ['f16', 'f32'], + 'scatter_reduce': ['f16', 'f32'], + 'searchsorted': ['f16', 'f32'], + 'segment_reduce': ['f16', 'f32'], + 'select': ['f16', 'f32'], 'select_scatter': ['f16', 'f32'], + 'sgn': ['f16', 'f32'], + 'short': ['f16', 'f32'], + 'sigmoid': ['f32'], 'sign': ['f16', 'f32'], + 'signbit': ['f16', 'f32'], 'sin': ['f32'], + 'sinc': ['f32'], 'sinh': ['f32'], + 'slice': ['f16', 'f32'], 'slice_scatter': ['f16', 'f32'], - 'softmax': ['f32'], + 'softmax': ['f32', 'f16'], + 'sort': ['f16', 'f32'], + 'special.airy_ai': ['f32'], + 'special.bessel_j0': ['f32'], + 'special.bessel_j1': ['f32'], + 'special.bessel_y0': ['f32'], + 'special.bessel_y1': ['f32'], + 'special.chebyshev_polynomial_t': ['f32'], + 'special.chebyshev_polynomial_u': ['f32'], + 'special.entr': ['f32'], + 'special.erfcx': ['f32'], + 'special.hermite_polynomial_h': ['f32'], + 'special.hermite_polynomial_he': ['f32'], + 'special.i0e': ['f32'], + 'special.i1': ['f32'], + 'special.i1e': ['f32'], + 'special.laguerre_polynomial_l': ['f32'], + 'special.log_ndtr': ['f32'], + 'special.modified_bessel_i0': ['f32'], + 'special.modified_bessel_i1': ['f32'], + 'special.modified_bessel_k0': ['f32'], + 'special.modified_bessel_k1': ['f32'], + 'special.ndtr': ['f32'], + 'special.ndtri': ['f32'], + 'special.polygamma': ['f32'], + 'special.scaled_modified_bessel_k0': ['f32'], + 'special.scaled_modified_bessel_k1': ['f32'], + 'special.spherical_bessel_j0': ['f32'], + 'special.xlog1py': ['f16', 'f32'], 'split': ['f16', 'f32'], + 'split_with_sizes': ['f16', 'f32'], 'sqrt': ['f32'], 'square': ['f16', 'f32'], 'squeeze': ['f16', 'f32'], 'stack': ['f16', 'f32'], - 'sub': ['f32'], + 'std': ['f16', 'f32'], + 'std_mean': ['f16', 'f32'], + 'sub': ['f16', 'f32'], + 'sum': ['f16', 'f32'], 'sum_to_size': ['f16', 'f32'], 'svd': ['f32'], + 'svd_lowrank': ['f32'], + 'symeig': ['f32'], 't': ['f16', 'f32'], + 'take': ['f16', 'f32'], + 'take_along_dim': ['f16', 'f32'], + 'tan': ['f32'], 'tanh': ['f32'], + 'tensor_split': ['f16', 'f32'], 'tensordot': ['f32'], 'tile': ['f16', 'f32'], + 'to': ['f16', 'f32'], + 'topk': ['f32'], + 'trace': ['f32'], + 'transpose': ['f16', 'f32'], + 'trapezoid': ['f16', 'f32'], + 'trapz': ['f16', 'f32'], + 'triangular_solve': ['f32'], 'tril': ['f16', 'f32'], 'triu': ['f16', 'f32'], 'true_divide': ['f16', 'f32'], 'trunc': ['f32'], 'unbind': ['f16', 'f32'], 'unflatten': ['f16', 'f32'], + 'unfold': ['f16', 'f32'], + 'unfold_copy': ['f16', 'f32'], + 'uniform': ['f16', 'f32'], 'unsqueeze': ['f16', 'f32'], + 'var': ['f16', 'f32'], + 'var_mean': ['f16', 'f32'], + 'vdot': ['f32'], 'view': ['f16', 'f32'], 'view_as': ['f16', 'f32'], + 'view_copy': ['f16', 'f32'], 'vsplit': ['f16', 'f32'], 'vstack': ['f16', 'f32'], + 'where': ['f16', 'f32'], + 'xlogy': ['f16', 'f32'], 'zero_': ['f16', 'f32'], - 'linalg.solve_triangular': ['f32'], - 'triangular_solve': ['f32'], - '_native_batch_norm_legit': ['f32'], - 'native_batch_norm': ['f32'], - 'native_layer_norm': ['f32'], - 'nn.functional.gelu': ['f32'], + 'zeros': ['f16', 'f32'], + 'zeros_like': ['f16', 'f32'], + } + + BLOCKLIST_OP_GRAD = { + # Unimplemented ops + '__getitem__': ['f16'], + 'combinations': ['f16', 'f32'], + 'logaddexp2': ['f32'], + 'masked_select': ['f16', 'f32'], + 'nn.functional.binary_cross_entropy_with_logits': ['f16', 'f32'], + 'nn.functional.group_norm': ['f32'], + 'prod': ['f32'], + 'sgn': ['f16', 'f32'], + 'unfold_copy': ['f16', 'f32'], + 'unfold': ['f16', 'f32'], + 'trace': ['f32'], + + # Correctness issues + 'atanh': ['f32'], + 'div': ['f16'], + + # Unsupported dtype + 'special.ndtr': ['f32'], + 'trapezoid': ['f16', 'f32'], + 'trapz': ['f16', 'f32'], + } + + BLOCKLIST_OP_GRAD_MACOS_12 = { + 'remainder': ['f16'], } # These ops that are problematic. So never run them even when @@ -9511,124 +10199,388 @@ class TestConsistency(TestCaseMPS): # If the dtype list is None, all dtypes are excluded. # All the entries in this list should be removed BLOCKLIST = { - # Functions that hang - 'masked_fill': [torch.bool, torch.uint8, torch.float32], 'where': [torch.bool], - # + forward when requires_grad=True or running backward - 'masked.mean': [torch.bool, torch.float16], - 'masked.prod': [torch.bool], - 'masked.sum': [torch.bool], - # Functions that hard crash - 'std': [torch.float16], - 'stft': [torch.float32], 'var': [torch.float16], - # + forward when requires_grad=True or running backward - 'nn.functional.embedding': [torch.float32, torch.float16], - '__rpow__': [torch.int64], - - 'as_strided_scatter': [torch.uint8], - 'atan2': [torch.int64], - 'bfloat16': None, - 'block_diag': [torch.uint8], - 'byte': None, - 'chalf': None, - 'diag_embed': [torch.uint8], - 'diagonal_scatter': [torch.uint8], - 'long': None, + 'resize_': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'resize_as_': [torch.float16, torch.float32], + 'topk': [torch.int16, torch.int32, torch.int64, torch.uint8], + + # Functions with correctness issues + 'multinomial': [torch.float32], + + # cpu result off, showing random values + 'as_stridedpartial_views': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + # cpu result off, showing inf values + 'dist': [torch.float16], + + # failure due to issue: atan2() may generate NAN in output with + 'atan2': [torch.bool, torch.int16, torch.int32, torch.uint8], + + # Unsupported Border padding mode + 'grid_sampler_2d': [torch.float32], + 'nn.functional.grid_sample': [torch.float32], + + # failures due to issue #102048039: powerWithPrimaryTensor() with integer input may return wrong results + 'pow': [torch.int16, torch.int32, torch.int64, torch.uint8], + '__rpow__': [torch.uint8], + + # failures before macOS 13.3 + 'nn.functional.conv_transpose2d': [torch.float32], + } + + UNIMPLEMENTED_OPS = { + # Failures due to lack of op implementation on MPS backend + 'linalg.eig': [torch.float32], + 'linalg.eigvals': [torch.float32], + 'fft.fft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.ifft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.ihfft2': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.ihfft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.ihfftn': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.rfft2': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.rfft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.rfftn': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'put': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'stft': [torch.float32], + 'nn.functional.conv_transpose3d': [torch.int64, torch.float32], + 'rounddecimals_neg_3': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'rounddecimals_3': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'rounddecimals_0': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + '__rmod__': [torch.float16, torch.float32], + '__rsub__': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'aminmax': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'angle': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'argsort': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'bucketize': [torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'cholesky': [torch.float32], + 'cholesky_inverse': [torch.float32], + 'cholesky_solve': [torch.float32], + 'copysign': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'cummax': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'cummin': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'cumprod': [torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'digamma': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'erfc': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'erfinv': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fmax': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fmin': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fmod': [torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'frexp': [torch.float16, torch.float32], + 'gcd': [torch.int16, torch.int32, torch.int64, torch.uint8], + 'geqrf': [torch.float32], + 'heaviside': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'histc': [torch.float32], + 'histogram': [torch.float32], + 'histogramdd': [torch.float32], + 'hypot': [torch.float32], + 'i0': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'igamma': [torch.float16, torch.float32], + 'igammac': [torch.float16, torch.float32], + 'index_copy': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'index_fill': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'index_reduce': [torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'isin': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'isneginf': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'isposinf': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'kthvalue': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'lcm': [torch.int16, torch.int32, torch.int64, torch.uint8], + 'ldexp': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'lerp': [torch.float32], + 'lgamma': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'linalg.cholesky': [torch.float32], + 'linalg.cholesky_ex': [torch.float32], + 'linalg.cond': [torch.float32], + 'linalg.detsingular': [torch.float32], + 'linalg.det': [torch.float32], + 'linalg.eigh': [torch.float32], + 'linalg.eigvalsh': [torch.float32], + 'linalg.householder_product': [torch.float32], + 'linalg.ldl_factor': [torch.float32], + 'linalg.ldl_factor_ex': [torch.float32], + 'linalg.ldl_solve': [torch.float32], + 'linalg.lstsq': [torch.float32], + 'linalg.lstsqgrad_oriented': [torch.float32], + 'linalg.lu': [torch.float32], + 'linalg.lu_factor': [torch.float32], + 'linalg.lu_factor_ex': [torch.float32], + 'linalg.lu_solve': [torch.float32], + 'linalg.matrix_norm': [torch.float32], + 'linalg.norm': [torch.float32], + 'linalg.normsubgradients_at_zero': [torch.float32], + 'linalg.qr': [torch.float32], + 'linalg.slogdet': [torch.float32], + 'linalg.solve': [torch.float32], + 'linalg.solve_ex': [torch.float32], + 'linalg.svdvals': [torch.float32], + 'linalg.tensorsolve': [torch.float32], + 'linalg.vander': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'linalg.vecdot': [torch.float32], + 'logcumsumexp': [torch.float32], + 'logdet': [torch.float32], + 'logit': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'lu': [torch.float32], + 'lu_solve': [torch.float32], + 'lu_unpack': [torch.float32], + 'masked.cumprod': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'masked.median': [torch.float32], + 'masked_scatter': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'matrix_exp': [torch.float32], + 'mode': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'msort': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'mvlgamma': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'mvlgammamvlgamma_p_1': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'mvlgammamvlgamma_p_3': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'mvlgammamvlgamma_p_5': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'nanquantile': [torch.float32], + 'nanmean': [torch.float32, torch.float16], + 'nanmedian': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'nansum': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'native_dropout_backward': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'nextafter': [torch.float32], + 'normnuc': [torch.float32], + 'nn.functional._scaled_dot_product_attention': [torch.float32], + 'nn.functional.fractional_max_pool2d': [torch.float32], + 'nn.functional.fractional_max_pool3d': [torch.float32], + 'nn.functional.adaptive_avg_pool3d': [torch.float16, torch.float32], + 'nn.functional.adaptive_max_pool3d': [torch.float32], + 'nn.functional.interpolatearea': [torch.float32], + 'nn.functional.interpolatebicubic': [torch.float32], + 'nn.functional.interpolatelinear': [torch.float32], + 'nn.functional.interpolatetrilinear': [torch.float32], + 'nn.functional.max_unpool1dgrad': [torch.float32], + 'nn.functional.max_unpool2dgrad': [torch.float32], + 'nn.functional.max_unpool3dgrad': [torch.float32], + 'nn.functional.avg_pool3d': [torch.float32, torch.int64], + 'nn.functional.ctc_loss': [torch.float32], + 'nn.functional.embedding_bag': [torch.float16, torch.float32], + 'nn.functional.hardshrink': [torch.float32], + 'nn.functional.hardsigmoid': [torch.float32], + 'nn.functional.logsigmoid': [torch.float32], + 'nn.functional.max_pool3d': [torch.float32], + 'nn.functional.max_unpool1d': [torch.float32], + 'nn.functional.max_unpool2d': [torch.float32], + 'nn.functional.max_unpool3d': [torch.float32], + 'nn.functional.mish': [torch.float32], + 'nn.functional.multi_margin_loss': [torch.float32], + 'nn.functional.multilabel_margin_loss': [torch.float32], + 'nn.functional.multilabel_soft_margin_loss': [torch.float32], + 'nn.functional.pdist': [torch.float32], + 'nn.functional.rrelu': [torch.float32], + 'nn.functional.softshrink': [torch.float32], + 'nn.functional.unfold': [torch.float16, torch.float32], + 'nn.functional.norm': [torch.float32], + 'ormqr': [torch.float32], + 'pca_lowrank': [torch.float32], + 'pinverse': [torch.float32], + 'polar': [torch.float32], + 'polygamma': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'polygammapolygamma_n_0': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'polygammapolygamma_n_1': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'polygammapolygamma_n_2': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'polygammapolygamma_n_3': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'polygammapolygamma_n_4': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'qr': [torch.float32], + 'quantile': [torch.float32], + 'renorm': [torch.float16, torch.float32], + 'roll': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'rsub': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'scatter_reduceamax': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'scatter_reduceamin': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'scatter_reducemin': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'scatter_reducemean': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'scatter_reduceprod': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'scatter_reducesum': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'searchsorted': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'segment_reduce': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'segment_reduceoffsets': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'segment_reducelengths': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'sinc': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'sort': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.airy_ai': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.bessel_j0': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.bessel_j1': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.bessel_y0': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.bessel_y1': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.chebyshev_polynomial_t': [torch.bool, + torch.float16, + torch.float32, + torch.int16, + torch.int32, + torch.int64, + torch.uint8], + 'special.chebyshev_polynomial_u': [torch.bool, + torch.float16, + torch.float32, + torch.int16, + torch.int32, + torch.int64, + torch.uint8], + 'special.entr': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.erfcx': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.hermite_polynomial_h': [torch.bool, + torch.float16, + torch.float32, + torch.int16, + torch.int32, + torch.int64, + torch.uint8], + 'special.hermite_polynomial_he': [torch.bool, + torch.float16, + torch.float32, + torch.int16, + torch.int32, + torch.int64, + torch.uint8], + 'special.i0e': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.i1': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.i1e': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.laguerre_polynomial_l': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.log_ndtr': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.modified_bessel_i0': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.modified_bessel_i1': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.modified_bessel_k0': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.modified_bessel_k1': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.ndtri': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.polygamma': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.polygammaspecial_polygamma_n_0': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.scaled_modified_bessel_k0': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.scaled_modified_bessel_k1': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.spherical_bessel_j0': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.xlog1py': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.zeta': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'std_mean': [torch.float16, torch.float32], + 'std_meanunbiased': [torch.float16, torch.float32], + 'svd_lowrank': [torch.float32], + 'symeig': [torch.float32], + 'take': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'to_sparse': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'unique': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'var_mean': [torch.float16, torch.float32], + 'var_meanunbiased': [torch.float16, torch.float32], + 'vdot': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'view_as_complex': [torch.float16, torch.float32], + 'xlogy': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + } + + EXPECTED_FAILURES = { + # Failures due to unsupported data types on MPS backend + 'bfloat16': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'chalf': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], 'nn.functional.conv1d': [torch.int64], 'nn.functional.conv2d': [torch.int64], 'nn.functional.conv_transpose1d': [torch.int64], - 'nn.functional.conv_transpose2d': [torch.int64], - 'nn.functional.conv_transpose3d': [torch.int64, torch.float32], - 'nn.functional.local_response_norm': [torch.int64], - 'nn.functional.padcircular': [torch.uint8], - 'pow': [torch.int64], - 'select_scatter': [torch.uint8], - 'sigmoid': [torch.int64], - - - # failures due to lack of op implementation on MPS backend - 'put': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], - - # These were moved from ALLOWLIST to BLOCK as they are not working - # locally - 'tile': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], - '__radd__': ['torch.bool', 'torch.uint8'], - '__rmul__': ['torch.uint8'], - 'neg': ['torch.uint8'], - 'add': ['torch.bool', 'torch.uint8'], - 'addr': ['torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], - 'diag': ['torch.int64'], - 'diagflat': ['torch.int64'], - - # Functions that are flaky - # These are detected as "ok" by the expect case but actually fail to run sometimes - 'as_strided': None, - 'broadcast_tensors': None, - 'broadcast': None, - 'broadcast_to': None, - 'diagonal': None, - 'divfloor_rounding': None, - 'divno_rounding_mode': None, - 'divtrunc_rounding': None, - 'dsplit': None, - 'hsplit': None, - 'empty': None, - 'expand_as': None, - 'expand': None, - 'ge': None, - 'ne': None, - 'le': None, - 'lt': None, - 'gt': None, - 'transpose': None, - 'splitlist_args': None, - 'select': None, - 'reshape': None, - 'reshape_as': None, - 'permute': None, - 'norm': None, - 'nn.functional.pixel_unshuffle': None, - 'nn.functional.pixel_shuffle': None, - 'nn.functional.cross_entropy': None, - 'nn.functional.one_hot': None, - 'narrow': None, - 'movedim': None, - 'minreduction_with_dim': None, - 'minreduction_no_dim': None, - 'minbinary': None, - 'meshgridvariadic_tensors': None, - 'meshgridlist_of_tensors': None, - 'maxreduction_with_dim': None, - 'maxreduction_no_dim': None, - 'maxbinary': None, - 'maximum': None, - 'minimum': None, - 'outer': None, - 'softmaxwith_dtype': None, - 'rounddecimals_neg_3': None, - 'rounddecimals_3': None, - 'rounddecimals_0': None, - 'normnuc': None, - 'nn.functional.softminwith_dtype': None, - 'nn.functional.feature_alpha_dropoutwith_train': None, - 'log_softmaxwith_dtype': None, - 'split_with_sizes': None, - 'trapezoid': None, - 'eq': None, - 'mul': None, - 'cartesian_prod': None, - 'bool': None, - 'inner': None, - 'dstack': None, - 'take_along_dim': None, + 'nn.functional.softminwith_dtype': [torch.bool, + torch.float16, + torch.float32, + torch.int16, + torch.int32, + torch.int64, + torch.uint8], + 'log_softmaxwith_dtype': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'softmaxwith_dtype': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + '__rmatmul__': [torch.int16, torch.int32, torch.uint8], + 'addmmdecomposed': [torch.int16, torch.int32, torch.int64, torch.uint8], + 'addbmm': [torch.int16, torch.int32, torch.int64, torch.uint8], + 'addmm': [torch.int16, torch.int32, torch.int64, torch.uint8], + 'addr': [torch.int16, torch.int32, torch.int64, torch.uint8], + 'addmv': [torch.int16, torch.int32, torch.int64, torch.uint8], + 'baddbmm': [torch.int16, torch.int32, torch.int64, torch.uint8], + 'bmm': [torch.int16, torch.int32, torch.int64, torch.uint8], + 'cdouble': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'cfloat': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'complex': [torch.float16, torch.float32], + 'double': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'einsum': [torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.fft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.fft2': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.fftn': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.fftshift': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.hfft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.hfft2': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.hfftn': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.ifft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.ifft2': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.ifftn': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.ifftshift': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.ihfft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.ihfft2': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.ihfftn': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.irfft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.irfft2': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.irfftn': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.rfft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'float_power': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'full': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'full_like': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'inner': [torch.int16, torch.int32, torch.int64, torch.uint8], + 'linalg.matrix_rank': [torch.float32], + 'linalg.matrix_rankhermitian': [torch.float32], + 'linalg.multi_dot': [torch.int16, torch.int32, torch.int64, torch.uint8], + 'linalg.pinv': [torch.float32], + 'linalg.pinvhermitian': [torch.float32], + 'log_softmax': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'matmul': [torch.int16, torch.int32, torch.int64, torch.uint8], # MPS device does not support mm for non-float inputs + 'mm': [torch.int16, torch.int32, torch.int64, torch.uint8], + 'mv': [torch.int16, torch.int32, torch.int64, torch.uint8], + 'new_full': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'new_ones': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'new_zeros': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'nn.functional.batch_norm': [torch.float32], + 'nn.functional.bilinear': [torch.int16, torch.int32, torch.int64, torch.uint8], + 'nn.functional.linear': [torch.int16, torch.int32, torch.int64, torch.uint8], + 'nn.functional.softmin': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'ones_like': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'signal.windows.blackman': [torch.float16], + 'signal.windows.cosine': [torch.float16], + 'signal.windows.exponential': [torch.float16], + 'signal.windows.gaussian': [torch.float16], + 'signal.windows.general_cosine': [torch.float16], + 'signal.windows.general_hamming': [torch.float16], + 'signal.windows.hamming': [torch.float16], + 'signal.windows.hann': [torch.float16], + 'signal.windows.kaiser': [torch.float16], + 'stft': [torch.float32], + 'tensordot': [torch.int16, torch.int32, torch.int64, torch.uint8], + 'zeros_like': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'bincount': [torch.int16, torch.int32, torch.int64, torch.uint8], } - # Those ops worked on MacOS12, but broken on MacOS13 - VENTURA_BLOCKLIST = { - 'masked.softmax': [torch.float32], + UNDEFINED_BEHAVIOUR = { + # Failures due to random output that they generate using + # Philox engine causing mismatch with CPU results + 'uniform': [torch.float16, torch.float32], + 'rand_like': [torch.float16, torch.float32], + 'randint_like': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'randn_like': [torch.float16, torch.float32], + 'bernoulli': [torch.float32], + 'nn.functional.feature_alpha_dropoutwith_train': [torch.float32], + 'normal': [torch.float16, torch.float32, torch.float16, torch.float32], + 'normalnumber_mean': [torch.float16, torch.float32], + 'nn.functional.alpha_dropout': [torch.float32], + 'nn.functional.dropout': [torch.float32], + 'nn.functional.dropout2d': [torch.float32], + 'nn.functional.dropout3d': [torch.float32], + # these fill tensors with uninitialized data, causing mismatch with CPU + 'new_empty': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'empty_like': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'empty': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'new_empty_strided': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + # problem 103190467, as_strided_scatter has non-deterministic behavior when the update indices are not unique + 'as_strided_scatter': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + # duplicate indices are used in the testcase - undefined behaviour + 'index_put': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + # problem 104760543, zero to negative integer powers are undefined + '__rpow__': [torch.int16, torch.int32, torch.int64], + } + + FAST_MATH_PRECISION_ISSUES = { + # Failures due to precision issues + 'tan': [torch.float32], + 'pow': [torch.float32], 'masked.softmin': [torch.float32], + 'masked.softmax': [torch.float32], 'masked.log_softmax': [torch.float32], - 'dot': [torch.int64], + 'cdist': [torch.float32], + '__rpow__': [torch.float32] } FP16_LOW_PRECISION_LIST = { @@ -9638,14 +10590,59 @@ class TestConsistency(TestCaseMPS): 'true_divide', 'kron', 'gradient', 'var', 'std', 'linalg.vector_norm', - 'masked.sum', 'masked.std', - 'masked.var', + 'addr', + + # for macOS 12 + 'masked.normalize', 'masked.sum', 'masked.var', + 'outer', + 'sum_to_size', 'sum', + 'mul', + } + + BLOCKLIST_MACOS_12 = { + # expected failures + 'nn.functional.interpolatenearest': [torch.float32], + 'nn.functional.upsample_nearest': [torch.float32], + 'nn.functional.conv_transpose2d': [torch.float32] } + ALLOWLIST_MACOS_13_3 = { + 'pow': [torch.int16, torch.int32, torch.int64, torch.uint8], + '__rpow__': [torch.uint8], + 'nn.functional.conv_transpose2d': [torch.float32], + } + + MPS_SKIP_LIST = reduce(lambda x, y: dict(x, **y), ( + FAST_MATH_PRECISION_ISSUES, BLOCKLIST, UNDEFINED_BEHAVIOUR, EXPECTED_FAILURES, UNIMPLEMENTED_OPS)) + # Used for accept mode only NEW_ALLOW_LIST = defaultdict(list) NEW_ALLOW_LIST_GRAD = defaultdict(list) + def get_error_message(self, key, op_name, dtype): + if key in self.FAST_MATH_PRECISION_ISSUES and dtype in self.FAST_MATH_PRECISION_ISSUES[key]: + return f"Running test with {op_name} fails due to precision issues (fast math) so skipping" + elif key in self.BLOCKLIST and dtype in self.BLOCKLIST[key]: + return f"Running test with {op_name} fails so skipping" + elif key in self.UNDEFINED_BEHAVIOUR and dtype in self.UNDEFINED_BEHAVIOUR[key]: + return f"Running test with {op_name} fails due to undefined behaviour / random output so skipping" + elif key in self.EXPECTED_FAILURES and dtype in self.EXPECTED_FAILURES[key]: + return f"Running test with {op_name} expected to fail due to unsupported MPS data type so skipping" + elif key in self.UNIMPLEMENTED_OPS and dtype in self.UNIMPLEMENTED_OPS[key]: + return f"Running test with {op_name} expected to fail due to missing op implementation" + elif product_version < 13.0 and key in self.BLOCKLIST_MACOS_12 and dtype in self.BLOCKLIST_MACOS_12[key]: + return f"Running test with {op_name} expected to fail on macOS 12" + return None + + def compare_with_CUDA(self, op, mps_out, atol, rtol): + cuda_out = CUDA_RESULT[op.name] + try: + self.assertEqual(cuda_out, mps_out, atol=atol, rtol=rtol) + except Exception as e: + return False + else: + return True + @ops(op_db, allowed_dtypes=MPS_DTYPES) def test_output_match(self, device, dtype, op): self.assertEqual(device, "cpu") @@ -9653,13 +10650,15 @@ def test_output_match(self, device, dtype, op): self.skipTest("MPS is not available") key = op.name + op.variant_test_name - - if key in self.VENTURA_BLOCKLIST and torch.backends.mps.is_macos13_or_newer(): - if dtype in self.VENTURA_BLOCKLIST[key]: - self.skipTest(f"{key}_{dtype} fails on Ventura, see https://github.com/pytorch/pytorch/issues/85758") - if key in self.BLOCKLIST: - if self.BLOCKLIST[key] is None or dtype in self.BLOCKLIST[key]: - self.skipTest(f"Running test with {op.name} hangs so skipping") + if key in self.MPS_SKIP_LIST: + msg = self.get_error_message(key, op.name, dtype) + if msg is not None and not (product_version >= 13.3 and + key in self.ALLOWLIST_MACOS_13_3 and dtype in self.ALLOWLIST_MACOS_13_3[key]): + self.skipTest(msg) + if product_version < 13.0 and key in self.BLOCKLIST_MACOS_12: + msg = self.get_error_message(key, op.name, dtype) + if msg is not None: + self.skipTest(msg) # Make this an expecttest manually # When this env variable is set, generate a new ALLOWLIST_OP @@ -9677,7 +10676,10 @@ def test_output_match(self, device, dtype, op): if dtype_abbrs[dtype] not in self.ALLOWLIST_OP[op.name]: self.skipTest(f"{op.name} is in the allow list for MPS but {dtype} is excluded") - if op.name not in self.ALLOWLIST_OP_GRAD or dtype_abbrs[dtype] not in self.ALLOWLIST_OP_GRAD[op.name]: + if (op.name not in self.ALLOWLIST_OP_GRAD or dtype_abbrs[dtype] not in self.ALLOWLIST_OP_GRAD[op.name] or + (op.name in self.BLOCKLIST_OP_GRAD and dtype_abbrs[dtype] in self.BLOCKLIST_OP_GRAD[op.name]) or + (product_version < 13.0 and op.name in self.BLOCKLIST_OP_GRAD_MACOS_12 and + dtype_abbrs[dtype] in self.BLOCKLIST_OP_GRAD_MACOS_12[op.name])): run_grad_test = False def get_samples(): @@ -9709,7 +10711,7 @@ def get_samples(): cpu_out = op(*cpu_args, **cpu_kwargs) mps_out = op(*mps_args, **mps_kwargs) - if op.name == "nn.functional.conv2d" and dtype == torch.float32: + if op.name == "nn.functional.conv2d" or op.name == "linalg.multi_dot" and dtype == torch.float32: atol = 1e-4 rtol = 3e-5 elif (op.name in self.FP16_LOW_PRECISION_LIST) and dtype == torch.float16: @@ -9721,6 +10723,11 @@ def get_samples(): elif (op.name == "native_layer_norm"): atol = 1e-4 rtol = 1.3e-5 + elif op.name == "norm" and dtype == torch.float16: + atol = 7e-4 + rtol = 1.5e-3 + elif op.name == "unique" and cpu_kwargs["sorted"] is False: + continue else: atol = None rtol = None @@ -9731,6 +10738,9 @@ def get_samples(): if any(s in str(e).lower() for s in ["int64", "macos 13", "adaptive pool mps"]): self.skipTest(f"Expected Runtime Error: {str(e)}") + if op.name in CUDA_RESULT and self.compare_with_CUDA(op, mps_out, atol=atol, rtol=rtol): + continue + if not generate_new_truth: raise e forward_failed = True @@ -9808,6 +10818,12 @@ def req_grad(t): # Copied from `TestCommon` in `test_ops.py`, just enough to duplicate the `test_numpy_ref` for MPS @skipIfSlowGradcheckEnv class TestCommon(TestCase): + + UNIMPLEMENTED_OPS = { + 'aminmax': [torch.float32], + 'roll': [torch.float32], + } + exact_dtype = True # Verifies, on teardown, that no OpInfo is still using dynamic dtypes in CI @@ -9838,6 +10854,10 @@ def tearDownClass(cls): # MPS only supports float32 @ops(_ref_test_ops, allowed_dtypes=(torch.float32,)) def test_numpy_ref_mps(self, device, dtype, op): + key = op.name + op.variant_test_name + if key in self.UNIMPLEMENTED_OPS and dtype in self.UNIMPLEMENTED_OPS[key]: + self.skipTest(f"Running test with {op.name} expected to fail due to missing op implementation") + # Unlike `test_numpy_ref`, this test compares in `float32` since at the time of this test's creation MPS # does not support float64 Tensors. # A few ops are currently broken on their reference inputs, but not their sample inputs. These should diff --git a/torch/testing/_internal/common_device_type.py b/torch/testing/_internal/common_device_type.py index 8e34ec10a8350..75e87155c7ca0 100644 --- a/torch/testing/_internal/common_device_type.py +++ b/torch/testing/_internal/common_device_type.py @@ -13,7 +13,7 @@ import torch.backends.mps from torch.testing._internal.common_utils import TestCase, TEST_WITH_ROCM, TEST_MKL, \ skipCUDANonDefaultStreamIf, TEST_WITH_ASAN, TEST_WITH_UBSAN, TEST_WITH_TSAN, \ - IS_SANDCASTLE, IS_FBCODE, IS_REMOTE_GPU, IS_WINDOWS, \ + IS_SANDCASTLE, IS_FBCODE, IS_REMOTE_GPU, IS_WINDOWS, TEST_WITH_MPS, \ _TestParametrizer, compose_parametrize_fns, dtype_name, \ NATIVE_DEVICES, skipIfTorchDynamo from torch.testing._internal.common_cuda import _get_torch_cuda_version, \ @@ -555,10 +555,8 @@ def get_device_type_test_bases(): test_bases.append(CPUTestBase) if torch.cuda.is_available(): test_bases.append(CUDATestBase) - # Disable MPS testing in generic device testing temporarily while we're - # ramping up support. - # elif torch.backends.mps.is_available(): - # test_bases.append(MPSTestBase) + elif torch.backends.mps.is_available(): + test_bases.append(MPSTestBase) return test_bases diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index 03193f5ed7b27..66466c56aa3a9 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -896,6 +896,7 @@ def _check_module_exists(name: str) -> bool: TEST_WITH_TSAN = os.getenv('PYTORCH_TEST_WITH_TSAN', '0') == '1' TEST_WITH_UBSAN = os.getenv('PYTORCH_TEST_WITH_UBSAN', '0') == '1' TEST_WITH_ROCM = os.getenv('PYTORCH_TEST_WITH_ROCM', '0') == '1' +TEST_WITH_MPS = os.getenv('PYTORCH_TEST_WITH_MPS', '0') == '1' # Enables tests that are slow to run (disabled by default) TEST_WITH_SLOW = os.getenv('PYTORCH_TEST_WITH_SLOW', '0') == '1'