diff --git a/.ci/pytorch/macos-build.sh b/.ci/pytorch/macos-build.sh
index dbba68081d3eb..0b0b1e3599b30 100755
--- a/.ci/pytorch/macos-build.sh
+++ b/.ci/pytorch/macos-build.sh
@@ -37,7 +37,7 @@ cross_compile_arm64() {
   # Cross compilation for arm64
   # Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
   # that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
-  USE_DISTRIBUTED=0 CMAKE_OSX_ARCHITECTURES=arm64 MACOSX_DEPLOYMENT_TARGET=11.0 USE_MKLDNN=OFF USE_QNNPACK=OFF WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel
+  USE_DISTRIBUTED=1 CMAKE_OSX_ARCHITECTURES=arm64 MACOSX_DEPLOYMENT_TARGET=11.0 USE_OPENMP=OFF USE_MKLDNN=OFF USE_QNNPACK=OFF WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel
 }
 
 compile_x86_64() {
diff --git a/.github/auto_request_review.yml b/.github/auto_request_review.yml
deleted file mode 100644
index 765fd1715e891..0000000000000
--- a/.github/auto_request_review.yml
+++ /dev/null
@@ -1,29 +0,0 @@
-# Documented at https://github.com/necojackarc/auto-request-review
-reviewers:
-  groups:
-    symbolic-shapes:
-      - ezyang
-      - Chillee
-      - albanD
-      - miladm
-      - bdhirsh
-      - voznesenskym
-      - jbschlosser
-
-  per_author:
-    symbolic-shapes:
-      - symbolic-shapes
-      - antoniojkim
-      - wconstab
-      - SherlockNoMad
-
-files:
-  # none yet, TODO: migrate CODEOWNERS here
-
-options:
-  ignore_draft: true
-  ignored_keywords:
-    - DO NOT REVIEW
-  # Just manually setup a self-referential per_author rule if you
-  # want group assignment
-  enable_group_assignment: false
diff --git a/.github/workflows/_mac-build.yml b/.github/workflows/_mac-build.yml
index f5f66ae5129bf..5a6483ad54b3e 100644
--- a/.github/workflows/_mac-build.yml
+++ b/.github/workflows/_mac-build.yml
@@ -63,8 +63,8 @@ on:
 
 jobs:
   build:
-    # Don't run on forked repos.
-    if: github.repository_owner == 'pytorch'
+    # # Don't run on forked repos.
+    # if: github.repository_owner == 'pytorch'
     runs-on: ${{ inputs.runner-type }}
     env:
       # For sccache access (only on non-forked PRs)
@@ -106,6 +106,7 @@ jobs:
           environment-file: ${{ inputs.environment-file }}
 
       - name: Install macOS homebrew dependencies
+        if: ${{ runner.arch == 'X64' }}
         run: |
           # Install dependencies
           brew install libomp
diff --git a/.github/workflows/_mac-test-mps.yml b/.github/workflows/_mac-test-mps.yml
index 1fcafb6db66ff..f9c402a772ac7 100644
--- a/.github/workflows/_mac-test-mps.yml
+++ b/.github/workflows/_mac-test-mps.yml
@@ -83,6 +83,20 @@ jobs:
           set -ex
           ${CONDA_RUN} python3 test/run_test.py --mps --verbose
 
+      - name: Run MPS Test Modules
+        id: test_2
+        env:
+          ENV_NAME: conda-test-env-${{ github.run_id }}
+        shell: arch -arch arm64 bash {0}
+        # During bring up of test_modules don't show this as an error.
+        continue-on-error: true
+        run: |
+          # shellcheck disable=SC1090
+          set -ex
+          # TODO(https://github.com/pytorch/pytorch/issues/79293)
+
+          ${CONDA_RUN} python3 test/test_modules.py -k mps --verbose
+
       - name: Print remaining test logs
         shell: bash
         if: always()
diff --git a/.github/workflows/_mac-test.yml b/.github/workflows/_mac-test.yml
index d8ede95f2958d..fb4ceaad40be9 100644
--- a/.github/workflows/_mac-test.yml
+++ b/.github/workflows/_mac-test.yml
@@ -128,6 +128,7 @@ jobs:
           echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"
 
       - name: Install macOS homebrew dependencies
+        if: ${{ runner.arch == 'X64' }}
         run: |
           # Install dependencies
           brew install libomp
@@ -182,6 +183,12 @@ jobs:
         run: |
           cat test/**/*.log || true
 
+      - name: Print remaining test logs
+        shell: bash
+        if: always()
+        run: |
+          cat test/**/*.log || true
+
       - name: Get workflow job id
         id: get-job-id
         uses: ./.github/actions/get-workflow-job-id
diff --git a/.github/workflows/auto_request_review.yml b/.github/workflows/auto_request_review.yml
deleted file mode 100644
index 7c98c2990fba7..0000000000000
--- a/.github/workflows/auto_request_review.yml
+++ /dev/null
@@ -1,22 +0,0 @@
-name: Auto Request Review
-
-on:
-  pull_request:
-    types: [opened, ready_for_review, reopened]
-
-jobs:
-  auto-request-review:
-    # Don't run on forked repos
-    if: ${{ !github.event.pull_request.head.repo.fork }}
-    name: Auto Request Review
-    runs-on: ubuntu-latest
-    steps:
-      - name: Request review based on files changes and/or groups the author belongs to
-        # v0.7.0
-        uses: necojackarc/auto-request-review@e08cdffa277d50854744de3f76230260e61c67f4
-        with:
-          token: ${{ secrets.GITHUB_TOKEN }}
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
-  cancel-in-progress: true
diff --git a/.github/workflows/check-labels.yml b/.github/workflows/check-labels.yml
deleted file mode 100644
index 5fa5fed16daf8..0000000000000
--- a/.github/workflows/check-labels.yml
+++ /dev/null
@@ -1,44 +0,0 @@
-name: Check Labels
-
-on:
-  pull_request:
-    types: [opened, synchronize, reopened, labeled, unlabeled]
-  workflow_dispatch:
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
-  cancel-in-progress: true
-
-jobs:
-  check-labels:
-    name: Check labels
-    runs-on: linux.20_04.4x
-    steps:
-      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
-        with:
-          submodules: false
-          fetch-depth: 1
-
-      - name: Setup Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: '3.8'
-          architecture: x64
-          check-latest: false
-          cache: pip
-          cache-dependency-path: |
-            **/.github/requirements-gha-cache.txt
-
-      - name: Install requirements
-        id: requirements
-        run: |
-          pip install -r .github/requirements-gha-cache.txt --user
-
-      - name: Check labels
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          PR_NUM: ${{ github.event.number }}
-        run: |
-          set -ex
-          python3 .github/scripts/check_labels.py "${PR_NUM}"
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 5dc152286e503..58566ebc37465 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -5,251 +5,77 @@ on:
   push:
     branches:
       - master
-      - main
-      - release/*
-      - landchecks/*
   workflow_dispatch:
 
 # The names of steps that actually test the code should be suffixed with `(nonretryable)`.
 # When any other step fails, it's job will be retried once by retryBot.
 jobs:
-  docker-image:
-    name: docker-image
-    uses: ./.github/workflows/_calculate-docker-image.yml
-    with:
-      docker-image-name: pytorch-linux-focal-linter
-
   lintrunner:
-    needs: docker-image
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
-    with:
-      runner: linux.2xlarge
-      docker-image: ${{ needs.docker-image.outputs.docker-image }}
-      script: |
-        # The generic Linux job chooses to use base env, not the one setup by the image
-        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
-        conda activate "${CONDA_ENV}"
-
-        CACHE_DIRECTORY="/tmp/.lintbin"
-        # Try to recover the cached binaries
-        if [[ -d "${CACHE_DIRECTORY}" ]]; then
-          # It's ok to fail this as lintrunner init would download these binaries
-          # again if they do not exist
-          cp -r "${CACHE_DIRECTORY}" . || true
-        fi
-
-        # This has already been cached in the docker image
-        lintrunner init 2> /dev/null
-
-        # Do build steps necessary for linters
-        python3 -m tools.linter.clang_tidy.generate_build_files
-        python3 -m tools.generate_torch_version --is_debug=false
-        python3 -m tools.pyi.gen_pyi \
-          --native-functions-path aten/src/ATen/native/native_functions.yaml \
-          --tags-path aten/src/ATen/native/tags.yaml \
-          --deprecated-functions-path "tools/autograd/deprecated.yaml"
-
-        RC=0
-        # Run lintrunner on all files
-        if ! lintrunner --force-color --all-files --tee-json=lint.json 2> /dev/null; then
-          echo ""
-          echo -e "\e[1m\e[36mYou can reproduce these results locally by using \`lintrunner\`.\e[0m"
-          echo -e "\e[1m\e[36mSee https://github.com/pytorch/pytorch/wiki/lintrunner for setup instructions.\e[0m"
-          RC=1
-        fi
-
-        # Use jq to massage the JSON lint output into GitHub Actions workflow commands.
-        jq --raw-output \
-          '"::\(if .severity == "advice" or .severity == "disabled" then "warning" else .severity end) file=\(.path),line=\(.line),col=\(.char),title=\(.code) \(.name)::" + (.description | gsub("\\n"; "%0A"))' \
-          lint.json || true
-
-        exit $RC
-
-  quick-checks:
-    needs: docker-image
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
-    with:
-      runner: linux.2xlarge
-      docker-image: ${{ needs.docker-image.outputs.docker-image }}
-      script: |
-        # The generic Linux job chooses to use base env, not the one setup by the image
-        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
-        conda activate "${CONDA_ENV}"
-
-        # Ensure no non-breaking spaces
-        # NB: We use 'printf' below rather than '\u000a' since bash pre-4.2
-        # does not support the '\u000a' syntax (which is relevant for local linters)
-        (! git --no-pager grep -In "$(printf '\xC2\xA0')" -- . || (echo "The above lines have non-breaking spaces (U+00A0); please convert them to spaces (U+0020)"; false))
-
-        # Ensure cross-OS compatible file names
-        (! git ls-files | grep -E '([<>:"|?*]|[ .]$)' || (echo "The above file names are not valid across all operating systems. Please ensure they don't contain the characters '<>:""|?*' and don't end with a white space or a '.' "; false))
-
-        # Ensure no versionless Python shebangs
-        (! git --no-pager grep -In '#!.*python$' -- . || (echo "The above lines have versionless Python shebangs; please specify either python2 or python3"; false))
-
-        # Ensure ciflow tags mentioned in config
-        python3 .github/scripts/collect_ciflow_labels.py --validate-tags
-
-        # C++ docs check
-        pushd docs/cpp/source
-        ./check-doxygen.sh
-        popd
-
-        # CUDA kernel launch check
-        set -eux
-        python3 torch/testing/_internal/check_kernel_launches.py |& tee cuda_kernel_launch_checks.txt
-
-  pr-sanity-checks:
-    name: pr-sanity-checks
-    runs-on: [self-hosted, linux.large]
-    # Only run this on pull requests. This check is simple enough to be done without a Docker image
-    if: github.event_name == 'pull_request' && !contains(github.event.pull_request.labels.*.name, 'skip-pr-sanity-checks')
+    runs-on: macos-m1-12
     steps:
       - name: Checkout PyTorch
         uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
         with:
           submodules: false
-          fetch-depth: -1
+          fetch-depth: 1
+
+      - name: Setup miniconda
+        uses: pytorch/test-infra/.github/actions/setup-miniconda@main
+        with:
+          python-version: 3.9
+          environment-file: .github/requirements/conda-env-${{ runner.os }}-${{ runner.arch }}
+          # pip-requirements-file: .github/requirements/pip-requirements-${{ runner.os }}.txt
 
-      - name: PR size check (nonretryable)
+      - name: Install requirements
         env:
-          BASE: ${{ github.event.pull_request.base.sha }}
-          HEAD: ${{ github.event.pull_request.head.sha }}
+          ENV_NAME: conda-test-env-${{ github.run_id }}
+          PY_VERS: 3.9
+        shell: arch -arch arm64 bash {0}
         run: |
-          bash .github/scripts/pr-sanity-check.sh
-
-  workflow-checks:
-    needs: docker-image
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
-    with:
-      runner: linux.2xlarge
-      docker-image: ${{ needs.docker-image.outputs.docker-image }}
-      script: |
-        # The generic Linux job chooses to use base env, not the one setup by the image
-        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
-        conda activate "${CONDA_ENV}"
-
-        # Regenerate workflows
-        .github/scripts/generate_ci_workflows.py
+          # shellcheck disable=SC1090
+          set -ex
+          ${CONDA_RUN} python3 -m pip install --force-reinstall -r .github/requirements-gha-cache.txt
 
-        RC=0
-        # Assert that regenerating the workflows didn't change them
-        if ! .github/scripts/report_git_status.sh .github/workflows; then
-          echo
-          echo 'As shown by the above diff, the committed .github/workflows'
-          echo 'are not up to date according to .github/templates.'
-          echo 'Please run this command, commit, and push again to your PR:'
-          echo
-          echo '    .github/scripts/generate_ci_workflows.py'
-          echo
-          echo 'If running that command does nothing, you may need to rebase'
-          echo 'onto a more recent commit from the PyTorch master branch.'
-          RC=1
-        fi
-
-        # Check that jobs will be cancelled
-        .github/scripts/ensure_actions_will_cancel.py
-
-        exit $RC
-
-  toc:
-    needs: docker-image
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
-    with:
-      runner: linux.2xlarge
-      docker-image: ${{ needs.docker-image.outputs.docker-image }}
-      script: |
-        # The generic Linux job chooses to use base env, not the one setup by the image
-        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
-        conda activate "${CONDA_ENV}"
-
-        # Regenerate ToCs and check that they didn't change
-        set -eu
-
-        export PATH=~/.npm-global/bin:"$PATH"
-        for FILE in $(git grep -Il '<!-- toc -->' -- '**.md'); do
-          markdown-toc --bullets='-' -i "$FILE"
-        done
-
-        if ! .github/scripts/report_git_status.sh .; then
-          echo
-          echo 'As shown by the above diff, the table of contents in one or'
-          echo 'more Markdown files is not up to date with the file contents.'
-          echo 'You can either apply that Git diff directly to correct the'
-          echo 'table of contents, or if you have npm installed, you can'
-          echo 'install the npm package markdown-toc and run the following'
-          # shellcheck disable=SC2016
-          echo 'command (replacing $FILE with the filename for which you want'
-          echo 'to regenerate the table of contents):'
-          echo
-          # shellcheck disable=SC2016
-          echo "    markdown-toc --bullets='-' -i \"\$FILE\""
-          false
-        fi
-
-  test-tools:
-    name: Test tools
-    if: ${{ github.repository == 'pytorch/pytorch' }}
-    needs: docker-image
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
-    with:
-      runner: linux.2xlarge
-      docker-image: ${{ needs.docker-image.outputs.docker-image }}
-      fetch-depth: 0
-      script: |
-        # The generic Linux job chooses to use base env, not the one setup by the image
-        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
-        conda activate "${CONDA_ENV}"
-
-        # Test tools
-        python3 -m unittest discover -vs tools/test -p 'test_*.py'
-        python3 -m unittest discover -vs .github/scripts -p 'test_*.py'
+      - name: Initialize lint dependencies
+        env:
+          ENV_NAME: conda-test-env-${{ github.run_id }}
+          PY_VERS: 3.9
+        shell: arch -arch arm64 bash {0}
+        run: |
+          # shellcheck disable=SC1090
+          set -ex
+          ${CONDA_RUN} lintrunner init
 
-  test_collect_env:
-    if: ${{ github.repository == 'pytorch/pytorch' }}
-    name: Test collect_env
-    runs-on: linux.20_04.4x
-    strategy:
-      matrix:
-        test_type: [with_torch, without_torch, older_python_version]
-    steps:
-      # [see note: pytorch repo ref]
-      # deep clone (fetch-depth 0) required, to allow us to use git log
-      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
-        with:
-          submodules: false
-          fetch-depth: 1
-      - name: Setup Python 3.5
-        if: matrix.test_type == 'older_python_version'
-        uses: actions/setup-python@v4
-        with:
-          python-version: '3.5'
-          architecture: x64
-          check-latest: false
-          cache: pip
-          cache-dependency-path: |
-            **/requirements.txt
-      - name: Setup Python 3.8
-        if: matrix.test_type != 'older_python_version'
-        uses: actions/setup-python@v4
-        with:
-          python-version: '3.8'
-          architecture: x64
-          check-latest: false
-          cache: pip
-          cache-dependency-path: |
-            **/requirements.txt
-      - name: Install torch
-        if: matrix.test_type == 'with_torch'
+      - name: Do build steps necessary for linters
+        env:
+          ENV_NAME: conda-test-env-${{ github.run_id }}
+          PY_VERS: 3.9
+        shell: arch -arch arm64 bash {0}
         run: |
-          pip install -r requirements.txt
-          # Doesn't really matter what torch version, we just need ANY torch installed
-          pip install 'torch==1.*'
-      - name: Run collect_env.py (nonretryable)
+          # shellcheck disable=SC1090
+          set -ex
+          ${CONDA_RUN} python3 -m tools.linter.clang_tidy.generate_build_files
+          ${CONDA_RUN} python3 -m tools.generate_torch_version --is_debug=false
+          ${CONDA_RUN} python3 -m tools.pyi.gen_pyi \
+            --native-functions-path aten/src/ATen/native/native_functions.yaml \
+            --tags-path aten/src/ATen/native/tags.yaml \
+            --deprecated-functions-path "tools/autograd/deprecated.yaml"
+
+      - name: Run lintrunner on all MPS files (nonretryable)
+        env:
+          ENV_NAME: conda-test-env-${{ github.run_id }}
+          PY_VERS: 3.9
+        shell: arch -arch arm64 bash {0}
         run: |
-          # All we need to see is that it passes
-          python3 torch/utils/collect_env.py
+          # shellcheck disable=SC1090
+          set -ex
+          set +e
+          if ! ${CONDA_RUN} lintrunner --force-color aten/src/ATen/native/mps/operations/* test/test_mps.py test/test_modules.py; then
+              echo ""
+              echo -e "\e[1m\e[36mYou can reproduce these results locally by using \`lintrunner\`.\e[0m"
+              echo -e "\e[1m\e[36mSee https://github.com/pytorch/pytorch/wiki/lintrunner for setup instructions.\e[0m"
+              exit 1
+          fi
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
diff --git a/.github/workflows/mac-mps.yml b/.github/workflows/mac-mps.yml
index 663eac84514fe..a2ca4867fd76b 100644
--- a/.github/workflows/mac-mps.yml
+++ b/.github/workflows/mac-mps.yml
@@ -1,10 +1,11 @@
 name: Mac MPS
 
 on:
-  push:
-    tags:
-      - ciflow/mps/*
-  workflow_dispatch:
+  # push:
+  #   tags:
+  #     - ciflow/mps/*
+  # workflow_dispatch:
+  pull_request:
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
@@ -18,7 +19,7 @@ jobs:
       sync-tag: macos-12-py3-arm64-build
       build-environment: macos-12-py3-arm64
       xcode-version: "13.3.1"
-      runner-type: macos-12-xl
+      runner-type: macos-m1-13
       build-generates-artifacts: true
       # To match the one pre-installed in the m1 runners
       python_version: 3.9.12
diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml
deleted file mode 100644
index 1c137084a97e9..0000000000000
--- a/.github/workflows/periodic.yml
+++ /dev/null
@@ -1,284 +0,0 @@
-name: periodic
-
-on:
-  schedule:
-    - cron: 45 0,4,8,12,16,20 * * *
-    - cron: 29 8 * * *  # about 1:29am PDT, for mem leak check and rerun disabled tests
-  push:
-    tags:
-      - ciflow/periodic/*
-  workflow_dispatch:
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }}
-  cancel-in-progress: true
-
-jobs:
-  parallelnative-linux-focal-py3_8-gcc7-build:
-    name: parallelnative-linux-focal-py3.8-gcc7
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: parallelnative-linux-focal-py3.8-gcc7
-      docker-image-name: pytorch-linux-focal-py3.8-gcc7
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "default", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
-        ]}
-
-  parallelnative-linux-focal-py3_8-gcc7-test:
-    name: parallelnative-linux-focal-py3.8-gcc7
-    uses: ./.github/workflows/_linux-test.yml
-    needs: parallelnative-linux-focal-py3_8-gcc7-build
-    with:
-      build-environment: parallelnative-linux-focal-py3.8-gcc7
-      docker-image: ${{ needs.parallelnative-linux-focal-py3_8-gcc7-build.outputs.docker-image }}
-      test-matrix: ${{ needs.parallelnative-linux-focal-py3_8-gcc7-build.outputs.test-matrix }}
-
-  linux-bionic-cuda11_7-py3-gcc7-slow-gradcheck-build:
-    name: linux-bionic-cuda11.7-py3-gcc7-slow-gradcheck
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-bionic-cuda11.7-py3-gcc7-slow-gradcheck
-      docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 2, runner: "linux.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 2, num_shards: 2, runner: "linux.4xlarge.nvidia.gpu" },
-        ]}
-
-  linux-bionic-cuda11_7-py3-gcc7-slow-gradcheck-test:
-    name: linux-bionic-cuda11.7-py3-gcc7-slow-gradcheck
-    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-bionic-cuda11_7-py3-gcc7-slow-gradcheck-build
-    with:
-      build-environment: linux-bionic-cuda11.7-py3-gcc7-slow-gradcheck
-      docker-image: ${{ needs.linux-bionic-cuda11_7-py3-gcc7-slow-gradcheck-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-bionic-cuda11_7-py3-gcc7-slow-gradcheck-build.outputs.test-matrix }}
-      timeout-minutes: 300
-
-  linux-bionic-cuda11_7-py3_10-gcc7-periodic-dynamo-benchmarks-build:
-    name: cuda11.7-py3.10-gcc7-sm86-periodic-dynamo-benchmarks
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-bionic-cuda11.7-py3.10-gcc7-sm86
-      docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7
-      cuda-arch-list: '8.6'
-      test-matrix: |
-        { include: [
-          { config: "aot_eager_all", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          # These jobs run too slowly so they must be sharded, unfortunately
-          { config: "dynamic_aot_eager_torchbench", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-        ]}
-
-  linux-bionic-cuda11_7-py3_10-gcc7-periodic-dynamo-benchmarks-test:
-    name: cuda11.7-py3.10-gcc7-sm86-periodic-dynamo-benchmarks
-    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-bionic-cuda11_7-py3_10-gcc7-periodic-dynamo-benchmarks-build
-    with:
-      build-environment: linux-bionic-cuda11.7-py3.10-gcc7-sm86
-      docker-image: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-periodic-dynamo-benchmarks-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-periodic-dynamo-benchmarks-build.outputs.test-matrix }}
-
-  linux-focal-rocm5_4_2-py3_8-build:
-    name: linux-focal-rocm5.4.2-py3.8
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-focal-rocm5.4.2-py3.8
-      docker-image-name: pytorch-linux-focal-rocm-n-py3
-      test-matrix: |
-        { include: [
-          { config: "slow", shard: 1, num_shards: 1, runner: "linux.rocm.gpu" },
-          { config: "distributed", shard: 1, num_shards: 2, runner: "linux.rocm.gpu" },
-          { config: "distributed", shard: 2, num_shards: 2, runner: "linux.rocm.gpu" },
-        ]}
-
-  linux-focal-rocm5_4_2-py3_8-test:
-    name: linux-focal-rocm5.4.2-py3.8
-    uses: ./.github/workflows/_rocm-test.yml
-    needs: linux-focal-rocm5_4_2-py3_8-build
-    with:
-      build-environment: linux-focal-rocm5.4.2-py3.8
-      docker-image: ${{ needs.linux-focal-rocm5_4_2-py3_8-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-focal-rocm5_4_2-py3_8-build.outputs.test-matrix }}
-    secrets:
-      AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }}
-      AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }}
-
-  linux-bionic-cuda11_7-py3_9-gcc7-build:
-    name: linux-bionic-cuda11.7-py3.9-gcc7
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-bionic-cuda11.7-py3.9-gcc7
-      docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7
-      test-matrix: |
-        { include: [
-          { config: "multigpu", shard: 1, num_shards: 1, runner: "linux.16xlarge.nvidia.gpu" },
-        ]}
-      build-with-debug: false
-
-  linux-bionic-cuda11_7-py3_9-gcc7-test:
-    name: linux-bionic-cuda11.7-py3.9-gcc7
-    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-bionic-cuda11_7-py3_9-gcc7-build
-    with:
-      build-environment: linux-bionic-cuda11.7-py3.9-gcc7
-      docker-image: ${{ needs.linux-bionic-cuda11_7-py3_9-gcc7-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-bionic-cuda11_7-py3_9-gcc7-build.outputs.test-matrix }}
-
-  linux-bionic-cuda11_7-py3_10-gcc7-debug-build:
-    name: linux-bionic-cuda11.7-py3.10-gcc7-debug
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-bionic-cuda11.7-py3.10-gcc7-debug
-      docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7
-      build-with-debug: true
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 2, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 3, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 4, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
-        ]}
-
-  linux-bionic-cuda11_7-py3_10-gcc7-debug-test:
-    name: linux-bionic-cuda11.7-py3.10-gcc7-debug
-    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-bionic-cuda11_7-py3_10-gcc7-debug-build
-    with:
-      build-environment: linux-bionic-cuda11.7-py3.10-gcc7-debug
-      docker-image: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-debug-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-debug-build.outputs.test-matrix }}
-
-  linux-bionic-cuda11_8-py3_8-gcc7-debug-build:
-    name: linux-bionic-cuda11.8-py3.8-gcc7-debug
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-bionic-cuda11.8-py3.8-gcc7-debug
-      docker-image-name: pytorch-linux-bionic-cuda11.8-cudnn8-py3-gcc7
-      build-with-debug: true
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 2, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 3, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 4, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
-        ]}
-
-  linux-bionic-cuda11_8-py3_8-gcc7-debug-test:
-    name: linux-bionic-cuda11.8-py3.8-gcc7-debug
-    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-bionic-cuda11_8-py3_8-gcc7-debug-build
-    with:
-      build-environment: linux-bionic-cuda11.8-py3.8-gcc7-debug
-      docker-image: ${{ needs.linux-bionic-cuda11_8-py3_8-gcc7-debug-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-bionic-cuda11_8-py3_8-gcc7-debug-build.outputs.test-matrix }}
-
-  libtorch-linux-bionic-cuda11_8-gcc7-build:
-    name: libtorch-linux-bionic-cuda11.8-gcc7
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: libtorch-linux-bionic-cuda11.8-gcc7
-      docker-image-name: pytorch-linux-bionic-cuda11.8-cudnn8-py3-gcc7
-      build-generates-artifacts: false
-
-  win-vs2019-cuda11_8-py3-build:
-    name: win-vs2019-cuda11.8-py3
-    uses: ./.github/workflows/_win-build.yml
-    with:
-      build-environment: win-vs2019-cuda11.8-py3
-      cuda-version: "11.8"
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 3, runner: "windows.g5.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 2, num_shards: 3, runner: "windows.g5.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 3, num_shards: 3, runner: "windows.g5.4xlarge.nvidia.gpu" },
-          { config: "force_on_cpu", shard: 1, num_shards: 1, runner: "windows.4xlarge" },
-        ]}
-
-  win-vs2019-cuda11_8-py3-test:
-    name: win-vs2019-cuda11.8-py3
-    uses: ./.github/workflows/_win-test.yml
-    needs: win-vs2019-cuda11_8-py3-build
-    with:
-      build-environment: win-vs2019-cuda11.8-py3
-      cuda-version: "11.8"
-      test-matrix: ${{ needs.win-vs2019-cuda11_8-py3-build.outputs.test-matrix }}
-
-  libtorch-linux-bionic-cuda11_7-gcc7-build:
-    name: libtorch-linux-bionic-cuda11.7-gcc7
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: libtorch-linux-bionic-cuda11.7-gcc7
-      docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7
-      build-generates-artifacts: false
-
-  win-vs2019-cuda11_7-py3-build:
-    name: win-vs2019-cuda11.7-py3
-    uses: ./.github/workflows/_win-build.yml
-    with:
-      build-environment: win-vs2019-cuda11.7-py3
-      cuda-version: "11.7"
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 3, runner: "windows.g5.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 2, num_shards: 3, runner: "windows.g5.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 3, num_shards: 3, runner: "windows.g5.4xlarge.nvidia.gpu" },
-          { config: "force_on_cpu", shard: 1, num_shards: 1, runner: "windows.4xlarge" },
-        ]}
-
-  win-vs2019-cuda11_7-py3-test:
-    name: win-vs2019-cuda11.7-py3
-    uses: ./.github/workflows/_win-test.yml
-    needs: win-vs2019-cuda11_7-py3-build
-    with:
-      build-environment: win-vs2019-cuda11.7-py3
-      cuda-version: "11.7"
-      test-matrix: ${{ needs.win-vs2019-cuda11_7-py3-build.outputs.test-matrix }}
-
-  ios-12-5-1-x86-64-coreml:
-    name: ios-12-5-1-x86-64-coreml
-    uses: ./.github/workflows/_ios-build-test.yml
-    with:
-      build-environment: ios-12-5-1-x86-64-coreml
-      ios-platform: SIMULATOR
-      ios-arch: x86_64
-
-  ios-12-5-1-arm64:
-    name: ios-12-5-1-arm64
-    uses: ./.github/workflows/_ios-build-test.yml
-    with:
-      build-environment: ios-12-5-1-arm64
-      ios-platform: OS
-      ios-arch: arm64
-
-  ios-12-5-1-arm64-coreml:
-    name: ios-12-5-1-arm64-coreml
-    uses: ./.github/workflows/_ios-build-test.yml
-    with:
-      build-environment: ios-12-5-1-arm64-coreml
-      ios-platform: OS
-      ios-arch: arm64
-
-  ios-12-5-1-arm64-custom-ops:
-    name: ios-12-5-1-arm64-custom-ops
-    uses: ./.github/workflows/_ios-build-test.yml
-    with:
-      build-environment: ios-12-5-1-arm64-custom-ops
-      ios-platform: OS
-      ios-arch: arm64
-
-  ios-12-5-1-arm64-metal:
-    name: ios-12-5-1-arm64-metal
-    uses: ./.github/workflows/_ios-build-test.yml
-    with:
-      build-environment: ios-12-5-1-arm64-metal
-      ios-platform: OS
-      ios-arch: arm64
-
-  buck-build-test:
-    name: buck-build-test
-    uses: ./.github/workflows/_buck-build-test.yml
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
deleted file mode 100644
index 2c5493639e4e7..0000000000000
--- a/.github/workflows/pull.yml
+++ /dev/null
@@ -1,368 +0,0 @@
-name: pull
-
-on:
-  pull_request:
-  push:
-    branches:
-      - master
-      - main
-      - release/*
-      - landchecks/*
-  workflow_dispatch:
-  schedule:
-    - cron: 29 8 * * *  # about 1:29am PDT
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
-  cancel-in-progress: true
-
-jobs:
-  linux-focal-py3_8-gcc7-build:
-    name: linux-focal-py3.8-gcc7
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-focal-py3.8-gcc7
-      docker-image-name: pytorch-linux-focal-py3.8-gcc7
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "default", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "distributed", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "distributed", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "functorch", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
-          { config: "docs_test", shard: 1, num_shards: 1,  runner: "linux.2xlarge" },
-          { config: "jit_legacy", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
-          { config: "backwards_compat", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
-        ]}
-
-  linux-focal-py3_8-gcc7-test:
-    name: linux-focal-py3.8-gcc7
-    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-focal-py3_8-gcc7-build
-    with:
-      build-environment: linux-focal-py3.8-gcc7
-      docker-image: ${{ needs.linux-focal-py3_8-gcc7-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-focal-py3_8-gcc7-build.outputs.test-matrix }}
-
-  linux-docs:
-    name: linux-docs
-    uses: ./.github/workflows/_docs.yml
-    needs: linux-focal-py3_8-gcc7-build
-    with:
-      build-environment: linux-focal-py3.8-gcc7
-      docker-image: ${{ needs.linux-focal-py3_8-gcc7-build.outputs.docker-image }}
-
-  linux-focal-py3_8-gcc7-no-ops:
-    name: linux-focal-py3.8-gcc7-no-ops
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-focal-py3.8-gcc7-no-ops
-      docker-image-name: pytorch-linux-focal-py3.8-gcc7
-
-  linux-focal-py3_8-gcc7-pch:
-    name: linux-focal-py3.8-gcc7-pch
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-focal-py3.8-gcc7-pch
-      docker-image-name: pytorch-linux-focal-py3.8-gcc7
-
-  linux-focal-py3_9-clang7-asan-build:
-    name: linux-focal-py3.9-clang7-asan
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-focal-py3.9-clang7-asan
-      docker-image-name: pytorch-linux-focal-py3-clang7-asan
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 5, runner: "linux.4xlarge" },
-          { config: "default", shard: 2, num_shards: 5, runner: "linux.4xlarge" },
-          { config: "default", shard: 3, num_shards: 5, runner: "linux.4xlarge" },
-          { config: "default", shard: 4, num_shards: 5, runner: "linux.4xlarge" },
-          { config: "default", shard: 5, num_shards: 5, runner: "linux.4xlarge" },
-          { config: "functorch", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
-        ]}
-
-  linux-focal-py3_9-clang7-asan-test:
-    name: linux-focal-py3.9-clang7-asan
-    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-focal-py3_9-clang7-asan-build
-    with:
-      build-environment: linux-focal-py3.9-clang7-asan
-      docker-image: ${{ needs.linux-focal-py3_9-clang7-asan-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-focal-py3_9-clang7-asan-build.outputs.test-matrix }}
-
-  linux-focal-py3_8-clang10-onnx-build:
-    name: linux-focal-py3.8-clang10-onnx
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-focal-py3.8-clang10-onnx
-      docker-image-name: pytorch-linux-focal-py3-clang10-onnx
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "default", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
-        ]}
-
-  linux-focal-py3_8-clang10-onnx-test:
-    name: linux-focal-py3.8-clang10-onnx
-    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-focal-py3_8-clang10-onnx-build
-    with:
-      build-environment: linux-focal-py3.8-clang10-onnx
-      docker-image: ${{ needs.linux-focal-py3_8-clang10-onnx-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-focal-py3_8-clang10-onnx-build.outputs.test-matrix }}
-
-  linux-bionic-py3_8-clang9-build:
-    name: linux-bionic-py3.8-clang9
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-bionic-py3.8-clang9
-      docker-image-name: pytorch-linux-bionic-py3.8-clang9
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "default", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "crossref", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "crossref", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "dynamo", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "dynamo", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "functorch", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
-        ]}
-
-  linux-bionic-py3_8-clang9-test:
-    name: linux-bionic-py3.8-clang9
-    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-bionic-py3_8-clang9-build
-    with:
-      build-environment: linux-bionic-py3.8-clang9
-      docker-image: ${{ needs.linux-bionic-py3_8-clang9-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-bionic-py3_8-clang9-build.outputs.test-matrix }}
-
-  linux-bionic-py3_11-clang9-build:
-    name: linux-bionic-py3.11-clang9
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-bionic-py3.11-clang9
-      docker-image-name: pytorch-linux-bionic-py3.11-clang9
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "default", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "crossref", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "crossref", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "dynamo", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "dynamo", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "functorch", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
-        ]}
-
-  linux-bionic-py3_11-clang9-test:
-    name: linux-bionic-py3.11-clang9
-    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-bionic-py3_11-clang9-build
-    with:
-      build-environment: linux-bionic-py3.11-clang9
-      docker-image: ${{ needs.linux-bionic-py3_11-clang9-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-bionic-py3_11-clang9-build.outputs.test-matrix }}
-
-  linux-vulkan-bionic-py3_11-clang9-build:
-    name: linux-vulkan-bionic-py3.11-clang9
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-vulkan-bionic-py3.11-clang9
-      docker-image-name: pytorch-linux-bionic-py3.11-clang9
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
-        ]}
-
-  linux-vulkan-bionic-py3_11-clang9-test:
-    name: linux-vulkan-bionic-py3.11-clang9
-    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-vulkan-bionic-py3_11-clang9-build
-    with:
-      build-environment: linux-vulkan-bionic-py3.11-clang9
-      docker-image: ${{ needs.linux-vulkan-bionic-py3_11-clang9-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-vulkan-bionic-py3_11-clang9-build.outputs.test-matrix }}
-
-  linux-bionic-cuda11_7-py3_10-gcc7-build:
-    name: linux-bionic-cuda11.7-py3.10-gcc7
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-bionic-cuda11.7-py3.10-gcc7
-      docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 2, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 3, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 4, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
-          { config: "distributed", shard: 1, num_shards: 3, runner: "linux.8xlarge.nvidia.gpu" },
-          { config: "distributed", shard: 2, num_shards: 3, runner: "linux.8xlarge.nvidia.gpu" },
-          { config: "distributed", shard: 3, num_shards: 3, runner: "linux.8xlarge.nvidia.gpu" },
-          { config: "functorch", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" },
-          { config: "deploy", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" },
-        ]}
-
-  linux-bionic-cuda11_7-py3_10-gcc7-test:
-    name: linux-bionic-cuda11.7-py3.10-gcc7
-    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-bionic-cuda11_7-py3_10-gcc7-build
-    with:
-      build-environment: linux-bionic-cuda11.7-py3.10-gcc7
-      docker-image: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-build.outputs.test-matrix }}
-
-  linux-focal-py3-clang7-mobile-build:
-    name: linux-focal-py3-clang7-mobile-build
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-focal-py3-clang7-mobile-build
-      docker-image-name: pytorch-linux-focal-py3-clang7-asan
-      build-generates-artifacts: false
-
-  linux-jammy-cuda-11_7-cudnn8-py3_8-clang12-build:
-    name: linux-jammy-cuda11.7-cudnn8-py3.8-clang12
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-jammy-cuda11.7-cudnn8-py3.8-clang12
-      docker-image-name: pytorch-linux-jammy-cuda11.7-cudnn8-py3.8-clang12
-
-  linux-focal-py3-clang7-mobile-custom-build-static:
-    name: linux-focal-py3-clang7-mobile-custom-build-static
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-focal-py3-clang7-mobile-custom-build-static
-      docker-image-name: pytorch-linux-focal-py3-clang7-android-ndk-r19c
-      build-generates-artifacts: false
-
-  linux-bionic-py3_8-clang8-xla-build:
-    name: linux-bionic-py3_8-clang8-xla
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-bionic-py3_8-clang8-xla
-      docker-image-name: xla_base
-      test-matrix: |
-        { include: [
-          { config: "xla", shard: 1, num_shards: 1, runner: "linux.4xlarge" },
-        ]}
-
-  linux-bionic-py3_8-clang8-xla-test:
-    name: linux-bionic-py3_8-clang8-xla
-    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-bionic-py3_8-clang8-xla-build
-    with:
-      build-environment: linux-bionic-py3_8-clang8-xla
-      docker-image: ${{ needs.linux-bionic-py3_8-clang8-xla-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-bionic-py3_8-clang8-xla-build.outputs.test-matrix }}
-
-  win-vs2019-cpu-py3-build:
-    name: win-vs2019-cpu-py3
-    uses: ./.github/workflows/_win-build.yml
-    with:
-      build-environment: win-vs2019-cpu-py3
-      cuda-version: cpu
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 2, runner: "windows.4xlarge" },
-          { config: "default", shard: 2, num_shards: 2, runner: "windows.4xlarge" },
-          { config: "functorch", shard: 1, num_shards: 1, runner: "windows.4xlarge" },
-        ]}
-
-  win-vs2019-cpu-py3-test:
-    name: win-vs2019-cpu-py3
-    uses: ./.github/workflows/_win-test.yml
-    needs: win-vs2019-cpu-py3-build
-    with:
-      build-environment: win-vs2019-cpu-py3
-      cuda-version: cpu
-      test-matrix: ${{ needs.win-vs2019-cpu-py3-build.outputs.test-matrix }}
-
-  win-vs2019-cuda11_7-py3-build:
-    if: github.event_name == 'pull_request'
-    name: win-vs2019-cuda11.7-py3
-    uses: ./.github/workflows/_win-build.yml
-    with:
-      build-environment: win-vs2019-cuda11.7-py3
-      cuda-version: "11.7"
-      sync-tag: win-cuda-build
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 5, runner: "windows.g5.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 2, num_shards: 5, runner: "windows.g5.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 3, num_shards: 5, runner: "windows.g5.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 4, num_shards: 5, runner: "windows.g5.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 5, num_shards: 5, runner: "windows.g5.4xlarge.nvidia.gpu" },
-          { config: "functorch", shard: 1, num_shards: 1, runner: "windows.g5.4xlarge.nvidia.gpu" },
-          { config: "force_on_cpu", shard: 1, num_shards: 1, runner: "windows.4xlarge" },
-        ]}
-
-  linux-bionic-cuda11_7-py3_10-gcc7-bazel-test:
-    name: linux-bionic-cuda11.7-py3.10-gcc7-bazel-test
-    uses: ./.github/workflows/_bazel-build-test.yml
-    with:
-      build-environment: linux-bionic-cuda11.7-py3.10-gcc7-bazel-test
-      docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7
-
-  linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single:
-    name: linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single
-    uses: ./.github/workflows/_android-build-test.yml
-    with:
-      build-environment: linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single
-      docker-image-name: pytorch-linux-focal-py3-clang7-android-ndk-r19c
-
-  linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single-full-jit:
-    name: linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single-full-jit
-    uses: ./.github/workflows/_android-build-test.yml
-    with:
-      build-environment: linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single-full-jit
-      docker-image-name: pytorch-linux-focal-py3-clang7-android-ndk-r19c
-
-  linux-focal-py3_8-gcc7-mobile-lightweight-dispatch-build:
-    name: linux-focal-py3.8-gcc7-mobile-lightweight-dispatch-build
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-focal-py3.8-gcc7-mobile-lightweight-dispatch-build
-      docker-image-name: pytorch-linux-focal-py3.8-gcc7
-      build-generates-artifacts: false
-
-  linux-focal-rocm5_4_2-py3_8-build:
-    # don't run build twice on master
-    if: github.event_name == 'pull_request'
-    name: linux-focal-rocm5.4.2-py3.8
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-focal-rocm5.4.2-py3.8
-      docker-image-name: pytorch-linux-focal-rocm-n-py3
-      sync-tag: rocm-build
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu" },
-          { config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu" },
-        ]}
-
-  linux-bionic-cuda11_7-py3_10-gcc7-sm86-build:
-    name: linux-bionic-cuda11.7-py3.10-gcc7-sm86
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-bionic-cuda11.7-py3.10-gcc7-sm86
-      docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7
-      cuda-arch-list: 8.6
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 2, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 3, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 4, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "slow", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "slow", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "functorch", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
-        ]}
-
-  linux-bionic-cuda11_7-py3_10-gcc7-sm86-test:
-    name: linux-bionic-cuda11.7-py3.10-gcc7-sm86
-    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-bionic-cuda11_7-py3_10-gcc7-sm86-build
-    with:
-      build-environment: linux-bionic-cuda11.7-py3.10-gcc7-sm86
-      docker-image: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-sm86-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-sm86-build.outputs.test-matrix }}
diff --git a/.github/workflows/run_torchbench.yml b/.github/workflows/run_torchbench.yml
deleted file mode 100644
index 8d55f6a9479ca..0000000000000
--- a/.github/workflows/run_torchbench.yml
+++ /dev/null
@@ -1,103 +0,0 @@
-name: TorchBench CI (pytorch-linux-py3.8-cu116)
-on:
-  pull_request:
-
-env:
-  PYTHON_VERSION: "3.8"
-  # must be consistent with https://github.com/pytorch/benchmark/blob/main/requirements.txt#L19
-  NUMPY_VERSION: "1.21.2"
-  SETUP_SCRIPT: "/data/nvme/bin/setup_instance.sh"
-  PR_NUM: ${{ github.event.number }}
-  PR_BODY: ${{ github.event.pull_request.body }}
-  PR_BASE_SHA: ${{ github.event.pull_request.base.sha }}
-  PR_HEAD_SHA: ${{ github.event.pull_request.head.sha }}
-  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }}
-  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }}
-
-jobs:
-  run-torchbench:
-    # We don't accept running on non-pytorch repos because of security concerns
-    # Only run the job when the body contains magic word "RUN_TORCHBENCH:"
-    if: ${{ github.repository_owner == 'pytorch' && contains(github.event.pull_request.body, 'RUN_TORCHBENCH:') }}
-    runs-on: [self-hosted, bm-runner]
-    # Set to 12 hours
-    timeout-minutes: 720
-    steps:
-      - name: Checkout PyTorch
-        uses: malfet/checkout@silent-checkout
-        with:
-          path: pytorch
-      - name: Update self-hosted PyTorch
-        run: |
-          pushd "${HOME}"/pytorch
-          git remote prune origin
-          git fetch
-          popd
-      - name: Create conda environment and install deps
-        run: |
-          conda create -y -n pr-ci python="${PYTHON_VERSION}"
-          # shellcheck source=/dev/null
-          . "${SETUP_SCRIPT}"
-          conda activate pr-ci
-          conda install -y numpy="${NUMPY_VERSION}" requests ninja pyyaml mkl mkl-include \
-                           setuptools cmake=3.22.* typing-extensions boto3 \
-                           pillow pytest tabulate gitpython git-lfs tqdm psutil
-          pip install --pre torch torchvision torchtext -f https://download.pytorch.org/whl/nightly/cu116/torch_nightly.html
-      - name: Setup TorchBench branch
-        run: |
-          # shellcheck source=/dev/null
-          . "${SETUP_SCRIPT}"
-          conda activate pr-ci
-          PR_BODY_FILE=/tmp/pr-body.txt
-          echo "$PR_BODY" > ${PR_BODY_FILE}
-          python pytorch/.github/scripts/run_torchbench.py --pr-body "${PR_BODY_FILE}" set-torchbench-branch
-      - name: Checkout TorchBench
-        uses: malfet/checkout@silent-checkout
-        with:
-          repository: pytorch/benchmark
-          path: benchmark
-          lfs: false
-          ref: ${{ env.TORCHBENCH_BRANCH }}
-      - name: GPU Info
-        run: |
-          nvidia-smi
-      - name: Run TorchBench
-        run: |
-          set -x
-          pushd "${HOME}"/pytorch
-          PR_MERGE_BASE=$(git merge-base "$PR_BASE_SHA" "$PR_HEAD_SHA")
-          popd
-          PR_BODY_FILE=/tmp/pr-body.txt
-          echo "$PR_BODY" > ${PR_BODY_FILE}
-          # shellcheck source=/dev/null
-          . "${SETUP_SCRIPT}"
-          conda activate pr-ci
-          python3 pytorch/.github/scripts/run_torchbench.py \
-                  --pr-body "$PR_BODY_FILE" \
-                  run \
-                  --pytorch-path "${HOME}"/pytorch \
-                  --torchbench-path "${PWD}"/benchmark \
-                  --pr-num "$PR_NUM" \
-                  --pr-base-sha "$PR_MERGE_BASE" \
-                  --pr-head-sha "$PR_HEAD_SHA"
-      - name: Upload result to S3
-        run: |
-          # shellcheck source=/dev/null
-          . "${SETUP_SCRIPT}"
-          conda activate pr-ci
-          python3 pytorch/.github/scripts/run_torchbench.py \
-                  upload-s3 \
-                  --result-dir "${HOME}/.torchbench/bisection/pr${{ github.event.number }}"
-      - name: Remove conda environment and cleanup
-        run: |
-          conda env remove --name pr-ci
-          rm /tmp/pr-body.txt
-      - name: Upload artifact
-        uses: actions/upload-artifact@v3
-        with:
-          name: TorchBench result
-          path: ~/.torchbench/bisection/pr${{ github.event.number }}
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
-  cancel-in-progress: true
diff --git a/aten/src/ATen/native/mps/OperationUtils.mm b/aten/src/ATen/native/mps/OperationUtils.mm
index 978162aed855a..4e76c172fb6e9 100644
--- a/aten/src/ATen/native/mps/OperationUtils.mm
+++ b/aten/src/ATen/native/mps/OperationUtils.mm
@@ -265,7 +265,7 @@ void printTensorNDArray(const Tensor& t) {
   id<MTLBuffer> srcBuf = getMTLBufferStorage(src);
   bool sliceViewTensor = canSliceViewTensor(src, mpsShape);
   // a view tensor could be contiguous (e.g., slice ops) or non-contiguous (e.g., transpose())
-  if ((!src.is_contiguous() || (src.is_view() && src.storage_offset() && !sliceViewTensor)) && gatherTensorData) {
+  if ((!src.is_contiguous() || (src.storage_offset() && !sliceViewTensor)) && gatherTensorData) {
      Tensor emptyShell = Tensor();
     // use "_tensor" from Placeholder to retain view's output during its usage in other ops
     _tensor = gatherViewTensor(src, emptyShell);
diff --git a/aten/src/ATen/native/mps/operations/Activation.mm b/aten/src/ATen/native/mps/operations/Activation.mm
index 9e643ebf29390..440cde4140f45 100644
--- a/aten/src/ATen/native/mps/operations/Activation.mm
+++ b/aten/src/ATen/native/mps/operations/Activation.mm
@@ -1208,8 +1208,7 @@ void elu_variants_out_mps (
   {
     CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
     MPSGraphTensor *gradOutputTensor_ = nil;
-    MPSGraphTensor *inputTensor_ = nil;
-    MPSGraphTensor *resultTensor_ = nil;
+    MPSGraphTensor *selfOrResultTensor_ = nil;
     MPSGraphTensor *gradInputTensor_ = nil;
   };
 
@@ -1218,7 +1217,7 @@ void elu_variants_out_mps (
   MPSStream* stream = getCurrentMPSStream();
 
   @autoreleasepool {
-    string key = "elu_backward_out_mps:" + getTensorsStringKey({grad_output}) + ":" +
+    string key = "elu_backward_out_mps:" + getTensorsStringKey({grad_output, self_or_result}) + ":" +
                                                  to_string(alpha.to<double>()) + ":" +
                                                  to_string(scale.to<double>()) + ":" +
                                                  to_string(input_scale.to<double>()) + ":" +
@@ -1235,18 +1234,14 @@ void elu_variants_out_mps (
           newCachedGraph = new CachedGraph(mpsGraph);
 
           MPSGraphTensor* gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output);
-
-          MPSGraphTensor* inputTensor = nil;
-          MPSGraphTensor* resultTensor = nil;
-
+          MPSGraphTensor* selfOrResultTensor = mpsGraphRankedPlaceHolder(mpsGraph, self_or_result);
           MPSGraphTensor* lessThanZeroGradTensor = nil;
 
           if(is_result) {
-            resultTensor = mpsGraphRankedPlaceHolder(mpsGraph, self_or_result);
             MPSGraphTensor* alphaTensor = [mpsGraph constantWithScalar:alpha.to<double>()
                                                                shape:@[@1]
                                                             dataType:getMPSDataType(grad_output.scalar_type())];
-            MPSGraphTensor* resultPlusAlphaTensor = [mpsGraph additionWithPrimaryTensor:resultTensor
+            MPSGraphTensor* resultPlusAlphaTensor = [mpsGraph additionWithPrimaryTensor:selfOrResultTensor
                                                                         secondaryTensor:alphaTensor
                                                                                    name:nil];
             auto constMul = scale.to<double>() * input_scale.to<double>();
@@ -1258,11 +1253,10 @@ void elu_variants_out_mps (
                                                                           name:nil];
           }
           else {
-            inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self_or_result);
             MPSGraphTensor* inputScaleTensor = [mpsGraph constantWithScalar:input_scale.to<double>()
                                                                     shape:@[@1]
                                                                  dataType:getMPSDataType(grad_output.scalar_type())];
-            MPSGraphTensor* scaledInputTensor = [mpsGraph multiplicationWithPrimaryTensor:inputTensor
+            MPSGraphTensor* scaledInputTensor = [mpsGraph multiplicationWithPrimaryTensor:selfOrResultTensor
                                                                           secondaryTensor:inputScaleTensor
                                                                                      name:nil];
             MPSGraphTensor* expTensor = [mpsGraph exponentWithTensor:scaledInputTensor
@@ -1282,7 +1276,7 @@ void elu_variants_out_mps (
           MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0f
                                                               shape:@[@1]
                                                            dataType:getMPSDataType(grad_output.scalar_type())];
-          MPSGraphTensor* predicateTensor = [mpsGraph greaterThanWithPrimaryTensor:inputTensor
+          MPSGraphTensor* predicateTensor = [mpsGraph greaterThanWithPrimaryTensor:selfOrResultTensor
                                                                    secondaryTensor:zeroTensor
                                                                               name:nil];
           MPSGraphTensor* gradTensor = [mpsGraph selectWithPredicateTensor:predicateTensor
@@ -1294,8 +1288,7 @@ void elu_variants_out_mps (
                                                                                  name:nil];
 
           newCachedGraph->gradOutputTensor_ = gradOutputTensor;
-          newCachedGraph->inputTensor_ = inputTensor;
-          newCachedGraph->resultTensor_ = resultTensor;
+          newCachedGraph->selfOrResultTensor_ = selfOrResultTensor;
           newCachedGraph->gradInputTensor_ = gradInputTensor;
         }
         return newCachedGraph;
@@ -1304,28 +1297,14 @@ void elu_variants_out_mps (
     }
 
     Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output, nil, executeGatherOp);
-    Placeholder selfPlaceholder = Placeholder();
-    Placeholder resultPlaceholder = Placeholder();
-    if(is_result)
-      resultPlaceholder = Placeholder(cachedGraph->resultTensor_, self_or_result, nil, executeGatherOp);
-    else
-      selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self_or_result, nil, executeGatherOp);
+    Placeholder selfOrResultPlaceholder = Placeholder(cachedGraph->selfOrResultTensor_, self_or_result, nil, executeGatherOp);
     Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, out.has_storage() ? out : grad_input, nil, false);
 
     // Create dictionary of inputs and outputs
-    NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = nil;
-
-    if(is_result)
-      feeds = @{
-        gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(),
-        resultPlaceholder.getMPSGraphTensor() : resultPlaceholder.getMPSGraphTensorData()
-      };
-    else
-      feeds = @{
-        gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(),
-        selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()
-      };
-
+    NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
+      gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(),
+      selfOrResultPlaceholder.getMPSGraphTensor() : selfOrResultPlaceholder.getMPSGraphTensorData()
+    };
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
       gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData()
     };
@@ -1840,7 +1819,7 @@ Tensor prelu_mps(const Tensor& self, const Tensor& weight_) {
     using namespace mps;
 
     Tensor grad_input = at::empty_like(self, self.suggest_memory_format());
-    Tensor weight_grad = at::empty_like(weight_, at::MemoryFormat::Contiguous);
+    Tensor weight_grad = at::empty_like(self, at::MemoryFormat::Contiguous);
     if (grad_output.numel() == 0) {
       return std::tuple<Tensor, Tensor>{grad_input, weight_grad};
     }
diff --git a/aten/src/ATen/native/mps/operations/BinaryOps.mm b/aten/src/ATen/native/mps/operations/BinaryOps.mm
index c730eccfe944e..6569e59086fc9 100644
--- a/aten/src/ATen/native/mps/operations/BinaryOps.mm
+++ b/aten/src/ATen/native/mps/operations/BinaryOps.mm
@@ -26,6 +26,8 @@
 void binaryOpTensor(const Tensor& self, const Tensor& other, const Scalar& alpha,
                     const Tensor& output_, std::string op_name, BinaryOpBlock binaryBlock)
 {
+  TORCH_CHECK(!(!is_macos_13_or_newer() && self.scalar_type() == ScalarType::Byte ),
+              "MPS support binary op with uint8 natively starting from macOS 13.0");
   TORCH_CHECK(!(op_name == "power" && !is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_2_PLUS) &&
               (self.scalar_type() == ScalarType::Long ||
               (other.scalar_type() == ScalarType::Long && (self.scalar_type() != ScalarType::Half && self.scalar_type() != ScalarType::Float)))),
diff --git a/aten/src/ATen/native/mps/operations/Convolution.mm b/aten/src/ATen/native/mps/operations/Convolution.mm
index 3cd442099f5ca..66c6eac098d8f 100644
--- a/aten/src/ATen/native/mps/operations/Convolution.mm
+++ b/aten/src/ATen/native/mps/operations/Convolution.mm
@@ -56,14 +56,15 @@ void fill_conv_desc(MPSGraphConvolution2DOpDescriptor* descriptor_,
   descriptor_.groups = groups;
 }
 
-Tensor _mps_convolution(
+Tensor _mps_convolution_impl(
     const Tensor& input_t,
     const Tensor& weight_t,
     const c10::optional<Tensor>& bias_opt,
     IntArrayRef padding,
     IntArrayRef stride,
     IntArrayRef dilation,
-    int64_t groups) {
+    int64_t groups,
+    c10::optional<IntArrayRef> input_shape) {
   TORCH_CHECK(input_t.dim() < 5, "Conv3D is not supported on MPS");
 
   namespace native_mps = at::native::mps;
@@ -82,7 +83,16 @@ Tensor _mps_convolution(
 
   auto memory_format = input_t.suggest_memory_format();
   bool is_channels_last = (memory_format == at::MemoryFormat::ChannelsLast);
+  bool gather_input_data = true;
+  // Perform the convolution directly in NCHW if the tensor is already contiguous in memory
+  if (is_channels_last && input_t.is_contiguous(memory_format)) {
+    is_channels_last = false;
+    gather_input_data = false;
+    memory_format = MemoryFormat::Contiguous;
+  }
   auto output_t = at::empty(
+                    input_shape.has_value() ?
+                    input_shape.value() :
                     conv_output_size(input->sizes(), weight->sizes(),
                                      padding, stride, dilation),
                     input->scalar_type(),
@@ -212,7 +222,7 @@ Tensor _mps_convolution(
       cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
     }
 
-    auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t, inputShape);
+    auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t, inputShape, gather_input_data);
     auto weightsPlaceholder = native_mps::Placeholder(cachedGraph->weightTensor_, weight_t);
     auto biasPlaceholder = native_mps::Placeholder();
     // Reshape the bias to be broadcastable with output of conv2d
@@ -237,21 +247,35 @@ Tensor _mps_convolution(
   return *output;
 }
 
+Tensor _mps_convolution(
+    const Tensor& input_t,
+    const Tensor& weight_t,
+    const c10::optional<Tensor>& bias_opt,
+    IntArrayRef padding,
+    IntArrayRef stride,
+    IntArrayRef dilation,
+    int64_t groups) {
+    return _mps_convolution_impl(input_t, weight_t, bias_opt, padding, stride, dilation, groups, c10::nullopt);
+}
+
 Tensor mps_convolution_backward_input(
-    IntArrayRef input_size, const Tensor& grad_output_, const Tensor& weight_,
+    IntArrayRef input_size, const Tensor& grad_output_t, const Tensor& weight_t,
     IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool bias_defined) {
   namespace native_mps = at::native::mps;
   using namespace mps;
   CheckedFrom c = "mps_convolution_backward_input";
-  TensorArg grad_output{ grad_output_, "grad_output", 1 },
-            weight{ weight_, "weight", 2 };
+  TensorArg grad_output{ grad_output_t, "grad_output", 1 },
+            weight{ weight_t, "weight", 2 };
   checkAllSameType(c, {grad_output, weight});
   checkAllSameGPU(c, {grad_output, weight});
-  auto memory_format = grad_output_.suggest_memory_format();
+  auto memory_format = grad_output_t.suggest_memory_format();
   bool is_channels_last = (memory_format == at::MemoryFormat::ChannelsLast);
-  Tensor grad_output_t = grad_output_.contiguous(memory_format);
-  Tensor weight_t = weight_.contiguous(memory_format);
-  MPSShape* weightShape = getMPSShape(weight_);
+  bool gather_input_data = true;
+  if (is_channels_last && grad_output_t.is_contiguous(memory_format)) {
+    is_channels_last = false;
+    gather_input_data = false;
+    memory_format = MemoryFormat::Contiguous;
+  }
   auto grad_input_t = at::empty( input_size, grad_output_t.options(), c10::nullopt);
 
   // Avoid "grad_input" when this is being used as transposed convolution
@@ -327,10 +351,10 @@ Tensor mps_convolution_backward_input(
           }
 
           MPSGraphTensor* gradOutputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, native_mps::getMPSScalarType(grad_output_t.scalar_type()), gradOutputShape);
-          MPSGraphTensor* weightTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, native_mps::getMPSScalarType(weight_t.scalar_type()), weightShape);
+          MPSGraphTensor* weightTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, weight_t);
 
           MPSGraphTensor *gradOutputTensorTranspose = gradOutputTensor;
-          if (is_channels_last && grad_output_t.is_contiguous() && !grad_output_t.is_view()) {
+          if (is_channels_last) {
             gradOutputTensorTranspose = mps::convertNHWCtoNCHW(mpsGraph, gradOutputTensorTranspose);
           }
           MPSGraphTensor* gradInputTensor;
@@ -358,8 +382,8 @@ Tensor mps_convolution_backward_input(
       cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
     }
 
-    auto gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output_t, gradOutputShape);
-    auto weightsPlaceholder = Placeholder(cachedGraph->weightTensor_, weight_t, weightShape);
+    auto gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output_t, gradOutputShape, gather_input_data);
+    auto weightsPlaceholder = Placeholder(cachedGraph->weightTensor_, weight_t);
     auto outputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, *grad_input);
 
     NSDictionary<MPSGraphTensor *, MPSGraphTensorData *> *feeds = @{
@@ -377,16 +401,19 @@ Tensor mps_convolution_backward_input(
 }
 
 Tensor mps_convolution_backward_weights(
-    IntArrayRef weight_size, const Tensor& grad_output_, const Tensor& input_,
+    IntArrayRef weight_size, const Tensor& grad_output_t, const Tensor& input_t,
     IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool bias_defined) {
   namespace native_mps = at::native::mps;
   using namespace mps;
   CheckedFrom c = "mps_convolution_backward_weights";
-  auto memory_format = input_.suggest_memory_format();
+  auto memory_format = grad_output_t.suggest_memory_format();
   bool is_channels_last = (memory_format == at::MemoryFormat::ChannelsLast);
-
-  auto grad_output_t = grad_output_.to(memory_format);
-  auto input_t = input_.to(memory_format);
+  bool gather_input_data = true;
+  if (is_channels_last && input_t.is_contiguous(memory_format)) {
+    is_channels_last = false;
+    gather_input_data = false;
+    memory_format = MemoryFormat::Contiguous;
+  }
 
   MPSShape* gradOutputShape = mps::getMPSShape(grad_output_t, memory_format);
 
@@ -475,7 +502,7 @@ Tensor mps_convolution_backward_weights(
           MPSGraphTensor* inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input_t);
 
           MPSGraphTensor *gradOutputTensorTranspose = gradOutputTensor;
-          if (is_channels_last && grad_output_t.is_contiguous() && !grad_output_t.is_view()) {
+          if (is_channels_last) {
             gradOutputTensorTranspose = mps::convertNHWCtoNCHW(mpsGraph, gradOutputTensorTranspose);
           }
 
@@ -505,8 +532,8 @@ Tensor mps_convolution_backward_weights(
       cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
     }
 
-    auto gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output_t, gradOutputShape);
-    auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input_t);
+    auto gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output_t, gradOutputShape, gather_input_data);
+    auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input_t, nil, gather_input_data);
     auto outputPlaceholder = Placeholder(cachedGraph->gradWeightTensor_, grad_weight_t);
 
     NSDictionary<MPSGraphTensor *, MPSGraphTensorData *> *feeds = @{
@@ -525,12 +552,9 @@ Tensor mps_convolution_backward_weights(
 }
 
 std::tuple<at::Tensor,at::Tensor,at::Tensor> mps_convolution_backward(
-    const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight,
+    const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight,
     IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups,
     std::array<bool,3> output_mask) {
-
-  Tensor grad_output = grad_output_t.contiguous(input.suggest_memory_format());
-
   Tensor grad_input, grad_weight, grad_bias;
   if (input.numel() == 0) {
     if (output_mask[0]) {
@@ -576,10 +600,10 @@ Tensor _mps_convolution_transpose(
 Tensor mps_convolution_transpose_backward_input(
     const Tensor& grad_output_t, const Tensor& weight_t,
     IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation,
-    int64_t groups)
+    int64_t groups, IntArrayRef input_shape)
 {
-  return at::_mps_convolution(
-    grad_output_t, weight_t, c10::nullopt, padding, stride, dilation, groups);
+  return _mps_convolution_impl(
+    grad_output_t, weight_t, c10::nullopt, padding, stride, dilation, groups, input_shape);
 }
 
 Tensor mps_convolution_transpose_backward_weight(
@@ -595,15 +619,12 @@ Tensor mps_convolution_transpose_backward_weight(
 
 
 std::tuple<Tensor,Tensor> mps_convolution_transpose_backward(
-    const Tensor& input, const Tensor& grad_output_t, const Tensor& weight,
+    const Tensor& input, const Tensor& grad_output, const Tensor& weight,
     IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups,
     std::array<bool,2> output_mask) {
-
-  Tensor grad_output = grad_output_t.contiguous(input.suggest_memory_format());
-
   Tensor grad_input, grad_weight;
   if (output_mask[0]) {
-    grad_input = mps_convolution_transpose_backward_input(grad_output, weight, padding, stride, dilation, groups);
+    grad_input = mps_convolution_transpose_backward_input(grad_output, weight, padding, stride, dilation, groups, input.sizes());
   }
   if (output_mask[1]) {
     grad_weight = mps_convolution_transpose_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups);
diff --git a/aten/src/ATen/native/mps/operations/Copy.mm b/aten/src/ATen/native/mps/operations/Copy.mm
index e4c673145adaa..16f5718dd29c0 100644
--- a/aten/src/ATen/native/mps/operations/Copy.mm
+++ b/aten/src/ATen/native/mps/operations/Copy.mm
@@ -251,8 +251,11 @@ void copy_blit_mps(void* dst, const void* src, size_t size) {
   bool returnGatherOutput = dst_.is_contiguous();
   Tensor src;
   auto sameMemFormat = src_.is_contiguous(dst_.suggest_memory_format()) && dst_.is_contiguous(dst_.suggest_memory_format());
+  const bool sameDataType = src_.dtype() == dst_.dtype();
 
-  if (!src_.is_contiguous(MemoryFormat::Contiguous) && !sameMemFormat) {
+  if ((!src_.is_contiguous(MemoryFormat::Contiguous) && !sameMemFormat) ||
+      // the copy_cast path requires storage_offset to be applied before casting
+      (src_.storage_offset() && !sameDataType)) {
     Tensor emptyShell = Tensor();
     src = gatherViewTensor(src_, returnGatherOutput ? dst_ : emptyShell);
 
@@ -282,7 +285,7 @@ void copy_blit_mps(void* dst, const void* src, size_t size) {
   src._set_neg(src_.is_neg());
 
   const size_t src_size = src.nbytes();
-  if (src.dtype() == dst_.dtype()) {
+  if (sameDataType) {
     MPSStream* stream = getCurrentMPSStream();
     // for GPU to GPU copies we only encode to stream's command buffer (no flushing)
     stream->copy(sourceBuffer, destBuffer, src_size, src_byte_offset, dst_byte_offset);
@@ -297,22 +300,27 @@ void copy_blit_mps(void* dst, const void* src, size_t size) {
   TORCH_CHECK(dst.defined(), "dst is undefined");
   TORCH_CHECK(src.defined(), "src is undefined");
 
+  bool needs_broadcasting = false;
+
   if (src.numel() == 0 || dst.is_same(src)) {
     return dst;
   }
   if (dst.numel() == 0) {
     dst.resize_as_(src);
   }
+  if (dst.dim() > src.dim()) {
+    needs_broadcasting = true;
+  }
 
   if (src.device().type() == at::kMPS && dst.device().type() == at::kCPU) {
-    return copy_from_mps_(dst, src, non_blocking);
+    return copy_from_mps_(dst, needs_broadcasting ? src.expand_as(dst) : src, non_blocking);
   }
   if (src.device().type() == at::kCPU && dst.device().type() == at::kMPS) {
-    return copy_to_mps_(dst, src, non_blocking);
+    return copy_to_mps_(dst, needs_broadcasting ? src.expand_as(dst) : src, non_blocking);
   }
 
   if (src.device().type() == at::kMPS && dst.device().type() == at::kMPS) {
-    return copy_kernel_mps(dst, src, non_blocking);
+    return copy_kernel_mps(dst, needs_broadcasting ? src.expand_as(dst) : src, non_blocking);
   }
   TORCH_INTERNAL_ASSERT(
       src.device().type() == DeviceType::MPS,
diff --git a/aten/src/ATen/native/mps/operations/Indexing.mm b/aten/src/ATen/native/mps/operations/Indexing.mm
index 310cbb7bf9370..8522ac920275f 100644
--- a/aten/src/ATen/native/mps/operations/Indexing.mm
+++ b/aten/src/ATen/native/mps/operations/Indexing.mm
@@ -886,19 +886,31 @@ Tensor embedding_dense_backward_mps(
 
             MPSGraphTensor* reshapedIndicesTensor = indicesTensor;
 
+            MPSGraphTensor* castGradTensor = incomingGradTensor;
+            MPSDataType dataType = mps::getMPSDataType(grad_.scalar_type());
+            // issue 105486100, scatterNDWithUpdatesTensor produces wrong result for float16
+            if (dataType == MPSDataTypeFloat16) {
+              castGradTensor = [mpsGraph castTensor: incomingGradTensor
+                                             toType: MPSDataTypeFloat32
+                                               name: @"castGradTensor"];
+            }
             if (num_indices_dims != 0) {
               reshapedIndicesTensor = [mpsGraph  expandDimsOfTensor: indicesTensor
                                                                axes: @[@-1]
                                                                name: nil];
             }
 
-            auto outgoingGradTensor = [mpsGraph scatterNDWithUpdatesTensor: incomingGradTensor
+            auto outgoingGradTensor = [mpsGraph scatterNDWithUpdatesTensor: castGradTensor
                                                              indicesTensor: reshapedIndicesTensor
                                                                      shape: native_mps::getMPSShape(IntArrayRef(outgoing_gradient_shape))
                                                            batchDimensions: 0
                                                                       mode: MPSGraphScatterModeAdd
                                                                       name: @"edb"];
-
+            if (dataType == MPSDataTypeFloat16) {
+              outgoingGradTensor = [mpsGraph castTensor: outgoingGradTensor
+                                                 toType: MPSDataTypeFloat16
+                                                   name: @"castGradTensor"];
+            }
             newCachedGraph->incomingGradTensor_ = incomingGradTensor;
             newCachedGraph->indicesTensor_ = indicesTensor;
             newCachedGraph->outgoingGradTensor_ = outgoingGradTensor;
diff --git a/aten/src/ATen/native/mps/operations/Pooling.mm b/aten/src/ATen/native/mps/operations/Pooling.mm
index 2b9272d467595..08727fed8265c 100644
--- a/aten/src/ATen/native/mps/operations/Pooling.mm
+++ b/aten/src/ATen/native/mps/operations/Pooling.mm
@@ -83,6 +83,7 @@ static void pool2d_template(const Tensor& input, const Tensor& output,
   pool2d_shape_check(input, kH, kW, dH, dW, padH, padW, dilationH, dilationW,
                      nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, memory_format);
 
+  auto output_memory_format = output.suggest_memory_format();
   // the output and indices are 'empty', so we could avoid unnecessary gatherView on empty tensors
   // by simply restriding them (instead of calling the costly Contiguous()).
   if (indices.suggest_memory_format() == MemoryFormat::ChannelsLast) {
@@ -94,8 +95,9 @@ static void pool2d_template(const Tensor& input, const Tensor& output,
       outputSizes.insert(outputSizes.begin(), nbatch);
     }
     output.resize_(outputSizes);
-  } else if (output.suggest_memory_format() == MemoryFormat::ChannelsLast) {
+  } else if (output_memory_format == MemoryFormat::ChannelsLast) {
     output.unsafeGetTensorImpl()->empty_tensor_restride(MemoryFormat::Contiguous);
+    output_memory_format = MemoryFormat::Contiguous;
   }
 
   if (output.numel() == 0 || (is_backward_pass && grad_output.numel() == 0)) {
@@ -196,6 +198,10 @@ static void pool2d_template(const Tensor& input, const Tensor& output,
     }
 
     runMPSGraph(mpsStream, cachedGraph->graph(), feeds, results);
+
+    if (output_memory_format != suggested_memory_format) {
+      const_cast<Tensor&>(output) = output.to(suggested_memory_format);
+    }
   }
 }
 
@@ -356,6 +362,8 @@ Tensor mps_max_pool2d_backward(
     const Tensor& output,
     const Tensor& indices) {
 
+  auto indices_memory_format = indices.suggest_memory_format();
+
   mps::PoolingOpBlock pooling_op_block = ^PoolingOpFn(cachedGraph, desc) {
     MPSGraph* mpsGraph = cachedGraph.graph();
     NSArray<MPSGraphTensor*>* poolOutputs = [mpsGraph maxPooling2DReturnIndicesWithSourceTensor: cachedGraph.inputTensor
@@ -366,6 +374,10 @@ Tensor mps_max_pool2d_backward(
   };
   mps::pool2d_template(input, output, indices, c10::nullopt, kernel_size, stride,
                        padding, dilation, ceil_mode, false, c10::nullopt, pooling_op_block, "max_pool2d_indices");
+
+  if (indices_memory_format == MemoryFormat::ChannelsLast) {
+    const_cast<Tensor&>(indices) = indices.to(MemoryFormat::ChannelsLast);
+  }
 }
 
 TORCH_IMPL_FUNC(max_pool2d_with_indices_backward_out_mps)(
diff --git a/aten/src/ATen/native/mps/operations/ReduceOps.mm b/aten/src/ATen/native/mps/operations/ReduceOps.mm
index f858714fb82d5..f47dd910dc234 100644
--- a/aten/src/ATen/native/mps/operations/ReduceOps.mm
+++ b/aten/src/ATen/native/mps/operations/ReduceOps.mm
@@ -139,6 +139,8 @@ void reduction_out_mps(
   MPSReductionType reduction_type,
   const std::string& func_name) {
 
+  // issue 103641234, reduction ops does not have int64 support
+  TORCH_WARN_ONCE(input_t.scalar_type() != ScalarType::Long, "MPS: no support for int64 reduction ops, casting it to int32");
   IntArrayRef input_shape = input_t.sizes();
 
   if (opt_dim.has_value()) {
@@ -163,6 +165,9 @@ void reduction_out_mps(
     if (reduction_type == MPSReductionType::PROD) {
       output_t.fill_(1);
     }
+    else if (reduction_type == MPSReductionType::SUM) {
+      output_t.zero_();
+    }
     return;
   }
 
@@ -197,7 +202,10 @@ void reduction_out_mps(
              (dtype.value() == kFloat || dtype.value() == kHalf || dtype.value() == kInt)) {
             inputCastDtype = getMPSDataType(dtype.value());
           } else if (input_type != MPSDataTypeInt32   &&
-                     input_type != MPSDataTypeFloat32) {
+                     input_type != MPSDataTypeFloat32 &&
+                     input_type != MPSDataTypeFloat16) {
+            inputCastDtype = MPSDataTypeFloat32;
+          } else if (!is_macos_13_or_newer() && input_type == MPSDataTypeFloat16) {
             inputCastDtype = MPSDataTypeFloat32;
           }
 
@@ -241,7 +249,7 @@ void reduction_out_mps(
                                                                axes:wrappedAxes
                                                                name:nil];
           } else if (reduction_type == MPSReductionType::TRACE) {
-            MPSGraphTensor *bandPartWithTensor = [mpsGraph bandPartWithTensor:inputTensor
+            MPSGraphTensor *bandPartWithTensor = [mpsGraph bandPartWithTensor:castInputTensor
                                                                      numLower:0
                                                                      numUpper:0
                                                                          name:nil];
diff --git a/aten/src/ATen/native/mps/operations/RnnOps.mm b/aten/src/ATen/native/mps/operations/RnnOps.mm
index d46ce356318e2..bee82fcc24803 100644
--- a/aten/src/ATen/native/mps/operations/RnnOps.mm
+++ b/aten/src/ATen/native/mps/operations/RnnOps.mm
@@ -30,10 +30,15 @@
     std::vector<Tensor> biases;
     std::vector<Tensor> recurrent_biases;
     for (size_t i = 0; i < num_layers; i+=1) {
-        kernel_weights.push_back(params[i*4]);
-        recurrent_kernel_weights.push_back(params[i*4+1]);
-        biases.push_back(params[i*4+2]);
-        recurrent_biases.push_back(params[i*4+3]);
+        if (has_biases) {
+            kernel_weights.push_back(params[i*4]);
+            recurrent_kernel_weights.push_back(params[i*4+1]);
+            biases.push_back(params[i*4+2]);
+            recurrent_biases.push_back(params[i*4+3]);
+        } else {
+            kernel_weights.push_back(params[i*2]);
+            recurrent_kernel_weights.push_back(params[i*2+1]);
+        }
     }
 
     struct CachedGraph : public MPSCachedGraph {
@@ -71,8 +76,10 @@
             for (size_t i = 0; i < num_layers; i += 1) {
                 [kernelWeightsList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), getMPSShape(kernel_weights[i]))];
                 [recurrentKernelWeightsList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_kernel_weights[i]))];
-                [kernelBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(biases[i]))];
-                [recurrentBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_biases[i]))];
+                if(has_biases) {
+                    [kernelBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(biases[i]))];
+                    [recurrentBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_biases[i]))];
+                }
             }
 
             MPSGraphLSTMDescriptor * opDesc = [MPSGraphLSTMDescriptor descriptor];
@@ -109,9 +116,12 @@
             NSMutableArray<MPSGraphTensor*>* outputZStateArray = [[NSMutableArray alloc] initWithCapacity:num_layers];
             NSMutableArray<MPSGraphTensor*>* outputCellStateFwdArray = [[NSMutableArray alloc] initWithCapacity:num_layers];
             for(int i = 0; i < num_layers; i++) {
-                MPSGraphTensor* biasTensor = [mpsGraph additionWithPrimaryTensor:kernelBiasList[i]
-                                                                    secondaryTensor:recurrentBiasList[i]
-                                                                            name:nil];
+                MPSGraphTensor* biasTensor = nil;
+                if(has_biases) {
+                    biasTensor = [mpsGraph additionWithPrimaryTensor:kernelBiasList[i]
+                                                     secondaryTensor:recurrentBiasList[i]
+                                                                name:nil];
+                }
                 outputs = [mpsGraph LSTMWithSourceTensor:inputTensor_
                                         recurrentWeight:recurrentKernelWeightsList[i]
                                             inputWeight:kernelWeightsList[i]
@@ -121,7 +131,6 @@
                                              descriptor:opDesc
                                                    name:nil];
 
-
                 stateTensor_ = [mpsGraph sliceTensor:stateTensor
                                                             dimension:0
                                                             start:i
@@ -196,12 +205,14 @@
       for (size_t i = 0; i < num_layers; i+=1) {
           kernelWeight = Placeholder([kernelWeightsList objectAtIndex:i], kernel_weights[i]);
           recurrentKernelWeight = Placeholder([recurrentKernelWeightsList objectAtIndex:i], recurrent_kernel_weights[i]);
-          bias = Placeholder([biasList objectAtIndex:i], biases[i]);
-          recurrentBias = Placeholder([recurrentBiasList objectAtIndex:i], recurrent_biases[i]);
           [feeds setObject:kernelWeight.getMPSGraphTensorData() forKey:kernelWeight.getMPSGraphTensor()];
           [feeds setObject:recurrentKernelWeight.getMPSGraphTensorData() forKey:recurrentKernelWeight.getMPSGraphTensor()];
-          [feeds setObject:bias.getMPSGraphTensorData() forKey:bias.getMPSGraphTensor()];
-          [feeds setObject:recurrentBias.getMPSGraphTensorData() forKey:recurrentBias.getMPSGraphTensor()];
+          if(has_biases) {
+            bias = Placeholder([biasList objectAtIndex:i], biases[i]);
+            recurrentBias = Placeholder([recurrentBiasList objectAtIndex:i], recurrent_biases[i]);
+            [feeds setObject:bias.getMPSGraphTensorData() forKey:bias.getMPSGraphTensor()];
+            [feeds setObject:recurrentBias.getMPSGraphTensorData() forKey:recurrentBias.getMPSGraphTensor()];
+          }
 
       }
       Placeholder selfPlaceholder   = Placeholder(cachedGraph->inputTensors_[0], input);
@@ -250,10 +261,15 @@
     std::vector<Tensor> biases;
     std::vector<Tensor> recurrent_biases;
     for (size_t i = 0; i < num_layers; i+=1) {
-        kernel_weights.push_back(params[i*4]);
-        recurrent_kernel_weights.push_back(params[i*4+1]);
-        biases.push_back(params[i*4+2]);
-        recurrent_biases.push_back(params[i*4+3]);
+        if(has_biases) {
+            kernel_weights.push_back(params[i*4]);
+            recurrent_kernel_weights.push_back(params[i*4+1]);
+            biases.push_back(params[i*4+2]);
+            recurrent_biases.push_back(params[i*4+3]);
+        } else {
+            kernel_weights.push_back(params[i*2]);
+            recurrent_kernel_weights.push_back(params[i*2+1]);
+        }
     }
 
     struct CachedGraph : public MPSCachedGraph {
@@ -296,8 +312,10 @@
                     for (size_t i = 0; i < num_layers; i += 1) {
                         [kernelWeightsList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), getMPSShape(kernel_weights[i]))];
                         [recurrentKernelWeightsList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_kernel_weights[i]))];
-                        [kernelBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(biases[i]))];
-                        [recurrentBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_biases[i]))];
+                        if(has_biases) {
+                            [kernelBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(biases[i]))];
+                            [recurrentBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_biases[i]))];
+                        }
                     }
 
                     MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), getMPSShape(input));
@@ -349,9 +367,15 @@
                         cellStateFwd = [mpsGraph squeezeTensor:cellStateFwd
                                                     axis:0
                                                     name:nil];
-                        MPSGraphTensor* biasTensor = [mpsGraph additionWithPrimaryTensor:kernelBiasList[i]
-                                                                            secondaryTensor:recurrentBiasList[i]
-                                                                            name:nil];
+                        MPSGraphTensor* biasTensor = nil;
+                        if(has_biases) {
+                            biasTensor = [mpsGraph additionWithPrimaryTensor:kernelBiasList[i]
+                                                            secondaryTensor:recurrentBiasList[i]
+                                                            name:nil];
+                        } else {
+                            biasTensor = [mpsGraph constantWithScalar:0.0
+                                                            dataType:inputTensor.dataType];
+                        }
 
                         MPSGraphTensor* stateTensor_ = [mpsGraph sliceTensor:stateTensor
                                                                     dimension:0
@@ -391,7 +415,6 @@
                                                   descriptor: opDesc
                                                         name: nil];
 
-
                         gradientTensor_ = [outputs objectAtIndex:0];
                         [gradOutputArray addObject:[outputs objectAtIndex:0]];
                         [gradRecWeightsArray addObject:[outputs objectAtIndex:1]];
@@ -445,18 +468,20 @@
         for (size_t i = 0; i < num_layers; i+=1) {
             kernelWeight = Placeholder([kernelWeightsList objectAtIndex:i], kernel_weights[i]);
             recurrentKernelWeight = Placeholder([recurrentKernelWeightsList objectAtIndex:i], recurrent_kernel_weights[i]);
-            bias = Placeholder([biasList objectAtIndex:i], biases[i]);
-            recurrentBias = Placeholder([recurrentBiasList objectAtIndex:i], recurrent_biases[i]);
             [feeds setObject:kernelWeight.getMPSGraphTensorData() forKey:kernelWeight.getMPSGraphTensor()];
             [feeds setObject:recurrentKernelWeight.getMPSGraphTensorData() forKey:recurrentKernelWeight.getMPSGraphTensor()];
-            [feeds setObject:bias.getMPSGraphTensorData() forKey:bias.getMPSGraphTensor()];
-            [feeds setObject:recurrentBias.getMPSGraphTensorData() forKey:recurrentBias.getMPSGraphTensor()];
+            if(has_biases) {
+                bias = Placeholder([biasList objectAtIndex:i], biases[i]);
+                recurrentBias = Placeholder([recurrentBiasList objectAtIndex:i], recurrent_biases[i]);
+                [feeds setObject:bias.getMPSGraphTensorData() forKey:bias.getMPSGraphTensor()];
+                [feeds setObject:recurrentBias.getMPSGraphTensorData() forKey:recurrentBias.getMPSGraphTensor()];
+            }
         }
 
         Tensor output = at::empty_like(input);
         Tensor grad_rec_weights = at::empty_like(recurrent_kernel_weights[0]);
         Tensor grad_weights = at::empty_like(kernel_weights[0]);
-        Tensor grad_bias = at::empty_like(biases[0]);
+        Tensor grad_bias = at::empty((kernel_weights[0].size(0)), kernel_weights[0].options());
         Tensor grad_state = at::empty_like(hx[0]);
         Tensor grad_cell_state = at::empty_like(hx[1]);
         Placeholder outputPlaceholder   = Placeholder(cachedGraph->outputTensors_[0], output);
@@ -482,13 +507,15 @@
             Tensor output = at::empty_like(input);
             Tensor grad_rec_weights = at::empty_like(recurrent_kernel_weights[i]);
             Tensor grad_weights = at::empty_like(kernel_weights[i]);
-            Tensor grad_bias = at::empty_like(biases[i]);
+            Tensor grad_bias = at::empty((kernel_weights[0].size(0)), kernel_weights[0].options());
             Tensor grad_state = at::empty_like(hx[0]);
             Tensor grad_cell_state = at::empty_like(hx[1]);
             weights.push_back(grad_weights);
             weights.push_back(grad_rec_weights);
-            weights.push_back(grad_bias);
-            weights.push_back(grad_bias);
+            if(has_biases) {
+                weights.push_back(grad_bias);
+                weights.push_back(grad_bias);
+            }
             gradOutPlaceholder = Placeholder([gradOutputArray objectAtIndex:i], output);
             gradRecWeightsPlaceholder = Placeholder([gradRecWeightsArray objectAtIndex:i], grad_rec_weights);
             gradWeightsPlaceholder = Placeholder([gradWeightsArray objectAtIndex:i], grad_weights);
diff --git a/aten/src/ATen/native/mps/operations/UnaryOps.mm b/aten/src/ATen/native/mps/operations/UnaryOps.mm
index a869ff3379aa8..0c6e5b06d0898 100644
--- a/aten/src/ATen/native/mps/operations/UnaryOps.mm
+++ b/aten/src/ATen/native/mps/operations/UnaryOps.mm
@@ -16,6 +16,8 @@ bool is_empty_tensor(const Tensor& self) {
 
 void unary_op(const Tensor& self, const Tensor& output, std::string op_name, UnaryOpBlock unaryBlock, is_noop_p is_noop = is_empty_tensor)
 {
+  TORCH_CHECK(!(!is_macos_13_or_newer() && self.scalar_type() == ScalarType::Byte ),
+              "MPS support unary op with uint8 natively starting from macOS 13.0");
   if (!output.is_same_size(self)) {
     output.resize_(self.sizes());
   }
diff --git a/aten/src/ATen/native/mps/operations/UpSample.mm b/aten/src/ATen/native/mps/operations/UpSample.mm
index 17895e19c7d76..3b781dea08f48 100644
--- a/aten/src/ATen/native/mps/operations/UpSample.mm
+++ b/aten/src/ATen/native/mps/operations/UpSample.mm
@@ -26,6 +26,11 @@ void upsample_out_template(const Tensor& input,
   } else {
     native::upsample_2d_common_check(input.sizes(), output_size);
   }
+  Tensor out;
+  if (!output.is_contiguous()) {
+    out = at::empty_like(output, MemoryFormat::Contiguous);
+  }
+
   bool centerResults = false;
   MPSGraphResizeMode resizeMode = MPSGraphResizeNearest;
   MPSGraphResizeNearestRoundingMode nearestRoundingMode = MPSGraphResizeNearestRoundingModeFloor;
@@ -199,7 +204,7 @@ void upsample_out_template(const Tensor& input,
     MPSGraphTensorData* sizeTensorData = [[[MPSGraphTensorData alloc] initWithMPSNDArray: sizeNDArray] autorelease];
 
     Placeholder inputPlaceholder  = Placeholder(cachedGraph->inputTensor, input);
-    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor, output);
+    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor, out.has_storage() ? out : output, nil, false);
 
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
         inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(),
@@ -209,6 +214,10 @@ void upsample_out_template(const Tensor& input,
         outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
     };
     runMPSGraph(stream, cachedGraph->graph(), feeds, results);
+
+    if (out.has_storage()) {
+      output.copy_(out);
+    }
   }
 }
 
diff --git a/test/cuda_results.yaml b/test/cuda_results.yaml
new file mode 100644
index 0000000000000..bc6e0948ae569
--- /dev/null
+++ b/test/cuda_results.yaml
@@ -0,0 +1,102 @@
+ConsistencyTest: {
+  nn.functional.conv_transpose2d:
+    [[[7.399066925048828, 4.4053635597229, -25.85348129272461,
+        58.88909149169922, -88.75193786621094, -18.98126983642578, 9.437820434570312],
+      [-59.78305435180664, -65.34088134765625, -108.04747009277344, 196.6062469482422,
+        71.39350891113281, 37.8786735534668, -69.55322265625], [92.78504943847656,
+        91.24403381347656, -94.33301544189453, 9.261059761047363, -182.10206604003906,
+        141.4270477294922, 146.89010620117188], [-14.363212585449219, 43.454036712646484,
+        -76.1098403930664, 242.9479522705078, 198.1458282470703, -49.77315139770508,
+        5.891449451446533], [-43.56822967529297, 4.782844066619873, -29.526945114135742,
+        65.15388488769531, 161.29757690429688, 118.60847473144531, 27.08570671081543],
+      [68.29853057861328, -11.507468223571777, 2.044086217880249, 11.003862380981445,
+        34.993282318115234, -21.256723403930664, 91.49512481689453], [-70.4466781616211,
+        69.04386138916016, 7.764842987060547, 7.61972713470459, -28.99899673461914,
+        54.575748443603516, -5.762258052825928]], [[-36.238487243652344, 37.29551696777344,
+        -22.012331008911133, -30.1353702545166, 33.82851028442383, 33.00322341918945,
+        2.7218000888824463], [-7.999058246612549, 122.72489929199219, -1.0639530420303345,
+        2.9564287662506104, -143.1276092529297, -110.75650024414062, 48.0764274597168],
+      [-91.0599136352539, -11.656601905822754, 69.62447357177734, 88.12522888183594,
+        337.3008728027344, -76.9416732788086, -110.24406433105469], [-108.1512451171875,
+        98.42401123046875, 142.46144104003906, -127.48089599609375, -3.367496967315674,
+        86.82833099365234, 86.29623413085938], [-14.339198112487793, -52.287410736083984,
+        171.43614196777344, 200.14817810058594, 200.35476684570312, -189.4150390625,
+        -46.86980056762695], [30.196495056152344, 25.22877311706543, 95.29426574707031,
+        4.455311298370361, 118.48747253417969, 87.11080932617188, -83.6124038696289],
+      [-2.5434072017669678, 91.8791732788086, -10.615175247192383, -12.58531379699707,
+        -49.3439826965332, 33.37324523925781, -5.983145713806152]], [[4.551003932952881,
+        15.84842586517334, -46.354671478271484, 14.721636772155762, 39.01048278808594,
+        49.70054244995117, -18.268564224243164], [16.728954315185547, 129.43505859375,
+        -4.6139116287231445, -3.382319688796997, -238.76353454589844, 13.42194938659668,
+        40.393280029296875], [-2.335604429244995, -85.94283294677734, -142.2253875732422,
+        135.27537536621094, 18.01512336730957, -26.331714630126953, -33.35443878173828],
+      [-79.17593383789062, -93.72674560546875, -110.94194030761719, -61.455223083496094,
+        6.811624526977539, 129.06478881835938, 12.435402870178223], [10.859378814697266,
+        41.3059196472168, 143.55824279785156, -41.754737854003906, -235.32406616210938,
+        -70.98460388183594, 130.46929931640625], [193.57574462890625, -142.5060272216797,
+        -102.45012664794922, 124.68048095703125, 136.05215454101562, -9.650590896606445,
+        -45.59521484375], [-37.829593658447266, 39.12519454956055, 9.293094635009766,
+        -18.8004093170166, -0.7294210195541382, 51.884910583496094, 36.15913391113281]],
+    [[-15.651233673095703, 16.31340980529785, -26.752052307128906, 6.281721115112305,
+        43.765541076660156, -13.097319602966309, -30.443206787109375], [10.67841911315918,
+        66.1829605102539, -9.394262313842773, -131.45101928710938, -38.621002197265625,
+        65.9507064819336, 48.76960372924805], [-76.0918197631836, -9.108996391296387,
+        13.64936637878418, 96.7411880493164, 124.2474365234375, -111.50318145751953,
+        -42.397071838378906], [-83.31562805175781, 32.27967071533203, 250.08163452148438,
+        58.24131393432617, 129.95318603515625, -10.683560371398926, -123.84668731689453],
+      [-11.536887168884277, -15.220125198364258, 197.18821716308594, -31.680112838745117,
+        -81.35874938964844, 157.96974182128906, 105.61251831054688], [78.15926361083984,
+        -84.49744415283203, -73.91180419921875, 86.370361328125, 77.87918090820312,
+        55.3555908203125, -7.273794651031494], [25.232547760009766, 30.352109909057617,
+        53.722267150878906, 44.87421798706055, 44.618812561035156, 4.511796951293945,
+        9.039834976196289]]]
+}
+UnitTest: {
+  norm: 
+  [
+    {
+      dtype: f16,
+      args: [[[ 8.9453,  4.0859,  0.1230,  2.1367, -5.0000],
+        [ 7.2773, -4.6953, -3.5586,  8.2812, -0.8789],
+        [ 0.7119, -1.4854,  6.8633, -7.9805, -3.6562],
+        [-1.0195, -7.2695, -0.0264, -3.5078, -0.2900],
+        [ 8.7656,  5.8984, -2.3125, -0.0352,  5.2812]],],
+      params: [0.5,],
+      res: [2000.]
+    },
+    {
+      dtype: f16,
+      args: [[[[ 8.9219,  3.0508, -3.0234, -5.6250, -5.3516],
+         [-5.8906,  5.2109, -7.2500,  7.3047, -0.1846],
+         [-2.1367, -8.8047, -3.4727, -3.0859,  4.9062],
+         [ 2.1797, -8.5078,  6.1445, -5.0547,  2.8828],
+         [-2.6191,  4.6680, -4.1758,  8.7734, -5.4844]],
+
+        [[-5.8984,  7.3281, -7.3672, -0.0879,  7.0039],
+         [ 2.0117, -6.4258,  8.6250,  2.5137, -2.2676],
+         [-7.2578,  1.6875,  7.8750,  7.5078,  0.8350],
+         [-4.8164, -3.6914, -3.9199,  4.9219, -4.6680],
+         [ 5.0547, -7.1289,  2.3633,  3.7793, -7.4375]],
+
+        [[-8.6953, -3.8750,  0.8965, -4.4453,  6.1328],
+         [ 8.6719,  2.5586, -3.0664, -7.7891,  2.5234],
+         [ 5.8008,  0.5977,  4.9219,  3.0156,  3.6211],
+         [-6.0898, -3.4883,  2.6543,  7.1992,  5.9414],
+         [-3.6035,  8.3906,  2.2070, -1.1162,  7.2852]],
+
+        [[-2.4531, -2.9180,  6.2422, -6.3711, -8.3516],
+         [ 3.3398, -8.5078, -8.9375, -2.0312, -4.3320],
+         [-1.4326, -4.5000, -0.3252, -6.8555, -8.2969],
+         [ 5.8438,  5.6094, -6.6797, -0.0439,  3.6035],
+         [ 4.5859,  7.1016, -0.8086,  5.6953,  0.5098]],
+
+        [[ 3.0859,  4.4844,  0.6152,  7.9609, -7.6562],
+         [-0.7998, -3.4023,  5.7734, -2.4785,  5.9219],
+         [ 7.1094,  1.4502, -7.1289,  4.7188, -4.8359],
+         [ 2.7422, -1.9512,  5.6602, -3.6387, -8.6953],
+         [-4.6953,  0.2900,  2.7148, -0.0176,  7.6992]]],],
+      params: [1.5],
+      res: [125.2500]
+    },
+  ],
+}
diff --git a/test/test_modules.py b/test/test_modules.py
index 2ae17f5f8cf85..9c244fb65e60b 100644
--- a/test/test_modules.py
+++ b/test/test_modules.py
@@ -10,12 +10,23 @@
 from torch.testing._internal.common_cuda import with_tf32_off
 from torch.testing._internal.common_device_type import (
     instantiate_device_type_tests, onlyCUDA, toleranceOverride, tol, skipMeta)
+from torch.testing._internal.common_dtype import get_all_dtypes
 from torch.testing._internal.common_modules import module_db, modules, TrainEvalMode
 from torch.testing._internal.common_utils import (
     TestCase, run_tests, freeze_rng_state, mock_wrapper, get_tensors_from, gradcheck,
-    gradgradcheck, skipIfMps, skipIfTorchInductor)
+    gradgradcheck, skipIfTorchInductor)
 from unittest.mock import patch, call
 
+MPS_DTYPES = get_all_dtypes()
+for t in [torch.double, torch.cdouble, torch.cfloat, torch.int8, torch.bfloat16]:
+    del MPS_DTYPES[MPS_DTYPES.index(t)]
+
+def _get_mps_error_msg(device, dtype, op, mps_blocklist):
+    if torch.backends.mps.is_available() and device == "mps" and dtype not in MPS_DTYPES:
+        return f"MPS doesn't support {str(dtype)} datatype"
+    if op.name.startswith(tuple(mps_blocklist)):
+        return "MPS doesn't support op " + str(op.name)
+    return None
 
 class TestModule(TestCase):
     _do_cuda_memory_leak_check = True
@@ -33,7 +44,8 @@ def _assert_module_parameters_and_buffer_are(self, module, device, dtype):
         def _check_module(items, name, device=device, dtype=dtype):
             for item_name, item in items:
                 self.assertEqual(
-                    item.device, device,
+                    # workaround for the tests checking the device (mps:0 with mps)
+                    item.device.type, device.type,
                     f'{name} {item_name} is on device {item.device} instead of the expected device {device}')
                 if item.dtype.is_floating_point:
                     self.assertEqual(
@@ -42,9 +54,16 @@ def _check_module(items, name, device=device, dtype=dtype):
         _check_module(module.named_parameters(), "Parameter")
         _check_module(module.named_buffers(), "Buffer")
 
-    @skipIfMps  # the test doesn't work on MPS as double types are not supported
     @modules(module_db)
     def test_forward(self, device, dtype, module_info, training):
+        MPS_BLOCKLIST = [
+            "nn.LSTM"  # segfault
+        ]
+
+        msg = _get_mps_error_msg(device, dtype, module_info, MPS_BLOCKLIST)
+        if msg is not None:
+            self.skipTest(msg)
+
         module_cls = module_info.module_cls
         module_inputs = module_info.module_inputs_func(module_info, device=device, dtype=dtype,
                                                        requires_grad=False, training=training)
@@ -84,6 +103,10 @@ def test_forward(self, device, dtype, module_info, training):
     # They should be applied to any created parameters and buffers.
     @modules(module_db)
     def test_factory_kwargs(self, device, dtype, module_info, training):
+        msg = _get_mps_error_msg(device, dtype, module_info, [])
+        if msg is not None:
+            self.skipTest(msg)
+
         module_cls = module_info.module_cls
         module_inputs = module_info.module_inputs_func(module_info, device=device, dtype=dtype,
                                                        requires_grad=False, training=training)
@@ -198,6 +221,11 @@ def _to_device1(objs):
     @modules(module_db)
     def test_repr(self, device, dtype, module_info, training):
         # Test module can be represented with repr and str without errors.
+
+        msg = _get_mps_error_msg(device, dtype, module_info, [])
+        if msg is not None:
+            self.skipTest(msg)
+
         module_cls = module_info.module_cls
         module_inputs = module_info.module_inputs_func(module_info, device=device, dtype=dtype,
                                                        requires_grad=False, training=training)
@@ -211,10 +239,19 @@ def test_repr(self, device, dtype, module_info, training):
             m.__repr__()
             str(m)
 
-    @skipIfMps
     @modules(module_db)
     def test_pickle(self, device, dtype, module_info, training):
         # Test that module can be pickled and unpickled.
+
+        MPS_BLOCKLIST = [
+            "nn.LSTM"  # hard crash
+        ]
+
+        msg = _get_mps_error_msg(device, dtype, module_info, MPS_BLOCKLIST)
+        if msg is not None:
+            self.skipTest(msg)
+
+
         module_cls = module_info.module_cls
         module_inputs = module_info.module_inputs_func(module_info, device=device, dtype=dtype,
                                                        requires_grad=False, training=training)
@@ -249,6 +286,15 @@ def test_pickle(self, device, dtype, module_info, training):
     def test_check_inplace(self, device, dtype, module_info, training):
         # Check if the inplace variant of the module gives the same result as the out of place
         # variant.
+
+        MPS_BLOCKLIST = [
+            "nn.ELU"  # hard crash
+        ]
+
+        msg = _get_mps_error_msg(device, dtype, module_info, MPS_BLOCKLIST)
+        if msg is not None:
+            self.skipTest(msg)
+
         module_cls = module_info.module_cls
         module_inputs = module_info.module_inputs_func(module_info, device=device, dtype=dtype,
                                                        requires_grad=True, training=training)
@@ -326,11 +372,21 @@ def inner_zero_grad(obj):
                 obj.grad = None
         self._traverse_obj(obj, inner_zero_grad)
 
-    @skipIfMps
     @modules(module_db)
     @skipIfTorchInductor("to be fixed")
     def test_non_contiguous_tensors(self, device, dtype, module_info, training):
         # Check modules work with non-contiguous tensors
+        MPS_BLOCKLIST = [
+            # hard crashes
+            "nn.GRU",
+            "nn.LSTM",
+            "nn.RNN"
+        ]
+
+        msg = _get_mps_error_msg(device, dtype, module_info, MPS_BLOCKLIST)
+        if msg is not None:
+            self.skipTest(msg)
+
 
         module_cls = module_info.module_cls
         module_inputs = module_info.module_inputs_func(module_info, device=device, dtype=dtype,
@@ -582,10 +638,18 @@ def check_backward(cpu_output, gpu_output):
                     for cpu_output, gpu_output in zip(flatten_cpu_outputs, flatten_gpu_outputs):
                         check_backward(cpu_output, gpu_output)
 
-    @skipIfMps
     @modules(module_db)
     @skipIfTorchInductor("to be fixed")
     def test_memory_format(self, device, dtype, module_info, training):
+        MPS_BLOCKLIST = [
+            "nn.BatchNorm3d",  # failed assert
+            "nn.LSTM",  # segfault
+        ]
+
+        msg = _get_mps_error_msg(device, dtype, module_info, MPS_BLOCKLIST)
+        if msg is not None:
+            self.skipTest(msg)
+
         is_sm86 = device.startswith("cuda") and torch.cuda.get_device_capability(0) == (8, 6)
         # TODO tighten it to a specific module
         atol, rtol = (3e-3, 7e-3) if is_sm86 else (None, None)
@@ -682,9 +746,12 @@ def inner_check_out_mem_format(output):
 
     # Test whether train and eval modes differ for each module. Use to verify
     # that the ModuleInfo entry flag is correct.
-    @skipIfMps  # the test doesn't work on MPS as double types are not supported
     @modules(module_db, train_eval_mode=TrainEvalMode.train_only)
     def test_if_train_and_eval_modes_differ(self, device, dtype, module_info, training):
+        msg = _get_mps_error_msg(device, dtype, module_info, [])
+        if msg is not None:
+            self.skipTest(msg)
+
         module_cls = module_info.module_cls
         module_inputs = module_info.module_inputs_func(module_info, device=device, dtype=dtype,
                                                        requires_grad=False, training=training)
diff --git a/test/test_mps.py b/test/test_mps.py
index b3740b5cd1148..e983760d0951c 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -16,6 +16,8 @@
 import torch.nn as nn
 import torch.nn.functional as F
 import itertools
+import yaml
+import platform
 from collections import defaultdict
 from torch import inf
 from torch.nn import Parameter
@@ -26,9 +28,10 @@
 from torch.testing import make_tensor
 from torch.testing._comparison import TensorLikePair
 from torch.testing._internal.common_dtype import get_all_dtypes, integral_types
+import torch.mps
 import torch.backends.mps
 from torch.distributions import Uniform, Exponential
-from functools import partial
+from functools import partial, reduce
 
 from torch.testing._internal.common_methods_invocations import (
     op_db,
@@ -62,6 +65,8 @@
     TestCase = object  # noqa: F811
     NNTestCase = object  # noqa: F811
 
+product_version = float('.'.join(platform.mac_ver()[0].split('.')[:2]))
+
 # Determine whether to enable MPS memory leak check (uses same code as CUDA).
 TEST_MPS_MEM_LEAK_CHECK = os.getenv('PYTORCH_TEST_MPS_MEM_LEAK_CHECK', '0') == '1'
 
@@ -371,6 +376,15 @@ def test_avg_pool2d_ceil_mode(self):
 
 
 class TestMPS(TestCaseMPS):
+    def help_extra_unit(self, opname, op):
+        if opname not in OP_UNIT_TEST:
+            return
+        for test in OP_UNIT_TEST[opname]:
+            mps_args = test.sample()
+            mps_out = op(*mps_args)
+            mps_out = (mps_out, ) if isinstance(mps_out, torch.Tensor) else mps_out
+            self.assertEqual(test.expected(), mps_out)
+
     def test_exp(self, device="mps", dtype=torch.float):
         for v in (2, -2) + ((1j, 1 + 1j) if dtype.is_complex else ()):
             b = torch.arange(18, device="cpu") / 3 * math.pi
@@ -432,6 +446,53 @@ def helper(val, shape):
         helper(0, [1024])
         helper(0.2, [2, 3])
 
+    def test_mm(self):
+        B = torch.ones(5, 6).to("mps")
+        C = torch.ones(6, 5).to("mps")
+        D = torch.mm(B, C).cpu()
+        torch.testing.assert_close(D, torch.full((5, 5), 6.0))
+
+    def test_linalg_cross(self):
+        def helper(dtype):
+            device = "mps"
+            if dtype is torch.int32 or dtype is torch.int64:
+                x = torch.randint(0, 99999, (100, 3, 100), dtype=dtype, device=device)
+                y = torch.randint(0, 99999, (100, 3, 100), dtype=dtype, device=device)
+            else:
+                x = torch.rand(100, 3, 100, dtype=dtype, device=device)
+                y = torch.rand(100, 3, 100, dtype=dtype, device=device)
+            x_cpu = x.to("cpu")
+            y_cpu = y.to("cpu")
+            res1 = torch.linalg.cross(x, y, dim=1)
+            res2 = torch.tensor((), dtype=dtype, device=device)
+            res1_cpu = torch.linalg.cross(x_cpu, y_cpu, dim=1)
+            res2_cpu = torch.tensor((), dtype=dtype, device="cpu")
+            torch.linalg.cross(x, y, dim=1, out=res2)
+            torch.linalg.cross(x_cpu, y_cpu, dim=1, out=res2_cpu)
+            self.assertEqual(res1, res2)
+            self.assertEqual(res1, res1_cpu)
+            self.assertEqual(res2, res2_cpu)
+
+            # test for broadcastable inputs
+            if dtype is torch.int32 or dtype is torch.int64:
+                x = torch.randint(0, 99999, (1, 3, 2), dtype=dtype, device=device)
+                y = torch.randint(0, 99999, (4, 3, 1), dtype=dtype, device=device)
+            else:
+                x = torch.rand(1, 3, 2, dtype=dtype, device=device)
+                y = torch.rand(4, 3, 1, dtype=dtype, device=device)
+            x_cpu = x.to("cpu")
+            y_cpu = y.to("cpu")
+            res1 = torch.linalg.cross(x, y, dim=1)
+            res2 = torch.tensor((), dtype=dtype, device=device)
+            res1_cpu = torch.linalg.cross(x_cpu, y_cpu, dim=1)
+            res2_cpu = torch.tensor((), dtype=dtype, device="cpu")
+            torch.linalg.cross(x, y, dim=1, out=res2)
+            torch.linalg.cross(x_cpu, y_cpu, dim=1, out=res2_cpu)
+            self.assertEqual(res1, res2)
+            self.assertEqual(res1, res1_cpu)
+            self.assertEqual(res2, res2_cpu)
+        [helper(dtype) for dtype in [torch.int32, torch.int64, torch.float32]]
+
     def test_cdist_large(self, device="mps"):
         for cm in ['use_mm_for_euclid_dist_if_necessary', 'use_mm_for_euclid_dist', 'donot_use_mm_for_euclid_dist']:
             x = torch.randn(100, 10, device=device)
@@ -577,53 +638,6 @@ def test_cdist_norm_batch(self, device="mps"):
                             expected = self._brute_cdist(x, y, p=p)
                             self.assertEqual(expected, actual)
 
-    def test_mm(self):
-        B = torch.ones(5, 6).to("mps")
-        C = torch.ones(6, 5).to("mps")
-        D = torch.mm(B, C).cpu()
-        torch.testing.assert_close(D, torch.full((5, 5), 6.0))
-
-    def test_linalg_cross(self):
-        def helper(dtype):
-            device = "mps"
-            if dtype is torch.int32 or dtype is torch.int64:
-                x = torch.randint(0, 99999, (100, 3, 100), dtype=dtype, device=device)
-                y = torch.randint(0, 99999, (100, 3, 100), dtype=dtype, device=device)
-            else:
-                x = torch.rand(100, 3, 100, dtype=dtype, device=device)
-                y = torch.rand(100, 3, 100, dtype=dtype, device=device)
-            x_cpu = x.to("cpu")
-            y_cpu = y.to("cpu")
-            res1 = torch.linalg.cross(x, y, dim=1)
-            res2 = torch.tensor((), dtype=dtype, device=device)
-            res1_cpu = torch.linalg.cross(x_cpu, y_cpu, dim=1)
-            res2_cpu = torch.tensor((), dtype=dtype, device="cpu")
-            torch.linalg.cross(x, y, dim=1, out=res2)
-            torch.linalg.cross(x_cpu, y_cpu, dim=1, out=res2_cpu)
-            self.assertEqual(res1, res2)
-            self.assertEqual(res1, res1_cpu)
-            self.assertEqual(res2, res2_cpu)
-
-            # test for broadcastable inputs
-            if dtype is torch.int32 or dtype is torch.int64:
-                x = torch.randint(0, 99999, (1, 3, 2), dtype=dtype, device=device)
-                y = torch.randint(0, 99999, (4, 3, 1), dtype=dtype, device=device)
-            else:
-                x = torch.rand(1, 3, 2, dtype=dtype, device=device)
-                y = torch.rand(4, 3, 1, dtype=dtype, device=device)
-            x_cpu = x.to("cpu")
-            y_cpu = y.to("cpu")
-            res1 = torch.linalg.cross(x, y, dim=1)
-            res2 = torch.tensor((), dtype=dtype, device=device)
-            res1_cpu = torch.linalg.cross(x_cpu, y_cpu, dim=1)
-            res2_cpu = torch.tensor((), dtype=dtype, device="cpu")
-            torch.linalg.cross(x, y, dim=1, out=res2)
-            torch.linalg.cross(x_cpu, y_cpu, dim=1, out=res2_cpu)
-            self.assertEqual(res1, res2)
-            self.assertEqual(res1, res1_cpu)
-            self.assertEqual(res2, res2_cpu)
-        [helper(dtype) for dtype in [torch.int32, torch.int64, torch.float32]]
-
     def test_cross(self):
         a = torch.randn(4, 3, device="mps")
         b = torch.randn(4, 3, device="mps")
@@ -640,6 +654,13 @@ def test_addmm(self):
         D = torch.addmm(A, B, C).to("cpu")
         torch.testing.assert_close(D, torch.full((5, 5), 7.0))
 
+    def test_addr(self):
+        A = torch.ones(5, 10).to("mps")
+        B = torch.ones(5).to("mps")
+        C = torch.ones(10).to("mps")
+        D = torch.addr(A, B, C).to("cpu")
+        torch.testing.assert_close(D, torch.full((5, 10), 2.0))
+
     def test_bmm(self):
         batch1_cpu = torch.randn(10, 3, 4)
         batch2_cpu = torch.randn(10, 4, 5)
@@ -653,13 +674,6 @@ def test_bmm(self):
         self.assertEqual(output_cpu, output_mps)
         self.assertEqual(output_cpu.size(), output_mps.size())
 
-    def test_addr(self):
-        A = torch.ones(5, 10).to("mps")
-        B = torch.ones(5).to("mps")
-        C = torch.ones(10).to("mps")
-        D = torch.addr(A, B, C).to("cpu")
-        torch.testing.assert_close(D, torch.full((5, 10), 2.0))
-
     def test_trace(self):
         M_cpu = torch.randn(3, 3)
         M_mps = M_cpu.detach().clone().to("mps")
@@ -1212,7 +1226,7 @@ def test_norm(self):
         self.assertEqual(res, res_cpu)
 
         c = torch.tensor([[1, 2, 3], [-1, 1, 4]], dtype=torch.float, device="mps")
-        c_cpu = torch.tensor([[1, 2, 3], [-1, 1, 4]] , dtype=torch.float, device="cpu")
+        c_cpu = torch.tensor([[1, 2, 3], [-1, 1, 4]], dtype=torch.float, device="cpu")
 
         res = torch.norm(c, dim=0)
         res_cpu = torch.norm(c_cpu, dim=0)
@@ -1237,6 +1251,8 @@ def test_norm(self):
         res_cpu = torch.norm(d_cpu[0, :, :]), torch.norm(d_cpu[1, :, :])
         self.assertEqual(res, res_cpu)
 
+        self.help_extra_unit('norm', torch.norm)
+
     def test_layer_norm(self):
         # TODO: Test non-contiguous
         def helper(input_shape, normalized_shape, eps=1e-05, elementwise_affine=True, dtype=torch.float32):
@@ -1783,6 +1799,15 @@ def test_slice_reshape(self):
         x_cpu = x_cpu + 2
         self.assertEqual(x, x_cpu)
 
+    def test_slice_casting(self):
+        # generate random binary numbers
+        cpu_in = torch.bernoulli(torch.empty(1, 1, 128, 128).uniform_(0, 1)).to(torch.uint8)
+        mps_in = cpu_in.detach().clone().to("mps")
+        # check copy_cast(unit8 -> bool) on tensors with storage offset
+        cpu_out = cpu_in[:, :, 11 : 12, :12].to(torch.bool)
+        mps_out = mps_in[:, :, 11 : 12, :12].to(torch.bool)
+        self.assertEqual(cpu_out, mps_out)
+
     def test_slice_reshape_contg_view(self):
         import torch
 
@@ -1818,12 +1843,6 @@ def test_view_slice(self):
                 actual_pts[i, j] = X[pts[i, j], j]
                 self.assertEqual(actual_pts[i, j], actual_pts_mps[i, j])
 
-    def test_slice_scatter(self):
-        shape = (4, 4)
-        tensor = torch.randint(10, shape, device="mps")
-        tensor_before = tensor.clone()
-        torch.empty(shape[0], shape[1] * 2, device="mps")[:, ::2].copy_(tensor)
-        torch.testing.assert_close(tensor, tensor_before)
 
     def test_slice(self):
         values = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]
@@ -1983,99 +2002,6 @@ def helper(shape, repeats):
         helper((3, 4, 5), (2, 3, 4, 5))
         helper((3, 4, 5), (2, 2, 2))
 
-    def test_torch_repeat_interleave(self, device="mps"):
-        y = torch.tensor([[1, 2], [3, 4]], device=device)
-        # exercise single argument function signature
-        temp = y.repeat_interleave(2)
-        self.assertEqual(torch.Size([8]), temp.size())
-
-        for dtype in [torch.int, torch.long]:
-            lengths = torch.tensor([1, 2], dtype=dtype, device="mps")
-            output_size = torch.sum(lengths)
-            a = torch.repeat_interleave(
-                y,
-                lengths,
-                dim=0,
-            )
-            self.assertEqual(a.dtype, y.dtype)
-            self.assertEqual(a.size(), torch.Size([3, 2]))
-
-            a_with_output = torch.repeat_interleave(
-                y,
-                lengths,
-                dim=0,
-                output_size=output_size,
-            )
-            self.assertEqual(a_with_output.dtype, y.dtype)
-            self.assertEqual(a_with_output.size(), torch.Size([3, 2]))
-
-    def test_repeat_interleave(self, device="mps"):
-        x = torch.tensor([0, 1, 2, 3], device=device)
-        expected = torch.tensor([1, 2, 2, 3, 3, 3], dtype=torch.int32, device=device)
-        self.assertEqual(torch.repeat_interleave(x), expected)
-
-        with self.assertRaises(RuntimeError):
-            torch.repeat_interleave(torch.arange(4, device=device).reshape(2, 2))
-
-        with self.assertRaises(RuntimeError):
-            torch.repeat_interleave(torch.arange(4.0, device=device))
-
-        with self.assertRaises(RuntimeError):
-            torch.repeat_interleave(torch.tensor([1, 2, -1, 3, 4], device=device))
-
-        y = torch.tensor([[1, 2], [3, 4]], device=device)
-
-        y1_v1 = torch.repeat_interleave(y, 2)
-        y1_v2 = torch.repeat_interleave(y, torch.tensor(2, device=device))
-        y1_v3 = torch.repeat_interleave(y, torch.tensor([2], device=device))
-        y1_expect = torch.tensor([1, 1, 2, 2, 3, 3, 4, 4], device=device)
-        self.assertEqual(y1_v1, y1_expect)
-        self.assertEqual(y1_v2, y1_expect)
-        self.assertEqual(y1_v3, y1_expect)
-
-        y2 = torch.repeat_interleave(y, 3, dim=1)
-        y2_expect = torch.tensor([[1, 1, 1, 2, 2, 2],
-                                  [3, 3, 3, 4, 4, 4]], device=device)
-        self.assertEqual(y2, y2_expect)
-
-        y3 = torch.repeat_interleave(y, torch.tensor([1, 2], device=device), dim=0)
-        y3_expect = torch.tensor([[1, 2],
-                                  [3, 4],
-                                  [3, 4]], device=device)
-        self.assertEqual(y3, y3_expect)
-
-        with self.assertRaises(RuntimeError):
-            torch.repeat_interleave(y, torch.tensor([1, 2, 3], device=device), dim=0)
-
-        with self.assertRaises(RuntimeError):
-            torch.repeat_interleave(y, torch.arange(9, device=device).reshape(3, 3), dim=0)
-
-        # test zero sized dimension
-        x = torch.zeros((5, 0), device=device)
-        y = torch.repeat_interleave(x, repeats=3, dim=1)
-        self.assertEqual(y, x.new_zeros(5, 0, device=device))
-
-        x = torch.tensor([], dtype=torch.int64, device=device)
-        y = torch.repeat_interleave(x, x)
-        self.assertEqual(y, x)
-
-    def test_repeat_interleave_simple(self):
-        def helper(shape, dtype=torch.float32, num_repeats=torch.Tensor(), dim=None):
-            x = torch.randn(shape, dtype=dtype, device="mps")
-            x_cpu = x.detach().clone().cpu()
-
-            num_repeats_cpu = num_repeats.detach().clone().cpu()
-
-            repeats = torch.repeat_interleave(x, num_repeats, dim)
-            repeats_cpu = torch.repeat_interleave(x_cpu, num_repeats_cpu, dim)
-
-            self.assertEqual(repeats, repeats_cpu)
-        helper(shape=3, num_repeats=torch.tensor([100], device="mps"))
-        helper(shape=(2, 2), num_repeats=torch.tensor([3, 3], device="mps"), dim=0)
-        helper(shape=(10, 15, 8), num_repeats=torch.arange(10, device="mps"), dim=0)
-        helper(shape=(10, 15, 8), num_repeats=torch.randint(0, 100, (15, ), device="mps"), dim=1)
-        helper(shape=(10, 15, 30), num_repeats=torch.randint(0, 100, (30, ), device="mps"), dim=2)
-
     def test_count_nonzero(self):
         def helper(dtype):
             n = [
@@ -2151,6 +2077,15 @@ def test_to(self):
             x_mps = x_cpu.to('mps')
             self.assertEqual(x_mps.to(torch.float32), x_cpu.to(torch.float32))
 
+    @unittest.skipIf(True, "non-contiguous tensor to mps is incorrect.")
+    def test_to_non_contiguous(self):
+        x = torch.arange(16, dtype=torch.float32).reshape(2, 2, 2, 2)
+        x1 = x[:, :, :1, :]
+        x2 = x[:, :, 1:, :]
+        self.assertFalse(x1.is_contiguous())
+        self.assertFalse(x2.is_contiguous())
+        self.assertEqual(x1, x1.detach().to("mps"))
+        self.assertEqual(x2, x2.detach().to("mps"))
 
     def test_setitem_scalar(self) -> None:
         device = 'mps'
@@ -2224,9 +2159,9 @@ def test_storage_offset_greater_than_src_nbytes(self):
             tensor_list.append(t)
 
         for i in range(0, n_tensors - 1):
-            t = tensor_list[i].view(1, n_tensor_elems)
+            t = tensor_list[i].view(1, 784)
             t_mps = t.to("mps")
-            self.assertEqual(t, t_mps.cpu(), f"i={i}")
+            self.assertEqual(t, t_mps.cpu())
 
     # See https://github.com/pytorch/pytorch/issues/82427
     # and https://github.com/pytorch/pytorch/issues/83692
@@ -2238,6 +2173,7 @@ def test_full_bugs(self):
         y_cpu = torch.full((2, 2), 247, device='cpu', dtype=torch.uint8)
         self.assertEqual(y_mps, y_cpu)
 
+    @unittest.skipIf(product_version < 13.0, "Skipped on macOS 12")
     # See https://github.com/pytorch/pytorch/issues/84995
     def test_div_bugs(self):
         for (dtype, mode) in itertools.product(integral_types(), ['trunc', 'floor']):
@@ -2304,6 +2240,7 @@ def ensure_tuple(x):
                 self.assertEqual(expected_inverse.view(additional_shape), y_inverse)
                 self.assertEqual(expected_counts, y_counts)
 
+    @unittest.skipIf(product_version < 13.0, "Skipped on macOS 12")
     def test_unique_all_dtypes(self, device="mps"):
         def helper(dtype):
             def ensure_tuple(x):
@@ -2359,7 +2296,7 @@ def ensure_tuple(x):
                                 if k == i:
                                     count += 1
                             self.assertEqual(j, count)
-        [helper(dtype) for dtype in [torch.float32, torch.int64, torch.int32, torch.int16, torch.uint8]]
+        [helper(dtype) for dtype in [torch.float32, torch.float16, torch.int64, torch.int32, torch.int16, torch.uint8]]
 
     def test_unique(self):
         def helper(x, return_inverse, return_counts):
@@ -2371,12 +2308,12 @@ def helper(x, return_inverse, return_counts):
 
             self.assertEqual(result, result_cpu)
         helper(torch.tensor([1, 2, 4, 2, 1]), False, False)
-        helper(torch.randint(3, (10, )), False, False)
-        helper(torch.randint(3, (10, )), True, False)
-        helper(torch.randint(3, (10, )), False, True)
-        helper(torch.randint(3, (10, )), True, True)
-        helper(torch.randint(3, (1, )), True, True)
-        helper(torch.randint(3, (0, )), True, True)
+        helper(torch.randint(3, (10,)), False, False)
+        helper(torch.randint(3, (10,)), True, False)
+        helper(torch.randint(3, (10,)), False, True)
+        helper(torch.randint(3, (10,)), True, True)
+        helper(torch.randint(3, (1,)), True, True)
+        helper(torch.randint(3, (0,)), True, True)
 
     def test_unique_consecutive(self):
         def helper(x, dim, return_inverse, return_counts):
@@ -2388,13 +2325,13 @@ def helper(x, dim, return_inverse, return_counts):
 
             self.assertEqual(result, result_cpu)
         helper(torch.tensor([1, 2, 4, 2, 1]), 0, False, False)
-        helper(torch.randint(3, (10, )), 0, False, False)
-        helper(torch.randint(3, (10, )), 0, True, False)
-        helper(torch.randint(3, (10, )), 0, False, True)
-        helper(torch.randint(3, (10, )), 0, True, True)
-        helper(torch.randint(3, (10, )), 0, True, True)
-        helper(torch.randint(3, (1, )), 0, True, True)
-        helper(torch.randint(3, (0, )), 0, True, True)
+        helper(torch.randint(3, (10,)), 0, False, False)
+        helper(torch.randint(3, (10,)), 0, True, False)
+        helper(torch.randint(3, (10,)), 0, False, True)
+        helper(torch.randint(3, (10,)), 0, True, True)
+        helper(torch.randint(3, (10,)), 0, True, True)
+        helper(torch.randint(3, (1,)), 0, True, True)
+        helper(torch.randint(3, (0,)), 0, True, True)
 
         helper(torch.tensor([[1, 1, 2, 3, 3, 2], [1, 1, 1, 2, 2, 1]]), 0, False, False)
         helper(torch.tensor([[1, 1, 2, 3, 3, 2], [1, 1, 1, 2, 2, 1]]), 0, True, True)
@@ -2437,134 +2374,6 @@ def test_from_numpy_non_contiguous(self):
         t_mps = torch.tensor(a, device="mps")
         self.assertEqual(t_cpu, t_mps.to("cpu"))
 
-    # See https://github.com/pytorch/pytorch/issues/86954
-    def test_copy_non_contiguous(self):
-        x = torch.arange(27).reshape(3, 3, 3).permute(2, 0, 1)
-        self.assertFalse(x.is_contiguous())
-        y = x.to('mps')
-        self.assertFalse(y.is_contiguous())
-        self.assertEqual(x, y.to('cpu'))
-
-        x = torch.arange(4**3).reshape(4, 4, 4).permute((2, 0, 1))[1:, ::2]
-        y = x.to('mps')
-        self.assertEqual(x, y.to('cpu'))
-
-        x = torch.full((4, 4, 4, 4), 13, device="cpu")
-        y = torch.full((4, 4, 4, 4), 13, device="mps")
-        z = torch.arange(4**4).reshape(4, 4, 4, 4).permute(3, 2, 0, 1)[1::, ::2]
-        x.permute(3, 2, 1, 0)[1::, ::2] = z
-        # As y is on MPS and z on CPU, this dispatches to a copy operator
-        y.permute(3, 2, 1, 0)[1::, ::2] = z
-        self.assertEqual(x, y.to('cpu'))
-
-    # See https://github.com/pytorch/pytorch/pull/84742
-    # and https://github.com/pytorch/pytorch/pull/78319
-    def test_binops_dtype_precedence(self):
-        # Test dtype precedence (casting order) in binary operations by comparing to CPU result
-        # Example values for all dtypes supported on the MPS backend
-        sample_vals = {
-            torch.bool: [False, True],
-            torch.int16: [-15, 0, 1, 10],
-            torch.int32: [-376, 0, 1, 13],
-            torch.int64: [-8, 0, 1, 77],
-            torch.float16: [-234.5, 0.0, 1.0, 2.0],
-            torch.float32: [-1.0, 0.0, 0.1, 111.99],
-        }
-        # Test all combinations of dtypes, operations, dimensionality
-        for dtype1, dtype2, binop in itertools.product(
-                sample_vals.keys(), sample_vals.keys(), ['add', 'sub', 'mul', 'div']):
-            # bool minus bool is generally unsupported, so skip
-            if binop == 'sub' and (dtype1 == torch.bool or dtype2 == torch.bool):
-                continue
-            full_shape = (10,)
-            for val1, val2 in itertools.product(sample_vals[dtype1], sample_vals[dtype2]):
-                # print(f'{dtype1},{dtype2}: ({val1}).{binop}({val2})')
-                # print(getattr(torch.tensor(val1, dtype=dtype1, device='mps'), binop)
-                #            (torch.tensor(val2, dtype=dtype2, device='mps')))
-                # print(getattr(torch.tensor(val1, dtype=dtype1, device='cpu'), binop)
-                #            (torch.tensor(val2, dtype=dtype2, device='cpu')))
-                self.assertEqual(
-                    getattr(torch.tensor(val1, dtype=dtype1, device='mps'), binop)
-                           (torch.tensor(val2, dtype=dtype2, device='mps')),
-                    getattr(torch.tensor(val1, dtype=dtype1, device='cpu'), binop)
-                           (torch.tensor(val2, dtype=dtype2, device='cpu')))
-                self.assertEqual(
-                    getattr(torch.tensor([val1], dtype=dtype1, device='mps'), binop)
-                           (torch.tensor([val2], dtype=dtype2, device='mps')),
-                    getattr(torch.tensor([val1], dtype=dtype1, device='cpu'), binop)
-                           (torch.tensor([val2], dtype=dtype2, device='cpu')))
-                self.assertEqual(
-                    getattr(torch.tensor(val1, dtype=dtype1, device='mps'), binop)
-                           (torch.tensor([val2], dtype=dtype2, device='mps')),
-                    getattr(torch.tensor(val1, dtype=dtype1, device='cpu'), binop)
-                           (torch.tensor([val2], dtype=dtype2, device='cpu')))
-                self.assertEqual(
-                    getattr(torch.tensor([val1], dtype=dtype1, device='mps'), binop)
-                           (torch.tensor(val2, dtype=dtype2, device='mps')),
-                    getattr(torch.tensor([val1], dtype=dtype1, device='cpu'), binop)
-                           (torch.tensor(val2, dtype=dtype2, device='cpu')))
-                # Test tensors created with torch.full
-                x1 = torch.full(full_shape, val1, dtype=dtype1, device='mps')
-                y1 = torch.tensor(val2, dtype=dtype2, device='mps')
-                x2 = torch.full(full_shape, val1, dtype=dtype1, device='cpu')
-                y2 = torch.tensor(val2, dtype=dtype2, device='cpu')
-                self.assertEqual(getattr(x1, binop)(y1), getattr(x2, binop)(y2))
-                x3 = torch.tensor(val1, dtype=dtype1, device='mps')
-                y3 = torch.full(full_shape, val2, dtype=dtype2, device='mps')
-                x4 = torch.tensor(val1, dtype=dtype1, device='cpu')
-                y4 = torch.full(full_shape, val2, dtype=dtype2, device='cpu')
-                self.assertEqual(getattr(x3, binop)(y3), getattr(x4, binop)(y4))
-                self.assertEqual(
-                    getattr(torch.tensor(val1, dtype=dtype1, device='mps'), binop)
-                           (torch.full(full_shape, val2, dtype=dtype2, device='mps')),
-                    getattr(torch.tensor(val1, dtype=dtype1, device='cpu'), binop)
-                           (torch.full(full_shape, val2, dtype=dtype2, device='cpu')))
-
-    def test_nansum(self):
-        def helper(dtype, noncontiguous, dim):
-            zero_cpu = torch.zeros((), dtype=dtype)
-
-            # Randomly scale the values
-            scale = random.randint(10, 100)
-            x_cpu: torch.Tensor = make_tensor(
-                (5, 5), dtype=dtype, device='cpu',
-                low=-scale, high=scale, noncontiguous=noncontiguous)
-
-            if dtype.is_floating_point:
-                nan_mask_cpu = x_cpu < (0.2 * scale)
-                x_no_nan_cpu = torch.where(nan_mask_cpu, zero_cpu, x_cpu)
-                x_cpu[nan_mask_cpu] = np.nan
-            else:
-                x_no_nan_cpu = x_cpu
-
-            x_mps = x_cpu.to('mps')
-            actual_out_mps = torch.empty(0, dtype=dtype, device='mps')
-            expect_out_cpu = torch.empty(0, dtype=dtype)
-            dim_kwargs = {"dim": dim} if dim is not None else {}
-            expect = torch.sum(x_no_nan_cpu, **dim_kwargs)
-
-            actual_cpu = torch.nansum(x_cpu, **dim_kwargs)
-            # Sanity check on CPU
-            self.assertEqual(expect, actual_cpu)
-
-            # Test MPS
-            actual_mps = torch.nansum(x_mps, **dim_kwargs)
-            # Test out= variant
-            torch.nansum(x_mps, out=actual_out_mps, **dim_kwargs)
-            torch.nansum(x_cpu, out=expect_out_cpu, **dim_kwargs)
-            self.assertEqual(expect, actual_mps)
-            self.assertEqual(expect_out_cpu, actual_out_mps)
-
-        args = itertools.product(
-            (torch.float16, torch.float32, torch.int32, torch.int64),   # dtype
-            (True, False),                                              # noncontiguous
-            (0, 1, None),                                               # dim
-        )
-
-        for dtype, noncontiguous, dim in args:
-            with self.subTest(dtype=dtype, noncontiguous=noncontiguous, dim=dim):
-                helper(dtype, noncontiguous, dim)
-
     def test_cumsum_all_dtypes(self):
         def helper(dtype):
             t = torch.tensor([1, 1, 1, 1], device="mps", dtype=dtype)
@@ -2582,22 +2391,32 @@ def helper(dtype):
             e_string = str(e)
             self.assertEqual(e_string, "MPS does not support cumsum op with int64 input")
 
-    def test_cumsum_minus_one_axis(self):
-        def helper(dtype):
-            # Test with axis -1
-            cpu_x = None
-            if(dtype == torch.float32):
-                cpu_x = torch.randn(10, 3, device='cpu', dtype=torch.float32)
-            else:
-                cpu_x = torch.randint(0, 20, (10, 3), device='cpu', dtype=torch.float32)
+    def test_gelu_tanh(self):
+        def helper(shape):
+            cpu_x = torch.randn(shape, device='cpu', dtype=torch.float)
             x = cpu_x.detach().clone().to('mps')
 
-            cpu_y = cpu_x.cumsum(-1)
-            y = x.cumsum(-1)
+            gelu_tanh_result = torch.nn.functional.gelu(x, approximate='tanh')
+            gelu_tanh_result_cpu = torch.nn.functional.gelu(cpu_x, approximate='tanh')
+            self.assertEqual(gelu_tanh_result, gelu_tanh_result_cpu)
 
-            self.assertEqual(y, cpu_y)
+        helper((2, 8, 4, 5))
 
-        [helper(dtype) for dtype in [torch.float32, torch.int16, torch.int32, torch.uint8]]
+    # # Failures due to precision issues, enable after resolving from mps
+    # def test_div_floor_int(self):
+    #     def helper(shape, dtype):
+    #         cpu_x = torch.randint(-9999, -1,shape, device='cpu', dtype=dtype)
+    #         x = cpu_x.detach().clone().to('mps')
+
+    #         cpu_y = torch.randint(1, 9999, shape, device='cpu', dtype=dtype)
+    #         y = cpu_y.detach().clone().to('mps')
+
+    #         div_result = torch.div(x, y,rounding_mode='floor')
+    #         div_result_cpu = torch.div(cpu_x, cpu_y, rounding_mode='floor')
+    #         self.assertEqual(div_result, div_result_cpu)
+
+    #     helper((2, 8, 4, 5), torch.int16)
+    #     helper((2, 8, 4, 5), torch.int32)
 
     def test_median_int16(self):
         def helper(shape, dtype):
@@ -2610,6 +2429,23 @@ def helper(shape, dtype):
 
         helper((2, 8, 4, 5), torch.int16)
 
+    def test_cumsum_minus_one_axis(self):
+        def helper(dtype):
+            # Test with axis -1
+            cpu_x = None
+            if dtype == torch.float32:
+                cpu_x = torch.randn(10, 3, device='cpu', dtype=torch.float32)
+            else:
+                cpu_x = torch.randint(0, 20, (10, 3), device='cpu', dtype=torch.float32)
+            x = cpu_x.detach().clone().to('mps')
+
+            cpu_y = cpu_x.cumsum(-1)
+            y = x.cumsum(-1)
+
+            self.assertEqual(y, cpu_y)
+
+        [helper(dtype) for dtype in [torch.float32, torch.int16, torch.int32, torch.uint8]]
+
 class TestLogical(TestCaseMPS):
     def _wrap_tensor(self, x, device="cpu", dtype=None, requires_grad=False):
         return torch.tensor(x, device=device, dtype=dtype, requires_grad=requires_grad)
@@ -2762,6 +2598,20 @@ def test_smooth_l1_loss_reduction_mean_sum_backward(self):
 
 
 class TestNLLLoss(TestCaseMPS):
+    def test_nll2d_loss_backward(self, device='mps'):
+        a = torch.randn(3, 5, requires_grad=True, device=device)
+        b = torch.tensor([1, 0, 4], device=device)
+        loss = nn.NLLLoss()
+        out = loss(a, b)
+        self.assertIsNone(out.grad_fn._saved_weight)
+        loss = nn.NLLLoss(weight=torch.ones((5,), device=device))
+        out = loss(a, b)
+        self.assertEqual(out.grad_fn._saved_weight, torch.ones((5,)))
+
+        out.sum().backward()
+        with self.assertRaisesRegex(RuntimeError, "after they have already been freed"):
+            out.grad_fn._saved_weight
+
     def test_nll_loss_mismatched_batch(self, device='mps'):
         x = torch.randn((10, 3), requires_grad=True, device=device)
         # t should have size (10,)
@@ -2823,13 +2673,13 @@ def _nll_loss_helper(self, input_size, reduction, expected):
         input = torch.rand(input_size, requires_grad=True, device='cpu')
         num_channels = input_size[1]
         target_size = (input_size[0], ) + tuple(input_size[2:])
-        target = torch.randint(num_channels, target_size, device='cpu')
         weights = torch.randn(num_channels)
+        weights_mps = weights.to("mps")
+        target = torch.randint(num_channels, target_size, device='cpu')
 
         # MPS
         input_mps = input.detach().clone().to('mps').requires_grad_()
         target_mps = target.detach().clone().to('mps')
-        weights_mps = weights.to("mps")
 
         output_cpu = F.nll_loss(input, target, weight=weights, reduction=reduction)
         output_mps = F.nll_loss(input_mps, target_mps, weight=weights_mps, reduction=reduction)
@@ -3366,6 +3216,7 @@ def test_eq(self):
 
         self.assertEqual(result_cpu, result_mps.to('cpu'))
 
+    @unittest.skipIf(product_version < 13.0, "Skipped on macOS 12")
     def test_signed_vs_unsigned_comparison(self):
         cpu_x = torch.tensor((-1, 2, 3), device='cpu', dtype=torch.uint8)
         mps_x = torch.tensor((-1, 2, 3), device='mps', dtype=torch.uint8)
@@ -4519,30 +4370,10 @@ def helper(shape):
         helper((5, 9, 7, 4))
         helper((50, 20, 7, 4))
 
-    def test_sort(self):
-        for SIZE in (4, 2049):
-            device = 'mps'
-            x = torch.rand(4, SIZE, device=device)
-            res1val, res1ind = torch.sort(x)
-
-            res2val = torch.tensor((), device=device)
-            res2ind = torch.tensor((), device=device, dtype=torch.long)
-            torch.sort(x, out=(res2val, res2ind))
-            self.assertEqual(res1val, res2val, atol=0, rtol=0)
-            self.assertEqual(res1ind, res2ind, atol=0, rtol=0)
-            self.assertEqual(torch.argsort(x), res1ind)
-            self.assertEqual(x.argsort(), res1ind)
-
-            self.assertEqual(
-                torch.sort(torch.tensor((50, 40, 30, 20, 10), device=device))[0],
-                torch.tensor((10, 20, 30, 40, 50), device=device),
-                atol=0, rtol=0
-            )
-
     def test_upsample_nearest2d(self):
-        def helper(N, C, H, W):
+        def helper(N, C, H, W, memory_format):
             inputCPU = torch.arange(N * C * H * W, device='cpu', dtype=torch.float,
-                                    requires_grad=True).reshape(N, C, H, W)
+                                    requires_grad=True).reshape(N, C, H, W).to(memory_format=memory_format)
             inputCPU.retain_grad()
             inputMPS = inputCPU.detach().to('mps').requires_grad_()
 
@@ -4568,8 +4399,9 @@ def helper(N, C, H, W):
 
                     self.assertEqual(inputCPU.grad, inputMPS.grad)
 
-        helper(1, 1, 4, 4)
-        helper(7, 5, 3, 2)
+        for memory_format in [torch.channels_last, torch.contiguous_format]:
+            helper(1, 1, 4, 4, memory_format=memory_format)
+            helper(7, 5, 3, 2, memory_format=memory_format)
 
     def test_upsample_bilinear2d(self):
         def helper(N, C, H, W):
@@ -4604,6 +4436,7 @@ def helper(N, C, H, W):
         helper(1, 1, 4, 4)
         helper(7, 5, 3, 2)
 
+    @unittest.skipIf(product_version < 13.0, "Skipped on macOS 12")
     def test_interpolate(self):
         def helper(shape, output_size, scales, mode, align_corners=False):
             inputCPU = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True)
@@ -4753,6 +4586,8 @@ def helper(shape, padding, op, value=0):
         helper((2, 1, 6, 8), 2, nn.ReplicationPad2d)
         # verify if a change in shape of padding would cause problems with graph caching
         helper((2, 1, 6, 8), (2, 4, 3, 5), nn.ReplicationPad2d)
+        # negative padding
+        helper((1, 3, 4, 4), (-1, 1, -2, 1), nn.ReplicationPad2d)
         # Constant Pad 2D
         helper((2, 1, 6, 8), (2, 4, 3, 5), nn.ConstantPad2d)
         # input size < pad size
@@ -4772,10 +4607,10 @@ def helper(shape, padding, op, value=0):
         helper((2, 4, 6, 8, 4), (1, 3, 3, 5, 3, 4), nn.ReplicationPad3d)
         # Constant Pad 3D
         helper((2, 4, 6, 8, 4), (1, 3, 3, 5, 3, 4), nn.ConstantPad3d)
-        # input size < pad size
-        helper((2, 4, 6), (1, 3, 3, 5, 3, 4), nn.ConstantPad3d)
         # check the workaround for the right padding bug in Monterey
         helper((1, 2, 2, 2, 2), (0, 1), nn.ConstantPad3d)
+        # input size < pad size
+        helper((2, 4, 6), (1, 3, 3, 5, 3, 4), nn.ConstantPad3d)
 
     # Test stack forward
     def test_stack(self):
@@ -5288,17 +5123,6 @@ def _gelu_ref(X):
         finally:
             torch.set_num_threads(num_threads)
 
-    def test_gelu_tanh(self):
-        def helper(shape):
-            cpu_x = torch.randn(shape, device='cpu', dtype=torch.float)
-            x = cpu_x.detach().clone().to('mps')
-
-            gelu_tanh_result = torch.nn.functional.gelu(x, approximate='tanh')
-            gelu_tanh_result_cpu = torch.nn.functional.gelu(cpu_x, approximate='tanh')
-            self.assertEqual(gelu_tanh_result, gelu_tanh_result_cpu)
-
-        helper((2, 8, 4, 5))
-
     # Test hardtanh
     def test_hardtanh(self):
         def helper(shape, min_val, max_val, inplace=False):
@@ -5475,14 +5299,14 @@ def helper(shape):
 
     # Test index add
     def test_index_add(self):
-        def helper(shape, dim, index, source_shape, alpha, x_dtype=torch.float32, idx_dtype=torch.int32):
-            cpu_x = torch.randn(shape, device='cpu', dtype=x_dtype, requires_grad=False)
+        def helper(shape, dim, index, source_shape, alpha, idx_dtype=torch.int32):
+            cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False)
             x = cpu_x.detach().clone().to('mps')
 
             cpu_idx = torch.tensor(index, device='cpu', dtype=idx_dtype)
             idx = cpu_idx.detach().clone().to('mps')
 
-            cpu_source = torch.randn(source_shape, device='cpu', dtype=x_dtype, requires_grad=False)
+            cpu_source = torch.randn(source_shape, device='cpu', dtype=torch.float, requires_grad=False)
             source = cpu_source.detach().clone().to('mps')
 
             idx_result = torch.index_add(x, dim=dim, index=idx, source=source, alpha=alpha)
@@ -5498,8 +5322,6 @@ def helper(shape, dim, index, source_shape, alpha, x_dtype=torch.float32, idx_dt
         # test result dim=1
         helper((2,), 0, [1], (1,), 6.0)
         helper(2, 0, 1, 1, 6)
-        # test float16
-        helper((2,), 0, [1], (1,), 6.0, x_dtype=torch.float16)
 
     # Test flip
     def test_flip(self):
@@ -5543,23 +5365,6 @@ def helper(shape, dim, index, idx_dtype=torch.int32):
         helper((2, 8, 4, 5), 2, [3, 0, 1])
         helper((2, 8, 4, 5), 3, [2, 3, 0])
         helper((2, 3, 3), -1, [1, 2])
-        helper((), 0, [0])
-        helper((5), 0, [])
-
-    def test_index_select_scalar(self):
-        def helper(value, dim, index, idx_dtype=torch.int32):
-            cpu_x = torch.tensor(value, device='cpu', dtype=torch.float, requires_grad=False)
-            x = cpu_x.detach().clone().to('mps')
-
-            cpu_idx = torch.tensor(index, device='cpu', dtype=idx_dtype)
-            idx = cpu_idx.detach().clone().to('mps')
-
-            idx_result = torch.index_select(x, dim=dim, index=idx)
-            idx_result_cpu = torch.index_select(cpu_x, dim=dim, index=cpu_idx)
-
-            self.assertEqual(idx_result, idx_result_cpu)
-
-        helper(22, 0, [])
 
     def test_embedding_dense_backward(self):
         def helper(n, d, m, idx):
@@ -5938,13 +5743,6 @@ def test_arange_empty(self):
         y_cpu = torch.arange(0, 0, 1, out=out_cpu)
         self.assertEqual(y_mps, y_cpu)
 
-    # Test rgange
-    def test_range(self):
-        self.assertEqual(np.arange(11, dtype=np.float32), torch.range(0, 10, device='mps'))
-        self.assertEqual(np.arange(7, 0, -1, dtype=np.float32), torch.range(7, 1, -1, device='mps'))
-        self.assertEqual(np.array([1.0000, 1.3000, 1.6000, 1.9000], dtype=np.float32), torch.range(1, 2, .3, device='mps'))
-        self.assertEqual(np.arange(6.3, dtype=np.float32), torch.arange(0, 6.3, device='mps'))
-
     # Test softmax
     def test_softmax(self):
         def helper(shape, dim, channels_last=False):
@@ -6183,25 +5981,24 @@ def test_device_synchronize(self):
         torch.mps.synchronize()
 
     def test_mps_allocator_module(self):
-        # first garbage collect and empty the cached blocks
+        # limit memory allocations up to 1.5x of recommended maximum size from Metal API
+        torch.mps.set_per_process_memory_fraction(1.5)
+
+        # just running some ops to allocate buffers
+        net1 = torch.nn.ConvTranspose2d(128, 64, kernel_size=3, stride=2, padding=1, output_padding=1)\
+            .to(device='mps', dtype=torch.float)
+
+        x = torch.rand(1, 128, 6, 6, device='mps', dtype=torch.float, requires_grad=True)
+        x = net1(x)
+        print(f"current_allocated: {torch.mps.current_allocated_memory() / 1024} KB, "
+              f"driver_allocated: {torch.mps.driver_allocated_memory() / 1024} KB\n")
         gc.collect()
+        # running this test alone will not release any buffers as they are in use.
+        # however, running along with other tests should release the cached allocations.
         torch.mps.empty_cache()
-        # measure memory allocations from MPSAllocator
-        current_alloc_before = torch.mps.current_allocated_memory()
-        # after garbage collection and emptying the cache the
-        # current_allocated_memory must be zero
-        self.assertTrue(current_alloc_before == 0)
-        # measure total memory allocations from Metal driver
-        driver_alloc_before = torch.mps.driver_allocated_memory()
-        # allocate a new 8 MB tensor to force allocation of a new Metal Heap
-        x = torch.ones(1024 * 1024 * 8, device="mps")
-        # get memory allocations after allocating tensor x
-        current_alloc_after = torch.mps.current_allocated_memory()
-        driver_alloc_after = torch.mps.driver_allocated_memory()
-        # current and driver memory allocations must have
-        # grown at this point
-        self.assertTrue(current_alloc_after > current_alloc_before)
-        self.assertTrue(driver_alloc_after > driver_alloc_before)
+        x.backward(torch.randn_like(x))
+        print(f"current_allocated: {torch.mps.current_allocated_memory() / 1024} KB, "
+              f"driver_allocated: {torch.mps.driver_allocated_memory() / 1024} KB\n")
 
     # Test random_.to and random_.from
     def test_random(self):
@@ -6369,65 +6166,18 @@ def helper(probs, compare_mean, compare_var, num_samples=5, replacement=True):
         helper(np.array([1, 1, 1, 1, 1]), (0 + 1 + 2 + 3 + 4) / 5, (6 - 2 * 2), 10000)
         helper(np.array([[1, 1, 1, 1, 1, 1, 1]]), 0, 0, 7, False)
 
-    def test_cumsum_dim_check(self):
-        x = torch.rand((3, 3), device="mps")
-        self.assertEqual(x.cumsum(1), x.cumsum(-1))
-        self.assertEqual(x.cumsum(0), x.cumsum(-2))
-        self.assertRaises(IndexError, lambda: x.cumsum(2))
-        self.assertRaises(IndexError, lambda: x.cumsum(-3))
-
-
-class TestTopK(TestCase):
-    def _test_topk(self, shape, largest):
-        cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False)
-        x = cpu_x.detach().clone().to('mps')
-        if isinstance(shape, tuple):
-            for curr_dim, dim_size in enumerate(shape):
-                for k in range(1, dim_size + 1):
-                    topk_values, topk_indices = torch.topk(x, k, dim=curr_dim, largest=largest)
-                    topk_values_cpu, topk_indices_cpu = torch.topk(cpu_x, k, dim=curr_dim, largest=largest)
-                    self.assertEqual(topk_values, topk_values_cpu)
-                    self.assertEqual(topk_indices, topk_indices_cpu)
-        else:
-            for k in range(1, shape):
-                topk_values, topk_indices = torch.topk(x, k, dim=0, largest=largest)
-                topk_values_cpu, topk_indices_cpu = torch.topk(cpu_x, k, dim=0, largest=largest)
-                self.assertEqual(topk_values, topk_values_cpu)
-                self.assertEqual(topk_indices, topk_indices_cpu)
-
-    def test_topk(self):
-        largest_vals = [True, False]
-        shapes = [
-            # Zero Element Tensors
-            0,
-            (1, 0),
-            (0, 1),
-            (1, 0, 1),
-            # Multiple Element Tensors
-            1,
-            2,
-            (5, 1),
-            (1, 5),
-            (5, 9, 7, 4),
-        ]
-
-        for shape in shapes:
-            for largest_val in largest_vals:
-                with self.subTest(shape=shape, largest_val=largest_val):
-                    self._test_topk(shape, largest_val)
-
 class TestNNMPS(NNTestCase):
 
     def _create_basic_net(self):
         class Layer(nn.Module):
             def __init__(self):
-                super().__init__()
+                super(Layer, self).__init__()
                 self.layer_dummy_param = Parameter(torch.empty(3, 5))
                 self.register_buffer('layer_dummy_buf', torch.zeros(1, 3, 3, 7))
 
         class Net(nn.Module):
             def __init__(self):
-                super().__init__()
+                super(Net, self).__init__()
                 self.l1 = Layer()
                 self.dummy_param = Parameter(torch.empty(3, 5))
                 self.register_buffer('dummy_buf', torch.zeros(7, 3, 3, 1))
@@ -6526,9 +6276,7 @@ def test_zero_grad(self):
         self.assertIsNotNone(module.bias.grad)
         self.assertGreater(module.weight.grad.data.abs().sum(), 0)
         self.assertGreater(module.bias.grad.data.abs().sum(), 0)
-
-        # Force set to zeros.
-        module.zero_grad(set_to_none=False)
+        module.zero_grad(set_to_none=False)   # Force set to zeros.
         self.assertEqual(module.weight.grad.data, module.weight.data.clone().zero_())
         self.assertEqual(module.bias.grad.data, module.bias.data.clone().zero_())
 
@@ -6536,7 +6284,6 @@ def test_zero_grad(self):
         self.assertIsNone(module.weight.grad)
         self.assertIsNone(module.bias.grad)
 
-
     def test_no_grad(self):
         for dtype in [torch.bfloat16, torch.float, torch.double]:
             module = nn.Conv2d(2, 5, kernel_size=3, padding=1).to(dtype)
@@ -6650,33 +6397,6 @@ def attention2(key, *, workaround=False, device):
         r2_cpu = r2.to("cpu")
         self.assertEqual(r1, r2_cpu)
 
-    def test_group_norm_backward(self, device='mps'):
-        # See https://github.com/pytorch/pytorch/issues/88331 for more detail
-        shape = [1, 4, 16, 16]
-        x = torch.full(shape, 7.0, device=device)
-
-        target = torch.ones((1, 3, 128, 128), device=device)
-
-        conv_in = nn.Conv2d(4, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), device=device)
-        conv_out = nn.Conv2d(128, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), device=device)
-        norm = nn.GroupNorm(32, 128, eps=1e-6, affine=True, device=device)
-
-        with torch.enable_grad():
-            x = x.detach().requires_grad_()
-            out = 5.5 * x
-            out = conv_in(out)
-            out = out + norm(out)
-            out = out + norm(out)
-            out = out + norm(out)
-            out = F.interpolate(out, scale_factor=8.0, mode="nearest")
-            out = norm(out)
-            out = conv_out(out)
-
-            loss = (out - target).norm(dim=-1).sum()
-            grad = -torch.autograd.grad(loss, x)[0]
-            self.assertFalse(grad.detach().isnan().any().item(), 'NaN gradients returned by autograd')
-
-
     # def test_conv2d_same_padding(self, device='mps'):
         # x = torch.rand(1, 1, 10, 11, device=device)
         # y = torch.rand(1, 1, 4, 5, device=device)
@@ -7491,10 +7211,12 @@ def test_T(self, device="mps"):
         self.assertEqual(t2, t1)
         b = torch.randn(10, device=device)
         self.assertEqual(b, b.T)
+        scalar = torch.tensor(5, device=device)
+        self.assertEqual(scalar, scalar.T)
 
     def test_transposes(self, device="mps", dtype=torch.float32):
         for op in ("T", "H", "mT", "mH", "adjoint"):
-            shapes = ((2, 3), (2, 3, 4)) if op[0] == "m" or op == "adjoint" else ((2, 3),)
+            shapes = ((), (2, 3), (2, 3, 4)) if op[0] == "m" or op == "adjoint" else ((), (2, 3),)
             for shape in shapes:
                 a = make_tensor(shape, device=device, dtype=dtype)
                 t1 = getattr(a, op)
@@ -7711,7 +7433,8 @@ def test_conv_transpose_1d_nn_functional(self):
     def test_conv_backward_1d_channels_last(self):
         def helper(shape, in_channels=1, out_channels=1, kernel_size=3, groups=1):
             # https://github.com/pytorch/pytorch/issues/84511
-            conv_cpu = torch.nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, groups=groups)
+            conv_cpu = torch.nn.Conv1d(
+                in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, groups=groups).requires_grad_()
             conv_mps = torch.nn.Conv1d(
                 in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, groups=groups).to("mps")
             conv_mps.weight.data = conv_cpu.weight.data.detach().clone().to("mps").requires_grad_(True)
@@ -7751,15 +7474,89 @@ def test_conv1d_contiguous(self):
 
     def test_conv2d_all_strides_paddings(self):
         # https://github.com/pytorch/pytorch/issues/83180
-        y_cpu = torch.randn(2, 2, 3, 6)
-        y_gpu = y_cpu.to(device='mps')
-        for strideX in range(1, 4):
-            for strideY in range(1, 4):
-                conv_cpu = torch.nn.Conv2d(in_channels=2, out_channels=2, kernel_size=3, stride=(strideX, strideY))
-                conv_gpu = copy.deepcopy(conv_cpu).to(device='mps')
-                x_cpu = conv_cpu(y_cpu)
-                x_gpu = conv_gpu(y_gpu)
-                self.assertEqual(x_cpu, x_gpu.cpu(), rtol=1e-03, atol=1e-05)
+        def helper(N, C, H, W, groups, input_mem_format, weight_mem_format, permute_data):
+            x_cpu = torch.randn(N, C, H, W).to(memory_format=input_mem_format).requires_grad_()
+            x_mps = x_cpu.detach().clone().to(device='mps').requires_grad_()
+
+            if permute_data:
+                x_cpu.permute(0, 2, 3, 1)
+                x_mps.permute(0, 2, 3, 1)
+
+            for strideX in range(1, 4):
+                for strideY in range(1, 4):
+                    conv_cpu = torch.nn.Conv2d(
+                        in_channels=N, out_channels=C, kernel_size=H, groups=groups, stride=(strideX, strideY)).requires_grad_()
+                    conv_cpu.weight.data = conv_cpu.weight.to(memory_format=weight_mem_format).requires_grad_()
+
+                    conv_mps = torch.nn.Conv2d(
+                        in_channels=N, out_channels=C, kernel_size=H, groups=groups, stride=(strideX, strideY), device="mps")
+                    conv_mps.weight.data = conv_cpu.weight.data.detach().clone().to("mps").requires_grad_()
+                    conv_mps.bias.data = conv_cpu.bias.data.detach().clone().to("mps").requires_grad_()
+
+                    res_cpu = conv_cpu(x_cpu)
+                    res_mps = conv_mps(x_mps)
+                    self.assertEqual(res_cpu, res_mps.cpu(), rtol=1e-03, atol=1e-05)
+
+                    res_cpu = res_cpu.sum().backward()
+                    res_mps = res_mps.sum().backward()
+                    self.assertEqual(res_cpu, res_mps, rtol=2.6e-05, atol=2e-04)
+                    self.assertEqual(conv_cpu.weight.grad, conv_mps.weight.grad, rtol=2.6e-05, atol=2e-04)
+                    self.assertEqual(conv_cpu.bias.grad, conv_mps.bias.grad)
+                    self.assertEqual(x_cpu.grad, x_mps.grad)
+
+        for mem_format_input in [torch.contiguous_format, torch.channels_last]:
+            for mem_format_weight in [torch.contiguous_format, torch.channels_last]:
+                for permute_data in [True, False]:
+                    helper(2, 2, 3, 6, 1, mem_format_input, mem_format_weight, permute_data)
+                    helper(10, 10, 4, 6, 2, mem_format_input, mem_format_weight, permute_data)
+                    helper(32, 32, 4, 6, 2, mem_format_input, mem_format_weight, permute_data)
+
+    def test_conv_transpose_2d_strided(self):
+        def helper(m_cpu, memory_format):
+            m_mps = copy.deepcopy(m_cpu).requires_grad_()
+            m_mps.weight.data = m_cpu.weight.data.detach().clone().to("mps").requires_grad_()
+            m_mps.bias.data = m_cpu.bias.data.detach().clone().to("mps").requires_grad_()
+
+            input_cpu = torch.randn(20, 16, 50, 100).to(memory_format=memory_format).requires_grad_()
+            input_mps = input_cpu.detach().clone().to("mps")
+
+            output_cpu = m_cpu(input_cpu)
+            output_mps = m_mps(input_mps)
+            self.assertEqual(output_cpu, output_mps)
+
+        for mem_format_input in [torch.contiguous_format, torch.channels_last]:
+            # With square kernels and equal stride
+            helper(nn.ConvTranspose2d(16, 33, 3, stride=2).requires_grad_(), mem_format_input)
+
+            # non-square kernels and unequal stride and with padding
+            helper(nn.ConvTranspose2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2)).requires_grad_(), mem_format_input)
+
+    def test_conv_transpose_2d_specified_output(self):
+        input_cpu = torch.randn(1, 16, 12, 12)
+        input_mps = input_cpu.detach().clone().to("mps")
+
+        downsample_cpu = nn.Conv2d(16, 16, 3, stride=2, padding=1)
+        downsample_mps = nn.Conv2d(16, 16, 3, stride=2, padding=1, device="mps")
+        downsample_mps.weight.data = downsample_cpu.weight.data.detach().clone().to("mps").requires_grad_()
+        downsample_mps.bias.data = downsample_cpu.bias.data.detach().clone().to("mps").requires_grad_()
+
+        upsample_cpu = nn.ConvTranspose2d(16, 16, 3, stride=2, padding=1)
+        upsample_mps = nn.ConvTranspose2d(16, 16, 3, stride=2, padding=1, device="mps")
+        upsample_mps.weight.data = upsample_cpu.weight.data.detach().clone().to("mps").requires_grad_()
+        upsample_mps.bias.data = upsample_cpu.bias.data.detach().clone().to("mps").requires_grad_()
+
+        h_cpu = downsample_cpu(input_cpu)
+        h_mps = downsample_mps(input_mps)
+        self.assertEqual(h_cpu, h_mps)
+
+        size_cpu = h_cpu.size()
+        size_mps = h_mps.size()
+        self.assertEqual(size_cpu, size_mps)
+
+        output_cpu = upsample_cpu(h_cpu, output_size=input_cpu.size())
+        output_mps = upsample_mps(h_mps, output_size=input_mps.size())
+        self.assertEqual(output_cpu, output_mps)
+        self.assertEqual(output_cpu.size(), output_mps.size())
 
     def test_conv2d_single_stride(self):
         y_cpu = torch.randn(2, 2, 3, 6)
@@ -8351,6 +8148,7 @@ def test_bool_indices(self, device="mps"):
             self.assertEqual(v[boolIndices], torch.tensor([True], dtype=torch.bool, device=device))
             self.assertEqual(len(w), 2)
 
+    @unittest.skipIf(product_version < 13.0, "Skipped on macOS 12")
     def test_bool_indices_accumulate(self, device="mps"):
         mask = torch.zeros(size=(10, ), dtype=torch.uint8, device=device)
         mask = mask > 0
@@ -8541,6 +8339,7 @@ def helper(device, dtype):
             self.assertEqual(res.shape, src.shape)
         [helper(device="mps", dtype=dtype) for dtype in [torch.float, torch.int32]]
 
+    @unittest.skipIf(product_version < 13.0, "Skipped on macOS 12")
     def test_index_src_datatype(self):
         def helper(device, dtype):
             orig_dtype = dtype
@@ -8874,6 +8673,63 @@ def get_results(device):
         self.assertEqual(cpu_input_grad, mps_input_grad)
         self.assertEqual(cpu_weight_grad, mps_weight_grad)
 
+    def test_RNN_cell_no_broadcasting(self):
+        def test(cell_module, input, hx, input_size, hidden_size):
+            cell = cell_module(input_size, hidden_size, device='mps')
+            self.assertRaises(RuntimeError, lambda: cell(input, hx))
+
+        def test_all(hidden_size, bad_hx, good_hx, input_size, input):
+            test(nn.RNNCell, input, bad_hx, input_size, hidden_size)
+            test(nn.GRUCell, input, bad_hx, input_size, hidden_size)
+            test(nn.LSTMCell, input, (bad_hx, good_hx), input_size, hidden_size)
+            test(nn.LSTMCell, input, (good_hx, bad_hx), input_size, hidden_size)
+
+        hidden_size = 20
+        input_size = 10
+        input = torch.randn(3, input_size, device='mps')
+        bad_hx = torch.randn(1, hidden_size, device='mps')
+        good_hx = torch.randn(3, hidden_size, device='mps')
+
+        # Test hidden/input batch size broadcasting
+        test_all(hidden_size, bad_hx, good_hx, input_size, input)
+
+        # Test hx's hidden_size vs module's hidden_size broadcasting
+        bad_hx = torch.randn(3, 1)
+        test_all(hidden_size, bad_hx, good_hx, input_size, input)
+
+        # Test input's input_size vs module's input_size broadcasting
+        bad_input = torch.randn(3, 1)
+        test_all(hidden_size, good_hx, good_hx, input_size, bad_input)
+
+    def test_LSTM_cell(self):
+        # this is just a smoke test; these modules are implemented through
+        # autograd so no Jacobian test is needed
+        for bias in (True, False):
+            input = torch.randn(3, 10, device='mps')
+            hx = torch.randn(3, 20, device='mps')
+            cx = torch.randn(3, 20, device='mps')
+            lstm = nn.LSTMCell(10, 20, bias=bias, device='mps')
+            for _ in range(6):
+                hx, cx = lstm(input, (hx, cx))
+
+            (hx + cx).sum().backward()
+
+    def test_LSTM_cell_forward_input_size(self):
+        input = torch.randn(3, 11, device='mps')
+        hx = torch.randn(3, 20, device='mps')
+        cx = torch.randn(3, 20, device='mps')
+        lstm = nn.LSTMCell(10, 20, device='mps')
+        self.assertRaises(Exception, lambda: lstm(input, (hx, cx)))
+
+    def test_LSTM_cell_forward_hidden_size(self):
+        input = torch.randn(3, 10, device='mps')
+        hx = torch.randn(3, 21, device='mps')
+        cx = torch.randn(3, 20, device='mps')
+        lstm = nn.LSTMCell(10, 20, device='mps')
+        self.assertRaises(Exception, lambda: lstm(input, (hx, cx)))
+        self.assertRaises(Exception, lambda: lstm(input, (cx, hx)))
+
+
 class TestFallbackWarning(TestCase):
     # TODO: Remove once test_testing.py is running on MPS devices
     def test_no_warning_on_import(self):
@@ -9019,76 +8875,137 @@ def test_serialization_map_location(self):
 
 
 MPS_DTYPES = get_all_dtypes()
-for t in [torch.double, torch.cdouble, torch.cfloat, torch.int8, torch.bfloat16]:
+for t in [torch.double, torch.cdouble, torch.cfloat, torch.bfloat16]:
     del MPS_DTYPES[MPS_DTYPES.index(t)]
 
+abbrs_to_torch_dtype_dict = {value : key for (key, value) in dtype_abbrs.items()}
+class UnitTestSample:
+    def __init__(self, dtype, args, params, out):
+        requires_grad = (dtype.is_floating_point or dtype.is_complex)
+        self.args_ = [t.detach().to('mps').requires_grad_(requires_grad) for t in args]
+        self.params_ = params
+        self.out_ = out
+
+    def sample(self):
+        return self.args_ + self.params_
+
+    def expected(self):
+        return tuple(self.out_)
+
+CUDA_RESULT = dict()
+OP_UNIT_TEST = dict()
+dirname = os.path.dirname(__file__)
+filename = os.path.join(dirname, "cuda_results.yaml")
+with open(filename) as f:
+    data = yaml.safe_load(f)
+    for key, value in data['ConsistencyTest'].items():
+        CUDA_RESULT[key] = torch.as_tensor(value)
+    for key, samples in data['UnitTest'].items():
+        unit_tests = []
+        for sample in samples:
+            dtype = abbrs_to_torch_dtype_dict[sample['dtype']]
+            args = [torch.as_tensor(arg).to(dtype) for arg in sample['args']]
+            params = sample['params']
+            out = [torch.as_tensor(res).to(dtype) for res in sample['res']]
+            unit_tests.append(UnitTestSample(dtype, args, params, out))
+        OP_UNIT_TEST[key] = unit_tests
 
 class TestConsistency(TestCaseMPS):
+
     # TODO: This is only used while some ops are being added.
     # This list should contain all ops and dtypes eventually
     # This can be generated automatically in the `new_mps_allowlist.txt` file
     # by doing `EXPECTTEST_ACCEPT=1 python test_mps.py TestConsistencyCPU`
     # You most likely do NOT want to modify this manually
     ALLOWLIST_OP = {
+        'H': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'T': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         '__getitem__': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         '__radd__': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         '__rand__': ['b8', 'i16', 'i32', 'i64', 'u8'],
-        '__rdiv__': ['f16', 'f32', 'i16', 'i32', 'u8'],
-        '__rmatmul__': ['f32'],
+        '__rdiv__': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        '__rmatmul__': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        '__rmod__': ['f16', 'f32'],
         '__rmul__': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         '__ror__': ['b8', 'i16', 'i32', 'i64', 'u8'],
-        '__rpow__': ['f16'],
+        '__rpow__': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        '__rsub__': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         '__rxor__': ['b8', 'i16', 'i32', 'i64', 'u8'],
-        'masked.argmax': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'masked.argmin': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'masked.log_softmax': ['f32'],
-        'masked.logaddexp': ['f32'],
-        'masked.logsumexp': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'masked.norm': ['f16', 'f32'],
-        'masked.normalize': ['f16', 'f32'],
-        'masked.softmax': ['f32'],
-        'masked.softmin': ['f32'],
-        'masked.std': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'masked.var': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'abs': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
-        'acos': ['b8', 'f32', 'i16', 'i32', 'u8'],
-        'acosh': ['b8', 'f32', 'i16', 'i32', 'u8'],
+        '_native_batch_norm_legit': ['f32'],
+        '_softmax_backward_data': ['f32'],
+        'abs': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'acos': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'acosh': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'add': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'addbmm': ['f32'],
+        'addbmm': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'addcdiv': ['f32'],
         'addcmul': ['f32', 'i16', 'i32', 'i64', 'u8'],
-        'addmm': ['f32'],
-        'addmv': ['f32'],
-        'addr': ['f32'],
+        'addmm': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'addmv': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'addr': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'all': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'allclose': ['f16', 'f32'],
+        'amax': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'amin': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'aminmax': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'angle': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'any': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'arange': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'argmax': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'argmin': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'amix': ['f32'],
-        'asin': ['b8', 'f32', 'i16', 'i32', 'u8'],
-        'asinh': ['b8', 'f32', 'i16', 'i32', 'u8'],
-        'atan': ['b8', 'f32', 'i16', 'i32', 'u8'],
-        'atan2': ['f32'],
-        'atanh': ['b8', 'f32', 'i16', 'i32', 'u8'],
+        'argsort': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'argwhere': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'as_strided': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'as_strided_scatter': ['b8',
+                               'f16',
+                               'f32',
+                               'i16',
+                               'i32',
+                               'i64',
+                               'u8'],
+        'asin': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'asinh': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'atan': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'atan2': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'atanh': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'atleast_1d': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'atleast_2d': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'atleast_3d': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'baddbmm': ['f32'],
+        'baddbmm': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'bernoulli': ['f32'],
+        'bfloat16': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'bincount': ['i16', 'i32', 'i64', 'u8'],
         'bitwise_and': ['b8', 'i16', 'i32', 'i64', 'u8'],
         'bitwise_left_shift': ['i16', 'i32', 'i64', 'u8'],
         'bitwise_not': ['b8', 'i16', 'i32', 'i64', 'u8'],
         'bitwise_or': ['b8', 'i16', 'i32', 'i64', 'u8'],
         'bitwise_right_shift': ['i16', 'i32', 'i64', 'u8'],
         'bitwise_xor': ['b8', 'i16', 'i32', 'i64', 'u8'],
-        'block_diag': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64'],
-        'bmm': ['f32'],
+        'block_diag': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'bmm': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'bool': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'broadcast_shapes': ['f32'],
+        'broadcast_tensors': ['b8',
+                              'f16',
+                              'f32',
+                              'i16',
+                              'i32',
+                              'i64',
+                              'u8'],
+        'broadcast_to': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'bucketize': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'byte': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'cartesian_prod': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'cat': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'ceil': ['f32', 'int32', 'int64', 'f16'],
+        'cdist': ['f32'],
+        'cdouble': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'ceil': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'cfloat': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'chalf': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'char': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'cholesky': ['f32'],
+        'cholesky_inverse': ['f32'],
+        'cholesky_solve': ['f32'],
         'chunk': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'clamp': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'clamp_max': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
@@ -9096,241 +9013,659 @@ class TestConsistency(TestCaseMPS):
         'clone': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'column_stack': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'combinations': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'complex': ['f16', 'f32'],
         'conj': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'conj_physical': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'constant_pad_nd': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'contiguous': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'corrcoef': ['f32'],
-        'cos': ['b8', 'f32', 'i16', 'i32', 'u8', 'i64'],
-        'cosh': ['b8', 'f32', 'i16', 'i32', 'u8', 'i64'],
-        'cov': ['f32'],
-        'cumsum': ['f16', 'f32', 'int16', 'int32'],
+        'copysign': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'corrcoef': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'cos': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'cosh': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'count_nonzero': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'cov': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'cross': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'cummax': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'cummin': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'cumprod': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'cumsum': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'deg2rad': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'diag': ['f32', 'i32'],
-        'diag_embed': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64'],
-        'diagflat': ['f32', 'i32'],
-        'diagonal_scatter': ['b8', 'u8', 'f16', 'f32', 'i16', 'i32', 'i64'],
+        'diag': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'diag_embed': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'diagflat': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'diagonal': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'diagonal_copy': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'diagonal_scatter': ['b8',
+                             'f16',
+                             'f32',
+                             'i16',
+                             'i32',
+                             'i64',
+                             'u8'],
         'diff': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'dist': ['f32'],
+        'digamma': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'dist': ['f16', 'f32'],
+        'div': ['f16', 'f32', 'u8', 'b8', 'i16', 'i32', 'i64'],
         'dot': ['f32', 'i16', 'i32', 'i64', 'u8'],
-        'einsum': ['f32'],
+        'double': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'dsplit': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'dstack': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'einsum': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'empty': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'empty_like': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'eq': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'equal': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'erf': ['b8', 'f32', 'i16', 'i32', 'u8'],
-        'exp': ['b8', 'f32', 'i16', 'i32', 'u8'],
-        'exp2': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
+        'erf': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'erfc': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'erfinv': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'exp': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'exp2': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'expand': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'expand_as': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'expm1': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'eye': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.fft': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.fft2': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.fftn': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.fftshift': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.hfft': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.hfft2': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.hfftn': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.ifft': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.ifft2': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.ifftn': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.ifftshift': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.ihfft': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.ihfft2': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.ihfftn': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.irfft': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.irfft2': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.irfftn': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.rfft': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.rfft2': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.rfftn': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'fill': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'flatten': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'flip': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'fliplr': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'flipud': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'flip': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fliplr': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'flipud': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'float': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'floor': ['f32', 'f16', 'i16', 'i32', 'i64'],
-        'floor_divide': ['f32', 'f16'],
-        'fmod': ['f32', 'f16', 'i16', 'i32', 'i64', 'u8'],
+        'float_power': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'floor': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'floor_divide': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fmax': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fmin': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fmod': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'frac': ['f16', 'f32'],
+        'frexp': ['f16', 'f32'],
+        'full': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'full_like': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'gather': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'gradient': ['f16', 'f32', 'i16'],
+        'gcd': ['i16', 'i32', 'i64', 'u8'],
         'ge': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'geqrf': ['f32'],
+        'gradient': ['f16', 'f32', 'i16', 'i32', 'i64'],
+        'grid_sampler_2d': ['f32'],
         'gt': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'half': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'heaviside': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'histc': ['f32'],
+        'histogram': ['f32'],
+        'histogramdd': ['f32'],
+        'hsplit': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'hstack': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'hypot': ['f32'],
+        'i0': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'igamma': ['f16', 'f32'],
+        'igammac': ['f16', 'f32'],
+        'index_copy': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'index_fill': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'index_put': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'index_reduce': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'index_select': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'index_add': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'inner': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'int': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'isclose': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'isfinite': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'isin': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'isinf': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'isnan': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'isneginf': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'isposinf': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'isreal': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'kron': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'linalg.matrix_norm': ['f16'],
+        'kthvalue': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'lcm': ['i16', 'i32', 'i64', 'u8'],
+        'ldexp': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'le': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'lerp': ['f32'],
+        'lgamma': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'linalg.cholesky': ['f32'],
+        'linalg.cholesky_ex': ['f32'],
+        'linalg.cond': ['f32'],
+        'linalg.cross': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'linalg.det': ['f32'],
+        'linalg.eig': ['f32'],
+        'linalg.eigh': ['f32'],
+        'linalg.eigvals': ['f32'],
+        'linalg.eigvalsh': ['f32'],
+        'linalg.householder_product': ['f32'],
+        'linalg.inv': ['f32'],
+        'linalg.inv_ex': ['f32'],
+        'linalg.ldl_factor': ['f32'],
+        'linalg.ldl_factor_ex': ['f32'],
+        'linalg.ldl_solve': ['f32'],
+        'linalg.lstsq': ['f32'],
+        'linalg.lu': ['f32'],
+        'linalg.lu_factor': ['f32'],
+        'linalg.lu_factor_ex': ['f32'],
+        'linalg.lu_solve': ['f32'],
+        'linalg.matrix_norm': ['f16', 'f32'],
+        'linalg.matrix_power': ['f32'],
+        'linalg.matrix_rank': ['f32'],
+        'linalg.multi_dot': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'linalg.norm': ['f16', 'f32'],
+        'linalg.pinv': ['f32'],
+        'linalg.qr': ['f32'],
+        'linalg.slogdet': ['f32'],
+        'linalg.solve': ['f32'],
+        'linalg.solve_ex': ['f32'],
+        'linalg.solve_triangular': ['f32'],
         'linalg.svd': ['f32'],
+        'linalg.svdvals': ['f32'],
+        'linalg.tensorinv': ['f32'],
+        'linalg.tensorsolve': ['f32'],
+        'linalg.vander': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'linalg.vecdot': ['f32'],
         'linalg.vector_norm': ['f16', 'f32'],
         'linspace': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'log': ['b8', 'f32', 'i16', 'i32', 'u8'],
-        'log10': ['b8', 'f32', 'i16', 'i32', 'u8'],
-        'log1p': ['b8', 'f32', 'i16', 'i32', 'u8'],
-        'log2': ['b8', 'f32', 'i16', 'i32', 'u8'],
-        'log_softmax': ['f32'],
-        'logaddexp': ['f16', 'f32'],
-        'logaddexp2': ['f16', 'f32'],
+        'log': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'log10': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'log1p': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'log2': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'log_softmax': ['f32', 'b8', 'f16', 'i16', 'i32', 'i64', 'u8'],
+        'logaddexp': ['f32'],
+        'logaddexp2': ['f32'],
+        'logcumsumexp': ['f32'],
+        'logdet': ['f32'],
         'logical_and': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'logical_not': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'logical_or': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'logical_xor': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'logit': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'logspace': ['f32', 'i16', 'i32', 'i64', 'u8'],
-        'logsumexp': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'logsumexp': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'long': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'lt': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'lu': ['f32'],
+        'lu_solve': ['f32'],
+        'lu_unpack': ['f32'],
+        'mH': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'mT': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked.amax': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked.amin': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked.argmax': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked.argmin': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked.cumprod': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked.cumsum': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked.log_softmax': ['f32'],
+        'masked.logaddexp': ['f32'],
+        'masked.logsumexp': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked.mean': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked.median': ['f32'],
+        'masked.norm': ['f16', 'f32'],
+        'masked.normalize': ['f16', 'f32'],
+        'masked.prod': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked.softmax': ['f32'],
+        'masked.softmin': ['f32'],
+        'masked.std': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked.sum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked.var': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'masked_fill': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked_scatter': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'masked_select': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'matmul': ['f32'],
-        'mm': ['f32'],
-        'mv': ['f32'],
+        'matmul': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'matrix_exp': ['f32'],
+        'max': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'max_pool2d_with_indices_backward': ['f32'],
+        'maximum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'mean': ['f16', 'f32'],
+        'median': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'meshgrid': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'min': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'minimum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'mm': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'mode': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'movedim': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'msort': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'mul': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'multinomial': ['f32'],
+        'mv': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'mvlgamma': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'nan_to_num': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'neg': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64'],
-        'nn.functional.adaptive_max_pool1d': ['f32'],
-        'nn.functional.adaptive_max_pool2d': ['f32'],
+        'nanmean': ['f16', 'f32'],
+        'nanmedian': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'nanquantile': ['f32'],
+        'nansum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'narrow': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'narrow_copy': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'native_batch_norm': ['f32'],
+        'native_dropout_backward': ['b8',
+                                    'f16',
+                                    'f32',
+                                    'i16',
+                                    'i32',
+                                    'i64',
+                                    'u8'],
+        'native_layer_norm': ['f32'],
+        'ne': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'neg': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'new_empty': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'new_empty_strided': ['b8',
+                              'f16',
+                              'f32',
+                              'i16',
+                              'i32',
+                              'i64',
+                              'u8'],
+        'new_full': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'new_ones': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'new_zeros': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'nextafter': ['f32'],
+        'nn.functional._scaled_dot_product_attention': ['f32'],
         'nn.functional.adaptive_avg_pool1d': ['f32'],
         'nn.functional.adaptive_avg_pool2d': ['f32'],
+        'nn.functional.adaptive_avg_pool3d': ['f16', 'f32'],
+        'nn.functional.adaptive_max_pool1d': ['f32'],
+        'nn.functional.adaptive_max_pool2d': ['f32'],
+        'nn.functional.adaptive_max_pool3d': ['f32'],
+        'nn.functional.alpha_dropout': ['f32'],
         'nn.functional.avg_pool1d': ['f32', 'i64'],
         'nn.functional.avg_pool2d': ['f32', 'i64'],
+        'nn.functional.avg_pool3d': ['f32', 'i64'],
+        'nn.functional.batch_norm': ['f32'],
+        'nn.functional.bilinear': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'nn.functional.binary_cross_entropy': ['f32'],
         'nn.functional.binary_cross_entropy_with_logits': ['f32'],
         'nn.functional.celu': ['f32'],
         'nn.functional.conv1d': ['f32'],
         'nn.functional.conv2d': ['f32'],
         'nn.functional.conv_transpose1d': ['f32'],
-        'nn.functional.cosine_embedding_loss': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'nn.functional.conv_transpose2d': ['f32'],
+        'nn.functional.cosine_embedding_loss': ['b8',
+                                                'f32',
+                                                'i16',
+                                                'i32',
+                                                'i64',
+                                                'u8'],
         'nn.functional.cosine_similarity': ['f32'],
+        'nn.functional.cross_entropy': ['f32'],
+        'nn.functional.ctc_loss': ['f32'],
+        'nn.functional.dropout': ['f32'],
+        'nn.functional.dropout2d': ['f32'],
+        'nn.functional.dropout3d': ['f32'],
         'nn.functional.elu': ['f32'],
-        'nn.functional.feature_alpha_dropout': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'nn.functional.embedding': ['f16', 'f32'],
+        'nn.functional.embedding_bag': ['f16', 'f32'],
+        'nn.functional.feature_alpha_dropout': ['b8',
+                                                'f16',
+                                                'f32',
+                                                'i16',
+                                                'i32',
+                                                'i64',
+                                                'u8'],
+        'nn.functional.fractional_max_pool2d': ['f32'],
+        'nn.functional.fractional_max_pool3d': ['f32'],
         'nn.functional.gaussian_nll_loss': ['f32'],
+        'nn.functional.gelu': ['f32'],
         'nn.functional.glu': ['f32'],
+        'nn.functional.grid_sample': ['f32'],
         'nn.functional.group_norm': ['f32'],
+        'nn.functional.hardshrink': ['f32'],
+        'nn.functional.hardsigmoid': ['f32'],
+        'nn.functional.hardswish': ['f32'],
         'nn.functional.hardtanh': ['f32', 'i16', 'i32', 'i64'],
         'nn.functional.hinge_embedding_loss': ['f32'],
         'nn.functional.huber_loss': ['f16', 'f32'],
         'nn.functional.instance_norm': ['f32'],
-        'nn.functional.kl_div': ['f32', 'i16', 'i32', 'i64'],
+        'nn.functional.interpolate': ['f32', 'u8'],
+        'nn.functional.kl_div': ['f32'],
         'nn.functional.l1_loss': ['f16', 'f32'],
+        'nn.functional.layer_norm': ['f32'],
         'nn.functional.leaky_relu': ['f32'],
-        'nn.functional.linear': ['f32'],
-        'nn.functional.local_response_norm': ['f32'],
-        'nn.functional.margin_ranking_loss': ['f32', 'i16', 'i32'],
+        'nn.functional.linear': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'nn.functional.local_response_norm': ['f32', 'i64'],
+        'nn.functional.logsigmoid': ['f32'],
+        'nn.functional.margin_ranking_loss': ['f32',
+                                              'i16',
+                                              'i32',
+                                              'i64',
+                                              'u8'],
         'nn.functional.max_pool1d': ['f32'],
         'nn.functional.max_pool2d': ['f32'],
-        'max_pool2d_with_indices_backward': ['f32'],
+        'nn.functional.max_pool3d': ['f32'],
+        'nn.functional.max_unpool1d': ['f32'],
+        'nn.functional.max_unpool2d': ['f32'],
+        'nn.functional.max_unpool3d': ['f32'],
+        'nn.functional.mish': ['f32'],
         'nn.functional.mse_loss': ['f16', 'f32'],
+        'nn.functional.multi_margin_loss': ['f32'],
+        'nn.functional.multilabel_margin_loss': ['f32'],
+        'nn.functional.multilabel_soft_margin_loss': ['f32'],
         'nn.functional.nll_loss': ['f32'],
-        'nn.functional.pad': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'nn.functional.padconstant': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'nn.functional.padreflect': ['f32'],
-        'nn.functional.padreplicate': ['f32'],
-        'nn.functional.pairwise_distance': ['f16', 'f32', 'i16', 'i32', 'i64'],
-        'nn.functional.poisson_nll_loss': ['f32', 'i16', 'i32', 'u8'],
+        'nn.functional.normalize': ['f32'],
+        'nn.functional.one_hot': ['i64'],
+        'nn.functional.pad': ['b8',
+                              'f16',
+                              'f32',
+                              'i16',
+                              'i32',
+                              'i64',
+                              'u8'],
+        'nn.functional.pairwise_distance': ['f16',
+                                            'f32',
+                                            'i16',
+                                            'i32',
+                                            'i64',
+                                            'u8'],
+        'nn.functional.pdist': ['f32'],
+        'nn.functional.pixel_shuffle': ['b8',
+                                        'f16',
+                                        'f32',
+                                        'i16',
+                                        'i32',
+                                        'i64',
+                                        'u8'],
+        'nn.functional.pixel_unshuffle': ['b8',
+                                          'f16',
+                                          'f32',
+                                          'i16',
+                                          'i32',
+                                          'i64',
+                                          'u8'],
+        'nn.functional.poisson_nll_loss': ['f32',
+                                           'i16',
+                                           'i32',
+                                           'i64',
+                                           'u8'],
         'nn.functional.prelu': ['f32'],
         'nn.functional.relu': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'nn.functional.relu6': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'nn.functional.rrelu': ['f32'],
         'nn.functional.selu': ['f32'],
         'nn.functional.silu': ['f32'],
         'nn.functional.smooth_l1_loss': ['f16', 'f32'],
         'nn.functional.soft_margin_loss': ['f32'],
-        'nn.functional.softmin': ['f32'],
-        'nn.functional.softplus': ['f32'],
-        'nn.functional.softsign': ['f16', 'f32', 'i16', 'u8'],
-        'nn.functional.tanhshrink': ['f32', 'i16', 'i32', 'u8'],
+        'nn.functional.softmin': ['f32', 'f16', 'i16', 'i32', 'i64', 'u8'],
+        'nn.functional.softshrink': ['f32'],
+        'nn.functional.softsign': ['f16',
+                                   'f32',
+                                   'i16',
+                                   'i32',
+                                   'i64',
+                                   'u8'],
+        'nn.functional.tanhshrink': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'nn.functional.threshold': ['f32', 'i16', 'i32', 'i64', 'u8'],
-        'nn.functional.triplet_margin_loss': ['f32', 'i16', 'i32', 'i64'],
-        'nn.functional.triplet_margin_with_distance_loss': ['f32', 'i16', 'i32', 'i64'],
+        'nn.functional.triplet_margin_loss': ['f32',
+                                              'i16',
+                                              'i32',
+                                              'i64',
+                                              'u8'],
+        'nn.functional.triplet_margin_with_distance_loss': ['f32',
+                                                            'i16',
+                                                            'i32',
+                                                            'i64',
+                                                            'u8'],
+        'nn.functional.unfold': ['f16', 'f32'],
         'nn.functional.upsample_bilinear': ['f32'],
-        'nn.functional.upsample_nearest': ['f32'],
+        'nn.functional.upsample_nearest': ['f32', 'u8'],
+        'nonzero': ['b8', 'u8', 'f16', 'f32', 'i16', 'i32', 'i64'],
         'norm': ['f32', 'f16'],
+        'normal': ['f16', 'f32'],
+        'ones': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'ones_like': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'ormqr': ['f32'],
+        'outer': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'pca_lowrank': ['f32'],
+        'permute': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'pinverse': ['f32'],
+        'polar': ['f32'],
+        'polygamma': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'positive': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'pow': ['f16'],
+        'pow': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'prod': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'put': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'qr': ['f32'],
+        'quantile': ['f32'],
         'rad2deg': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'rand_like': ['f16', 'f32'],
+        'randint': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'randint_like': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'randn': ['f16', 'f32'],
+        'randn_like': ['f16', 'f32'],
+        'ravel': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'real': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'reciprocal': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
-        'remainder' : ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'reciprocal': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'remainder': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'renorm': ['f16', 'f32'],
         'repeat': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'repeat_interleave': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'resize_': ['b8', 'i16', 'i32', 'i64', 'u8'],
-        'resize_as_': ['b8', 'i16', 'i32', 'i64', 'u8'],
+        'repeat_interleave': ['b8',
+                              'f16',
+                              'f32',
+                              'i16',
+                              'i32',
+                              'i64',
+                              'u8'],
+        'reshape': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'reshape_as': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'resize_': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'resize_as_': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'resolve_conj': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'resolve_neg': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'roll': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'rot90': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'round': ['f32', 'f16', 'i16', 'i32', 'i64'],
-        'rsqrt': ['b8', 'f32', 'i16', 'i32', 'u8'],
+        'round': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'rsqrt': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'rsub': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'scalar_tensor': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'scatter': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'scatter_add': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'select_scatter': ['b8', 'u8', 'f16', 'f32', 'i16', 'i32', 'i64'],
+        'scatter_reduce': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'searchsorted': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'segment_reduce': ['f16', 'f32'],
+        'select': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'select_scatter': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'sgn': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'short': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'sigmoid': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
-        'sign': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8', 'i64'],
-        'sin': ['b8', 'f32', 'i16', 'i32', 'u8'],
-        'sinh': ['b8', 'f32', 'i16', 'i32', 'u8'],
-        'slice_scatter': ['b8', 'u8', 'f16', 'f32', 'i16', 'i32', 'i64'],
-        'softmax': ['f32'],
+        'sigmoid': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'sign': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'signal.windows.bartlett': ['f16', 'f32'],
+        'signal.windows.blackman': ['f16', 'f32'],
+        'signal.windows.cosine': ['f16', 'f32'],
+        'signal.windows.exponential': ['f16', 'f32'],
+        'signal.windows.gaussian': ['f16', 'f32'],
+        'signal.windows.general_cosine': ['f16', 'f32'],
+        'signal.windows.general_hamming': ['f16', 'f32'],
+        'signal.windows.hamming': ['f16', 'f32'],
+        'signal.windows.hann': ['f16', 'f32'],
+        'signal.windows.kaiser': ['f16', 'f32'],
+        'signbit': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'sin': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'sinc': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'sinh': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'slice': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'slice_scatter': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'softmax': ['f32', 'b8', 'f16', 'i16', 'i32', 'i64', 'u8'],
+        'sort': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.airy_ai': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.bessel_j0': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.bessel_j1': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.bessel_y0': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.bessel_y1': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.chebyshev_polynomial_t': ['b8',
+                                           'f32',
+                                           'i16',
+                                           'i32',
+                                           'i64',
+                                           'u8'],
+        'special.chebyshev_polynomial_u': ['b8',
+                                           'f32',
+                                           'i16',
+                                           'i32',
+                                           'i64',
+                                           'u8'],
+        'special.entr': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.erfcx': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.hermite_polynomial_h': ['b8',
+                                         'f32',
+                                         'i16',
+                                         'i32',
+                                         'i64',
+                                         'u8'],
+        'special.hermite_polynomial_he': ['b8',
+                                          'f32',
+                                          'i16',
+                                          'i32',
+                                          'i64',
+                                          'u8'],
+        'special.i0e': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.i1': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.i1e': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.laguerre_polynomial_l': ['b8',
+                                          'f32',
+                                          'i16',
+                                          'i32',
+                                          'i64',
+                                          'u8'],
+        'special.log_ndtr': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.modified_bessel_i0': ['b8',
+                                       'f32',
+                                       'i16',
+                                       'i32',
+                                       'i64',
+                                       'u8'],
+        'special.modified_bessel_i1': ['b8',
+                                       'f32',
+                                       'i16',
+                                       'i32',
+                                       'i64',
+                                       'u8'],
+        'special.modified_bessel_k0': ['b8',
+                                       'f32',
+                                       'i16',
+                                       'i32',
+                                       'i64',
+                                       'u8'],
+        'special.modified_bessel_k1': ['b8',
+                                       'f32',
+                                       'i16',
+                                       'i32',
+                                       'i64',
+                                       'u8'],
         'special.ndtr': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.ndtri': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.polygamma': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.scaled_modified_bessel_k0': ['b8',
+                                              'f32',
+                                              'i16',
+                                              'i32',
+                                              'i64',
+                                              'u8'],
+        'special.scaled_modified_bessel_k1': ['b8',
+                                              'f32',
+                                              'i16',
+                                              'i32',
+                                              'i64',
+                                              'u8'],
+        'special.spherical_bessel_j0': ['b8',
+                                        'f32',
+                                        'i16',
+                                        'i32',
+                                        'i64',
+                                        'u8'],
+        'special.xlog1py': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.zeta': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'split': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'sqrt': ['b8', 'f32', 'i16', 'i32', 'u8'],
+        'split_with_sizes': ['b8',
+                             'f16',
+                             'f32',
+                             'i16',
+                             'i32',
+                             'i64',
+                             'u8'],
+        'sqrt': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'square': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'squeeze': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'stack': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'std': ['f16', 'f32'],
+        'std_mean': ['f16', 'f32'],
+        'stft': ['f32'],
         'sub': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'sum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'sum_to_size': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'svd': ['f32'],
+        'svd_lowrank': ['f32'],
+        'symeig': ['f32'],
         't': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'tan': ['b8', 'i16', 'i32', 'u8'],
-        'tanh': ['b8', 'f32', 'i16', 'i32', 'u8'],
-        'tensordot': ['f32'],
+        'take': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'take_along_dim': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'tan': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'tanh': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'tensor_split': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'tile': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'topk': ['f32', 'f16'],
-        'trapz': ['f16', 'f32', 'i16', 'i32', 'i64'],
-        'sort': ['f32', 'i16', 'i32', 'i64'],
-        'argsort': ['f32', 'i16', 'i32', 'i64'],
+        'tensordot': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'tile': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'to_sparse': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'topk': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'trace': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'transpose': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'trapezoid': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'cumulative_trapezoid': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'trapz': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'triangular_solve': ['f32'],
         'tril': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'tril_indices': ['i32', 'i64'],
         'triu': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'triu_indices': ['i32', 'i64'],
         'true_divide': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'trunc': ['f32'],
+        'trunc': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'unbind': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'unflatten': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'unfold': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'unfold_copy': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'uniform': ['f16', 'f32'],
+        'unique_consecutive': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'unsqueeze': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'var': ['f16', 'f32'],
+        'var_mean': ['f16', 'f32'],
+        'vdot': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'view': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'view_as': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'view_as_complex': ['f16', 'f32'],
+        'view_copy': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'vsplit': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'vstack': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'zero_': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'where': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'nonzero': ['b8', 'u8', 'f16', 'f32', 'i16', 'i32', 'i64'],
-        'cross': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'linalg.cross': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'unique_consecutive': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'std': ['f16', 'f32'],
-        'var': ['f16', 'f32'],
-        'amax': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'amin': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'sum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'prod': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'mean': ['f16', 'f32'],
-        'count_nonzero': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'masked.amax': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'masked.amin': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'masked.mean': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'masked.prod': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'masked.sum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'native_layer_norm': ['torch.float32'],
-        'nn.functional.layer_norm': ['torch.float32'],
-        'nn.functional.bilinear': ['f32'],
-        'linalg.solve_triangular': ['f32'],
-        'triangular_solve': ['f32'],
-        '_native_batch_norm_legit': ['f32'],
-        'native_batch_norm': ['f32'],
-        'minreduction_with_dim': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'maxreduction_with_dim': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'linalg.inv': ['f32'],
-        'linalg.inv_ex': ['f32'],
-        'mH': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'mT': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'T': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'H': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'xlogy': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'zero_': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'zeros': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'zeros_like': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'index_add': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'nn.functional.softplus': ['f32'],
     }
 
-
     ALLOWLIST_OP_GRAD = {
+        'H': ['f16', 'f32'],
+        'T': ['f16', 'f32'],
+        '__getitem__': ['f16', 'f32'],
         '__radd__': ['f16', 'f32'],
         '__rdiv__': ['f16', 'f32'],
         '__rmatmul__': ['f32'],
+        '__rmod__': ['f16', 'f32'],
         '__rmul__': ['f16', 'f32'],
-        'masked.log_softmax': ['f32'],
-        'masked.logaddexp': ['f32'],
-        'masked.softmax': ['f32'],
-        'masked.softmin': ['f32'],
-        'masked.std': ['f32'],
+        '__rpow__': ['f32'],
+        '__rsub__': ['f16', 'f32'],
+        '_native_batch_norm_legit': ['f32'],
+        '_softmax_backward_data': ['f32'],
         'abs': ['f16', 'f32'],
         'acos': ['f32'],
         'acosh': ['f32'],
@@ -9342,168 +9677,521 @@ class TestConsistency(TestCaseMPS):
         'addmv': ['f32'],
         'addr': ['f32'],
         'all': ['f16', 'f32'],
+        'amax': ['f16', 'f32'],
+        'amin': ['f16', 'f32'],
+        'angle': ['f16', 'f32'],
         'any': ['f16', 'f32'],
         'arange': ['f16', 'f32'],
         'argmax': ['f16', 'f32'],
         'argmin': ['f16', 'f32'],
+        'argsort': ['f16', 'f32'],
+        'argwhere': ['f16', 'f32'],
+        'as_strided': ['f16', 'f32'],
+        'as_strided_scatter': ['f16', 'f32'],
         'asin': ['f32'],
         'asinh': ['f32'],
         'atan': ['f32'],
         'atan2': ['f32'],
+        'atanh': ['f32'],
         'atleast_1d': ['f16', 'f32'],
         'atleast_2d': ['f16', 'f32'],
         'atleast_3d': ['f16', 'f32'],
         'baddbmm': ['f32'],
+        'bernoulli': ['f32'],
+        'bfloat16': ['f16', 'f32'],
         'block_diag': ['f16', 'f32'],
         'bmm': ['f32'],
+        'bool': ['f16', 'f32'],
         'broadcast_shapes': ['f32'],
+        'broadcast_tensors': ['f16', 'f32'],
+        'broadcast_to': ['f16', 'f32'],
+        'bucketize': ['f16', 'f32'],
+        'byte': ['f16', 'f32'],
+        'cartesian_prod': ['f16', 'f32'],
+        'cat': ['f16', 'f32'],
+        'cdist': ['f32'],
         'ceil': ['f32'],
+        'char': ['f16', 'f32'],
+        'cholesky': ['f32'],
+        'cholesky_inverse': ['f32'],
+        'cholesky_solve': ['f32'],
         'chunk': ['f16', 'f32'],
+        'clamp': ['f32'],
+        'clamp_max': ['f16', 'f32'],
+        'clamp_min': ['f16', 'f32'],
         'clone': ['f16', 'f32'],
         'column_stack': ['f16', 'f32'],
+        'combinations': ['f16', 'f32'],
         'conj': ['f16', 'f32'],
         'conj_physical': ['f16', 'f32'],
+        'constant_pad_nd': ['f16', 'f32'],
         'contiguous': ['f16', 'f32'],
+        'copysign': ['f16', 'f32'],
         'corrcoef': ['f32'],
         'cos': ['f32'],
         'cosh': ['f32'],
-        'cumsum': ['f16', 'f32'],
+        'count_nonzero': ['f16', 'f32'],
+        'cov': ['f32'],
+        'cross': ['f32'],
+        'cummax': ['f32'],
+        'cummin': ['f32'],
+        'cumprod': ['f32'],
+        'cumsum': ['f32'],
+        'cumulative_trapezoid': ['f32'],
         'deg2rad': ['f16', 'f32'],
-        'diag': ['f32'],
+        'diag': ['f16', 'f32'],
         'diag_embed': ['f16', 'f32'],
-        'diagflat': ['f32'],
+        'diagflat': ['f16', 'f32'],
+        'diagonal': ['f16', 'f32'],
+        'diagonal_copy': ['f16', 'f32'],
         'diagonal_scatter': ['f16', 'f32'],
         'diff': ['f16', 'f32'],
-        'dist': ['f32'],
+        'digamma': ['f32'],
+        'dist': ['f16', 'f32'],
+        'div': ['f16', 'f32'],
         'dot': ['f32'],
+        'double': ['f16', 'f32'],
+        'dsplit': ['f16', 'f32'],
+        'dstack': ['f16', 'f32'],
         'einsum': ['f32'],
+        'empty_like': ['f16', 'f32'],
+        'eq': ['f16', 'f32'],
         'erf': ['f32'],
+        'erfc': ['f32'],
+        'erfinv': ['f32'],
         'exp': ['f32'],
         'exp2': ['f16', 'f32'],
+        'expand': ['f16', 'f32'],
+        'expand_as': ['f16', 'f32'],
+        'expm1': ['f32'],
+        'fft.fftshift': ['f16', 'f32'],
+        'fft.hfft': ['f32'],
+        'fft.hfft2': ['f32'],
+        'fft.hfftn': ['f32'],
+        'fft.ifftshift': ['f16', 'f32'],
+        'fft.irfft': ['f32'],
+        'fft.irfft2': ['f32'],
+        'fft.irfftn': ['f32'],
         'fill': ['f16', 'f32'],
         'flatten': ['f16', 'f32'],
         'flip': ['f16', 'f32'],
         'fliplr': ['f16', 'f32'],
         'flipud': ['f16', 'f32'],
-        'float': ['f32'],
+        'float': ['f16', 'f32'],
+        'float_power': ['f16', 'f32'],
         'floor': ['f32'],
-        'gradient': ['f32'],
-        'half': ['f16'],
+        'fmax': ['f16', 'f32'],
+        'fmin': ['f16', 'f32'],
+        'fmod': ['f16', 'f32'],
+        'frac': ['f16', 'f32'],
+        'frexp': ['f16', 'f32'],
+        'full': ['f16', 'f32'],
+        'full_like': ['f16', 'f32'],
+        'gather': ['f16', 'f32'],
+        'ge': ['f16', 'f32'],
+        'gradient': ['f16', 'f32'],
+        'grid_sampler_2d': ['f32'],
+        'gt': ['f16', 'f32'],
+        'half': ['f16', 'f32'],
+        'histc': ['f32'],
+        'hsplit': ['f16', 'f32'],
         'hstack': ['f16', 'f32'],
-        'index_select': ['f16', 'f32'],
+        'hypot': ['f32'],
+        'i0': ['f32'],
         'index_add': ['f16', 'f32'],
+        'index_copy': ['f16', 'f32'],
+        'index_fill': ['f16', 'f32'],
+        'index_put': ['f16', 'f32'],
+        'index_reduce': ['f16', 'f32'],
+        'index_select': ['f16', 'f32'],
+        'inner': ['f32'],
+        'int': ['f16', 'f32'],
         'isclose': ['f16', 'f32'],
         'isfinite': ['f16', 'f32'],
+        'isin': ['f32'],
         'isinf': ['f16', 'f32'],
         'isnan': ['f16', 'f32'],
+        'isneginf': ['f16', 'f32'],
+        'isposinf': ['f16', 'f32'],
         'isreal': ['f16', 'f32'],
-        'kron': ['f32'],
-        'linalg.matrix_norm': ['f16'],
+        'kron': ['f16', 'f32'],
+        'kthvalue': ['f32'],
+        'ldexp': ['f16', 'f32'],
+        'le': ['f16', 'f32'],
+        'lerp': ['f32'],
+        'lgamma': ['f32'],
+        'linalg.cholesky': ['f32'],
+        'linalg.cholesky_ex': ['f32'],
+        'linalg.cond': ['f32'],
+        'linalg.cross': ['f32'],
+        'linalg.det': ['f32'],
+        'linalg.eigh': ['f32'],
+        'linalg.eigvalsh': ['f32'],
+        'linalg.householder_product': ['f32'],
+        'linalg.inv': ['f32'],
+        'linalg.inv_ex': ['f32'],
+        'linalg.ldl_factor': ['f32'],
+        'linalg.ldl_factor_ex': ['f32'],
+        'linalg.lstsq': ['f32'],
+        'linalg.lu': ['f32'],
+        'linalg.lu_factor': ['f32'],
+        'linalg.lu_factor_ex': ['f32'],
+        'linalg.lu_solve': ['f32'],
+        'linalg.matrix_norm': ['f16', 'f32'],
+        'linalg.matrix_power': ['f32'],
+        'linalg.matrix_rank': ['f32'],
+        'linalg.multi_dot': ['f32'],
+        'linalg.norm': ['f16', 'f32'],
+        'linalg.pinv': ['f32'],
+        'linalg.qr': ['f32'],
+        'linalg.slogdet': ['f32'],
+        'linalg.solve': ['f32'],
+        'linalg.solve_ex': ['f32'],
+        'linalg.solve_triangular': ['f32'],
         'linalg.svd': ['f32'],
+        'linalg.svdvals': ['f32'],
+        'linalg.tensorinv': ['f32'],
+        'linalg.tensorsolve': ['f32'],
+        'linalg.vander': ['f32'],
+        'linalg.vecdot': ['f32'],
+        'linalg.vector_norm': ['f16', 'f32'],
         'linspace': ['f16', 'f32'],
         'log': ['f32'],
         'log10': ['f32'],
         'log1p': ['f32'],
         'log2': ['f32'],
-        'log_softmax': ['f32'],
+        'log_softmax': ['f32', 'f16'],
         'logaddexp': ['f32'],
+        'logaddexp2': ['f32'],
+        'logcumsumexp': ['f32'],
+        'logdet': ['f32'],
+        'logical_and': ['f16', 'f32'],
         'logical_not': ['f16', 'f32'],
+        'logical_or': ['f16', 'f32'],
+        'logical_xor': ['f16', 'f32'],
+        'logit': ['f32'],
         'logspace': ['f32'],
+        'logsumexp': ['f32'],
+        'long': ['f16', 'f32'],
+        'lt': ['f16', 'f32'],
+        'lu': ['f32'],
+        'lu_solve': ['f32'],
+        'lu_unpack': ['f32'],
+        'mH': ['f16', 'f32'],
+        'mT': ['f16', 'f32'],
+        'masked.amax': ['f16', 'f32'],
+        'masked.amin': ['f16', 'f32'],
+        'masked.argmax': ['f16', 'f32'],
+        'masked.argmin': ['f16', 'f32'],
+        'masked.cumprod': ['f32'],
+        'masked.cumsum': ['f32'],
+        'masked.log_softmax': ['f32'],
+        'masked.logaddexp': ['f32'],
+        'masked.logsumexp': ['f32'],
+        'masked.mean': ['f16', 'f32'],
+        'masked.median': ['f32'],
+        'masked.norm': ['f16', 'f32'],
+        'masked.normalize': ['f16', 'f32'],
+        'masked.prod': ['f32'],
+        'masked.softmax': ['f32'],
+        'masked.softmin': ['f32'],
+        'masked.std': ['f32'],
+        'masked.sum': ['f16', 'f32'],
+        'masked.var': ['f16', 'f32'],
+        'masked_fill': ['f16', 'f32'],
+        'masked_scatter': ['f16', 'f32'],
+        'masked_select': ['f16', 'f32'],
         'matmul': ['f32'],
+        'matrix_exp': ['f32'],
+        'max': ['f16', 'f32'],
+        'max_pool2d_with_indices_backward': ['f32'],
+        'maximum': ['f16', 'f32'],
+        'mean': ['f16', 'f32'],
+        'median': ['f32'],
+        'meshgrid': ['f16', 'f32'],
+        'min': ['f16', 'f32'],
+        'minimum': ['f16', 'f32'],
         'mm': ['f32'],
+        'mode': ['f16', 'f32'],
+        'movedim': ['f16', 'f32'],
+        'msort': ['f16', 'f32'],
+        'mul': ['f16', 'f32'],
+        'multinomial': ['f32'],
         'mv': ['f32'],
+        'mvlgamma': ['f32'],
+        'nan_to_num': ['f16', 'f32'],
+        'nanmean': ['f16', 'f32'],
+        'nanmedian': ['f32'],
+        'nanquantile': ['f32'],
+        'nansum': ['f16', 'f32'],
+        'narrow': ['f16', 'f32'],
+        'native_batch_norm': ['f32'],
+        'native_dropout_backward': ['f16', 'f32'],
+        'native_layer_norm': ['f32'],
+        'ne': ['f16', 'f32'],
         'neg': ['f16', 'f32'],
-        'nn.functional.adaptive_max_pool1d': ['f32'],
-        'nn.functional.adaptive_max_pool2d': ['f32'],
+        'new_empty': ['f16', 'f32'],
+        'new_empty_strided': ['f16', 'f32'],
+        'new_full': ['f16', 'f32'],
+        'new_ones': ['f16', 'f32'],
+        'new_zeros': ['f16', 'f32'],
+        'nn.functional._scaled_dot_product_attention': ['f32'],
         'nn.functional.adaptive_avg_pool1d': ['f32'],
         'nn.functional.adaptive_avg_pool2d': ['f32'],
+        'nn.functional.adaptive_avg_pool3d': ['f16', 'f32'],
+        'nn.functional.adaptive_max_pool1d': ['f32'],
+        'nn.functional.adaptive_max_pool2d': ['f32'],
+        'nn.functional.adaptive_max_pool3d': ['f32'],
+        'nn.functional.alpha_dropout': ['f32'],
         'nn.functional.avg_pool1d': ['f32'],
         'nn.functional.avg_pool2d': ['f32'],
+        'nn.functional.avg_pool3d': ['f32'],
+        'nn.functional.batch_norm': ['f32'],
+        'nn.functional.bilinear': ['f32'],
         'nn.functional.binary_cross_entropy': ['f32'],
+        'nn.functional.binary_cross_entropy_with_logits': ['f32'],
         'nn.functional.celu': ['f32'],
         'nn.functional.conv1d': ['f32'],
         'nn.functional.conv2d': ['f32'],
         'nn.functional.conv_transpose1d': ['f32'],
+        'nn.functional.conv_transpose2d': ['f32'],
+        'nn.functional.conv_transpose3d': ['f32'],
         'nn.functional.cosine_embedding_loss': ['f32'],
+        'nn.functional.cosine_similarity': ['f32'],
+        'nn.functional.cross_entropy': ['f32'],
+        'nn.functional.ctc_loss': ['f32'],
+        'nn.functional.dropout': ['f32'],
+        'nn.functional.dropout2d': ['f32'],
+        'nn.functional.dropout3d': ['f32'],
         'nn.functional.elu': ['f32'],
-        'nn.functional.feature_alpha_dropout': ['f16', 'f32'],
+        'nn.functional.embedding': ['f16', 'f32'],
+        'nn.functional.embedding_bag': ['f16', 'f32'],
+        'nn.functional.feature_alpha_dropout': ['f32', 'f16'],
+        'nn.functional.fractional_max_pool2d': ['f32'],
+        'nn.functional.fractional_max_pool3d': ['f32'],
+        'nn.functional.gaussian_nll_loss': ['f32'],
+        'nn.functional.gelu': ['f32'],
         'nn.functional.glu': ['f32'],
+        'nn.functional.grid_sample': ['f32'],
+        'nn.functional.group_norm': ['f32'],
+        'nn.functional.hardshrink': ['f32'],
+        'nn.functional.hardsigmoid': ['f32'],
+        'nn.functional.hardswish': ['f32'],
         'nn.functional.hardtanh': ['f32'],
         'nn.functional.hinge_embedding_loss': ['f32'],
         'nn.functional.huber_loss': ['f16', 'f32'],
         'nn.functional.instance_norm': ['f32'],
+        'nn.functional.interpolate': ['f32'],
         'nn.functional.kl_div': ['f32'],
         'nn.functional.l1_loss': ['f16', 'f32'],
+        'nn.functional.layer_norm': ['f32'],
         'nn.functional.leaky_relu': ['f32'],
+        'nn.functional.linear': ['f32'],
         'nn.functional.local_response_norm': ['f32'],
+        'nn.functional.logsigmoid': ['f32'],
         'nn.functional.margin_ranking_loss': ['f32'],
         'nn.functional.max_pool1d': ['f32'],
         'nn.functional.max_pool2d': ['f32'],
+        'nn.functional.max_pool3d': ['f32'],
+        'nn.functional.max_unpool1d': ['f32'],
+        'nn.functional.max_unpool2d': ['f32'],
+        'nn.functional.max_unpool3d': ['f32'],
+        'nn.functional.mish': ['f32'],
         'nn.functional.mse_loss': ['f32'],
+        'nn.functional.multi_margin_loss': ['f32'],
+        'nn.functional.multilabel_margin_loss': ['f32'],
+        'nn.functional.multilabel_soft_margin_loss': ['f32'],
         'nn.functional.nll_loss': ['f32'],
-        'nn.functional.pad': ['f16', 'f32', 'i16', 'i32', 'i64'],
+        'nn.functional.normalize': ['f32'],
+        'nn.functional.pad': ['f16', 'f32'],
         'nn.functional.pairwise_distance': ['f16', 'f32'],
+        'nn.functional.pdist': ['f32'],
+        'nn.functional.pixel_shuffle': ['f16', 'f32'],
+        'nn.functional.pixel_unshuffle': ['f16', 'f32'],
         'nn.functional.poisson_nll_loss': ['f32'],
+        'nn.functional.prelu': ['f32'],
         'nn.functional.relu': ['f32'],
         'nn.functional.relu6': ['f32'],
+        'nn.functional.rrelu': ['f32'],
         'nn.functional.selu': ['f32'],
         'nn.functional.silu': ['f32'],
+        'nn.functional.smooth_l1_loss': ['f32'],
         'nn.functional.soft_margin_loss': ['f32'],
-        'nn.functional.softmin': ['f32'],
+        'nn.functional.softmin': ['f32', 'f16'],
         'nn.functional.softplus': ['f32'],
+        'nn.functional.softshrink': ['f32'],
         'nn.functional.softsign': ['f16', 'f32'],
-        'nn.functional.smooth_l1_loss': ['f32'],
+        'nn.functional.tanhshrink': ['f32'],
         'nn.functional.threshold': ['f32'],
         'nn.functional.triplet_margin_loss': ['f32'],
         'nn.functional.triplet_margin_with_distance_loss': ['f32'],
+        'nn.functional.unfold': ['f16', 'f32'],
         'nn.functional.upsample_bilinear': ['f32'],
-        'norm': ['f32', 'f16'],
+        'nn.functional.upsample_nearest': ['f32'],
+        'nonzero': ['f16', 'f32'],
+        'norm': ['f16', 'f32'],
+        'normal': ['f16', 'f32'],
+        'ones': ['f16', 'f32'],
+        'ones_like': ['f16', 'f32'],
+        'ormqr': ['f32'],
+        'outer': ['f16', 'f32'],
+        'pca_lowrank': ['f32'],
+        'permute': ['f16', 'f32'],
+        'pinverse': ['f32'],
+        'polygamma': ['f32'],
         'positive': ['f16', 'f32'],
+        'pow': ['f32'],
+        'prod': ['f32'],
+        'put': ['f16', 'f32'],
+        'qr': ['f32'],
+        'quantile': ['f32'],
         'rad2deg': ['f16', 'f32'],
+        'rand_like': ['f16', 'f32'],
+        'randint': ['f16', 'f32'],
+        'randint_like': ['f16', 'f32'],
+        'randn_like': ['f16', 'f32'],
+        'ravel': ['f16', 'f32'],
         'real': ['f16', 'f32'],
         'reciprocal': ['f16', 'f32'],
+        'remainder': ['f16', 'f32'],
+        'renorm': ['f16', 'f32'],
         'repeat': ['f16', 'f32'],
         'repeat_interleave': ['f16', 'f32'],
+        'reshape': ['f16', 'f32'],
+        'reshape_as': ['f16', 'f32'],
         'resolve_conj': ['f16', 'f32'],
         'resolve_neg': ['f16', 'f32'],
+        'roll': ['f16', 'f32'],
+        'rot90': ['f16', 'f32'],
         'round': ['f32'],
         'rsqrt': ['f32'],
+        'rsub': ['f16', 'f32'],
+        'scatter': ['f16', 'f32'],
+        'scatter_add': ['f16', 'f32'],
+        'scatter_reduce': ['f16', 'f32'],
+        'searchsorted': ['f16', 'f32'],
+        'segment_reduce': ['f16', 'f32'],
+        'select': ['f16', 'f32'],
         'select_scatter': ['f16', 'f32'],
+        'sgn': ['f16', 'f32'],
+        'short': ['f16', 'f32'],
+        'sigmoid': ['f32'],
         'sign': ['f16', 'f32'],
+        'signbit': ['f16', 'f32'],
         'sin': ['f32'],
+        'sinc': ['f32'],
         'sinh': ['f32'],
+        'slice': ['f16', 'f32'],
         'slice_scatter': ['f16', 'f32'],
-        'softmax': ['f32'],
+        'softmax': ['f32', 'f16'],
+        'sort': ['f16', 'f32'],
+        'special.airy_ai': ['f32'],
+        'special.bessel_j0': ['f32'],
+        'special.bessel_j1': ['f32'],
+        'special.bessel_y0': ['f32'],
+        'special.bessel_y1': ['f32'],
+        'special.chebyshev_polynomial_t': ['f32'],
+        'special.chebyshev_polynomial_u': ['f32'],
+        'special.entr': ['f32'],
+        'special.erfcx': ['f32'],
+        'special.hermite_polynomial_h': ['f32'],
+        'special.hermite_polynomial_he': ['f32'],
+        'special.i0e': ['f32'],
+        'special.i1': ['f32'],
+        'special.i1e': ['f32'],
+        'special.laguerre_polynomial_l': ['f32'],
+        'special.log_ndtr': ['f32'],
+        'special.modified_bessel_i0': ['f32'],
+        'special.modified_bessel_i1': ['f32'],
+        'special.modified_bessel_k0': ['f32'],
+        'special.modified_bessel_k1': ['f32'],
+        'special.ndtr': ['f32'],
+        'special.ndtri': ['f32'],
+        'special.polygamma': ['f32'],
+        'special.scaled_modified_bessel_k0': ['f32'],
+        'special.scaled_modified_bessel_k1': ['f32'],
+        'special.spherical_bessel_j0': ['f32'],
+        'special.xlog1py': ['f16', 'f32'],
         'split': ['f16', 'f32'],
+        'split_with_sizes': ['f16', 'f32'],
         'sqrt': ['f32'],
         'square': ['f16', 'f32'],
         'squeeze': ['f16', 'f32'],
         'stack': ['f16', 'f32'],
-        'sub': ['f32'],
+        'std': ['f16', 'f32'],
+        'std_mean': ['f16', 'f32'],
+        'sub': ['f16', 'f32'],
+        'sum': ['f16', 'f32'],
         'sum_to_size': ['f16', 'f32'],
         'svd': ['f32'],
+        'svd_lowrank': ['f32'],
+        'symeig': ['f32'],
         't': ['f16', 'f32'],
+        'take': ['f16', 'f32'],
+        'take_along_dim': ['f16', 'f32'],
+        'tan': ['f32'],
         'tanh': ['f32'],
+        'tensor_split': ['f16', 'f32'],
         'tensordot': ['f32'],
         'tile': ['f16', 'f32'],
+        'to': ['f16', 'f32'],
+        'topk': ['f32'],
+        'trace': ['f32'],
+        'transpose': ['f16', 'f32'],
+        'trapezoid': ['f16', 'f32'],
+        'trapz': ['f16', 'f32'],
+        'triangular_solve': ['f32'],
         'tril': ['f16', 'f32'],
         'triu': ['f16', 'f32'],
         'true_divide': ['f16', 'f32'],
         'trunc': ['f32'],
         'unbind': ['f16', 'f32'],
         'unflatten': ['f16', 'f32'],
+        'unfold': ['f16', 'f32'],
+        'unfold_copy': ['f16', 'f32'],
+        'uniform': ['f16', 'f32'],
         'unsqueeze': ['f16', 'f32'],
+        'var': ['f16', 'f32'],
+        'var_mean': ['f16', 'f32'],
+        'vdot': ['f32'],
         'view': ['f16', 'f32'],
         'view_as': ['f16', 'f32'],
+        'view_copy': ['f16', 'f32'],
         'vsplit': ['f16', 'f32'],
         'vstack': ['f16', 'f32'],
+        'where': ['f16', 'f32'],
+        'xlogy': ['f16', 'f32'],
         'zero_': ['f16', 'f32'],
-        'linalg.solve_triangular': ['f32'],
-        'triangular_solve': ['f32'],
-        '_native_batch_norm_legit': ['f32'],
-        'native_batch_norm': ['f32'],
-        'native_layer_norm': ['f32'],
-        'nn.functional.gelu': ['f32'],
+        'zeros': ['f16', 'f32'],
+        'zeros_like': ['f16', 'f32'],
+    }
+
+    BLOCKLIST_OP_GRAD = {
+        # Unimplemented ops
+        '__getitem__': ['f16'],
+        'combinations': ['f16', 'f32'],
+        'logaddexp2': ['f32'],
+        'masked_select': ['f16', 'f32'],
+        'nn.functional.binary_cross_entropy_with_logits': ['f16', 'f32'],
+        'nn.functional.group_norm': ['f32'],
+        'prod': ['f32'],
+        'sgn': ['f16', 'f32'],
+        'unfold_copy': ['f16', 'f32'],
+        'unfold': ['f16', 'f32'],
+        'trace': ['f32'],
+
+        # Correctness issues
+        'atanh': ['f32'],
+        'div': ['f16'],
+
+        # Unsupported dtype
+        'special.ndtr': ['f32'],
+        'trapezoid': ['f16', 'f32'],
+        'trapz': ['f16', 'f32'],
+    }
+
+    BLOCKLIST_OP_GRAD_MACOS_12 = {
+        'remainder': ['f16'],
     }
 
     # These ops that are problematic. So never run them even when
@@ -9511,124 +10199,388 @@ class TestConsistency(TestCaseMPS):
     # If the dtype list is None, all dtypes are excluded.
     # All the entries in this list should be removed
     BLOCKLIST = {
-        # Functions that hang
-        'masked_fill': [torch.bool, torch.uint8, torch.float32], 'where': [torch.bool],
-        # + forward when requires_grad=True or running backward
-        'masked.mean': [torch.bool, torch.float16],
-        'masked.prod': [torch.bool],
-        'masked.sum': [torch.bool],
-
         # Functions that hard crash
-        'std': [torch.float16],
-        'stft': [torch.float32], 'var': [torch.float16],
-        # + forward when requires_grad=True or running backward
-        'nn.functional.embedding': [torch.float32, torch.float16],
-        '__rpow__': [torch.int64],
-
-        'as_strided_scatter': [torch.uint8],
-        'atan2': [torch.int64],
-        'bfloat16': None,
-        'block_diag': [torch.uint8],
-        'byte': None,
-        'chalf': None,
-        'diag_embed': [torch.uint8],
-        'diagonal_scatter': [torch.uint8],
-        'long': None,
+        'resize_': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'resize_as_': [torch.float16, torch.float32],
+        'topk': [torch.int16, torch.int32, torch.int64, torch.uint8],
+
+        # Functions with correctness issues
+        'multinomial': [torch.float32],
+
+        # cpu result off, showing random values
+        'as_stridedpartial_views': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        # cpu result off, showing inf values
+        'dist': [torch.float16],
+
+        # failure due to issue: atan2() may generate NAN in output with
+        'atan2': [torch.bool, torch.int16, torch.int32, torch.uint8],
+
+        # Unsupported Border padding mode
+        'grid_sampler_2d': [torch.float32],
+        'nn.functional.grid_sample': [torch.float32],
+
+        # failures due to issue #102048039: powerWithPrimaryTensor() with integer input may return wrong results
+        'pow': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        '__rpow__': [torch.uint8],
+
+        # failures before macOS 13.3
+        'nn.functional.conv_transpose2d': [torch.float32],
+    }
+
+    UNIMPLEMENTED_OPS = {
+        # Failures due to lack of op implementation on MPS backend
+        'linalg.eig': [torch.float32],
+        'linalg.eigvals': [torch.float32],
+        'fft.fft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.ifft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.ihfft2': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.ihfft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.ihfftn': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.rfft2': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.rfft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.rfftn': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'put': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'stft': [torch.float32],
+        'nn.functional.conv_transpose3d': [torch.int64, torch.float32],
+        'rounddecimals_neg_3': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'rounddecimals_3': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'rounddecimals_0': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        '__rmod__': [torch.float16, torch.float32],
+        '__rsub__': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'aminmax': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'angle': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'argsort': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'bucketize': [torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'cholesky': [torch.float32],
+        'cholesky_inverse': [torch.float32],
+        'cholesky_solve': [torch.float32],
+        'copysign': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'cummax': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'cummin': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'cumprod': [torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'digamma': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'erfc': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'erfinv': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fmax': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fmin': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fmod': [torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'frexp': [torch.float16, torch.float32],
+        'gcd': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'geqrf': [torch.float32],
+        'heaviside': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'histc': [torch.float32],
+        'histogram': [torch.float32],
+        'histogramdd': [torch.float32],
+        'hypot': [torch.float32],
+        'i0': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'igamma': [torch.float16, torch.float32],
+        'igammac': [torch.float16, torch.float32],
+        'index_copy': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'index_fill': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'index_reduce': [torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'isin': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'isneginf': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'isposinf': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'kthvalue': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'lcm': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'ldexp': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'lerp': [torch.float32],
+        'lgamma': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'linalg.cholesky': [torch.float32],
+        'linalg.cholesky_ex': [torch.float32],
+        'linalg.cond': [torch.float32],
+        'linalg.detsingular': [torch.float32],
+        'linalg.det': [torch.float32],
+        'linalg.eigh': [torch.float32],
+        'linalg.eigvalsh': [torch.float32],
+        'linalg.householder_product': [torch.float32],
+        'linalg.ldl_factor': [torch.float32],
+        'linalg.ldl_factor_ex': [torch.float32],
+        'linalg.ldl_solve': [torch.float32],
+        'linalg.lstsq': [torch.float32],
+        'linalg.lstsqgrad_oriented': [torch.float32],
+        'linalg.lu': [torch.float32],
+        'linalg.lu_factor': [torch.float32],
+        'linalg.lu_factor_ex': [torch.float32],
+        'linalg.lu_solve': [torch.float32],
+        'linalg.matrix_norm': [torch.float32],
+        'linalg.norm': [torch.float32],
+        'linalg.normsubgradients_at_zero': [torch.float32],
+        'linalg.qr': [torch.float32],
+        'linalg.slogdet': [torch.float32],
+        'linalg.solve': [torch.float32],
+        'linalg.solve_ex': [torch.float32],
+        'linalg.svdvals': [torch.float32],
+        'linalg.tensorsolve': [torch.float32],
+        'linalg.vander': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'linalg.vecdot': [torch.float32],
+        'logcumsumexp': [torch.float32],
+        'logdet': [torch.float32],
+        'logit': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'lu': [torch.float32],
+        'lu_solve': [torch.float32],
+        'lu_unpack': [torch.float32],
+        'masked.cumprod': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'masked.median': [torch.float32],
+        'masked_scatter': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'matrix_exp': [torch.float32],
+        'mode': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'msort': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'mvlgamma': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'mvlgammamvlgamma_p_1': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'mvlgammamvlgamma_p_3': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'mvlgammamvlgamma_p_5': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'nanquantile': [torch.float32],
+        'nanmean': [torch.float32, torch.float16],
+        'nanmedian': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'nansum': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'native_dropout_backward': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'nextafter': [torch.float32],
+        'normnuc': [torch.float32],
+        'nn.functional._scaled_dot_product_attention': [torch.float32],
+        'nn.functional.fractional_max_pool2d': [torch.float32],
+        'nn.functional.fractional_max_pool3d': [torch.float32],
+        'nn.functional.adaptive_avg_pool3d': [torch.float16, torch.float32],
+        'nn.functional.adaptive_max_pool3d': [torch.float32],
+        'nn.functional.interpolatearea': [torch.float32],
+        'nn.functional.interpolatebicubic': [torch.float32],
+        'nn.functional.interpolatelinear': [torch.float32],
+        'nn.functional.interpolatetrilinear': [torch.float32],
+        'nn.functional.max_unpool1dgrad': [torch.float32],
+        'nn.functional.max_unpool2dgrad': [torch.float32],
+        'nn.functional.max_unpool3dgrad': [torch.float32],
+        'nn.functional.avg_pool3d': [torch.float32, torch.int64],
+        'nn.functional.ctc_loss': [torch.float32],
+        'nn.functional.embedding_bag': [torch.float16, torch.float32],
+        'nn.functional.hardshrink': [torch.float32],
+        'nn.functional.hardsigmoid': [torch.float32],
+        'nn.functional.logsigmoid': [torch.float32],
+        'nn.functional.max_pool3d': [torch.float32],
+        'nn.functional.max_unpool1d': [torch.float32],
+        'nn.functional.max_unpool2d': [torch.float32],
+        'nn.functional.max_unpool3d': [torch.float32],
+        'nn.functional.mish': [torch.float32],
+        'nn.functional.multi_margin_loss': [torch.float32],
+        'nn.functional.multilabel_margin_loss': [torch.float32],
+        'nn.functional.multilabel_soft_margin_loss': [torch.float32],
+        'nn.functional.pdist': [torch.float32],
+        'nn.functional.rrelu': [torch.float32],
+        'nn.functional.softshrink': [torch.float32],
+        'nn.functional.unfold': [torch.float16, torch.float32],
+        'nn.functional.norm': [torch.float32],
+        'ormqr': [torch.float32],
+        'pca_lowrank': [torch.float32],
+        'pinverse': [torch.float32],
+        'polar': [torch.float32],
+        'polygamma': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'polygammapolygamma_n_0': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'polygammapolygamma_n_1': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'polygammapolygamma_n_2': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'polygammapolygamma_n_3': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'polygammapolygamma_n_4': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'qr': [torch.float32],
+        'quantile': [torch.float32],
+        'renorm': [torch.float16, torch.float32],
+        'roll': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'rsub': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'scatter_reduceamax': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'scatter_reduceamin': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'scatter_reducemin': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'scatter_reducemean': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'scatter_reduceprod': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'scatter_reducesum': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'searchsorted': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'segment_reduce': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'segment_reduceoffsets': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'segment_reducelengths': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'sinc': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'sort': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.airy_ai': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.bessel_j0': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.bessel_j1': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.bessel_y0': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.bessel_y1': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.chebyshev_polynomial_t': [torch.bool,
+                                           torch.float16,
+                                           torch.float32,
+                                           torch.int16,
+                                           torch.int32,
+                                           torch.int64,
+                                           torch.uint8],
+        'special.chebyshev_polynomial_u': [torch.bool,
+                                           torch.float16,
+                                           torch.float32,
+                                           torch.int16,
+                                           torch.int32,
+                                           torch.int64,
+                                           torch.uint8],
+        'special.entr': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.erfcx': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.hermite_polynomial_h': [torch.bool,
+                                         torch.float16,
+                                         torch.float32,
+                                         torch.int16,
+                                         torch.int32,
+                                         torch.int64,
+                                         torch.uint8],
+        'special.hermite_polynomial_he': [torch.bool,
+                                          torch.float16,
+                                          torch.float32,
+                                          torch.int16,
+                                          torch.int32,
+                                          torch.int64,
+                                          torch.uint8],
+        'special.i0e': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.i1': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.i1e': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.laguerre_polynomial_l': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.log_ndtr': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.modified_bessel_i0': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.modified_bessel_i1': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.modified_bessel_k0': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.modified_bessel_k1': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.ndtri': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.polygamma': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.polygammaspecial_polygamma_n_0': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.scaled_modified_bessel_k0': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.scaled_modified_bessel_k1': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.spherical_bessel_j0': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.xlog1py': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.zeta': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'std_mean': [torch.float16, torch.float32],
+        'std_meanunbiased': [torch.float16, torch.float32],
+        'svd_lowrank': [torch.float32],
+        'symeig': [torch.float32],
+        'take': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'to_sparse': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'unique': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'var_mean': [torch.float16, torch.float32],
+        'var_meanunbiased': [torch.float16, torch.float32],
+        'vdot': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'view_as_complex': [torch.float16, torch.float32],
+        'xlogy': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+    }
+
+    EXPECTED_FAILURES = {
+        # Failures due to unsupported data types on MPS backend
+        'bfloat16': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'chalf': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'nn.functional.conv1d': [torch.int64],
         'nn.functional.conv2d': [torch.int64],
         'nn.functional.conv_transpose1d': [torch.int64],
-        'nn.functional.conv_transpose2d': [torch.int64],
-        'nn.functional.conv_transpose3d': [torch.int64, torch.float32],
-        'nn.functional.local_response_norm': [torch.int64],
-        'nn.functional.padcircular': [torch.uint8],
-        'pow': [torch.int64],
-        'select_scatter': [torch.uint8],
-        'sigmoid': [torch.int64],
-
-
-        # failures due to lack of op implementation on MPS backend
-        'put': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-
-        # These were moved from ALLOWLIST to BLOCK as they are not working
-        # locally
-        'tile': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        '__radd__': ['torch.bool', 'torch.uint8'],
-        '__rmul__': ['torch.uint8'],
-        'neg': ['torch.uint8'],
-        'add': ['torch.bool', 'torch.uint8'],
-        'addr': ['torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'diag': ['torch.int64'],
-        'diagflat': ['torch.int64'],
-
-        # Functions that are flaky
-        # These are detected as "ok" by the expect case but actually fail to run sometimes
-        'as_strided': None,
-        'broadcast_tensors': None,
-        'broadcast': None,
-        'broadcast_to': None,
-        'diagonal': None,
-        'divfloor_rounding': None,
-        'divno_rounding_mode': None,
-        'divtrunc_rounding': None,
-        'dsplit': None,
-        'hsplit': None,
-        'empty': None,
-        'expand_as': None,
-        'expand': None,
-        'ge': None,
-        'ne': None,
-        'le': None,
-        'lt': None,
-        'gt': None,
-        'transpose': None,
-        'splitlist_args': None,
-        'select': None,
-        'reshape': None,
-        'reshape_as': None,
-        'permute': None,
-        'norm': None,
-        'nn.functional.pixel_unshuffle': None,
-        'nn.functional.pixel_shuffle': None,
-        'nn.functional.cross_entropy': None,
-        'nn.functional.one_hot': None,
-        'narrow': None,
-        'movedim': None,
-        'minreduction_with_dim': None,
-        'minreduction_no_dim': None,
-        'minbinary': None,
-        'meshgridvariadic_tensors': None,
-        'meshgridlist_of_tensors': None,
-        'maxreduction_with_dim': None,
-        'maxreduction_no_dim': None,
-        'maxbinary': None,
-        'maximum': None,
-        'minimum': None,
-        'outer': None,
-        'softmaxwith_dtype': None,
-        'rounddecimals_neg_3': None,
-        'rounddecimals_3': None,
-        'rounddecimals_0': None,
-        'normnuc': None,
-        'nn.functional.softminwith_dtype': None,
-        'nn.functional.feature_alpha_dropoutwith_train': None,
-        'log_softmaxwith_dtype': None,
-        'split_with_sizes': None,
-        'trapezoid': None,
-        'eq': None,
-        'mul': None,
-        'cartesian_prod': None,
-        'bool': None,
-        'inner': None,
-        'dstack': None,
-        'take_along_dim': None,
+        'nn.functional.softminwith_dtype': [torch.bool,
+                                            torch.float16,
+                                            torch.float32,
+                                            torch.int16,
+                                            torch.int32,
+                                            torch.int64,
+                                            torch.uint8],
+        'log_softmaxwith_dtype': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'softmaxwith_dtype': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        '__rmatmul__': [torch.int16, torch.int32, torch.uint8],
+        'addmmdecomposed': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'addbmm': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'addmm': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'addr': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'addmv': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'baddbmm': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'bmm': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'cdouble': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'cfloat': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'complex': [torch.float16, torch.float32],
+        'double': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'einsum': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.fft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.fft2': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.fftn': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.fftshift': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.hfft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.hfft2': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.hfftn': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.ifft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.ifft2': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.ifftn': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.ifftshift': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.ihfft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.ihfft2': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.ihfftn': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.irfft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.irfft2': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.irfftn': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.rfft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'float_power': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'full': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'full_like': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'inner': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'linalg.matrix_rank': [torch.float32],
+        'linalg.matrix_rankhermitian': [torch.float32],
+        'linalg.multi_dot': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'linalg.pinv': [torch.float32],
+        'linalg.pinvhermitian': [torch.float32],
+        'log_softmax': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'matmul': [torch.int16, torch.int32, torch.int64, torch.uint8],  # MPS device does not support mm for non-float inputs
+        'mm': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'mv': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'new_full': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'new_ones': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'new_zeros': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'nn.functional.batch_norm': [torch.float32],
+        'nn.functional.bilinear': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'nn.functional.linear': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'nn.functional.softmin': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'ones_like': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'signal.windows.blackman': [torch.float16],
+        'signal.windows.cosine': [torch.float16],
+        'signal.windows.exponential': [torch.float16],
+        'signal.windows.gaussian': [torch.float16],
+        'signal.windows.general_cosine': [torch.float16],
+        'signal.windows.general_hamming': [torch.float16],
+        'signal.windows.hamming': [torch.float16],
+        'signal.windows.hann': [torch.float16],
+        'signal.windows.kaiser': [torch.float16],
+        'stft': [torch.float32],
+        'tensordot': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'zeros_like': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'bincount': [torch.int16, torch.int32, torch.int64, torch.uint8],
     }
 
-    # Those ops worked on MacOS12, but broken on MacOS13
-    VENTURA_BLOCKLIST = {
-        'masked.softmax': [torch.float32],
+    UNDEFINED_BEHAVIOUR = {
+        # Failures due to random output that they generate using
+        # Philox engine causing mismatch with CPU results
+        'uniform': [torch.float16, torch.float32],
+        'rand_like': [torch.float16, torch.float32],
+        'randint_like': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'randn_like': [torch.float16, torch.float32],
+        'bernoulli': [torch.float32],
+        'nn.functional.feature_alpha_dropoutwith_train': [torch.float32],
+        'normal': [torch.float16, torch.float32, torch.float16, torch.float32],
+        'normalnumber_mean': [torch.float16, torch.float32],
+        'nn.functional.alpha_dropout': [torch.float32],
+        'nn.functional.dropout': [torch.float32],
+        'nn.functional.dropout2d': [torch.float32],
+        'nn.functional.dropout3d': [torch.float32],
+        # these fill tensors with uninitialized data, causing mismatch with CPU
+        'new_empty': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'empty_like': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'empty': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'new_empty_strided': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        # problem 103190467, as_strided_scatter has non-deterministic behavior when the update indices are not unique
+        'as_strided_scatter': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        # duplicate indices are used in the testcase - undefined behaviour
+        'index_put': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        # problem 104760543, zero to negative integer powers are undefined
+        '__rpow__': [torch.int16, torch.int32, torch.int64],
+    }
+
+    FAST_MATH_PRECISION_ISSUES = {
+        # Failures due to precision issues
+        'tan': [torch.float32],
+        'pow': [torch.float32],
         'masked.softmin': [torch.float32],
+        'masked.softmax': [torch.float32],
         'masked.log_softmax': [torch.float32],
-        'dot': [torch.int64],
+        'cdist': [torch.float32],
+        '__rpow__': [torch.float32]
     }
 
     FP16_LOW_PRECISION_LIST = {
@@ -9638,14 +10590,59 @@ class TestConsistency(TestCaseMPS):
         'true_divide', 'kron',
         'gradient', 'var', 'std',
         'linalg.vector_norm',
-        'masked.sum', 'masked.std',
-        'masked.var',
+        'addr',
+
+        # for macOS 12
+        'masked.normalize', 'masked.sum', 'masked.var',
+        'outer',
+        'sum_to_size', 'sum',
+        'mul',
+    }
+
+    BLOCKLIST_MACOS_12 = {
+        # expected failures
+        'nn.functional.interpolatenearest': [torch.float32],
+        'nn.functional.upsample_nearest': [torch.float32],
+        'nn.functional.conv_transpose2d': [torch.float32]
     }
 
+    ALLOWLIST_MACOS_13_3 = {
+        'pow': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        '__rpow__': [torch.uint8],
+        'nn.functional.conv_transpose2d': [torch.float32],
+    }
+
+    MPS_SKIP_LIST = reduce(lambda x, y: dict(x, **y), (
+        FAST_MATH_PRECISION_ISSUES, BLOCKLIST, UNDEFINED_BEHAVIOUR, EXPECTED_FAILURES, UNIMPLEMENTED_OPS))
+
     # Used for accept mode only
     NEW_ALLOW_LIST = defaultdict(list)
     NEW_ALLOW_LIST_GRAD = defaultdict(list)
 
+    def get_error_message(self, key, op_name, dtype):
+        if key in self.FAST_MATH_PRECISION_ISSUES and dtype in self.FAST_MATH_PRECISION_ISSUES[key]:
+            return f"Running test with {op_name} fails due to precision issues (fast math) so skipping"
+        elif key in self.BLOCKLIST and dtype in self.BLOCKLIST[key]:
+            return f"Running test with {op_name} fails so skipping"
+        elif key in self.UNDEFINED_BEHAVIOUR and dtype in self.UNDEFINED_BEHAVIOUR[key]:
+            return f"Running test with {op_name} fails due to undefined behaviour / random output so skipping"
+        elif key in self.EXPECTED_FAILURES and dtype in self.EXPECTED_FAILURES[key]:
+            return f"Running test with {op_name} expected to fail due to unsupported MPS data type so skipping"
+        elif key in self.UNIMPLEMENTED_OPS and dtype in self.UNIMPLEMENTED_OPS[key]:
+            return f"Running test with {op_name} expected to fail due to missing op implementation"
+        elif product_version < 13.0 and key in self.BLOCKLIST_MACOS_12 and dtype in self.BLOCKLIST_MACOS_12[key]:
+            return f"Running test with {op_name} expected to fail on macOS 12"
+        return None
+
+    def compare_with_CUDA(self, op, mps_out, atol, rtol):
+        cuda_out = CUDA_RESULT[op.name]
+        try:
+            self.assertEqual(cuda_out, mps_out, atol=atol, rtol=rtol)
+        except Exception as e:
+            return False
+        else:
+            return True
+
     @ops(op_db, allowed_dtypes=MPS_DTYPES)
     def test_output_match(self, device, dtype, op):
         self.assertEqual(device, "cpu")
@@ -9653,13 +10650,15 @@ def test_output_match(self, device, dtype, op):
             self.skipTest("MPS is not available")
 
         key = op.name + op.variant_test_name
-
-        if key in self.VENTURA_BLOCKLIST and torch.backends.mps.is_macos13_or_newer():
-            if dtype in self.VENTURA_BLOCKLIST[key]:
-                self.skipTest(f"{key}_{dtype} fails on Ventura, see https://github.com/pytorch/pytorch/issues/85758")
-        if key in self.BLOCKLIST:
-            if self.BLOCKLIST[key] is None or dtype in self.BLOCKLIST[key]:
-                self.skipTest(f"Running test with {op.name} hangs so skipping")
+        if key in self.MPS_SKIP_LIST:
+            msg = self.get_error_message(key, op.name, dtype)
+            if msg is not None and not (product_version >= 13.3 and
+                                        key in self.ALLOWLIST_MACOS_13_3 and dtype in self.ALLOWLIST_MACOS_13_3[key]):
+                self.skipTest(msg)
+        if product_version < 13.0 and key in self.BLOCKLIST_MACOS_12:
+            msg = self.get_error_message(key, op.name, dtype)
+            if msg is not None:
+                self.skipTest(msg)
 
         # Make this an expecttest manually
         # When this env variable is set, generate a new ALLOWLIST_OP
@@ -9677,7 +10676,10 @@ def test_output_match(self, device, dtype, op):
                 if dtype_abbrs[dtype] not in self.ALLOWLIST_OP[op.name]:
                     self.skipTest(f"{op.name} is in the allow list for MPS but {dtype} is excluded")
 
-            if op.name not in self.ALLOWLIST_OP_GRAD or dtype_abbrs[dtype] not in self.ALLOWLIST_OP_GRAD[op.name]:
+            if (op.name not in self.ALLOWLIST_OP_GRAD or dtype_abbrs[dtype] not in self.ALLOWLIST_OP_GRAD[op.name] or
+               (op.name in self.BLOCKLIST_OP_GRAD and dtype_abbrs[dtype] in self.BLOCKLIST_OP_GRAD[op.name]) or
+               (product_version < 13.0 and op.name in self.BLOCKLIST_OP_GRAD_MACOS_12 and
+               dtype_abbrs[dtype] in self.BLOCKLIST_OP_GRAD_MACOS_12[op.name])):
                 run_grad_test = False
 
         def get_samples():
@@ -9709,7 +10711,7 @@ def get_samples():
                 cpu_out = op(*cpu_args, **cpu_kwargs)
                 mps_out = op(*mps_args, **mps_kwargs)
 
-                if op.name == "nn.functional.conv2d" and dtype == torch.float32:
+                if op.name == "nn.functional.conv2d" or op.name == "linalg.multi_dot" and dtype == torch.float32:
                     atol = 1e-4
                     rtol = 3e-5
                 elif (op.name in self.FP16_LOW_PRECISION_LIST) and dtype == torch.float16:
@@ -9721,6 +10723,11 @@ def get_samples():
                 elif (op.name == "native_layer_norm"):
                     atol = 1e-4
                     rtol = 1.3e-5
+                elif op.name == "norm" and dtype == torch.float16:
+                    atol = 7e-4
+                    rtol = 1.5e-3
+                elif op.name == "unique" and cpu_kwargs["sorted"] is False:
+                    continue
                 else:
                     atol = None
                     rtol = None
@@ -9731,6 +10738,9 @@ def get_samples():
                 if any(s in str(e).lower() for s in ["int64", "macos 13", "adaptive pool mps"]):
                     self.skipTest(f"Expected Runtime Error: {str(e)}")
 
+                if op.name in CUDA_RESULT and self.compare_with_CUDA(op, mps_out, atol=atol, rtol=rtol):
+                    continue
+
                 if not generate_new_truth:
                     raise e
                 forward_failed = True
@@ -9808,6 +10818,12 @@ def req_grad(t):
 # Copied from `TestCommon` in `test_ops.py`, just enough to duplicate the `test_numpy_ref` for MPS
 @skipIfSlowGradcheckEnv
 class TestCommon(TestCase):
+
+    UNIMPLEMENTED_OPS = {
+        'aminmax': [torch.float32],
+        'roll': [torch.float32],
+    }
+
     exact_dtype = True
 
     # Verifies, on teardown, that no OpInfo is still using dynamic dtypes in CI
@@ -9838,6 +10854,10 @@ def tearDownClass(cls):
     # MPS only supports float32
     @ops(_ref_test_ops, allowed_dtypes=(torch.float32,))
     def test_numpy_ref_mps(self, device, dtype, op):
+        key = op.name + op.variant_test_name
+        if key in self.UNIMPLEMENTED_OPS and dtype in self.UNIMPLEMENTED_OPS[key]:
+            self.skipTest(f"Running test with {op.name} expected to fail due to missing op implementation")
+
         # Unlike `test_numpy_ref`, this test compares in `float32` since at the time of this test's creation MPS
         # does not support float64 Tensors.
         # A few ops are currently broken on their reference inputs, but not their sample inputs. These should
diff --git a/torch/testing/_internal/common_device_type.py b/torch/testing/_internal/common_device_type.py
index 8e34ec10a8350..75e87155c7ca0 100644
--- a/torch/testing/_internal/common_device_type.py
+++ b/torch/testing/_internal/common_device_type.py
@@ -13,7 +13,7 @@
 import torch.backends.mps
 from torch.testing._internal.common_utils import TestCase, TEST_WITH_ROCM, TEST_MKL, \
     skipCUDANonDefaultStreamIf, TEST_WITH_ASAN, TEST_WITH_UBSAN, TEST_WITH_TSAN, \
-    IS_SANDCASTLE, IS_FBCODE, IS_REMOTE_GPU, IS_WINDOWS, \
+    IS_SANDCASTLE, IS_FBCODE, IS_REMOTE_GPU, IS_WINDOWS, TEST_WITH_MPS, \
     _TestParametrizer, compose_parametrize_fns, dtype_name, \
     NATIVE_DEVICES, skipIfTorchDynamo
 from torch.testing._internal.common_cuda import _get_torch_cuda_version, \
@@ -555,10 +555,8 @@ def get_device_type_test_bases():
         test_bases.append(CPUTestBase)
         if torch.cuda.is_available():
             test_bases.append(CUDATestBase)
-        # Disable MPS testing in generic device testing temporarily while we're
-        # ramping up support.
-        # elif torch.backends.mps.is_available():
-        #   test_bases.append(MPSTestBase)
+        elif torch.backends.mps.is_available():
+          test_bases.append(MPSTestBase)
 
     return test_bases
 
diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index 03193f5ed7b27..66466c56aa3a9 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -896,6 +896,7 @@ def _check_module_exists(name: str) -> bool:
 TEST_WITH_TSAN = os.getenv('PYTORCH_TEST_WITH_TSAN', '0') == '1'
 TEST_WITH_UBSAN = os.getenv('PYTORCH_TEST_WITH_UBSAN', '0') == '1'
 TEST_WITH_ROCM = os.getenv('PYTORCH_TEST_WITH_ROCM', '0') == '1'
+TEST_WITH_MPS = os.getenv('PYTORCH_TEST_WITH_MPS', '0') == '1'
 
 # Enables tests that are slow to run (disabled by default)
 TEST_WITH_SLOW = os.getenv('PYTORCH_TEST_WITH_SLOW', '0') == '1'