From 3dfcfb5829e9928f7993a58c9780f5425ffd7fbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Wed, 19 Nov 2025 14:13:10 +0000 Subject: [PATCH 1/2] Copy Torch from hf-nix We have decided to make kernel-builder standalone to make it easier to maintain Torch versions as part of the same repo. This copies over Torch from hf-nix. Other bits are still needed, but let's start somewhere. --- overlay.nix | 20 + pkgs/python-modules/torch/archs.nix | 128 +++ pkgs/python-modules/torch/binary/default.nix | 43 + .../torch/binary/generate_torch_versions.py | 311 +++++++ pkgs/python-modules/torch/binary/generic.nix | 336 ++++++++ .../torch/binary/torch-versions-hash.json | 128 +++ .../torch/binary/torch-versions.json | 99 +++ .../source/2_8/0001-patch-xpu-ops-CMake.patch | 50 ++ .../2_8/cmake-load-hip-invalid-state.diff | 40 + .../torch/source/2_8/default.nix | 761 ++++++++++++++++++ .../source/2_8/fix-cmake-cuda-toolkit.patch | 51 ++ .../torch/source/2_8/mkl-rpath.patch | 17 + .../source/2_8/pthreadpool-disable-gcd.diff | 45 ++ .../torch/source/2_8/pytorch-pr-108847.patch | 31 + .../source/2_9/0001-patch-xpu-ops-CMake.patch | 50 ++ .../torch/source/2_9/default.nix | 744 +++++++++++++++++ .../source/2_9/fix-cmake-cuda-toolkit.patch | 51 ++ .../torch/source/2_9/mkl-rpath.patch | 17 + .../source/2_9/pthreadpool-disable-gcd.diff | 45 ++ .../torch/source/2_9/pytorch-pr-108847.patch | 31 + 20 files changed, 2998 insertions(+) create mode 100644 pkgs/python-modules/torch/archs.nix create mode 100644 pkgs/python-modules/torch/binary/default.nix create mode 100755 pkgs/python-modules/torch/binary/generate_torch_versions.py create mode 100644 pkgs/python-modules/torch/binary/generic.nix create mode 100644 pkgs/python-modules/torch/binary/torch-versions-hash.json create mode 100644 pkgs/python-modules/torch/binary/torch-versions.json create mode 100644 pkgs/python-modules/torch/source/2_8/0001-patch-xpu-ops-CMake.patch create mode 100644 pkgs/python-modules/torch/source/2_8/cmake-load-hip-invalid-state.diff create mode 100644 pkgs/python-modules/torch/source/2_8/default.nix create mode 100644 pkgs/python-modules/torch/source/2_8/fix-cmake-cuda-toolkit.patch create mode 100644 pkgs/python-modules/torch/source/2_8/mkl-rpath.patch create mode 100644 pkgs/python-modules/torch/source/2_8/pthreadpool-disable-gcd.diff create mode 100644 pkgs/python-modules/torch/source/2_8/pytorch-pr-108847.patch create mode 100644 pkgs/python-modules/torch/source/2_9/0001-patch-xpu-ops-CMake.patch create mode 100644 pkgs/python-modules/torch/source/2_9/default.nix create mode 100644 pkgs/python-modules/torch/source/2_9/fix-cmake-cuda-toolkit.patch create mode 100644 pkgs/python-modules/torch/source/2_9/mkl-rpath.patch create mode 100644 pkgs/python-modules/torch/source/2_9/pthreadpool-disable-gcd.diff create mode 100644 pkgs/python-modules/torch/source/2_9/pytorch-pr-108847.patch diff --git a/overlay.nix b/overlay.nix index f9cb0467..b60a54ce 100644 --- a/overlay.nix +++ b/overlay.nix @@ -50,6 +50,26 @@ final: prev: { }); pyclibrary = python-self.callPackage ./pkgs/python-modules/pyclibrary { }; + + mkTorch = callPackage ./pkgs/python-modules/torch/binary { }; + + torch-bin_2_8 = mkTorch { + version = "2.8"; + xpuPackages = final.xpuPackages_2025_1; + }; + + torch-bin_2_9 = mkTorch { + version = "2.9"; + xpuPackages = final.xpuPackages_2025_2; + }; + + torch_2_8 = callPackage ./pkgs/python-modules/torch/source/2_8 { + xpuPackages = final.xpuPackages_2025_1; + }; + + torch_2_9 = callPackage 
./pkgs/python-modules/torch/source/2_9 { + xpuPackages = final.xpuPackages_2025_2; + }; } ) ]; diff --git a/pkgs/python-modules/torch/archs.nix b/pkgs/python-modules/torch/archs.nix new file mode 100644 index 00000000..3dd93448 --- /dev/null +++ b/pkgs/python-modules/torch/archs.nix @@ -0,0 +1,128 @@ +{ + "2.8" = { + # https://github.com/pytorch/pytorch/blob/release/2.8/.ci/manywheel/build_cuda.sh + capsPerCudaVersion = { + "12.9" = [ + "7.0" + "7.5" + "8.0" + "8.6" + "9.0" + "10.0" + "12.0" + ]; + "12.8" = [ + "7.0" + "7.5" + "8.0" + "8.6" + "9.0" + "10.0" + "12.0" + ]; + "12.6" = [ + "5.0" + "6.0" + "7.0" + "7.5" + "8.0" + "8.6" + "9.0" + ]; + # Not a supported upstream configuration, but keep it around for + # builds that fail on newer CUDA versions. + "12.4" = [ + "5.0" + "6.0" + "7.0" + "7.5" + "8.0" + "8.6" + "9.0" + ]; + }; + # https://github.com/pytorch/pytorch/blob/ba56102387ef21a3b04b357e5b183d48f0afefc7/.ci/docker/manywheel/build.sh#L82 + supportedTorchRocmArchs = [ + "gfx900" + "gfx906" + "gfx908" + "gfx90a" + "gfx942" + "gfx1030" + "gfx1100" + "gfx1101" + "gfx1102" + "gfx1200" + "gfx1201" + ]; + }; + + "2.9" = { + # https://github.com/pytorch/pytorch/blob/release/2.9/.ci/manywheel/build_cuda.sh + capsPerCudaVersion = { + "13.0" = [ + "7.5" + "8.0" + "8.6" + "9.0" + "10.0" + "12.0" + ]; + # NOTE: 12.9 does not seem to be in RC builds, check if needed for final release. + # https://download.pytorch.org/whl/test/torch/ + "12.9" = [ + "7.0" + "7.5" + "8.0" + "8.6" + "9.0" + "10.0" + "12.0" + ]; + "12.8" = [ + "7.0" + "7.5" + "8.0" + "8.6" + "9.0" + "10.0" + "12.0" + ]; + "12.6" = [ + "5.0" + "6.0" + "7.0" + "7.5" + "8.0" + "8.6" + "9.0" + ]; + # Not a supported upstream configuration, but keep it around for + # builds that fail on newer CUDA versions. + "12.4" = [ + "5.0" + "6.0" + "7.0" + "7.5" + "8.0" + "8.6" + "9.0" + ]; + }; + + supportedTorchRocmArchs = [ + # https://github.com/pytorch/pytorch/blob/21fec65781bebe867faf209f89bb687ffd236ca4/.ci/docker/manywheel/build.sh#L92 + "gfx900" + "gfx906" + "gfx908" + "gfx90a" + "gfx942" + "gfx1030" + "gfx1100" + "gfx1101" + "gfx1102" + "gfx1200" + "gfx1201" + ]; + }; +} diff --git a/pkgs/python-modules/torch/binary/default.nix b/pkgs/python-modules/torch/binary/default.nix new file mode 100644 index 00000000..6ef0127a --- /dev/null +++ b/pkgs/python-modules/torch/binary/default.nix @@ -0,0 +1,43 @@ +{ + config, + lib, + stdenv, + + cudaSupport ? config.cudaSupport, + rocmSupport ? config.rocmSupport, + xpuSupport ? (config.xpuSupport or false), + + callPackage, + cudaPackages, + rocmPackages, +}: + +{ + xpuPackages, + version, +}: + +let + system = stdenv.hostPlatform.system; + flattenVersion = version: lib.replaceStrings [ "." 
] [ "" ] (lib.versions.pad 2 version); + framework = + if cudaSupport then + "cu${flattenVersion cudaPackages.cudaMajorMinorVersion}" + else if rocmSupport then + "rocm${flattenVersion (lib.versions.majorMinor rocmPackages.rocm.version)}" + else if xpuSupport then + "xpu" + else + "cpu"; + torchVersions = builtins.fromJSON (builtins.readFile ./torch-versions-hash.json); + torchBySystem = torchVersions.${version} or (throw "Unsupported torch version: ${version}"); + torchByFramework = + torchBySystem.${system} or (throw "Unsupported system: ${system} for torch version: ${version}"); + urlHash = + torchByFramework.${framework} + or (throw "Unsupported framework: ${framework} for torch version: ${version} on system: ${system}"); +in +callPackage ./generic.nix { + inherit xpuPackages; + inherit (urlHash) url hash version; +} diff --git a/pkgs/python-modules/torch/binary/generate_torch_versions.py b/pkgs/python-modules/torch/binary/generate_torch_versions.py new file mode 100755 index 00000000..26da246e --- /dev/null +++ b/pkgs/python-modules/torch/binary/generate_torch_versions.py @@ -0,0 +1,311 @@ +#!/usr/bin/env nix-shell +#!nix-shell -i python3 -p python3 +""" +Script to generate torch-versions-hash.json from torch-versions.json + +This script downloads all the variants that are specified and computes +their Nix store hashes. Variants for which the hash was already computed +will not be proccessed again to avoid redownloading/hashing. +""" + +import json +import subprocess +import sys +import urllib.parse +from typing import Dict +import os + +OUTPUT_FILE = "torch-versions-hash.json" + +PYTHON_VERSION = "cp313" + + +def load_existing_hashes() -> Dict[str, str]: + """Load existing URL -> hash mappings from output file""" + if os.path.exists(OUTPUT_FILE): + try: + with open(OUTPUT_FILE, "r") as f: + data = json.load(f) + url_to_hash = {} + for version_data in data.values(): + for system_data in version_data.values(): + for framework_data in system_data.values(): + if ( + isinstance(framework_data, dict) + and "url" in framework_data + and "hash" in framework_data + ): + if framework_data["hash"]: + url_to_hash[framework_data["url"]] = framework_data[ + "hash" + ] + return url_to_hash + except (json.JSONDecodeError, IOError) as e: + # If we fail to parse the file, emit a warning and start from scratch. 
+ print( + f"Warning: Could not load existing {OUTPUT_FILE}: {e}", file=sys.stderr + ) + return {} + + +def cuda_version_to_framework(cuda_version: str) -> str: + """Convert CUDA version like '11.8' to framework identifier like 'cu118'""" + return f"cu{cuda_version.replace('.', '')}" + + +def rocm_version_to_framework(rocm_version: str) -> str: + """Convert ROCm version like '6.3' to framework identifier like 'rocm6.3'""" + return f"rocm{rocm_version}" + + +def version_to_major_minor(version: str) -> str: + """Convert version like '2.8.0' to '2.8'""" + parts = version.split(".") + if len(parts) >= 2: + return f"{parts[0]}.{parts[1]}" + return version + + +def system_to_platform(system: str, framework_type: str | None = None) -> str: + """Convert system identifier to platform string for wheel naming""" + # XPU uses a different platform format + if framework_type == "xpu": + xpu_platform_map = { + "x86_64-linux": "linux_x86_64", + } + return xpu_platform_map.get(system, system) + + platform_map = { + "x86_64-linux": "manylinux_2_28_x86_64", + "aarch64-linux": "manylinux_2_28_aarch64", + "aarch64-darwin": "macosx_11_0_arm64", + } + return platform_map.get(system, system) + + +def generate_pytorch_url( + torch_version: str, + framework_version: str, + framework_type: str, + python_version: str, + system: str, +) -> str: + """Generate PyTorch wheel download URL.""" + platform = system_to_platform(system, framework_type) + + # macOS uses CPU wheels (no CUDA/ROCm/XPU support) + if "darwin" in system: + framework_dir = "cpu" + version_part = torch_version + abi_tag = "none" # macOS wheels carry no ABI tag + wheel_name = f"torch-{version_part}-{python_version}-{abi_tag}-{platform}.whl" + elif framework_type == "cpu": + framework_dir = "cpu" + version_part = f"{torch_version}%2Bcpu" + abi_tag = python_version + wheel_name = f"torch-{version_part}-{python_version}-{abi_tag}-{platform}.whl" + elif framework_type == "xpu": + framework = "xpu" + framework_dir = framework + version_part = f"{torch_version}%2B{framework}" + abi_tag = python_version + wheel_name = f"torch-{version_part}-{python_version}-{abi_tag}-{platform}.whl" + else: + if framework_type == "cuda": + framework = cuda_version_to_framework(framework_version) + elif framework_type == "rocm": + framework = rocm_version_to_framework(framework_version) + else: + raise ValueError(f"Unsupported framework type: {framework_type}") + + framework_dir = framework + version_part = f"{torch_version}%2B{framework}" + abi_tag = python_version + wheel_name = f"torch-{version_part}-{python_version}-{abi_tag}-{platform}.whl" + + return f"https://download.pytorch.org/whl/{framework_dir}/{wheel_name}" + + +def compute_nix_hash(url: str) -> str: + """Prefetch the URL into the Nix store and return its hash in SRI form.""" + try: + print(f"Fetching hash for: {url}") + + # Some URL encodings are not valid in store paths, so unquote. + filename = url.split("/")[-1] + clean_filename = urllib.parse.unquote(filename) + + result = subprocess.run( + ["nix-prefetch-url", "--type", "sha256", "--name", clean_filename, url], + check=True, + capture_output=True, + text=True, + ) + base32_hash = result.stdout.strip() + + # Convert base32 hash to SRI format.
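+ # (fetchurl's "hash" attribute expects the SRI form, e.g. "sha256-<base64>", while nix-prefetch-url prints the older nix32/base32 form, hence the conversion below.)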
+ convert_result = subprocess.run( + [ + "nix", + "hash", + "convert", + "--hash-algo", + "sha256", + "--from", + "nix32", + base32_hash, + ], + check=True, + capture_output=True, + text=True, + ) + return convert_result.stdout.strip() + except subprocess.CalledProcessError as e: + print(f"Error computing hash for {url}: {e.stderr}", file=sys.stderr) + sys.exit(1) + except FileNotFoundError as e: + if "nix-prefetch-url" in str(e): + print( + "Error: nix-prefetch-url not found. Please ensure Nix is installed.", + file=sys.stderr, + ) + sys.exit(1) + else: + print( + "Error: nix command not found. Please ensure Nix is installed.", + file=sys.stderr, + ) + sys.exit(1) + + +def main(): + existing_hashes = load_existing_hashes() + cache_hits = 0 + cache_misses = 0 + + try: + with open("torch-versions.json", "r") as f: + torch_versions = json.load(f) + except FileNotFoundError: + print("Error: torch-versions.json not found", file=sys.stderr) + sys.exit(1) + except json.JSONDecodeError as e: + print(f"Error parsing torch-versions.json: {e}", file=sys.stderr) + sys.exit(1) + + urls_hashes = {} + + print(f"Processing {len(torch_versions)} entries from torch-versions.json") + print(f"Found {len(existing_hashes)} existing hashes") + + for entry in torch_versions: + torch_version = entry.get("torchVersion") + cuda_version = entry.get("cudaVersion") + rocm_version = entry.get("rocmVersion") + xpu_version = entry.get("xpuVersion") + cpu = entry.get("cpu", False) + metal = entry.get("metal", False) + systems = entry.get("systems", []) + + if not torch_version: + print(f"Skipping entry without torchVersion: {entry}", file=sys.stderr) + continue + + version_key = version_to_major_minor(torch_version) + + if cuda_version: + framework_type = "cuda" + framework_version = cuda_version + print(f"Processing torch {torch_version} with CUDA {cuda_version}") + elif rocm_version: + framework_type = "rocm" + framework_version = rocm_version + print(f"Processing torch {torch_version} with ROCm {rocm_version}") + elif xpu_version: + framework_type = "xpu" + framework_version = xpu_version + print(f"Processing torch {torch_version} with XPU {xpu_version}") + elif cpu: + framework_type = "cpu" + framework_version = "cpu" + print(f"Processing torch {torch_version} (CPU build)") + elif metal: + framework_type = "cpu" + framework_version = "cpu" + print( + f"Processing torch {torch_version} (CPU-only build with Metal support)" + ) + else: + print( + f"Skipping entry without framework specification: {entry}", + file=sys.stderr, + ) + continue + + if version_key not in urls_hashes: + urls_hashes[version_key] = {} + + for system in systems: + print(f" Processing system: {system}") + + if system not in urls_hashes[version_key]: + urls_hashes[version_key][system] = {} + + if "darwin" in system: + framework = "cpu" + else: + if framework_type == "cuda": + framework = cuda_version_to_framework(framework_version) + elif framework_type == "rocm": + framework = rocm_version_to_framework(framework_version) + elif framework_type == "xpu": + framework = "xpu" + elif framework_type == "cpu": + framework = "cpu" + else: + print( + f" ⚠️ Warning: Unknown framework type {framework_type} for Linux system {system}", + file=sys.stderr, + ) + continue + + url = generate_pytorch_url( + torch_version, framework_version, framework_type, PYTHON_VERSION, system + ) + print(f" URL: {url}") + + was_cached = url in existing_hashes + if was_cached: + hash_value = existing_hashes[url] + else: + hash_value = compute_nix_hash(url) + + if was_cached: + 
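# Track how many downloads the cache avoided; the totals are reported at the end of the run. +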
cache_hits += 1 + else: + cache_misses += 1 + + urls_hashes[version_key][system][framework.replace(".", "")] = { + "url": url, + "hash": hash_value, + "version": torch_version, + } + + print(f" Hash: {hash_value}") + + try: + with open(OUTPUT_FILE, "w") as f: + json.dump(urls_hashes, f, indent=2) + print(f"Successfully generated {OUTPUT_FILE}") + except IOError as e: + print(f"Error writing {OUTPUT_FILE}: {e}", file=sys.stderr) + sys.exit(1) + + total_urls = cache_hits + cache_misses + if total_urls > 0: + print( + f"Cache statistics: {cache_hits}/{total_urls} hits ({cache_hits/total_urls*100:.1f}% hit rate)" + ) + + +if __name__ == "__main__": + main() diff --git a/pkgs/python-modules/torch/binary/generic.nix b/pkgs/python-modules/torch/binary/generic.nix new file mode 100644 index 00000000..8acd481c --- /dev/null +++ b/pkgs/python-modules/torch/binary/generic.nix @@ -0,0 +1,336 @@ +{ + config, + lib, + stdenv, + symlinkJoin, + buildPythonPackage, + fetchurl, + + cudaSupport ? config.cudaSupport, + rocmSupport ? config.rocmSupport, + tritonSupport ? (!stdenv.hostPlatform.isDarwin), + xpuSupport ? (config.xpuSupport or false), + + # Native build inputs + autoAddDriverRunpath, + autoPatchelfHook, + python, + pythonRelaxWheelDepsHook, + pythonWheelDepsCheckHook, + + # Build inputs + cudaPackages, + rocmPackages, + xpuPackages, + + # Python dependencies + filelock, + fsspec, + jinja2, + networkx, + numpy, + pyyaml, + requests, + setuptools, + sympy, + triton, + triton-cuda, + typing-extensions, + + url, + hash, + version, + # TODO: remove; only kept for compatibility. + cxx11Abi ? true, + + effectiveStdenv ? if cudaSupport then cudaPackages.backendStdenv else stdenv, +}: +let + effectiveTriton = + if cudaSupport then + triton-cuda + else if xpuSupport then + python.pkgs.triton-xpu_2_8 + else + triton; + + archs = (import ../archs.nix).${lib.versions.majorMinor version}; + + supportedTorchCudaCapabilities = + let + inherit (archs) capsPerCudaVersion; + real = capsPerCudaVersion."${lib.versions.majorMinor cudaPackages.cudaMajorMinorVersion}"; + ptx = lib.map (x: "${x}+PTX") real; + in + real ++ ptx; + supportedCudaCapabilities = lib.intersectLists cudaPackages.flags.cudaCapabilities supportedTorchCudaCapabilities; + inherit (archs) supportedTorchRocmArchs; + + aotritonVersions = with rocmPackages; { + "2.8" = aotriton_0_10; + "2.9" = aotriton_0_11; + }; + + aotriton = + let + torchMajorMinor = lib.versions.majorMinor version; + in + aotritonVersions.${torchMajorMinor} + or (throw "aotriton version is not specified for Torch ${torchMajorMinor}"); + + rocmtoolkit_joined = symlinkJoin { + name = "rocm-merged"; + + paths = with rocmPackages; [ + aotriton + clr + comgr + hipblas + hipblas-common-devel + hipblaslt + hipfft + hipify-clang + hiprand + hipsolver + hipsparse + hipsparselt + hsa-rocr + miopen-hip + rccl + rocblas + rocm-core + rocm-device-libs + rocm-hip-runtime + rocm-smi-lib + rocminfo + rocrand + rocsolver + rocsparse + roctracer + ]; + + postBuild = '' + # Fix `setuptools` not being found + rm -rf $out/nix-support + + # Variables that we want to pass through to downstream derivations.
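+ # (stdenv sources nix-support/setup-hook from every build input, which is how these exports reach dependent derivations.)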
+ mkdir -p $out/nix-support + echo 'export ROCM_PATH="${placeholder "out"}"' >> $out/nix-support/setup-hook + echo 'export ROCM_SOURCE_DIR="${placeholder "out"}"' >> $out/nix-support/setup-hook + echo 'export CMAKE_CXX_FLAGS="-I${placeholder "out"}/include -I${placeholder "out"}/include/rocblas"' >> $out/nix-support/setup-hook + ''; + }; + +in +buildPythonPackage { + pname = "torch"; + inherit version; + format = "wheel"; + + stdenv = effectiveStdenv; + + outputs = [ + "out" # output standard python package + "cxxdev" # propagated deps for the cmake consumers of torch + ]; + cudaPropagateToOutput = "cxxdev"; + rocmPropagateToOutput = "cxxdev"; + + src = fetchurl { + inherit url hash; + }; + + nativeBuildInputs = [ + pythonRelaxWheelDepsHook + pythonWheelDepsCheckHook + ] + ++ lib.optionals stdenv.hostPlatform.isLinux [ + autoPatchelfHook + ] + ++ lib.optionals cudaSupport [ + autoAddDriverRunpath + cudaPackages.setupCudaHook + ] + ++ lib.optionals rocmSupport [ + rocmPackages.setupRocmHook + ]; + + buildInputs = + lib.optionals cudaSupport ( + with cudaPackages; + [ + # Use lib output to avoid libcuda.so.1 stub getting used. + cuda_cudart + cuda_cupti + cuda_nvrtc + cudnn + libcublas + libcufft + libcufile + libcurand + libcusolver + libcusparse + libcusparse_lt + nccl + ] + ) + ++ lib.optionals (cudaSupport && lib.versionAtLeast version "2.9") [ + cudaPackages.libnvshmem + ] + ++ lib.optionals rocmSupport ([ + rocmtoolkit_joined + ]) + ++ lib.optionals xpuSupport ( + with xpuPackages; + [ + intel-oneapi-ccl + intel-oneapi-compiler-dpcpp-cpp-runtime + intel-oneapi-compiler-shared-runtime + intel-oneapi-mkl-core + intel-oneapi-mkl-sycl-blas + intel-oneapi-mkl-sycl-dft + intel-oneapi-mkl-sycl-lapack + intel-oneapi-mpi + intel-pti + ] + ); + + dependencies = [ + filelock + fsspec + jinja2 + networkx + numpy + pyyaml + requests + setuptools + sympy + typing-extensions + ] + ++ lib.optionals tritonSupport [ + effectiveTriton + ]; + + pythonRelaxWheelDeps = [ + "sympy" + "triton" + ]; + + # These are framework dependencies that are normally installed as Python + # dependencies. We neither need nor provide them, because we burn + # the Nix store paths of the frameworks into the Torch libraries. + pythonRemoveWheelDeps = + lib.optionals cudaSupport [ + "nvidia-cuda-runtime" + "nvidia-cuda-nvrtc" + "nvidia-cuda-cupti" + "nvidia-cudnn" + "nvidia-cublas" + "nvidia-cufft" + "nvidia-curand" + "nvidia-cusolver" + "nvidia-cusparse" + "nvidia-cusparselt" + "nvidia-nccl" + "nvidia-nvshmem" + "nvidia-nvtx" + "nvidia-nvjitlink" + "nvidia-cufile" + ] + ++ lib.optionals rocmSupport [ + "pytorch-triton-rocm" + ] + ++ lib.optionals xpuSupport [ + "intel-cmplr-lib-rt" + "intel-cmplr-lib-ur" + "intel-cmplr-lic-rt" + "intel-sycl-rt" + "oneccl-devel" + "oneccl" + "impi-rt" + "onemkl-sycl-blas" + "onemkl-sycl-dft" + "onemkl-sycl-lapack" + "onemkl-sycl-rng" + "onemkl-sycl-sparse" + "dpcpp-cpp-rt" + "intel-opencl-rt" + "mkl" + "intel-openmp" + "tbb" + "tcmlib" + "umf" + "intel-pti" + "pytorch-triton-xpu" + ]; + + propagatedCxxBuildInputs = lib.optionals rocmSupport [ rocmtoolkit_joined ]; + + postInstall = + lib.optionalString cudaSupport '' + # Remove to use FindCUDAToolkit from CMake. + rm -f $out/${python.sitePackages}/torch/share/cmake/Caffe2/FindCUDAToolkit.cmake + '' + + lib.optionalString rocmSupport '' + # Remove all ROCm libraries; we want to link against Nix packages. + # This keeps the outputs lean and requires downstream to specify + # dependencies.
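+ # (The globs below match the ROCm runtime and support libraries that the upstream wheel bundles.)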
+ rm -rf $out/${python.sitePackages}/torch/lib/{libamd*,libaotriton*,libdrm*,libelf*,libgomp*,libhip*,libhsa*,libMIOpen*,libnuma*,librccl*,libroc*,libtinfo*}.so* + rm -rf $out/${python.sitePackages}/torch/lib/{rocblas,hipblaslt,hipsparselt} + ''; + + autoPatchelfIgnoreMissingDeps = lib.optionals stdenv.hostPlatform.isLinux [ + "libcuda.so.1" + ]; + + # We want to have glibc in RPATH as well, because kernel-builder build + # environments use an older glibc. + autoPatchelfFlags = [ "--keep-libc" ]; + + # See https://github.com/NixOS/nixpkgs/issues/296179 + # + # This is a quick hack to add `libnvrtc` to the runpath so that torch can find + # it when it is needed at runtime. + extraRunpaths = lib.optionals cudaSupport [ "${lib.getLib cudaPackages.cuda_nvrtc}/lib" ]; + postPhases = lib.optionals stdenv.hostPlatform.isLinux [ "postPatchelfPhase" ]; + postPatchelfPhase = '' + while IFS= read -r -d $'\0' elf ; do + for extra in $extraRunpaths ; do + echo patchelf "$elf" --add-rpath "$extra" >&2 + patchelf "$elf" --add-rpath "$extra" + done + done < <( + find "''${!outputLib}" "$out" -type f -iname '*.so' -print0 + ) + ''; + + postFixup = '' + mkdir -p "$cxxdev/nix-support" + printWords "''${propagatedCxxBuildInputs[@]}" >> "$cxxdev/nix-support/propagated-build-inputs" + ''; + + dontStrip = true; + + pythonImportsCheck = [ "torch" ]; + + passthru = { + inherit + cudaSupport + cudaPackages + cxx11Abi + rocmSupport + rocmPackages + xpuSupport + xpuPackages + ; + + cudaCapabilities = if cudaSupport then supportedCudaCapabilities else [ ]; + rocmArchs = if rocmSupport then supportedTorchRocmArchs else [ ]; + }; + + meta = with lib; { + description = "PyTorch: Tensors and Dynamic neural networks in Python with strong GPU acceleration"; + homepage = "https://pytorch.org/"; + license = lib.licenses.bsd3; + }; +} diff --git a/pkgs/python-modules/torch/binary/torch-versions-hash.json b/pkgs/python-modules/torch/binary/torch-versions-hash.json new file mode 100644 index 00000000..d1124f96 --- /dev/null +++ b/pkgs/python-modules/torch/binary/torch-versions-hash.json @@ -0,0 +1,128 @@ +{ + "2.8": { + "x86_64-linux": { + "cu126": { + "url": "https://download.pytorch.org/whl/cu126/torch-2.8.0%2Bcu126-cp313-cp313-manylinux_2_28_x86_64.whl", + "hash": "sha256-0IFEAR5BC50VkU5yVuGxcIqQSEyywDcSGZ4CkYVtQXc=", + "version": "2.8.0" + }, + "cu128": { + "url": "https://download.pytorch.org/whl/cu128/torch-2.8.0%2Bcu128-cp313-cp313-manylinux_2_28_x86_64.whl", + "hash": "sha256-OoUjaaON7DQ9RezQvDZg95uIoj4Mh40YcH98E79JU48=", + "version": "2.8.0" + }, + "cu129": { + "url": "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp313-cp313-manylinux_2_28_x86_64.whl", + "hash": "sha256-VjdAFnviGJtxUwtQPwyKjXqCZ91J1N5vnF8dI/viN98=", + "version": "2.8.0" + }, + "rocm63": { + "url": "https://download.pytorch.org/whl/rocm6.3/torch-2.8.0%2Brocm6.3-cp313-cp313-manylinux_2_28_x86_64.whl", + "hash": "sha256-LNqsVUXWbxEDCRSuA8hdFh8ksUBTtn9JUAqPUmOO8QQ=", + "version": "2.8.0" + }, + "rocm64": { + "url": "https://download.pytorch.org/whl/rocm6.4/torch-2.8.0%2Brocm6.4-cp313-cp313-manylinux_2_28_x86_64.whl", + "hash": "sha256-Ji6+rGkF34AZafttvcCFx1whxWCdyDVzNOwLCiQuUOs=", + "version": "2.8.0" + }, + "cpu": { + "url": "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp313-cp313-manylinux_2_28_x86_64.whl", + "hash": "sha256-j4He20xgduwyWsw7R1JfnFUOUoShjq4dkGHFQ/e2594=", + "version": "2.8.0" + }, + "xpu": { + "url": "https://download.pytorch.org/whl/xpu/torch-2.8.0%2Bxpu-cp313-cp313-linux_x86_64.whl", + "hash": 
"sha256-ZZkiWk6DcU2MM7a/3q+9hTiQJ9KScGNbtzhdYiFL7p4=", + "version": "2.8.0" + } + }, + "aarch64-linux": { + "cu129": { + "url": "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp313-cp313-manylinux_2_28_aarch64.whl", + "hash": "sha256-rS1kMWY156sG9slzolJSbVmpKiBFglwQL4dpFKcjBNA=", + "version": "2.8.0" + }, + "cpu": { + "url": "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp313-cp313-manylinux_2_28_aarch64.whl", + "hash": "sha256-pQZLXiN3LI0WQGjMfBLgGnX697lI7NlaDUAH10h+XyU=", + "version": "2.8.0" + } + }, + "aarch64-darwin": { + "cpu": { + "url": "https://download.pytorch.org/whl/cpu/torch-2.8.0-cp313-none-macosx_11_0_arm64.whl", + "hash": "sha256-BX79MKZ3jS7l4jdM1jpj9jMRqm8zMh5ifGVd9gq905A=", + "version": "2.8.0" + } + } + }, + "2.9": { + "x86_64-linux": { + "cu126": { + "url": "https://download.pytorch.org/whl/cu126/torch-2.9.0%2Bcu126-cp313-cp313-manylinux_2_28_x86_64.whl", + "hash": "sha256-DEW2Nu1sAszuYk7FPYRMMrHbm+sj1hnzfEEVJGeQTNU=", + "version": "2.9.0" + }, + "cu128": { + "url": "https://download.pytorch.org/whl/cu128/torch-2.9.0%2Bcu128-cp313-cp313-manylinux_2_28_x86_64.whl", + "hash": "sha256-l97wCH+O8XG5AC6lALr/3UQMe91VnCPDi7+HgbZ+k2Q=", + "version": "2.9.0" + }, + "cu130": { + "url": "https://download.pytorch.org/whl/cu130/torch-2.9.0%2Bcu130-cp313-cp313-manylinux_2_28_x86_64.whl", + "hash": "sha256-kqktts3jjQUiDB996Rzqz/AgVGOGxbegomjcquF7XBg=", + "version": "2.9.0" + }, + "rocm63": { + "url": "https://download.pytorch.org/whl/rocm6.3/torch-2.9.0%2Brocm6.3-cp313-cp313-manylinux_2_28_x86_64.whl", + "hash": "sha256-gw3ZjX2JlS7MV8aO6nCyTMAsDO1CSa7z9UzHsW/Wbic=", + "version": "2.9.0" + }, + "rocm64": { + "url": "https://download.pytorch.org/whl/rocm6.4/torch-2.9.0%2Brocm6.4-cp313-cp313-manylinux_2_28_x86_64.whl", + "hash": "sha256-yKiHbnxb2P3MOUKJB20rgyla3a1PXKYOo4ZRI9Xcn3Y=", + "version": "2.9.0" + }, + "cpu": { + "url": "https://download.pytorch.org/whl/cpu/torch-2.9.0%2Bcpu-cp313-cp313-manylinux_2_28_x86_64.whl", + "hash": "sha256-bJshdYRACWPVtNrds3Eex6N3jqshHhhlT7oHbM47hoI=", + "version": "2.9.0" + }, + "xpu": { + "url": "https://download.pytorch.org/whl/xpu/torch-2.9.0%2Bxpu-cp313-cp313-linux_x86_64.whl", + "hash": "sha256-mNagbdf7GFh0NnsYvWCfBfFv3OQUKlmAypRGGUmWXNI=", + "version": "2.9.0" + } + }, + "aarch64-linux": { + "cu126": { + "url": "https://download.pytorch.org/whl/cu126/torch-2.9.0%2Bcu126-cp313-cp313-manylinux_2_28_aarch64.whl", + "hash": "sha256-qn+dy/BTDsEKIRRxyK+1Mo1bvKtvFP0beLXm8vFO1hc=", + "version": "2.9.0" + }, + "cu128": { + "url": "https://download.pytorch.org/whl/cu128/torch-2.9.0%2Bcu128-cp313-cp313-manylinux_2_28_aarch64.whl", + "hash": "sha256-TXb3E0WvR/Aix/pV7dDBgQ0Br4ncue3P36/j0qD3prg=", + "version": "2.9.0" + }, + "cu130": { + "url": "https://download.pytorch.org/whl/cu130/torch-2.9.0%2Bcu130-cp313-cp313-manylinux_2_28_aarch64.whl", + "hash": "sha256-7PPSS9TA5uQlvXeKbembUieeACGmDX6xGrDC1mnz+bA=", + "version": "2.9.0" + }, + "cpu": { + "url": "https://download.pytorch.org/whl/cpu/torch-2.9.0%2Bcpu-cp313-cp313-manylinux_2_28_aarch64.whl", + "hash": "sha256-vkQ42NrX8NWl5U8P7viok0RolOyH8QK7HYLcxFGFQuQ=", + "version": "2.9.0" + } + }, + "aarch64-darwin": { + "cpu": { + "url": "https://download.pytorch.org/whl/cpu/torch-2.9.0-cp313-none-macosx_11_0_arm64.whl", + "hash": "sha256-4kg22Wi1TvTfsFWUABphlYcRrJIkAmKR5OP5L4Om/X8=", + "version": "2.9.0" + } + } + } +} \ No newline at end of file diff --git a/pkgs/python-modules/torch/binary/torch-versions.json b/pkgs/python-modules/torch/binary/torch-versions.json new file mode 100644 index 
00000000..70167a72 --- /dev/null +++ b/pkgs/python-modules/torch/binary/torch-versions.json @@ -0,0 +1,99 @@ +[ + { + "torchVersion": "2.8.0", + "cudaVersion": "12.6", + "cxx11Abi": true, + "systems": ["x86_64-linux"] + }, + { + "torchVersion": "2.8.0", + "cudaVersion": "12.8", + "cxx11Abi": true, + "systems": ["x86_64-linux"] + }, + { + "torchVersion": "2.8.0", + "cudaVersion": "12.9", + "cxx11Abi": true, + "systems": ["x86_64-linux", "aarch64-linux"] + }, + { + "torchVersion": "2.8.0", + "rocmVersion": "6.3", + "cxx11Abi": true, + "systems": ["x86_64-linux"] + }, + { + "torchVersion": "2.8.0", + "rocmVersion": "6.4", + "cxx11Abi": true, + "systems": ["x86_64-linux"] + }, + { + "torchVersion": "2.8.0", + "cxx11Abi": true, + "metal": true, + "systems": ["aarch64-darwin"] + }, + { + "torchVersion": "2.8.0", + "cxx11Abi": true, + "cpu": true, + "systems": ["aarch64-linux", "x86_64-linux"] + }, + { + "torchVersion": "2.8.0", + "xpuVersion": "2025.1.3", + "cxx11Abi": true, + "systems": ["x86_64-linux"] + }, + + { + "torchVersion": "2.9.0", + "cudaVersion": "12.6", + "cxx11Abi": true, + "systems": ["x86_64-linux", "aarch64-linux"] + }, + { + "torchVersion": "2.9.0", + "cudaVersion": "12.8", + "cxx11Abi": true, + "systems": ["x86_64-linux", "aarch64-linux"] + }, + { + "torchVersion": "2.9.0", + "cudaVersion": "13.0", + "cxx11Abi": true, + "systems": ["x86_64-linux", "aarch64-linux"] + }, + { + "torchVersion": "2.9.0", + "rocmVersion": "6.3", + "cxx11Abi": true, + "systems": ["x86_64-linux"] + }, + { + "torchVersion": "2.9.0", + "rocmVersion": "6.4", + "cxx11Abi": true, + "systems": ["x86_64-linux"] + }, + { + "torchVersion": "2.9.0", + "cxx11Abi": true, + "metal": true, + "systems": ["aarch64-darwin"] + }, + { + "torchVersion": "2.9.0", + "cxx11Abi": true, + "cpu": true, + "systems": ["aarch64-linux", "x86_64-linux"] + }, + { + "torchVersion": "2.9.0", + "xpuVersion": "2025.2.1", + "cxx11Abi": true, + "systems": ["x86_64-linux"] + } +] diff --git a/pkgs/python-modules/torch/source/2_8/0001-patch-xpu-ops-CMake.patch b/pkgs/python-modules/torch/source/2_8/0001-patch-xpu-ops-CMake.patch new file mode 100644 index 00000000..479446b0 --- /dev/null +++ b/pkgs/python-modules/torch/source/2_8/0001-patch-xpu-ops-CMake.patch @@ -0,0 +1,50 @@ +From 3b5dd2c142173a9a6afe6b21a3107e04d7fcc0ea Mon Sep 17 00:00:00 2001 +From: "Wang, Yi A" +Date: Mon, 25 Aug 2025 17:39:38 -0700 +Subject: [PATCH] patch xpu ops CMake + +Signed-off-by: Wang, Yi A +--- + cmake/BuildFlags.cmake | 1 + + cmake/Modules/FindSYCL.cmake | 6 +++++- + 2 files changed, 6 insertions(+), 1 deletion(-) + +diff --git a/cmake/BuildFlags.cmake b/cmake/BuildFlags.cmake +index ec9aaccb..8f4740df 100644 +--- a/cmake/BuildFlags.cmake ++++ b/cmake/BuildFlags.cmake +@@ -83,6 +83,7 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" + set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -fno-approx-func) + set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -Wno-absolute-value) + set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -no-ftz) ++ set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -D_GLIBCXX_USE_CXX11_ABI=1) + endif() + + if(CMAKE_BUILD_TYPE MATCHES Debug) +diff --git a/cmake/Modules/FindSYCL.cmake b/cmake/Modules/FindSYCL.cmake +index 86457ba3..0feec0fa 100644 +--- a/cmake/Modules/FindSYCL.cmake ++++ b/cmake/Modules/FindSYCL.cmake +@@ -34,7 +34,8 @@ + # SYCL_ADD_EXECUTABLE + # + # SYCL_ADD_LIBRARY +- ++set(SYCL_EXTRA_INCLUDE_DIRS $ENV{SYCL_EXTRA_INCLUDE_DIRS}) ++string(REPLACE " " ";" SYCL_EXTRA_INCLUDE_DIRS 
"${SYCL_EXTRA_INCLUDE_DIRS}") + macro(SYCL_FIND_HELPER_FILE _name _extension) + set(_full_name "${_name}.${_extension}") + # CMAKE_CURRENT_LIST_FILE contains the full path to the file currently being +@@ -216,6 +217,9 @@ macro(SYCL_WRAP_SRCS sycl_target generated_files) + set(generated_extension ${CMAKE_${SYCL_C_OR_CXX}_OUTPUT_EXTENSION}) + + set(SYCL_include_dirs "${SYCL_INCLUDE_DIR}") ++ if(DEFINED SYCL_EXTRA_INCLUDE_DIRS) ++ list(APPEND SYCL_include_dirs ${SYCL_EXTRA_INCLUDE_DIRS}) ++ endif() + list(APPEND SYCL_include_dirs "$") + + set(SYCL_compile_definitions "$") +-- +2.43.0 + diff --git a/pkgs/python-modules/torch/source/2_8/cmake-load-hip-invalid-state.diff b/pkgs/python-modules/torch/source/2_8/cmake-load-hip-invalid-state.diff new file mode 100644 index 00000000..fa1a3e10 --- /dev/null +++ b/pkgs/python-modules/torch/source/2_8/cmake-load-hip-invalid-state.diff @@ -0,0 +1,40 @@ +diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake +index cae0ca62f23..ed2bd0862f6 100644 +--- a/cmake/public/LoadHIP.cmake ++++ b/cmake/public/LoadHIP.cmake +@@ -93,24 +93,25 @@ if(HIP_FOUND) + # hip (lower-case) package. Both are probed above and will be in + # ROCM_INCLUDE_DIRS if available. + find_file(ROCM_VERSION_HEADER_PATH +- NAMES rocm-core/rocm_version.h ++ NAMES ++ rocm-core/rocm_version.h # Higher priority ++ hip/hip_version.h # Lower priority + NO_DEFAULT_PATH + PATHS ${ROCM_INCLUDE_DIRS} + ) +- set(ROCM_LIB_NAME "ROCM") +- if(NOT ROCM_VERSION_HEADER_PATH) +- find_file(ROCM_VERSION_HEADER_PATH +- NAMES hip/hip_version.h +- NO_DEFAULT_PATH +- PATHS ${ROCM_INCLUDE_DIRS} +- ) +- set(ROCM_LIB_NAME "HIP") +- endif() ++ + if(NOT ROCM_VERSION_HEADER_PATH) + message(FATAL_ERROR "Could not find hip/hip_version.h or rocm-core/rocm_version.h in ${ROCM_INCLUDE_DIRS}") + endif() ++ + get_filename_component(ROCM_HEADER_NAME ${ROCM_VERSION_HEADER_PATH} NAME) + ++ if(ROCM_HEADER_NAME STREQUAL "rocm_version.h") ++ set(ROCM_LIB_NAME "ROCM") ++ else() ++ set(ROCM_LIB_NAME "HIP") ++ endif() ++ + if(EXISTS ${ROCM_VERSION_HEADER_PATH}) + set(ROCM_HEADER_FILE ${ROCM_VERSION_HEADER_PATH}) + else() diff --git a/pkgs/python-modules/torch/source/2_8/default.nix b/pkgs/python-modules/torch/source/2_8/default.nix new file mode 100644 index 00000000..fec710fd --- /dev/null +++ b/pkgs/python-modules/torch/source/2_8/default.nix @@ -0,0 +1,761 @@ +{ + stdenv, + stdenvAdapters, + lib, + fetchFromGitHub, + buildPythonPackage, + python, + config, + cudaSupport ? config.cudaSupport, + cudaPackages, + autoAddDriverRunpath, + effectiveMagma ? + if cudaSupport then + magma-cuda-static + else if rocmSupport then + magma-hip + else + magma, + effectiveStdenv ? if cudaSupport then cudaPackages.backendStdenv else stdenv, + magma, + magma-hip, + magma-cuda-static, + # Use the system NCCL as long as we're targeting CUDA on a supported platform. + useSystemNccl ? (cudaSupport && !cudaPackages.nccl.meta.unsupported || rocmSupport), + MPISupport ? false, + mpi, + nvtx, + buildDocs ? false, + cxx11Abi ? true, + + # tests.cudaAvailable: + callPackage, + + # Native build inputs + cmake, + symlinkJoin, + which, + pybind11, + removeReferencesTo, + + # Build inputs + apple-sdk_15, + libdrm, + numactl, + + # dependencies + astunparse, + binutils, + expecttest, + filelock, + fsspec, + hypothesis, + jinja2, + networkx, + packaging, + psutil, + pyyaml, + requests, + setuptools, + sympy, + types-dataclasses, + typing-extensions, + # ROCm build and `torch.compile` requires `triton` + tritonSupport ? 
(!stdenv.hostPlatform.isDarwin), + triton, + + # TODO: 1. callPackage needs to learn to distinguish between the task + # of "asking for an attribute from the parent scope" and + # the task of "exposing a formal parameter in .override". + # TODO: 2. We should probably abandon attributes such as `torchWithCuda` (etc.) + # as they routinely end up consuming the wrong arguments + # (dependencies without cuda support). + # Instead we should rely on overlays and nixpkgsFun. + # (@SomeoneSerge) + _tritonEffective ? + if cudaSupport then + triton-cuda + else if xpuSupport then + python.pkgs.triton-xpu_2_8 + else + triton, + triton-cuda, + + # Disable MKLDNN on aarch64-darwin, since it negatively impacts performance; + # this is also what the official pytorch build does + mklDnnSupport ? !(stdenv.hostPlatform.isDarwin && stdenv.hostPlatform.isAarch64), + + # virtual pkg that consistently instantiates blas across nixpkgs + # See https://github.com/NixOS/nixpkgs/pull/83888 + blas, + + # ninja (https://ninja-build.org) must be available to run C++ extensions tests + ninja, + + # dependencies for torch.utils.tensorboard + pillow, + six, + tensorboard, + protobuf, + + # ROCm dependencies + rocmSupport ? config.rocmSupport, + rocmPackages, + xpuSupport ? (config.xpuSupport or false), + xpuPackages, + gpuTargets ? [ ], +}: + +let + inherit (lib) + attrsets + lists + strings + trivial + ; + inherit (cudaPackages) cudnn nccl; + cudaFlags = cudaPackages.flags; + + triton = throw "python3Packages.torch: use _tritonEffective instead of triton to avoid divergence"; + + setBool = v: if v then "1" else "0"; + + archs = (import ../../archs.nix)."2.8"; + + supportedTorchCudaCapabilities = + let + inherit (archs) capsPerCudaVersion; + real = capsPerCudaVersion."${lib.versions.majorMinor cudaPackages.cudaMajorMinorVersion}"; + ptx = lists.map (x: "${x}+PTX") real; + in + real ++ ptx; + + inherit (archs) supportedTorchRocmArchs; + + # NOTE: The lists.subtractLists function is perhaps a bit unintuitive. It subtracts the elements + # of the first list *from* the second list. That means: + # lists.subtractLists a b = b - a + + # For CUDA + supportedCudaCapabilities = lists.intersectLists cudaFlags.cudaCapabilities supportedTorchCudaCapabilities; + unsupportedCudaCapabilities = lists.subtractLists supportedCudaCapabilities cudaFlags.cudaCapabilities; + + # Throw an error if, after filtering, no supported GPU targets remain; the message lists the + # requested targets that are unsupported. + gpuArchWarner = + supported: unsupported: + trivial.throwIf (supported == [ ]) ( + "No supported GPU targets specified. Requested GPU targets: " + + strings.concatStringsSep ", " unsupported + ) supported; + + # Create the gpuTargetString. + gpuTargetString = strings.concatStringsSep ";" ( + if gpuTargets != [ ] then + # If gpuTargets is specified, it always takes priority. + gpuTargets + else if cudaSupport then + gpuArchWarner supportedCudaCapabilities unsupportedCudaCapabilities + else if rocmSupport then + supportedTorchRocmArchs + else + throw "No GPU targets specified" + ); + + # Use the rocm/pytorch fork for ROCm 7, since upstream PyTorch 2.8 does not support it.
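+ # (Assumption: the pinned rocm/pytorch revision used below carries the ROCm 7 enablement patches.)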
+ rocmTorch = rocmSupport && (lib.versions.majorMinor rocmPackages.rocm.version == "7.0"); + rocmtoolkit_joined = symlinkJoin { + name = "rocm-merged"; + + paths = with rocmPackages; [ + aotriton_0_10 + clr + comgr + hipblas + hipblas-common-devel + hipblaslt + hipfft + hipify-clang + hiprand + hipsolver + hipsparse + hipsparselt + hsa-rocr + miopen-hip + rccl + rocblas + rocm-core + rocm-device-libs + rocm-hip-runtime + rocm-smi-lib + rocminfo + rocrand + rocsolver + rocsparse + roctracer + ]; + + postBuild = '' + # Fix `setuptools` not being found + rm -rf $out/nix-support + + # Variables that we want to pass through to downstream derivations. + mkdir -p $out/nix-support + echo 'export ROCM_PATH="${placeholder "out"}"' >> $out/nix-support/setup-hook + echo 'export ROCM_SOURCE_DIR="${placeholder "out"}"' >> $out/nix-support/setup-hook + echo 'export CMAKE_CXX_FLAGS="-I${placeholder "out"}/include -I${placeholder "out"}/include/rocblas"' >> $out/nix-support/setup-hook + ''; + }; + + brokenConditions = attrsets.filterAttrs (_: cond: cond) { + "CUDA and ROCm are mutually exclusive" = cudaSupport && rocmSupport; + "CUDA is not targeting Linux" = cudaSupport && !stdenv.hostPlatform.isLinux; + "Unsupported CUDA version" = + cudaSupport + && !(builtins.elem cudaPackages.cudaMajorVersion [ + "11" + "12" + ]); + "MPI cudatoolkit does not match cudaPackages.cudatoolkit" = + MPISupport && cudaSupport && (mpi.cudatoolkit != cudaPackages.cudatoolkit); + # This used to be a deep package set comparison between cudaPackages and + # effectiveMagma.cudaPackages, making torch too strict in cudaPackages. + # In particular, this triggered warnings from cuda's `aliases.nix` + "Magma cudaPackages does not match cudaPackages" = + cudaSupport + && (effectiveMagma.cudaPackages.cudaMajorMinorVersion != cudaPackages.cudaMajorMinorVersion); + #"Rocm support is currently broken because `rocmPackages.hipblaslt` is unpackaged. (2024-06-09)" = + # rocmSupport; + }; + torchXpuOpsSrc = + if xpuSupport then + fetchFromGitHub { + owner = "intel"; + repo = "torch-xpu-ops"; + rev = "3a9419c8bb6a98dd3e3cd473c36691fb4abeae40"; + hash = "sha256-cNNnqJXfFO7UOJtXmDGKS2s1Jjs0+/AztPMjE3K/YG0="; + } + else + null; +in +buildPythonPackage rec { + pname = "torch"; + version = "2.8.0"; + pyproject = true; + + stdenv = effectiveStdenv; + + outputs = [ + "out" # output standard python package + "dev" # output libtorch headers + "lib" # output libtorch libraries + "cxxdev" # propagated deps for the cmake consumers of torch + ]; + cudaPropagateToOutput = "cxxdev"; + rocmPropagateToOutput = "cxxdev"; + + src = + if rocmTorch then + fetchFromGitHub { + owner = "ROCm"; + repo = "pytorch"; + rev = "245bf6edbc7e4b6aabbb4a218b518b853225956c"; + fetchSubmodules = true; + hash = "sha256-vQL9rjrQjQDnMwNB0NNKuecp8PENOib9Y+K5J9ZaiFM="; + } + else + fetchFromGitHub { + owner = "pytorch"; + repo = "pytorch"; + tag = "v${version}"; + fetchSubmodules = true; + hash = "sha256-5JDYFoBe6bC9Dz143Bm/5OEOWsQxjctAR9fI4f6G2W8="; + }; + + patches = [ + ./mkl-rpath.patch + ] + ++ lib.optionals cudaSupport [ ./fix-cmake-cuda-toolkit.patch ] + ++ lib.optionals (!rocmTorch) [ ./cmake-load-hip-invalid-state.diff ] + ++ lib.optionals (stdenv.hostPlatform.isDarwin && stdenv.hostPlatform.isx86_64) [ + # pthreadpool added support for Grand Central Dispatch in April + # 2020. However, this relies on functionality (DISPATCH_APPLY_AUTO) + # that is available starting with macOS 10.13. However, our current + # base is 10.12. 
Until we upgrade, we can fall back on the older + # pthread support. + ./pthreadpool-disable-gcd.diff + ] + ++ lib.optionals stdenv.hostPlatform.isLinux [ + # Propagate CUPTI to Kineto by overriding the search path with environment variables. + # https://github.com/pytorch/pytorch/pull/108847 + ./pytorch-pr-108847.patch + ]; + + postUnpack = lib.optionalString xpuSupport '' + cp -r --no-preserve=mode ${torchXpuOpsSrc} $sourceRoot/third_party/torch-xpu-ops + patch -d $sourceRoot/third_party/torch-xpu-ops -p1 < ${./0001-patch-xpu-ops-CMake.patch} + ''; + + postPatch = + let + pyiGenPath = "${typing-extensions}/${python.sitePackages}:${pyyaml}/${python.sitePackages}"; + in + '' + substituteInPlace pyproject.toml \ + --replace-fail "setuptools>=62.3.0,<80.0" \ + "setuptools>=62.3.0" + + substituteInPlace cmake/public/cuda.cmake \ + --replace-fail \ + 'message(FATAL_ERROR "Found two conflicting CUDA' \ + 'message(WARNING "Found two conflicting CUDA' \ + --replace-warn \ + "set(CUDAToolkit_ROOT" \ + "# Upstream: set(CUDAToolkit_ROOT" + substituteInPlace third_party/gloo/cmake/Cuda.cmake \ + --replace-warn "find_package(CUDAToolkit 7.0" "find_package(CUDAToolkit" + + # annotations (3.7), print_function (3.0), with_statement (2.6) are all supported + sed -i -e "/from __future__ import/d" **.py + substituteInPlace third_party/NNPACK/CMakeLists.txt \ + --replace-fail "PYTHONPATH=" 'PYTHONPATH=$ENV{PYTHONPATH}:' + # flag from cmakeFlags doesn't work, not clear why + # setting it at the top of NNPACK's own CMakeLists does + sed -i '2s;^;set(PYTHON_SIX_SOURCE_DIR ${six.src})\n;' third_party/NNPACK/CMakeLists.txt + + # Ensure that torch profiler unwind uses addr2line from nix + substituteInPlace torch/csrc/profiler/unwind/unwind.cpp \ + --replace-fail 'addr2line_binary_ = "addr2line"' 'addr2line_binary_ = "${lib.getExe' binutils "addr2line"}"' + + # gen_pyi needs typing-extensions. + substituteInPlace torch/CMakeLists.txt \ + --replace-fail "env PYTHONPATH=\"\''${TORCH_ROOT}\"" \ + "env PYTHONPATH=\"\''${TORCH_ROOT}:${pyiGenPath}\"" + '' + + lib.optionalString rocmSupport '' + # https://github.com/facebookincubator/gloo/pull/297 + substituteInPlace third_party/gloo/cmake/Hipify.cmake \ + --replace-fail "\''${HIPIFY_COMMAND}" "python \''${HIPIFY_COMMAND}" + + # Replace hard-coded rocm paths + substituteInPlace caffe2/CMakeLists.txt \ + --replace-fail "/opt/rocm" "${rocmtoolkit_joined}" + + # Strangely, this is never set in cmake + substituteInPlace cmake/public/LoadHIP.cmake \ + --replace-fail "set(ROCM_PATH \$ENV{ROCM_PATH})" \ + "set(ROCM_PATH \$ENV{ROCM_PATH})''\nset(ROCM_VERSION ${lib.concatStrings (lib.intersperse "0" (lib.splitVersion rocmPackages.clr.version))})" + '' + # Detection of NCCL version doesn't work particularly well when using the static binary. + + lib.optionalString cudaSupport '' + substituteInPlace cmake/Modules/FindNCCL.cmake \ + --replace-fail \ + 'message(FATAL_ERROR "Found NCCL header version and library version' \ + 'message(WARNING "Found NCCL header version and library version' + '' + # Remove PyTorch's FindCUDAToolkit.cmake and use CMake's default. + # NOTE: Parts of pytorch rely on unmaintained FindCUDA.cmake with custom patches to support e.g. + # newer architectures (sm_90a). 
We do want to delete vendored patches, but have to keep them + # until https://github.com/pytorch/pytorch/issues/76082 is addressed + + lib.optionalString cudaSupport '' + rm cmake/Modules/FindCUDAToolkit.cmake + '' + + lib.optionalString xpuSupport '' + # Replace the oneAPI install dir + substituteInPlace cmake/Modules/FindMKL.cmake \ + --replace-fail 'SET(DEFAULT_INTEL_ONEAPI_DIR "/opt/intel/oneapi")' 'SET(DEFAULT_INTEL_ONEAPI_DIR ${xpuPackages.oneapi-torch-dev}/oneapi)' + # Replace the mkldnn build for XPU + sed -i '/ExternalProject_Add(xpu_mkldnn_proj/,/^ *)/s/^/#/' cmake/Modules/FindMKLDNN.cmake + substituteInPlace cmake/Modules/FindMKLDNN.cmake \ + --replace-fail 'ExternalProject_Get_Property(xpu_mkldnn_proj SOURCE_DIR BINARY_DIR)' '# ExternalProject_Get_Property(xpu_mkldnn_proj SOURCE_DIR BINARY_DIR)' \ + --replace-fail "set(XPU_MKLDNN_LIBRARIES \''${BINARY_DIR}/src/\''${DNNL_LIB_NAME})" "set(XPU_MKLDNN_LIBRARIES ${xpuPackages.onednn-xpu}/lib/libdnnl.a)" \ + --replace-fail "set(XPU_MKLDNN_INCLUDE \''${SOURCE_DIR}/include \''${BINARY_DIR}/include)" "set(XPU_MKLDNN_INCLUDE ${xpuPackages.onednn-xpu}/include)" + # Comment out the torch-xpu-ops git clone block in pytorch/caffe2/CMakeLists.txt + sed -i '/set(TORCH_XPU_OPS_REPO_URL/,/^ endif()/s/^/#/' caffe2/CMakeLists.txt + sed -i '/execute_process(/,/^ endif()/s/^/#/' caffe2/CMakeLists.txt + '' + # error: no member named 'aligned_alloc' in the global namespace; did you mean simply 'aligned_alloc' + # This lib overrode aligned_alloc, hence the error message. TL;DR: the function is linkable but not declared in the header. + + + lib.optionalString + (stdenv.hostPlatform.isDarwin && lib.versionOlder stdenv.hostPlatform.darwinSdkVersion "11.0") + '' + substituteInPlace third_party/pocketfft/pocketfft_hdronly.h --replace-fail '#if (__cplusplus >= 201703L) && (!defined(__MINGW32__)) && (!defined(_MSC_VER)) + inline void *aligned_alloc(size_t align, size_t size)' '#if 0 + inline void *aligned_alloc(size_t align, size_t size)' + ''; + + # NOTE(@connorbaker): Though we do not disable Gloo or MPI when building with CUDA support, caution should be taken + # when using the different backends. Gloo's GPU support isn't great, and MPI and CUDA can't be used at the same time + # without extreme care to ensure they don't lock each other out of shared resources. + # For more, see https://github.com/open-mpi/ompi/issues/7733#issuecomment-629806195. + preConfigure = + lib.optionalString cudaSupport '' + export TORCH_CUDA_ARCH_LIST="${gpuTargetString}" + export CUPTI_INCLUDE_DIR=${lib.getDev cudaPackages.cuda_cupti}/include + export CUPTI_LIBRARY_DIR=${lib.getLib cudaPackages.cuda_cupti}/lib + '' + + lib.optionalString (cudaSupport && cudaPackages ? cudnn) '' + export CUDNN_INCLUDE_DIR=${lib.getLib cudnn}/include + export CUDNN_LIB_DIR=${cudnn.lib}/lib + '' + + lib.optionalString rocmSupport '' + export PYTORCH_ROCM_ARCH="${gpuTargetString}" + python tools/amd_build/build_amd.py + ''; + + # Use pytorch's custom configurations + dontUseCmakeConfigure = true; + + # causes possible redefinition of _FORTIFY_SOURCE + hardeningDisable = [ "fortify3" ]; + + BUILD_NAMEDTENSOR = setBool true; + BUILD_DOCS = setBool buildDocs; + + # We only do an imports check, so do not build tests either. + BUILD_TEST = setBool false; + + # ninja hook doesn't automatically turn on ninja + # because pytorch setup.py is responsible for this + CMAKE_GENERATOR = "Ninja"; + + # Whether to use C++11 ABI (or earlier).
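+ # Upstream's prebuilt wheels for Torch 2.8/2.9 are built with the C++11 ABI, which is why cxx11Abi defaults to true.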
+ _GLIBCXX_USE_CXX11_ABI = setBool cxx11Abi; + + # Unlike MKL, oneDNN (née MKLDNN) is FOSS, so we enable support for + # it by default. PyTorch currently uses its own vendored version + # of oneDNN through Intel iDeep. + USE_MKLDNN = setBool mklDnnSupport; + USE_MKLDNN_CBLAS = setBool mklDnnSupport; + + # Avoid using pybind11 from git submodule + # Also avoids pytorch exporting the headers of pybind11 + USE_SYSTEM_PYBIND11 = true; + + cmakeFlags = [ + # (lib.cmakeBool "CMAKE_FIND_DEBUG_MODE" true) + (lib.cmakeFeature "CUDAToolkit_VERSION" cudaPackages.cudaMajorMinorVersion) + ] + ++ lib.optionals cudaSupport [ + # Unbreaks version discovery in enable_language(CUDA) when wrapping nvcc with ccache + # Cf. https://gitlab.kitware.com/cmake/cmake/-/issues/26363 + (lib.cmakeFeature "CMAKE_CUDA_COMPILER_TOOLKIT_VERSION" cudaPackages.cudaMajorMinorVersion) + ]; + + preBuild = '' + export MAX_JOBS=$NIX_BUILD_CORES + ${python.pythonOnBuildForHost.interpreter} setup.py build --cmake-only + ${cmake}/bin/cmake build + ''; + + preFixup = '' + function join_by { local IFS="$1"; shift; echo "$*"; } + function strip2 { + IFS=':' + read -ra RP <<< $(patchelf --print-rpath $1) + IFS=' ' + RP_NEW=$(join_by : ''${RP[@]:2}) + patchelf --set-rpath \$ORIGIN:''${RP_NEW} "$1" + } + for f in $(find ''${out} -name 'libcaffe2*.so') + do + strip2 $f + done + ''; + + # Override the (weirdly) wrong version set by default. See + # https://github.com/NixOS/nixpkgs/pull/52437#issuecomment-449718038 + # https://github.com/pytorch/pytorch/blob/v1.0.0/setup.py#L267 + PYTORCH_BUILD_VERSION = version; + PYTORCH_BUILD_NUMBER = 0; + + # In-tree builds of NCCL are not supported. + # Use NCCL when cudaSupport is enabled and nccl is available. + USE_NCCL = setBool useSystemNccl; + USE_SYSTEM_NCCL = USE_NCCL; + USE_STATIC_NCCL = USE_NCCL; + + # Set the correct Python library path, broken since + # https://github.com/pytorch/pytorch/commit/3d617333e + PYTHON_LIB_REL_PATH = "${placeholder "out"}/${python.sitePackages}"; + + # Suppress a weird warning in mkl-dnn, part of ideep in pytorch + # (upstream seems to have fixed this in the wrong place?) 
+ # https://github.com/intel/mkl-dnn/commit/8134d346cdb7fe1695a2aa55771071d455fae0bc + # https://github.com/pytorch/pytorch/issues/22346 + # + # Also of interest: pytorch ignores CXXFLAGS and uses CFLAGS for both C and C++: + # https://github.com/pytorch/pytorch/blob/v1.11.0/setup.py#L17 + env = { + # Builds faster without this and we don't have enough inputs that cmd length is an issue + NIX_CC_USE_RESPONSE_FILE = 0; + + NIX_CFLAGS_COMPILE = toString ( + (lib.optionals (blas.implementation == "mkl") [ "-Wno-error=array-bounds" ] ++ [ "-Wno-error" ]) + ); + } + // lib.optionalAttrs rocmSupport { + AOTRITON_INSTALLED_PREFIX = rocmPackages.aotriton_0_10; + } + // lib.optionalAttrs stdenv.hostPlatform.isDarwin { + USE_MPS = 1; + } + // lib.optionalAttrs xpuSupport { + MKLROOT = xpuPackages.oneapi-torch-dev; + SYCL_ROOT = xpuPackages.oneapi-torch-dev; + }; + + nativeBuildInputs = [ + cmake + ninja + pybind11 + removeReferencesTo + which + ] + ++ lib.optionals cudaSupport ( + with cudaPackages; + [ + autoAddDriverRunpath + cuda_nvcc + ] + ) + ++ lib.optionals rocmSupport [ + rocmtoolkit_joined + rocmPackages.setupRocmHook + ] + ++ lib.optionals xpuSupport ( + with xpuPackages; + [ + ocloc + oneapi-torch-dev + ] + ); + + buildInputs = [ + blas + blas.provider + ] + ++ lib.optionals cudaSupport ( + with cudaPackages; + [ + cuda_cccl # <thrust/*> + cuda_cudart # cuda_runtime.h and libraries + cuda_cupti # For kineto + cuda_profiler_api # <cuda_profiler_api.h> + cuda_nvcc # crt/host_config.h; even though we include this in nativeBuildInputs, it's needed here too + cuda_nvml_dev # <nvml.h> + cuda_nvrtc + #cuda_nvtx # -llibNVToolsExt + nvtx + libcublas + libcufile + libcufft + libcurand + libcusolver + libcusparse + ] + ++ lists.optionals (cudaPackages ? cudnn) [ cudnn ] + ++ lists.optionals useSystemNccl [ + # Some platforms do not support NCCL (e.g., Jetson) + nccl # Provides nccl.h AND a static copy of NCCL! +
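# (That static copy is what the USE_STATIC_NCCL setting above links against.) +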
] + ) + ++ lib.optionals rocmSupport ( + with rocmPackages; + [ + composablekernel-devel + hipcub-devel + libdrm + openmp + rocmtoolkit_joined + rocprim-devel + rocthrust-devel + ] + ) + ++ lib.optionals xpuSupport ( + with xpuPackages; + [ + oneapi-torch-dev + onednn-xpu + ] + ) + ++ lib.optionals (cudaSupport || rocmSupport) [ effectiveMagma ] + ++ lib.optionals stdenv.hostPlatform.isLinux [ numactl ] + ++ lib.optionals stdenv.hostPlatform.isDarwin [ + apple-sdk_15 + ] + ++ lib.optionals tritonSupport [ _tritonEffective ] + ++ lib.optionals MPISupport [ mpi ]; + + pythonRelaxDeps = [ + "sympy" + ]; + dependencies = [ + astunparse + expecttest + filelock + fsspec + hypothesis + jinja2 + networkx + ninja + packaging + psutil + pyyaml + requests + sympy + types-dataclasses + typing-extensions + + # the following are required for tensorboard support + pillow + six + tensorboard + protobuf + + # torch/csrc requires `pybind11` at runtime + pybind11 + ] + ++ lib.optionals (lib.versionAtLeast python.version "3.12") [ setuptools ] + ++ lib.optionals tritonSupport [ _tritonEffective ]; + + propagatedCxxBuildInputs = + [ ] ++ lib.optionals MPISupport [ mpi ] ++ lib.optionals rocmSupport [ rocmtoolkit_joined ]; + + # Tests take a long time and may be flaky, so just sanity-check imports + doCheck = false; + + pythonImportsCheck = [ "torch" ]; + + nativeCheckInputs = [ + hypothesis + ninja + psutil + ]; + + checkPhase = + with lib.versions; + with lib.strings; + concatStringsSep " " [ + "runHook preCheck" + "${python.interpreter} test/run_test.py" + "--exclude" + (concatStringsSep " " [ + "utils" # utils requires git, which is not allowed in the check phase + + # "dataloader" # psutil correctly finds and triggers multiprocessing, but is too sandboxed to run -- resulting in numerous errors + # ^^^^^^^^^^^^ NOTE: while test_dataloader does return errors, these are acceptable errors and do not interfere with the build + + # tensorboard has acceptable failures for pytorch 1.3.x due to dependencies on tensorboard-plugins + (optionalString (majorMinor version == "1.3") "tensorboard") + ]) + "runHook postCheck" + ]; + + pythonRemoveDeps = [ + # In our dist-info the name is just "triton" + "pytorch-triton-rocm" + ]; + + postInstall = '' + find "$out/${python.sitePackages}/torch/include" "$out/${python.sitePackages}/torch/lib" -type f -exec remove-references-to -t ${effectiveStdenv.cc} '{}' + + + mkdir $dev + cp -r $out/${python.sitePackages}/torch/include $dev/include + cp -r $out/${python.sitePackages}/torch/share $dev/share + + # Fix up library paths for split outputs + substituteInPlace \ + $dev/share/cmake/Torch/TorchConfig.cmake \ + --replace-fail \''${TORCH_INSTALL_PREFIX}/lib "$lib/lib" + + substituteInPlace \ + $dev/share/cmake/Caffe2/Caffe2Targets-release.cmake \ + --replace-fail \''${_IMPORT_PREFIX}/lib "$lib/lib" + + mkdir $lib + mv $out/${python.sitePackages}/torch/lib $lib/lib + ln -s $lib/lib $out/${python.sitePackages}/torch/lib + '' + + lib.optionalString rocmSupport '' + substituteInPlace $dev/share/cmake/Tensorpipe/TensorpipeTargets-release.cmake \ + --replace-fail "\''${_IMPORT_PREFIX}/lib64" "$lib/lib" + + substituteInPlace $dev/share/cmake/ATen/ATenConfig.cmake \ + --replace-fail "/build/source/torch/include" "$dev/include" + ''; + + postFixup = '' + mkdir -p "$cxxdev/nix-support" + printWords "''${propagatedCxxBuildInputs[@]}" >> "$cxxdev/nix-support/propagated-build-inputs" + '' + + lib.optionalString stdenv.hostPlatform.isDarwin '' + for f in $(ls $lib/lib/*.dylib); do +
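# Give each dylib an absolute install name under $lib so dependents can load it without @rpath lookups. +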
install_name_tool -id $lib/lib/$(basename $f) $f || true + done + + install_name_tool -change @rpath/libshm.dylib $lib/lib/libshm.dylib $lib/lib/libtorch_python.dylib + install_name_tool -change @rpath/libtorch.dylib $lib/lib/libtorch.dylib $lib/lib/libtorch_python.dylib + install_name_tool -change @rpath/libc10.dylib $lib/lib/libc10.dylib $lib/lib/libtorch_python.dylib + + install_name_tool -change @rpath/libc10.dylib $lib/lib/libc10.dylib $lib/lib/libtorch.dylib + + install_name_tool -change @rpath/libtorch.dylib $lib/lib/libtorch.dylib $lib/lib/libshm.dylib + install_name_tool -change @rpath/libc10.dylib $lib/lib/libc10.dylib $lib/lib/libshm.dylib + ''; + + # See https://github.com/NixOS/nixpkgs/issues/296179 + # + # This is a quick hack to add `libnvrtc` to the runpath so that torch can find + # it when it is needed at runtime. + extraRunpaths = lib.optionals cudaSupport [ "${lib.getLib cudaPackages.cuda_nvrtc}/lib" ]; + postPhases = lib.optionals stdenv.hostPlatform.isLinux [ "postPatchelfPhase" ]; + postPatchelfPhase = '' + while IFS= read -r -d $'\0' elf ; do + for extra in $extraRunpaths ; do + echo patchelf "$elf" --add-rpath "$extra" >&2 + patchelf "$elf" --add-rpath "$extra" + done + done < <( + find "''${!outputLib}" "$out" -type f -iname '*.so' -print0 + ) + ''; + + # Builds in 2+h with 2 cores, and ~15m with a big-parallel builder. + requiredSystemFeatures = [ "big-parallel" ]; + + passthru = { + inherit + cudaSupport + cudaPackages + cxx11Abi + rocmSupport + rocmPackages + xpuSupport + xpuPackages + ; + cudaCapabilities = if cudaSupport then supportedCudaCapabilities else [ ]; + rocmArchs = if rocmSupport then supportedTorchRocmArchs else [ ]; + # At least for 1.10.2 `torch.fft` is unavailable unless BLAS provider is MKL. This attribute allows for easy detection of its availability. + blasProvider = blas.provider; + # To help debug when a package is broken due to CUDA support + inherit brokenConditions; + tests = callPackage ./tests.nix { }; + }; + + meta = { + changelog = "https://github.com/pytorch/pytorch/releases/tag/v${version}"; + # keep PyTorch in the description so the package can be found under that name on search.nixos.org + description = "PyTorch: Tensors and Dynamic neural networks in Python with strong GPU acceleration"; + homepage = "https://pytorch.org/"; + license = lib.licenses.bsd3; + maintainers = with lib.maintainers; [ + teh + thoughtpolice + tscholak + ]; # tscholak esp. 
for darwin-related builds + platforms = + lib.platforms.linux ++ lib.optionals (!cudaSupport && !rocmSupport) lib.platforms.darwin; + broken = builtins.any trivial.id (builtins.attrValues brokenConditions); + }; +} diff --git a/pkgs/python-modules/torch/source/2_8/fix-cmake-cuda-toolkit.patch b/pkgs/python-modules/torch/source/2_8/fix-cmake-cuda-toolkit.patch new file mode 100644 index 00000000..35b6d23c --- /dev/null +++ b/pkgs/python-modules/torch/source/2_8/fix-cmake-cuda-toolkit.patch @@ -0,0 +1,51 @@ +diff --git a/CMakeLists.txt b/CMakeLists.txt +index c4cd4b2c2a..e983b21353 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -1319,10 +1319,6 @@ if(BUILD_SHARED_LIBS) + DIRECTORY ${PROJECT_SOURCE_DIR}/cmake/Modules_CUDA_fix + DESTINATION share/cmake/Caffe2/ + COMPONENT dev) +- install( +- FILES ${PROJECT_SOURCE_DIR}/cmake/Modules/FindCUDAToolkit.cmake +- DESTINATION share/cmake/Caffe2/ +- COMPONENT dev) + install( + FILES ${PROJECT_SOURCE_DIR}/cmake/Modules/FindCUSPARSELT.cmake + DESTINATION share/cmake/Caffe2/ +diff --git a/cmake/public/cuda.cmake b/cmake/public/cuda.cmake +index b37036d2f2..9878b70cf7 100644 +--- a/cmake/public/cuda.cmake ++++ b/cmake/public/cuda.cmake +@@ -61,9 +61,15 @@ find_package(CUDAToolkit REQUIRED) + cmake_policy(POP) + + if(NOT CMAKE_CUDA_COMPILER_VERSION VERSION_EQUAL CUDAToolkit_VERSION) +- message(FATAL_ERROR "Found two conflicting CUDA versions:\n" +- "V${CMAKE_CUDA_COMPILER_VERSION} in '${CUDA_INCLUDE_DIRS}' and\n" +- "V${CUDAToolkit_VERSION} in '${CUDAToolkit_INCLUDE_DIRS}'") ++ if(CUDA_INCLUDE_DIRS IN_LIST CUDAToolkit_INCLUDE_DIR) ++ message(STATUS "CUDA_INCLUDE_DIRS is a substring of CUDAToolkit_INCLUDE_DIR. " ++ "Setting CUDA_INCLUDE_DIRS to CUDAToolkit_INCLUDE_DIR.") ++ set(CUDA_INCLUDE_DIRS "${CUDAToolkit_INCLUDE_DIR}") ++ else() ++ message(FATAL_ERROR "Found two conflicting CUDA installs:\n" ++ "V${CMAKE_CUDA_COMPILER_VERSION} in '${CUDA_INCLUDE_DIRS}' and\n" ++ "V${CUDAToolkit_VERSION} in '${CUDAToolkit_INCLUDE_DIR}'") ++ endif() + endif() + + if(NOT TARGET CUDA::nvToolsExt) +diff --git a/tools/setup_helpers/cmake.py b/tools/setup_helpers/cmake.py +index 4d10b3db1a..48a463df59 100644 +--- a/tools/setup_helpers/cmake.py ++++ b/tools/setup_helpers/cmake.py +@@ -206,6 +206,8 @@ class CMake: + "BUILDING_WITH_TORCH_LIBS", + "CUDA_HOST_COMPILER", + "CUDA_NVCC_EXECUTABLE", ++ "CUDAToolkit_ROOT", ++ "CUDAToolkit_INCLUDE_DIR", + "CUDA_SEPARABLE_COMPILATION", + "CUDNN_LIBRARY", + "CUDNN_INCLUDE_DIR", diff --git a/pkgs/python-modules/torch/source/2_8/mkl-rpath.patch b/pkgs/python-modules/torch/source/2_8/mkl-rpath.patch new file mode 100644 index 00000000..bbb1bc2e --- /dev/null +++ b/pkgs/python-modules/torch/source/2_8/mkl-rpath.patch @@ -0,0 +1,17 @@ +diff --git a/cmake/public/mkl.cmake b/cmake/public/mkl.cmake +index 2f6d1fd905..f30464be07 100644 +--- a/cmake/public/mkl.cmake ++++ b/cmake/public/mkl.cmake +@@ -16,12 +16,6 @@ foreach(MKL_LIB IN LISTS MKL_LIBRARIES) + endif() + endforeach() + +-# TODO: This is a hack, it will not pick up architecture dependent +-# MKL libraries correctly; see https://github.com/pytorch/pytorch/issues/73008 +-set_property( +- TARGET caffe2::mkl PROPERTY INTERFACE_LINK_DIRECTORIES +- ${MKL_ROOT}/lib ${MKL_ROOT}/lib/intel64 ${MKL_ROOT}/lib/intel64_win ${MKL_ROOT}/lib/win-x64) +- + if(UNIX) + if(USE_STATIC_MKL) + foreach(MKL_LIB_PATH IN LISTS MKL_LIBRARIES) diff --git a/pkgs/python-modules/torch/source/2_8/pthreadpool-disable-gcd.diff b/pkgs/python-modules/torch/source/2_8/pthreadpool-disable-gcd.diff new file mode 100644 
index 00000000..eddd4572 --- /dev/null +++ b/pkgs/python-modules/torch/source/2_8/pthreadpool-disable-gcd.diff @@ -0,0 +1,45 @@ +diff --git a/third_party/pthreadpool/CMakeLists.txt b/third_party/pthreadpool/CMakeLists.txt +index 0db3264..1ba91c4 100644 +--- a/third_party/pthreadpool/CMakeLists.txt ++++ b/third_party/pthreadpool/CMakeLists.txt +@@ -74,9 +74,7 @@ IF(EMSCRIPTEN) + LIST(APPEND PTHREADPOOL_SRCS src/shim.c) + ELSE() + LIST(APPEND PTHREADPOOL_SRCS src/portable-api.c src/memory.c) +- IF(APPLE AND (PTHREADPOOL_SYNC_PRIMITIVE STREQUAL "default" OR PTHREADPOOL_SYNC_PRIMITIVE STREQUAL "gcd")) +- LIST(APPEND PTHREADPOOL_SRCS src/gcd.c) +- ELSEIF(CMAKE_SYSTEM_NAME MATCHES "^(Windows|CYGWIN|MSYS)$" AND (PTHREADPOOL_SYNC_PRIMITIVE STREQUAL "default" OR PTHREADPOOL_SYNC_PRIMITIVE STREQUAL "event")) ++ IF(CMAKE_SYSTEM_NAME MATCHES "^(Windows|CYGWIN|MSYS)$" AND (PTHREADPOOL_SYNC_PRIMITIVE STREQUAL "default" OR PTHREADPOOL_SYNC_PRIMITIVE STREQUAL "event")) + LIST(APPEND PTHREADPOOL_SRCS src/windows.c) + ELSE() + LIST(APPEND PTHREADPOOL_SRCS src/pthreads.c) +@@ -111,10 +109,6 @@ ELSEIF(PTHREADPOOL_SYNC_PRIMITIVE STREQUAL "futex") + TARGET_COMPILE_DEFINITIONS(pthreadpool PRIVATE PTHREADPOOL_USE_FUTEX=1) + TARGET_COMPILE_DEFINITIONS(pthreadpool PRIVATE PTHREADPOOL_USE_GCD=0) + TARGET_COMPILE_DEFINITIONS(pthreadpool PRIVATE PTHREADPOOL_USE_EVENT=0) +-ELSEIF(PTHREADPOOL_SYNC_PRIMITIVE STREQUAL "gcd") +- TARGET_COMPILE_DEFINITIONS(pthreadpool PRIVATE PTHREADPOOL_USE_FUTEX=0) +- TARGET_COMPILE_DEFINITIONS(pthreadpool PRIVATE PTHREADPOOL_USE_GCD=1) +- TARGET_COMPILE_DEFINITIONS(pthreadpool PRIVATE PTHREADPOOL_USE_EVENT=0) + ELSEIF(PTHREADPOOL_SYNC_PRIMITIVE STREQUAL "event") + TARGET_COMPILE_DEFINITIONS(pthreadpool PRIVATE PTHREADPOOL_USE_FUTEX=0) + TARGET_COMPILE_DEFINITIONS(pthreadpool PRIVATE PTHREADPOOL_USE_GCD=0) +diff --git a/third_party/pthreadpool/src/threadpool-common.h b/third_party/pthreadpool/src/threadpool-common.h +index ca84744..244d0ca 100644 +--- a/third_party/pthreadpool/src/threadpool-common.h ++++ b/third_party/pthreadpool/src/threadpool-common.h +@@ -14,14 +14,6 @@ + #endif + #endif + +-#ifndef PTHREADPOOL_USE_GCD +- #if defined(__APPLE__) +- #define PTHREADPOOL_USE_GCD 1 +- #else +- #define PTHREADPOOL_USE_GCD 0 +- #endif +-#endif +- + #ifndef PTHREADPOOL_USE_EVENT + #if defined(_WIN32) || defined(__CYGWIN__) + #define PTHREADPOOL_USE_EVENT 1 diff --git a/pkgs/python-modules/torch/source/2_8/pytorch-pr-108847.patch b/pkgs/python-modules/torch/source/2_8/pytorch-pr-108847.patch new file mode 100644 index 00000000..9511191c --- /dev/null +++ b/pkgs/python-modules/torch/source/2_8/pytorch-pr-108847.patch @@ -0,0 +1,31 @@ +From bf4050edab9f294a8e0060c47f906cd7a80f25a2 Mon Sep 17 00:00:00 2001 +From: Samuel Ainsworth +Date: Sat, 9 Sep 2023 02:04:09 +0000 +Subject: [PATCH] Dependencies.cmake: support building against CUPTI outside of + CUDA_SOURCE_DIR + +Limitation discovered in https://github.com/NixOS/nixpkgs/pull/249259. 
+--- + cmake/Dependencies.cmake | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake +index 0602d534dc4c14..5f6a5f79f3e3dc 100644 +--- a/cmake/Dependencies.cmake ++++ b/cmake/Dependencies.cmake +@@ -1879,6 +1879,7 @@ if(USE_KINETO) + ${CUDA_SOURCE_DIR}/extras/CUPTI/lib64 + ${CUDA_SOURCE_DIR}/lib + ${CUDA_SOURCE_DIR}/lib64 ++ $ENV{CUPTI_LIBRARY_DIR} + NO_DEFAULT_PATH) + + find_path(CUPTI_INCLUDE_DIR cupti.h PATHS +@@ -1886,6 +1887,7 @@ if(USE_KINETO) + ${CUDA_INCLUDE_DIRS} + ${CUDA_SOURCE_DIR} + ${CUDA_SOURCE_DIR}/include ++ $ENV{CUPTI_INCLUDE_DIR} + NO_DEFAULT_PATH) + + if(CUPTI_LIBRARY_PATH AND CUPTI_INCLUDE_DIR) diff --git a/pkgs/python-modules/torch/source/2_9/0001-patch-xpu-ops-CMake.patch b/pkgs/python-modules/torch/source/2_9/0001-patch-xpu-ops-CMake.patch new file mode 100644 index 00000000..479446b0 --- /dev/null +++ b/pkgs/python-modules/torch/source/2_9/0001-patch-xpu-ops-CMake.patch @@ -0,0 +1,50 @@ +From 3b5dd2c142173a9a6afe6b21a3107e04d7fcc0ea Mon Sep 17 00:00:00 2001 +From: "Wang, Yi A" +Date: Mon, 25 Aug 2025 17:39:38 -0700 +Subject: [PATCH] patch xpu ops CMake + +Signed-off-by: Wang, Yi A +--- + cmake/BuildFlags.cmake | 1 + + cmake/Modules/FindSYCL.cmake | 6 +++++- + 2 files changed, 6 insertions(+), 1 deletion(-) + +diff --git a/cmake/BuildFlags.cmake b/cmake/BuildFlags.cmake +index ec9aaccb..8f4740df 100644 +--- a/cmake/BuildFlags.cmake ++++ b/cmake/BuildFlags.cmake +@@ -83,6 +83,7 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" + set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -fno-approx-func) + set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -Wno-absolute-value) + set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -no-ftz) ++ set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -D_GLIBCXX_USE_CXX11_ABI=1) + endif() + + if(CMAKE_BUILD_TYPE MATCHES Debug) +diff --git a/cmake/Modules/FindSYCL.cmake b/cmake/Modules/FindSYCL.cmake +index 86457ba3..0feec0fa 100644 +--- a/cmake/Modules/FindSYCL.cmake ++++ b/cmake/Modules/FindSYCL.cmake +@@ -34,7 +34,8 @@ + # SYCL_ADD_EXECUTABLE + # + # SYCL_ADD_LIBRARY +- ++set(SYCL_EXTRA_INCLUDE_DIRS $ENV{SYCL_EXTRA_INCLUDE_DIRS}) ++string(REPLACE " " ";" SYCL_EXTRA_INCLUDE_DIRS "${SYCL_EXTRA_INCLUDE_DIRS}") + macro(SYCL_FIND_HELPER_FILE _name _extension) + set(_full_name "${_name}.${_extension}") + # CMAKE_CURRENT_LIST_FILE contains the full path to the file currently being +@@ -216,6 +217,9 @@ macro(SYCL_WRAP_SRCS sycl_target generated_files) + set(generated_extension ${CMAKE_${SYCL_C_OR_CXX}_OUTPUT_EXTENSION}) + + set(SYCL_include_dirs "${SYCL_INCLUDE_DIR}") ++ if(DEFINED SYCL_EXTRA_INCLUDE_DIRS) ++ list(APPEND SYCL_include_dirs ${SYCL_EXTRA_INCLUDE_DIRS}) ++ endif() + list(APPEND SYCL_include_dirs "$") + + set(SYCL_compile_definitions "$") +-- +2.43.0 + diff --git a/pkgs/python-modules/torch/source/2_9/default.nix b/pkgs/python-modules/torch/source/2_9/default.nix new file mode 100644 index 00000000..24b7c8f4 --- /dev/null +++ b/pkgs/python-modules/torch/source/2_9/default.nix @@ -0,0 +1,744 @@ +{ + stdenv, + stdenvAdapters, + lib, + fetchFromGitHub, + fetchpatch, + buildPythonPackage, + python, + config, + cudaSupport ? config.cudaSupport, + cudaPackages, + autoAddDriverRunpath, + effectiveMagma ? + if cudaSupport then + magma-cuda-static + else if rocmSupport then + magma-hip + else + magma, + effectiveStdenv ? 
if cudaSupport then cudaPackages.backendStdenv else stdenv,
+  magma,
+  magma-hip,
+  magma-cuda-static,
+  # Use the system NCCL as long as we're targeting CUDA on a supported platform.
+  useSystemNccl ? (cudaSupport && !cudaPackages.nccl.meta.unsupported || rocmSupport),
+  MPISupport ? false,
+  mpi,
+  nvtx,
+  buildDocs ? false,
+  cxx11Abi ? true,
+
+  # tests.cudaAvailable:
+  callPackage,
+
+  # Native build inputs
+  cmake,
+  symlinkJoin,
+  which,
+  pybind11,
+  removeReferencesTo,
+
+  # Build inputs
+  apple-sdk_15,
+  libdrm,
+  numactl,
+
+  # dependencies
+  astunparse,
+  binutils,
+  expecttest,
+  filelock,
+  fsspec,
+  hypothesis,
+  jinja2,
+  networkx,
+  packaging,
+  psutil,
+  pyyaml,
+  requests,
+  setuptools,
+  sympy,
+  types-dataclasses,
+  typing-extensions,
+  # ROCm build and `torch.compile` requires `triton`
+  tritonSupport ? (!stdenv.hostPlatform.isDarwin),
+  triton,
+
+  # TODO: 1. callPackage needs to learn to distinguish between the task
+  #          of "asking for an attribute from the parent scope" and
+  #          the task of "exposing a formal parameter in .override".
+  # TODO: 2. We should probably abandon attributes such as `torchWithCuda` (etc.)
+  #          as they routinely end up consuming the wrong arguments
+  #          (dependencies without cuda support).
+  #          Instead we should rely on overlays and nixpkgsFun.
+  #          (@SomeoneSerge)
+  _tritonEffective ?
+    if cudaSupport then
+      triton-cuda
+    else if xpuSupport then
+      python.pkgs.triton-xpu_2_9
+    else
+      triton,
+  triton-cuda,
+
+  # Disable MKLDNN on aarch64-darwin, it negatively impacts performance;
+  # this is also what the official pytorch build does.
+  mklDnnSupport ? !(stdenv.hostPlatform.isDarwin && stdenv.hostPlatform.isAarch64),
+
+  # virtual pkg that consistently instantiates blas across nixpkgs
+  # See https://github.com/NixOS/nixpkgs/pull/83888
+  blas,
+
+  # ninja (https://ninja-build.org) must be available to run C++ extensions tests,
+  ninja,
+
+  # dependencies for torch.utils.tensorboard
+  pillow,
+  six,
+  tensorboard,
+  protobuf,
+
+  # ROCm dependencies
+  rocmSupport ? config.rocmSupport,
+  rocmPackages,
+  xpuSupport ? (config.xpuSupport or false),
+  xpuPackages,
+  gpuTargets ? [ ],
+}:
+
+let
+  inherit (lib)
+    attrsets
+    lists
+    strings
+    trivial
+    ;
+  inherit (cudaPackages) cudnn nccl;
+  cudaFlags = cudaPackages.flags;
+
+  triton = throw "python3Packages.torch: use _tritonEffective instead of triton to avoid divergence";
+
+  setBool = v: if v then "1" else "0";
+
+  archs = (import ../../archs.nix)."2.9";
+
+  supportedTorchCudaCapabilities =
+    let
+      inherit (archs) capsPerCudaVersion;
+      real = capsPerCudaVersion."${lib.versions.majorMinor cudaPackages.cudaMajorMinorVersion}";
+      ptx = lists.map (x: "${x}+PTX") real;
+    in
+    real ++ ptx;
+
+  inherit (archs) supportedTorchRocmArchs;
+
+  # NOTE: The lists.subtractLists function is perhaps a bit unintuitive. It subtracts the elements
+  # of the first list *from* the second list. That means:
+  # lists.subtractLists a b = b - a
+
+  # For CUDA
+  supportedCudaCapabilities = lists.intersectLists cudaFlags.cudaCapabilities supportedTorchCudaCapabilities;
+  unsupportedCudaCapabilities = lists.subtractLists supportedCudaCapabilities cudaFlags.cudaCapabilities;
+
+  # Use trivial.throwIf to fail evaluation when no supported GPU targets remain,
+  # listing the unsupported targets that were requested.
+  gpuArchWarner =
+    supported: unsupported:
+    trivial.throwIf (supported == [ ]) (
+      "No supported GPU targets specified. Requested GPU targets: "
+      + strings.concatStringsSep ", " unsupported
+    ) supported;
+
+  # Create the gpuTargetString.
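+  # For example, a cudaSupport build whose selected capabilities are
+  # [ "9.0" "9.0+PTX" ] yields "9.0;9.0+PTX", which is the format that
+  # TORCH_CUDA_ARCH_LIST expects in preConfigure below.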
+ gpuTargetString = strings.concatStringsSep ";" ( + if gpuTargets != [ ] then + # If gpuTargets is specified, it always takes priority. + gpuTargets + else if cudaSupport then + gpuArchWarner supportedCudaCapabilities unsupportedCudaCapabilities + else if rocmSupport then + supportedTorchRocmArchs + else + throw "No GPU targets specified" + ); + + rocmtoolkit_joined = symlinkJoin { + name = "rocm-merged"; + + paths = with rocmPackages; [ + aotriton_0_11 + clr + comgr + hipblas + hipblas-common-devel + hipblaslt + hipfft + hipify-clang + hiprand + hipsolver + hipsparse + hipsparselt + hsa-rocr + miopen-hip + rccl + rocblas + rocm-core + rocm-device-libs + rocm-hip-runtime + rocm-smi-lib + rocminfo + rocrand + rocsolver + rocsparse + roctracer + ]; + + postBuild = '' + # Fix `setuptools` not being found + rm -rf $out/nix-support + + # Variables that we want to pass through to downstream derivations. + mkdir -p $out/nix-support + echo 'export ROCM_PATH="${placeholder "out"}"' >> $out/nix-support/setup-hook + echo 'export ROCM_SOURCE_DIR="${placeholder "out"}"' >> $out/nix-support/setup-hook + echo 'export CMAKE_CXX_FLAGS="-I${placeholder "out"}/include -I${placeholder "out"}/include/rocblas"' >> $out/nix-support/setup-hook + ''; + }; + + brokenConditions = attrsets.filterAttrs (_: cond: cond) { + "CUDA and ROCm are mutually exclusive" = cudaSupport && rocmSupport; + "CUDA is not targeting Linux" = cudaSupport && !stdenv.hostPlatform.isLinux; + "Unsupported CUDA version" = + cudaSupport + && !(builtins.elem cudaPackages.cudaMajorVersion [ + "12" + "13" + ]); + "MPI cudatoolkit does not match cudaPackages.cudatoolkit" = + MPISupport && cudaSupport && (mpi.cudatoolkit != cudaPackages.cudatoolkit); + # This used to be a deep package set comparison between cudaPackages and + # effectiveMagma.cudaPackages, making torch too strict in cudaPackages. + # In particular, this triggered warnings from cuda's `aliases.nix` + "Magma cudaPackages does not match cudaPackages" = + cudaSupport + && (effectiveMagma.cudaPackages.cudaMajorMinorVersion != cudaPackages.cudaMajorMinorVersion); + #"Rocm support is currently broken because `rocmPackages.hipblaslt` is unpackaged. (2024-06-09)" = + # rocmSupport; + }; + torchXpuOpsSrc = + if xpuSupport then + fetchFromGitHub { + owner = "intel"; + repo = "torch-xpu-ops"; + rev = "f8408a642da568051ab82e20f2947b89e491fbeb"; + hash = "sha256-eoT8mvaPw1NFFTYFVT6NUqOFOo4rDdNrIseF+FDpXUk="; + } + else + null; +in +buildPythonPackage rec { + pname = "torch"; + version = "2.9.0"; + pyproject = true; + + stdenv = effectiveStdenv; + + outputs = [ + "out" # output standard python package + "dev" # output libtorch headers + "lib" # output libtorch libraries + "cxxdev" # propagated deps for the cmake consumers of torch + ]; + cudaPropagateToOutput = "cxxdev"; + rocmPropagateToOutput = "cxxdev"; + + src = fetchFromGitHub { + owner = "pytorch"; + repo = "pytorch"; + tag = "v${version}"; + fetchSubmodules = true; + hash = "sha256-Jszhe67FteiSbkbUEjVIkWVUjUY8IS5qVHct4HvcfIg="; + }; + + patches = [ + ./mkl-rpath.patch + ] + ++ lib.optionals cudaSupport [ ./fix-cmake-cuda-toolkit.patch ] + ++ lib.optionals (stdenv.hostPlatform.isDarwin && stdenv.hostPlatform.isx86_64) [ + # pthreadpool added support for Grand Central Dispatch in April + # 2020. However, this relies on functionality (DISPATCH_APPLY_AUTO) + # that is available starting with macOS 10.13. However, our current + # base is 10.12. Until we upgrade, we can fall back on the older + # pthread support. 
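+    # The diff below drops pthreadpool's GCD code paths (src/gcd.c and the
+    # PTHREADPOOL_USE_GCD definitions) so that the portable pthreads
+    # implementation is used instead.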
+ ./pthreadpool-disable-gcd.diff + ] + ++ lib.optionals stdenv.hostPlatform.isLinux [ + # Propagate CUPTI to Kineto by overriding the search path with environment variables. + # https://github.com/pytorch/pytorch/pull/108847 + ./pytorch-pr-108847.patch + ]; + + postUnpack = lib.optionalString xpuSupport '' + cp -r --no-preserve=mode ${torchXpuOpsSrc} $sourceRoot/third_party/torch-xpu-ops + patch -d $sourceRoot/third_party/torch-xpu-ops -p1 < ${./0001-patch-xpu-ops-CMake.patch} + ''; + + postPatch = + let + pyiGenPath = "${typing-extensions}/${python.sitePackages}:${pyyaml}/${python.sitePackages}"; + in + '' + substituteInPlace pyproject.toml \ + --replace-fail "setuptools>=70.1.0,<80.0" \ + "setuptools>=70.1.0" + + substituteInPlace cmake/public/cuda.cmake \ + --replace-fail \ + 'message(FATAL_ERROR "Found two conflicting CUDA' \ + 'message(WARNING "Found two conflicting CUDA' \ + --replace-warn \ + "set(CUDAToolkit_ROOT" \ + "# Upstream: set(CUDAToolkit_ROOT" + substituteInPlace third_party/gloo/cmake/Cuda.cmake \ + --replace-warn "find_package(CUDAToolkit 7.0" "find_package(CUDAToolkit" + + # annotations (3.7), print_function (3.0), with_statement (2.6) are all supported + sed -i -e "/from __future__ import/d" **.py + #substituteInPlace third_party/NNPACK/CMakeLists.txt \ + # --replace-fail "PYTHONPATH=" 'PYTHONPATH=$ENV{PYTHONPATH}:' + # flag from cmakeFlags doesn't work, not clear why + # setting it at the top of NNPACK's own CMakeLists does + sed -i '2s;^;set(PYTHON_SIX_SOURCE_DIR ${six.src})\n;' third_party/NNPACK/CMakeLists.txt + + # Ensure that torch profiler unwind uses addr2line from nix + substituteInPlace torch/csrc/profiler/unwind/unwind.cpp \ + --replace-fail 'addr2line_binary_ = "addr2line"' 'addr2line_binary_ = "${lib.getExe' binutils "addr2line"}"' + + # gen_pyi needs typing-extensions. + #substituteInPlace torch/CMakeLists.txt \ + # --replace-fail "env PYTHONPATH=\"\''${TORCH_ROOT}\"" \ + # "env PYTHONPATH=\"\''${TORCH_ROOT}:${pyiGenPath}\"" + '' + + lib.optionalString rocmSupport '' + # https://github.com/facebookincubator/gloo/pull/297 + substituteInPlace third_party/gloo/cmake/Hipify.cmake \ + --replace-fail "\''${HIPIFY_COMMAND}" "python \''${HIPIFY_COMMAND}" + + # Replace hard-coded rocm paths + substituteInPlace caffe2/CMakeLists.txt \ + --replace-fail "/opt/rocm" "${rocmtoolkit_joined}" + '' + # Detection of NCCL version doesn't work particularly well when using the static binary. + + lib.optionalString cudaSupport '' + substituteInPlace cmake/Modules/FindNCCL.cmake \ + --replace-fail \ + 'message(FATAL_ERROR "Found NCCL header version and library version' \ + 'message(WARNING "Found NCCL header version and library version' + '' + # Remove PyTorch's FindCUDAToolkit.cmake and use CMake's default. + # NOTE: Parts of pytorch rely on unmaintained FindCUDA.cmake with custom patches to support e.g. + # newer architectures (sm_90a). 
We do want to delete vendored patches, but have to keep them
+    # until https://github.com/pytorch/pytorch/issues/76082 is addressed
+    + lib.optionalString cudaSupport ''
+      rm cmake/Modules/FindCUDAToolkit.cmake
+    ''
+    + lib.optionalString xpuSupport ''
+      # replace oneapi DIR
+      substituteInPlace cmake/Modules/FindMKL.cmake \
+        --replace-fail 'SET(DEFAULT_INTEL_ONEAPI_DIR "/opt/intel/oneapi")' 'SET(DEFAULT_INTEL_ONEAPI_DIR ${xpuPackages.oneapi-torch-dev}/oneapi)'
+      # replace mkldnn build for xpu
+      sed -i '/ExternalProject_Add(xpu_mkldnn_proj/,/^ *)/s/^/#/' cmake/Modules/FindMKLDNN.cmake
+      substituteInPlace cmake/Modules/FindMKLDNN.cmake \
+        --replace-fail 'ExternalProject_Get_Property(xpu_mkldnn_proj SOURCE_DIR BINARY_DIR)' '# ExternalProject_Get_Property(xpu_mkldnn_proj SOURCE_DIR BINARY_DIR)' \
+        --replace-fail "set(XPU_MKLDNN_LIBRARIES \''${BINARY_DIR}/src/\''${DNNL_LIB_NAME})" "set(XPU_MKLDNN_LIBRARIES ${xpuPackages.onednn-xpu}/lib/libdnnl.a)" \
+        --replace-fail "set(XPU_MKLDNN_INCLUDE \''${SOURCE_DIR}/include \''${BINARY_DIR}/include)" "set(XPU_MKLDNN_INCLUDE ${xpuPackages.onednn-xpu}/include)"
+      # comment out the torch-xpu-ops git clone block in pytorch/caffe2/CMakeLists.txt
+      sed -i '/set(TORCH_XPU_OPS_REPO_URL/,/^ endif()/s/^/#/' caffe2/CMakeLists.txt
+      sed -i '/execute_process(/,/^ endif()/s/^/#/' caffe2/CMakeLists.txt
+    ''
+    # error: no member named 'aligned_alloc' in the global namespace; did you mean simply 'aligned_alloc'
+    # This lib overrode aligned_alloc, hence the error message. Tl;dr: this function is linkable but not declared in the header.
+
+    + lib.optionalString
+      (stdenv.hostPlatform.isDarwin && lib.versionOlder stdenv.hostPlatform.darwinSdkVersion "11.0")
+      ''
+        substituteInPlace third_party/pocketfft/pocketfft_hdronly.h --replace-fail '#if (__cplusplus >= 201703L) && (!defined(__MINGW32__)) && (!defined(_MSC_VER))
+        inline void *aligned_alloc(size_t align, size_t size)' '#if 0
+        inline void *aligned_alloc(size_t align, size_t size)'
+      '';
+
+  # NOTE(@connorbaker): Though we do not disable Gloo or MPI when building with CUDA support, caution should be taken
+  # when using the different backends. Gloo's GPU support isn't great, and MPI and CUDA can't be used at the same time
+  # without extreme care to ensure they don't lock each other out of shared resources.
+  # For more, see https://github.com/open-mpi/ompi/issues/7733#issuecomment-629806195.
+  preConfigure =
+    lib.optionalString cudaSupport ''
+      export TORCH_CUDA_ARCH_LIST="${gpuTargetString}"
+      export CUPTI_INCLUDE_DIR=${lib.getDev cudaPackages.cuda_cupti}/include
+      export CUPTI_LIBRARY_DIR=${lib.getLib cudaPackages.cuda_cupti}/lib
+    ''
+    + lib.optionalString (cudaSupport && cudaPackages ? cudnn) ''
+      export CUDNN_INCLUDE_DIR=${lib.getLib cudnn}/include
+      export CUDNN_LIB_DIR=${cudnn.lib}/lib
+    ''
+    + lib.optionalString rocmSupport ''
+      export PYTORCH_ROCM_ARCH="${gpuTargetString}"
+      python tools/amd_build/build_amd.py
+    '';
+
+  # Use pytorch's custom configurations
+  dontUseCmakeConfigure = true;
+
+  # causes possible redefinition of _FORTIFY_SOURCE
+  hardeningDisable = [ "fortify3" ];
+
+  BUILD_NAMEDTENSOR = setBool true;
+  BUILD_DOCS = setBool buildDocs;
+
+  # We only do an imports check, so do not build tests either.
+  BUILD_TEST = setBool false;
+
+  # ninja hook doesn't automatically turn on ninja
+  # because pytorch setup.py is responsible for this
+  CMAKE_GENERATOR = "Ninja";
+
+  # Whether to use C++11 ABI (or earlier).
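+  # setBool cxx11Abi feeds the _GLIBCXX_USE_CXX11_ABI definition below: "1"
+  # selects the modern libstdc++ std::string/std::list ABI, and C++ extensions
+  # linked against libtorch must be compiled with the same value. The xpu-ops
+  # patch applied in postUnpack pins -D_GLIBCXX_USE_CXX11_ABI=1 for SYCL
+  # kernels for the same reason.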
+ _GLIBCXX_USE_CXX11_ABI = setBool cxx11Abi; + + # Unlike MKL, oneDNN (née MKLDNN) is FOSS, so we enable support for + # it by default. PyTorch currently uses its own vendored version + # of oneDNN through Intel iDeep. + USE_MKLDNN = setBool mklDnnSupport; + USE_MKLDNN_CBLAS = setBool mklDnnSupport; + + # Avoid using pybind11 from git submodule + # Also avoids pytorch exporting the headers of pybind11 + USE_SYSTEM_PYBIND11 = true; + + cmakeFlags = [ + # (lib.cmakeBool "CMAKE_FIND_DEBUG_MODE" true) + (lib.cmakeFeature "CUDAToolkit_VERSION" cudaPackages.cudaMajorMinorVersion) + ] + ++ lib.optionals cudaSupport [ + # Unbreaks version discovery in enable_language(CUDA) when wrapping nvcc with ccache + # Cf. https://gitlab.kitware.com/cmake/cmake/-/issues/26363 + (lib.cmakeFeature "CMAKE_CUDA_COMPILER_TOOLKIT_VERSION" cudaPackages.cudaMajorMinorVersion) + ]; + + preBuild = '' + export MAX_JOBS=$NIX_BUILD_CORES + ${python.pythonOnBuildForHost.interpreter} setup.py build --cmake-only + ${cmake}/bin/cmake build + ''; + + preFixup = '' + function join_by { local IFS="$1"; shift; echo "$*"; } + function strip2 { + IFS=':' + read -ra RP <<< $(patchelf --print-rpath $1) + IFS=' ' + RP_NEW=$(join_by : ''${RP[@]:2}) + patchelf --set-rpath \$ORIGIN:''${RP_NEW} "$1" + } + for f in $(find ''${out} -name 'libcaffe2*.so') + do + strip2 $f + done + ''; + + # Override the (weirdly) wrong version set by default. See + # https://github.com/NixOS/nixpkgs/pull/52437#issuecomment-449718038 + # https://github.com/pytorch/pytorch/blob/v1.0.0/setup.py#L267 + PYTORCH_BUILD_VERSION = version; + PYTORCH_BUILD_NUMBER = 0; + + # In-tree builds of NCCL are not supported. + # Use NCCL when cudaSupport is enabled and nccl is available. + USE_NCCL = setBool useSystemNccl; + USE_SYSTEM_NCCL = USE_NCCL; + USE_STATIC_NCCL = USE_NCCL; + + # Set the correct Python library path, broken since + # https://github.com/pytorch/pytorch/commit/3d617333e + PYTHON_LIB_REL_PATH = "${placeholder "out"}/${python.sitePackages}"; + + # Suppress a weird warning in mkl-dnn, part of ideep in pytorch + # (upstream seems to have fixed this in the wrong place?) 
+ # https://github.com/intel/mkl-dnn/commit/8134d346cdb7fe1695a2aa55771071d455fae0bc + # https://github.com/pytorch/pytorch/issues/22346 + # + # Also of interest: pytorch ignores CXXFLAGS uses CFLAGS for both C and C++: + # https://github.com/pytorch/pytorch/blob/v1.11.0/setup.py#L17 + env = { + # Builds faster without this and we don't have enough inputs that cmd length is an issue + NIX_CC_USE_RESPONSE_FILE = 0; + + NIX_CFLAGS_COMPILE = toString ( + (lib.optionals (blas.implementation == "mkl") [ "-Wno-error=array-bounds" ] ++ [ "-Wno-error" ]) + ); + } + // lib.optionalAttrs rocmSupport { + AOTRITON_INSTALLED_PREFIX = rocmPackages.aotriton_0_10; + } + // lib.optionalAttrs stdenv.hostPlatform.isDarwin { + USE_MPS = 1; + } + // lib.optionalAttrs xpuSupport { + MKLROOT = xpuPackages.oneapi-torch-dev; + SYCL_ROOT = xpuPackages.oneapi-torch-dev; + }; + + nativeBuildInputs = [ + cmake + ninja + pybind11 + removeReferencesTo + which + ] + ++ lib.optionals cudaSupport ( + with cudaPackages; + [ + autoAddDriverRunpath + cuda_nvcc + ] + ) + ++ lib.optionals rocmSupport [ + rocmtoolkit_joined + rocmPackages.setupRocmHook + ] + ++ lib.optionals xpuSupport ( + with xpuPackages; + [ + ocloc + oneapi-torch-dev + ] + ); + + buildInputs = [ + blas + blas.provider + ] + ++ lib.optionals cudaSupport ( + with cudaPackages; + [ + cuda_cccl # + cuda_cudart # cuda_runtime.h and libraries + cuda_cupti # For kineto + cuda_nvcc # crt/host_config.h; even though we include this in nativeBuildInputs, it's needed here too + cuda_nvml_dev # + cuda_nvrtc + #cuda_nvtx # -llibNVToolsExt + cuda_profiler_api # + nvtx + libcublas + libcufile + libcufft + libcurand + libcusolver + libcusparse + ] + ++ lists.optionals (cudaPackages ? cudnn) [ cudnn ] + ++ lists.optionals useSystemNccl [ + # Some platforms do not support NCCL (i.e., Jetson) + nccl # Provides nccl.h AND a static copy of NCCL! 
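+        # (USE_STATIC_NCCL above makes the build link this static copy)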
+ ] + ) + ++ lib.optionals rocmSupport ( + with rocmPackages; + [ + composablekernel-devel + hipcub-devel + libdrm + openmp + rocmtoolkit_joined + rocprim-devel + rocthrust-devel + ] + ) + ++ lib.optionals xpuSupport ( + with xpuPackages; + [ + oneapi-torch-dev + onednn-xpu + ] + ) + ++ lib.optionals (cudaSupport || rocmSupport) [ effectiveMagma ] + ++ lib.optionals stdenv.hostPlatform.isLinux [ numactl ] + ++ lib.optionals stdenv.hostPlatform.isDarwin [ + apple-sdk_15 + ] + ++ lib.optionals tritonSupport [ _tritonEffective ] + ++ lib.optionals MPISupport [ mpi ]; + + pythonRelaxDeps = [ + "sympy" + ]; + dependencies = [ + astunparse + expecttest + filelock + fsspec + hypothesis + jinja2 + networkx + ninja + packaging + psutil + pyyaml + requests + sympy + types-dataclasses + typing-extensions + + # the following are required for tensorboard support + pillow + six + tensorboard + protobuf + + # torch/csrc requires `pybind11` at runtime + pybind11 + ] + ++ lib.optionals (lib.versionAtLeast python.version "3.12") [ setuptools ] + ++ lib.optionals tritonSupport [ _tritonEffective ]; + + propagatedCxxBuildInputs = + [ ] ++ lib.optionals MPISupport [ mpi ] ++ lib.optionals rocmSupport [ rocmtoolkit_joined ]; + + # Tests take a long time and may be flaky, so just sanity-check imports + doCheck = false; + + pythonImportsCheck = [ "torch" ]; + + nativeCheckInputs = [ + hypothesis + ninja + psutil + ]; + + checkPhase = + with lib.versions; + with lib.strings; + concatStringsSep " " [ + "runHook preCheck" + "${python.interpreter} test/run_test.py" + "--exclude" + (concatStringsSep " " [ + "utils" # utils requires git, which is not allowed in the check phase + + # "dataloader" # psutils correctly finds and triggers multiprocessing, but is too sandboxed to run -- resulting in numerous errors + # ^^^^^^^^^^^^ NOTE: while test_dataloader does return errors, these are acceptable errors and do not interfere with the build + + # tensorboard has acceptable failures for pytorch 1.3.x due to dependencies on tensorboard-plugins + (optionalString (majorMinor version == "1.3") "tensorboard") + ]) + "runHook postCheck" + ]; + + pythonRemoveDeps = [ + # In our dist-info the name is just "triton" + "pytorch-triton-rocm" + ]; + + postInstall = '' + find "$out/${python.sitePackages}/torch/include" "$out/${python.sitePackages}/torch/lib" -type f -exec remove-references-to -t ${effectiveStdenv.cc} '{}' + + + mkdir $dev + cp -r $out/${python.sitePackages}/torch/include $dev/include + cp -r $out/${python.sitePackages}/torch/share $dev/share + + # Fix up library paths for split outputs + substituteInPlace \ + $dev/share/cmake/Torch/TorchConfig.cmake \ + --replace-fail \''${TORCH_INSTALL_PREFIX}/lib "$lib/lib" + + substituteInPlace \ + $dev/share/cmake/Caffe2/Caffe2Targets-release.cmake \ + --replace-fail \''${_IMPORT_PREFIX}/lib "$lib/lib" + + mkdir $lib + mv $out/${python.sitePackages}/torch/lib $lib/lib + ln -s $lib/lib $out/${python.sitePackages}/torch/lib + '' + + lib.optionalString rocmSupport '' + substituteInPlace $dev/share/cmake/Tensorpipe/TensorpipeTargets-release.cmake \ + --replace-fail "\''${_IMPORT_PREFIX}/lib64" "$lib/lib" + + substituteInPlace $dev/share/cmake/ATen/ATenConfig.cmake \ + --replace-fail "/build/source/torch/include" "$dev/include" + ''; + + postFixup = '' + mkdir -p "$cxxdev/nix-support" + printWords "''${propagatedCxxBuildInputs[@]}" >> "$cxxdev/nix-support/propagated-build-inputs" + '' + + lib.optionalString stdenv.hostPlatform.isDarwin '' + for f in $(ls $lib/lib/*.dylib); do + 
install_name_tool -id $lib/lib/$(basename $f) $f || true + done + + install_name_tool -change @rpath/libshm.dylib $lib/lib/libshm.dylib $lib/lib/libtorch_python.dylib + install_name_tool -change @rpath/libtorch.dylib $lib/lib/libtorch.dylib $lib/lib/libtorch_python.dylib + install_name_tool -change @rpath/libc10.dylib $lib/lib/libc10.dylib $lib/lib/libtorch_python.dylib + + install_name_tool -change @rpath/libc10.dylib $lib/lib/libc10.dylib $lib/lib/libtorch.dylib + + install_name_tool -change @rpath/libtorch.dylib $lib/lib/libtorch.dylib $lib/lib/libshm.dylib + install_name_tool -change @rpath/libc10.dylib $lib/lib/libc10.dylib $lib/lib/libshm.dylib + ''; + + # See https://github.com/NixOS/nixpkgs/issues/296179 + # + # This is a quick hack to add `libnvrtc` to the runpath so that torch can find + # it when it is needed at runtime. + extraRunpaths = lib.optionals cudaSupport [ "${lib.getLib cudaPackages.cuda_nvrtc}/lib" ]; + postPhases = lib.optionals stdenv.hostPlatform.isLinux [ "postPatchelfPhase" ]; + postPatchelfPhase = '' + while IFS= read -r -d $'\0' elf ; do + for extra in $extraRunpaths ; do + echo patchelf "$elf" --add-rpath "$extra" >&2 + patchelf "$elf" --add-rpath "$extra" + done + done < <( + find "''${!outputLib}" "$out" -type f -iname '*.so' -print0 + ) + ''; + + # Builds in 2+h with 2 cores, and ~15m with a big-parallel builder. + requiredSystemFeatures = [ "big-parallel" ]; + + passthru = { + inherit + cudaSupport + cudaPackages + cxx11Abi + rocmSupport + rocmPackages + xpuSupport + xpuPackages + ; + cudaCapabilities = if cudaSupport then supportedCudaCapabilities else [ ]; + rocmArchs = if rocmSupport then supportedTorchRocmArchs else [ ]; + # At least for 1.10.2 `torch.fft` is unavailable unless BLAS provider is MKL. This attribute allows for easy detection of its availability. + blasProvider = blas.provider; + # To help debug when a package is broken due to CUDA support + inherit brokenConditions; + tests = callPackage ./tests.nix { }; + }; + + meta = { + changelog = "https://github.com/pytorch/pytorch/releases/tag/v${version}"; + # keep PyTorch in the description so the package can be found under that name on search.nixos.org + description = "PyTorch: Tensors and Dynamic neural networks in Python with strong GPU acceleration"; + homepage = "https://pytorch.org/"; + license = lib.licenses.bsd3; + maintainers = with lib.maintainers; [ + teh + thoughtpolice + tscholak + ]; # tscholak esp. 
for darwin-related builds + platforms = + lib.platforms.linux ++ lib.optionals (!cudaSupport && !rocmSupport) lib.platforms.darwin; + broken = builtins.any trivial.id (builtins.attrValues brokenConditions); + }; +} diff --git a/pkgs/python-modules/torch/source/2_9/fix-cmake-cuda-toolkit.patch b/pkgs/python-modules/torch/source/2_9/fix-cmake-cuda-toolkit.patch new file mode 100644 index 00000000..35b6d23c --- /dev/null +++ b/pkgs/python-modules/torch/source/2_9/fix-cmake-cuda-toolkit.patch @@ -0,0 +1,51 @@ +diff --git a/CMakeLists.txt b/CMakeLists.txt +index c4cd4b2c2a..e983b21353 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -1319,10 +1319,6 @@ if(BUILD_SHARED_LIBS) + DIRECTORY ${PROJECT_SOURCE_DIR}/cmake/Modules_CUDA_fix + DESTINATION share/cmake/Caffe2/ + COMPONENT dev) +- install( +- FILES ${PROJECT_SOURCE_DIR}/cmake/Modules/FindCUDAToolkit.cmake +- DESTINATION share/cmake/Caffe2/ +- COMPONENT dev) + install( + FILES ${PROJECT_SOURCE_DIR}/cmake/Modules/FindCUSPARSELT.cmake + DESTINATION share/cmake/Caffe2/ +diff --git a/cmake/public/cuda.cmake b/cmake/public/cuda.cmake +index b37036d2f2..9878b70cf7 100644 +--- a/cmake/public/cuda.cmake ++++ b/cmake/public/cuda.cmake +@@ -61,9 +61,15 @@ find_package(CUDAToolkit REQUIRED) + cmake_policy(POP) + + if(NOT CMAKE_CUDA_COMPILER_VERSION VERSION_EQUAL CUDAToolkit_VERSION) +- message(FATAL_ERROR "Found two conflicting CUDA versions:\n" +- "V${CMAKE_CUDA_COMPILER_VERSION} in '${CUDA_INCLUDE_DIRS}' and\n" +- "V${CUDAToolkit_VERSION} in '${CUDAToolkit_INCLUDE_DIRS}'") ++ if(CUDA_INCLUDE_DIRS IN_LIST CUDAToolkit_INCLUDE_DIR) ++ message(STATUS "CUDA_INCLUDE_DIRS is a substring of CUDAToolkit_INCLUDE_DIR. " ++ "Setting CUDA_INCLUDE_DIRS to CUDAToolkit_INCLUDE_DIR.") ++ set(CUDA_INCLUDE_DIRS "${CUDAToolkit_INCLUDE_DIR}") ++ else() ++ message(FATAL_ERROR "Found two conflicting CUDA installs:\n" ++ "V${CMAKE_CUDA_COMPILER_VERSION} in '${CUDA_INCLUDE_DIRS}' and\n" ++ "V${CUDAToolkit_VERSION} in '${CUDAToolkit_INCLUDE_DIR}'") ++ endif() + endif() + + if(NOT TARGET CUDA::nvToolsExt) +diff --git a/tools/setup_helpers/cmake.py b/tools/setup_helpers/cmake.py +index 4d10b3db1a..48a463df59 100644 +--- a/tools/setup_helpers/cmake.py ++++ b/tools/setup_helpers/cmake.py +@@ -206,6 +206,8 @@ class CMake: + "BUILDING_WITH_TORCH_LIBS", + "CUDA_HOST_COMPILER", + "CUDA_NVCC_EXECUTABLE", ++ "CUDAToolkit_ROOT", ++ "CUDAToolkit_INCLUDE_DIR", + "CUDA_SEPARABLE_COMPILATION", + "CUDNN_LIBRARY", + "CUDNN_INCLUDE_DIR", diff --git a/pkgs/python-modules/torch/source/2_9/mkl-rpath.patch b/pkgs/python-modules/torch/source/2_9/mkl-rpath.patch new file mode 100644 index 00000000..bbb1bc2e --- /dev/null +++ b/pkgs/python-modules/torch/source/2_9/mkl-rpath.patch @@ -0,0 +1,17 @@ +diff --git a/cmake/public/mkl.cmake b/cmake/public/mkl.cmake +index 2f6d1fd905..f30464be07 100644 +--- a/cmake/public/mkl.cmake ++++ b/cmake/public/mkl.cmake +@@ -16,12 +16,6 @@ foreach(MKL_LIB IN LISTS MKL_LIBRARIES) + endif() + endforeach() + +-# TODO: This is a hack, it will not pick up architecture dependent +-# MKL libraries correctly; see https://github.com/pytorch/pytorch/issues/73008 +-set_property( +- TARGET caffe2::mkl PROPERTY INTERFACE_LINK_DIRECTORIES +- ${MKL_ROOT}/lib ${MKL_ROOT}/lib/intel64 ${MKL_ROOT}/lib/intel64_win ${MKL_ROOT}/lib/win-x64) +- + if(UNIX) + if(USE_STATIC_MKL) + foreach(MKL_LIB_PATH IN LISTS MKL_LIBRARIES) diff --git a/pkgs/python-modules/torch/source/2_9/pthreadpool-disable-gcd.diff b/pkgs/python-modules/torch/source/2_9/pthreadpool-disable-gcd.diff new file mode 100644 
index 00000000..eddd4572 --- /dev/null +++ b/pkgs/python-modules/torch/source/2_9/pthreadpool-disable-gcd.diff @@ -0,0 +1,45 @@ +diff --git a/third_party/pthreadpool/CMakeLists.txt b/third_party/pthreadpool/CMakeLists.txt +index 0db3264..1ba91c4 100644 +--- a/third_party/pthreadpool/CMakeLists.txt ++++ b/third_party/pthreadpool/CMakeLists.txt +@@ -74,9 +74,7 @@ IF(EMSCRIPTEN) + LIST(APPEND PTHREADPOOL_SRCS src/shim.c) + ELSE() + LIST(APPEND PTHREADPOOL_SRCS src/portable-api.c src/memory.c) +- IF(APPLE AND (PTHREADPOOL_SYNC_PRIMITIVE STREQUAL "default" OR PTHREADPOOL_SYNC_PRIMITIVE STREQUAL "gcd")) +- LIST(APPEND PTHREADPOOL_SRCS src/gcd.c) +- ELSEIF(CMAKE_SYSTEM_NAME MATCHES "^(Windows|CYGWIN|MSYS)$" AND (PTHREADPOOL_SYNC_PRIMITIVE STREQUAL "default" OR PTHREADPOOL_SYNC_PRIMITIVE STREQUAL "event")) ++ IF(CMAKE_SYSTEM_NAME MATCHES "^(Windows|CYGWIN|MSYS)$" AND (PTHREADPOOL_SYNC_PRIMITIVE STREQUAL "default" OR PTHREADPOOL_SYNC_PRIMITIVE STREQUAL "event")) + LIST(APPEND PTHREADPOOL_SRCS src/windows.c) + ELSE() + LIST(APPEND PTHREADPOOL_SRCS src/pthreads.c) +@@ -111,10 +109,6 @@ ELSEIF(PTHREADPOOL_SYNC_PRIMITIVE STREQUAL "futex") + TARGET_COMPILE_DEFINITIONS(pthreadpool PRIVATE PTHREADPOOL_USE_FUTEX=1) + TARGET_COMPILE_DEFINITIONS(pthreadpool PRIVATE PTHREADPOOL_USE_GCD=0) + TARGET_COMPILE_DEFINITIONS(pthreadpool PRIVATE PTHREADPOOL_USE_EVENT=0) +-ELSEIF(PTHREADPOOL_SYNC_PRIMITIVE STREQUAL "gcd") +- TARGET_COMPILE_DEFINITIONS(pthreadpool PRIVATE PTHREADPOOL_USE_FUTEX=0) +- TARGET_COMPILE_DEFINITIONS(pthreadpool PRIVATE PTHREADPOOL_USE_GCD=1) +- TARGET_COMPILE_DEFINITIONS(pthreadpool PRIVATE PTHREADPOOL_USE_EVENT=0) + ELSEIF(PTHREADPOOL_SYNC_PRIMITIVE STREQUAL "event") + TARGET_COMPILE_DEFINITIONS(pthreadpool PRIVATE PTHREADPOOL_USE_FUTEX=0) + TARGET_COMPILE_DEFINITIONS(pthreadpool PRIVATE PTHREADPOOL_USE_GCD=0) +diff --git a/third_party/pthreadpool/src/threadpool-common.h b/third_party/pthreadpool/src/threadpool-common.h +index ca84744..244d0ca 100644 +--- a/third_party/pthreadpool/src/threadpool-common.h ++++ b/third_party/pthreadpool/src/threadpool-common.h +@@ -14,14 +14,6 @@ + #endif + #endif + +-#ifndef PTHREADPOOL_USE_GCD +- #if defined(__APPLE__) +- #define PTHREADPOOL_USE_GCD 1 +- #else +- #define PTHREADPOOL_USE_GCD 0 +- #endif +-#endif +- + #ifndef PTHREADPOOL_USE_EVENT + #if defined(_WIN32) || defined(__CYGWIN__) + #define PTHREADPOOL_USE_EVENT 1 diff --git a/pkgs/python-modules/torch/source/2_9/pytorch-pr-108847.patch b/pkgs/python-modules/torch/source/2_9/pytorch-pr-108847.patch new file mode 100644 index 00000000..9511191c --- /dev/null +++ b/pkgs/python-modules/torch/source/2_9/pytorch-pr-108847.patch @@ -0,0 +1,31 @@ +From bf4050edab9f294a8e0060c47f906cd7a80f25a2 Mon Sep 17 00:00:00 2001 +From: Samuel Ainsworth +Date: Sat, 9 Sep 2023 02:04:09 +0000 +Subject: [PATCH] Dependencies.cmake: support building against CUPTI outside of + CUDA_SOURCE_DIR + +Limitation discovered in https://github.com/NixOS/nixpkgs/pull/249259. 
+--- + cmake/Dependencies.cmake | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake +index 0602d534dc4c14..5f6a5f79f3e3dc 100644 +--- a/cmake/Dependencies.cmake ++++ b/cmake/Dependencies.cmake +@@ -1879,6 +1879,7 @@ if(USE_KINETO) + ${CUDA_SOURCE_DIR}/extras/CUPTI/lib64 + ${CUDA_SOURCE_DIR}/lib + ${CUDA_SOURCE_DIR}/lib64 ++ $ENV{CUPTI_LIBRARY_DIR} + NO_DEFAULT_PATH) + + find_path(CUPTI_INCLUDE_DIR cupti.h PATHS +@@ -1886,6 +1887,7 @@ if(USE_KINETO) + ${CUDA_INCLUDE_DIRS} + ${CUDA_SOURCE_DIR} + ${CUDA_SOURCE_DIR}/include ++ $ENV{CUPTI_INCLUDE_DIR} + NO_DEFAULT_PATH) + + if(CUPTI_LIBRARY_PATH AND CUPTI_INCLUDE_DIR) From 2bc1a3a26ff46e8ff86aaf940b71b8c979e0577e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Wed, 19 Nov 2025 14:34:08 +0000 Subject: [PATCH 2/2] Add zlib as a dependency to aarch64-linux Torch Torch vendors libgfortran, which has zlib as a dependency. We might want to use system libgfortran in the future instead, but it requires more investigation/testing. Fixes #301. --- pkgs/python-modules/torch/binary/generic.nix | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pkgs/python-modules/torch/binary/generic.nix b/pkgs/python-modules/torch/binary/generic.nix index 8acd481c..4f4a90b0 100644 --- a/pkgs/python-modules/torch/binary/generic.nix +++ b/pkgs/python-modules/torch/binary/generic.nix @@ -22,6 +22,7 @@ cudaPackages, rocmPackages, xpuPackages, + zlib, # Python dependencies filelock, @@ -193,7 +194,9 @@ buildPythonPackage { intel-oneapi-mpi intel-pti ] - ); + ) + # Torch on aarch64-linux vendors libgfortran, which requires zlib. + ++ lib.optionals (stdenv.hostPlatform.isLinux && stdenv.hostPlatform.isAarch64) [ zlib ]; dependencies = [ filelock