From 129974205665cad36f89dc6454e3cbcd52765828 Mon Sep 17 00:00:00 2001
From: Lei Zhang
Date: Wed, 7 Jun 2023 23:31:28 -0400
Subject: [PATCH] [cuda] Port over allocator and buffer implementation (#13985)

This commit ports over the existing CUDA driver allocator and buffer
implementation. The main logic is kept as-is, with one notable change:
the context wrapper is dropped and its fields are passed directly to the
various API calls. This is to make supporting multiple devices and
streams easier later.

Other changes are just polish to comments and errors.

Progress towards https://github.com/openxla/iree/issues/13245
---
 experimental/cuda2/CMakeLists.txt     |   6 +
 experimental/cuda2/api.h              |  24 +
 experimental/cuda2/cuda_allocator.c   | 623 ++++++++++++++++++
 experimental/cuda2/cuda_allocator.h   |  33 +
 experimental/cuda2/cuda_buffer.c      | 166 +++++
 experimental/cuda2/cuda_buffer.h      |  66 ++
 .../cuda2/cuda_dynamic_symbol_table.h |  12 +
 experimental/cuda2/memory_pools.c     | 278 ++++
 experimental/cuda2/memory_pools.h     |  73 ++
 9 files changed, 1281 insertions(+)
 create mode 100644 experimental/cuda2/cuda_allocator.c
 create mode 100644 experimental/cuda2/cuda_allocator.h
 create mode 100644 experimental/cuda2/cuda_buffer.c
 create mode 100644 experimental/cuda2/cuda_buffer.h
 create mode 100644 experimental/cuda2/memory_pools.c
 create mode 100644 experimental/cuda2/memory_pools.h

diff --git a/experimental/cuda2/CMakeLists.txt b/experimental/cuda2/CMakeLists.txt
index c104bcde8f4e..ca1959516917 100644
--- a/experimental/cuda2/CMakeLists.txt
+++ b/experimental/cuda2/CMakeLists.txt
@@ -17,7 +17,13 @@ iree_cc_library(
     "api.h"
   SRCS
     "api.h"
+    "cuda_allocator.c"
+    "cuda_allocator.h"
+    "cuda_buffer.c"
+    "cuda_buffer.h"
     "cuda_driver.c"
+    "memory_pools.c"
+    "memory_pools.h"
   DEPS
     ::dynamic_symbols
     iree::base
diff --git a/experimental/cuda2/api.h b/experimental/cuda2/api.h
index 565fd50ca972..a56b656b2b7c 100644
--- a/experimental/cuda2/api.h
+++ b/experimental/cuda2/api.h
@@ -16,6 +16,30 @@ extern "C" {
 #endif  // __cplusplus
 
+//===----------------------------------------------------------------------===//
+// iree_hal_cuda_device_t
+//===----------------------------------------------------------------------===//
+
+// Parameters defining a CUmemoryPool.
+typedef struct iree_hal_cuda2_memory_pool_params_t {
+  // Minimum number of bytes to keep in the pool when trimming with
+  // iree_hal_device_trim.
+  uint64_t minimum_capacity;
+  // Soft maximum number of bytes to keep in the pool.
+  // When more than this is allocated the extra will be freed at the next
+  // device synchronization in order to remain under the threshold.
+  uint64_t release_threshold;
+  // TODO: per-device access permissions array.
+} iree_hal_cuda2_memory_pool_params_t;
+
+// Parameters for each CUmemoryPool used for queue-ordered allocations.
+typedef struct iree_hal_cuda2_memory_pooling_params_t {
+  // Used exclusively for DEVICE_LOCAL allocations.
+  iree_hal_cuda2_memory_pool_params_t device_local;
+  // Used for any host-visible/host-local memory types.
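+  // (e.g. staging upload/download buffers and externally imported memory.)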
+  iree_hal_cuda2_memory_pool_params_t other;
+} iree_hal_cuda2_memory_pooling_params_t;
+
 //===----------------------------------------------------------------------===//
 // iree_hal_cuda2_driver_t
 //===----------------------------------------------------------------------===//
diff --git a/experimental/cuda2/cuda_allocator.c b/experimental/cuda2/cuda_allocator.c
new file mode 100644
index 000000000000..5ec9a396af32
--- /dev/null
+++ b/experimental/cuda2/cuda_allocator.c
@@ -0,0 +1,623 @@
+// Copyright 2023 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "experimental/cuda2/cuda_allocator.h"
+
+#include <stddef.h>
+
+#include "experimental/cuda2/cuda_buffer.h"
+#include "experimental/cuda2/cuda_dynamic_symbols.h"
+#include "experimental/cuda2/cuda_status_util.h"
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_ALLOCATION_TRACKING
+static const char* IREE_HAL_CUDA_ALLOCATOR_ID = "CUDA2 unpooled";
+#endif  // IREE_TRACING_FEATURE_ALLOCATION_TRACKING
+
+typedef struct iree_hal_cuda2_allocator_t {
+  // Abstract resource used for injecting reference counting and vtable;
+  // must be at offset 0.
+  iree_hal_resource_t resource;
+
+  // The device that this allocator allocates memory from.
+  iree_hal_device_t* base_device;
+  CUdevice device;
+
+  // The CUDA stream that allocations should be used in.
+  CUstream stream;
+
+  iree_hal_cuda2_memory_pools_t* pools;
+
+  const iree_hal_cuda2_dynamic_symbols_t* symbols;
+
+  iree_allocator_t host_allocator;
+
+  // Whether the GPU and CPU can concurrently access CUDA managed data in a
+  // coherent way. We would need to explicitly perform flushing and invalidation
+  // between GPU and CPU if not.
+  bool supports_concurrent_managed_access;
+
+  IREE_STATISTICS(iree_hal_allocator_statistics_t statistics;)
+} iree_hal_cuda2_allocator_t;
+
+static const iree_hal_allocator_vtable_t iree_hal_cuda2_allocator_vtable;
+
+static iree_hal_cuda2_allocator_t* iree_hal_cuda2_allocator_cast(
+    iree_hal_allocator_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda2_allocator_vtable);
+  return (iree_hal_cuda2_allocator_t*)base_value;
+}
+
+iree_status_t iree_hal_cuda2_allocator_create(
+    iree_hal_device_t* base_device,
+    const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols, CUdevice device,
+    CUstream stream, iree_hal_cuda2_memory_pools_t* pools,
+    iree_allocator_t host_allocator, iree_hal_allocator_t** out_allocator) {
+  IREE_ASSERT_ARGUMENT(base_device);
+  IREE_ASSERT_ARGUMENT(cuda_symbols);
+  IREE_ASSERT_ARGUMENT(pools);
+  IREE_ASSERT_ARGUMENT(out_allocator);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // To support device-local + host-visible memory we need concurrent managed
+  // access indicating that the host and devices can concurrently access the
+  // device memory. If we don't have this feature then we fall back to forcing
+  // all device-local + host-visible memory into host-local + device-visible
+  // page-locked memory. The compiler tries to avoid this for high-traffic
+  // buffers except for readback staging buffers.
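+  // (In practice this attribute is reported on Linux with Pascal or newer
+  // GPUs and is typically absent on Windows, so the page-locked fallback is
+  // the expected path there.)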
+  int supports_concurrent_managed_access = 0;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, IREE_CURESULT_TO_STATUS(
+              cuda_symbols,
+              cuDeviceGetAttribute(
+                  &supports_concurrent_managed_access,
+                  CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, device),
+              "cuDeviceGetAttribute"));
+
+  IREE_TRACE_ZONE_APPEND_TEXT(
+      z0, supports_concurrent_managed_access
+              ? "has CONCURRENT_MANAGED_ACCESS"
+              : "no CONCURRENT_MANAGED_ACCESS (expect slow accesses on "
+                "device-local + host-visible memory)");
+
+  iree_hal_cuda2_allocator_t* allocator = NULL;
+  iree_status_t status = iree_allocator_malloc(
+      host_allocator, sizeof(*allocator), (void**)&allocator);
+  if (iree_status_is_ok(status)) {
+    iree_hal_resource_initialize(&iree_hal_cuda2_allocator_vtable,
+                                 &allocator->resource);
+    allocator->base_device = base_device;
+    allocator->device = device;
+    allocator->stream = stream;
+    allocator->pools = pools;
+    allocator->symbols = cuda_symbols;
+    allocator->host_allocator = host_allocator;
+    allocator->supports_concurrent_managed_access =
+        supports_concurrent_managed_access != 0;
+    *out_allocator = (iree_hal_allocator_t*)allocator;
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+static void iree_hal_cuda2_allocator_destroy(
+    iree_hal_allocator_t* IREE_RESTRICT base_allocator) {
+  IREE_ASSERT_ARGUMENT(base_allocator);
+  iree_hal_cuda2_allocator_t* allocator =
+      iree_hal_cuda2_allocator_cast(base_allocator);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_allocator_free(allocator->host_allocator, allocator);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+static iree_allocator_t iree_hal_cuda2_allocator_host_allocator(
+    const iree_hal_allocator_t* IREE_RESTRICT base_allocator) {
+  iree_hal_cuda2_allocator_t* allocator =
+      (iree_hal_cuda2_allocator_t*)base_allocator;
+  return allocator->host_allocator;
+}
+
+static iree_status_t iree_hal_cuda2_allocator_trim(
+    iree_hal_allocator_t* IREE_RESTRICT base_allocator) {
+  return iree_ok_status();
+}
+
+static void iree_hal_cuda2_allocator_query_statistics(
+    iree_hal_allocator_t* IREE_RESTRICT base_allocator,
+    iree_hal_allocator_statistics_t* IREE_RESTRICT out_statistics) {
+  IREE_STATISTICS({
+    iree_hal_cuda2_allocator_t* allocator =
+        iree_hal_cuda2_allocator_cast(base_allocator);
+    memcpy(out_statistics, &allocator->statistics, sizeof(*out_statistics));
+    iree_hal_cuda2_memory_pools_merge_statistics(allocator->pools,
+                                                 out_statistics);
+  });
+}
+
+static iree_status_t iree_hal_cuda2_allocator_query_memory_heaps(
+    iree_hal_allocator_t* IREE_RESTRICT base_allocator,
+    iree_host_size_t capacity,
+    iree_hal_allocator_memory_heap_t* IREE_RESTRICT heaps,
+    iree_host_size_t* IREE_RESTRICT out_count) {
+  IREE_ASSERT_ARGUMENT(base_allocator);
+  IREE_ASSERT_ARGUMENT(heaps);
+  IREE_ASSERT_ARGUMENT(out_count);
+
+  iree_hal_cuda2_allocator_t* allocator =
+      iree_hal_cuda2_allocator_cast(base_allocator);
+
+  // TODO(benvanik): check CU_DEVICE_ATTRIBUTE_INTEGRATED and return a unified
+  // set of heaps (likely still a cached and uncached, at minimum).
+  iree_host_size_t count = 3;
+  if (allocator->supports_concurrent_managed_access) {
+    ++count;  // device-local | host-visible
+  }
+  if (out_count) *out_count = count;
+  if (capacity < count) {
+    // NOTE: lightweight as this is hit in normal pre-sizing usage.
+    return iree_status_from_code(IREE_STATUS_OUT_OF_RANGE);
+  }
+
+  // Don't think there's a query for these.
+  // Max allocation size may be much smaller in certain memory types such as
+  // page-locked memory and it'd be good to enforce that.
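+  // (cuMemGetInfo can report total/free device memory but there is no
+  // per-memory-type allocation limit query.)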
+  const iree_device_size_t max_allocation_size = ~(iree_device_size_t)0;
+  const iree_device_size_t min_alignment = 64;
+
+  int i = 0;
+
+  // Device-local memory (dispatch resources):
+  heaps[i++] = (iree_hal_allocator_memory_heap_t){
+      .type = IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL,
+      .allowed_usage =
+          IREE_HAL_BUFFER_USAGE_TRANSFER | IREE_HAL_BUFFER_USAGE_DISPATCH,
+      .max_allocation_size = max_allocation_size,
+      .min_alignment = min_alignment,
+  };
+
+  if (allocator->supports_concurrent_managed_access) {
+    // Device-local managed memory with host mapping support:
+    heaps[i++] = (iree_hal_allocator_memory_heap_t){
+        .type = IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL |
+                IREE_HAL_MEMORY_TYPE_HOST_VISIBLE |
+                IREE_HAL_MEMORY_TYPE_HOST_COHERENT,
+        .allowed_usage = IREE_HAL_BUFFER_USAGE_TRANSFER |
+                         IREE_HAL_BUFFER_USAGE_DISPATCH |
+                         IREE_HAL_BUFFER_USAGE_MAPPING,
+        .max_allocation_size = max_allocation_size,
+        .min_alignment = min_alignment,
+    };
+  }
+
+  // Write-combined page-locked host-local memory (upload):
+  heaps[i++] = (iree_hal_allocator_memory_heap_t){
+      .type = IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE |
+              IREE_HAL_MEMORY_TYPE_HOST_LOCAL |
+              IREE_HAL_MEMORY_TYPE_HOST_COHERENT,
+      .allowed_usage = IREE_HAL_BUFFER_USAGE_TRANSFER |
+                       IREE_HAL_BUFFER_USAGE_DISPATCH |
+                       IREE_HAL_BUFFER_USAGE_MAPPING,
+      .max_allocation_size = max_allocation_size,
+      .min_alignment = min_alignment,
+  };
+
+  // Cached page-locked host-local memory (download):
+  heaps[i++] = (iree_hal_allocator_memory_heap_t){
+      .type = IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE |
+              IREE_HAL_MEMORY_TYPE_HOST_LOCAL |
+              IREE_HAL_MEMORY_TYPE_HOST_COHERENT |
+              IREE_HAL_MEMORY_TYPE_HOST_CACHED,
+      .allowed_usage = IREE_HAL_BUFFER_USAGE_TRANSFER |
+                       IREE_HAL_BUFFER_USAGE_DISPATCH |
+                       IREE_HAL_BUFFER_USAGE_MAPPING,
+      .max_allocation_size = max_allocation_size,
+      .min_alignment = min_alignment,
+  };
+
+  IREE_ASSERT(i == count);
+  return iree_ok_status();
+}
+
+static iree_hal_buffer_compatibility_t
+iree_hal_cuda2_allocator_query_buffer_compatibility(
+    iree_hal_allocator_t* IREE_RESTRICT base_allocator,
+    iree_hal_buffer_params_t* IREE_RESTRICT params,
+    iree_device_size_t* IREE_RESTRICT allocation_size) {
+  iree_hal_cuda2_allocator_t* allocator =
+      iree_hal_cuda2_allocator_cast(base_allocator);
+
+  // All buffers can be allocated on the heap.
+  iree_hal_buffer_compatibility_t compatibility =
+      IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE;
+
+  // Buffers are importable in CUDA under most cases, though performance may
+  // vary wildly. We don't fully verify that the buffer parameters are
+  // self-consistent and just look at whether we can get a device pointer.
+  if (iree_all_bits_set(params->type, IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE)) {
+    compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_IMPORTABLE;
+  }
+
+  // Buffers can only be used on the queue if they are device visible.
+  if (iree_all_bits_set(params->type, IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE)) {
+    if (iree_any_bit_set(params->usage, IREE_HAL_BUFFER_USAGE_TRANSFER)) {
+      compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_TRANSFER;
+    }
+    if (iree_any_bit_set(params->usage,
+                         IREE_HAL_BUFFER_USAGE_DISPATCH_STORAGE)) {
+      compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_DISPATCH;
+    }
+  }
+
+  // If concurrent managed access is not supported then make device-local +
+  // host-visible allocations fall back to host-local + device-visible
+  // page-locked memory. This will be significantly slower for the device to
+  // access but the compiler only uses this type for readback staging buffers
+  // and it's better to function than function fast.
+  if (!allocator->supports_concurrent_managed_access &&
+      iree_all_bits_set(params->type, IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL |
+                                          IREE_HAL_MEMORY_TYPE_HOST_VISIBLE)) {
+    compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_LOW_PERFORMANCE;
+    params->type &= ~(IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL |
+                      IREE_HAL_MEMORY_TYPE_HOST_VISIBLE);
+    params->type |=
+        IREE_HAL_MEMORY_TYPE_HOST_LOCAL | IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE;
+  }
+
+  // We are now optimal.
+  params->type &= ~IREE_HAL_MEMORY_TYPE_OPTIMAL;
+
+  // Guard against the corner case where the requested buffer size is 0. The
+  // application is unlikely to do anything when requesting a 0-byte buffer; but
+  // it can happen in real world use cases. So we should at least not crash.
+  if (*allocation_size == 0) *allocation_size = 4;
+
+  return compatibility;
+}
+
+static void iree_hal_cuda2_buffer_free(
+    const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols,
+    iree_hal_cuda2_buffer_type_t buffer_type, CUdeviceptr device_ptr,
+    void* host_ptr) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  switch (buffer_type) {
+    case IREE_HAL_CUDA_BUFFER_TYPE_DEVICE: {
+      IREE_TRACE_ZONE_APPEND_TEXT(z0, "cuMemFree");
+      IREE_CUDA_IGNORE_ERROR(cuda_symbols, cuMemFree(device_ptr));
+      break;
+    }
+    case IREE_HAL_CUDA_BUFFER_TYPE_HOST: {
+      IREE_TRACE_ZONE_APPEND_TEXT(z0, "cuMemFreeHost");
+      IREE_CUDA_IGNORE_ERROR(cuda_symbols, cuMemFreeHost(host_ptr));
+      break;
+    }
+    case IREE_HAL_CUDA_BUFFER_TYPE_HOST_REGISTERED: {
+      IREE_TRACE_ZONE_APPEND_TEXT(z0, "cuMemHostUnregister");
+      IREE_CUDA_IGNORE_ERROR(cuda_symbols, cuMemHostUnregister(host_ptr));
+      break;
+    }
+    case IREE_HAL_CUDA_BUFFER_TYPE_ASYNC: {
+      IREE_TRACE_ZONE_APPEND_TEXT(z0, "(ignored; async)");
+      break;
+    }
+  }
+  IREE_TRACE_ZONE_END(z0);
+}
+
+static iree_status_t iree_hal_cuda2_allocator_allocate_buffer(
+    iree_hal_allocator_t* IREE_RESTRICT base_allocator,
+    const iree_hal_buffer_params_t* IREE_RESTRICT params,
+    iree_device_size_t allocation_size, iree_const_byte_span_t initial_data,
+    iree_hal_buffer_t** IREE_RESTRICT out_buffer) {
+  IREE_ASSERT_ARGUMENT(base_allocator);
+  IREE_ASSERT_ARGUMENT(params);
+  IREE_ASSERT_ARGUMENT(out_buffer);
+  iree_hal_cuda2_allocator_t* allocator =
+      iree_hal_cuda2_allocator_cast(base_allocator);
+
+  // Coerce options into those required by the current device.
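+  // (This can rewrite the requested memory type; for example, device-local +
+  // host-visible requests are demoted to host-local + device-visible when
+  // concurrent managed access is unavailable.)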
+  iree_hal_buffer_params_t compat_params = *params;
+  iree_hal_buffer_compatibility_t compatibility =
+      iree_hal_cuda2_allocator_query_buffer_compatibility(
+          base_allocator, &compat_params, &allocation_size);
+  if (!iree_all_bits_set(compatibility,
+                         IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE)) {
+#if IREE_STATUS_MODE
+    iree_bitfield_string_temp_t temp0, temp1, temp2;
+    iree_string_view_t memory_type_str =
+        iree_hal_memory_type_format(params->type, &temp0);
+    iree_string_view_t usage_str =
+        iree_hal_buffer_usage_format(params->usage, &temp1);
+    iree_string_view_t compatibility_str =
+        iree_hal_buffer_compatibility_format(compatibility, &temp2);
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "allocator cannot allocate a buffer with the given parameters; "
+        "memory_type=%.*s, usage=%.*s, compatibility=%.*s",
+        (int)memory_type_str.size, memory_type_str.data, (int)usage_str.size,
+        usage_str.data, (int)compatibility_str.size, compatibility_str.data);
+#else
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "allocator cannot allocate a buffer with the given parameters");
+#endif  // IREE_STATUS_MODE
+  }
+
+  iree_status_t status = iree_ok_status();
+  iree_hal_cuda2_buffer_type_t buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE;
+  void* host_ptr = NULL;
+  CUdeviceptr device_ptr = 0;
+  IREE_TRACE_ZONE_BEGIN_NAMED(z0, "iree_hal_cuda2_buffer_allocate");
+  IREE_TRACE_ZONE_APPEND_VALUE(z0, allocation_size);
+  if (iree_all_bits_set(compat_params.type,
+                        IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL)) {
+    if (iree_all_bits_set(compat_params.type,
+                          IREE_HAL_MEMORY_TYPE_HOST_VISIBLE)) {
+      // Device local + host visible.
+      buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE;
+      status = IREE_CURESULT_TO_STATUS(
+          allocator->symbols, cuMemAllocManaged(&device_ptr, allocation_size,
+                                                CU_MEM_ATTACH_GLOBAL));
+      if (iree_status_is_ok(status) &&
+          allocator->supports_concurrent_managed_access) {
+        // Prefetch the buffer to the GPU stream.
+        status = IREE_CURESULT_TO_STATUS(
+            allocator->symbols,
+            cuMemPrefetchAsync(device_ptr, allocation_size, allocator->device,
+                               allocator->stream));
+      }
+      host_ptr = (void*)device_ptr;
+    } else {
+      // Device only.
+      buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE;
+      status = IREE_CURESULT_TO_STATUS(
+          allocator->symbols, cuMemAlloc(&device_ptr, allocation_size));
+    }
+  } else {
+    // Host local cases.
+    buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_HOST;
+    unsigned int flags = CU_MEMHOSTALLOC_DEVICEMAP;
+    if (!iree_all_bits_set(compat_params.type,
+                           IREE_HAL_MEMORY_TYPE_HOST_CACHED)) {
+      flags |= CU_MEMHOSTALLOC_WRITECOMBINED;
+    }
+    status = IREE_CURESULT_TO_STATUS(
+        allocator->symbols, cuMemHostAlloc(&host_ptr, allocation_size, flags));
+    if (iree_status_is_ok(status)) {
+      status = IREE_CURESULT_TO_STATUS(
+          allocator->symbols,
+          cuMemHostGetDevicePointer(&device_ptr, host_ptr, /*flags=*/0));
+    }
+  }
+  IREE_TRACE_ZONE_END(z0);
+
+  iree_hal_buffer_t* buffer = NULL;
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_cuda2_buffer_wrap(
+        base_allocator, compat_params.type, compat_params.access,
+        compat_params.usage, allocation_size,
+        /*byte_offset=*/0,
+        /*byte_length=*/allocation_size, buffer_type, device_ptr, host_ptr,
+        iree_hal_buffer_release_callback_null(),
+        iree_hal_allocator_host_allocator(base_allocator), &buffer);
+  }
+
+  // Copy the initial contents into the buffer. This may require staging.
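+  // (With the infinite timeout below this blocks the caller until the
+  // transfer completes.)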
+  if (iree_status_is_ok(status) &&
+      !iree_const_byte_span_is_empty(initial_data)) {
+    status = iree_hal_device_transfer_range(
+        allocator->base_device,
+        iree_hal_make_host_transfer_buffer_span((void*)initial_data.data,
+                                                initial_data.data_length),
+        0, iree_hal_make_device_transfer_buffer(buffer), 0,
+        initial_data.data_length, IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT,
+        iree_infinite_timeout());
+  }
+
+  if (iree_status_is_ok(status)) {
+    IREE_TRACE_ALLOC_NAMED(IREE_HAL_CUDA_ALLOCATOR_ID,
+                           (void*)iree_hal_cuda2_buffer_device_pointer(buffer),
+                           allocation_size);
+    IREE_STATISTICS(iree_hal_allocator_statistics_record_alloc(
+        &allocator->statistics, compat_params.type, allocation_size));
+    *out_buffer = buffer;
+  } else {
+    if (!buffer) {
+      iree_hal_cuda2_buffer_free(allocator->symbols, buffer_type, device_ptr,
+                                 host_ptr);
+    } else {
+      iree_hal_buffer_release(buffer);
+    }
+  }
+  return status;
+}
+
+static void iree_hal_cuda2_allocator_deallocate_buffer(
+    iree_hal_allocator_t* IREE_RESTRICT base_allocator,
+    iree_hal_buffer_t* IREE_RESTRICT base_buffer) {
+  IREE_ASSERT_ARGUMENT(base_allocator);
+  IREE_ASSERT_ARGUMENT(base_buffer);
+  iree_hal_cuda2_allocator_t* allocator =
+      iree_hal_cuda2_allocator_cast(base_allocator);
+
+  const iree_hal_cuda2_buffer_type_t buffer_type =
+      iree_hal_cuda2_buffer_type(base_buffer);
+
+  // WARNING: we may be called from a random thread and need to ensure that we
+  // have an active CUDA context. Unfortunately CUDA is CUDA and trying to
+  // change the context here will result in full device synchronization. In the
+  // future we'll need to do something fairly complex such as having a dedicated
+  // thread with a persistently bound context that does nothing but free
+  // buffers. The load on this will be lighter when queue-ordered allocations
+  // are used or any sort of pooling policy is applied.
+  //
+  // WARNING: with CUDA's lazy error propagation it's possible that by the time
+  // this code is running something else has triggered device loss and we can't
+  // actually use the context. In that case we can't perform the frees and want
+  // to silently ignore them: whatever the user tries to do next will fail in
+  // the same way and if we were deallocating this buffer as part of a tear-down
+  // on failure we don't want to end up dying during cleanup.
+  iree_hal_cuda2_buffer_free(allocator->symbols, buffer_type,
+                             iree_hal_cuda2_buffer_device_pointer(base_buffer),
+                             iree_hal_cuda2_buffer_host_pointer(base_buffer));
+
+  switch (buffer_type) {
+    case IREE_HAL_CUDA_BUFFER_TYPE_DEVICE:
+    case IREE_HAL_CUDA_BUFFER_TYPE_HOST: {
+      IREE_TRACE_FREE_NAMED(
+          IREE_HAL_CUDA_ALLOCATOR_ID,
+          (void*)iree_hal_cuda2_buffer_device_pointer(base_buffer));
+      IREE_STATISTICS(iree_hal_allocator_statistics_record_free(
+          &allocator->statistics, iree_hal_buffer_memory_type(base_buffer),
+          iree_hal_buffer_allocation_size(base_buffer)));
+      break;
+    }
+    default:
+      // Buffer type not tracked.
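+      // (IREE_HAL_CUDA_BUFFER_TYPE_ASYNC buffers are tracked by the memory
+      // pool statistics instead.)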
+      break;
+  }
+
+  iree_hal_buffer_destroy(base_buffer);
+}
+
+static iree_status_t iree_hal_cuda2_allocator_import_buffer(
+    iree_hal_allocator_t* IREE_RESTRICT base_allocator,
+    const iree_hal_buffer_params_t* IREE_RESTRICT params,
+    iree_hal_external_buffer_t* IREE_RESTRICT external_buffer,
+    iree_hal_buffer_release_callback_t release_callback,
+    iree_hal_buffer_t** IREE_RESTRICT out_buffer) {
+  IREE_ASSERT_ARGUMENT(base_allocator);
+  IREE_ASSERT_ARGUMENT(params);
+  IREE_ASSERT_ARGUMENT(external_buffer);
+  IREE_ASSERT_ARGUMENT(out_buffer);
+  iree_hal_cuda2_allocator_t* allocator =
+      iree_hal_cuda2_allocator_cast(base_allocator);
+
+  // Coerce options into those required by the current device.
+  iree_hal_buffer_params_t compat_params = *params;
+  iree_device_size_t allocation_size = external_buffer->size;
+  iree_hal_buffer_compatibility_t compatibility =
+      iree_hal_cuda2_allocator_query_buffer_compatibility(
+          base_allocator, &compat_params, &allocation_size);
+  if (!iree_all_bits_set(compatibility,
+                         IREE_HAL_BUFFER_COMPATIBILITY_IMPORTABLE)) {
+#if IREE_STATUS_MODE
+    iree_bitfield_string_temp_t temp0, temp1, temp2;
+    iree_string_view_t memory_type_str =
+        iree_hal_memory_type_format(params->type, &temp0);
+    iree_string_view_t usage_str =
+        iree_hal_buffer_usage_format(params->usage, &temp1);
+    iree_string_view_t compatibility_str =
+        iree_hal_buffer_compatibility_format(compatibility, &temp2);
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "allocator cannot import a buffer with the given parameters; "
+        "memory_type=%.*s, usage=%.*s, compatibility=%.*s",
+        (int)memory_type_str.size, memory_type_str.data, (int)usage_str.size,
+        usage_str.data, (int)compatibility_str.size, compatibility_str.data);
+#else
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "allocator cannot import a buffer with the given parameters");
+#endif  // IREE_STATUS_MODE
+  }
+
+  iree_status_t status = iree_ok_status();
+  iree_hal_cuda2_buffer_type_t buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE;
+  void* host_ptr = NULL;
+  CUdeviceptr device_ptr = 0;
+
+  switch (external_buffer->type) {
+    case IREE_HAL_EXTERNAL_BUFFER_TYPE_HOST_ALLOCATION: {
+      if (iree_all_bits_set(compat_params.type,
+                            IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL)) {
+        return iree_make_status(
+            IREE_STATUS_INVALID_ARGUMENT,
+            "unable to register host allocations as device-local memory");
+      }
+      buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_HOST_REGISTERED;
+      host_ptr = external_buffer->handle.host_allocation.ptr;
+      uint32_t register_flags = 0;
+      if (compat_params.access == IREE_HAL_MEMORY_ACCESS_READ) {
+        register_flags = CU_MEMHOSTREGISTER_READ_ONLY;
+      }
+      if (iree_any_bit_set(compat_params.usage,
+                           IREE_HAL_BUFFER_USAGE_DISPATCH_INDIRECT_PARAMS |
+                               IREE_HAL_BUFFER_USAGE_DISPATCH_UNIFORM_READ |
+                               IREE_HAL_BUFFER_USAGE_DISPATCH_STORAGE |
+                               IREE_HAL_BUFFER_USAGE_DISPATCH_IMAGE)) {
+        register_flags = CU_MEMHOSTREGISTER_DEVICEMAP;
+      }
+      status = IREE_CURESULT_TO_STATUS(
+          allocator->symbols,
+          cuMemHostRegister(host_ptr, external_buffer->size, register_flags),
+          "cuMemHostRegister");
+      if (iree_status_is_ok(status)) {
+        status = IREE_CURESULT_TO_STATUS(
+            allocator->symbols,
+            cuMemHostGetDevicePointer(&device_ptr, host_ptr, 0),
+            "cuMemHostGetDevicePointer");
+      }
+      break;
+    }
+    case IREE_HAL_EXTERNAL_BUFFER_TYPE_OPAQUE_FD:
+    case IREE_HAL_EXTERNAL_BUFFER_TYPE_OPAQUE_WIN32:
+      return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                              "handle-based imports not yet implemented");
+    default:
+      return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
"external buffer type not supported"); + } + + iree_hal_buffer_t* buffer = NULL; + if (iree_status_is_ok(status)) { + status = iree_hal_cuda2_buffer_wrap( + base_allocator, compat_params.type, compat_params.access, + compat_params.usage, external_buffer->size, /*byte_offset=*/0, + /*byte_length=*/external_buffer->size, buffer_type, device_ptr, + host_ptr, release_callback, + iree_hal_allocator_host_allocator(base_allocator), &buffer); + } + + if (iree_status_is_ok(status)) { + *out_buffer = buffer; + } else { + if (!buffer) { + iree_hal_cuda2_buffer_free(allocator->symbols, buffer_type, device_ptr, + host_ptr); + } else { + iree_hal_buffer_release(buffer); + } + } + return status; +} + +static iree_status_t iree_hal_cuda2_allocator_export_buffer( + iree_hal_allocator_t* IREE_RESTRICT base_allocator, + iree_hal_buffer_t* IREE_RESTRICT buffer, + iree_hal_external_buffer_type_t requested_type, + iree_hal_external_buffer_flags_t requested_flags, + iree_hal_external_buffer_t* IREE_RESTRICT out_external_buffer) { + return iree_make_status(IREE_STATUS_UNIMPLEMENTED, + "exporting to external buffers not supported"); +} + +static const iree_hal_allocator_vtable_t iree_hal_cuda2_allocator_vtable = { + .destroy = iree_hal_cuda2_allocator_destroy, + .host_allocator = iree_hal_cuda2_allocator_host_allocator, + .trim = iree_hal_cuda2_allocator_trim, + .query_statistics = iree_hal_cuda2_allocator_query_statistics, + .query_memory_heaps = iree_hal_cuda2_allocator_query_memory_heaps, + .query_buffer_compatibility = + iree_hal_cuda2_allocator_query_buffer_compatibility, + .allocate_buffer = iree_hal_cuda2_allocator_allocate_buffer, + .deallocate_buffer = iree_hal_cuda2_allocator_deallocate_buffer, + .import_buffer = iree_hal_cuda2_allocator_import_buffer, + .export_buffer = iree_hal_cuda2_allocator_export_buffer, +}; diff --git a/experimental/cuda2/cuda_allocator.h b/experimental/cuda2/cuda_allocator.h new file mode 100644 index 000000000000..2ff33ea467c0 --- /dev/null +++ b/experimental/cuda2/cuda_allocator.h @@ -0,0 +1,33 @@ +// Copyright 2023 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef EXPERIMENTAL_CUDA2_CUDA_ALLOCATOR_H_ +#define EXPERIMENTAL_CUDA2_CUDA_ALLOCATOR_H_ + +#include "experimental/cuda2/cuda_dynamic_symbols.h" +#include "experimental/cuda2/memory_pools.h" +#include "iree/base/api.h" +#include "iree/hal/api.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// Creates a CUDA memory allocator. +// |device| and |stream| will be used for management operations. +// |pools| provides memory pools that may be shared across multiple allocators +// and the pointer must remain valid for the lifetime of the allocator. +iree_status_t iree_hal_cuda2_allocator_create( + iree_hal_device_t* base_device, + const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols, CUdevice device, + CUstream stream, iree_hal_cuda2_memory_pools_t* pools, + iree_allocator_t host_allocator, iree_hal_allocator_t** out_allocator); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // EXPERIMENTAL_CUDA2_CUDA_ALLOCATOR_H_ diff --git a/experimental/cuda2/cuda_buffer.c b/experimental/cuda2/cuda_buffer.c new file mode 100644 index 000000000000..e88c9e3b3de3 --- /dev/null +++ b/experimental/cuda2/cuda_buffer.c @@ -0,0 +1,166 @@ +// Copyright 2023 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "experimental/cuda2/cuda_buffer.h"
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+
+typedef struct iree_hal_cuda2_buffer_t {
+  iree_hal_buffer_t base;
+  iree_hal_cuda2_buffer_type_t type;
+  void* host_ptr;
+  CUdeviceptr device_ptr;
+  iree_hal_buffer_release_callback_t release_callback;
+} iree_hal_cuda2_buffer_t;
+
+static const iree_hal_buffer_vtable_t iree_hal_cuda2_buffer_vtable;
+
+static iree_hal_cuda2_buffer_t* iree_hal_cuda2_buffer_cast(
+    iree_hal_buffer_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda2_buffer_vtable);
+  return (iree_hal_cuda2_buffer_t*)base_value;
+}
+
+static const iree_hal_cuda2_buffer_t* iree_hal_cuda2_buffer_const_cast(
+    const iree_hal_buffer_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda2_buffer_vtable);
+  return (const iree_hal_cuda2_buffer_t*)base_value;
+}
+
+iree_status_t iree_hal_cuda2_buffer_wrap(
+    iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type,
+    iree_hal_memory_access_t allowed_access,
+    iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size,
+    iree_device_size_t byte_offset, iree_device_size_t byte_length,
+    iree_hal_cuda2_buffer_type_t buffer_type, CUdeviceptr device_ptr,
+    void* host_ptr, iree_hal_buffer_release_callback_t release_callback,
+    iree_allocator_t host_allocator, iree_hal_buffer_t** out_buffer) {
+  IREE_ASSERT_ARGUMENT(out_buffer);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_cuda2_buffer_t* buffer = NULL;
+  iree_status_t status =
+      iree_allocator_malloc(host_allocator, sizeof(*buffer), (void**)&buffer);
+  if (iree_status_is_ok(status)) {
+    iree_hal_buffer_initialize(host_allocator, allocator, &buffer->base,
+                               allocation_size, byte_offset, byte_length,
+                               memory_type, allowed_access, allowed_usage,
+                               &iree_hal_cuda2_buffer_vtable, &buffer->base);
+    buffer->type = buffer_type;
+    buffer->host_ptr = host_ptr;
+    buffer->device_ptr = device_ptr;
+    buffer->release_callback = release_callback;
+    *out_buffer = &buffer->base;
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+static void iree_hal_cuda2_buffer_destroy(iree_hal_buffer_t* base_buffer) {
+  iree_hal_cuda2_buffer_t* buffer = iree_hal_cuda2_buffer_cast(base_buffer);
+  iree_allocator_t host_allocator = base_buffer->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+  if (buffer->release_callback.fn) {
+    buffer->release_callback.fn(buffer->release_callback.user_data,
+                                base_buffer);
+  }
+  iree_allocator_free(host_allocator, buffer);
+  IREE_TRACE_ZONE_END(z0);
+}
+
+static iree_status_t iree_hal_cuda2_buffer_map_range(
+    iree_hal_buffer_t* base_buffer, iree_hal_mapping_mode_t mapping_mode,
+    iree_hal_memory_access_t memory_access,
+    iree_device_size_t local_byte_offset, iree_device_size_t local_byte_length,
+    iree_hal_buffer_mapping_t* mapping) {
+  IREE_ASSERT_ARGUMENT(base_buffer);
+  IREE_ASSERT_ARGUMENT(mapping);
+  iree_hal_cuda2_buffer_t* buffer = iree_hal_cuda2_buffer_cast(base_buffer);
+
+  // TODO(benvanik): add upload/download for unmapped buffers.
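+  // For now only buffers with a valid host pointer (host-local or managed
+  // memory) can be mapped.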
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_memory_type(
+      iree_hal_buffer_memory_type(base_buffer),
+      IREE_HAL_MEMORY_TYPE_HOST_VISIBLE));
+  IREE_RETURN_IF_ERROR(
+      iree_hal_buffer_validate_usage(iree_hal_buffer_allowed_usage(base_buffer),
+                                     IREE_HAL_BUFFER_USAGE_MAPPING));
+
+  uint8_t* data_ptr = (uint8_t*)(buffer->host_ptr) + local_byte_offset;
+  // If we mapped for discard scribble over the bytes. This is not a mandated
+  // behavior but it will make debugging issues easier. Alternatively for
+  // heap buffers we could reallocate them such that ASAN yells, but that
+  // would only work if the entire buffer was discarded.
+#ifndef NDEBUG
+  if (iree_any_bit_set(memory_access, IREE_HAL_MEMORY_ACCESS_DISCARD)) {
+    memset(data_ptr, 0xCD, local_byte_length);
+  }
+#endif  // !NDEBUG
+
+  mapping->contents = iree_make_byte_span(data_ptr, local_byte_length);
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_cuda2_buffer_unmap_range(
+    iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset,
+    iree_device_size_t local_byte_length, iree_hal_buffer_mapping_t* mapping) {
+  // Nothing to do today.
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_cuda2_buffer_invalidate_range(
+    iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset,
+    iree_device_size_t local_byte_length) {
+  // Nothing to do today.
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_cuda2_buffer_flush_range(
+    iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset,
+    iree_device_size_t local_byte_length) {
+  // Nothing to do today.
+  return iree_ok_status();
+}
+
+iree_hal_cuda2_buffer_type_t iree_hal_cuda2_buffer_type(
+    const iree_hal_buffer_t* base_buffer) {
+  const iree_hal_cuda2_buffer_t* buffer =
+      iree_hal_cuda2_buffer_const_cast(base_buffer);
+  return buffer->type;
+}
+
+CUdeviceptr iree_hal_cuda2_buffer_device_pointer(
+    const iree_hal_buffer_t* base_buffer) {
+  const iree_hal_cuda2_buffer_t* buffer =
+      iree_hal_cuda2_buffer_const_cast(base_buffer);
+  return buffer->device_ptr;
+}
+
+void* iree_hal_cuda2_buffer_host_pointer(const iree_hal_buffer_t* base_buffer) {
+  const iree_hal_cuda2_buffer_t* buffer =
+      iree_hal_cuda2_buffer_const_cast(base_buffer);
+  return buffer->host_ptr;
+}
+
+void iree_hal_cuda2_buffer_drop_release_callback(
+    iree_hal_buffer_t* base_buffer) {
+  iree_hal_cuda2_buffer_t* buffer = iree_hal_cuda2_buffer_cast(base_buffer);
+  buffer->release_callback = iree_hal_buffer_release_callback_null();
+}
+
+static const iree_hal_buffer_vtable_t iree_hal_cuda2_buffer_vtable = {
+    .recycle = iree_hal_buffer_recycle,
+    .destroy = iree_hal_cuda2_buffer_destroy,
+    .map_range = iree_hal_cuda2_buffer_map_range,
+    .unmap_range = iree_hal_cuda2_buffer_unmap_range,
+    .invalidate_range = iree_hal_cuda2_buffer_invalidate_range,
+    .flush_range = iree_hal_cuda2_buffer_flush_range,
+};
diff --git a/experimental/cuda2/cuda_buffer.h b/experimental/cuda2/cuda_buffer.h
new file mode 100644
index 000000000000..23cade3c57ad
--- /dev/null
+++ b/experimental/cuda2/cuda_buffer.h
@@ -0,0 +1,66 @@
+// Copyright 2023 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef EXPERIMENTAL_CUDA2_CUDA_BUFFER_H_
+#define EXPERIMENTAL_CUDA2_CUDA_BUFFER_H_
+
+#include "experimental/cuda2/cuda_headers.h"
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+typedef enum iree_hal_cuda2_buffer_type_e {
+  // Device local buffer; allocated with cuMemAlloc/cuMemAllocManaged, freed
+  // with cuMemFree.
+  IREE_HAL_CUDA_BUFFER_TYPE_DEVICE = 0,
+  // Host local buffer; allocated with cuMemHostAlloc, freed with cuMemFreeHost.
+  IREE_HAL_CUDA_BUFFER_TYPE_HOST,
+  // Host local buffer; registered with cuMemHostRegister, freed with
+  // cuMemHostUnregister.
+  IREE_HAL_CUDA_BUFFER_TYPE_HOST_REGISTERED,
+  // Device local buffer, allocated with cuMemAllocFromPoolAsync, freed with
+  // cuMemFree/cuMemFreeAsync.
+  IREE_HAL_CUDA_BUFFER_TYPE_ASYNC,
+} iree_hal_cuda2_buffer_type_t;
+
+// Wraps a CUDA allocation in an iree_hal_buffer_t.
+iree_status_t iree_hal_cuda2_buffer_wrap(
+    iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type,
+    iree_hal_memory_access_t allowed_access,
+    iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size,
+    iree_device_size_t byte_offset, iree_device_size_t byte_length,
+    iree_hal_cuda2_buffer_type_t buffer_type, CUdeviceptr device_ptr,
+    void* host_ptr, iree_hal_buffer_release_callback_t release_callback,
+    iree_allocator_t host_allocator, iree_hal_buffer_t** out_buffer);
+
+// Returns the underlying CUDA buffer type of the given |buffer|.
+iree_hal_cuda2_buffer_type_t iree_hal_cuda2_buffer_type(
+    const iree_hal_buffer_t* buffer);
+
+// Returns the CUDA base device pointer for the given |buffer|.
+//
+// Note that this is the entire allocated_buffer and must be offset by the
+// buffer byte_offset and byte_length when used.
+CUdeviceptr iree_hal_cuda2_buffer_device_pointer(
+    const iree_hal_buffer_t* buffer);
+
+// Returns the CUDA host pointer for the given |buffer|, if available.
+void* iree_hal_cuda2_buffer_host_pointer(const iree_hal_buffer_t* buffer);
+
+// Drops the release callback so that when the buffer is destroyed no callback
+// will be made. This is not thread safe but all callers are expected to be
+// holding an allocation and the earliest the buffer could be destroyed is after
+// this call returns and the caller has released its reference.
+void iree_hal_cuda2_buffer_drop_release_callback(iree_hal_buffer_t* buffer);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // EXPERIMENTAL_CUDA2_CUDA_BUFFER_H_
diff --git a/experimental/cuda2/cuda_dynamic_symbol_table.h b/experimental/cuda2/cuda_dynamic_symbol_table.h
index b4aaa93fc750..fb8ff5a8ecd8 100644
--- a/experimental/cuda2/cuda_dynamic_symbol_table.h
+++ b/experimental/cuda2/cuda_dynamic_symbol_table.h
@@ -49,6 +49,18 @@ IREE_CU_PFN_DECL(cuMemHostAlloc, void**, size_t, unsigned int)
 IREE_CU_PFN_DECL(cuMemHostRegister, void*, size_t, unsigned int)
 IREE_CU_PFN_DECL(cuMemHostUnregister, void*)
 IREE_CU_PFN_DECL(cuMemHostGetDevicePointer, CUdeviceptr*, void*, unsigned int)
+IREE_CU_PFN_DECL(cuMemPoolCreate, CUmemoryPool*, const CUmemPoolProps*)
+IREE_CU_PFN_DECL(cuMemPoolDestroy, CUmemoryPool)
+IREE_CU_PFN_DECL(cuMemPoolSetAccess, CUmemoryPool, const CUmemAccessDesc*,
+                 size_t)
+IREE_CU_PFN_DECL(cuMemPoolGetAttribute, CUmemoryPool, CUmemPool_attribute,
+                 void*)
+IREE_CU_PFN_DECL(cuMemPoolSetAttribute, CUmemoryPool, CUmemPool_attribute,
+                 void*)
+IREE_CU_PFN_DECL(cuMemPoolTrimTo, CUmemoryPool, size_t)
+IREE_CU_PFN_DECL(cuMemAllocFromPoolAsync, CUdeviceptr*, size_t, CUmemoryPool,
+                 CUstream)
+IREE_CU_PFN_DECL(cuMemFreeAsync, CUdeviceptr dptr, CUstream hStream)
 IREE_CU_PFN_DECL(cuModuleGetFunction, CUfunction*, CUmodule, const char*)
 IREE_CU_PFN_DECL(cuModuleLoadDataEx, CUmodule*, const void*, unsigned int,
                  CUjit_option*, void**)
diff --git a/experimental/cuda2/memory_pools.c b/experimental/cuda2/memory_pools.c
new file mode 100644
index 000000000000..e29c5121c51a
--- /dev/null
+++ b/experimental/cuda2/memory_pools.c
@@ -0,0 +1,278 @@
+// Copyright 2023 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "experimental/cuda2/memory_pools.h"
+
+#include "experimental/cuda2/cuda_buffer.h"
+#include "experimental/cuda2/cuda_dynamic_symbols.h"
+#include "experimental/cuda2/cuda_status_util.h"
+#include "iree/base/tracing.h"
+
+// NOTE: these are currently global for all devices; we could make
+// device-specific ones by malloc() and leaking (with LSAN note) unique string
+// values instead.
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_ALLOCATION_TRACKING
+static const char* IREE_HAL_CUDA_DEVICE_LOCAL_POOL_RESERVED_ID =
+    "CUDA pool: device-local reserved";
+static const char* IREE_HAL_CUDA_OTHER_POOL_RESERVED_ID =
+    "CUDA pool: other reserved";
+#endif  // IREE_TRACING_FEATURE_ALLOCATION_TRACKING
+
+static iree_status_t iree_hal_cuda2_create_memory_pool(
+    const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols, CUdevice cu_device,
+    iree_hal_cuda2_memory_pool_params_t params,
+    CUmemoryPool* IREE_RESTRICT out_pool) {
+  *out_pool = NULL;
+
+  CUmemPoolProps pool_props = {
+      .allocType = CU_MEM_ALLOCATION_TYPE_PINNED,
+      // TODO: allow sharing of certain pool memory types by fd/HANDLE.
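+      // (e.g. CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR on Linux or
+      // CU_MEM_HANDLE_TYPE_WIN32 on Windows.)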
+      .handleTypes = CU_MEM_HANDLE_TYPE_NONE,
+      .location =
+          {
+              .type = CU_MEM_LOCATION_TYPE_DEVICE,
+              .id = cu_device,
+          },
+      .win32SecurityAttributes = NULL,
+      .reserved = {0},
+  };
+
+  CUmemoryPool pool = NULL;
+  IREE_CUDA_RETURN_IF_ERROR(cuda_symbols, cuMemPoolCreate(&pool, &pool_props),
+                            "cuMemPoolCreate");
+
+  iree_status_t status = IREE_CURESULT_TO_STATUS(
+      cuda_symbols,
+      cuMemPoolSetAttribute(pool, CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
+                            &params.release_threshold),
+      "cuMemPoolSetAttribute");
+
+  if (iree_status_is_ok(status)) {
+    *out_pool = pool;
+  } else {
+    IREE_CUDA_IGNORE_ERROR(cuda_symbols, cuMemPoolDestroy(pool));
+  }
+  return status;
+}
+
+iree_status_t iree_hal_cuda2_memory_pools_initialize(
+    iree_allocator_t host_allocator,
+    const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols, CUdevice cu_device,
+    const iree_hal_cuda2_memory_pooling_params_t* pooling_params,
+    iree_hal_cuda2_memory_pools_t* IREE_RESTRICT out_pools) {
+  IREE_ASSERT_ARGUMENT(cuda_symbols);
+  IREE_ASSERT_ARGUMENT(pooling_params);
+  IREE_ASSERT_ARGUMENT(out_pools);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  memset(out_pools, 0, sizeof(*out_pools));
+  out_pools->cuda_symbols = cuda_symbols;
+  out_pools->host_allocator = host_allocator;
+
+  iree_status_t status = iree_ok_status();
+
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_cuda2_create_memory_pool(cuda_symbols, cu_device,
+                                               pooling_params->device_local,
+                                               &out_pools->device_local);
+  }
+
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_cuda2_create_memory_pool(
+        cuda_symbols, cu_device, pooling_params->other, &out_pools->other);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+void iree_hal_cuda2_memory_pools_deinitialize(
+    iree_hal_cuda2_memory_pools_t* pools) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  if (pools->device_local) {
+    IREE_CUDA_IGNORE_ERROR(pools->cuda_symbols,
+                           cuMemPoolDestroy(pools->device_local));
+    pools->device_local = NULL;
+  }
+
+  if (pools->other) {
+    IREE_CUDA_IGNORE_ERROR(pools->cuda_symbols, cuMemPoolDestroy(pools->other));
+    pools->other = NULL;
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+static void iree_hal_cuda2_memory_pool_track_alloc(
+    iree_hal_cuda2_memory_pools_t* pools, iree_hal_buffer_t* buffer) {
+  bool is_device_local = iree_all_bits_set(iree_hal_buffer_memory_type(buffer),
+                                           IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL);
+  (void)is_device_local;
+  iree_device_size_t allocation_size = iree_hal_buffer_allocation_size(buffer);
+  (void)allocation_size;
+  IREE_TRACE_ALLOC_NAMED(
+      is_device_local ? IREE_HAL_CUDA_DEVICE_LOCAL_POOL_RESERVED_ID
+                      : IREE_HAL_CUDA_OTHER_POOL_RESERVED_ID,
+      (void*)iree_hal_cuda2_buffer_device_pointer(buffer), allocation_size);
+  IREE_STATISTICS({
+    iree_atomic_int64_t* bytes_allocated =
+        is_device_local ? &pools->statistics.device_bytes_allocated
+                        : &pools->statistics.host_bytes_allocated;
+    iree_atomic_fetch_add_int64(bytes_allocated, allocation_size,
+                                iree_memory_order_relaxed);
+  });
+}
+
+static void iree_hal_cuda2_memory_pool_track_free(
+    iree_hal_cuda2_memory_pools_t* pools, iree_hal_buffer_t* buffer) {
+  bool is_device_local = iree_all_bits_set(iree_hal_buffer_memory_type(buffer),
+                                           IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL);
+  (void)is_device_local;
+  IREE_TRACE_FREE_NAMED(is_device_local
+                            ? IREE_HAL_CUDA_DEVICE_LOCAL_POOL_RESERVED_ID
+                            : IREE_HAL_CUDA_OTHER_POOL_RESERVED_ID,
+                        (void*)iree_hal_cuda2_buffer_device_pointer(buffer));
+  IREE_STATISTICS({
+    iree_atomic_int64_t* bytes_freed =
+        is_device_local ? &pools->statistics.device_bytes_freed
+                        : &pools->statistics.host_bytes_freed;
+    iree_device_size_t allocation_size =
+        iree_hal_buffer_allocation_size(buffer);
+    iree_atomic_fetch_add_int64(bytes_freed, allocation_size,
+                                iree_memory_order_relaxed);
+  });
+}
+
+void iree_hal_cuda2_memory_pools_merge_statistics(
+    iree_hal_cuda2_memory_pools_t* pools,
+    iree_hal_allocator_statistics_t* statistics) {
+  IREE_STATISTICS({
+    statistics->device_bytes_allocated = iree_atomic_load_int64(
+        &pools->statistics.device_bytes_allocated, iree_memory_order_relaxed);
+    statistics->host_bytes_allocated = iree_atomic_load_int64(
+        &pools->statistics.host_bytes_allocated, iree_memory_order_relaxed);
+    statistics->device_bytes_freed = iree_atomic_load_int64(
+        &pools->statistics.device_bytes_freed, iree_memory_order_relaxed);
+    statistics->host_bytes_freed = iree_atomic_load_int64(
+        &pools->statistics.host_bytes_freed, iree_memory_order_relaxed);
+    if (pools->device_local) {
+      cuuint64_t pool_peak = 0;
+      IREE_CUDA_IGNORE_ERROR(
+          pools->cuda_symbols,
+          cuMemPoolGetAttribute(pools->device_local,
+                                CU_MEMPOOL_ATTR_USED_MEM_HIGH, &pool_peak));
+      statistics->device_bytes_peak += (iree_device_size_t)pool_peak;
+    }
+    if (pools->other) {
+      cuuint64_t pool_peak = 0;
+      IREE_CUDA_IGNORE_ERROR(
+          pools->cuda_symbols,
+          cuMemPoolGetAttribute(pools->other, CU_MEMPOOL_ATTR_USED_MEM_HIGH,
+                                &pool_peak));
+      statistics->host_bytes_peak += (iree_device_size_t)pool_peak;
+    }
+  });
+}
+
+// NOTE: this is only issued if the buffer is destroyed without having been
+// scheduled for deallocation asynchronously. When a buffer is scheduled we drop
+// the release callback so that this isn't called and we don't double-free.
+static void iree_hal_cuda2_async_buffer_release_callback(
+    void* user_data, iree_hal_buffer_t* buffer) {
+  iree_hal_cuda2_memory_pools_t* pools =
+      (iree_hal_cuda2_memory_pools_t*)user_data;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  CUdeviceptr device_ptr = iree_hal_cuda2_buffer_device_pointer(buffer);
+  IREE_CUDA_IGNORE_ERROR(pools->cuda_symbols, cuMemFree(device_ptr));
+  iree_hal_cuda2_memory_pool_track_free(pools, buffer);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+iree_status_t iree_hal_cuda2_memory_pools_alloca(
+    iree_hal_cuda2_memory_pools_t* pools, CUstream stream,
+    iree_hal_allocator_pool_t pool, iree_hal_buffer_params_t params,
+    iree_device_size_t allocation_size,
+    iree_hal_buffer_t** IREE_RESTRICT out_buffer) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_VALUE(z0, (int64_t)allocation_size);
+
+  iree_hal_buffer_params_canonicalize(&params);
+
+  // TODO: more pools and better selection; this is coarsely deciding between
+  // only device local (variables, constants, transients) and other (staging,
+  // external) but could use more buffer properties (including usage/export
+  // flags) to better isolate the different usage patterns and keep the pools
+  // operating with reasonable limits. We should be using the |pool| arg.
+  CUmemoryPool memory_pool =
+      iree_all_bits_set(params.type, IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL)
+          ? pools->device_local
+          : pools->other;
+
+  CUdeviceptr device_ptr = 0;
+  iree_status_t status = IREE_CURESULT_TO_STATUS(
+      pools->cuda_symbols,
+      cuMemAllocFromPoolAsync(&device_ptr, (size_t)allocation_size, memory_pool,
+                              stream),
+      "cuMemAllocFromPoolAsync");
+
+  // Wrap the allocated CUDA buffer in a HAL buffer.
+  // NOTE: we don't provide a device allocator because we didn't allocate from
+  // one and instead we use a release callback to perform the free if the user
+  // doesn't dealloca the buffer.
+  iree_hal_buffer_t* buffer = NULL;
+  if (iree_status_is_ok(status)) {
+    iree_hal_buffer_release_callback_t release_callback = {
+        .fn = iree_hal_cuda2_async_buffer_release_callback,
+        .user_data = pools,
+    };
+    status = iree_hal_cuda2_buffer_wrap(
+        /*device_allocator=*/NULL, params.type, params.access, params.usage,
+        allocation_size, /*byte_offset=*/0,
+        /*byte_length=*/allocation_size, IREE_HAL_CUDA_BUFFER_TYPE_ASYNC,
+        device_ptr, /*host_ptr=*/NULL, release_callback, pools->host_allocator,
+        &buffer);
+  }
+
+  if (iree_status_is_ok(status)) {
+    // Update statistics (note that it may not yet be accurate).
+    iree_hal_cuda2_memory_pool_track_alloc(pools, buffer);
+    *out_buffer = buffer;
+  } else if (buffer) {
+    iree_hal_buffer_release(buffer);
+  } else {
+    IREE_CUDA_IGNORE_ERROR(pools->cuda_symbols,
+                           cuMemFreeAsync(device_ptr, stream));
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+iree_status_t iree_hal_cuda2_memory_pools_dealloca(
+    iree_hal_cuda2_memory_pools_t* pools, CUstream stream,
+    iree_hal_buffer_t* buffer) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_VALUE(
+      z0, (int64_t)iree_hal_buffer_allocation_size(buffer));
+
+  // Try to schedule the buffer for freeing.
+  CUdeviceptr device_ptr = iree_hal_cuda2_buffer_device_pointer(buffer);
+  iree_status_t status = IREE_CURESULT_TO_STATUS(
+      pools->cuda_symbols, cuMemFreeAsync(device_ptr, stream),
+      "cuMemFreeAsync");
+
+  // Drop the release callback so that we don't try to double-free the buffer.
+  iree_hal_cuda2_buffer_drop_release_callback(buffer);
+
+  // Update statistics (note that it may not yet be accurate).
+  iree_hal_cuda2_memory_pool_track_free(pools, buffer);
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
diff --git a/experimental/cuda2/memory_pools.h b/experimental/cuda2/memory_pools.h
new file mode 100644
index 000000000000..8eccf9ef7105
--- /dev/null
+++ b/experimental/cuda2/memory_pools.h
@@ -0,0 +1,73 @@
+// Copyright 2023 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_DRIVERS_CUDA_MEMORY_POOLS_H_
+#define IREE_HAL_DRIVERS_CUDA_MEMORY_POOLS_H_
+
+#include "experimental/cuda2/api.h"
+#include "experimental/cuda2/cuda_dynamic_symbols.h"
+#include "experimental/cuda2/cuda_headers.h"
+#include "iree/base/api.h"
+#include "iree/base/internal/atomics.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Retained CUDA memory pools for various allocation types.
+typedef struct iree_hal_cuda2_memory_pools_t {
+  // Used exclusively for DEVICE_LOCAL allocations.
+  CUmemoryPool device_local;
+  // Used for any host-visible/host-local memory types.
+  CUmemoryPool other;
+
+  const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols;
+  iree_allocator_t host_allocator;
+
+  IREE_STATISTICS(struct {
+    iree_atomic_int64_t device_bytes_allocated;
+    iree_atomic_int64_t device_bytes_freed;
+    iree_atomic_int64_t host_bytes_allocated;
+    iree_atomic_int64_t host_bytes_freed;
+  } statistics;)
+} iree_hal_cuda2_memory_pools_t;
+
+// Initializes |out_pools| by configuring new CUDA memory pools.
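+// Pools must be released with iree_hal_cuda2_memory_pools_deinitialize once
+// no longer in use.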
+iree_status_t iree_hal_cuda2_memory_pools_initialize(
+    iree_allocator_t host_allocator,
+    const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols, CUdevice cu_device,
+    const iree_hal_cuda2_memory_pooling_params_t* pooling_params,
+    iree_hal_cuda2_memory_pools_t* IREE_RESTRICT out_pools);
+
+// Deinitializes the |pools| and releases the underlying CUDA resources.
+void iree_hal_cuda2_memory_pools_deinitialize(
+    iree_hal_cuda2_memory_pools_t* pools);
+
+// Merges statistics information from |pools| into |statistics|.
+void iree_hal_cuda2_memory_pools_merge_statistics(
+    iree_hal_cuda2_memory_pools_t* pools,
+    iree_hal_allocator_statistics_t* statistics);
+
+// Asynchronously allocates a buffer from an appropriate pool.
+// The allocation will be stream-ordered on |stream|.
+iree_status_t iree_hal_cuda2_memory_pools_alloca(
+    iree_hal_cuda2_memory_pools_t* pools, CUstream stream,
+    iree_hal_allocator_pool_t pool, iree_hal_buffer_params_t params,
+    iree_device_size_t allocation_size,
+    iree_hal_buffer_t** IREE_RESTRICT out_buffer);
+
+// Asynchronously deallocates a buffer from its pool.
+// The deallocation will be stream-ordered on |stream|.
+iree_status_t iree_hal_cuda2_memory_pools_dealloca(
+    iree_hal_cuda2_memory_pools_t* pools, CUstream stream,
+    iree_hal_buffer_t* buffer);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_DRIVERS_CUDA_MEMORY_POOLS_H_
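
Usage sketch (illustrative only, not part of the diff): how the pools and the
allocator added above compose at device creation time, assuming a loaded
`symbols` table plus `base_device`, `device`, `stream`, and `host_allocator`
already exist; the threshold values are placeholders:

  iree_hal_cuda2_memory_pooling_params_t pooling_params = {
      .device_local = {.release_threshold = 64 * 1024 * 1024},
      .other = {.release_threshold = 64 * 1024 * 1024},
  };
  // Pools are created first so the allocator can merge their statistics.
  iree_hal_cuda2_memory_pools_t pools;
  IREE_RETURN_IF_ERROR(iree_hal_cuda2_memory_pools_initialize(
      host_allocator, symbols, device, &pooling_params, &pools));
  // The allocator borrows |pools|; the pools must outlive it.
  iree_hal_allocator_t* allocator = NULL;
  IREE_RETURN_IF_ERROR(iree_hal_cuda2_allocator_create(
      base_device, symbols, device, stream, &pools, host_allocator,
      &allocator));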