From c3fc7c1dee2fd0af511f3430865e449d75366db0 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Wed, 7 Jun 2023 17:07:46 -0700 Subject: [PATCH 1/3] NFC: copy over CUDA allocator / buffer / memory pool --- experimental/cuda2/cuda_allocator.c | 594 ++++++++++++++++++++++++++++ experimental/cuda2/cuda_allocator.h | 33 ++ experimental/cuda2/cuda_buffer.c | 164 ++++++++ experimental/cuda2/cuda_buffer.h | 62 +++ experimental/cuda2/memory_pools.c | 273 +++++++++++++ experimental/cuda2/memory_pools.h | 71 ++++ 6 files changed, 1197 insertions(+) create mode 100644 experimental/cuda2/cuda_allocator.c create mode 100644 experimental/cuda2/cuda_allocator.h create mode 100644 experimental/cuda2/cuda_buffer.c create mode 100644 experimental/cuda2/cuda_buffer.h create mode 100644 experimental/cuda2/memory_pools.c create mode 100644 experimental/cuda2/memory_pools.h diff --git a/experimental/cuda2/cuda_allocator.c b/experimental/cuda2/cuda_allocator.c new file mode 100644 index 000000000000..eba64090819d --- /dev/null +++ b/experimental/cuda2/cuda_allocator.c @@ -0,0 +1,594 @@ +// Copyright 2021 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree/hal/drivers/cuda/cuda_allocator.h" + +#include + +#include "iree/base/api.h" +#include "iree/base/tracing.h" +#include "iree/hal/drivers/cuda/cuda_buffer.h" +#include "iree/hal/drivers/cuda/dynamic_symbols.h" +#include "iree/hal/drivers/cuda/status_util.h" + +#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_ALLOCATION_TRACKING +static const char* IREE_HAL_CUDA_ALLOCATOR_ID = "CUDA unpooled"; +#endif // IREE_TRACING_FEATURE_ALLOCATION_TRACKING + +typedef struct iree_hal_cuda_allocator_t { + iree_hal_resource_t resource; + iree_hal_device_t* base_device; + iree_hal_cuda_context_wrapper_t* context; + CUdevice device; + CUstream stream; + iree_hal_cuda_memory_pools_t* pools; + bool supports_concurrent_managed_access; + + IREE_STATISTICS(iree_hal_allocator_statistics_t statistics;) +} iree_hal_cuda_allocator_t; + +static const iree_hal_allocator_vtable_t iree_hal_cuda_allocator_vtable; + +static iree_hal_cuda_allocator_t* iree_hal_cuda_allocator_cast( + iree_hal_allocator_t* base_value) { + IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda_allocator_vtable); + return (iree_hal_cuda_allocator_t*)base_value; +} + +iree_status_t iree_hal_cuda_allocator_create( + iree_hal_device_t* base_device, iree_hal_cuda_context_wrapper_t* context, + CUdevice device, CUstream stream, iree_hal_cuda_memory_pools_t* pools, + iree_hal_allocator_t** out_allocator) { + IREE_ASSERT_ARGUMENT(base_device); + IREE_ASSERT_ARGUMENT(context); + IREE_ASSERT_ARGUMENT(pools); + IREE_TRACE_ZONE_BEGIN(z0); + + // To support device-local + host-visible memory we need concurrent managed + // access indicating that the host and devices can concurrently access the + // device memory. If we don't have this feature then we fall back to forcing + // all device-local + host-visible memory into host-local + device-visible + // page-locked memory. The compiler tries to avoid this for high-traffic + // buffers except for readback staging buffers. 
+ int supports_concurrent_managed_access = 0; + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, CU_RESULT_TO_STATUS( + context->syms, + cuDeviceGetAttribute( + &supports_concurrent_managed_access, + CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, device), + "cuDeviceGetAttribute")); + + IREE_TRACE_ZONE_APPEND_TEXT( + z0, supports_concurrent_managed_access + ? "has CONCURRENT_MANAGED_ACCESS" + : "no CONCURRENT_MANAGED_ACCESS (expect slow accesses on " + "device-local + host-visible memory)"); + + iree_hal_cuda_allocator_t* allocator = NULL; + iree_status_t status = iree_allocator_malloc( + context->host_allocator, sizeof(*allocator), (void**)&allocator); + if (iree_status_is_ok(status)) { + iree_hal_resource_initialize(&iree_hal_cuda_allocator_vtable, + &allocator->resource); + allocator->base_device = base_device; + allocator->context = context; + allocator->device = device; + allocator->stream = stream; + allocator->pools = pools; + allocator->supports_concurrent_managed_access = + supports_concurrent_managed_access != 0; + *out_allocator = (iree_hal_allocator_t*)allocator; + } + + IREE_TRACE_ZONE_END(z0); + return status; +} + +static void iree_hal_cuda_allocator_destroy( + iree_hal_allocator_t* IREE_RESTRICT base_allocator) { + iree_hal_cuda_allocator_t* allocator = + iree_hal_cuda_allocator_cast(base_allocator); + iree_allocator_t host_allocator = allocator->context->host_allocator; + IREE_TRACE_ZONE_BEGIN(z0); + + iree_allocator_free(host_allocator, allocator); + + IREE_TRACE_ZONE_END(z0); +} + +static iree_allocator_t iree_hal_cuda_allocator_host_allocator( + const iree_hal_allocator_t* IREE_RESTRICT base_allocator) { + iree_hal_cuda_allocator_t* allocator = + (iree_hal_cuda_allocator_t*)base_allocator; + return allocator->context->host_allocator; +} + +static iree_status_t iree_hal_cuda_allocator_trim( + iree_hal_allocator_t* IREE_RESTRICT base_allocator) { + return iree_ok_status(); +} + +static void iree_hal_cuda_allocator_query_statistics( + iree_hal_allocator_t* IREE_RESTRICT base_allocator, + iree_hal_allocator_statistics_t* IREE_RESTRICT out_statistics) { + IREE_STATISTICS({ + iree_hal_cuda_allocator_t* allocator = + iree_hal_cuda_allocator_cast(base_allocator); + memcpy(out_statistics, &allocator->statistics, sizeof(*out_statistics)); + iree_hal_cuda_memory_pools_merge_statistics(allocator->pools, + out_statistics); + }); +} + +static iree_status_t iree_hal_cuda_allocator_query_memory_heaps( + iree_hal_allocator_t* IREE_RESTRICT base_allocator, + iree_host_size_t capacity, + iree_hal_allocator_memory_heap_t* IREE_RESTRICT heaps, + iree_host_size_t* IREE_RESTRICT out_count) { + iree_hal_cuda_allocator_t* allocator = + iree_hal_cuda_allocator_cast(base_allocator); + + // TODO(benvanik): check CU_DEVICE_ATTRIBUTE_INTEGRATED and return a unified + // set of heaps (likely still a cached and uncached, at minimum). + iree_host_size_t count = 3; + if (allocator->supports_concurrent_managed_access) { + ++count; // device-local | host-visible + } + if (out_count) *out_count = count; + if (capacity < count) { + // NOTE: lightweight as this is hit in normal pre-sizing usage. + return iree_status_from_code(IREE_STATUS_OUT_OF_RANGE); + } + + // Don't think there's a query for these. + // Max allocation size may be much smaller in certain memory types such as + // page-locked memory and it'd be good to enforce that. 
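+  // (Possible refinement, not done here: cuMemGetInfo() reports free/total
+  // device memory and could serve as a coarse upper bound, but it provides no
+  // per-memory-type or per-allocation maximum, so the limits below are left
+  // unbounded.)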
+ const iree_device_size_t max_allocation_size = ~(iree_device_size_t)0; + const iree_device_size_t min_alignment = 64; + + int i = 0; + + // Device-local memory (dispatch resources): + heaps[i++] = (iree_hal_allocator_memory_heap_t){ + .type = IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL, + .allowed_usage = + IREE_HAL_BUFFER_USAGE_TRANSFER | IREE_HAL_BUFFER_USAGE_DISPATCH, + .max_allocation_size = max_allocation_size, + .min_alignment = min_alignment, + }; + + if (allocator->supports_concurrent_managed_access) { + // Device-local managed memory with host mapping support: + heaps[i++] = (iree_hal_allocator_memory_heap_t){ + .type = IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL | + IREE_HAL_MEMORY_TYPE_HOST_VISIBLE | + IREE_HAL_MEMORY_TYPE_HOST_COHERENT, + .allowed_usage = IREE_HAL_BUFFER_USAGE_TRANSFER | + IREE_HAL_BUFFER_USAGE_DISPATCH | + IREE_HAL_BUFFER_USAGE_MAPPING, + .max_allocation_size = max_allocation_size, + .min_alignment = min_alignment, + }; + } + + // Write-combined page-locked host-local memory (upload): + heaps[i++] = (iree_hal_allocator_memory_heap_t){ + .type = IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE | + IREE_HAL_MEMORY_TYPE_HOST_LOCAL | + IREE_HAL_MEMORY_TYPE_HOST_COHERENT, + .allowed_usage = IREE_HAL_BUFFER_USAGE_TRANSFER | + IREE_HAL_BUFFER_USAGE_DISPATCH | + IREE_HAL_BUFFER_USAGE_MAPPING, + .max_allocation_size = max_allocation_size, + .min_alignment = min_alignment, + }; + + // Cached page-locked host-local memory (download): + heaps[i++] = (iree_hal_allocator_memory_heap_t){ + .type = IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE | + IREE_HAL_MEMORY_TYPE_HOST_LOCAL | + IREE_HAL_MEMORY_TYPE_HOST_COHERENT | + IREE_HAL_MEMORY_TYPE_HOST_CACHED, + .allowed_usage = IREE_HAL_BUFFER_USAGE_TRANSFER | + IREE_HAL_BUFFER_USAGE_DISPATCH | + IREE_HAL_BUFFER_USAGE_MAPPING, + .max_allocation_size = max_allocation_size, + .min_alignment = min_alignment, + }; + + IREE_ASSERT(i == count); + return iree_ok_status(); +} + +static iree_hal_buffer_compatibility_t +iree_hal_cuda_allocator_query_buffer_compatibility( + iree_hal_allocator_t* IREE_RESTRICT base_allocator, + iree_hal_buffer_params_t* IREE_RESTRICT params, + iree_device_size_t* IREE_RESTRICT allocation_size) { + iree_hal_cuda_allocator_t* allocator = + iree_hal_cuda_allocator_cast(base_allocator); + + // All buffers can be allocated on the heap. + iree_hal_buffer_compatibility_t compatibility = + IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE; + + // Buffers are importable in CUDA under most cases, though performance may + // vary wildly. We don't fully verify that the buffer parameters are + // self-consistent and just look at whether we can get a device pointer. + if (iree_all_bits_set(params->type, IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE)) { + compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_IMPORTABLE; + } + + // Buffers can only be used on the queue if they are device visible. + if (iree_all_bits_set(params->type, IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE)) { + if (iree_any_bit_set(params->usage, IREE_HAL_BUFFER_USAGE_TRANSFER)) { + compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_TRANSFER; + } + if (iree_any_bit_set(params->usage, + IREE_HAL_BUFFER_USAGE_DISPATCH_STORAGE)) { + compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_DISPATCH; + } + } + + // If concurrent managed access is not supported then make device-local + + // host-visible allocations fall back to host-local + device-visible + // page-locked memory. 
This will be significantly slower for the device to + // access but the compiler only uses this type for readback staging buffers + // and it's better to function than function fast. + if (!allocator->supports_concurrent_managed_access && + iree_all_bits_set(params->type, IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL | + IREE_HAL_MEMORY_TYPE_HOST_VISIBLE)) { + compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_LOW_PERFORMANCE; + params->type &= ~(IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL | + IREE_HAL_MEMORY_TYPE_HOST_VISIBLE); + params->type |= + IREE_HAL_MEMORY_TYPE_HOST_LOCAL | IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE; + } + + // We are now optimal. + params->type &= ~IREE_HAL_MEMORY_TYPE_OPTIMAL; + + // Guard against the corner case where the requested buffer size is 0. The + // application is unlikely to do anything when requesting a 0-byte buffer; but + // it can happen in real world use cases. So we should at least not crash. + if (*allocation_size == 0) *allocation_size = 4; + + return compatibility; +} + +static void iree_hal_cuda_buffer_free(iree_hal_cuda_context_wrapper_t* context, + iree_hal_cuda_buffer_type_t buffer_type, + CUdeviceptr device_ptr, void* host_ptr) { + IREE_TRACE_ZONE_BEGIN(z0); + switch (buffer_type) { + case IREE_HAL_CUDA_BUFFER_TYPE_DEVICE: { + IREE_TRACE_ZONE_APPEND_TEXT(z0, "cuMemFree"); + CUDA_IGNORE_ERROR(context->syms, cuMemFree(device_ptr)); + break; + } + case IREE_HAL_CUDA_BUFFER_TYPE_HOST: { + IREE_TRACE_ZONE_APPEND_TEXT(z0, "cuMemFreeHost"); + CUDA_IGNORE_ERROR(context->syms, cuMemFreeHost(host_ptr)); + break; + } + case IREE_HAL_CUDA_BUFFER_TYPE_HOST_REGISTERED: { + IREE_TRACE_ZONE_APPEND_TEXT(z0, "cuMemHostUnregister"); + CUDA_IGNORE_ERROR(context->syms, cuMemHostUnregister(host_ptr)); + break; + } + case IREE_HAL_CUDA_BUFFER_TYPE_ASYNC: { + IREE_TRACE_ZONE_APPEND_TEXT(z0, "(ignored; async)"); + break; + } + } + IREE_TRACE_ZONE_END(z0); +} + +static iree_status_t iree_hal_cuda_allocator_allocate_buffer( + iree_hal_allocator_t* IREE_RESTRICT base_allocator, + const iree_hal_buffer_params_t* IREE_RESTRICT params, + iree_device_size_t allocation_size, iree_const_byte_span_t initial_data, + iree_hal_buffer_t** IREE_RESTRICT out_buffer) { + iree_hal_cuda_allocator_t* allocator = + iree_hal_cuda_allocator_cast(base_allocator); + + // Coerce options into those required by the current device. 
+ iree_hal_buffer_params_t compat_params = *params; + iree_hal_buffer_compatibility_t compatibility = + iree_hal_cuda_allocator_query_buffer_compatibility( + base_allocator, &compat_params, &allocation_size); + if (!iree_all_bits_set(compatibility, + IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE)) { +#if IREE_STATUS_MODE + iree_bitfield_string_temp_t temp0, temp1, temp2; + iree_string_view_t memory_type_str = + iree_hal_memory_type_format(params->type, &temp0); + iree_string_view_t usage_str = + iree_hal_buffer_usage_format(params->usage, &temp1); + iree_string_view_t compatibility_str = + iree_hal_buffer_compatibility_format(compatibility, &temp2); + return iree_make_status( + IREE_STATUS_INVALID_ARGUMENT, + "allocator cannot allocate a buffer with the given parameters; " + "memory_type=%.*s, usage=%.*s, compatibility=%.*s", + (int)memory_type_str.size, memory_type_str.data, (int)usage_str.size, + usage_str.data, (int)compatibility_str.size, compatibility_str.data); +#else + return iree_make_status( + IREE_STATUS_INVALID_ARGUMENT, + "allocator cannot allocate a buffer with the given parameters"); +#endif // IREE_STATUS_MODE + } + + iree_status_t status = iree_ok_status(); + iree_hal_cuda_buffer_type_t buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE; + void* host_ptr = NULL; + CUdeviceptr device_ptr = 0; + IREE_TRACE_ZONE_BEGIN_NAMED(z0, "iree_hal_cuda_buffer_allocate"); + IREE_TRACE_ZONE_APPEND_VALUE(z0, allocation_size); + if (iree_all_bits_set(compat_params.type, + IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL)) { + // Device local case. + if (iree_all_bits_set(compat_params.type, + IREE_HAL_MEMORY_TYPE_HOST_VISIBLE)) { + buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE; + status = + CU_RESULT_TO_STATUS(allocator->context->syms, + cuMemAllocManaged(&device_ptr, allocation_size, + CU_MEM_ATTACH_GLOBAL)); + if (iree_status_is_ok(status) && + allocator->supports_concurrent_managed_access) { + // Prefetch the buffer on the GPU device. + status = CU_RESULT_TO_STATUS( + allocator->context->syms, + cuMemPrefetchAsync(device_ptr, allocation_size, allocator->device, + allocator->stream)); + } + host_ptr = (void*)device_ptr; + } else { + // Device only. + buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE; + status = CU_RESULT_TO_STATUS(allocator->context->syms, + cuMemAlloc(&device_ptr, allocation_size)); + } + } else { + buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_HOST; + unsigned int flags = CU_MEMHOSTALLOC_DEVICEMAP; + if (!iree_all_bits_set(compat_params.type, + IREE_HAL_MEMORY_TYPE_HOST_CACHED)) { + flags |= CU_MEMHOSTALLOC_WRITECOMBINED; + } + status = + CU_RESULT_TO_STATUS(allocator->context->syms, + cuMemHostAlloc(&host_ptr, allocation_size, flags)); + if (iree_status_is_ok(status)) { + status = CU_RESULT_TO_STATUS( + allocator->context->syms, + cuMemHostGetDevicePointer(&device_ptr, host_ptr, /*flags=*/0)); + } + } + IREE_TRACE_ZONE_END(z0); + + iree_hal_buffer_t* buffer = NULL; + if (iree_status_is_ok(status)) { + status = iree_hal_cuda_buffer_wrap( + base_allocator, compat_params.type, compat_params.access, + compat_params.usage, allocation_size, + /*byte_offset=*/0, + /*byte_length=*/allocation_size, buffer_type, device_ptr, host_ptr, + iree_hal_buffer_release_callback_null(), + iree_hal_allocator_host_allocator(base_allocator), &buffer); + } + + // Copy the initial contents into the buffer. This may require staging. 
+ if (iree_status_is_ok(status) && + !iree_const_byte_span_is_empty(initial_data)) { + status = iree_hal_device_transfer_range( + allocator->base_device, + iree_hal_make_host_transfer_buffer_span((void*)initial_data.data, + initial_data.data_length), + 0, iree_hal_make_device_transfer_buffer(buffer), 0, + initial_data.data_length, IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT, + iree_infinite_timeout()); + } + + if (iree_status_is_ok(status)) { + IREE_TRACE_ALLOC_NAMED(IREE_HAL_CUDA_ALLOCATOR_ID, + (void*)iree_hal_cuda_buffer_device_pointer(buffer), + allocation_size); + IREE_STATISTICS(iree_hal_allocator_statistics_record_alloc( + &allocator->statistics, compat_params.type, allocation_size)); + *out_buffer = buffer; + } else { + if (!buffer) { + iree_hal_cuda_buffer_free(allocator->context, buffer_type, device_ptr, + host_ptr); + } else { + iree_hal_buffer_release(buffer); + } + } + return status; +} + +static void iree_hal_cuda_allocator_deallocate_buffer( + iree_hal_allocator_t* IREE_RESTRICT base_allocator, + iree_hal_buffer_t* IREE_RESTRICT base_buffer) { + iree_hal_cuda_allocator_t* allocator = + iree_hal_cuda_allocator_cast(base_allocator); + + const iree_hal_cuda_buffer_type_t buffer_type = + iree_hal_cuda_buffer_type(base_buffer); + + // WARNING: we may be called from a random thread and need to ensure that we + // have an active CUDA context. Unfortunately CUDA is CUDA and trying to + // change the context here will result in full device synchronization. In the + // future we'll need to do something fairly complex such as having a dedicated + // thread with a persistently bound context that does nothing but free + // buffers. The load on this will be lighter when queue-ordered allocations + // are used or any sort of pooling policy is applied. + // + // WARNING: with CUDA's lazy error propagation it's possible that by the time + // this code is running something else has triggered device loss and we can't + // actually use the context. In that case we can't perform the frees and want + // to silently ignore them: whatever the user tries to do next will fail in + // the same way and if we were deallocating this buffer as part of a tear-down + // on failure we don't want to end up dying during cleanup. + iree_hal_cuda_buffer_free(allocator->context, buffer_type, + iree_hal_cuda_buffer_device_pointer(base_buffer), + iree_hal_cuda_buffer_host_pointer(base_buffer)); + + switch (buffer_type) { + case IREE_HAL_CUDA_BUFFER_TYPE_DEVICE: + case IREE_HAL_CUDA_BUFFER_TYPE_HOST: { + IREE_TRACE_FREE_NAMED( + IREE_HAL_CUDA_ALLOCATOR_ID, + (void*)iree_hal_cuda_buffer_device_pointer(base_buffer)); + IREE_STATISTICS(iree_hal_allocator_statistics_record_free( + &allocator->statistics, iree_hal_buffer_memory_type(base_buffer), + iree_hal_buffer_allocation_size(base_buffer))); + break; + } + default: + // Buffer type not tracked. + break; + } + + iree_hal_buffer_destroy(base_buffer); +} + +static iree_status_t iree_hal_cuda_allocator_import_buffer( + iree_hal_allocator_t* IREE_RESTRICT base_allocator, + const iree_hal_buffer_params_t* IREE_RESTRICT params, + iree_hal_external_buffer_t* IREE_RESTRICT external_buffer, + iree_hal_buffer_release_callback_t release_callback, + iree_hal_buffer_t** IREE_RESTRICT out_buffer) { + iree_hal_cuda_allocator_t* allocator = + iree_hal_cuda_allocator_cast(base_allocator); + + // Coerce options into those required by the current device. 
+ iree_hal_buffer_params_t compat_params = *params; + iree_device_size_t allocation_size = external_buffer->size; + iree_hal_buffer_compatibility_t compatibility = + iree_hal_cuda_allocator_query_buffer_compatibility( + base_allocator, &compat_params, &allocation_size); + if (!iree_all_bits_set(compatibility, + IREE_HAL_BUFFER_COMPATIBILITY_IMPORTABLE)) { +#if IREE_STATUS_MODE + iree_bitfield_string_temp_t temp0, temp1, temp2; + iree_string_view_t memory_type_str = + iree_hal_memory_type_format(params->type, &temp0); + iree_string_view_t usage_str = + iree_hal_buffer_usage_format(params->usage, &temp1); + iree_string_view_t compatibility_str = + iree_hal_buffer_compatibility_format(compatibility, &temp2); + return iree_make_status( + IREE_STATUS_INVALID_ARGUMENT, + "allocator cannot import a buffer with the given parameters; " + "memory_type=%.*s, usage=%.*s, compatibility=%.*s", + (int)memory_type_str.size, memory_type_str.data, (int)usage_str.size, + usage_str.data, (int)compatibility_str.size, compatibility_str.data); +#else + return iree_make_status( + IREE_STATUS_INVALID_ARGUMENT, + "allocator cannot import a buffer with the given parameters"); +#endif // IREE_STATUS_MODE + } + + iree_status_t status = iree_ok_status(); + iree_hal_cuda_buffer_type_t buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE; + void* host_ptr = NULL; + CUdeviceptr device_ptr = 0; + + switch (external_buffer->type) { + case IREE_HAL_EXTERNAL_BUFFER_TYPE_HOST_ALLOCATION: { + if (iree_all_bits_set(compat_params.type, + IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL)) { + return iree_make_status( + IREE_STATUS_INVALID_ARGUMENT, + "unable to register host allocations as device-local memory"); + } + buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_HOST_REGISTERED; + host_ptr = external_buffer->handle.host_allocation.ptr; + uint32_t register_flags = 0; + if (compat_params.access == IREE_HAL_MEMORY_ACCESS_READ) { + register_flags = CU_MEMHOSTREGISTER_READ_ONLY; + } + if (iree_any_bit_set(compat_params.usage, + IREE_HAL_BUFFER_USAGE_DISPATCH_INDIRECT_PARAMS | + IREE_HAL_BUFFER_USAGE_DISPATCH_UNIFORM_READ | + IREE_HAL_BUFFER_USAGE_DISPATCH_STORAGE | + IREE_HAL_BUFFER_USAGE_DISPATCH_IMAGE)) { + register_flags = CU_MEMHOSTREGISTER_DEVICEMAP; + } + status = CU_RESULT_TO_STATUS( + allocator->context->syms, + cuMemHostRegister(host_ptr, external_buffer->size, register_flags), + "cuMemHostRegister"); + if (iree_status_is_ok(status)) { + status = CU_RESULT_TO_STATUS( + allocator->context->syms, + cuMemHostGetDevicePointer(&device_ptr, host_ptr, 0), + "cuMemHostGetDevicePointer"); + } + break; + } + case IREE_HAL_EXTERNAL_BUFFER_TYPE_OPAQUE_FD: + case IREE_HAL_EXTERNAL_BUFFER_TYPE_OPAQUE_WIN32: + return iree_make_status(IREE_STATUS_UNIMPLEMENTED, + "handle-based imports not yet implemented"); + default: + return iree_make_status(IREE_STATUS_UNIMPLEMENTED, + "external buffer type not supported"); + } + + iree_hal_buffer_t* buffer = NULL; + if (iree_status_is_ok(status)) { + status = iree_hal_cuda_buffer_wrap( + base_allocator, compat_params.type, compat_params.access, + compat_params.usage, external_buffer->size, + /*byte_offset=*/0, + /*byte_length=*/external_buffer->size, buffer_type, device_ptr, + host_ptr, release_callback, + iree_hal_allocator_host_allocator(base_allocator), &buffer); + } + + if (iree_status_is_ok(status)) { + *out_buffer = buffer; + } else { + if (!buffer) { + iree_hal_cuda_buffer_free(allocator->context, buffer_type, device_ptr, + host_ptr); + } else { + iree_hal_buffer_release(buffer); + } + } + return status; +} + +static 
iree_status_t iree_hal_cuda_allocator_export_buffer( + iree_hal_allocator_t* IREE_RESTRICT base_allocator, + iree_hal_buffer_t* IREE_RESTRICT buffer, + iree_hal_external_buffer_type_t requested_type, + iree_hal_external_buffer_flags_t requested_flags, + iree_hal_external_buffer_t* IREE_RESTRICT out_external_buffer) { + return iree_make_status(IREE_STATUS_UNAVAILABLE, + "exporting to external buffers not supported"); +} + +static const iree_hal_allocator_vtable_t iree_hal_cuda_allocator_vtable = { + .destroy = iree_hal_cuda_allocator_destroy, + .host_allocator = iree_hal_cuda_allocator_host_allocator, + .trim = iree_hal_cuda_allocator_trim, + .query_statistics = iree_hal_cuda_allocator_query_statistics, + .query_memory_heaps = iree_hal_cuda_allocator_query_memory_heaps, + .query_buffer_compatibility = + iree_hal_cuda_allocator_query_buffer_compatibility, + .allocate_buffer = iree_hal_cuda_allocator_allocate_buffer, + .deallocate_buffer = iree_hal_cuda_allocator_deallocate_buffer, + .import_buffer = iree_hal_cuda_allocator_import_buffer, + .export_buffer = iree_hal_cuda_allocator_export_buffer, +}; diff --git a/experimental/cuda2/cuda_allocator.h b/experimental/cuda2/cuda_allocator.h new file mode 100644 index 000000000000..0df31ee00c77 --- /dev/null +++ b/experimental/cuda2/cuda_allocator.h @@ -0,0 +1,33 @@ +// Copyright 2021 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef IREE_HAL_DRIVERS_CUDA_ALLOCATOR_H_ +#define IREE_HAL_DRIVERS_CUDA_ALLOCATOR_H_ + +#include "iree/base/api.h" +#include "iree/hal/api.h" +#include "iree/hal/drivers/cuda/context_wrapper.h" +#include "iree/hal/drivers/cuda/memory_pools.h" +#include "iree/hal/drivers/cuda/status_util.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// Creates a CUDA memory allocator. +// |device| and |stream| will be used for management operations. +// |pools| provides memory pools that may be shared across multiple allocators +// and the pointer must remain valid for the lifetime of the allocator. +iree_status_t iree_hal_cuda_allocator_create( + iree_hal_device_t* base_device, iree_hal_cuda_context_wrapper_t* context, + CUdevice device, CUstream stream, iree_hal_cuda_memory_pools_t* pools, + iree_hal_allocator_t** out_allocator); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // IREE_HAL_DRIVERS_CUDA_ALLOCATOR_H_ diff --git a/experimental/cuda2/cuda_buffer.c b/experimental/cuda2/cuda_buffer.c new file mode 100644 index 000000000000..87c41475b597 --- /dev/null +++ b/experimental/cuda2/cuda_buffer.c @@ -0,0 +1,164 @@ +// Copyright 2021 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree/hal/drivers/cuda/cuda_buffer.h" + +#include +#include +#include + +#include "iree/base/api.h" +#include "iree/base/tracing.h" + +typedef struct iree_hal_cuda_buffer_t { + iree_hal_buffer_t base; + iree_hal_cuda_buffer_type_t type; + void* host_ptr; + CUdeviceptr device_ptr; + iree_hal_buffer_release_callback_t release_callback; +} iree_hal_cuda_buffer_t; + +static const iree_hal_buffer_vtable_t iree_hal_cuda_buffer_vtable; + +static iree_hal_cuda_buffer_t* iree_hal_cuda_buffer_cast( + iree_hal_buffer_t* base_value) { + IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda_buffer_vtable); + return (iree_hal_cuda_buffer_t*)base_value; +} + +static const iree_hal_cuda_buffer_t* iree_hal_cuda_buffer_const_cast( + const iree_hal_buffer_t* base_value) { + IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda_buffer_vtable); + return (const iree_hal_cuda_buffer_t*)base_value; +} + +iree_status_t iree_hal_cuda_buffer_wrap( + iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type, + iree_hal_memory_access_t allowed_access, + iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size, + iree_device_size_t byte_offset, iree_device_size_t byte_length, + iree_hal_cuda_buffer_type_t buffer_type, CUdeviceptr device_ptr, + void* host_ptr, iree_hal_buffer_release_callback_t release_callback, + iree_allocator_t host_allocator, iree_hal_buffer_t** out_buffer) { + IREE_ASSERT_ARGUMENT(out_buffer); + IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_cuda_buffer_t* buffer = NULL; + iree_status_t status = + iree_allocator_malloc(host_allocator, sizeof(*buffer), (void**)&buffer); + if (iree_status_is_ok(status)) { + iree_hal_buffer_initialize(host_allocator, allocator, &buffer->base, + allocation_size, byte_offset, byte_length, + memory_type, allowed_access, allowed_usage, + &iree_hal_cuda_buffer_vtable, &buffer->base); + buffer->type = buffer_type; + buffer->host_ptr = host_ptr; + buffer->device_ptr = device_ptr; + buffer->release_callback = release_callback; + *out_buffer = &buffer->base; + } + + IREE_TRACE_ZONE_END(z0); + return status; +} + +static void iree_hal_cuda_buffer_destroy(iree_hal_buffer_t* base_buffer) { + iree_hal_cuda_buffer_t* buffer = iree_hal_cuda_buffer_cast(base_buffer); + iree_allocator_t host_allocator = base_buffer->host_allocator; + IREE_TRACE_ZONE_BEGIN(z0); + if (buffer->release_callback.fn) { + buffer->release_callback.fn(buffer->release_callback.user_data, + base_buffer); + } + iree_allocator_free(host_allocator, buffer); + IREE_TRACE_ZONE_END(z0); +} + +static iree_status_t iree_hal_cuda_buffer_map_range( + iree_hal_buffer_t* base_buffer, iree_hal_mapping_mode_t mapping_mode, + iree_hal_memory_access_t memory_access, + iree_device_size_t local_byte_offset, iree_device_size_t local_byte_length, + iree_hal_buffer_mapping_t* mapping) { + iree_hal_cuda_buffer_t* buffer = iree_hal_cuda_buffer_cast(base_buffer); + + // TODO(benvanik): add upload/download for unmapped buffers. + IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_memory_type( + iree_hal_buffer_memory_type(base_buffer), + IREE_HAL_MEMORY_TYPE_HOST_VISIBLE)); + IREE_RETURN_IF_ERROR( + iree_hal_buffer_validate_usage(iree_hal_buffer_allowed_usage(base_buffer), + IREE_HAL_BUFFER_USAGE_MAPPING)); + + uint8_t* data_ptr = (uint8_t*)(buffer->host_ptr) + local_byte_offset; + // If we mapped for discard scribble over the bytes. This is not a mandated + // behavior but it will make debugging issues easier. 
Alternatively for + // heap buffers we could reallocate them such that ASAN yells, but that + // would only work if the entire buffer was discarded. +#ifndef NDEBUG + if (iree_any_bit_set(memory_access, IREE_HAL_MEMORY_ACCESS_DISCARD)) { + memset(data_ptr, 0xCD, local_byte_length); + } +#endif // !NDEBUG + + mapping->contents = iree_make_byte_span(data_ptr, local_byte_length); + return iree_ok_status(); +} + +static iree_status_t iree_hal_cuda_buffer_unmap_range( + iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset, + iree_device_size_t local_byte_length, iree_hal_buffer_mapping_t* mapping) { + // Nothing to do (today). + return iree_ok_status(); +} + +static iree_status_t iree_hal_cuda_buffer_invalidate_range( + iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset, + iree_device_size_t local_byte_length) { + // Nothing to do. + return iree_ok_status(); +} + +static iree_status_t iree_hal_cuda_buffer_flush_range( + iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset, + iree_device_size_t local_byte_length) { + // Nothing to do. + return iree_ok_status(); +} + +iree_hal_cuda_buffer_type_t iree_hal_cuda_buffer_type( + const iree_hal_buffer_t* base_buffer) { + const iree_hal_cuda_buffer_t* buffer = + iree_hal_cuda_buffer_const_cast(base_buffer); + return buffer->type; +} + +CUdeviceptr iree_hal_cuda_buffer_device_pointer( + const iree_hal_buffer_t* base_buffer) { + const iree_hal_cuda_buffer_t* buffer = + iree_hal_cuda_buffer_const_cast(base_buffer); + return buffer->device_ptr; +} + +void* iree_hal_cuda_buffer_host_pointer(const iree_hal_buffer_t* base_buffer) { + const iree_hal_cuda_buffer_t* buffer = + iree_hal_cuda_buffer_const_cast(base_buffer); + return buffer->host_ptr; +} + +void iree_hal_cuda_buffer_drop_release_callback( + iree_hal_buffer_t* base_buffer) { + iree_hal_cuda_buffer_t* buffer = iree_hal_cuda_buffer_cast(base_buffer); + buffer->release_callback = iree_hal_buffer_release_callback_null(); +} + +static const iree_hal_buffer_vtable_t iree_hal_cuda_buffer_vtable = { + .recycle = iree_hal_buffer_recycle, + .destroy = iree_hal_cuda_buffer_destroy, + .map_range = iree_hal_cuda_buffer_map_range, + .unmap_range = iree_hal_cuda_buffer_unmap_range, + .invalidate_range = iree_hal_cuda_buffer_invalidate_range, + .flush_range = iree_hal_cuda_buffer_flush_range, +}; diff --git a/experimental/cuda2/cuda_buffer.h b/experimental/cuda2/cuda_buffer.h new file mode 100644 index 000000000000..640d6f52dcfb --- /dev/null +++ b/experimental/cuda2/cuda_buffer.h @@ -0,0 +1,62 @@ +// Copyright 2021 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef IREE_HAL_DRIVERS_CUDA_BUFFER_H_ +#define IREE_HAL_DRIVERS_CUDA_BUFFER_H_ + +#include "iree/base/api.h" +#include "iree/hal/api.h" +#include "iree/hal/drivers/cuda/cuda_headers.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +typedef enum iree_hal_cuda_buffer_type_e { + // cuMemAlloc/cuMemAllocManaged + cuMemFree + IREE_HAL_CUDA_BUFFER_TYPE_DEVICE = 0, + // cuMemHostAlloc + cuMemFreeHost + IREE_HAL_CUDA_BUFFER_TYPE_HOST, + // cuMemHostRegister + cuMemHostUnregister + IREE_HAL_CUDA_BUFFER_TYPE_HOST_REGISTERED, + // cuMemAllocFromPoolAsync + cuMemFree/cuMemFreeAsync + IREE_HAL_CUDA_BUFFER_TYPE_ASYNC, +} iree_hal_cuda_buffer_type_t; + +// Wraps a CUDA allocation in an iree_hal_buffer_t. 
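+// A minimal usage sketch (illustrative only; |device_allocator|, |size|,
+// |device_ptr|, and |host_allocator| are placeholder names for values the
+// caller already owns, e.g. |device_ptr| from a prior cuMemAlloc):
+//   iree_hal_buffer_t* buffer = NULL;
+//   IREE_RETURN_IF_ERROR(iree_hal_cuda_buffer_wrap(
+//       device_allocator, IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL,
+//       IREE_HAL_MEMORY_ACCESS_ALL, IREE_HAL_BUFFER_USAGE_DEFAULT, size,
+//       /*byte_offset=*/0, /*byte_length=*/size,
+//       IREE_HAL_CUDA_BUFFER_TYPE_DEVICE, device_ptr, /*host_ptr=*/NULL,
+//       iree_hal_buffer_release_callback_null(), host_allocator, &buffer));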
+iree_status_t iree_hal_cuda_buffer_wrap( + iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type, + iree_hal_memory_access_t allowed_access, + iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size, + iree_device_size_t byte_offset, iree_device_size_t byte_length, + iree_hal_cuda_buffer_type_t buffer_type, CUdeviceptr device_ptr, + void* host_ptr, iree_hal_buffer_release_callback_t release_callback, + iree_allocator_t host_allocator, iree_hal_buffer_t** out_buffer); + +// Returns the underlying CUDA buffer type. +iree_hal_cuda_buffer_type_t iree_hal_cuda_buffer_type( + const iree_hal_buffer_t* buffer); + +// Returns the CUDA base pointer for the given |buffer|. +// This is the entire allocated_buffer and must be offset by the buffer +// byte_offset and byte_length when used. +CUdeviceptr iree_hal_cuda_buffer_device_pointer( + const iree_hal_buffer_t* buffer); + +// Returns the CUDA host pointer for the given |buffer|, if available. +void* iree_hal_cuda_buffer_host_pointer(const iree_hal_buffer_t* buffer); + +// Drops the release callback so that when the buffer is destroyed no callback +// will be made. This is not thread safe but all callers are expected to be +// holding an allocation and the earliest the buffer could be destroyed is after +// this call returns and the caller has released its reference. +void iree_hal_cuda_buffer_drop_release_callback(iree_hal_buffer_t* buffer); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // IREE_HAL_DRIVERS_CUDA_BUFFER_H_ diff --git a/experimental/cuda2/memory_pools.c b/experimental/cuda2/memory_pools.c new file mode 100644 index 000000000000..56b50c82bffd --- /dev/null +++ b/experimental/cuda2/memory_pools.c @@ -0,0 +1,273 @@ +// Copyright 2023 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree/hal/drivers/cuda/memory_pools.h" + +#include "iree/base/tracing.h" +#include "iree/hal/drivers/cuda/cuda_buffer.h" +#include "iree/hal/drivers/cuda/status_util.h" + +// NOTE: these are currently global for all devices; we could make +// device-specific ones by malloc() and leaking (with LSAN note) unique string +// values instead. +#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_ALLOCATION_TRACKING +static const char* IREE_HAL_CUDA_DEVICE_LOCAL_POOL_RESERVED_ID = + "CUDA pool: device-local reserved"; +static const char* IREE_HAL_CUDA_OTHER_POOL_RESERVED_ID = + "CUDA pool: other reserved"; +#endif // IREE_TRACING_FEATURE_ALLOCATION_TRACKING + +static iree_status_t iree_hal_cuda_create_memory_pool( + iree_hal_cuda_context_wrapper_t* context, + iree_hal_cuda_memory_pool_params_t params, + CUmemoryPool* IREE_RESTRICT out_pool) { + *out_pool = NULL; + + CUmemPoolProps pool_props = { + .allocType = CU_MEM_ALLOCATION_TYPE_PINNED, + // TODO: allow sharing of certain pool memory types by fd/HANDLE. 
+ .handleTypes = CU_MEM_HANDLE_TYPE_NONE, + .location = + { + .type = CU_MEM_LOCATION_TYPE_DEVICE, + .id = context->cu_device, + }, + .win32SecurityAttributes = NULL, + .reserved = {0}, + }; + + CUmemoryPool pool = NULL; + CUDA_RETURN_IF_ERROR(context->syms, cuMemPoolCreate(&pool, &pool_props), + "cuMemPoolCreate"); + + iree_status_t status = CU_RESULT_TO_STATUS( + context->syms, + cuMemPoolSetAttribute(pool, CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, + ¶ms.release_threshold), + "cuMemPoolSetAttribute"); + + if (iree_status_is_ok(status)) { + *out_pool = pool; + } else { + CUDA_IGNORE_ERROR(context->syms, cuMemPoolDestroy(pool)); + } + return status; +} + +iree_status_t iree_hal_cuda_memory_pools_initialize( + iree_hal_cuda_context_wrapper_t* context, + const iree_hal_cuda_memory_pooling_params_t* pooling_params, + iree_hal_cuda_memory_pools_t* IREE_RESTRICT out_pools) { + IREE_ASSERT_ARGUMENT(context); + IREE_ASSERT_ARGUMENT(pooling_params); + IREE_ASSERT_ARGUMENT(out_pools); + IREE_TRACE_ZONE_BEGIN(z0); + + memset(out_pools, 0, sizeof(*out_pools)); + out_pools->context = context; + + iree_status_t status = iree_ok_status(); + + if (iree_status_is_ok(status)) { + status = iree_hal_cuda_create_memory_pool( + context, pooling_params->device_local, &out_pools->device_local); + } + + if (iree_status_is_ok(status)) { + status = iree_hal_cuda_create_memory_pool(context, pooling_params->other, + &out_pools->other); + } + + IREE_TRACE_ZONE_END(z0); + return status; +} + +void iree_hal_cuda_memory_pools_deinitialize( + iree_hal_cuda_memory_pools_t* pools) { + IREE_TRACE_ZONE_BEGIN(z0); + + if (pools->device_local) { + CUDA_IGNORE_ERROR(pools->context->syms, + cuMemPoolDestroy(pools->device_local)); + pools->device_local = NULL; + } + + if (pools->other) { + CUDA_IGNORE_ERROR(pools->context->syms, cuMemPoolDestroy(pools->other)); + pools->other = NULL; + } + + IREE_TRACE_ZONE_END(z0); +} + +static void iree_hal_cuda_memory_pool_track_alloc( + iree_hal_cuda_memory_pools_t* pools, iree_hal_buffer_t* buffer) { + bool is_device_local = iree_all_bits_set(iree_hal_buffer_memory_type(buffer), + IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL); + (void)is_device_local; + iree_device_size_t allocation_size = iree_hal_buffer_allocation_size(buffer); + (void)allocation_size; + IREE_TRACE_ALLOC_NAMED( + is_device_local ? IREE_HAL_CUDA_DEVICE_LOCAL_POOL_RESERVED_ID + : IREE_HAL_CUDA_OTHER_POOL_RESERVED_ID, + (void*)iree_hal_cuda_buffer_device_pointer(buffer), allocation_size); + IREE_STATISTICS({ + iree_atomic_int64_t* bytes_allocated = + is_device_local ? &pools->statistics.device_bytes_allocated + : &pools->statistics.host_bytes_allocated; + iree_atomic_fetch_add_int64(bytes_allocated, allocation_size, + iree_memory_order_relaxed); + }); +} + +static void iree_hal_cuda_memory_pool_track_free( + iree_hal_cuda_memory_pools_t* pools, iree_hal_buffer_t* buffer) { + bool is_device_local = iree_all_bits_set(iree_hal_buffer_memory_type(buffer), + IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL); + (void)is_device_local; + IREE_TRACE_FREE_NAMED(is_device_local + ? IREE_HAL_CUDA_DEVICE_LOCAL_POOL_RESERVED_ID + : IREE_HAL_CUDA_OTHER_POOL_RESERVED_ID, + (void*)iree_hal_cuda_buffer_device_pointer(buffer)); + IREE_STATISTICS({ + iree_atomic_int64_t* bytes_freed = + is_device_local ? 
&pools->statistics.device_bytes_freed + : &pools->statistics.host_bytes_freed; + iree_device_size_t allocation_size = + iree_hal_buffer_allocation_size(buffer); + iree_atomic_fetch_add_int64(bytes_freed, allocation_size, + iree_memory_order_relaxed); + }); +} + +void iree_hal_cuda_memory_pools_merge_statistics( + iree_hal_cuda_memory_pools_t* pools, + iree_hal_allocator_statistics_t* statistics) { + IREE_STATISTICS({ + statistics->device_bytes_allocated = iree_atomic_load_int64( + &pools->statistics.device_bytes_allocated, iree_memory_order_relaxed); + statistics->host_bytes_allocated = iree_atomic_load_int64( + &pools->statistics.host_bytes_allocated, iree_memory_order_relaxed); + statistics->device_bytes_freed = iree_atomic_load_int64( + &pools->statistics.device_bytes_freed, iree_memory_order_relaxed); + statistics->host_bytes_freed = iree_atomic_load_int64( + &pools->statistics.host_bytes_freed, iree_memory_order_relaxed); + if (pools->device_local) { + cuuint64_t pool_peak = 0; + CUDA_IGNORE_ERROR( + pools->context->syms, + cuMemPoolGetAttribute(pools->device_local, + CU_MEMPOOL_ATTR_USED_MEM_HIGH, &pool_peak)); + statistics->device_bytes_peak += (iree_device_size_t)pool_peak; + } + if (pools->other) { + cuuint64_t pool_peak = 0; + CUDA_IGNORE_ERROR( + pools->context->syms, + cuMemPoolGetAttribute(pools->other, CU_MEMPOOL_ATTR_USED_MEM_HIGH, + &pool_peak)); + statistics->host_bytes_peak += (iree_device_size_t)pool_peak; + } + }); +} + +// NOTE: this is only issued if the buffer is destroyed without having had been +// scheduled for deallocation asynchronously. When a buffer is scheduled we drop +// the release callback so that this isn't called and we don't double-free. +static void iree_hal_cuda_async_buffer_release_callback( + void* user_data, iree_hal_buffer_t* buffer) { + iree_hal_cuda_memory_pools_t* pools = + (iree_hal_cuda_memory_pools_t*)user_data; + IREE_TRACE_ZONE_BEGIN(z0); + + CUdeviceptr device_ptr = iree_hal_cuda_buffer_device_pointer(buffer); + CUDA_IGNORE_ERROR(pools->context->syms, cuMemFree(device_ptr)); + iree_hal_cuda_memory_pool_track_free(pools, buffer); + + IREE_TRACE_ZONE_END(z0); +} + +iree_status_t iree_hal_cuda_memory_pools_alloca( + iree_hal_cuda_memory_pools_t* pools, CUstream stream, + iree_hal_allocator_pool_t pool, iree_hal_buffer_params_t params, + iree_device_size_t allocation_size, + iree_hal_buffer_t** IREE_RESTRICT out_buffer) { + IREE_TRACE_ZONE_BEGIN(z0); + IREE_TRACE_ZONE_APPEND_VALUE(z0, (int64_t)allocation_size); + + iree_hal_buffer_params_canonicalize(¶ms); + + // TODO: more pools and better selection; this is coarsely deciding between + // only device local (variables, constants, transients) and other (staging, + // external) but could use more buffer properties (including usage/export + // flags) to better isolate the different usage patterns and keep the pools + // operating with reasonable limits. We should be using the |pool| arg. + CUmemoryPool memory_pool = + iree_all_bits_set(params.type, IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL) + ? pools->device_local + : pools->other; + + CUdeviceptr device_ptr = 0; + iree_status_t status = CU_RESULT_TO_STATUS( + pools->context->syms, + cuMemAllocFromPoolAsync(&device_ptr, (size_t)allocation_size, memory_pool, + stream), + "cuMemAllocFromPoolAsync"); + + // Wrap the allocated CUDA buffer in a HAL buffer. + // NOTE: we don't provide a device allocator because we didn't allocate from + // one and instead we use a release callback to perform the free if the user + // doesn't dealloca the buffer. 
+ iree_hal_buffer_t* buffer = NULL; + if (iree_status_is_ok(status)) { + iree_hal_buffer_release_callback_t release_callback = { + .fn = iree_hal_cuda_async_buffer_release_callback, + .user_data = pools, + }; + status = iree_hal_cuda_buffer_wrap( + /*device_allocator=*/NULL, params.type, params.access, params.usage, + allocation_size, /*byte_offset=*/0, + /*byte_length=*/allocation_size, IREE_HAL_CUDA_BUFFER_TYPE_ASYNC, + device_ptr, /*host_ptr=*/NULL, release_callback, + pools->context->host_allocator, &buffer); + } + + if (iree_status_is_ok(status)) { + // Update statistics (note that it may not yet be accurate). + iree_hal_cuda_memory_pool_track_alloc(pools, buffer); + *out_buffer = buffer; + } else if (buffer) { + iree_hal_buffer_release(buffer); + } else { + CUDA_IGNORE_ERROR(pools->context->syms, cuMemFreeAsync(device_ptr, stream)); + } + + IREE_TRACE_ZONE_END(z0); + return status; +} + +iree_status_t iree_hal_cuda_memory_pools_dealloca( + iree_hal_cuda_memory_pools_t* pools, CUstream stream, + iree_hal_buffer_t* buffer) { + IREE_TRACE_ZONE_BEGIN(z0); + IREE_TRACE_ZONE_APPEND_VALUE( + z0, (int64_t)iree_hal_buffer_allocation_size(buffer)); + + // Try to schedule the buffer for freeing. + CUdeviceptr device_ptr = iree_hal_cuda_buffer_device_pointer(buffer); + iree_status_t status = + CU_RESULT_TO_STATUS(pools->context->syms, + cuMemFreeAsync(device_ptr, stream), "cuMemFreeAsync"); + + // Drop the release callback so that we don't try to double-free the buffer. + iree_hal_cuda_buffer_drop_release_callback(buffer); + + // Update statistics (note that it may not yet be accurate). + iree_hal_cuda_memory_pool_track_free(pools, buffer); + + IREE_TRACE_ZONE_END(z0); + return status; +} diff --git a/experimental/cuda2/memory_pools.h b/experimental/cuda2/memory_pools.h new file mode 100644 index 000000000000..328a7b95151b --- /dev/null +++ b/experimental/cuda2/memory_pools.h @@ -0,0 +1,71 @@ +// Copyright 2023 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef IREE_HAL_DRIVERS_CUDA_MEMORY_POOLS_H_ +#define IREE_HAL_DRIVERS_CUDA_MEMORY_POOLS_H_ + +#include "iree/base/api.h" +#include "iree/base/internal/atomics.h" +#include "iree/hal/api.h" +#include "iree/hal/drivers/cuda/api.h" +#include "iree/hal/drivers/cuda/context_wrapper.h" +#include "iree/hal/drivers/cuda/cuda_headers.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// Retained CUDA memory pools for various allocation types. +typedef struct iree_hal_cuda_memory_pools_t { + // CUDA context the pools are attached to. + iree_hal_cuda_context_wrapper_t* context; + // Used exclusively for DEVICE_LOCAL allocations. + CUmemoryPool device_local; + // Used for any host-visible/host-local memory types. + CUmemoryPool other; + + IREE_STATISTICS(struct { + iree_atomic_int64_t device_bytes_allocated; + iree_atomic_int64_t device_bytes_freed; + iree_atomic_int64_t host_bytes_allocated; + iree_atomic_int64_t host_bytes_freed; + } statistics;) +} iree_hal_cuda_memory_pools_t; + +// Initializes |out_pools| by configuring new CUDA memory pools. +iree_status_t iree_hal_cuda_memory_pools_initialize( + iree_hal_cuda_context_wrapper_t* context, + const iree_hal_cuda_memory_pooling_params_t* pooling_params, + iree_hal_cuda_memory_pools_t* IREE_RESTRICT out_pools); + +// Deinitializes the |pools| and releases the underlying CUDA resources. 
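+// Expected lifecycle (illustrative sketch with placeholder variable names):
+//   iree_hal_cuda_memory_pools_t pools;
+//   IREE_RETURN_IF_ERROR(iree_hal_cuda_memory_pools_initialize(
+//       context, &pooling_params, &pools));
+//   ... iree_hal_cuda_memory_pools_alloca/_dealloca on a CUstream ...
+//   iree_hal_cuda_memory_pools_deinitialize(&pools);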
+void iree_hal_cuda_memory_pools_deinitialize( + iree_hal_cuda_memory_pools_t* pools); + +// Merges statistics information from |pools| into |statistics|. +void iree_hal_cuda_memory_pools_merge_statistics( + iree_hal_cuda_memory_pools_t* pools, + iree_hal_allocator_statistics_t* statistics); + +// Asynchronously allocates a buffer from an appropriate pool. +// The allocation will be stream-ordered on |stream|. +iree_status_t iree_hal_cuda_memory_pools_alloca( + iree_hal_cuda_memory_pools_t* pools, CUstream stream, + iree_hal_allocator_pool_t pool, iree_hal_buffer_params_t params, + iree_device_size_t allocation_size, + iree_hal_buffer_t** IREE_RESTRICT out_buffer); + +// Asynchronously deallocates a buffer from its pool. +// The deallocation will be stream-ordered on |stream|. +iree_status_t iree_hal_cuda_memory_pools_dealloca( + iree_hal_cuda_memory_pools_t* pools, CUstream stream, + iree_hal_buffer_t* buffer); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // IREE_HAL_DRIVERS_CUDA_MEMORY_POOLS_H_ From 1148f632485bea8f5263a0875624e6337e6e2f79 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Wed, 7 Jun 2023 11:14:35 -0700 Subject: [PATCH 2/3] [cuda] Port over allocator and buffer implementation This commit ports over existing CUDA driver allocator and buffer implementation. The main logic is kept as-is, with one noticeable changes--context wrapper is dropped and fields in it are directly put in various API calls. This is to make supporting multiple device and stream easier later. Other changes are just polishing on comments and errors. --- experimental/cuda2/CMakeLists.txt | 6 + experimental/cuda2/api.h | 24 ++ experimental/cuda2/cuda_allocator.c | 259 ++++++++++-------- experimental/cuda2/cuda_allocator.h | 28 +- experimental/cuda2/cuda_buffer.c | 88 +++--- experimental/cuda2/cuda_buffer.h | 46 ++-- .../cuda2/cuda_dynamic_symbol_table.h | 12 + experimental/cuda2/memory_pools.c | 131 ++++----- experimental/cuda2/memory_pools.h | 40 +-- 9 files changed, 358 insertions(+), 276 deletions(-) diff --git a/experimental/cuda2/CMakeLists.txt b/experimental/cuda2/CMakeLists.txt index c104bcde8f4e..ca1959516917 100644 --- a/experimental/cuda2/CMakeLists.txt +++ b/experimental/cuda2/CMakeLists.txt @@ -17,7 +17,13 @@ iree_cc_library( "api.h" SRCS "api.h" + "cuda_allocator.c" + "cuda_allocator.h" + "cuda_buffer.c" + "cuda_buffer.h" "cuda_driver.c" + "memory_pools.c" + "memory_pools.h" DEPS ::dynamic_symbols iree::base diff --git a/experimental/cuda2/api.h b/experimental/cuda2/api.h index 565fd50ca972..a56b656b2b7c 100644 --- a/experimental/cuda2/api.h +++ b/experimental/cuda2/api.h @@ -16,6 +16,30 @@ extern "C" { #endif // __cplusplus +//===----------------------------------------------------------------------===// +// iree_hal_cuda_device_t +//===----------------------------------------------------------------------===// + +// Parameters defining a CUmemoryPool. +typedef struct iree_hal_cuda2_memory_pool_params_t { + // Minimum number of bytes to keep in the pool when trimming with + // iree_hal_device_trim. + uint64_t minimum_capacity; + // Soft maximum number of bytes to keep in the pool. + // When more than this is allocated the extra will be freed at the next + // device synchronization in order to remain under the threshold. + uint64_t release_threshold; + // TODO: per-device access permissions array. +} iree_hal_cuda2_memory_pool_params_t; + +// Parameters for each CUmemoryPool used for queue-ordered allocations. 
+typedef struct iree_hal_cuda2_memory_pooling_params_t { + // Used exclusively for DEVICE_LOCAL allocations. + iree_hal_cuda2_memory_pool_params_t device_local; + // Used for any host-visible/host-local memory types. + iree_hal_cuda2_memory_pool_params_t other; +} iree_hal_cuda2_memory_pooling_params_t; + //===----------------------------------------------------------------------===// // iree_hal_cuda2_driver_t //===----------------------------------------------------------------------===// diff --git a/experimental/cuda2/cuda_allocator.c b/experimental/cuda2/cuda_allocator.c index eba64090819d..64ad197f0b80 100644 --- a/experimental/cuda2/cuda_allocator.c +++ b/experimental/cuda2/cuda_allocator.c @@ -1,50 +1,66 @@ -// Copyright 2021 The IREE Authors +// Copyright 2023 The IREE Authors // // Licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include "iree/hal/drivers/cuda/cuda_allocator.h" +#include "experimental/cuda2/cuda_allocator.h" #include +#include "experimental/cuda2/cuda_buffer.h" +#include "experimental/cuda2/cuda_dynamic_symbols.h" +#include "experimental/cuda2/cuda_status_util.h" #include "iree/base/api.h" #include "iree/base/tracing.h" -#include "iree/hal/drivers/cuda/cuda_buffer.h" -#include "iree/hal/drivers/cuda/dynamic_symbols.h" -#include "iree/hal/drivers/cuda/status_util.h" #if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_ALLOCATION_TRACKING -static const char* IREE_HAL_CUDA_ALLOCATOR_ID = "CUDA unpooled"; +static const char* IREE_HAL_CUDA_ALLOCATOR_ID = "CUDA2 unpooled"; #endif // IREE_TRACING_FEATURE_ALLOCATION_TRACKING -typedef struct iree_hal_cuda_allocator_t { +typedef struct iree_hal_cuda2_allocator_t { + // Abstract resource used for injecting reference counting and vtable; + // must be at offset 0. iree_hal_resource_t resource; + + // The device that this allocator allocates memory from. iree_hal_device_t* base_device; - iree_hal_cuda_context_wrapper_t* context; CUdevice device; + + // The CUDA stream that allocations should be used in. CUstream stream; - iree_hal_cuda_memory_pools_t* pools; + + iree_hal_cuda2_memory_pools_t* pools; + + const iree_hal_cuda2_dynamic_symbols_t* symbols; + + iree_allocator_t host_allocator; + + // Whether the GPU and CPU can concurrently access CUDA managed data in a + // coherent way. We would need to explicitly perform flushing and invalidation + // between GPU and CPU if not. 
bool supports_concurrent_managed_access; IREE_STATISTICS(iree_hal_allocator_statistics_t statistics;) -} iree_hal_cuda_allocator_t; +} iree_hal_cuda2_allocator_t; -static const iree_hal_allocator_vtable_t iree_hal_cuda_allocator_vtable; +static const iree_hal_allocator_vtable_t iree_hal_cuda2_allocator_vtable; -static iree_hal_cuda_allocator_t* iree_hal_cuda_allocator_cast( +static iree_hal_cuda2_allocator_t* iree_hal_cuda2_allocator_cast( iree_hal_allocator_t* base_value) { - IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda_allocator_vtable); - return (iree_hal_cuda_allocator_t*)base_value; + IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda2_allocator_vtable); + return (iree_hal_cuda2_allocator_t*)base_value; } -iree_status_t iree_hal_cuda_allocator_create( - iree_hal_device_t* base_device, iree_hal_cuda_context_wrapper_t* context, - CUdevice device, CUstream stream, iree_hal_cuda_memory_pools_t* pools, - iree_hal_allocator_t** out_allocator) { +iree_status_t iree_hal_cuda2_allocator_create( + iree_hal_device_t* base_device, + const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols, CUdevice device, + CUstream stream, iree_hal_cuda2_memory_pools_t* pools, + iree_allocator_t host_allocator, iree_hal_allocator_t** out_allocator) { IREE_ASSERT_ARGUMENT(base_device); - IREE_ASSERT_ARGUMENT(context); + IREE_ASSERT_ARGUMENT(cuda_symbols); IREE_ASSERT_ARGUMENT(pools); + IREE_ASSERT_ARGUMENT(out_allocator); IREE_TRACE_ZONE_BEGIN(z0); // To support device-local + host-visible memory we need concurrent managed @@ -55,8 +71,8 @@ iree_status_t iree_hal_cuda_allocator_create( // buffers except for readback staging buffers. int supports_concurrent_managed_access = 0; IREE_RETURN_AND_END_ZONE_IF_ERROR( - z0, CU_RESULT_TO_STATUS( - context->syms, + z0, IREE_CURESULT_TO_STATUS( + cuda_symbols, cuDeviceGetAttribute( &supports_concurrent_managed_access, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, device), @@ -68,17 +84,18 @@ iree_status_t iree_hal_cuda_allocator_create( : "no CONCURRENT_MANAGED_ACCESS (expect slow accesses on " "device-local + host-visible memory)"); - iree_hal_cuda_allocator_t* allocator = NULL; + iree_hal_cuda2_allocator_t* allocator = NULL; iree_status_t status = iree_allocator_malloc( - context->host_allocator, sizeof(*allocator), (void**)&allocator); + host_allocator, sizeof(*allocator), (void**)&allocator); if (iree_status_is_ok(status)) { - iree_hal_resource_initialize(&iree_hal_cuda_allocator_vtable, + iree_hal_resource_initialize(&iree_hal_cuda2_allocator_vtable, &allocator->resource); allocator->base_device = base_device; - allocator->context = context; allocator->device = device; allocator->stream = stream; allocator->pools = pools; + allocator->symbols = cuda_symbols; + allocator->host_allocator = host_allocator; allocator->supports_concurrent_managed_access = supports_concurrent_managed_access != 0; *out_allocator = (iree_hal_allocator_t*)allocator; @@ -88,49 +105,53 @@ iree_status_t iree_hal_cuda_allocator_create( return status; } -static void iree_hal_cuda_allocator_destroy( +static void iree_hal_cuda2_allocator_destroy( iree_hal_allocator_t* IREE_RESTRICT base_allocator) { - iree_hal_cuda_allocator_t* allocator = - iree_hal_cuda_allocator_cast(base_allocator); - iree_allocator_t host_allocator = allocator->context->host_allocator; + IREE_ASSERT_ARGUMENT(base_allocator); + iree_hal_cuda2_allocator_t* allocator = + iree_hal_cuda2_allocator_cast(base_allocator); IREE_TRACE_ZONE_BEGIN(z0); - iree_allocator_free(host_allocator, allocator); + 
iree_allocator_free(allocator->host_allocator, allocator); IREE_TRACE_ZONE_END(z0); } -static iree_allocator_t iree_hal_cuda_allocator_host_allocator( +static iree_allocator_t iree_hal_cuda2_allocator_host_allocator( const iree_hal_allocator_t* IREE_RESTRICT base_allocator) { - iree_hal_cuda_allocator_t* allocator = - (iree_hal_cuda_allocator_t*)base_allocator; - return allocator->context->host_allocator; + iree_hal_cuda2_allocator_t* allocator = + (iree_hal_cuda2_allocator_t*)base_allocator; + return allocator->host_allocator; } -static iree_status_t iree_hal_cuda_allocator_trim( +static iree_status_t iree_hal_cuda2_allocator_trim( iree_hal_allocator_t* IREE_RESTRICT base_allocator) { return iree_ok_status(); } -static void iree_hal_cuda_allocator_query_statistics( +static void iree_hal_cuda2_allocator_query_statistics( iree_hal_allocator_t* IREE_RESTRICT base_allocator, iree_hal_allocator_statistics_t* IREE_RESTRICT out_statistics) { IREE_STATISTICS({ - iree_hal_cuda_allocator_t* allocator = - iree_hal_cuda_allocator_cast(base_allocator); + iree_hal_cuda2_allocator_t* allocator = + iree_hal_cuda2_allocator_cast(base_allocator); memcpy(out_statistics, &allocator->statistics, sizeof(*out_statistics)); - iree_hal_cuda_memory_pools_merge_statistics(allocator->pools, - out_statistics); + iree_hal_cuda2_memory_pools_merge_statistics(allocator->pools, + out_statistics); }); } -static iree_status_t iree_hal_cuda_allocator_query_memory_heaps( +static iree_status_t iree_hal_cuda2_allocator_query_memory_heaps( iree_hal_allocator_t* IREE_RESTRICT base_allocator, iree_host_size_t capacity, iree_hal_allocator_memory_heap_t* IREE_RESTRICT heaps, iree_host_size_t* IREE_RESTRICT out_count) { - iree_hal_cuda_allocator_t* allocator = - iree_hal_cuda_allocator_cast(base_allocator); + IREE_ASSERT_ARGUMENT(base_allocator); + IREE_ASSERT_ARGUMENT(heaps); + IREE_ASSERT_ARGUMENT(out_count); + + iree_hal_cuda2_allocator_t* allocator = + iree_hal_cuda2_allocator_cast(base_allocator); // TODO(benvanik): check CU_DEVICE_ATTRIBUTE_INTEGRATED and return a unified // set of heaps (likely still a cached and uncached, at minimum). @@ -205,12 +226,12 @@ static iree_status_t iree_hal_cuda_allocator_query_memory_heaps( } static iree_hal_buffer_compatibility_t -iree_hal_cuda_allocator_query_buffer_compatibility( +iree_hal_cuda2_allocator_query_buffer_compatibility( iree_hal_allocator_t* IREE_RESTRICT base_allocator, iree_hal_buffer_params_t* IREE_RESTRICT params, iree_device_size_t* IREE_RESTRICT allocation_size) { - iree_hal_cuda_allocator_t* allocator = - iree_hal_cuda_allocator_cast(base_allocator); + iree_hal_cuda2_allocator_t* allocator = + iree_hal_cuda2_allocator_cast(base_allocator); // All buffers can be allocated on the heap. 
iree_hal_buffer_compatibility_t compatibility = @@ -260,24 +281,25 @@ iree_hal_cuda_allocator_query_buffer_compatibility( return compatibility; } -static void iree_hal_cuda_buffer_free(iree_hal_cuda_context_wrapper_t* context, - iree_hal_cuda_buffer_type_t buffer_type, - CUdeviceptr device_ptr, void* host_ptr) { +static void iree_hal_cuda2_buffer_free( + const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols, + iree_hal_cuda2_buffer_type_t buffer_type, CUdeviceptr device_ptr, + void* host_ptr) { IREE_TRACE_ZONE_BEGIN(z0); switch (buffer_type) { case IREE_HAL_CUDA_BUFFER_TYPE_DEVICE: { IREE_TRACE_ZONE_APPEND_TEXT(z0, "cuMemFree"); - CUDA_IGNORE_ERROR(context->syms, cuMemFree(device_ptr)); + IREE_CUDA_IGNORE_ERROR(cuda_symbols, cuMemFree(device_ptr)); break; } case IREE_HAL_CUDA_BUFFER_TYPE_HOST: { IREE_TRACE_ZONE_APPEND_TEXT(z0, "cuMemFreeHost"); - CUDA_IGNORE_ERROR(context->syms, cuMemFreeHost(host_ptr)); + IREE_CUDA_IGNORE_ERROR(cuda_symbols, cuMemFreeHost(host_ptr)); break; } case IREE_HAL_CUDA_BUFFER_TYPE_HOST_REGISTERED: { IREE_TRACE_ZONE_APPEND_TEXT(z0, "cuMemHostUnregister"); - CUDA_IGNORE_ERROR(context->syms, cuMemHostUnregister(host_ptr)); + IREE_CUDA_IGNORE_ERROR(cuda_symbols, cuMemHostUnregister(host_ptr)); break; } case IREE_HAL_CUDA_BUFFER_TYPE_ASYNC: { @@ -288,18 +310,21 @@ static void iree_hal_cuda_buffer_free(iree_hal_cuda_context_wrapper_t* context, IREE_TRACE_ZONE_END(z0); } -static iree_status_t iree_hal_cuda_allocator_allocate_buffer( +static iree_status_t iree_hal_cuda2_allocator_allocate_buffer( iree_hal_allocator_t* IREE_RESTRICT base_allocator, const iree_hal_buffer_params_t* IREE_RESTRICT params, iree_device_size_t allocation_size, iree_const_byte_span_t initial_data, iree_hal_buffer_t** IREE_RESTRICT out_buffer) { - iree_hal_cuda_allocator_t* allocator = - iree_hal_cuda_allocator_cast(base_allocator); + IREE_ASSERT_ARGUMENT(base_allocator); + IREE_ASSERT_ARGUMENT(params); + IREE_ASSERT_ARGUMENT(out_buffer); + iree_hal_cuda2_allocator_t* allocator = + iree_hal_cuda2_allocator_cast(base_allocator); // Coerce options into those required by the current device. iree_hal_buffer_params_t compat_params = *params; iree_hal_buffer_compatibility_t compatibility = - iree_hal_cuda_allocator_query_buffer_compatibility( + iree_hal_cuda2_allocator_query_buffer_compatibility( base_allocator, &compat_params, &allocation_size); if (!iree_all_bits_set(compatibility, IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE)) { @@ -325,26 +350,25 @@ static iree_status_t iree_hal_cuda_allocator_allocate_buffer( } iree_status_t status = iree_ok_status(); - iree_hal_cuda_buffer_type_t buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE; + iree_hal_cuda2_buffer_type_t buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE; void* host_ptr = NULL; CUdeviceptr device_ptr = 0; - IREE_TRACE_ZONE_BEGIN_NAMED(z0, "iree_hal_cuda_buffer_allocate"); + IREE_TRACE_ZONE_BEGIN_NAMED(z0, "iree_hal_cuda2_buffer_allocate"); IREE_TRACE_ZONE_APPEND_VALUE(z0, allocation_size); if (iree_all_bits_set(compat_params.type, IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL)) { - // Device local case. if (iree_all_bits_set(compat_params.type, IREE_HAL_MEMORY_TYPE_HOST_VISIBLE)) { + // Device local + host visible. 
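// A minimal standalone sketch of the device-local + host-visible path below,
// assuming a CUDA context is current on the calling thread and linking the
// driver API directly rather than going through the dynamic symbol table; the
// helper name is hypothetical and not part of the patch.
#include <cuda.h>
#include <stdbool.h>
#include <stddef.h>

// Allocates managed (unified) memory and, when the device supports concurrent
// managed access, prefetches it to |device| on |stream| so first touch happens
// on the GPU instead of faulting pages over from the host.
static CUresult allocate_managed_device_memory(CUdevice device, CUstream stream,
                                               size_t size,
                                               bool concurrent_managed_access,
                                               CUdeviceptr* out_ptr) {
  CUdeviceptr ptr = 0;
  CUresult result = cuMemAllocManaged(&ptr, size, CU_MEM_ATTACH_GLOBAL);
  if (result != CUDA_SUCCESS) return result;
  if (concurrent_managed_access) {
    result = cuMemPrefetchAsync(ptr, size, device, stream);
    if (result != CUDA_SUCCESS) {
      cuMemFree(ptr);
      return result;
    }
  }
  *out_ptr = ptr;
  return CUDA_SUCCESS;
}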
buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE; - status = - CU_RESULT_TO_STATUS(allocator->context->syms, - cuMemAllocManaged(&device_ptr, allocation_size, + status = IREE_CURESULT_TO_STATUS( + allocator->symbols, cuMemAllocManaged(&device_ptr, allocation_size, CU_MEM_ATTACH_GLOBAL)); if (iree_status_is_ok(status) && allocator->supports_concurrent_managed_access) { - // Prefetch the buffer on the GPU device. - status = CU_RESULT_TO_STATUS( - allocator->context->syms, + // Prefetch the buffer to the GPU stream. + status = IREE_CURESULT_TO_STATUS( + allocator->symbols, cuMemPrefetchAsync(device_ptr, allocation_size, allocator->device, allocator->stream)); } @@ -352,22 +376,22 @@ static iree_status_t iree_hal_cuda_allocator_allocate_buffer( } else { // Device only. buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE; - status = CU_RESULT_TO_STATUS(allocator->context->syms, - cuMemAlloc(&device_ptr, allocation_size)); + status = IREE_CURESULT_TO_STATUS( + allocator->symbols, cuMemAlloc(&device_ptr, allocation_size)); } } else { + // Host local cases. buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_HOST; unsigned int flags = CU_MEMHOSTALLOC_DEVICEMAP; if (!iree_all_bits_set(compat_params.type, IREE_HAL_MEMORY_TYPE_HOST_CACHED)) { flags |= CU_MEMHOSTALLOC_WRITECOMBINED; } - status = - CU_RESULT_TO_STATUS(allocator->context->syms, - cuMemHostAlloc(&host_ptr, allocation_size, flags)); + status = IREE_CURESULT_TO_STATUS( + allocator->symbols, cuMemHostAlloc(&host_ptr, allocation_size, flags)); if (iree_status_is_ok(status)) { - status = CU_RESULT_TO_STATUS( - allocator->context->syms, + status = IREE_CURESULT_TO_STATUS( + allocator->symbols, cuMemHostGetDevicePointer(&device_ptr, host_ptr, /*flags=*/0)); } } @@ -375,7 +399,7 @@ static iree_status_t iree_hal_cuda_allocator_allocate_buffer( iree_hal_buffer_t* buffer = NULL; if (iree_status_is_ok(status)) { - status = iree_hal_cuda_buffer_wrap( + status = iree_hal_cuda2_buffer_wrap( base_allocator, compat_params.type, compat_params.access, compat_params.usage, allocation_size, /*byte_offset=*/0, @@ -398,15 +422,15 @@ static iree_status_t iree_hal_cuda_allocator_allocate_buffer( if (iree_status_is_ok(status)) { IREE_TRACE_ALLOC_NAMED(IREE_HAL_CUDA_ALLOCATOR_ID, - (void*)iree_hal_cuda_buffer_device_pointer(buffer), + (void*)iree_hal_cuda2_buffer_device_pointer(buffer), allocation_size); IREE_STATISTICS(iree_hal_allocator_statistics_record_alloc( &allocator->statistics, compat_params.type, allocation_size)); *out_buffer = buffer; } else { if (!buffer) { - iree_hal_cuda_buffer_free(allocator->context, buffer_type, device_ptr, - host_ptr); + iree_hal_cuda2_buffer_free(allocator->symbols, buffer_type, device_ptr, + host_ptr); } else { iree_hal_buffer_release(buffer); } @@ -414,14 +438,16 @@ static iree_status_t iree_hal_cuda_allocator_allocate_buffer( return status; } -static void iree_hal_cuda_allocator_deallocate_buffer( +static void iree_hal_cuda2_allocator_deallocate_buffer( iree_hal_allocator_t* IREE_RESTRICT base_allocator, iree_hal_buffer_t* IREE_RESTRICT base_buffer) { - iree_hal_cuda_allocator_t* allocator = - iree_hal_cuda_allocator_cast(base_allocator); + IREE_ASSERT_ARGUMENT(base_allocator); + IREE_ASSERT_ARGUMENT(base_buffer); + iree_hal_cuda2_allocator_t* allocator = + iree_hal_cuda2_allocator_cast(base_allocator); - const iree_hal_cuda_buffer_type_t buffer_type = - iree_hal_cuda_buffer_type(base_buffer); + const iree_hal_cuda2_buffer_type_t buffer_type = + iree_hal_cuda2_buffer_type(base_buffer); // WARNING: we may be called from a random thread and need 
to ensure that we // have an active CUDA context. Unfortunately CUDA is CUDA and trying to @@ -437,16 +463,16 @@ static void iree_hal_cuda_allocator_deallocate_buffer( // to silently ignore them: whatever the user tries to do next will fail in // the same way and if we were deallocating this buffer as part of a tear-down // on failure we don't want to end up dying during cleanup. - iree_hal_cuda_buffer_free(allocator->context, buffer_type, - iree_hal_cuda_buffer_device_pointer(base_buffer), - iree_hal_cuda_buffer_host_pointer(base_buffer)); + iree_hal_cuda2_buffer_free(allocator->symbols, buffer_type, + iree_hal_cuda2_buffer_device_pointer(base_buffer), + iree_hal_cuda2_buffer_host_pointer(base_buffer)); switch (buffer_type) { case IREE_HAL_CUDA_BUFFER_TYPE_DEVICE: case IREE_HAL_CUDA_BUFFER_TYPE_HOST: { IREE_TRACE_FREE_NAMED( IREE_HAL_CUDA_ALLOCATOR_ID, - (void*)iree_hal_cuda_buffer_device_pointer(base_buffer)); + (void*)iree_hal_cuda2_buffer_device_pointer(base_buffer)); IREE_STATISTICS(iree_hal_allocator_statistics_record_free( &allocator->statistics, iree_hal_buffer_memory_type(base_buffer), iree_hal_buffer_allocation_size(base_buffer))); @@ -460,20 +486,24 @@ static void iree_hal_cuda_allocator_deallocate_buffer( iree_hal_buffer_destroy(base_buffer); } -static iree_status_t iree_hal_cuda_allocator_import_buffer( +static iree_status_t iree_hal_cuda2_allocator_import_buffer( iree_hal_allocator_t* IREE_RESTRICT base_allocator, const iree_hal_buffer_params_t* IREE_RESTRICT params, iree_hal_external_buffer_t* IREE_RESTRICT external_buffer, iree_hal_buffer_release_callback_t release_callback, iree_hal_buffer_t** IREE_RESTRICT out_buffer) { - iree_hal_cuda_allocator_t* allocator = - iree_hal_cuda_allocator_cast(base_allocator); + IREE_ASSERT_ARGUMENT(base_allocator); + IREE_ASSERT_ARGUMENT(params); + IREE_ASSERT_ARGUMENT(external_buffer); + IREE_ASSERT_ARGUMENT(out_buffer); + iree_hal_cuda2_allocator_t* allocator = + iree_hal_cuda2_allocator_cast(base_allocator); // Coerce options into those required by the current device. 
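// A companion sketch for the host-local branch of the allocation path above:
// page-locked host memory that the device can map, optionally write-combined
// when host caching is not requested. Assumes a current CUDA context and
// direct driver API linking; the helper name is hypothetical.
#include <cuda.h>
#include <stdbool.h>
#include <stddef.h>

static CUresult allocate_mapped_host_memory(size_t size, bool host_cached,
                                            void** out_host_ptr,
                                            CUdeviceptr* out_device_ptr) {
  unsigned int flags = CU_MEMHOSTALLOC_DEVICEMAP;
  if (!host_cached) flags |= CU_MEMHOSTALLOC_WRITECOMBINED;
  void* host_ptr = NULL;
  CUresult result = cuMemHostAlloc(&host_ptr, size, flags);
  if (result != CUDA_SUCCESS) return result;
  // Resolve the device-side alias of the pinned host range.
  CUdeviceptr device_ptr = 0;
  result = cuMemHostGetDevicePointer(&device_ptr, host_ptr, /*Flags=*/0);
  if (result != CUDA_SUCCESS) {
    cuMemFreeHost(host_ptr);
    return result;
  }
  *out_host_ptr = host_ptr;
  *out_device_ptr = device_ptr;
  return CUDA_SUCCESS;
}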
iree_hal_buffer_params_t compat_params = *params; iree_device_size_t allocation_size = external_buffer->size; iree_hal_buffer_compatibility_t compatibility = - iree_hal_cuda_allocator_query_buffer_compatibility( + iree_hal_cuda2_allocator_query_buffer_compatibility( base_allocator, &compat_params, &allocation_size); if (!iree_all_bits_set(compatibility, IREE_HAL_BUFFER_COMPATIBILITY_IMPORTABLE)) { @@ -499,7 +529,7 @@ static iree_status_t iree_hal_cuda_allocator_import_buffer( } iree_status_t status = iree_ok_status(); - iree_hal_cuda_buffer_type_t buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE; + iree_hal_cuda2_buffer_type_t buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE; void* host_ptr = NULL; CUdeviceptr device_ptr = 0; @@ -524,13 +554,13 @@ static iree_status_t iree_hal_cuda_allocator_import_buffer( IREE_HAL_BUFFER_USAGE_DISPATCH_IMAGE)) { register_flags = CU_MEMHOSTREGISTER_DEVICEMAP; } - status = CU_RESULT_TO_STATUS( - allocator->context->syms, + status = IREE_CURESULT_TO_STATUS( + allocator->symbols, cuMemHostRegister(host_ptr, external_buffer->size, register_flags), "cuMemHostRegister"); if (iree_status_is_ok(status)) { - status = CU_RESULT_TO_STATUS( - allocator->context->syms, + status = IREE_CURESULT_TO_STATUS( + allocator->symbols, cuMemHostGetDevicePointer(&device_ptr, host_ptr, 0), "cuMemHostGetDevicePointer"); } @@ -539,18 +569,17 @@ static iree_status_t iree_hal_cuda_allocator_import_buffer( case IREE_HAL_EXTERNAL_BUFFER_TYPE_OPAQUE_FD: case IREE_HAL_EXTERNAL_BUFFER_TYPE_OPAQUE_WIN32: return iree_make_status(IREE_STATUS_UNIMPLEMENTED, - "handle-based imports not yet implemented"); + "unimplmented handle-based imports"); default: return iree_make_status(IREE_STATUS_UNIMPLEMENTED, - "external buffer type not supported"); + "unimplmented external buffer type"); } iree_hal_buffer_t* buffer = NULL; if (iree_status_is_ok(status)) { - status = iree_hal_cuda_buffer_wrap( + status = iree_hal_cuda2_buffer_wrap( base_allocator, compat_params.type, compat_params.access, - compat_params.usage, external_buffer->size, - /*byte_offset=*/0, + compat_params.usage, external_buffer->size, /*byte_offset=*/0, /*byte_length=*/external_buffer->size, buffer_type, device_ptr, host_ptr, release_callback, iree_hal_allocator_host_allocator(base_allocator), &buffer); @@ -560,8 +589,8 @@ static iree_status_t iree_hal_cuda_allocator_import_buffer( *out_buffer = buffer; } else { if (!buffer) { - iree_hal_cuda_buffer_free(allocator->context, buffer_type, device_ptr, - host_ptr); + iree_hal_cuda2_buffer_free(allocator->symbols, buffer_type, device_ptr, + host_ptr); } else { iree_hal_buffer_release(buffer); } @@ -569,26 +598,26 @@ static iree_status_t iree_hal_cuda_allocator_import_buffer( return status; } -static iree_status_t iree_hal_cuda_allocator_export_buffer( +static iree_status_t iree_hal_cuda2_allocator_export_buffer( iree_hal_allocator_t* IREE_RESTRICT base_allocator, iree_hal_buffer_t* IREE_RESTRICT buffer, iree_hal_external_buffer_type_t requested_type, iree_hal_external_buffer_flags_t requested_flags, iree_hal_external_buffer_t* IREE_RESTRICT out_external_buffer) { - return iree_make_status(IREE_STATUS_UNAVAILABLE, - "exporting to external buffers not supported"); + return iree_make_status(IREE_STATUS_UNIMPLEMENTED, + "unimplemented exporting to external buffers"); } -static const iree_hal_allocator_vtable_t iree_hal_cuda_allocator_vtable = { - .destroy = iree_hal_cuda_allocator_destroy, - .host_allocator = iree_hal_cuda_allocator_host_allocator, - .trim = iree_hal_cuda_allocator_trim, - 
.query_statistics = iree_hal_cuda_allocator_query_statistics, - .query_memory_heaps = iree_hal_cuda_allocator_query_memory_heaps, +static const iree_hal_allocator_vtable_t iree_hal_cuda2_allocator_vtable = { + .destroy = iree_hal_cuda2_allocator_destroy, + .host_allocator = iree_hal_cuda2_allocator_host_allocator, + .trim = iree_hal_cuda2_allocator_trim, + .query_statistics = iree_hal_cuda2_allocator_query_statistics, + .query_memory_heaps = iree_hal_cuda2_allocator_query_memory_heaps, .query_buffer_compatibility = - iree_hal_cuda_allocator_query_buffer_compatibility, - .allocate_buffer = iree_hal_cuda_allocator_allocate_buffer, - .deallocate_buffer = iree_hal_cuda_allocator_deallocate_buffer, - .import_buffer = iree_hal_cuda_allocator_import_buffer, - .export_buffer = iree_hal_cuda_allocator_export_buffer, + iree_hal_cuda2_allocator_query_buffer_compatibility, + .allocate_buffer = iree_hal_cuda2_allocator_allocate_buffer, + .deallocate_buffer = iree_hal_cuda2_allocator_deallocate_buffer, + .import_buffer = iree_hal_cuda2_allocator_import_buffer, + .export_buffer = iree_hal_cuda2_allocator_export_buffer, }; diff --git a/experimental/cuda2/cuda_allocator.h b/experimental/cuda2/cuda_allocator.h index 0df31ee00c77..0f3134359caf 100644 --- a/experimental/cuda2/cuda_allocator.h +++ b/experimental/cuda2/cuda_allocator.h @@ -1,33 +1,31 @@ -// Copyright 2021 The IREE Authors +// Copyright 2023 The IREE Authors // // Licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#ifndef IREE_HAL_DRIVERS_CUDA_ALLOCATOR_H_ -#define IREE_HAL_DRIVERS_CUDA_ALLOCATOR_H_ +#ifndef EXPERIMENTAL_CUDA2_CUDA_ALLOCATOR_H_ +#define EXPERIMENTAL_CUDA2_CUDA_ALLOCATOR_H_ +#include "experimental/cuda2/cuda_dynamic_symbols.h" +#include "experimental/cuda2/memory_pools.h" #include "iree/base/api.h" #include "iree/hal/api.h" -#include "iree/hal/drivers/cuda/context_wrapper.h" -#include "iree/hal/drivers/cuda/memory_pools.h" -#include "iree/hal/drivers/cuda/status_util.h" #ifdef __cplusplus extern "C" { #endif // __cplusplus -// Creates a CUDA memory allocator. -// |device| and |stream| will be used for management operations. -// |pools| provides memory pools that may be shared across multiple allocators -// and the pointer must remain valid for the lifetime of the allocator. -iree_status_t iree_hal_cuda_allocator_create( - iree_hal_device_t* base_device, iree_hal_cuda_context_wrapper_t* context, - CUdevice device, CUstream stream, iree_hal_cuda_memory_pools_t* pools, - iree_hal_allocator_t** out_allocator); +// Create a CUDA allocator that allocates device memory from the given +// |device| and used in the given |stream|. +iree_status_t iree_hal_cuda2_allocator_create( + iree_hal_device_t* base_device, + const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols, CUdevice device, + CUstream stream, iree_hal_cuda2_memory_pools_t* pools, + iree_allocator_t host_allocator, iree_hal_allocator_t** out_allocator); #ifdef __cplusplus } // extern "C" #endif // __cplusplus -#endif // IREE_HAL_DRIVERS_CUDA_ALLOCATOR_H_ +#endif // EXPERIMENTAL_CUDA2_CUDA_ALLOCATOR_H_ diff --git a/experimental/cuda2/cuda_buffer.c b/experimental/cuda2/cuda_buffer.c index 87c41475b597..e88c9e3b3de3 100644 --- a/experimental/cuda2/cuda_buffer.c +++ b/experimental/cuda2/cuda_buffer.c @@ -1,10 +1,10 @@ -// Copyright 2021 The IREE Authors +// Copyright 2023 The IREE Authors // // Licensed under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include "iree/hal/drivers/cuda/cuda_buffer.h" +#include "experimental/cuda2/cuda_buffer.h" #include #include @@ -13,47 +13,47 @@ #include "iree/base/api.h" #include "iree/base/tracing.h" -typedef struct iree_hal_cuda_buffer_t { +typedef struct iree_hal_cuda2_buffer_t { iree_hal_buffer_t base; - iree_hal_cuda_buffer_type_t type; + iree_hal_cuda2_buffer_type_t type; void* host_ptr; CUdeviceptr device_ptr; iree_hal_buffer_release_callback_t release_callback; -} iree_hal_cuda_buffer_t; +} iree_hal_cuda2_buffer_t; -static const iree_hal_buffer_vtable_t iree_hal_cuda_buffer_vtable; +static const iree_hal_buffer_vtable_t iree_hal_cuda2_buffer_vtable; -static iree_hal_cuda_buffer_t* iree_hal_cuda_buffer_cast( +static iree_hal_cuda2_buffer_t* iree_hal_cuda2_buffer_cast( iree_hal_buffer_t* base_value) { - IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda_buffer_vtable); - return (iree_hal_cuda_buffer_t*)base_value; + IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda2_buffer_vtable); + return (iree_hal_cuda2_buffer_t*)base_value; } -static const iree_hal_cuda_buffer_t* iree_hal_cuda_buffer_const_cast( +static const iree_hal_cuda2_buffer_t* iree_hal_cuda2_buffer_const_cast( const iree_hal_buffer_t* base_value) { - IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda_buffer_vtable); - return (const iree_hal_cuda_buffer_t*)base_value; + IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda2_buffer_vtable); + return (const iree_hal_cuda2_buffer_t*)base_value; } -iree_status_t iree_hal_cuda_buffer_wrap( +iree_status_t iree_hal_cuda2_buffer_wrap( iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type, iree_hal_memory_access_t allowed_access, iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size, iree_device_size_t byte_offset, iree_device_size_t byte_length, - iree_hal_cuda_buffer_type_t buffer_type, CUdeviceptr device_ptr, + iree_hal_cuda2_buffer_type_t buffer_type, CUdeviceptr device_ptr, void* host_ptr, iree_hal_buffer_release_callback_t release_callback, iree_allocator_t host_allocator, iree_hal_buffer_t** out_buffer) { IREE_ASSERT_ARGUMENT(out_buffer); IREE_TRACE_ZONE_BEGIN(z0); - iree_hal_cuda_buffer_t* buffer = NULL; + iree_hal_cuda2_buffer_t* buffer = NULL; iree_status_t status = iree_allocator_malloc(host_allocator, sizeof(*buffer), (void**)&buffer); if (iree_status_is_ok(status)) { iree_hal_buffer_initialize(host_allocator, allocator, &buffer->base, allocation_size, byte_offset, byte_length, memory_type, allowed_access, allowed_usage, - &iree_hal_cuda_buffer_vtable, &buffer->base); + &iree_hal_cuda2_buffer_vtable, &buffer->base); buffer->type = buffer_type; buffer->host_ptr = host_ptr; buffer->device_ptr = device_ptr; @@ -65,8 +65,8 @@ iree_status_t iree_hal_cuda_buffer_wrap( return status; } -static void iree_hal_cuda_buffer_destroy(iree_hal_buffer_t* base_buffer) { - iree_hal_cuda_buffer_t* buffer = iree_hal_cuda_buffer_cast(base_buffer); +static void iree_hal_cuda2_buffer_destroy(iree_hal_buffer_t* base_buffer) { + iree_hal_cuda2_buffer_t* buffer = iree_hal_cuda2_buffer_cast(base_buffer); iree_allocator_t host_allocator = base_buffer->host_allocator; IREE_TRACE_ZONE_BEGIN(z0); if (buffer->release_callback.fn) { @@ -77,12 +77,14 @@ static void iree_hal_cuda_buffer_destroy(iree_hal_buffer_t* base_buffer) { IREE_TRACE_ZONE_END(z0); } -static iree_status_t iree_hal_cuda_buffer_map_range( +static iree_status_t iree_hal_cuda2_buffer_map_range( iree_hal_buffer_t* 
base_buffer, iree_hal_mapping_mode_t mapping_mode, iree_hal_memory_access_t memory_access, iree_device_size_t local_byte_offset, iree_device_size_t local_byte_length, iree_hal_buffer_mapping_t* mapping) { - iree_hal_cuda_buffer_t* buffer = iree_hal_cuda_buffer_cast(base_buffer); + IREE_ASSERT_ARGUMENT(base_buffer); + IREE_ASSERT_ARGUMENT(mapping); + iree_hal_cuda2_buffer_t* buffer = iree_hal_cuda2_buffer_cast(base_buffer); // TODO(benvanik): add upload/download for unmapped buffers. IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_memory_type( @@ -107,58 +109,58 @@ static iree_status_t iree_hal_cuda_buffer_map_range( return iree_ok_status(); } -static iree_status_t iree_hal_cuda_buffer_unmap_range( +static iree_status_t iree_hal_cuda2_buffer_unmap_range( iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset, iree_device_size_t local_byte_length, iree_hal_buffer_mapping_t* mapping) { - // Nothing to do (today). + // Nothing to do today. return iree_ok_status(); } -static iree_status_t iree_hal_cuda_buffer_invalidate_range( +static iree_status_t iree_hal_cuda2_buffer_invalidate_range( iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset, iree_device_size_t local_byte_length) { - // Nothing to do. + // Nothing to do today. return iree_ok_status(); } -static iree_status_t iree_hal_cuda_buffer_flush_range( +static iree_status_t iree_hal_cuda2_buffer_flush_range( iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset, iree_device_size_t local_byte_length) { - // Nothing to do. + // Nothing to do today. return iree_ok_status(); } -iree_hal_cuda_buffer_type_t iree_hal_cuda_buffer_type( +iree_hal_cuda2_buffer_type_t iree_hal_cuda2_buffer_type( const iree_hal_buffer_t* base_buffer) { - const iree_hal_cuda_buffer_t* buffer = - iree_hal_cuda_buffer_const_cast(base_buffer); + const iree_hal_cuda2_buffer_t* buffer = + iree_hal_cuda2_buffer_const_cast(base_buffer); return buffer->type; } -CUdeviceptr iree_hal_cuda_buffer_device_pointer( +CUdeviceptr iree_hal_cuda2_buffer_device_pointer( const iree_hal_buffer_t* base_buffer) { - const iree_hal_cuda_buffer_t* buffer = - iree_hal_cuda_buffer_const_cast(base_buffer); + const iree_hal_cuda2_buffer_t* buffer = + iree_hal_cuda2_buffer_const_cast(base_buffer); return buffer->device_ptr; } -void* iree_hal_cuda_buffer_host_pointer(const iree_hal_buffer_t* base_buffer) { - const iree_hal_cuda_buffer_t* buffer = - iree_hal_cuda_buffer_const_cast(base_buffer); +void* iree_hal_cuda2_buffer_host_pointer(const iree_hal_buffer_t* base_buffer) { + const iree_hal_cuda2_buffer_t* buffer = + iree_hal_cuda2_buffer_const_cast(base_buffer); return buffer->host_ptr; } -void iree_hal_cuda_buffer_drop_release_callback( +void iree_hal_cuda2_buffer_drop_release_callback( iree_hal_buffer_t* base_buffer) { - iree_hal_cuda_buffer_t* buffer = iree_hal_cuda_buffer_cast(base_buffer); + iree_hal_cuda2_buffer_t* buffer = iree_hal_cuda2_buffer_cast(base_buffer); buffer->release_callback = iree_hal_buffer_release_callback_null(); } -static const iree_hal_buffer_vtable_t iree_hal_cuda_buffer_vtable = { +static const iree_hal_buffer_vtable_t iree_hal_cuda2_buffer_vtable = { .recycle = iree_hal_buffer_recycle, - .destroy = iree_hal_cuda_buffer_destroy, - .map_range = iree_hal_cuda_buffer_map_range, - .unmap_range = iree_hal_cuda_buffer_unmap_range, - .invalidate_range = iree_hal_cuda_buffer_invalidate_range, - .flush_range = iree_hal_cuda_buffer_flush_range, + .destroy = iree_hal_cuda2_buffer_destroy, + .map_range = iree_hal_cuda2_buffer_map_range, + 
.unmap_range = iree_hal_cuda2_buffer_unmap_range, + .invalidate_range = iree_hal_cuda2_buffer_invalidate_range, + .flush_range = iree_hal_cuda2_buffer_flush_range, }; diff --git a/experimental/cuda2/cuda_buffer.h b/experimental/cuda2/cuda_buffer.h index 640d6f52dcfb..23cade3c57ad 100644 --- a/experimental/cuda2/cuda_buffer.h +++ b/experimental/cuda2/cuda_buffer.h @@ -1,62 +1,66 @@ -// Copyright 2021 The IREE Authors +// Copyright 2023 The IREE Authors // // Licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#ifndef IREE_HAL_DRIVERS_CUDA_BUFFER_H_ -#define IREE_HAL_DRIVERS_CUDA_BUFFER_H_ +#ifndef EXPERIMENTAL_CUDA2_CUDA_BUFFER_H_ +#define EXPERIMENTAL_CUDA2_CUDA_BUFFER_H_ +#include "experimental/cuda2/cuda_headers.h" #include "iree/base/api.h" #include "iree/hal/api.h" -#include "iree/hal/drivers/cuda/cuda_headers.h" #ifdef __cplusplus extern "C" { #endif // __cplusplus -typedef enum iree_hal_cuda_buffer_type_e { - // cuMemAlloc/cuMemAllocManaged + cuMemFree +typedef enum iree_hal_cuda2_buffer_type_e { + // Device local buffer; allocated with cuMemAlloc/cuMemAllocManaged, freed + // with cuMemFree. IREE_HAL_CUDA_BUFFER_TYPE_DEVICE = 0, - // cuMemHostAlloc + cuMemFreeHost + // Host local buffer; allocated with cuMemHostAlloc, freed with cuMemFreeHost. IREE_HAL_CUDA_BUFFER_TYPE_HOST, - // cuMemHostRegister + cuMemHostUnregister + // Host local buffer; registered with cuMemHostRegister, freed with + // cuMemHostUnregister. IREE_HAL_CUDA_BUFFER_TYPE_HOST_REGISTERED, - // cuMemAllocFromPoolAsync + cuMemFree/cuMemFreeAsync + // Device local buffer, allocated with cuMemAllocFromPoolAsync, freed with + // cuMemFree/cuMemFreeAsync. IREE_HAL_CUDA_BUFFER_TYPE_ASYNC, -} iree_hal_cuda_buffer_type_t; +} iree_hal_cuda2_buffer_type_t; // Wraps a CUDA allocation in an iree_hal_buffer_t. -iree_status_t iree_hal_cuda_buffer_wrap( +iree_status_t iree_hal_cuda2_buffer_wrap( iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type, iree_hal_memory_access_t allowed_access, iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size, iree_device_size_t byte_offset, iree_device_size_t byte_length, - iree_hal_cuda_buffer_type_t buffer_type, CUdeviceptr device_ptr, + iree_hal_cuda2_buffer_type_t buffer_type, CUdeviceptr device_ptr, void* host_ptr, iree_hal_buffer_release_callback_t release_callback, iree_allocator_t host_allocator, iree_hal_buffer_t** out_buffer); -// Returns the underlying CUDA buffer type. -iree_hal_cuda_buffer_type_t iree_hal_cuda_buffer_type( +// Returns the underlying CUDA buffer type of the given |buffer|. +iree_hal_cuda2_buffer_type_t iree_hal_cuda2_buffer_type( const iree_hal_buffer_t* buffer); -// Returns the CUDA base pointer for the given |buffer|. -// This is the entire allocated_buffer and must be offset by the buffer -// byte_offset and byte_length when used. -CUdeviceptr iree_hal_cuda_buffer_device_pointer( +// Returns the CUDA base device pointer for the given |buffer|. +// +// Note that this is the entire allocated_buffer and must be offset by the +// buffer byte_offset and byte_length when used. +CUdeviceptr iree_hal_cuda2_buffer_device_pointer( const iree_hal_buffer_t* buffer); // Returns the CUDA host pointer for the given |buffer|, if available. 
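// A minimal sketch of the IREE_HAL_CUDA_BUFFER_TYPE_HOST_REGISTERED lifecycle
// described above: existing host memory is registered so the device can
// address it, then unregistered when the buffer is released. Assumes a current
// CUDA context and direct driver API linking; the helper name is hypothetical.
#include <cuda.h>
#include <stddef.h>

static CUresult register_host_memory_round_trip(void* host_ptr, size_t size) {
  CUresult result =
      cuMemHostRegister(host_ptr, size, CU_MEMHOSTREGISTER_DEVICEMAP);
  if (result != CUDA_SUCCESS) return result;
  // Resolve the device-side address aliasing the registered host range.
  CUdeviceptr device_ptr = 0;
  result = cuMemHostGetDevicePointer(&device_ptr, host_ptr, /*Flags=*/0);
  // ... |device_ptr| may now be used by kernels and copies ...
  cuMemHostUnregister(host_ptr);
  return result;
}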
-void* iree_hal_cuda_buffer_host_pointer(const iree_hal_buffer_t* buffer); +void* iree_hal_cuda2_buffer_host_pointer(const iree_hal_buffer_t* buffer); // Drops the release callback so that when the buffer is destroyed no callback // will be made. This is not thread safe but all callers are expected to be // holding an allocation and the earliest the buffer could be destroyed is after // this call returns and the caller has released its reference. -void iree_hal_cuda_buffer_drop_release_callback(iree_hal_buffer_t* buffer); +void iree_hal_cuda2_buffer_drop_release_callback(iree_hal_buffer_t* buffer); #ifdef __cplusplus } // extern "C" #endif // __cplusplus -#endif // IREE_HAL_DRIVERS_CUDA_BUFFER_H_ +#endif // EXPERIMENTAL_CUDA2_CUDA_BUFFER_H_ diff --git a/experimental/cuda2/cuda_dynamic_symbol_table.h b/experimental/cuda2/cuda_dynamic_symbol_table.h index b4aaa93fc750..fb8ff5a8ecd8 100644 --- a/experimental/cuda2/cuda_dynamic_symbol_table.h +++ b/experimental/cuda2/cuda_dynamic_symbol_table.h @@ -49,6 +49,18 @@ IREE_CU_PFN_DECL(cuMemHostAlloc, void**, size_t, unsigned int) IREE_CU_PFN_DECL(cuMemHostRegister, void*, size_t, unsigned int) IREE_CU_PFN_DECL(cuMemHostUnregister, void*) IREE_CU_PFN_DECL(cuMemHostGetDevicePointer, CUdeviceptr*, void*, unsigned int) +IREE_CU_PFN_DECL(cuMemPoolCreate, CUmemoryPool*, const CUmemPoolProps*) +IREE_CU_PFN_DECL(cuMemPoolDestroy, CUmemoryPool) +IREE_CU_PFN_DECL(cuMemPoolSetAccess, CUmemoryPool, const CUmemAccessDesc*, + size_t) +IREE_CU_PFN_DECL(cuMemPoolGetAttribute, CUmemoryPool, CUmemPool_attribute, + void*) +IREE_CU_PFN_DECL(cuMemPoolSetAttribute, CUmemoryPool, CUmemPool_attribute, + void*) +IREE_CU_PFN_DECL(cuMemPoolTrimTo, CUmemoryPool, size_t) +IREE_CU_PFN_DECL(cuMemAllocFromPoolAsync, CUdeviceptr*, size_t, CUmemoryPool, + CUstream) +IREE_CU_PFN_DECL(cuMemFreeAsync, CUdeviceptr dptr, CUstream hStream) IREE_CU_PFN_DECL(cuModuleGetFunction, CUfunction*, CUmodule, const char*) IREE_CU_PFN_DECL(cuModuleLoadDataEx, CUmodule*, const void*, unsigned int, CUjit_option*, void**) diff --git a/experimental/cuda2/memory_pools.c b/experimental/cuda2/memory_pools.c index 56b50c82bffd..e29c5121c51a 100644 --- a/experimental/cuda2/memory_pools.c +++ b/experimental/cuda2/memory_pools.c @@ -4,11 +4,12 @@ // See https://llvm.org/LICENSE.txt for license information. 
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include "iree/hal/drivers/cuda/memory_pools.h" +#include "experimental/cuda2/memory_pools.h" +#include "experimental/cuda2/cuda_buffer.h" +#include "experimental/cuda2/cuda_dynamic_symbols.h" +#include "experimental/cuda2/cuda_status_util.h" #include "iree/base/tracing.h" -#include "iree/hal/drivers/cuda/cuda_buffer.h" -#include "iree/hal/drivers/cuda/status_util.h" // NOTE: these are currently global for all devices; we could make // device-specific ones by malloc() and leaking (with LSAN note) unique string @@ -20,9 +21,9 @@ static const char* IREE_HAL_CUDA_OTHER_POOL_RESERVED_ID = "CUDA pool: other reserved"; #endif // IREE_TRACING_FEATURE_ALLOCATION_TRACKING -static iree_status_t iree_hal_cuda_create_memory_pool( - iree_hal_cuda_context_wrapper_t* context, - iree_hal_cuda_memory_pool_params_t params, +static iree_status_t iree_hal_cuda2_create_memory_pool( + const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols, CUdevice cu_device, + iree_hal_cuda2_memory_pool_params_t params, CUmemoryPool* IREE_RESTRICT out_pool) { *out_pool = NULL; @@ -33,18 +34,18 @@ static iree_status_t iree_hal_cuda_create_memory_pool( .location = { .type = CU_MEM_LOCATION_TYPE_DEVICE, - .id = context->cu_device, + .id = cu_device, }, .win32SecurityAttributes = NULL, .reserved = {0}, }; CUmemoryPool pool = NULL; - CUDA_RETURN_IF_ERROR(context->syms, cuMemPoolCreate(&pool, &pool_props), - "cuMemPoolCreate"); + IREE_CUDA_RETURN_IF_ERROR(cuda_symbols, cuMemPoolCreate(&pool, &pool_props), + "cuMemPoolCreate"); - iree_status_t status = CU_RESULT_TO_STATUS( - context->syms, + iree_status_t status = IREE_CURESULT_TO_STATUS( + cuda_symbols, cuMemPoolSetAttribute(pool, CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, ¶ms.release_threshold), "cuMemPoolSetAttribute"); @@ -52,59 +53,62 @@ static iree_status_t iree_hal_cuda_create_memory_pool( if (iree_status_is_ok(status)) { *out_pool = pool; } else { - CUDA_IGNORE_ERROR(context->syms, cuMemPoolDestroy(pool)); + IREE_CUDA_IGNORE_ERROR(cuda_symbols, cuMemPoolDestroy(pool)); } return status; } -iree_status_t iree_hal_cuda_memory_pools_initialize( - iree_hal_cuda_context_wrapper_t* context, - const iree_hal_cuda_memory_pooling_params_t* pooling_params, - iree_hal_cuda_memory_pools_t* IREE_RESTRICT out_pools) { - IREE_ASSERT_ARGUMENT(context); +iree_status_t iree_hal_cuda2_memory_pools_initialize( + iree_allocator_t host_allocator, + const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols, CUdevice cu_device, + const iree_hal_cuda2_memory_pooling_params_t* pooling_params, + iree_hal_cuda2_memory_pools_t* IREE_RESTRICT out_pools) { + IREE_ASSERT_ARGUMENT(cuda_symbols); IREE_ASSERT_ARGUMENT(pooling_params); IREE_ASSERT_ARGUMENT(out_pools); IREE_TRACE_ZONE_BEGIN(z0); memset(out_pools, 0, sizeof(*out_pools)); - out_pools->context = context; + out_pools->cuda_symbols = cuda_symbols; + out_pools->host_allocator = host_allocator; iree_status_t status = iree_ok_status(); if (iree_status_is_ok(status)) { - status = iree_hal_cuda_create_memory_pool( - context, pooling_params->device_local, &out_pools->device_local); + status = iree_hal_cuda2_create_memory_pool(cuda_symbols, cu_device, + pooling_params->device_local, + &out_pools->device_local); } if (iree_status_is_ok(status)) { - status = iree_hal_cuda_create_memory_pool(context, pooling_params->other, - &out_pools->other); + status = iree_hal_cuda2_create_memory_pool( + cuda_symbols, cu_device, pooling_params->other, &out_pools->other); } IREE_TRACE_ZONE_END(z0); return status; } -void 
iree_hal_cuda_memory_pools_deinitialize( - iree_hal_cuda_memory_pools_t* pools) { +void iree_hal_cuda2_memory_pools_deinitialize( + iree_hal_cuda2_memory_pools_t* pools) { IREE_TRACE_ZONE_BEGIN(z0); if (pools->device_local) { - CUDA_IGNORE_ERROR(pools->context->syms, - cuMemPoolDestroy(pools->device_local)); + IREE_CUDA_IGNORE_ERROR(pools->cuda_symbols, + cuMemPoolDestroy(pools->device_local)); pools->device_local = NULL; } if (pools->other) { - CUDA_IGNORE_ERROR(pools->context->syms, cuMemPoolDestroy(pools->other)); + IREE_CUDA_IGNORE_ERROR(pools->cuda_symbols, cuMemPoolDestroy(pools->other)); pools->other = NULL; } IREE_TRACE_ZONE_END(z0); } -static void iree_hal_cuda_memory_pool_track_alloc( - iree_hal_cuda_memory_pools_t* pools, iree_hal_buffer_t* buffer) { +static void iree_hal_cuda2_memory_pool_track_alloc( + iree_hal_cuda2_memory_pools_t* pools, iree_hal_buffer_t* buffer) { bool is_device_local = iree_all_bits_set(iree_hal_buffer_memory_type(buffer), IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL); (void)is_device_local; @@ -113,7 +117,7 @@ static void iree_hal_cuda_memory_pool_track_alloc( IREE_TRACE_ALLOC_NAMED( is_device_local ? IREE_HAL_CUDA_DEVICE_LOCAL_POOL_RESERVED_ID : IREE_HAL_CUDA_OTHER_POOL_RESERVED_ID, - (void*)iree_hal_cuda_buffer_device_pointer(buffer), allocation_size); + (void*)iree_hal_cuda2_buffer_device_pointer(buffer), allocation_size); IREE_STATISTICS({ iree_atomic_int64_t* bytes_allocated = is_device_local ? &pools->statistics.device_bytes_allocated @@ -123,15 +127,15 @@ static void iree_hal_cuda_memory_pool_track_alloc( }); } -static void iree_hal_cuda_memory_pool_track_free( - iree_hal_cuda_memory_pools_t* pools, iree_hal_buffer_t* buffer) { +static void iree_hal_cuda2_memory_pool_track_free( + iree_hal_cuda2_memory_pools_t* pools, iree_hal_buffer_t* buffer) { bool is_device_local = iree_all_bits_set(iree_hal_buffer_memory_type(buffer), IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL); (void)is_device_local; IREE_TRACE_FREE_NAMED(is_device_local ? IREE_HAL_CUDA_DEVICE_LOCAL_POOL_RESERVED_ID : IREE_HAL_CUDA_OTHER_POOL_RESERVED_ID, - (void*)iree_hal_cuda_buffer_device_pointer(buffer)); + (void*)iree_hal_cuda2_buffer_device_pointer(buffer)); IREE_STATISTICS({ iree_atomic_int64_t* bytes_freed = is_device_local ? 
&pools->statistics.device_bytes_freed @@ -143,8 +147,8 @@ static void iree_hal_cuda_memory_pool_track_free( }); } -void iree_hal_cuda_memory_pools_merge_statistics( - iree_hal_cuda_memory_pools_t* pools, +void iree_hal_cuda2_memory_pools_merge_statistics( + iree_hal_cuda2_memory_pools_t* pools, iree_hal_allocator_statistics_t* statistics) { IREE_STATISTICS({ statistics->device_bytes_allocated = iree_atomic_load_int64( @@ -157,16 +161,16 @@ void iree_hal_cuda_memory_pools_merge_statistics( &pools->statistics.host_bytes_freed, iree_memory_order_relaxed); if (pools->device_local) { cuuint64_t pool_peak = 0; - CUDA_IGNORE_ERROR( - pools->context->syms, + IREE_CUDA_IGNORE_ERROR( + pools->cuda_symbols, cuMemPoolGetAttribute(pools->device_local, CU_MEMPOOL_ATTR_USED_MEM_HIGH, &pool_peak)); statistics->device_bytes_peak += (iree_device_size_t)pool_peak; } if (pools->other) { cuuint64_t pool_peak = 0; - CUDA_IGNORE_ERROR( - pools->context->syms, + IREE_CUDA_IGNORE_ERROR( + pools->cuda_symbols, cuMemPoolGetAttribute(pools->other, CU_MEMPOOL_ATTR_USED_MEM_HIGH, &pool_peak)); statistics->host_bytes_peak += (iree_device_size_t)pool_peak; @@ -177,21 +181,21 @@ void iree_hal_cuda_memory_pools_merge_statistics( // NOTE: this is only issued if the buffer is destroyed without having had been // scheduled for deallocation asynchronously. When a buffer is scheduled we drop // the release callback so that this isn't called and we don't double-free. -static void iree_hal_cuda_async_buffer_release_callback( +static void iree_hal_cuda2_async_buffer_release_callback( void* user_data, iree_hal_buffer_t* buffer) { - iree_hal_cuda_memory_pools_t* pools = - (iree_hal_cuda_memory_pools_t*)user_data; + iree_hal_cuda2_memory_pools_t* pools = + (iree_hal_cuda2_memory_pools_t*)user_data; IREE_TRACE_ZONE_BEGIN(z0); - CUdeviceptr device_ptr = iree_hal_cuda_buffer_device_pointer(buffer); - CUDA_IGNORE_ERROR(pools->context->syms, cuMemFree(device_ptr)); - iree_hal_cuda_memory_pool_track_free(pools, buffer); + CUdeviceptr device_ptr = iree_hal_cuda2_buffer_device_pointer(buffer); + IREE_CUDA_IGNORE_ERROR(pools->cuda_symbols, cuMemFree(device_ptr)); + iree_hal_cuda2_memory_pool_track_free(pools, buffer); IREE_TRACE_ZONE_END(z0); } -iree_status_t iree_hal_cuda_memory_pools_alloca( - iree_hal_cuda_memory_pools_t* pools, CUstream stream, +iree_status_t iree_hal_cuda2_memory_pools_alloca( + iree_hal_cuda2_memory_pools_t* pools, CUstream stream, iree_hal_allocator_pool_t pool, iree_hal_buffer_params_t params, iree_device_size_t allocation_size, iree_hal_buffer_t** IREE_RESTRICT out_buffer) { @@ -211,8 +215,8 @@ iree_status_t iree_hal_cuda_memory_pools_alloca( : pools->other; CUdeviceptr device_ptr = 0; - iree_status_t status = CU_RESULT_TO_STATUS( - pools->context->syms, + iree_status_t status = IREE_CURESULT_TO_STATUS( + pools->cuda_symbols, cuMemAllocFromPoolAsync(&device_ptr, (size_t)allocation_size, memory_pool, stream), "cuMemAllocFromPoolAsync"); @@ -224,49 +228,50 @@ iree_status_t iree_hal_cuda_memory_pools_alloca( iree_hal_buffer_t* buffer = NULL; if (iree_status_is_ok(status)) { iree_hal_buffer_release_callback_t release_callback = { - .fn = iree_hal_cuda_async_buffer_release_callback, + .fn = iree_hal_cuda2_async_buffer_release_callback, .user_data = pools, }; - status = iree_hal_cuda_buffer_wrap( + status = iree_hal_cuda2_buffer_wrap( /*device_allocator=*/NULL, params.type, params.access, params.usage, allocation_size, /*byte_offset=*/0, /*byte_length=*/allocation_size, IREE_HAL_CUDA_BUFFER_TYPE_ASYNC, - device_ptr, 
/*host_ptr=*/NULL, release_callback, - pools->context->host_allocator, &buffer); + device_ptr, /*host_ptr=*/NULL, release_callback, pools->host_allocator, + &buffer); } if (iree_status_is_ok(status)) { // Update statistics (note that it may not yet be accurate). - iree_hal_cuda_memory_pool_track_alloc(pools, buffer); + iree_hal_cuda2_memory_pool_track_alloc(pools, buffer); *out_buffer = buffer; } else if (buffer) { iree_hal_buffer_release(buffer); } else { - CUDA_IGNORE_ERROR(pools->context->syms, cuMemFreeAsync(device_ptr, stream)); + IREE_CUDA_IGNORE_ERROR(pools->cuda_symbols, + cuMemFreeAsync(device_ptr, stream)); } IREE_TRACE_ZONE_END(z0); return status; } -iree_status_t iree_hal_cuda_memory_pools_dealloca( - iree_hal_cuda_memory_pools_t* pools, CUstream stream, +iree_status_t iree_hal_cuda2_memory_pools_dealloca( + iree_hal_cuda2_memory_pools_t* pools, CUstream stream, iree_hal_buffer_t* buffer) { IREE_TRACE_ZONE_BEGIN(z0); IREE_TRACE_ZONE_APPEND_VALUE( z0, (int64_t)iree_hal_buffer_allocation_size(buffer)); // Try to schedule the buffer for freeing. - CUdeviceptr device_ptr = iree_hal_cuda_buffer_device_pointer(buffer); - iree_status_t status = - CU_RESULT_TO_STATUS(pools->context->syms, - cuMemFreeAsync(device_ptr, stream), "cuMemFreeAsync"); + CUdeviceptr device_ptr = iree_hal_cuda2_buffer_device_pointer(buffer); + iree_status_t status = IREE_CURESULT_TO_STATUS( + pools->cuda_symbols, cuMemFreeAsync(device_ptr, stream), + "cuMemFreeAsync"); // Drop the release callback so that we don't try to double-free the buffer. - iree_hal_cuda_buffer_drop_release_callback(buffer); + iree_hal_cuda2_buffer_drop_release_callback(buffer); // Update statistics (note that it may not yet be accurate). - iree_hal_cuda_memory_pool_track_free(pools, buffer); + iree_hal_cuda2_memory_pool_track_free(pools, buffer); IREE_TRACE_ZONE_END(z0); return status; diff --git a/experimental/cuda2/memory_pools.h b/experimental/cuda2/memory_pools.h index 328a7b95151b..8eccf9ef7105 100644 --- a/experimental/cuda2/memory_pools.h +++ b/experimental/cuda2/memory_pools.h @@ -7,61 +7,63 @@ #ifndef IREE_HAL_DRIVERS_CUDA_MEMORY_POOLS_H_ #define IREE_HAL_DRIVERS_CUDA_MEMORY_POOLS_H_ +#include "experimental/cuda2/api.h" +#include "experimental/cuda2/cuda_dynamic_symbols.h" +#include "experimental/cuda2/cuda_headers.h" #include "iree/base/api.h" #include "iree/base/internal/atomics.h" #include "iree/hal/api.h" -#include "iree/hal/drivers/cuda/api.h" -#include "iree/hal/drivers/cuda/context_wrapper.h" -#include "iree/hal/drivers/cuda/cuda_headers.h" #ifdef __cplusplus extern "C" { #endif // __cplusplus // Retained CUDA memory pools for various allocation types. -typedef struct iree_hal_cuda_memory_pools_t { - // CUDA context the pools are attached to. - iree_hal_cuda_context_wrapper_t* context; +typedef struct iree_hal_cuda2_memory_pools_t { // Used exclusively for DEVICE_LOCAL allocations. CUmemoryPool device_local; // Used for any host-visible/host-local memory types. CUmemoryPool other; + const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols; + iree_allocator_t host_allocator; + IREE_STATISTICS(struct { iree_atomic_int64_t device_bytes_allocated; iree_atomic_int64_t device_bytes_freed; iree_atomic_int64_t host_bytes_allocated; iree_atomic_int64_t host_bytes_freed; } statistics;) -} iree_hal_cuda_memory_pools_t; +} iree_hal_cuda2_memory_pools_t; // Initializes |out_pools| by configuring new CUDA memory pools. 
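// A minimal sketch of the stream-ordered pool lifecycle implemented above:
// create a device pool, set its release threshold, allocate and free from it
// in stream order, then destroy it. Assumes a current CUDA context and direct
// driver API linking; the helper name is hypothetical.
#include <cuda.h>
#include <stddef.h>
#include <string.h>

static CUresult memory_pool_round_trip(CUdevice device, CUstream stream,
                                       size_t size,
                                       cuuint64_t release_threshold) {
  CUmemPoolProps props;
  memset(&props, 0, sizeof(props));
  props.allocType = CU_MEM_ALLOCATION_TYPE_PINNED;
  props.handleTypes = CU_MEM_HANDLE_TYPE_NONE;
  props.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  props.location.id = device;

  CUmemoryPool pool = NULL;
  CUresult result = cuMemPoolCreate(&pool, &props);
  if (result != CUDA_SUCCESS) return result;

  // Let the pool retain up to |release_threshold| bytes of freed memory
  // instead of returning it to the system immediately.
  result = cuMemPoolSetAttribute(pool, CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
                                 &release_threshold);

  CUdeviceptr ptr = 0;
  if (result == CUDA_SUCCESS) {
    // Both the allocation and the free are ordered against |stream|.
    result = cuMemAllocFromPoolAsync(&ptr, size, pool, stream);
  }
  if (result == CUDA_SUCCESS) {
    result = cuMemFreeAsync(ptr, stream);
  }

  cuMemPoolDestroy(pool);
  return result;
}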
-iree_status_t iree_hal_cuda_memory_pools_initialize( - iree_hal_cuda_context_wrapper_t* context, - const iree_hal_cuda_memory_pooling_params_t* pooling_params, - iree_hal_cuda_memory_pools_t* IREE_RESTRICT out_pools); +iree_status_t iree_hal_cuda2_memory_pools_initialize( + iree_allocator_t host_allocator, + const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols, CUdevice cu_device, + const iree_hal_cuda2_memory_pooling_params_t* pooling_params, + iree_hal_cuda2_memory_pools_t* IREE_RESTRICT out_pools); // Deinitializes the |pools| and releases the underlying CUDA resources. -void iree_hal_cuda_memory_pools_deinitialize( - iree_hal_cuda_memory_pools_t* pools); +void iree_hal_cuda2_memory_pools_deinitialize( + iree_hal_cuda2_memory_pools_t* pools); // Merges statistics information from |pools| into |statistics|. -void iree_hal_cuda_memory_pools_merge_statistics( - iree_hal_cuda_memory_pools_t* pools, +void iree_hal_cuda2_memory_pools_merge_statistics( + iree_hal_cuda2_memory_pools_t* pools, iree_hal_allocator_statistics_t* statistics); // Asynchronously allocates a buffer from an appropriate pool. // The allocation will be stream-ordered on |stream|. -iree_status_t iree_hal_cuda_memory_pools_alloca( - iree_hal_cuda_memory_pools_t* pools, CUstream stream, +iree_status_t iree_hal_cuda2_memory_pools_alloca( + iree_hal_cuda2_memory_pools_t* pools, CUstream stream, iree_hal_allocator_pool_t pool, iree_hal_buffer_params_t params, iree_device_size_t allocation_size, iree_hal_buffer_t** IREE_RESTRICT out_buffer); // Asynchronously deallocates a buffer from its pool. // The deallocation will be stream-ordered on |stream|. -iree_status_t iree_hal_cuda_memory_pools_dealloca( - iree_hal_cuda_memory_pools_t* pools, CUstream stream, +iree_status_t iree_hal_cuda2_memory_pools_dealloca( + iree_hal_cuda2_memory_pools_t* pools, CUstream stream, iree_hal_buffer_t* buffer); #ifdef __cplusplus From f54e50c7e01378ae0607d31b0368603db21dcf87 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Wed, 7 Jun 2023 18:13:51 -0700 Subject: [PATCH 3/3] Address comments --- experimental/cuda2/cuda_allocator.c | 6 +++--- experimental/cuda2/cuda_allocator.h | 6 ++++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/experimental/cuda2/cuda_allocator.c b/experimental/cuda2/cuda_allocator.c index 64ad197f0b80..5ec9a396af32 100644 --- a/experimental/cuda2/cuda_allocator.c +++ b/experimental/cuda2/cuda_allocator.c @@ -569,10 +569,10 @@ static iree_status_t iree_hal_cuda2_allocator_import_buffer( case IREE_HAL_EXTERNAL_BUFFER_TYPE_OPAQUE_FD: case IREE_HAL_EXTERNAL_BUFFER_TYPE_OPAQUE_WIN32: return iree_make_status(IREE_STATUS_UNIMPLEMENTED, - "unimplmented handle-based imports"); + "handle-based imports not yet implemented"); default: return iree_make_status(IREE_STATUS_UNIMPLEMENTED, - "unimplmented external buffer type"); + "external buffer type not supported"); } iree_hal_buffer_t* buffer = NULL; @@ -605,7 +605,7 @@ static iree_status_t iree_hal_cuda2_allocator_export_buffer( iree_hal_external_buffer_flags_t requested_flags, iree_hal_external_buffer_t* IREE_RESTRICT out_external_buffer) { return iree_make_status(IREE_STATUS_UNIMPLEMENTED, - "unimplemented exporting to external buffers"); + "exporting to external buffers not supported"); } static const iree_hal_allocator_vtable_t iree_hal_cuda2_allocator_vtable = { diff --git a/experimental/cuda2/cuda_allocator.h b/experimental/cuda2/cuda_allocator.h index 0f3134359caf..2ff33ea467c0 100644 --- a/experimental/cuda2/cuda_allocator.h +++ b/experimental/cuda2/cuda_allocator.h 
@@ -16,8 +16,10 @@ extern "C" {
 #endif // __cplusplus
 
-// Create a CUDA allocator that allocates device memory from the given
-// |device| and used in the given |stream|.
+// Creates a CUDA memory allocator.
+// |device| and |stream| will be used for management operations.
+// |pools| provides memory pools that may be shared across multiple allocators
+// and the pointer must remain valid for the lifetime of the allocator.
 iree_status_t iree_hal_cuda2_allocator_create(
     iree_hal_device_t* base_device,
     const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols, CUdevice device,