From 8d35942ee902ec6be06422b60346af74108e4324 Mon Sep 17 00:00:00 2001
From: Lei Zhang
Date: Wed, 7 Jun 2023 11:14:35 -0700
Subject: [PATCH] [cuda] Port over allocator and buffer implementation

This commit ports over the existing CUDA driver allocator and buffer
implementation. The main logic is kept as-is, with one noticeable
change: the context wrapper is dropped and its fields are passed
directly to the various API calls. This is to make supporting multiple
devices and streams easier later. Other changes are just polish to
comments and error messages.

---
 experimental/cuda2/CMakeLists.txt   |   4 +
 experimental/cuda2/cuda_allocator.c | 251 ++++++++++++++++------------
 experimental/cuda2/cuda_allocator.h |  22 +-
 experimental/cuda2/cuda_buffer.c    |  84 +++++-----
 experimental/cuda2/cuda_buffer.h    |  41 ++---
 5 files changed, 221 insertions(+), 181 deletions(-)

diff --git a/experimental/cuda2/CMakeLists.txt b/experimental/cuda2/CMakeLists.txt
index c104bcde8f4e..83b927b115b8 100644
--- a/experimental/cuda2/CMakeLists.txt
+++ b/experimental/cuda2/CMakeLists.txt
@@ -17,6 +17,10 @@ iree_cc_library(
       "api.h"
     SRCS
       "api.h"
+      "cuda_allocator.c"
+      "cuda_allocator.h"
+      "cuda_buffer.c"
+      "cuda_buffer.h"
       "cuda_driver.c"
     DEPS
       ::dynamic_symbols
diff --git a/experimental/cuda2/cuda_allocator.c b/experimental/cuda2/cuda_allocator.c
index 4d22a01fe4fe..4a0cddd8a41f 100644
--- a/experimental/cuda2/cuda_allocator.c
+++ b/experimental/cuda2/cuda_allocator.c
@@ -1,47 +1,63 @@
-// Copyright 2021 The IREE Authors
+// Copyright 2023 The IREE Authors
 //
 // Licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#include "iree/hal/drivers/cuda/cuda_allocator.h"
+#include "experimental/cuda2/cuda_allocator.h"
 
 #include <stddef.h>
 
+#include "experimental/cuda2/cuda_buffer.h"
+#include "experimental/cuda2/cuda_dynamic_symbols.h"
+#include "experimental/cuda2/cuda_status_util.h"
 #include "iree/base/api.h"
 #include "iree/base/tracing.h"
-#include "iree/hal/drivers/cuda/cuda_buffer.h"
-#include "iree/hal/drivers/cuda/dynamic_symbols.h"
-#include "iree/hal/drivers/cuda/status_util.h"
 
 #if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_ALLOCATION_TRACKING
-static const char* IREE_HAL_CUDA_ALLOCATOR_ID = "CUDA";
+static const char* IREE_HAL_CUDA_ALLOCATOR_ID = "CUDA2";
 #endif  // IREE_TRACING_FEATURE_ALLOCATION_TRACKING
 
-typedef struct iree_hal_cuda_allocator_t {
+typedef struct iree_hal_cuda2_allocator_t {
+  // Abstract resource used for injecting reference counting and vtable;
+  // must be at offset 0.
   iree_hal_resource_t resource;
+
+  // The device that this allocator allocates memory from.
   iree_hal_device_t* base_device;
-  iree_hal_cuda_context_wrapper_t* context;
   CUdevice device;
+
+  // The CUDA stream that allocations should be used in.
   CUstream stream;
+
+  const iree_hal_cuda2_dynamic_symbols_t* symbols;
+
+  iree_allocator_t host_allocator;
+
+  // Whether the GPU and CPU can concurrently access CUDA managed data in a
+  // coherent way. If not, we would need to explicitly perform data migration
+  // and synchronization between the GPU and CPU.
   bool supports_concurrent_managed_access;
 
   IREE_STATISTICS(iree_hal_allocator_statistics_t statistics;)
-} iree_hal_cuda_allocator_t;
+} iree_hal_cuda2_allocator_t;
 
-static const iree_hal_allocator_vtable_t iree_hal_cuda_allocator_vtable;
+static const iree_hal_allocator_vtable_t iree_hal_cuda2_allocator_vtable;
 
-static iree_hal_cuda_allocator_t* iree_hal_cuda_allocator_cast(
+static iree_hal_cuda2_allocator_t* iree_hal_cuda2_allocator_cast(
     iree_hal_allocator_t* base_value) {
-  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda_allocator_vtable);
-  return (iree_hal_cuda_allocator_t*)base_value;
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda2_allocator_vtable);
+  return (iree_hal_cuda2_allocator_t*)base_value;
 }
 
-iree_status_t iree_hal_cuda_allocator_create(
-    iree_hal_device_t* base_device, iree_hal_cuda_context_wrapper_t* context,
-    CUdevice device, CUstream stream, iree_hal_allocator_t** out_allocator) {
+iree_status_t iree_hal_cuda2_allocator_create(
+    iree_hal_device_t* base_device,
+    const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols, CUdevice device,
+    CUstream stream, iree_allocator_t host_allocator,
+    iree_hal_allocator_t** out_allocator) {
   IREE_ASSERT_ARGUMENT(base_device);
-  IREE_ASSERT_ARGUMENT(context);
+  IREE_ASSERT_ARGUMENT(cuda_symbols);
+  IREE_ASSERT_ARGUMENT(out_allocator);
   IREE_TRACE_ZONE_BEGIN(z0);
 
   // To support device-local + host-visible memory we need concurrent managed
@@ -52,8 +68,8 @@ iree_status_t iree_hal_cuda_allocator_create(
   // buffers except for readback staging buffers.
   int supports_concurrent_managed_access = 0;
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
-      z0, CU_RESULT_TO_STATUS(
-              context->syms,
+      z0, IREE_CURESULT_TO_STATUS(
+              cuda_symbols,
               cuDeviceGetAttribute(
                   &supports_concurrent_managed_access,
                   CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, device),
@@ -65,16 +81,17 @@ iree_status_t iree_hal_cuda_allocator_create(
               : "no CONCURRENT_MANAGED_ACCESS (expect slow accesses on "
                 "device-local + host-visible memory)");
 
-  iree_hal_cuda_allocator_t* allocator = NULL;
+  iree_hal_cuda2_allocator_t* allocator = NULL;
   iree_status_t status = iree_allocator_malloc(
-      context->host_allocator, sizeof(*allocator), (void**)&allocator);
+      host_allocator, sizeof(*allocator), (void**)&allocator);
   if (iree_status_is_ok(status)) {
-    iree_hal_resource_initialize(&iree_hal_cuda_allocator_vtable,
+    iree_hal_resource_initialize(&iree_hal_cuda2_allocator_vtable,
                                  &allocator->resource);
     allocator->base_device = base_device;
-    allocator->context = context;
    allocator->device = device;
     allocator->stream = stream;
+    allocator->symbols = cuda_symbols;
+    allocator->host_allocator = host_allocator;
     allocator->supports_concurrent_managed_access =
         supports_concurrent_managed_access != 0;
     *out_allocator = (iree_hal_allocator_t*)allocator;
@@ -84,47 +101,51 @@ iree_status_t iree_hal_cuda_allocator_create(
   return status;
 }
 
-static void iree_hal_cuda_allocator_destroy(
+static void iree_hal_cuda2_allocator_destroy(
     iree_hal_allocator_t* IREE_RESTRICT base_allocator) {
-  iree_hal_cuda_allocator_t* allocator =
-      iree_hal_cuda_allocator_cast(base_allocator);
-  iree_allocator_t host_allocator = allocator->context->host_allocator;
+  IREE_ASSERT_ARGUMENT(base_allocator);
+  iree_hal_cuda2_allocator_t* allocator =
+      iree_hal_cuda2_allocator_cast(base_allocator);
   IREE_TRACE_ZONE_BEGIN(z0);
 
-  iree_allocator_free(host_allocator, allocator);
+  iree_allocator_free(allocator->host_allocator, allocator);
 
   IREE_TRACE_ZONE_END(z0);
 }
 
-static iree_allocator_t iree_hal_cuda_allocator_host_allocator(
+static iree_allocator_t iree_hal_cuda2_allocator_host_allocator(
     const iree_hal_allocator_t* IREE_RESTRICT base_allocator) {
-  iree_hal_cuda_allocator_t* allocator =
-      (iree_hal_cuda_allocator_t*)base_allocator;
-  return allocator->context->host_allocator;
+  iree_hal_cuda2_allocator_t* allocator =
+      (iree_hal_cuda2_allocator_t*)base_allocator;
+  return allocator->host_allocator;
 }
 
-static iree_status_t iree_hal_cuda_allocator_trim(
+static iree_status_t iree_hal_cuda2_allocator_trim(
     iree_hal_allocator_t* IREE_RESTRICT base_allocator) {
   return iree_ok_status();
 }
 
-static void iree_hal_cuda_allocator_query_statistics(
+static void iree_hal_cuda2_allocator_query_statistics(
     iree_hal_allocator_t* IREE_RESTRICT base_allocator,
     iree_hal_allocator_statistics_t* IREE_RESTRICT out_statistics) {
   IREE_STATISTICS({
-    iree_hal_cuda_allocator_t* allocator =
-        iree_hal_cuda_allocator_cast(base_allocator);
+    iree_hal_cuda2_allocator_t* allocator =
+        iree_hal_cuda2_allocator_cast(base_allocator);
     memcpy(out_statistics, &allocator->statistics, sizeof(*out_statistics));
   });
 }
 
-static iree_status_t iree_hal_cuda_allocator_query_memory_heaps(
+static iree_status_t iree_hal_cuda2_allocator_query_memory_heaps(
     iree_hal_allocator_t* IREE_RESTRICT base_allocator,
     iree_host_size_t capacity,
     iree_hal_allocator_memory_heap_t* IREE_RESTRICT heaps,
     iree_host_size_t* IREE_RESTRICT out_count) {
-  iree_hal_cuda_allocator_t* allocator =
-      iree_hal_cuda_allocator_cast(base_allocator);
+  IREE_ASSERT_ARGUMENT(base_allocator);
+  IREE_ASSERT_ARGUMENT(heaps);
+  IREE_ASSERT_ARGUMENT(out_count);
+
+  iree_hal_cuda2_allocator_t* allocator =
+      iree_hal_cuda2_allocator_cast(base_allocator);
 
   // TODO(benvanik): check CU_DEVICE_ATTRIBUTE_INTEGRATED and return a unified
   // set of heaps (likely still a cached and uncached, at minimum).
@@ -199,12 +220,12 @@ static iree_status_t iree_hal_cuda_allocator_query_memory_heaps(
 }
 
 static iree_hal_buffer_compatibility_t
-iree_hal_cuda_allocator_query_buffer_compatibility(
+iree_hal_cuda2_allocator_query_buffer_compatibility(
     iree_hal_allocator_t* IREE_RESTRICT base_allocator,
     iree_hal_buffer_params_t* IREE_RESTRICT params,
     iree_device_size_t* IREE_RESTRICT allocation_size) {
-  iree_hal_cuda_allocator_t* allocator =
-      iree_hal_cuda_allocator_cast(base_allocator);
+  iree_hal_cuda2_allocator_t* allocator =
+      iree_hal_cuda2_allocator_cast(base_allocator);
 
   // All buffers can be allocated on the heap.
   iree_hal_buffer_compatibility_t compatibility =
@@ -254,42 +275,46 @@ iree_hal_cuda_allocator_query_buffer_compatibility(
   return compatibility;
 }
 
-static void iree_hal_cuda_buffer_free(iree_hal_cuda_context_wrapper_t* context,
-                                      iree_hal_cuda_buffer_type_t buffer_type,
-                                      CUdeviceptr device_ptr, void* host_ptr) {
+static void iree_hal_cuda2_buffer_free(
+    const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols,
+    iree_hal_cuda2_buffer_type_t buffer_type, CUdeviceptr device_ptr,
+    void* host_ptr) {
   IREE_TRACE_ZONE_BEGIN(z0);
   switch (buffer_type) {
     case IREE_HAL_CUDA_BUFFER_TYPE_DEVICE: {
       IREE_TRACE_ZONE_APPEND_TEXT(z0, "cuMemFree");
-      CUDA_IGNORE_ERROR(context->syms, cuMemFree(device_ptr));
+      IREE_CUDA_IGNORE_ERROR(cuda_symbols, cuMemFree(device_ptr));
       break;
     }
     case IREE_HAL_CUDA_BUFFER_TYPE_HOST: {
       IREE_TRACE_ZONE_APPEND_TEXT(z0, "cuMemFreeHost");
-      CUDA_IGNORE_ERROR(context->syms, cuMemFreeHost(host_ptr));
+      IREE_CUDA_IGNORE_ERROR(cuda_symbols, cuMemFreeHost(host_ptr));
       break;
     }
    case IREE_HAL_CUDA_BUFFER_TYPE_HOST_REGISTERED: {
       IREE_TRACE_ZONE_APPEND_TEXT(z0, "cuMemHostUnregister");
-      CUDA_IGNORE_ERROR(context->syms, cuMemHostUnregister(host_ptr));
+      IREE_CUDA_IGNORE_ERROR(cuda_symbols, cuMemHostUnregister(host_ptr));
       break;
     }
   }
   IREE_TRACE_ZONE_END(z0);
 }
 
-static iree_status_t iree_hal_cuda_allocator_allocate_buffer(
+static iree_status_t iree_hal_cuda2_allocator_allocate_buffer(
     iree_hal_allocator_t* IREE_RESTRICT base_allocator,
     const iree_hal_buffer_params_t* IREE_RESTRICT params,
     iree_device_size_t allocation_size, iree_const_byte_span_t initial_data,
     iree_hal_buffer_t** IREE_RESTRICT out_buffer) {
-  iree_hal_cuda_allocator_t* allocator =
-      iree_hal_cuda_allocator_cast(base_allocator);
+  IREE_ASSERT_ARGUMENT(base_allocator);
+  IREE_ASSERT_ARGUMENT(params);
+  IREE_ASSERT_ARGUMENT(out_buffer);
+  iree_hal_cuda2_allocator_t* allocator =
+      iree_hal_cuda2_allocator_cast(base_allocator);
 
   // Coerce options into those required by the current device.
   iree_hal_buffer_params_t compat_params = *params;
   iree_hal_buffer_compatibility_t compatibility =
-      iree_hal_cuda_allocator_query_buffer_compatibility(
+      iree_hal_cuda2_allocator_query_buffer_compatibility(
           base_allocator, &compat_params, &allocation_size);
   if (!iree_all_bits_set(compatibility,
                          IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE)) {
@@ -315,26 +340,25 @@ static iree_status_t iree_hal_cuda_allocator_allocate_buffer(
   }
 
   iree_status_t status = iree_ok_status();
-  iree_hal_cuda_buffer_type_t buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE;
+  iree_hal_cuda2_buffer_type_t buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE;
   void* host_ptr = NULL;
   CUdeviceptr device_ptr = 0;
-  IREE_TRACE_ZONE_BEGIN_NAMED(z0, "iree_hal_cuda_buffer_allocate");
+  IREE_TRACE_ZONE_BEGIN_NAMED(z0, "iree_hal_cuda2_buffer_allocate");
   IREE_TRACE_ZONE_APPEND_VALUE(z0, allocation_size);
   if (iree_all_bits_set(compat_params.type,
                         IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL)) {
-    // Device local case.
     if (iree_all_bits_set(compat_params.type,
                           IREE_HAL_MEMORY_TYPE_HOST_VISIBLE)) {
+      // Device local + host visible.
       buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE;
-      status =
-          CU_RESULT_TO_STATUS(allocator->context->syms,
-                              cuMemAllocManaged(&device_ptr, allocation_size,
+      status = IREE_CURESULT_TO_STATUS(
+          allocator->symbols, cuMemAllocManaged(&device_ptr, allocation_size,
                                                 CU_MEM_ATTACH_GLOBAL));
       if (iree_status_is_ok(status) &&
           allocator->supports_concurrent_managed_access) {
-        // Prefetch the buffer on the GPU device.
-        status = CU_RESULT_TO_STATUS(
-            allocator->context->syms,
+        // Prefetch the buffer to the GPU device on our stream.
+        status = IREE_CURESULT_TO_STATUS(
+            allocator->symbols,
             cuMemPrefetchAsync(device_ptr, allocation_size, allocator->device,
                                allocator->stream));
       }
@@ -342,22 +366,22 @@ static iree_status_t iree_hal_cuda_allocator_allocate_buffer(
     } else {
       // Device only.
       buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE;
-      status = CU_RESULT_TO_STATUS(allocator->context->syms,
-                                   cuMemAlloc(&device_ptr, allocation_size));
+      status = IREE_CURESULT_TO_STATUS(
+          allocator->symbols, cuMemAlloc(&device_ptr, allocation_size));
     }
   } else {
+    // Host local cases.
     buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_HOST;
     unsigned int flags = CU_MEMHOSTALLOC_DEVICEMAP;
     if (!iree_all_bits_set(compat_params.type,
                            IREE_HAL_MEMORY_TYPE_HOST_CACHED)) {
       flags |= CU_MEMHOSTALLOC_WRITECOMBINED;
     }
-    status =
-        CU_RESULT_TO_STATUS(allocator->context->syms,
-                            cuMemHostAlloc(&host_ptr, allocation_size, flags));
+    status = IREE_CURESULT_TO_STATUS(
+        allocator->symbols, cuMemHostAlloc(&host_ptr, allocation_size, flags));
     if (iree_status_is_ok(status)) {
-      status = CU_RESULT_TO_STATUS(
-          allocator->context->syms,
+      status = IREE_CURESULT_TO_STATUS(
+          allocator->symbols,
          cuMemHostGetDevicePointer(&device_ptr, host_ptr, /*flags=*/0));
     }
   }
@@ -365,7 +389,7 @@ static iree_status_t iree_hal_cuda_allocator_allocate_buffer(
 
   iree_hal_buffer_t* buffer = NULL;
   if (iree_status_is_ok(status)) {
-    status = iree_hal_cuda_buffer_wrap(
+    status = iree_hal_cuda2_buffer_wrap(
         base_allocator, compat_params.type, compat_params.access,
         compat_params.usage, allocation_size,
         /*byte_offset=*/0,
@@ -387,15 +411,15 @@ static iree_status_t iree_hal_cuda_allocator_allocate_buffer(
 
   if (iree_status_is_ok(status)) {
     IREE_TRACE_ALLOC_NAMED(IREE_HAL_CUDA_ALLOCATOR_ID,
-                           (void*)iree_hal_cuda_buffer_device_pointer(buffer),
+                           (void*)iree_hal_cuda2_buffer_device_pointer(buffer),
                            allocation_size);
     IREE_STATISTICS(iree_hal_allocator_statistics_record_alloc(
         &allocator->statistics, compat_params.type, allocation_size));
     *out_buffer = buffer;
   } else {
     if (!buffer) {
-      iree_hal_cuda_buffer_free(allocator->context, buffer_type, device_ptr,
-                                host_ptr);
+      iree_hal_cuda2_buffer_free(allocator->symbols, buffer_type, device_ptr,
+                                 host_ptr);
     } else {
       iree_hal_buffer_release(buffer);
     }
@@ -403,14 +427,16 @@ static iree_status_t iree_hal_cuda_allocator_allocate_buffer(
   return status;
 }
 
-static void iree_hal_cuda_allocator_deallocate_buffer(
+static void iree_hal_cuda2_allocator_deallocate_buffer(
     iree_hal_allocator_t* IREE_RESTRICT base_allocator,
     iree_hal_buffer_t* IREE_RESTRICT base_buffer) {
-  iree_hal_cuda_allocator_t* allocator =
-      iree_hal_cuda_allocator_cast(base_allocator);
+  IREE_ASSERT_ARGUMENT(base_allocator);
+  IREE_ASSERT_ARGUMENT(base_buffer);
+  iree_hal_cuda2_allocator_t* allocator =
+      iree_hal_cuda2_allocator_cast(base_allocator);
 
-  const iree_hal_cuda_buffer_type_t buffer_type =
-      iree_hal_cuda_buffer_type(base_buffer);
+  const iree_hal_cuda2_buffer_type_t buffer_type =
+      iree_hal_cuda2_buffer_type(base_buffer);
 
   // WARNING: we may be called from a random thread and need to ensure that we
   // have an active CUDA context. Unfortunately CUDA is CUDA and trying to
@@ -426,16 +452,16 @@ static void iree_hal_cuda_allocator_deallocate_buffer(
   // to silently ignore them: whatever the user tries to do next will fail in
   // the same way and if we were deallocating this buffer as part of a tear-down
   // on failure we don't want to end up dying during cleanup.
-  iree_hal_cuda_buffer_free(allocator->context, buffer_type,
-                            iree_hal_cuda_buffer_device_pointer(base_buffer),
-                            iree_hal_cuda_buffer_host_pointer(base_buffer));
+  iree_hal_cuda2_buffer_free(allocator->symbols, buffer_type,
+                             iree_hal_cuda2_buffer_device_pointer(base_buffer),
+                             iree_hal_cuda2_buffer_host_pointer(base_buffer));
 
   switch (buffer_type) {
     case IREE_HAL_CUDA_BUFFER_TYPE_DEVICE:
     case IREE_HAL_CUDA_BUFFER_TYPE_HOST: {
       IREE_TRACE_FREE_NAMED(
           IREE_HAL_CUDA_ALLOCATOR_ID,
-          (void*)iree_hal_cuda_buffer_device_pointer(base_buffer));
+          (void*)iree_hal_cuda2_buffer_device_pointer(base_buffer));
       IREE_STATISTICS(iree_hal_allocator_statistics_record_free(
           &allocator->statistics, iree_hal_buffer_memory_type(base_buffer),
           iree_hal_buffer_allocation_size(base_buffer)));
@@ -449,20 +475,24 @@ static void iree_hal_cuda_allocator_deallocate_buffer(
   iree_hal_buffer_destroy(base_buffer);
 }
 
-static iree_status_t iree_hal_cuda_allocator_import_buffer(
+static iree_status_t iree_hal_cuda2_allocator_import_buffer(
     iree_hal_allocator_t* IREE_RESTRICT base_allocator,
     const iree_hal_buffer_params_t* IREE_RESTRICT params,
     iree_hal_external_buffer_t* IREE_RESTRICT external_buffer,
     iree_hal_buffer_release_callback_t release_callback,
     iree_hal_buffer_t** IREE_RESTRICT out_buffer) {
-  iree_hal_cuda_allocator_t* allocator =
-      iree_hal_cuda_allocator_cast(base_allocator);
+  IREE_ASSERT_ARGUMENT(base_allocator);
+  IREE_ASSERT_ARGUMENT(params);
+  IREE_ASSERT_ARGUMENT(external_buffer);
+  IREE_ASSERT_ARGUMENT(out_buffer);
+  iree_hal_cuda2_allocator_t* allocator =
+      iree_hal_cuda2_allocator_cast(base_allocator);
 
   // Coerce options into those required by the current device.
   iree_hal_buffer_params_t compat_params = *params;
   iree_device_size_t allocation_size = external_buffer->size;
   iree_hal_buffer_compatibility_t compatibility =
-      iree_hal_cuda_allocator_query_buffer_compatibility(
+      iree_hal_cuda2_allocator_query_buffer_compatibility(
          base_allocator, &compat_params, &allocation_size);
   if (!iree_all_bits_set(compatibility,
                          IREE_HAL_BUFFER_COMPATIBILITY_IMPORTABLE)) {
@@ -488,7 +518,7 @@ static iree_status_t iree_hal_cuda_allocator_import_buffer(
   }
 
   iree_status_t status = iree_ok_status();
-  iree_hal_cuda_buffer_type_t buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE;
+  iree_hal_cuda2_buffer_type_t buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE;
   void* host_ptr = NULL;
   CUdeviceptr device_ptr = 0;
@@ -513,13 +543,13 @@ static iree_status_t iree_hal_cuda_allocator_import_buffer(
                             IREE_HAL_BUFFER_USAGE_DISPATCH_IMAGE)) {
         register_flags = CU_MEMHOSTREGISTER_DEVICEMAP;
       }
-      status = CU_RESULT_TO_STATUS(
-          allocator->context->syms,
+      status = IREE_CURESULT_TO_STATUS(
+          allocator->symbols,
           cuMemHostRegister(host_ptr, external_buffer->size, register_flags),
           "cuMemHostRegister");
       if (iree_status_is_ok(status)) {
-        status = CU_RESULT_TO_STATUS(
-            allocator->context->syms,
+        status = IREE_CURESULT_TO_STATUS(
+            allocator->symbols,
             cuMemHostGetDevicePointer(&device_ptr, host_ptr, 0),
             "cuMemHostGetDevicePointer");
       }
@@ -528,18 +558,17 @@ static iree_status_t iree_hal_cuda_allocator_import_buffer(
     case IREE_HAL_EXTERNAL_BUFFER_TYPE_OPAQUE_FD:
     case IREE_HAL_EXTERNAL_BUFFER_TYPE_OPAQUE_WIN32:
       return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
-                              "handle-based imports not yet implemented");
+                              "unimplemented handle-based imports");
     default:
       return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
-                              "external buffer type not supported");
+                              "unimplemented external buffer type");
   }
 
   iree_hal_buffer_t* buffer = NULL;
   if (iree_status_is_ok(status)) {
-    status = iree_hal_cuda_buffer_wrap(
+    status = iree_hal_cuda2_buffer_wrap(
         base_allocator, compat_params.type, compat_params.access,
-        compat_params.usage, external_buffer->size,
-        /*byte_offset=*/0,
+        compat_params.usage, external_buffer->size, /*byte_offset=*/0,
         /*byte_length=*/external_buffer->size, buffer_type, device_ptr,
         host_ptr, release_callback, &buffer);
   }
@@ -548,8 +577,8 @@ static iree_status_t iree_hal_cuda_allocator_import_buffer(
     *out_buffer = buffer;
   } else {
     if (!buffer) {
-      iree_hal_cuda_buffer_free(allocator->context, buffer_type, device_ptr,
-                                host_ptr);
+      iree_hal_cuda2_buffer_free(allocator->symbols, buffer_type, device_ptr,
+                                 host_ptr);
    } else {
       iree_hal_buffer_release(buffer);
     }
@@ -557,26 +586,26 @@ static iree_status_t iree_hal_cuda_allocator_import_buffer(
   return status;
 }
 
-static iree_status_t iree_hal_cuda_allocator_export_buffer(
+static iree_status_t iree_hal_cuda2_allocator_export_buffer(
     iree_hal_allocator_t* IREE_RESTRICT base_allocator,
     iree_hal_buffer_t* IREE_RESTRICT buffer,
     iree_hal_external_buffer_type_t requested_type,
     iree_hal_external_buffer_flags_t requested_flags,
     iree_hal_external_buffer_t* IREE_RESTRICT out_external_buffer) {
-  return iree_make_status(IREE_STATUS_UNAVAILABLE,
-                          "exporting to external buffers not supported");
+  return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                          "unimplemented exporting to external buffers");
 }
 
-static const iree_hal_allocator_vtable_t iree_hal_cuda_allocator_vtable = {
-    .destroy = iree_hal_cuda_allocator_destroy,
-    .host_allocator = iree_hal_cuda_allocator_host_allocator,
-    .trim = iree_hal_cuda_allocator_trim,
-    .query_statistics = iree_hal_cuda_allocator_query_statistics,
-    .query_memory_heaps = iree_hal_cuda_allocator_query_memory_heaps,
+static const iree_hal_allocator_vtable_t iree_hal_cuda2_allocator_vtable = {
+    .destroy = iree_hal_cuda2_allocator_destroy,
+    .host_allocator = iree_hal_cuda2_allocator_host_allocator,
+    .trim = iree_hal_cuda2_allocator_trim,
+    .query_statistics = iree_hal_cuda2_allocator_query_statistics,
+    .query_memory_heaps = iree_hal_cuda2_allocator_query_memory_heaps,
     .query_buffer_compatibility =
-        iree_hal_cuda_allocator_query_buffer_compatibility,
-    .allocate_buffer = iree_hal_cuda_allocator_allocate_buffer,
-    .deallocate_buffer = iree_hal_cuda_allocator_deallocate_buffer,
-    .import_buffer = iree_hal_cuda_allocator_import_buffer,
-    .export_buffer = iree_hal_cuda_allocator_export_buffer,
+        iree_hal_cuda2_allocator_query_buffer_compatibility,
+    .allocate_buffer = iree_hal_cuda2_allocator_allocate_buffer,
+    .deallocate_buffer = iree_hal_cuda2_allocator_deallocate_buffer,
+    .import_buffer = iree_hal_cuda2_allocator_import_buffer,
+    .export_buffer = iree_hal_cuda2_allocator_export_buffer,
 };
diff --git a/experimental/cuda2/cuda_allocator.h b/experimental/cuda2/cuda_allocator.h
index b2f272895a91..1a063e3cd7c1 100644
--- a/experimental/cuda2/cuda_allocator.h
+++ b/experimental/cuda2/cuda_allocator.h
@@ -1,28 +1,30 @@
-// Copyright 2021 The IREE Authors
+// Copyright 2023 The IREE Authors
 //
 // Licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#ifndef IREE_HAL_DRIVERS_CUDA_ALLOCATOR_H_
-#define IREE_HAL_DRIVERS_CUDA_ALLOCATOR_H_
+#ifndef EXPERIMENTAL_CUDA2_CUDA_ALLOCATOR_H_
+#define EXPERIMENTAL_CUDA2_CUDA_ALLOCATOR_H_
 
+#include "experimental/cuda2/cuda_dynamic_symbols.h"
 #include "iree/base/api.h"
 #include "iree/hal/api.h"
-#include "iree/hal/drivers/cuda/context_wrapper.h"
-#include "iree/hal/drivers/cuda/status_util.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif  // __cplusplus
 
-// Create a cuda allocator.
-iree_status_t iree_hal_cuda_allocator_create(
-    iree_hal_device_t* base_device, iree_hal_cuda_context_wrapper_t* context,
-    CUdevice device, CUstream stream, iree_hal_allocator_t** out_allocator);
+// Creates a CUDA allocator that allocates device memory from the given
+// |device| for use on the given |stream|.
+iree_status_t iree_hal_cuda2_allocator_create(
+    iree_hal_device_t* base_device,
+    const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols, CUdevice device,
+    CUstream stream, iree_allocator_t host_allocator,
+    iree_hal_allocator_t** out_allocator);
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
 
-#endif  // IREE_HAL_DRIVERS_CUDA_ALLOCATOR_H_
+#endif  // EXPERIMENTAL_CUDA2_CUDA_ALLOCATOR_H_
diff --git a/experimental/cuda2/cuda_buffer.c b/experimental/cuda2/cuda_buffer.c
index 3b9c9e13f9e0..d5d4a19ea4e3 100644
--- a/experimental/cuda2/cuda_buffer.c
+++ b/experimental/cuda2/cuda_buffer.c
@@ -1,10 +1,10 @@
-// Copyright 2021 The IREE Authors
+// Copyright 2023 The IREE Authors
 //
 // Licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#include "iree/hal/drivers/cuda/cuda_buffer.h"
+#include "experimental/cuda2/cuda_buffer.h"
 
 #include <stddef.h>
 #include <string.h>
@@ -13,34 +13,34 @@
 #include "iree/base/api.h"
 #include "iree/base/tracing.h"
 
-typedef struct iree_hal_cuda_buffer_t {
+typedef struct iree_hal_cuda2_buffer_t {
   iree_hal_buffer_t base;
-  iree_hal_cuda_buffer_type_t type;
+  iree_hal_cuda2_buffer_type_t type;
   void* host_ptr;
   CUdeviceptr device_ptr;
   iree_hal_buffer_release_callback_t release_callback;
-} iree_hal_cuda_buffer_t;
+} iree_hal_cuda2_buffer_t;
 
-static const iree_hal_buffer_vtable_t iree_hal_cuda_buffer_vtable;
+static const iree_hal_buffer_vtable_t iree_hal_cuda2_buffer_vtable;
 
-static iree_hal_cuda_buffer_t* iree_hal_cuda_buffer_cast(
+static iree_hal_cuda2_buffer_t* iree_hal_cuda2_buffer_cast(
     iree_hal_buffer_t* base_value) {
-  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda_buffer_vtable);
-  return (iree_hal_cuda_buffer_t*)base_value;
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda2_buffer_vtable);
+  return (iree_hal_cuda2_buffer_t*)base_value;
 }
 
-static const iree_hal_cuda_buffer_t* iree_hal_cuda_buffer_const_cast(
+static const iree_hal_cuda2_buffer_t* iree_hal_cuda2_buffer_const_cast(
    const iree_hal_buffer_t* base_value) {
-  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda_buffer_vtable);
-  return (const iree_hal_cuda_buffer_t*)base_value;
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda2_buffer_vtable);
+  return (const iree_hal_cuda2_buffer_t*)base_value;
 }
 
-iree_status_t iree_hal_cuda_buffer_wrap(
+iree_status_t iree_hal_cuda2_buffer_wrap(
     iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type,
     iree_hal_memory_access_t allowed_access,
     iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size,
     iree_device_size_t byte_offset, iree_device_size_t byte_length,
-    iree_hal_cuda_buffer_type_t buffer_type, CUdeviceptr device_ptr,
+    iree_hal_cuda2_buffer_type_t buffer_type, CUdeviceptr device_ptr,
     void* host_ptr, iree_hal_buffer_release_callback_t release_callback,
     iree_hal_buffer_t** out_buffer) {
   IREE_ASSERT_ARGUMENT(allocator);
@@ -49,14 +49,14 @@ iree_status_t iree_hal_cuda_buffer_wrap(
   iree_allocator_t host_allocator =
       iree_hal_allocator_host_allocator(allocator);
 
-  iree_hal_cuda_buffer_t* buffer = NULL;
+  iree_hal_cuda2_buffer_t* buffer = NULL;
   iree_status_t status =
       iree_allocator_malloc(host_allocator, sizeof(*buffer), (void**)&buffer);
   if (iree_status_is_ok(status)) {
     iree_hal_buffer_initialize(host_allocator, allocator, &buffer->base,
                                allocation_size, byte_offset, byte_length,
                                memory_type, allowed_access, allowed_usage,
-                               &iree_hal_cuda_buffer_vtable, &buffer->base);
+                               &iree_hal_cuda2_buffer_vtable, &buffer->base);
     buffer->type = buffer_type;
     buffer->host_ptr = host_ptr;
     buffer->device_ptr = device_ptr;
@@ -68,8 +68,8 @@ iree_status_t iree_hal_cuda_buffer_wrap(
   return status;
 }
 
-static void iree_hal_cuda_buffer_destroy(iree_hal_buffer_t* base_buffer) {
-  iree_hal_cuda_buffer_t* buffer = iree_hal_cuda_buffer_cast(base_buffer);
+static void iree_hal_cuda2_buffer_destroy(iree_hal_buffer_t* base_buffer) {
+  iree_hal_cuda2_buffer_t* buffer = iree_hal_cuda2_buffer_cast(base_buffer);
   iree_allocator_t host_allocator = base_buffer->host_allocator;
   IREE_TRACE_ZONE_BEGIN(z0);
   if (buffer->release_callback.fn) {
@@ -80,12 +80,14 @@ static void iree_hal_cuda_buffer_destroy(iree_hal_buffer_t* base_buffer) {
   IREE_TRACE_ZONE_END(z0);
 }
 
-static iree_status_t iree_hal_cuda_buffer_map_range(
+static iree_status_t iree_hal_cuda2_buffer_map_range(
     iree_hal_buffer_t* base_buffer, iree_hal_mapping_mode_t mapping_mode,
     iree_hal_memory_access_t memory_access,
     iree_device_size_t local_byte_offset, iree_device_size_t local_byte_length,
     iree_hal_buffer_mapping_t* mapping) {
-  iree_hal_cuda_buffer_t* buffer = iree_hal_cuda_buffer_cast(base_buffer);
+  IREE_ASSERT_ARGUMENT(base_buffer);
+  IREE_ASSERT_ARGUMENT(mapping);
+  iree_hal_cuda2_buffer_t* buffer = iree_hal_cuda2_buffer_cast(base_buffer);
 
   // TODO(benvanik): add upload/download for unmapped buffers.
   IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_memory_type(
@@ -110,52 +112,52 @@ static iree_status_t iree_hal_cuda_buffer_map_range(
   return iree_ok_status();
 }
 
-static iree_status_t iree_hal_cuda_buffer_unmap_range(
+static iree_status_t iree_hal_cuda2_buffer_unmap_range(
     iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset,
     iree_device_size_t local_byte_length, iree_hal_buffer_mapping_t* mapping) {
-  // Nothing to do (today).
+  // Nothing to do today.
   return iree_ok_status();
 }
 
-static iree_status_t iree_hal_cuda_buffer_invalidate_range(
+static iree_status_t iree_hal_cuda2_buffer_invalidate_range(
    iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset,
     iree_device_size_t local_byte_length) {
-  // Nothing to do.
+  // Nothing to do today.
   return iree_ok_status();
 }
 
-static iree_status_t iree_hal_cuda_buffer_flush_range(
+static iree_status_t iree_hal_cuda2_buffer_flush_range(
     iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset,
     iree_device_size_t local_byte_length) {
-  // Nothing to do.
+  // Nothing to do today.
   return iree_ok_status();
 }
 
-iree_hal_cuda_buffer_type_t iree_hal_cuda_buffer_type(
+iree_hal_cuda2_buffer_type_t iree_hal_cuda2_buffer_type(
     const iree_hal_buffer_t* base_buffer) {
-  const iree_hal_cuda_buffer_t* buffer =
-      iree_hal_cuda_buffer_const_cast(base_buffer);
+  const iree_hal_cuda2_buffer_t* buffer =
+      iree_hal_cuda2_buffer_const_cast(base_buffer);
   return buffer->type;
 }
 
-CUdeviceptr iree_hal_cuda_buffer_device_pointer(
+CUdeviceptr iree_hal_cuda2_buffer_device_pointer(
     const iree_hal_buffer_t* base_buffer) {
-  const iree_hal_cuda_buffer_t* buffer =
-      iree_hal_cuda_buffer_const_cast(base_buffer);
+  const iree_hal_cuda2_buffer_t* buffer =
+      iree_hal_cuda2_buffer_const_cast(base_buffer);
   return buffer->device_ptr;
 }
 
-void* iree_hal_cuda_buffer_host_pointer(const iree_hal_buffer_t* base_buffer) {
-  const iree_hal_cuda_buffer_t* buffer =
-      iree_hal_cuda_buffer_const_cast(base_buffer);
+void* iree_hal_cuda2_buffer_host_pointer(const iree_hal_buffer_t* base_buffer) {
+  const iree_hal_cuda2_buffer_t* buffer =
+      iree_hal_cuda2_buffer_const_cast(base_buffer);
   return buffer->host_ptr;
 }
 
-static const iree_hal_buffer_vtable_t iree_hal_cuda_buffer_vtable = {
+static const iree_hal_buffer_vtable_t iree_hal_cuda2_buffer_vtable = {
     .recycle = iree_hal_buffer_recycle,
-    .destroy = iree_hal_cuda_buffer_destroy,
-    .map_range = iree_hal_cuda_buffer_map_range,
-    .unmap_range = iree_hal_cuda_buffer_unmap_range,
-    .invalidate_range = iree_hal_cuda_buffer_invalidate_range,
-    .flush_range = iree_hal_cuda_buffer_flush_range,
+    .destroy = iree_hal_cuda2_buffer_destroy,
+    .map_range = iree_hal_cuda2_buffer_map_range,
+    .unmap_range = iree_hal_cuda2_buffer_unmap_range,
+    .invalidate_range = iree_hal_cuda2_buffer_invalidate_range,
+    .flush_range = iree_hal_cuda2_buffer_flush_range,
 };
diff --git a/experimental/cuda2/cuda_buffer.h b/experimental/cuda2/cuda_buffer.h
index 0b07e8ddea7f..57bde62aab19 100644
--- a/experimental/cuda2/cuda_buffer.h
+++ b/experimental/cuda2/cuda_buffer.h
@@ -1,54 +1,57 @@
-// Copyright 2021 The IREE Authors
+// Copyright 2023 The IREE Authors
 //
 // Licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#ifndef IREE_HAL_DRIVERS_CUDA_BUFFER_H_
-#define IREE_HAL_DRIVERS_CUDA_BUFFER_H_
+#ifndef EXPERIMENTAL_CUDA2_CUDA_BUFFER_H_
+#define EXPERIMENTAL_CUDA2_CUDA_BUFFER_H_
 
+#include "experimental/cuda2/cuda_headers.h"
 #include "iree/base/api.h"
 #include "iree/hal/api.h"
-#include "iree/hal/drivers/cuda/cuda_headers.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif  // __cplusplus
 
-typedef enum iree_hal_cuda_buffer_type_e {
-  // cuMemAlloc/cuMemAllocManaged + cuMemFree
+typedef enum iree_hal_cuda2_buffer_type_e {
+  // Device local buffer; allocated with
+  // cuMemAlloc/cuMemAllocManaged/cuMemAllocAsync, freed with cuMemFree.
   IREE_HAL_CUDA_BUFFER_TYPE_DEVICE = 1u << 0,
-  // cuMemHostAlloc + cuMemFreeHost
+  // Host local buffer; allocated with cuMemHostAlloc, freed with cuMemFreeHost.
   IREE_HAL_CUDA_BUFFER_TYPE_HOST = 1u << 1,
-  // cuMemHostRegister + cuMemHostUnregister
+  // Host local buffer; registered with cuMemHostRegister, unregistered with
+  // cuMemHostUnregister.
  IREE_HAL_CUDA_BUFFER_TYPE_HOST_REGISTERED = 1u << 2,
-} iree_hal_cuda_buffer_type_t;
+} iree_hal_cuda2_buffer_type_t;
 
 // Wraps a CUDA allocation in an iree_hal_buffer_t.
-iree_status_t iree_hal_cuda_buffer_wrap(
+iree_status_t iree_hal_cuda2_buffer_wrap(
     iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type,
     iree_hal_memory_access_t allowed_access,
     iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size,
     iree_device_size_t byte_offset, iree_device_size_t byte_length,
-    iree_hal_cuda_buffer_type_t buffer_type, CUdeviceptr device_ptr,
+    iree_hal_cuda2_buffer_type_t buffer_type, CUdeviceptr device_ptr,
     void* host_ptr, iree_hal_buffer_release_callback_t release_callback,
     iree_hal_buffer_t** out_buffer);
 
-// Returns the underlying CUDA buffer type.
-iree_hal_cuda_buffer_type_t iree_hal_cuda_buffer_type(
+// Returns the underlying CUDA buffer type of the given |buffer|.
+iree_hal_cuda2_buffer_type_t iree_hal_cuda2_buffer_type(
    const iree_hal_buffer_t* buffer);
 
-// Returns the CUDA base pointer for the given |buffer|.
-// This is the entire allocated_buffer and must be offset by the buffer
-// byte_offset and byte_length when used.
-CUdeviceptr iree_hal_cuda_buffer_device_pointer(
+// Returns the CUDA base device pointer for the given |buffer|.
+//
+// Note that this is the entire allocated_buffer and must be offset by the
+// buffer byte_offset and byte_length when used.
+CUdeviceptr iree_hal_cuda2_buffer_device_pointer(
     const iree_hal_buffer_t* buffer);
 
 // Returns the CUDA host pointer for the given |buffer|, if available.
-void* iree_hal_cuda_buffer_host_pointer(const iree_hal_buffer_t* buffer);
+void* iree_hal_cuda2_buffer_host_pointer(const iree_hal_buffer_t* buffer);
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
 
-#endif  // IREE_HAL_DRIVERS_CUDA_BUFFER_H_
+#endif  // EXPERIMENTAL_CUDA2_CUDA_BUFFER_H_