diff --git a/experimental/cuda2/CMakeLists.txt b/experimental/cuda2/CMakeLists.txt
index c104bcde8f4e..83b927b115b8 100644
--- a/experimental/cuda2/CMakeLists.txt
+++ b/experimental/cuda2/CMakeLists.txt
@@ -17,6 +17,10 @@ iree_cc_library(
     "api.h"
   SRCS
     "api.h"
+    "cuda_allocator.c"
+    "cuda_allocator.h"
+    "cuda_buffer.c"
+    "cuda_buffer.h"
     "cuda_driver.c"
   DEPS
     ::dynamic_symbols
diff --git a/experimental/cuda2/cuda_allocator.c b/experimental/cuda2/cuda_allocator.c
index 4d22a01fe4fe..4a0cddd8a41f 100644
--- a/experimental/cuda2/cuda_allocator.c
+++ b/experimental/cuda2/cuda_allocator.c
@@ -1,47 +1,63 @@
-// Copyright 2021 The IREE Authors
+// Copyright 2023 The IREE Authors
 //
 // Licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#include "iree/hal/drivers/cuda/cuda_allocator.h"
+#include "experimental/cuda2/cuda_allocator.h"
 
 #include <stddef.h>
 
+#include "experimental/cuda2/cuda_buffer.h"
+#include "experimental/cuda2/cuda_dynamic_symbols.h"
+#include "experimental/cuda2/cuda_status_util.h"
 #include "iree/base/api.h"
 #include "iree/base/tracing.h"
-#include "iree/hal/drivers/cuda/cuda_buffer.h"
-#include "iree/hal/drivers/cuda/dynamic_symbols.h"
-#include "iree/hal/drivers/cuda/status_util.h"
 
 #if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_ALLOCATION_TRACKING
-static const char* IREE_HAL_CUDA_ALLOCATOR_ID = "CUDA";
+static const char* IREE_HAL_CUDA_ALLOCATOR_ID = "CUDA2";
 #endif  // IREE_TRACING_FEATURE_ALLOCATION_TRACKING
 
-typedef struct iree_hal_cuda_allocator_t {
+typedef struct iree_hal_cuda2_allocator_t {
+  // Abstract resource used for injecting reference counting and vtable;
+  // must be at offset 0.
   iree_hal_resource_t resource;
+
+  // The device that this allocator allocates memory from.
   iree_hal_device_t* base_device;
-  iree_hal_cuda_context_wrapper_t* context;
   CUdevice device;
+
+  // The CUDA stream that allocations should be used in.
   CUstream stream;
+
+  const iree_hal_cuda2_dynamic_symbols_t* symbols;
+
+  iree_allocator_t host_allocator;
+
+  // Whether the GPU and CPU can concurrently access CUDA managed data in a
+  // coherent way. We would need to explicitly perform data migration and
+  // synchronization between GPU and CPU if not.
+  bool supports_concurrent_managed_access;
 
   IREE_STATISTICS(iree_hal_allocator_statistics_t statistics;)
-} iree_hal_cuda_allocator_t;
+} iree_hal_cuda2_allocator_t;
 
-static const iree_hal_allocator_vtable_t iree_hal_cuda_allocator_vtable;
+static const iree_hal_allocator_vtable_t iree_hal_cuda2_allocator_vtable;
 
-static iree_hal_cuda_allocator_t* iree_hal_cuda_allocator_cast(
+static iree_hal_cuda2_allocator_t* iree_hal_cuda2_allocator_cast(
     iree_hal_allocator_t* base_value) {
-  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda_allocator_vtable);
-  return (iree_hal_cuda_allocator_t*)base_value;
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda2_allocator_vtable);
+  return (iree_hal_cuda2_allocator_t*)base_value;
 }
 
-iree_status_t iree_hal_cuda_allocator_create(
-    iree_hal_device_t* base_device, iree_hal_cuda_context_wrapper_t* context,
-    CUdevice device, CUstream stream, iree_hal_allocator_t** out_allocator) {
+iree_status_t iree_hal_cuda2_allocator_create(
+    iree_hal_device_t* base_device,
+    const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols, CUdevice device,
+    CUstream stream, iree_allocator_t host_allocator,
+    iree_hal_allocator_t** out_allocator) {
   IREE_ASSERT_ARGUMENT(base_device);
-  IREE_ASSERT_ARGUMENT(context);
+  IREE_ASSERT_ARGUMENT(cuda_symbols);
+  IREE_ASSERT_ARGUMENT(out_allocator);
   IREE_TRACE_ZONE_BEGIN(z0);
 
   // To support device-local + host-visible memory we need concurrent managed
@@ -52,8 +68,8 @@ iree_status_t iree_hal_cuda_allocator_create(
   // buffers except for readback staging buffers.
   int supports_concurrent_managed_access = 0;
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
-      z0, CU_RESULT_TO_STATUS(
-              context->syms,
+      z0, IREE_CURESULT_TO_STATUS(
+              cuda_symbols,
               cuDeviceGetAttribute(
                   &supports_concurrent_managed_access,
                   CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, device),
@@ -65,16 +81,17 @@
               : "no CONCURRENT_MANAGED_ACCESS (expect slow accesses on "
                 "device-local + host-visible memory)");
 
-  iree_hal_cuda_allocator_t* allocator = NULL;
+  iree_hal_cuda2_allocator_t* allocator = NULL;
   iree_status_t status = iree_allocator_malloc(
-      context->host_allocator, sizeof(*allocator), (void**)&allocator);
+      host_allocator, sizeof(*allocator), (void**)&allocator);
   if (iree_status_is_ok(status)) {
-    iree_hal_resource_initialize(&iree_hal_cuda_allocator_vtable,
+    iree_hal_resource_initialize(&iree_hal_cuda2_allocator_vtable,
                                  &allocator->resource);
     allocator->base_device = base_device;
-    allocator->context = context;
    allocator->device = device;
    allocator->stream = stream;
+    allocator->symbols = cuda_symbols;
+    allocator->host_allocator = host_allocator;
    allocator->supports_concurrent_managed_access =
        supports_concurrent_managed_access != 0;
    *out_allocator = (iree_hal_allocator_t*)allocator;
@@ -84,47 +101,51 @@
   return status;
 }
 
-static void iree_hal_cuda_allocator_destroy(
+static void iree_hal_cuda2_allocator_destroy(
     iree_hal_allocator_t* IREE_RESTRICT base_allocator) {
-  iree_hal_cuda_allocator_t* allocator =
-      iree_hal_cuda_allocator_cast(base_allocator);
-  iree_allocator_t host_allocator = allocator->context->host_allocator;
+  IREE_ASSERT_ARGUMENT(base_allocator);
+  iree_hal_cuda2_allocator_t* allocator =
+      iree_hal_cuda2_allocator_cast(base_allocator);
   IREE_TRACE_ZONE_BEGIN(z0);
 
-  iree_allocator_free(host_allocator, allocator);
+  iree_allocator_free(allocator->host_allocator, allocator);
 
   IREE_TRACE_ZONE_END(z0);
 }
 
-static iree_allocator_t iree_hal_cuda_allocator_host_allocator(
+static iree_allocator_t iree_hal_cuda2_allocator_host_allocator(
     const iree_hal_allocator_t* IREE_RESTRICT base_allocator) {
-  iree_hal_cuda_allocator_t* allocator =
-      (iree_hal_cuda_allocator_t*)base_allocator;
-  return allocator->context->host_allocator;
+  iree_hal_cuda2_allocator_t* allocator =
+      (iree_hal_cuda2_allocator_t*)base_allocator;
+  return allocator->host_allocator;
 }
 
-static iree_status_t iree_hal_cuda_allocator_trim(
+static iree_status_t iree_hal_cuda2_allocator_trim(
     iree_hal_allocator_t* IREE_RESTRICT base_allocator) {
   return iree_ok_status();
 }
 
-static void iree_hal_cuda_allocator_query_statistics(
+static void iree_hal_cuda2_allocator_query_statistics(
     iree_hal_allocator_t* IREE_RESTRICT base_allocator,
     iree_hal_allocator_statistics_t* IREE_RESTRICT out_statistics) {
   IREE_STATISTICS({
-    iree_hal_cuda_allocator_t* allocator =
-        iree_hal_cuda_allocator_cast(base_allocator);
+    iree_hal_cuda2_allocator_t* allocator =
+        iree_hal_cuda2_allocator_cast(base_allocator);
     memcpy(out_statistics, &allocator->statistics, sizeof(*out_statistics));
   });
 }
 
-static iree_status_t iree_hal_cuda_allocator_query_memory_heaps(
+static iree_status_t iree_hal_cuda2_allocator_query_memory_heaps(
     iree_hal_allocator_t* IREE_RESTRICT base_allocator,
     iree_host_size_t capacity,
     iree_hal_allocator_memory_heap_t* IREE_RESTRICT heaps,
     iree_host_size_t* IREE_RESTRICT out_count) {
-  iree_hal_cuda_allocator_t* allocator =
-      iree_hal_cuda_allocator_cast(base_allocator);
+  IREE_ASSERT_ARGUMENT(base_allocator);
+  IREE_ASSERT_ARGUMENT(heaps);
+  IREE_ASSERT_ARGUMENT(out_count);
+
+  iree_hal_cuda2_allocator_t* allocator =
+      iree_hal_cuda2_allocator_cast(base_allocator);
 
   // TODO(benvanik): check CU_DEVICE_ATTRIBUTE_INTEGRATED and return a unified
   // set of heaps (likely still a cached and uncached, at minimum).
@@ -199,12 +220,12 @@ static iree_status_t iree_hal_cuda_allocator_query_memory_heaps(
 }
 
 static iree_hal_buffer_compatibility_t
-iree_hal_cuda_allocator_query_buffer_compatibility(
+iree_hal_cuda2_allocator_query_buffer_compatibility(
     iree_hal_allocator_t* IREE_RESTRICT base_allocator,
     iree_hal_buffer_params_t* IREE_RESTRICT params,
     iree_device_size_t* IREE_RESTRICT allocation_size) {
-  iree_hal_cuda_allocator_t* allocator =
-      iree_hal_cuda_allocator_cast(base_allocator);
+  iree_hal_cuda2_allocator_t* allocator =
+      iree_hal_cuda2_allocator_cast(base_allocator);
 
   // All buffers can be allocated on the heap.
   iree_hal_buffer_compatibility_t compatibility =
@@ -254,42 +275,46 @@ iree_hal_cuda_allocator_query_buffer_compatibility(
   return compatibility;
 }
 
-static void iree_hal_cuda_buffer_free(iree_hal_cuda_context_wrapper_t* context,
-                                      iree_hal_cuda_buffer_type_t buffer_type,
-                                      CUdeviceptr device_ptr, void* host_ptr) {
+static void iree_hal_cuda2_buffer_free(
+    const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols,
+    iree_hal_cuda2_buffer_type_t buffer_type, CUdeviceptr device_ptr,
+    void* host_ptr) {
   IREE_TRACE_ZONE_BEGIN(z0);
   switch (buffer_type) {
     case IREE_HAL_CUDA_BUFFER_TYPE_DEVICE: {
       IREE_TRACE_ZONE_APPEND_TEXT(z0, "cuMemFree");
-      CUDA_IGNORE_ERROR(context->syms, cuMemFree(device_ptr));
+      IREE_CUDA_IGNORE_ERROR(cuda_symbols, cuMemFree(device_ptr));
       break;
     }
     case IREE_HAL_CUDA_BUFFER_TYPE_HOST: {
       IREE_TRACE_ZONE_APPEND_TEXT(z0, "cuMemFreeHost");
-      CUDA_IGNORE_ERROR(context->syms, cuMemFreeHost(host_ptr));
+      IREE_CUDA_IGNORE_ERROR(cuda_symbols, cuMemFreeHost(host_ptr));
       break;
     }
     case IREE_HAL_CUDA_BUFFER_TYPE_HOST_REGISTERED: {
       IREE_TRACE_ZONE_APPEND_TEXT(z0, "cuMemHostUnregister");
-      CUDA_IGNORE_ERROR(context->syms, cuMemHostUnregister(host_ptr));
+      IREE_CUDA_IGNORE_ERROR(cuda_symbols, cuMemHostUnregister(host_ptr));
       break;
     }
   }
   IREE_TRACE_ZONE_END(z0);
 }
 
-static iree_status_t iree_hal_cuda_allocator_allocate_buffer(
+static iree_status_t iree_hal_cuda2_allocator_allocate_buffer(
     iree_hal_allocator_t* IREE_RESTRICT base_allocator,
     const iree_hal_buffer_params_t* IREE_RESTRICT params,
     iree_device_size_t allocation_size, iree_const_byte_span_t initial_data,
     iree_hal_buffer_t** IREE_RESTRICT out_buffer) {
-  iree_hal_cuda_allocator_t* allocator =
-      iree_hal_cuda_allocator_cast(base_allocator);
+  IREE_ASSERT_ARGUMENT(base_allocator);
+  IREE_ASSERT_ARGUMENT(params);
+  IREE_ASSERT_ARGUMENT(out_buffer);
+  iree_hal_cuda2_allocator_t* allocator =
+      iree_hal_cuda2_allocator_cast(base_allocator);
 
   // Coerce options into those required by the current device.
   iree_hal_buffer_params_t compat_params = *params;
   iree_hal_buffer_compatibility_t compatibility =
-      iree_hal_cuda_allocator_query_buffer_compatibility(
+      iree_hal_cuda2_allocator_query_buffer_compatibility(
           base_allocator, &compat_params, &allocation_size);
   if (!iree_all_bits_set(compatibility,
                          IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE)) {
@@ -315,26 +340,25 @@ static iree_status_t iree_hal_cuda_allocator_allocate_buffer(
   }
 
   iree_status_t status = iree_ok_status();
-  iree_hal_cuda_buffer_type_t buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE;
+  iree_hal_cuda2_buffer_type_t buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE;
   void* host_ptr = NULL;
   CUdeviceptr device_ptr = 0;
-  IREE_TRACE_ZONE_BEGIN_NAMED(z0, "iree_hal_cuda_buffer_allocate");
+  IREE_TRACE_ZONE_BEGIN_NAMED(z0, "iree_hal_cuda2_buffer_allocate");
   IREE_TRACE_ZONE_APPEND_VALUE(z0, allocation_size);
 
   if (iree_all_bits_set(compat_params.type,
                         IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL)) {
-    // Device local case.
     if (iree_all_bits_set(compat_params.type,
                           IREE_HAL_MEMORY_TYPE_HOST_VISIBLE)) {
+      // Device local + host visible.
       buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE;
-      status =
-          CU_RESULT_TO_STATUS(allocator->context->syms,
-                              cuMemAllocManaged(&device_ptr, allocation_size,
+      status = IREE_CURESULT_TO_STATUS(
+          allocator->symbols, cuMemAllocManaged(&device_ptr, allocation_size,
                                                 CU_MEM_ATTACH_GLOBAL));
       if (iree_status_is_ok(status) &&
          allocator->supports_concurrent_managed_access) {
-        // Prefetch the buffer on the GPU device.
-        status = CU_RESULT_TO_STATUS(
-            allocator->context->syms,
+        // Prefetch the buffer to the GPU stream.
+        status = IREE_CURESULT_TO_STATUS(
+            allocator->symbols,
             cuMemPrefetchAsync(device_ptr, allocation_size, allocator->device,
                                allocator->stream));
       }
@@ -342,22 +366,22 @@ static iree_status_t iree_hal_cuda_allocator_allocate_buffer(
     } else {
       // Device only.
       buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE;
-      status = CU_RESULT_TO_STATUS(allocator->context->syms,
-                                   cuMemAlloc(&device_ptr, allocation_size));
+      status = IREE_CURESULT_TO_STATUS(
+          allocator->symbols, cuMemAlloc(&device_ptr, allocation_size));
     }
   } else {
+    // Host local cases.
     buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_HOST;
     unsigned int flags = CU_MEMHOSTALLOC_DEVICEMAP;
     if (!iree_all_bits_set(compat_params.type,
                            IREE_HAL_MEMORY_TYPE_HOST_CACHED)) {
       flags |= CU_MEMHOSTALLOC_WRITECOMBINED;
     }
-    status =
-        CU_RESULT_TO_STATUS(allocator->context->syms,
-                            cuMemHostAlloc(&host_ptr, allocation_size, flags));
+    status = IREE_CURESULT_TO_STATUS(
+        allocator->symbols, cuMemHostAlloc(&host_ptr, allocation_size, flags));
     if (iree_status_is_ok(status)) {
-      status = CU_RESULT_TO_STATUS(
-          allocator->context->syms,
+      status = IREE_CURESULT_TO_STATUS(
+          allocator->symbols,
           cuMemHostGetDevicePointer(&device_ptr, host_ptr, /*flags=*/0));
     }
   }
@@ -365,7 +389,7 @@ static iree_status_t iree_hal_cuda_allocator_allocate_buffer(
   iree_hal_buffer_t* buffer = NULL;
   if (iree_status_is_ok(status)) {
-    status = iree_hal_cuda_buffer_wrap(
+    status = iree_hal_cuda2_buffer_wrap(
         base_allocator, compat_params.type, compat_params.access,
         compat_params.usage, allocation_size,
         /*byte_offset=*/0,
@@ -387,15 +411,15 @@ static iree_status_t iree_hal_cuda_allocator_allocate_buffer(
 
   if (iree_status_is_ok(status)) {
     IREE_TRACE_ALLOC_NAMED(IREE_HAL_CUDA_ALLOCATOR_ID,
-                           (void*)iree_hal_cuda_buffer_device_pointer(buffer),
+                           (void*)iree_hal_cuda2_buffer_device_pointer(buffer),
                            allocation_size);
     IREE_STATISTICS(iree_hal_allocator_statistics_record_alloc(
         &allocator->statistics, compat_params.type, allocation_size));
     *out_buffer = buffer;
   } else {
     if (!buffer) {
-      iree_hal_cuda_buffer_free(allocator->context, buffer_type, device_ptr,
-                                host_ptr);
+      iree_hal_cuda2_buffer_free(allocator->symbols, buffer_type, device_ptr,
+                                 host_ptr);
     } else {
       iree_hal_buffer_release(buffer);
     }
@@ -403,14 +427,16 @@
   return status;
 }
 
-static void iree_hal_cuda_allocator_deallocate_buffer(
+static void iree_hal_cuda2_allocator_deallocate_buffer(
     iree_hal_allocator_t* IREE_RESTRICT base_allocator,
     iree_hal_buffer_t* IREE_RESTRICT base_buffer) {
-  iree_hal_cuda_allocator_t* allocator =
-      iree_hal_cuda_allocator_cast(base_allocator);
+  IREE_ASSERT_ARGUMENT(base_allocator);
+  IREE_ASSERT_ARGUMENT(base_buffer);
+  iree_hal_cuda2_allocator_t* allocator =
+      iree_hal_cuda2_allocator_cast(base_allocator);
 
-  const iree_hal_cuda_buffer_type_t buffer_type =
-      iree_hal_cuda_buffer_type(base_buffer);
+  const iree_hal_cuda2_buffer_type_t buffer_type =
+      iree_hal_cuda2_buffer_type(base_buffer);
 
   // WARNING: we may be called from a random thread and need to ensure that we
   // have an active CUDA context. Unfortunately CUDA is CUDA and trying to
@@ -426,16 +452,16 @@
   // to silently ignore them: whatever the user tries to do next will fail in
   // the same way and if we were deallocating this buffer as part of a tear-down
   // on failure we don't want to end up dying during cleanup.
-  iree_hal_cuda_buffer_free(allocator->context, buffer_type,
-                            iree_hal_cuda_buffer_device_pointer(base_buffer),
-                            iree_hal_cuda_buffer_host_pointer(base_buffer));
+  iree_hal_cuda2_buffer_free(allocator->symbols, buffer_type,
+                             iree_hal_cuda2_buffer_device_pointer(base_buffer),
+                             iree_hal_cuda2_buffer_host_pointer(base_buffer));
 
   switch (buffer_type) {
     case IREE_HAL_CUDA_BUFFER_TYPE_DEVICE:
     case IREE_HAL_CUDA_BUFFER_TYPE_HOST: {
       IREE_TRACE_FREE_NAMED(
           IREE_HAL_CUDA_ALLOCATOR_ID,
-          (void*)iree_hal_cuda_buffer_device_pointer(base_buffer));
+          (void*)iree_hal_cuda2_buffer_device_pointer(base_buffer));
       IREE_STATISTICS(iree_hal_allocator_statistics_record_free(
           &allocator->statistics, iree_hal_buffer_memory_type(base_buffer),
           iree_hal_buffer_allocation_size(base_buffer)));
@@ -449,20 +475,24 @@
   iree_hal_buffer_destroy(base_buffer);
 }
 
-static iree_status_t iree_hal_cuda_allocator_import_buffer(
+static iree_status_t iree_hal_cuda2_allocator_import_buffer(
     iree_hal_allocator_t* IREE_RESTRICT base_allocator,
     const iree_hal_buffer_params_t* IREE_RESTRICT params,
     iree_hal_external_buffer_t* IREE_RESTRICT external_buffer,
     iree_hal_buffer_release_callback_t release_callback,
     iree_hal_buffer_t** IREE_RESTRICT out_buffer) {
-  iree_hal_cuda_allocator_t* allocator =
-      iree_hal_cuda_allocator_cast(base_allocator);
+  IREE_ASSERT_ARGUMENT(base_allocator);
+  IREE_ASSERT_ARGUMENT(params);
+  IREE_ASSERT_ARGUMENT(external_buffer);
+  IREE_ASSERT_ARGUMENT(out_buffer);
+  iree_hal_cuda2_allocator_t* allocator =
+      iree_hal_cuda2_allocator_cast(base_allocator);
 
   // Coerce options into those required by the current device.
   iree_hal_buffer_params_t compat_params = *params;
   iree_device_size_t allocation_size = external_buffer->size;
   iree_hal_buffer_compatibility_t compatibility =
-      iree_hal_cuda_allocator_query_buffer_compatibility(
+      iree_hal_cuda2_allocator_query_buffer_compatibility(
          base_allocator, &compat_params, &allocation_size);
   if (!iree_all_bits_set(compatibility,
                          IREE_HAL_BUFFER_COMPATIBILITY_IMPORTABLE)) {
@@ -488,7 +518,7 @@
   }
 
   iree_status_t status = iree_ok_status();
-  iree_hal_cuda_buffer_type_t buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE;
+  iree_hal_cuda2_buffer_type_t buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE;
   void* host_ptr = NULL;
   CUdeviceptr device_ptr = 0;
 
@@ -513,13 +543,13 @@
               IREE_HAL_BUFFER_USAGE_DISPATCH_IMAGE)) {
         register_flags = CU_MEMHOSTREGISTER_DEVICEMAP;
       }
-      status = CU_RESULT_TO_STATUS(
-          allocator->context->syms,
+      status = IREE_CURESULT_TO_STATUS(
+          allocator->symbols,
           cuMemHostRegister(host_ptr, external_buffer->size, register_flags),
           "cuMemHostRegister");
       if (iree_status_is_ok(status)) {
-        status = CU_RESULT_TO_STATUS(
-            allocator->context->syms,
+        status = IREE_CURESULT_TO_STATUS(
+            allocator->symbols,
            cuMemHostGetDevicePointer(&device_ptr, host_ptr, 0),
            "cuMemHostGetDevicePointer");
       }
@@ -528,18 +558,17 @@
     case IREE_HAL_EXTERNAL_BUFFER_TYPE_OPAQUE_FD:
     case IREE_HAL_EXTERNAL_BUFFER_TYPE_OPAQUE_WIN32:
       return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
-                              "handle-based imports not yet implemented");
+                              "unimplemented handle-based imports");
     default:
       return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
-                              "external buffer type not supported");
+                              "unimplemented external buffer type");
   }
 
   iree_hal_buffer_t* buffer = NULL;
   if (iree_status_is_ok(status)) {
-    status = iree_hal_cuda_buffer_wrap(
+    status = iree_hal_cuda2_buffer_wrap(
         base_allocator, compat_params.type, compat_params.access,
-        compat_params.usage, external_buffer->size,
-        /*byte_offset=*/0,
+        compat_params.usage, external_buffer->size, /*byte_offset=*/0,
         /*byte_length=*/external_buffer->size, buffer_type, device_ptr,
         host_ptr, release_callback, &buffer);
   }
 
@@ -548,8 +577,8 @@
     *out_buffer = buffer;
   } else {
     if (!buffer) {
-      iree_hal_cuda_buffer_free(allocator->context, buffer_type, device_ptr,
-                                host_ptr);
+      iree_hal_cuda2_buffer_free(allocator->symbols, buffer_type, device_ptr,
+                                 host_ptr);
     } else {
       iree_hal_buffer_release(buffer);
     }
@@ -557,26 +586,26 @@
   return status;
 }
 
-static iree_status_t iree_hal_cuda_allocator_export_buffer(
+static iree_status_t iree_hal_cuda2_allocator_export_buffer(
     iree_hal_allocator_t* IREE_RESTRICT base_allocator,
     iree_hal_buffer_t* IREE_RESTRICT buffer,
     iree_hal_external_buffer_type_t requested_type,
     iree_hal_external_buffer_flags_t requested_flags,
     iree_hal_external_buffer_t* IREE_RESTRICT out_external_buffer) {
-  return iree_make_status(IREE_STATUS_UNAVAILABLE,
-                          "exporting to external buffers not supported");
+  return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                          "unimplemented exporting to external buffers");
 }
 
-static const iree_hal_allocator_vtable_t iree_hal_cuda_allocator_vtable = {
-    .destroy = iree_hal_cuda_allocator_destroy,
-    .host_allocator = iree_hal_cuda_allocator_host_allocator,
-    .trim = iree_hal_cuda_allocator_trim,
-    .query_statistics = iree_hal_cuda_allocator_query_statistics,
-    .query_memory_heaps = iree_hal_cuda_allocator_query_memory_heaps,
+static const iree_hal_allocator_vtable_t iree_hal_cuda2_allocator_vtable = {
+    .destroy = iree_hal_cuda2_allocator_destroy,
+    .host_allocator = iree_hal_cuda2_allocator_host_allocator,
+    .trim = iree_hal_cuda2_allocator_trim,
+    .query_statistics = iree_hal_cuda2_allocator_query_statistics,
+    .query_memory_heaps = iree_hal_cuda2_allocator_query_memory_heaps,
     .query_buffer_compatibility =
-        iree_hal_cuda_allocator_query_buffer_compatibility,
-    .allocate_buffer = iree_hal_cuda_allocator_allocate_buffer,
-    .deallocate_buffer = iree_hal_cuda_allocator_deallocate_buffer,
-    .import_buffer = iree_hal_cuda_allocator_import_buffer,
-    .export_buffer = iree_hal_cuda_allocator_export_buffer,
+        iree_hal_cuda2_allocator_query_buffer_compatibility,
+    .allocate_buffer = iree_hal_cuda2_allocator_allocate_buffer,
+    .deallocate_buffer = iree_hal_cuda2_allocator_deallocate_buffer,
+    .import_buffer = iree_hal_cuda2_allocator_import_buffer,
+    .export_buffer = iree_hal_cuda2_allocator_export_buffer,
 };
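For readers unfamiliar with CUDA managed memory, the device-local + host-visible path above maps onto the raw driver-API sequence sketched below. This is a minimal standalone sketch, not part of the patch: it assumes device 0, a single stream, and reduces error handling to a macro, but uses only real driver entry points (cuDeviceGetAttribute, cuMemAllocManaged, cuMemPrefetchAsync).

// Minimal sketch of the managed-memory pattern the allocator relies on:
// query concurrent managed access, allocate managed memory, and prefetch
// it to the device so first-touch accesses on the GPU are fast.
#include <stdio.h>
#include <cuda.h>

#define CHECK_CU(expr)                                      \
  do {                                                      \
    CUresult result_ = (expr);                              \
    if (result_ != CUDA_SUCCESS) {                          \
      fprintf(stderr, "%s failed: %d\n", #expr, result_);   \
      return 1;                                             \
    }                                                       \
  } while (0)

int main(void) {
  CHECK_CU(cuInit(0));
  CUdevice device;
  CHECK_CU(cuDeviceGet(&device, 0));
  CUcontext context;
  CHECK_CU(cuCtxCreate(&context, 0, device));
  CUstream stream;
  CHECK_CU(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING));

  // Same attribute query the allocator performs at creation time.
  int concurrent_managed_access = 0;
  CHECK_CU(cuDeviceGetAttribute(&concurrent_managed_access,
                                CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS,
                                device));

  // Device-local + host-visible: managed memory, prefetched to the device
  // when the hardware supports coherent concurrent access.
  CUdeviceptr ptr;
  CHECK_CU(cuMemAllocManaged(&ptr, 1024, CU_MEM_ATTACH_GLOBAL));
  if (concurrent_managed_access) {
    CHECK_CU(cuMemPrefetchAsync(ptr, 1024, device, stream));
  }

  CHECK_CU(cuStreamSynchronize(stream));
  CHECK_CU(cuMemFree(ptr));
  CHECK_CU(cuStreamDestroy(stream));
  CHECK_CU(cuCtxDestroy(context));
  return 0;
}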
diff --git a/experimental/cuda2/cuda_allocator.h b/experimental/cuda2/cuda_allocator.h
index b2f272895a91..1a063e3cd7c1 100644
--- a/experimental/cuda2/cuda_allocator.h
+++ b/experimental/cuda2/cuda_allocator.h
@@ -1,28 +1,30 @@
-// Copyright 2021 The IREE Authors
+// Copyright 2023 The IREE Authors
 //
 // Licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#ifndef IREE_HAL_DRIVERS_CUDA_ALLOCATOR_H_
-#define IREE_HAL_DRIVERS_CUDA_ALLOCATOR_H_
+#ifndef EXPERIMENTAL_CUDA2_CUDA_ALLOCATOR_H_
+#define EXPERIMENTAL_CUDA2_CUDA_ALLOCATOR_H_
 
+#include "experimental/cuda2/cuda_dynamic_symbols.h"
 #include "iree/base/api.h"
 #include "iree/hal/api.h"
-#include "iree/hal/drivers/cuda/context_wrapper.h"
-#include "iree/hal/drivers/cuda/status_util.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif  // __cplusplus
 
-// Create a cuda allocator.
-iree_status_t iree_hal_cuda_allocator_create(
-    iree_hal_device_t* base_device, iree_hal_cuda_context_wrapper_t* context,
-    CUdevice device, CUstream stream, iree_hal_allocator_t** out_allocator);
+// Creates a CUDA allocator that allocates device memory from the given
+// |device| for use in the given |stream|.
+iree_status_t iree_hal_cuda2_allocator_create(
+    iree_hal_device_t* base_device,
+    const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols, CUdevice device,
+    CUstream stream, iree_allocator_t host_allocator,
+    iree_hal_allocator_t** out_allocator);
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
 
-#endif  // IREE_HAL_DRIVERS_CUDA_ALLOCATOR_H_
+#endif  // EXPERIMENTAL_CUDA2_CUDA_ALLOCATOR_H_
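A caller-side sketch of the new create signature, assuming a hypothetical device-setup path that has already loaded the dynamic symbols and created the context and stream; create_device_allocator is an illustrative name, not an IREE API:

#include "experimental/cuda2/cuda_allocator.h"

// Illustrative wrapper; a real device implementation would call this from
// its own creation path once symbols, context, and stream exist.
iree_status_t create_device_allocator(
    iree_hal_device_t* base_device,
    const iree_hal_cuda2_dynamic_symbols_t* symbols, CUdevice cu_device,
    CUstream cu_stream, iree_allocator_t host_allocator,
    iree_hal_allocator_t** out_allocator) {
  // Every CUDA call inside the allocator is routed through |symbols| and
  // every host heap allocation through |host_allocator|; there is no
  // context wrapper indirection anymore.
  return iree_hal_cuda2_allocator_create(base_device, symbols, cu_device,
                                         cu_stream, host_allocator,
                                         out_allocator);
}

Passing the symbol table and host allocator explicitly makes each dependency visible at the call site, which is the point of dropping the old iree_hal_cuda_context_wrapper_t.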
diff --git a/experimental/cuda2/cuda_buffer.c b/experimental/cuda2/cuda_buffer.c
index 3b9c9e13f9e0..d5d4a19ea4e3 100644
--- a/experimental/cuda2/cuda_buffer.c
+++ b/experimental/cuda2/cuda_buffer.c
@@ -1,10 +1,10 @@
-// Copyright 2021 The IREE Authors
+// Copyright 2023 The IREE Authors
 //
 // Licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#include "iree/hal/drivers/cuda/cuda_buffer.h"
+#include "experimental/cuda2/cuda_buffer.h"
 
 #include <stddef.h>
 #include <string.h>
@@ -13,34 +13,34 @@
 #include "iree/base/api.h"
 #include "iree/base/tracing.h"
 
-typedef struct iree_hal_cuda_buffer_t {
+typedef struct iree_hal_cuda2_buffer_t {
   iree_hal_buffer_t base;
-  iree_hal_cuda_buffer_type_t type;
+  iree_hal_cuda2_buffer_type_t type;
   void* host_ptr;
   CUdeviceptr device_ptr;
   iree_hal_buffer_release_callback_t release_callback;
-} iree_hal_cuda_buffer_t;
+} iree_hal_cuda2_buffer_t;
 
-static const iree_hal_buffer_vtable_t iree_hal_cuda_buffer_vtable;
+static const iree_hal_buffer_vtable_t iree_hal_cuda2_buffer_vtable;
 
-static iree_hal_cuda_buffer_t* iree_hal_cuda_buffer_cast(
+static iree_hal_cuda2_buffer_t* iree_hal_cuda2_buffer_cast(
     iree_hal_buffer_t* base_value) {
-  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda_buffer_vtable);
-  return (iree_hal_cuda_buffer_t*)base_value;
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda2_buffer_vtable);
+  return (iree_hal_cuda2_buffer_t*)base_value;
 }
 
-static const iree_hal_cuda_buffer_t* iree_hal_cuda_buffer_const_cast(
+static const iree_hal_cuda2_buffer_t* iree_hal_cuda2_buffer_const_cast(
     const iree_hal_buffer_t* base_value) {
-  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda_buffer_vtable);
-  return (const iree_hal_cuda_buffer_t*)base_value;
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda2_buffer_vtable);
+  return (const iree_hal_cuda2_buffer_t*)base_value;
 }
 
-iree_status_t iree_hal_cuda_buffer_wrap(
+iree_status_t iree_hal_cuda2_buffer_wrap(
     iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type,
     iree_hal_memory_access_t allowed_access,
     iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size,
     iree_device_size_t byte_offset, iree_device_size_t byte_length,
-    iree_hal_cuda_buffer_type_t buffer_type, CUdeviceptr device_ptr,
+    iree_hal_cuda2_buffer_type_t buffer_type, CUdeviceptr device_ptr,
     void* host_ptr, iree_hal_buffer_release_callback_t release_callback,
     iree_hal_buffer_t** out_buffer) {
   IREE_ASSERT_ARGUMENT(allocator);
@@ -49,14 +49,14 @@ iree_status_t iree_hal_cuda_buffer_wrap(
   iree_allocator_t host_allocator =
       iree_hal_allocator_host_allocator(allocator);
-  iree_hal_cuda_buffer_t* buffer = NULL;
+  iree_hal_cuda2_buffer_t* buffer = NULL;
   iree_status_t status =
       iree_allocator_malloc(host_allocator, sizeof(*buffer), (void**)&buffer);
   if (iree_status_is_ok(status)) {
     iree_hal_buffer_initialize(host_allocator, allocator, &buffer->base,
                                allocation_size, byte_offset, byte_length,
                                memory_type, allowed_access, allowed_usage,
-                               &iree_hal_cuda_buffer_vtable, &buffer->base);
+                               &iree_hal_cuda2_buffer_vtable, &buffer->base);
     buffer->type = buffer_type;
     buffer->host_ptr = host_ptr;
     buffer->device_ptr = device_ptr;
@@ -68,8 +68,8 @@ iree_status_t iree_hal_cuda_buffer_wrap(
   return status;
 }
 
-static void iree_hal_cuda_buffer_destroy(iree_hal_buffer_t* base_buffer) {
-  iree_hal_cuda_buffer_t* buffer = iree_hal_cuda_buffer_cast(base_buffer);
+static void iree_hal_cuda2_buffer_destroy(iree_hal_buffer_t* base_buffer) {
+  iree_hal_cuda2_buffer_t* buffer = iree_hal_cuda2_buffer_cast(base_buffer);
   iree_allocator_t host_allocator = base_buffer->host_allocator;
   IREE_TRACE_ZONE_BEGIN(z0);
   if (buffer->release_callback.fn) {
@@ -80,12 +80,14 @@ static void iree_hal_cuda_buffer_destroy(iree_hal_buffer_t* base_buffer) {
   IREE_TRACE_ZONE_END(z0);
 }
 
-static iree_status_t iree_hal_cuda_buffer_map_range(
+static iree_status_t iree_hal_cuda2_buffer_map_range(
     iree_hal_buffer_t* base_buffer, iree_hal_mapping_mode_t mapping_mode,
     iree_hal_memory_access_t memory_access,
     iree_device_size_t local_byte_offset, iree_device_size_t local_byte_length,
     iree_hal_buffer_mapping_t* mapping) {
-  iree_hal_cuda_buffer_t* buffer = iree_hal_cuda_buffer_cast(base_buffer);
+  IREE_ASSERT_ARGUMENT(base_buffer);
+  IREE_ASSERT_ARGUMENT(mapping);
+  iree_hal_cuda2_buffer_t* buffer = iree_hal_cuda2_buffer_cast(base_buffer);
 
   // TODO(benvanik): add upload/download for unmapped buffers.
   IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_memory_type(
@@ -110,52 +112,52 @@ static iree_status_t iree_hal_cuda_buffer_map_range(
   return iree_ok_status();
 }
 
-static iree_status_t iree_hal_cuda_buffer_unmap_range(
+static iree_status_t iree_hal_cuda2_buffer_unmap_range(
     iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset,
     iree_device_size_t local_byte_length, iree_hal_buffer_mapping_t* mapping) {
-  // Nothing to do (today).
+  // Nothing to do today.
   return iree_ok_status();
 }
 
-static iree_status_t iree_hal_cuda_buffer_invalidate_range(
+static iree_status_t iree_hal_cuda2_buffer_invalidate_range(
     iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset,
     iree_device_size_t local_byte_length) {
-  // Nothing to do.
+  // Nothing to do today.
   return iree_ok_status();
 }
 
-static iree_status_t iree_hal_cuda_buffer_flush_range(
+static iree_status_t iree_hal_cuda2_buffer_flush_range(
     iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset,
     iree_device_size_t local_byte_length) {
-  // Nothing to do.
+  // Nothing to do today.
   return iree_ok_status();
 }
 
-iree_hal_cuda_buffer_type_t iree_hal_cuda_buffer_type(
+iree_hal_cuda2_buffer_type_t iree_hal_cuda2_buffer_type(
     const iree_hal_buffer_t* base_buffer) {
-  const iree_hal_cuda_buffer_t* buffer =
-      iree_hal_cuda_buffer_const_cast(base_buffer);
+  const iree_hal_cuda2_buffer_t* buffer =
+      iree_hal_cuda2_buffer_const_cast(base_buffer);
   return buffer->type;
 }
 
-CUdeviceptr iree_hal_cuda_buffer_device_pointer(
+CUdeviceptr iree_hal_cuda2_buffer_device_pointer(
     const iree_hal_buffer_t* base_buffer) {
-  const iree_hal_cuda_buffer_t* buffer =
-      iree_hal_cuda_buffer_const_cast(base_buffer);
+  const iree_hal_cuda2_buffer_t* buffer =
+      iree_hal_cuda2_buffer_const_cast(base_buffer);
   return buffer->device_ptr;
 }
 
-void* iree_hal_cuda_buffer_host_pointer(const iree_hal_buffer_t* base_buffer) {
-  const iree_hal_cuda_buffer_t* buffer =
-      iree_hal_cuda_buffer_const_cast(base_buffer);
+void* iree_hal_cuda2_buffer_host_pointer(const iree_hal_buffer_t* base_buffer) {
+  const iree_hal_cuda2_buffer_t* buffer =
+      iree_hal_cuda2_buffer_const_cast(base_buffer);
   return buffer->host_ptr;
 }
 
-static const iree_hal_buffer_vtable_t iree_hal_cuda_buffer_vtable = {
+static const iree_hal_buffer_vtable_t iree_hal_cuda2_buffer_vtable = {
     .recycle = iree_hal_buffer_recycle,
-    .destroy = iree_hal_cuda_buffer_destroy,
-    .map_range = iree_hal_cuda_buffer_map_range,
-    .unmap_range = iree_hal_cuda_buffer_unmap_range,
-    .invalidate_range = iree_hal_cuda_buffer_invalidate_range,
-    .flush_range = iree_hal_cuda_buffer_flush_range,
+    .destroy = iree_hal_cuda2_buffer_destroy,
+    .map_range = iree_hal_cuda2_buffer_map_range,
+    .unmap_range = iree_hal_cuda2_buffer_unmap_range,
+    .invalidate_range = iree_hal_cuda2_buffer_invalidate_range,
+    .flush_range = iree_hal_cuda2_buffer_flush_range,
 };
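The IREE_HAL_CUDA_BUFFER_TYPE_HOST_REGISTERED type documented in cuda_buffer.h below corresponds to the pinned-host-memory pattern in the driver API, which the import path in cuda_allocator.c above follows. A minimal standalone sketch of that pattern (not IREE code; register_host_memory is an illustrative name):

#include <cuda.h>

// Pins |host_ptr| and resolves the device-visible alias for it. The inverse
// operation (cuMemHostUnregister) is what the allocator issues when freeing
// IREE_HAL_CUDA_BUFFER_TYPE_HOST_REGISTERED buffers.
static CUresult register_host_memory(void* host_ptr, size_t size,
                                     CUdeviceptr* out_device_ptr) {
  CUresult result =
      cuMemHostRegister(host_ptr, size, CU_MEMHOSTREGISTER_DEVICEMAP);
  if (result != CUDA_SUCCESS) return result;
  result = cuMemHostGetDevicePointer(out_device_ptr, host_ptr, /*Flags=*/0);
  // Roll back the registration if the device mapping cannot be resolved.
  if (result != CUDA_SUCCESS) cuMemHostUnregister(host_ptr);
  return result;
}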
diff --git a/experimental/cuda2/cuda_buffer.h b/experimental/cuda2/cuda_buffer.h
index 0b07e8ddea7f..57bde62aab19 100644
--- a/experimental/cuda2/cuda_buffer.h
+++ b/experimental/cuda2/cuda_buffer.h
@@ -1,54 +1,57 @@
-// Copyright 2021 The IREE Authors
+// Copyright 2023 The IREE Authors
 //
 // Licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#ifndef IREE_HAL_DRIVERS_CUDA_BUFFER_H_
-#define IREE_HAL_DRIVERS_CUDA_BUFFER_H_
+#ifndef EXPERIMENTAL_CUDA2_CUDA_BUFFER_H_
+#define EXPERIMENTAL_CUDA2_CUDA_BUFFER_H_
 
+#include "experimental/cuda2/cuda_headers.h"
 #include "iree/base/api.h"
 #include "iree/hal/api.h"
-#include "iree/hal/drivers/cuda/cuda_headers.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif  // __cplusplus
 
-typedef enum iree_hal_cuda_buffer_type_e {
-  // cuMemAlloc/cuMemAllocManaged + cuMemFree
+typedef enum iree_hal_cuda2_buffer_type_e {
+  // Device local buffer; allocated with
+  // cuMemAlloc/cuMemAllocManaged/cuMemAllocAsync, freed with cuMemFree.
   IREE_HAL_CUDA_BUFFER_TYPE_DEVICE = 1u << 0,
-  // cuMemHostAlloc + cuMemFreeHost
+  // Host local buffer; allocated with cuMemHostAlloc, freed with
+  // cuMemFreeHost.
   IREE_HAL_CUDA_BUFFER_TYPE_HOST = 1u << 1,
-  // cuMemHostRegister + cuMemHostUnregister
+  // Host local buffer; registered with cuMemHostRegister, freed with
+  // cuMemHostUnregister.
   IREE_HAL_CUDA_BUFFER_TYPE_HOST_REGISTERED = 1u << 2,
-} iree_hal_cuda_buffer_type_t;
+} iree_hal_cuda2_buffer_type_t;
 
 // Wraps a CUDA allocation in an iree_hal_buffer_t.
-iree_status_t iree_hal_cuda_buffer_wrap(
+iree_status_t iree_hal_cuda2_buffer_wrap(
     iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type,
     iree_hal_memory_access_t allowed_access,
     iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size,
     iree_device_size_t byte_offset, iree_device_size_t byte_length,
-    iree_hal_cuda_buffer_type_t buffer_type, CUdeviceptr device_ptr,
+    iree_hal_cuda2_buffer_type_t buffer_type, CUdeviceptr device_ptr,
     void* host_ptr, iree_hal_buffer_release_callback_t release_callback,
     iree_hal_buffer_t** out_buffer);
 
-// Returns the underlying CUDA buffer type.
-iree_hal_cuda_buffer_type_t iree_hal_cuda_buffer_type(
+// Returns the underlying CUDA buffer type of the given |buffer|.
+iree_hal_cuda2_buffer_type_t iree_hal_cuda2_buffer_type(
     const iree_hal_buffer_t* buffer);
 
-// Returns the CUDA base pointer for the given |buffer|.
-// This is the entire allocated_buffer and must be offset by the buffer
-// byte_offset and byte_length when used.
-CUdeviceptr iree_hal_cuda_buffer_device_pointer(
+// Returns the CUDA base device pointer for the given |buffer|.
+//
+// Note that this is the entire allocated_buffer and must be offset by the
+// buffer byte_offset and byte_length when used.
+CUdeviceptr iree_hal_cuda2_buffer_device_pointer(
    const iree_hal_buffer_t* buffer);
 
 // Returns the CUDA host pointer for the given |buffer|, if available.
-void* iree_hal_cuda_buffer_host_pointer(const iree_hal_buffer_t* buffer);
+void* iree_hal_cuda2_buffer_host_pointer(const iree_hal_buffer_t* buffer);
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
 
-#endif  // IREE_HAL_DRIVERS_CUDA_BUFFER_H_
+#endif  // EXPERIMENTAL_CUDA2_CUDA_BUFFER_H_
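To close, a hypothetical sketch of using iree_hal_cuda2_buffer_wrap to adopt an externally allocated device pointer; the release callback fires when the wrapping buffer is destroyed so the external owner can reclaim the memory. Names outside the IREE/CUDA APIs (wrap_external_device_memory, my_release_fn) are illustrative, and the memory/access/usage flags shown are one plausible choice rather than a requirement:

#include "experimental/cuda2/cuda_buffer.h"
#include "iree/hal/api.h"

// Invoked once the last reference to the wrapping buffer is released.
static void my_release_fn(void* user_data, iree_hal_buffer_t* buffer) {
  // Return the device memory to whatever pool or API originally allocated it.
}

iree_status_t wrap_external_device_memory(
    iree_hal_allocator_t* device_allocator, CUdeviceptr device_ptr,
    iree_device_size_t size, iree_hal_buffer_t** out_buffer) {
  iree_hal_buffer_release_callback_t release_callback = {
      /*.fn=*/my_release_fn,
      /*.user_data=*/NULL,
  };
  return iree_hal_cuda2_buffer_wrap(
      device_allocator, IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL,
      IREE_HAL_MEMORY_ACCESS_ALL, IREE_HAL_BUFFER_USAGE_DEFAULT,
      /*allocation_size=*/size, /*byte_offset=*/0, /*byte_length=*/size,
      IREE_HAL_CUDA_BUFFER_TYPE_DEVICE, device_ptr, /*host_ptr=*/NULL,
      release_callback, out_buffer);
}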