diff --git a/experimental/cuda2/CMakeLists.txt b/experimental/cuda2/CMakeLists.txt index c104bcde8f4e..ca1959516917 100644 --- a/experimental/cuda2/CMakeLists.txt +++ b/experimental/cuda2/CMakeLists.txt @@ -17,7 +17,13 @@ iree_cc_library( "api.h" SRCS "api.h" + "cuda_allocator.c" + "cuda_allocator.h" + "cuda_buffer.c" + "cuda_buffer.h" "cuda_driver.c" + "memory_pools.c" + "memory_pools.h" DEPS ::dynamic_symbols iree::base diff --git a/experimental/cuda2/api.h b/experimental/cuda2/api.h index 565fd50ca972..a56b656b2b7c 100644 --- a/experimental/cuda2/api.h +++ b/experimental/cuda2/api.h @@ -16,6 +16,30 @@ extern "C" { #endif // __cplusplus +//===----------------------------------------------------------------------===// +// iree_hal_cuda_device_t +//===----------------------------------------------------------------------===// + +// Parameters defining a CUmemoryPool. +typedef struct iree_hal_cuda2_memory_pool_params_t { + // Minimum number of bytes to keep in the pool when trimming with + // iree_hal_device_trim. + uint64_t minimum_capacity; + // Soft maximum number of bytes to keep in the pool. + // When more than this is allocated the extra will be freed at the next + // device synchronization in order to remain under the threshold. + uint64_t release_threshold; + // TODO: per-device access permissions array. +} iree_hal_cuda2_memory_pool_params_t; + +// Parameters for each CUmemoryPool used for queue-ordered allocations. +typedef struct iree_hal_cuda2_memory_pooling_params_t { + // Used exclusively for DEVICE_LOCAL allocations. + iree_hal_cuda2_memory_pool_params_t device_local; + // Used for any host-visible/host-local memory types. + iree_hal_cuda2_memory_pool_params_t other; +} iree_hal_cuda2_memory_pooling_params_t; + //===----------------------------------------------------------------------===// // iree_hal_cuda2_driver_t //===----------------------------------------------------------------------===// diff --git a/experimental/cuda2/cuda_allocator.c b/experimental/cuda2/cuda_allocator.c new file mode 100644 index 000000000000..5ec9a396af32 --- /dev/null +++ b/experimental/cuda2/cuda_allocator.c @@ -0,0 +1,623 @@ +// Copyright 2023 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "experimental/cuda2/cuda_allocator.h" + +#include + +#include "experimental/cuda2/cuda_buffer.h" +#include "experimental/cuda2/cuda_dynamic_symbols.h" +#include "experimental/cuda2/cuda_status_util.h" +#include "iree/base/api.h" +#include "iree/base/tracing.h" + +#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_ALLOCATION_TRACKING +static const char* IREE_HAL_CUDA_ALLOCATOR_ID = "CUDA2 unpooled"; +#endif // IREE_TRACING_FEATURE_ALLOCATION_TRACKING + +typedef struct iree_hal_cuda2_allocator_t { + // Abstract resource used for injecting reference counting and vtable; + // must be at offset 0. + iree_hal_resource_t resource; + + // The device that this allocator allocates memory from. + iree_hal_device_t* base_device; + CUdevice device; + + // The CUDA stream that allocations should be used in. + CUstream stream; + + iree_hal_cuda2_memory_pools_t* pools; + + const iree_hal_cuda2_dynamic_symbols_t* symbols; + + iree_allocator_t host_allocator; + + // Whether the GPU and CPU can concurrently access CUDA managed data in a + // coherent way. We would need to explicitly perform flushing and invalidation + // between GPU and CPU if not. 
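+  // Mirrors the CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS device
+  // attribute queried when the allocator is created.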
+ bool supports_concurrent_managed_access; + + IREE_STATISTICS(iree_hal_allocator_statistics_t statistics;) +} iree_hal_cuda2_allocator_t; + +static const iree_hal_allocator_vtable_t iree_hal_cuda2_allocator_vtable; + +static iree_hal_cuda2_allocator_t* iree_hal_cuda2_allocator_cast( + iree_hal_allocator_t* base_value) { + IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda2_allocator_vtable); + return (iree_hal_cuda2_allocator_t*)base_value; +} + +iree_status_t iree_hal_cuda2_allocator_create( + iree_hal_device_t* base_device, + const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols, CUdevice device, + CUstream stream, iree_hal_cuda2_memory_pools_t* pools, + iree_allocator_t host_allocator, iree_hal_allocator_t** out_allocator) { + IREE_ASSERT_ARGUMENT(base_device); + IREE_ASSERT_ARGUMENT(cuda_symbols); + IREE_ASSERT_ARGUMENT(pools); + IREE_ASSERT_ARGUMENT(out_allocator); + IREE_TRACE_ZONE_BEGIN(z0); + + // To support device-local + host-visible memory we need concurrent managed + // access indicating that the host and devices can concurrently access the + // device memory. If we don't have this feature then we fall back to forcing + // all device-local + host-visible memory into host-local + device-visible + // page-locked memory. The compiler tries to avoid this for high-traffic + // buffers except for readback staging buffers. + int supports_concurrent_managed_access = 0; + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, IREE_CURESULT_TO_STATUS( + cuda_symbols, + cuDeviceGetAttribute( + &supports_concurrent_managed_access, + CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, device), + "cuDeviceGetAttribute")); + + IREE_TRACE_ZONE_APPEND_TEXT( + z0, supports_concurrent_managed_access + ? "has CONCURRENT_MANAGED_ACCESS" + : "no CONCURRENT_MANAGED_ACCESS (expect slow accesses on " + "device-local + host-visible memory)"); + + iree_hal_cuda2_allocator_t* allocator = NULL; + iree_status_t status = iree_allocator_malloc( + host_allocator, sizeof(*allocator), (void**)&allocator); + if (iree_status_is_ok(status)) { + iree_hal_resource_initialize(&iree_hal_cuda2_allocator_vtable, + &allocator->resource); + allocator->base_device = base_device; + allocator->device = device; + allocator->stream = stream; + allocator->pools = pools; + allocator->symbols = cuda_symbols; + allocator->host_allocator = host_allocator; + allocator->supports_concurrent_managed_access = + supports_concurrent_managed_access != 0; + *out_allocator = (iree_hal_allocator_t*)allocator; + } + + IREE_TRACE_ZONE_END(z0); + return status; +} + +static void iree_hal_cuda2_allocator_destroy( + iree_hal_allocator_t* IREE_RESTRICT base_allocator) { + IREE_ASSERT_ARGUMENT(base_allocator); + iree_hal_cuda2_allocator_t* allocator = + iree_hal_cuda2_allocator_cast(base_allocator); + IREE_TRACE_ZONE_BEGIN(z0); + + iree_allocator_free(allocator->host_allocator, allocator); + + IREE_TRACE_ZONE_END(z0); +} + +static iree_allocator_t iree_hal_cuda2_allocator_host_allocator( + const iree_hal_allocator_t* IREE_RESTRICT base_allocator) { + iree_hal_cuda2_allocator_t* allocator = + (iree_hal_cuda2_allocator_t*)base_allocator; + return allocator->host_allocator; +} + +static iree_status_t iree_hal_cuda2_allocator_trim( + iree_hal_allocator_t* IREE_RESTRICT base_allocator) { + return iree_ok_status(); +} + +static void iree_hal_cuda2_allocator_query_statistics( + iree_hal_allocator_t* IREE_RESTRICT base_allocator, + iree_hal_allocator_statistics_t* IREE_RESTRICT out_statistics) { + IREE_STATISTICS({ + iree_hal_cuda2_allocator_t* allocator = + 
iree_hal_cuda2_allocator_cast(base_allocator); + memcpy(out_statistics, &allocator->statistics, sizeof(*out_statistics)); + iree_hal_cuda2_memory_pools_merge_statistics(allocator->pools, + out_statistics); + }); +} + +static iree_status_t iree_hal_cuda2_allocator_query_memory_heaps( + iree_hal_allocator_t* IREE_RESTRICT base_allocator, + iree_host_size_t capacity, + iree_hal_allocator_memory_heap_t* IREE_RESTRICT heaps, + iree_host_size_t* IREE_RESTRICT out_count) { + IREE_ASSERT_ARGUMENT(base_allocator); + IREE_ASSERT_ARGUMENT(heaps); + IREE_ASSERT_ARGUMENT(out_count); + + iree_hal_cuda2_allocator_t* allocator = + iree_hal_cuda2_allocator_cast(base_allocator); + + // TODO(benvanik): check CU_DEVICE_ATTRIBUTE_INTEGRATED and return a unified + // set of heaps (likely still a cached and uncached, at minimum). + iree_host_size_t count = 3; + if (allocator->supports_concurrent_managed_access) { + ++count; // device-local | host-visible + } + if (out_count) *out_count = count; + if (capacity < count) { + // NOTE: lightweight as this is hit in normal pre-sizing usage. + return iree_status_from_code(IREE_STATUS_OUT_OF_RANGE); + } + + // Don't think there's a query for these. + // Max allocation size may be much smaller in certain memory types such as + // page-locked memory and it'd be good to enforce that. + const iree_device_size_t max_allocation_size = ~(iree_device_size_t)0; + const iree_device_size_t min_alignment = 64; + + int i = 0; + + // Device-local memory (dispatch resources): + heaps[i++] = (iree_hal_allocator_memory_heap_t){ + .type = IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL, + .allowed_usage = + IREE_HAL_BUFFER_USAGE_TRANSFER | IREE_HAL_BUFFER_USAGE_DISPATCH, + .max_allocation_size = max_allocation_size, + .min_alignment = min_alignment, + }; + + if (allocator->supports_concurrent_managed_access) { + // Device-local managed memory with host mapping support: + heaps[i++] = (iree_hal_allocator_memory_heap_t){ + .type = IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL | + IREE_HAL_MEMORY_TYPE_HOST_VISIBLE | + IREE_HAL_MEMORY_TYPE_HOST_COHERENT, + .allowed_usage = IREE_HAL_BUFFER_USAGE_TRANSFER | + IREE_HAL_BUFFER_USAGE_DISPATCH | + IREE_HAL_BUFFER_USAGE_MAPPING, + .max_allocation_size = max_allocation_size, + .min_alignment = min_alignment, + }; + } + + // Write-combined page-locked host-local memory (upload): + heaps[i++] = (iree_hal_allocator_memory_heap_t){ + .type = IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE | + IREE_HAL_MEMORY_TYPE_HOST_LOCAL | + IREE_HAL_MEMORY_TYPE_HOST_COHERENT, + .allowed_usage = IREE_HAL_BUFFER_USAGE_TRANSFER | + IREE_HAL_BUFFER_USAGE_DISPATCH | + IREE_HAL_BUFFER_USAGE_MAPPING, + .max_allocation_size = max_allocation_size, + .min_alignment = min_alignment, + }; + + // Cached page-locked host-local memory (download): + heaps[i++] = (iree_hal_allocator_memory_heap_t){ + .type = IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE | + IREE_HAL_MEMORY_TYPE_HOST_LOCAL | + IREE_HAL_MEMORY_TYPE_HOST_COHERENT | + IREE_HAL_MEMORY_TYPE_HOST_CACHED, + .allowed_usage = IREE_HAL_BUFFER_USAGE_TRANSFER | + IREE_HAL_BUFFER_USAGE_DISPATCH | + IREE_HAL_BUFFER_USAGE_MAPPING, + .max_allocation_size = max_allocation_size, + .min_alignment = min_alignment, + }; + + IREE_ASSERT(i == count); + return iree_ok_status(); +} + +static iree_hal_buffer_compatibility_t +iree_hal_cuda2_allocator_query_buffer_compatibility( + iree_hal_allocator_t* IREE_RESTRICT base_allocator, + iree_hal_buffer_params_t* IREE_RESTRICT params, + iree_device_size_t* IREE_RESTRICT allocation_size) { + iree_hal_cuda2_allocator_t* allocator = + 
iree_hal_cuda2_allocator_cast(base_allocator); + + // All buffers can be allocated on the heap. + iree_hal_buffer_compatibility_t compatibility = + IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE; + + // Buffers are importable in CUDA under most cases, though performance may + // vary wildly. We don't fully verify that the buffer parameters are + // self-consistent and just look at whether we can get a device pointer. + if (iree_all_bits_set(params->type, IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE)) { + compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_IMPORTABLE; + } + + // Buffers can only be used on the queue if they are device visible. + if (iree_all_bits_set(params->type, IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE)) { + if (iree_any_bit_set(params->usage, IREE_HAL_BUFFER_USAGE_TRANSFER)) { + compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_TRANSFER; + } + if (iree_any_bit_set(params->usage, + IREE_HAL_BUFFER_USAGE_DISPATCH_STORAGE)) { + compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_DISPATCH; + } + } + + // If concurrent managed access is not supported then make device-local + + // host-visible allocations fall back to host-local + device-visible + // page-locked memory. This will be significantly slower for the device to + // access but the compiler only uses this type for readback staging buffers + // and it's better to function than function fast. + if (!allocator->supports_concurrent_managed_access && + iree_all_bits_set(params->type, IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL | + IREE_HAL_MEMORY_TYPE_HOST_VISIBLE)) { + compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_LOW_PERFORMANCE; + params->type &= ~(IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL | + IREE_HAL_MEMORY_TYPE_HOST_VISIBLE); + params->type |= + IREE_HAL_MEMORY_TYPE_HOST_LOCAL | IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE; + } + + // We are now optimal. + params->type &= ~IREE_HAL_MEMORY_TYPE_OPTIMAL; + + // Guard against the corner case where the requested buffer size is 0. The + // application is unlikely to do anything when requesting a 0-byte buffer; but + // it can happen in real world use cases. So we should at least not crash. 
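+  // CUDA rejects zero-byte allocations (cuMemAlloc and friends return
+  // CUDA_ERROR_INVALID_VALUE), so bump the size to a small non-zero value.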
+ if (*allocation_size == 0) *allocation_size = 4; + + return compatibility; +} + +static void iree_hal_cuda2_buffer_free( + const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols, + iree_hal_cuda2_buffer_type_t buffer_type, CUdeviceptr device_ptr, + void* host_ptr) { + IREE_TRACE_ZONE_BEGIN(z0); + switch (buffer_type) { + case IREE_HAL_CUDA_BUFFER_TYPE_DEVICE: { + IREE_TRACE_ZONE_APPEND_TEXT(z0, "cuMemFree"); + IREE_CUDA_IGNORE_ERROR(cuda_symbols, cuMemFree(device_ptr)); + break; + } + case IREE_HAL_CUDA_BUFFER_TYPE_HOST: { + IREE_TRACE_ZONE_APPEND_TEXT(z0, "cuMemFreeHost"); + IREE_CUDA_IGNORE_ERROR(cuda_symbols, cuMemFreeHost(host_ptr)); + break; + } + case IREE_HAL_CUDA_BUFFER_TYPE_HOST_REGISTERED: { + IREE_TRACE_ZONE_APPEND_TEXT(z0, "cuMemHostUnregister"); + IREE_CUDA_IGNORE_ERROR(cuda_symbols, cuMemHostUnregister(host_ptr)); + break; + } + case IREE_HAL_CUDA_BUFFER_TYPE_ASYNC: { + IREE_TRACE_ZONE_APPEND_TEXT(z0, "(ignored; async)"); + break; + } + } + IREE_TRACE_ZONE_END(z0); +} + +static iree_status_t iree_hal_cuda2_allocator_allocate_buffer( + iree_hal_allocator_t* IREE_RESTRICT base_allocator, + const iree_hal_buffer_params_t* IREE_RESTRICT params, + iree_device_size_t allocation_size, iree_const_byte_span_t initial_data, + iree_hal_buffer_t** IREE_RESTRICT out_buffer) { + IREE_ASSERT_ARGUMENT(base_allocator); + IREE_ASSERT_ARGUMENT(params); + IREE_ASSERT_ARGUMENT(out_buffer); + iree_hal_cuda2_allocator_t* allocator = + iree_hal_cuda2_allocator_cast(base_allocator); + + // Coerce options into those required by the current device. + iree_hal_buffer_params_t compat_params = *params; + iree_hal_buffer_compatibility_t compatibility = + iree_hal_cuda2_allocator_query_buffer_compatibility( + base_allocator, &compat_params, &allocation_size); + if (!iree_all_bits_set(compatibility, + IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE)) { +#if IREE_STATUS_MODE + iree_bitfield_string_temp_t temp0, temp1, temp2; + iree_string_view_t memory_type_str = + iree_hal_memory_type_format(params->type, &temp0); + iree_string_view_t usage_str = + iree_hal_buffer_usage_format(params->usage, &temp1); + iree_string_view_t compatibility_str = + iree_hal_buffer_compatibility_format(compatibility, &temp2); + return iree_make_status( + IREE_STATUS_INVALID_ARGUMENT, + "allocator cannot allocate a buffer with the given parameters; " + "memory_type=%.*s, usage=%.*s, compatibility=%.*s", + (int)memory_type_str.size, memory_type_str.data, (int)usage_str.size, + usage_str.data, (int)compatibility_str.size, compatibility_str.data); +#else + return iree_make_status( + IREE_STATUS_INVALID_ARGUMENT, + "allocator cannot allocate a buffer with the given parameters"); +#endif // IREE_STATUS_MODE + } + + iree_status_t status = iree_ok_status(); + iree_hal_cuda2_buffer_type_t buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE; + void* host_ptr = NULL; + CUdeviceptr device_ptr = 0; + IREE_TRACE_ZONE_BEGIN_NAMED(z0, "iree_hal_cuda2_buffer_allocate"); + IREE_TRACE_ZONE_APPEND_VALUE(z0, allocation_size); + if (iree_all_bits_set(compat_params.type, + IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL)) { + if (iree_all_bits_set(compat_params.type, + IREE_HAL_MEMORY_TYPE_HOST_VISIBLE)) { + // Device local + host visible. + buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE; + status = IREE_CURESULT_TO_STATUS( + allocator->symbols, cuMemAllocManaged(&device_ptr, allocation_size, + CU_MEM_ATTACH_GLOBAL)); + if (iree_status_is_ok(status) && + allocator->supports_concurrent_managed_access) { + // Prefetch the buffer to the GPU stream. 
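+        // Without the prefetch the managed pages would migrate to the device
+        // on first touch; prefetching while the allocation is still cold keeps
+        // dispatches from paying that page-fault cost.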
+ status = IREE_CURESULT_TO_STATUS( + allocator->symbols, + cuMemPrefetchAsync(device_ptr, allocation_size, allocator->device, + allocator->stream)); + } + host_ptr = (void*)device_ptr; + } else { + // Device only. + buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE; + status = IREE_CURESULT_TO_STATUS( + allocator->symbols, cuMemAlloc(&device_ptr, allocation_size)); + } + } else { + // Host local cases. + buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_HOST; + unsigned int flags = CU_MEMHOSTALLOC_DEVICEMAP; + if (!iree_all_bits_set(compat_params.type, + IREE_HAL_MEMORY_TYPE_HOST_CACHED)) { + flags |= CU_MEMHOSTALLOC_WRITECOMBINED; + } + status = IREE_CURESULT_TO_STATUS( + allocator->symbols, cuMemHostAlloc(&host_ptr, allocation_size, flags)); + if (iree_status_is_ok(status)) { + status = IREE_CURESULT_TO_STATUS( + allocator->symbols, + cuMemHostGetDevicePointer(&device_ptr, host_ptr, /*flags=*/0)); + } + } + IREE_TRACE_ZONE_END(z0); + + iree_hal_buffer_t* buffer = NULL; + if (iree_status_is_ok(status)) { + status = iree_hal_cuda2_buffer_wrap( + base_allocator, compat_params.type, compat_params.access, + compat_params.usage, allocation_size, + /*byte_offset=*/0, + /*byte_length=*/allocation_size, buffer_type, device_ptr, host_ptr, + iree_hal_buffer_release_callback_null(), + iree_hal_allocator_host_allocator(base_allocator), &buffer); + } + + // Copy the initial contents into the buffer. This may require staging. + if (iree_status_is_ok(status) && + !iree_const_byte_span_is_empty(initial_data)) { + status = iree_hal_device_transfer_range( + allocator->base_device, + iree_hal_make_host_transfer_buffer_span((void*)initial_data.data, + initial_data.data_length), + 0, iree_hal_make_device_transfer_buffer(buffer), 0, + initial_data.data_length, IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT, + iree_infinite_timeout()); + } + + if (iree_status_is_ok(status)) { + IREE_TRACE_ALLOC_NAMED(IREE_HAL_CUDA_ALLOCATOR_ID, + (void*)iree_hal_cuda2_buffer_device_pointer(buffer), + allocation_size); + IREE_STATISTICS(iree_hal_allocator_statistics_record_alloc( + &allocator->statistics, compat_params.type, allocation_size)); + *out_buffer = buffer; + } else { + if (!buffer) { + iree_hal_cuda2_buffer_free(allocator->symbols, buffer_type, device_ptr, + host_ptr); + } else { + iree_hal_buffer_release(buffer); + } + } + return status; +} + +static void iree_hal_cuda2_allocator_deallocate_buffer( + iree_hal_allocator_t* IREE_RESTRICT base_allocator, + iree_hal_buffer_t* IREE_RESTRICT base_buffer) { + IREE_ASSERT_ARGUMENT(base_allocator); + IREE_ASSERT_ARGUMENT(base_buffer); + iree_hal_cuda2_allocator_t* allocator = + iree_hal_cuda2_allocator_cast(base_allocator); + + const iree_hal_cuda2_buffer_type_t buffer_type = + iree_hal_cuda2_buffer_type(base_buffer); + + // WARNING: we may be called from a random thread and need to ensure that we + // have an active CUDA context. Unfortunately CUDA is CUDA and trying to + // change the context here will result in full device synchronization. In the + // future we'll need to do something fairly complex such as having a dedicated + // thread with a persistently bound context that does nothing but free + // buffers. The load on this will be lighter when queue-ordered allocations + // are used or any sort of pooling policy is applied. + // + // WARNING: with CUDA's lazy error propagation it's possible that by the time + // this code is running something else has triggered device loss and we can't + // actually use the context. 
In that case we can't perform the frees and want + // to silently ignore them: whatever the user tries to do next will fail in + // the same way and if we were deallocating this buffer as part of a tear-down + // on failure we don't want to end up dying during cleanup. + iree_hal_cuda2_buffer_free(allocator->symbols, buffer_type, + iree_hal_cuda2_buffer_device_pointer(base_buffer), + iree_hal_cuda2_buffer_host_pointer(base_buffer)); + + switch (buffer_type) { + case IREE_HAL_CUDA_BUFFER_TYPE_DEVICE: + case IREE_HAL_CUDA_BUFFER_TYPE_HOST: { + IREE_TRACE_FREE_NAMED( + IREE_HAL_CUDA_ALLOCATOR_ID, + (void*)iree_hal_cuda2_buffer_device_pointer(base_buffer)); + IREE_STATISTICS(iree_hal_allocator_statistics_record_free( + &allocator->statistics, iree_hal_buffer_memory_type(base_buffer), + iree_hal_buffer_allocation_size(base_buffer))); + break; + } + default: + // Buffer type not tracked. + break; + } + + iree_hal_buffer_destroy(base_buffer); +} + +static iree_status_t iree_hal_cuda2_allocator_import_buffer( + iree_hal_allocator_t* IREE_RESTRICT base_allocator, + const iree_hal_buffer_params_t* IREE_RESTRICT params, + iree_hal_external_buffer_t* IREE_RESTRICT external_buffer, + iree_hal_buffer_release_callback_t release_callback, + iree_hal_buffer_t** IREE_RESTRICT out_buffer) { + IREE_ASSERT_ARGUMENT(base_allocator); + IREE_ASSERT_ARGUMENT(params); + IREE_ASSERT_ARGUMENT(external_buffer); + IREE_ASSERT_ARGUMENT(out_buffer); + iree_hal_cuda2_allocator_t* allocator = + iree_hal_cuda2_allocator_cast(base_allocator); + + // Coerce options into those required by the current device. + iree_hal_buffer_params_t compat_params = *params; + iree_device_size_t allocation_size = external_buffer->size; + iree_hal_buffer_compatibility_t compatibility = + iree_hal_cuda2_allocator_query_buffer_compatibility( + base_allocator, &compat_params, &allocation_size); + if (!iree_all_bits_set(compatibility, + IREE_HAL_BUFFER_COMPATIBILITY_IMPORTABLE)) { +#if IREE_STATUS_MODE + iree_bitfield_string_temp_t temp0, temp1, temp2; + iree_string_view_t memory_type_str = + iree_hal_memory_type_format(params->type, &temp0); + iree_string_view_t usage_str = + iree_hal_buffer_usage_format(params->usage, &temp1); + iree_string_view_t compatibility_str = + iree_hal_buffer_compatibility_format(compatibility, &temp2); + return iree_make_status( + IREE_STATUS_INVALID_ARGUMENT, + "allocator cannot import a buffer with the given parameters; " + "memory_type=%.*s, usage=%.*s, compatibility=%.*s", + (int)memory_type_str.size, memory_type_str.data, (int)usage_str.size, + usage_str.data, (int)compatibility_str.size, compatibility_str.data); +#else + return iree_make_status( + IREE_STATUS_INVALID_ARGUMENT, + "allocator cannot import a buffer with the given parameters"); +#endif // IREE_STATUS_MODE + } + + iree_status_t status = iree_ok_status(); + iree_hal_cuda2_buffer_type_t buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE; + void* host_ptr = NULL; + CUdeviceptr device_ptr = 0; + + switch (external_buffer->type) { + case IREE_HAL_EXTERNAL_BUFFER_TYPE_HOST_ALLOCATION: { + if (iree_all_bits_set(compat_params.type, + IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL)) { + return iree_make_status( + IREE_STATUS_INVALID_ARGUMENT, + "unable to register host allocations as device-local memory"); + } + buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_HOST_REGISTERED; + host_ptr = external_buffer->handle.host_allocation.ptr; + uint32_t register_flags = 0; + if (compat_params.access == IREE_HAL_MEMORY_ACCESS_READ) { + register_flags = CU_MEMHOSTREGISTER_READ_ONLY; + } + if 
(iree_any_bit_set(compat_params.usage, + IREE_HAL_BUFFER_USAGE_DISPATCH_INDIRECT_PARAMS | + IREE_HAL_BUFFER_USAGE_DISPATCH_UNIFORM_READ | + IREE_HAL_BUFFER_USAGE_DISPATCH_STORAGE | + IREE_HAL_BUFFER_USAGE_DISPATCH_IMAGE)) { + register_flags = CU_MEMHOSTREGISTER_DEVICEMAP; + } + status = IREE_CURESULT_TO_STATUS( + allocator->symbols, + cuMemHostRegister(host_ptr, external_buffer->size, register_flags), + "cuMemHostRegister"); + if (iree_status_is_ok(status)) { + status = IREE_CURESULT_TO_STATUS( + allocator->symbols, + cuMemHostGetDevicePointer(&device_ptr, host_ptr, 0), + "cuMemHostGetDevicePointer"); + } + break; + } + case IREE_HAL_EXTERNAL_BUFFER_TYPE_OPAQUE_FD: + case IREE_HAL_EXTERNAL_BUFFER_TYPE_OPAQUE_WIN32: + return iree_make_status(IREE_STATUS_UNIMPLEMENTED, + "handle-based imports not yet implemented"); + default: + return iree_make_status(IREE_STATUS_UNIMPLEMENTED, + "external buffer type not supported"); + } + + iree_hal_buffer_t* buffer = NULL; + if (iree_status_is_ok(status)) { + status = iree_hal_cuda2_buffer_wrap( + base_allocator, compat_params.type, compat_params.access, + compat_params.usage, external_buffer->size, /*byte_offset=*/0, + /*byte_length=*/external_buffer->size, buffer_type, device_ptr, + host_ptr, release_callback, + iree_hal_allocator_host_allocator(base_allocator), &buffer); + } + + if (iree_status_is_ok(status)) { + *out_buffer = buffer; + } else { + if (!buffer) { + iree_hal_cuda2_buffer_free(allocator->symbols, buffer_type, device_ptr, + host_ptr); + } else { + iree_hal_buffer_release(buffer); + } + } + return status; +} + +static iree_status_t iree_hal_cuda2_allocator_export_buffer( + iree_hal_allocator_t* IREE_RESTRICT base_allocator, + iree_hal_buffer_t* IREE_RESTRICT buffer, + iree_hal_external_buffer_type_t requested_type, + iree_hal_external_buffer_flags_t requested_flags, + iree_hal_external_buffer_t* IREE_RESTRICT out_external_buffer) { + return iree_make_status(IREE_STATUS_UNIMPLEMENTED, + "exporting to external buffers not supported"); +} + +static const iree_hal_allocator_vtable_t iree_hal_cuda2_allocator_vtable = { + .destroy = iree_hal_cuda2_allocator_destroy, + .host_allocator = iree_hal_cuda2_allocator_host_allocator, + .trim = iree_hal_cuda2_allocator_trim, + .query_statistics = iree_hal_cuda2_allocator_query_statistics, + .query_memory_heaps = iree_hal_cuda2_allocator_query_memory_heaps, + .query_buffer_compatibility = + iree_hal_cuda2_allocator_query_buffer_compatibility, + .allocate_buffer = iree_hal_cuda2_allocator_allocate_buffer, + .deallocate_buffer = iree_hal_cuda2_allocator_deallocate_buffer, + .import_buffer = iree_hal_cuda2_allocator_import_buffer, + .export_buffer = iree_hal_cuda2_allocator_export_buffer, +}; diff --git a/experimental/cuda2/cuda_allocator.h b/experimental/cuda2/cuda_allocator.h new file mode 100644 index 000000000000..2ff33ea467c0 --- /dev/null +++ b/experimental/cuda2/cuda_allocator.h @@ -0,0 +1,33 @@ +// Copyright 2023 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef EXPERIMENTAL_CUDA2_CUDA_ALLOCATOR_H_ +#define EXPERIMENTAL_CUDA2_CUDA_ALLOCATOR_H_ + +#include "experimental/cuda2/cuda_dynamic_symbols.h" +#include "experimental/cuda2/memory_pools.h" +#include "iree/base/api.h" +#include "iree/hal/api.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// Creates a CUDA memory allocator. 
+// |device| and |stream| will be used for management operations. +// |pools| provides memory pools that may be shared across multiple allocators +// and the pointer must remain valid for the lifetime of the allocator. +iree_status_t iree_hal_cuda2_allocator_create( + iree_hal_device_t* base_device, + const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols, CUdevice device, + CUstream stream, iree_hal_cuda2_memory_pools_t* pools, + iree_allocator_t host_allocator, iree_hal_allocator_t** out_allocator); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // EXPERIMENTAL_CUDA2_CUDA_ALLOCATOR_H_ diff --git a/experimental/cuda2/cuda_buffer.c b/experimental/cuda2/cuda_buffer.c new file mode 100644 index 000000000000..e88c9e3b3de3 --- /dev/null +++ b/experimental/cuda2/cuda_buffer.c @@ -0,0 +1,166 @@ +// Copyright 2023 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "experimental/cuda2/cuda_buffer.h" + +#include +#include +#include + +#include "iree/base/api.h" +#include "iree/base/tracing.h" + +typedef struct iree_hal_cuda2_buffer_t { + iree_hal_buffer_t base; + iree_hal_cuda2_buffer_type_t type; + void* host_ptr; + CUdeviceptr device_ptr; + iree_hal_buffer_release_callback_t release_callback; +} iree_hal_cuda2_buffer_t; + +static const iree_hal_buffer_vtable_t iree_hal_cuda2_buffer_vtable; + +static iree_hal_cuda2_buffer_t* iree_hal_cuda2_buffer_cast( + iree_hal_buffer_t* base_value) { + IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda2_buffer_vtable); + return (iree_hal_cuda2_buffer_t*)base_value; +} + +static const iree_hal_cuda2_buffer_t* iree_hal_cuda2_buffer_const_cast( + const iree_hal_buffer_t* base_value) { + IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda2_buffer_vtable); + return (const iree_hal_cuda2_buffer_t*)base_value; +} + +iree_status_t iree_hal_cuda2_buffer_wrap( + iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type, + iree_hal_memory_access_t allowed_access, + iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size, + iree_device_size_t byte_offset, iree_device_size_t byte_length, + iree_hal_cuda2_buffer_type_t buffer_type, CUdeviceptr device_ptr, + void* host_ptr, iree_hal_buffer_release_callback_t release_callback, + iree_allocator_t host_allocator, iree_hal_buffer_t** out_buffer) { + IREE_ASSERT_ARGUMENT(out_buffer); + IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_cuda2_buffer_t* buffer = NULL; + iree_status_t status = + iree_allocator_malloc(host_allocator, sizeof(*buffer), (void**)&buffer); + if (iree_status_is_ok(status)) { + iree_hal_buffer_initialize(host_allocator, allocator, &buffer->base, + allocation_size, byte_offset, byte_length, + memory_type, allowed_access, allowed_usage, + &iree_hal_cuda2_buffer_vtable, &buffer->base); + buffer->type = buffer_type; + buffer->host_ptr = host_ptr; + buffer->device_ptr = device_ptr; + buffer->release_callback = release_callback; + *out_buffer = &buffer->base; + } + + IREE_TRACE_ZONE_END(z0); + return status; +} + +static void iree_hal_cuda2_buffer_destroy(iree_hal_buffer_t* base_buffer) { + iree_hal_cuda2_buffer_t* buffer = iree_hal_cuda2_buffer_cast(base_buffer); + iree_allocator_t host_allocator = base_buffer->host_allocator; + IREE_TRACE_ZONE_BEGIN(z0); + if (buffer->release_callback.fn) { + buffer->release_callback.fn(buffer->release_callback.user_data, + base_buffer); + } + iree_allocator_free(host_allocator, 
buffer); + IREE_TRACE_ZONE_END(z0); +} + +static iree_status_t iree_hal_cuda2_buffer_map_range( + iree_hal_buffer_t* base_buffer, iree_hal_mapping_mode_t mapping_mode, + iree_hal_memory_access_t memory_access, + iree_device_size_t local_byte_offset, iree_device_size_t local_byte_length, + iree_hal_buffer_mapping_t* mapping) { + IREE_ASSERT_ARGUMENT(base_buffer); + IREE_ASSERT_ARGUMENT(mapping); + iree_hal_cuda2_buffer_t* buffer = iree_hal_cuda2_buffer_cast(base_buffer); + + // TODO(benvanik): add upload/download for unmapped buffers. + IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_memory_type( + iree_hal_buffer_memory_type(base_buffer), + IREE_HAL_MEMORY_TYPE_HOST_VISIBLE)); + IREE_RETURN_IF_ERROR( + iree_hal_buffer_validate_usage(iree_hal_buffer_allowed_usage(base_buffer), + IREE_HAL_BUFFER_USAGE_MAPPING)); + + uint8_t* data_ptr = (uint8_t*)(buffer->host_ptr) + local_byte_offset; + // If we mapped for discard scribble over the bytes. This is not a mandated + // behavior but it will make debugging issues easier. Alternatively for + // heap buffers we could reallocate them such that ASAN yells, but that + // would only work if the entire buffer was discarded. +#ifndef NDEBUG + if (iree_any_bit_set(memory_access, IREE_HAL_MEMORY_ACCESS_DISCARD)) { + memset(data_ptr, 0xCD, local_byte_length); + } +#endif // !NDEBUG + + mapping->contents = iree_make_byte_span(data_ptr, local_byte_length); + return iree_ok_status(); +} + +static iree_status_t iree_hal_cuda2_buffer_unmap_range( + iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset, + iree_device_size_t local_byte_length, iree_hal_buffer_mapping_t* mapping) { + // Nothing to do today. + return iree_ok_status(); +} + +static iree_status_t iree_hal_cuda2_buffer_invalidate_range( + iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset, + iree_device_size_t local_byte_length) { + // Nothing to do today. + return iree_ok_status(); +} + +static iree_status_t iree_hal_cuda2_buffer_flush_range( + iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset, + iree_device_size_t local_byte_length) { + // Nothing to do today. 
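+  // Mappable buffers from this backend are page-locked host memory or managed
+  // memory exposed as host-coherent, so there is no cache maintenance to do.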
+ return iree_ok_status(); +} + +iree_hal_cuda2_buffer_type_t iree_hal_cuda2_buffer_type( + const iree_hal_buffer_t* base_buffer) { + const iree_hal_cuda2_buffer_t* buffer = + iree_hal_cuda2_buffer_const_cast(base_buffer); + return buffer->type; +} + +CUdeviceptr iree_hal_cuda2_buffer_device_pointer( + const iree_hal_buffer_t* base_buffer) { + const iree_hal_cuda2_buffer_t* buffer = + iree_hal_cuda2_buffer_const_cast(base_buffer); + return buffer->device_ptr; +} + +void* iree_hal_cuda2_buffer_host_pointer(const iree_hal_buffer_t* base_buffer) { + const iree_hal_cuda2_buffer_t* buffer = + iree_hal_cuda2_buffer_const_cast(base_buffer); + return buffer->host_ptr; +} + +void iree_hal_cuda2_buffer_drop_release_callback( + iree_hal_buffer_t* base_buffer) { + iree_hal_cuda2_buffer_t* buffer = iree_hal_cuda2_buffer_cast(base_buffer); + buffer->release_callback = iree_hal_buffer_release_callback_null(); +} + +static const iree_hal_buffer_vtable_t iree_hal_cuda2_buffer_vtable = { + .recycle = iree_hal_buffer_recycle, + .destroy = iree_hal_cuda2_buffer_destroy, + .map_range = iree_hal_cuda2_buffer_map_range, + .unmap_range = iree_hal_cuda2_buffer_unmap_range, + .invalidate_range = iree_hal_cuda2_buffer_invalidate_range, + .flush_range = iree_hal_cuda2_buffer_flush_range, +}; diff --git a/experimental/cuda2/cuda_buffer.h b/experimental/cuda2/cuda_buffer.h new file mode 100644 index 000000000000..23cade3c57ad --- /dev/null +++ b/experimental/cuda2/cuda_buffer.h @@ -0,0 +1,66 @@ +// Copyright 2023 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef EXPERIMENTAL_CUDA2_CUDA_BUFFER_H_ +#define EXPERIMENTAL_CUDA2_CUDA_BUFFER_H_ + +#include "experimental/cuda2/cuda_headers.h" +#include "iree/base/api.h" +#include "iree/hal/api.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +typedef enum iree_hal_cuda2_buffer_type_e { + // Device local buffer; allocated with cuMemAlloc/cuMemAllocManaged, freed + // with cuMemFree. + IREE_HAL_CUDA_BUFFER_TYPE_DEVICE = 0, + // Host local buffer; allocated with cuMemHostAlloc, freed with cuMemFreeHost. + IREE_HAL_CUDA_BUFFER_TYPE_HOST, + // Host local buffer; registered with cuMemHostRegister, freed with + // cuMemHostUnregister. + IREE_HAL_CUDA_BUFFER_TYPE_HOST_REGISTERED, + // Device local buffer, allocated with cuMemAllocFromPoolAsync, freed with + // cuMemFree/cuMemFreeAsync. + IREE_HAL_CUDA_BUFFER_TYPE_ASYNC, +} iree_hal_cuda2_buffer_type_t; + +// Wraps a CUDA allocation in an iree_hal_buffer_t. +iree_status_t iree_hal_cuda2_buffer_wrap( + iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type, + iree_hal_memory_access_t allowed_access, + iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size, + iree_device_size_t byte_offset, iree_device_size_t byte_length, + iree_hal_cuda2_buffer_type_t buffer_type, CUdeviceptr device_ptr, + void* host_ptr, iree_hal_buffer_release_callback_t release_callback, + iree_allocator_t host_allocator, iree_hal_buffer_t** out_buffer); + +// Returns the underlying CUDA buffer type of the given |buffer|. +iree_hal_cuda2_buffer_type_t iree_hal_cuda2_buffer_type( + const iree_hal_buffer_t* buffer); + +// Returns the CUDA base device pointer for the given |buffer|. +// +// Note that this is the entire allocated_buffer and must be offset by the +// buffer byte_offset and byte_length when used. 
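+//
+// For example, the device address of the logical buffer contents is:
+//   iree_hal_cuda2_buffer_device_pointer(buffer) +
+//       iree_hal_buffer_byte_offset(buffer)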
+CUdeviceptr iree_hal_cuda2_buffer_device_pointer( + const iree_hal_buffer_t* buffer); + +// Returns the CUDA host pointer for the given |buffer|, if available. +void* iree_hal_cuda2_buffer_host_pointer(const iree_hal_buffer_t* buffer); + +// Drops the release callback so that when the buffer is destroyed no callback +// will be made. This is not thread safe but all callers are expected to be +// holding an allocation and the earliest the buffer could be destroyed is after +// this call returns and the caller has released its reference. +void iree_hal_cuda2_buffer_drop_release_callback(iree_hal_buffer_t* buffer); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // EXPERIMENTAL_CUDA2_CUDA_BUFFER_H_ diff --git a/experimental/cuda2/cuda_dynamic_symbol_table.h b/experimental/cuda2/cuda_dynamic_symbol_table.h index b4aaa93fc750..fb8ff5a8ecd8 100644 --- a/experimental/cuda2/cuda_dynamic_symbol_table.h +++ b/experimental/cuda2/cuda_dynamic_symbol_table.h @@ -49,6 +49,18 @@ IREE_CU_PFN_DECL(cuMemHostAlloc, void**, size_t, unsigned int) IREE_CU_PFN_DECL(cuMemHostRegister, void*, size_t, unsigned int) IREE_CU_PFN_DECL(cuMemHostUnregister, void*) IREE_CU_PFN_DECL(cuMemHostGetDevicePointer, CUdeviceptr*, void*, unsigned int) +IREE_CU_PFN_DECL(cuMemPoolCreate, CUmemoryPool*, const CUmemPoolProps*) +IREE_CU_PFN_DECL(cuMemPoolDestroy, CUmemoryPool) +IREE_CU_PFN_DECL(cuMemPoolSetAccess, CUmemoryPool, const CUmemAccessDesc*, + size_t) +IREE_CU_PFN_DECL(cuMemPoolGetAttribute, CUmemoryPool, CUmemPool_attribute, + void*) +IREE_CU_PFN_DECL(cuMemPoolSetAttribute, CUmemoryPool, CUmemPool_attribute, + void*) +IREE_CU_PFN_DECL(cuMemPoolTrimTo, CUmemoryPool, size_t) +IREE_CU_PFN_DECL(cuMemAllocFromPoolAsync, CUdeviceptr*, size_t, CUmemoryPool, + CUstream) +IREE_CU_PFN_DECL(cuMemFreeAsync, CUdeviceptr dptr, CUstream hStream) IREE_CU_PFN_DECL(cuModuleGetFunction, CUfunction*, CUmodule, const char*) IREE_CU_PFN_DECL(cuModuleLoadDataEx, CUmodule*, const void*, unsigned int, CUjit_option*, void**) diff --git a/experimental/cuda2/memory_pools.c b/experimental/cuda2/memory_pools.c new file mode 100644 index 000000000000..e29c5121c51a --- /dev/null +++ b/experimental/cuda2/memory_pools.c @@ -0,0 +1,278 @@ +// Copyright 2023 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "experimental/cuda2/memory_pools.h" + +#include "experimental/cuda2/cuda_buffer.h" +#include "experimental/cuda2/cuda_dynamic_symbols.h" +#include "experimental/cuda2/cuda_status_util.h" +#include "iree/base/tracing.h" + +// NOTE: these are currently global for all devices; we could make +// device-specific ones by malloc() and leaking (with LSAN note) unique string +// values instead. 
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_ALLOCATION_TRACKING +static const char* IREE_HAL_CUDA_DEVICE_LOCAL_POOL_RESERVED_ID = + "CUDA pool: device-local reserved"; +static const char* IREE_HAL_CUDA_OTHER_POOL_RESERVED_ID = + "CUDA pool: other reserved"; +#endif // IREE_TRACING_FEATURE_ALLOCATION_TRACKING + +static iree_status_t iree_hal_cuda2_create_memory_pool( + const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols, CUdevice cu_device, + iree_hal_cuda2_memory_pool_params_t params, + CUmemoryPool* IREE_RESTRICT out_pool) { + *out_pool = NULL; + + CUmemPoolProps pool_props = { + .allocType = CU_MEM_ALLOCATION_TYPE_PINNED, + // TODO: allow sharing of certain pool memory types by fd/HANDLE. + .handleTypes = CU_MEM_HANDLE_TYPE_NONE, + .location = + { + .type = CU_MEM_LOCATION_TYPE_DEVICE, + .id = cu_device, + }, + .win32SecurityAttributes = NULL, + .reserved = {0}, + }; + + CUmemoryPool pool = NULL; + IREE_CUDA_RETURN_IF_ERROR(cuda_symbols, cuMemPoolCreate(&pool, &pool_props), + "cuMemPoolCreate"); + + iree_status_t status = IREE_CURESULT_TO_STATUS( + cuda_symbols, + cuMemPoolSetAttribute(pool, CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, + ¶ms.release_threshold), + "cuMemPoolSetAttribute"); + + if (iree_status_is_ok(status)) { + *out_pool = pool; + } else { + IREE_CUDA_IGNORE_ERROR(cuda_symbols, cuMemPoolDestroy(pool)); + } + return status; +} + +iree_status_t iree_hal_cuda2_memory_pools_initialize( + iree_allocator_t host_allocator, + const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols, CUdevice cu_device, + const iree_hal_cuda2_memory_pooling_params_t* pooling_params, + iree_hal_cuda2_memory_pools_t* IREE_RESTRICT out_pools) { + IREE_ASSERT_ARGUMENT(cuda_symbols); + IREE_ASSERT_ARGUMENT(pooling_params); + IREE_ASSERT_ARGUMENT(out_pools); + IREE_TRACE_ZONE_BEGIN(z0); + + memset(out_pools, 0, sizeof(*out_pools)); + out_pools->cuda_symbols = cuda_symbols; + out_pools->host_allocator = host_allocator; + + iree_status_t status = iree_ok_status(); + + if (iree_status_is_ok(status)) { + status = iree_hal_cuda2_create_memory_pool(cuda_symbols, cu_device, + pooling_params->device_local, + &out_pools->device_local); + } + + if (iree_status_is_ok(status)) { + status = iree_hal_cuda2_create_memory_pool( + cuda_symbols, cu_device, pooling_params->other, &out_pools->other); + } + + IREE_TRACE_ZONE_END(z0); + return status; +} + +void iree_hal_cuda2_memory_pools_deinitialize( + iree_hal_cuda2_memory_pools_t* pools) { + IREE_TRACE_ZONE_BEGIN(z0); + + if (pools->device_local) { + IREE_CUDA_IGNORE_ERROR(pools->cuda_symbols, + cuMemPoolDestroy(pools->device_local)); + pools->device_local = NULL; + } + + if (pools->other) { + IREE_CUDA_IGNORE_ERROR(pools->cuda_symbols, cuMemPoolDestroy(pools->other)); + pools->other = NULL; + } + + IREE_TRACE_ZONE_END(z0); +} + +static void iree_hal_cuda2_memory_pool_track_alloc( + iree_hal_cuda2_memory_pools_t* pools, iree_hal_buffer_t* buffer) { + bool is_device_local = iree_all_bits_set(iree_hal_buffer_memory_type(buffer), + IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL); + (void)is_device_local; + iree_device_size_t allocation_size = iree_hal_buffer_allocation_size(buffer); + (void)allocation_size; + IREE_TRACE_ALLOC_NAMED( + is_device_local ? IREE_HAL_CUDA_DEVICE_LOCAL_POOL_RESERVED_ID + : IREE_HAL_CUDA_OTHER_POOL_RESERVED_ID, + (void*)iree_hal_cuda2_buffer_device_pointer(buffer), allocation_size); + IREE_STATISTICS({ + iree_atomic_int64_t* bytes_allocated = + is_device_local ? 
&pools->statistics.device_bytes_allocated + : &pools->statistics.host_bytes_allocated; + iree_atomic_fetch_add_int64(bytes_allocated, allocation_size, + iree_memory_order_relaxed); + }); +} + +static void iree_hal_cuda2_memory_pool_track_free( + iree_hal_cuda2_memory_pools_t* pools, iree_hal_buffer_t* buffer) { + bool is_device_local = iree_all_bits_set(iree_hal_buffer_memory_type(buffer), + IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL); + (void)is_device_local; + IREE_TRACE_FREE_NAMED(is_device_local + ? IREE_HAL_CUDA_DEVICE_LOCAL_POOL_RESERVED_ID + : IREE_HAL_CUDA_OTHER_POOL_RESERVED_ID, + (void*)iree_hal_cuda2_buffer_device_pointer(buffer)); + IREE_STATISTICS({ + iree_atomic_int64_t* bytes_freed = + is_device_local ? &pools->statistics.device_bytes_freed + : &pools->statistics.host_bytes_freed; + iree_device_size_t allocation_size = + iree_hal_buffer_allocation_size(buffer); + iree_atomic_fetch_add_int64(bytes_freed, allocation_size, + iree_memory_order_relaxed); + }); +} + +void iree_hal_cuda2_memory_pools_merge_statistics( + iree_hal_cuda2_memory_pools_t* pools, + iree_hal_allocator_statistics_t* statistics) { + IREE_STATISTICS({ + statistics->device_bytes_allocated = iree_atomic_load_int64( + &pools->statistics.device_bytes_allocated, iree_memory_order_relaxed); + statistics->host_bytes_allocated = iree_atomic_load_int64( + &pools->statistics.host_bytes_allocated, iree_memory_order_relaxed); + statistics->device_bytes_freed = iree_atomic_load_int64( + &pools->statistics.device_bytes_freed, iree_memory_order_relaxed); + statistics->host_bytes_freed = iree_atomic_load_int64( + &pools->statistics.host_bytes_freed, iree_memory_order_relaxed); + if (pools->device_local) { + cuuint64_t pool_peak = 0; + IREE_CUDA_IGNORE_ERROR( + pools->cuda_symbols, + cuMemPoolGetAttribute(pools->device_local, + CU_MEMPOOL_ATTR_USED_MEM_HIGH, &pool_peak)); + statistics->device_bytes_peak += (iree_device_size_t)pool_peak; + } + if (pools->other) { + cuuint64_t pool_peak = 0; + IREE_CUDA_IGNORE_ERROR( + pools->cuda_symbols, + cuMemPoolGetAttribute(pools->other, CU_MEMPOOL_ATTR_USED_MEM_HIGH, + &pool_peak)); + statistics->host_bytes_peak += (iree_device_size_t)pool_peak; + } + }); +} + +// NOTE: this is only issued if the buffer is destroyed without having had been +// scheduled for deallocation asynchronously. When a buffer is scheduled we drop +// the release callback so that this isn't called and we don't double-free. 
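+// Because it runs outside of any queue ordering it falls back to a blocking
+// cuMemFree instead of the stream-ordered cuMemFreeAsync path.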
+static void iree_hal_cuda2_async_buffer_release_callback( + void* user_data, iree_hal_buffer_t* buffer) { + iree_hal_cuda2_memory_pools_t* pools = + (iree_hal_cuda2_memory_pools_t*)user_data; + IREE_TRACE_ZONE_BEGIN(z0); + + CUdeviceptr device_ptr = iree_hal_cuda2_buffer_device_pointer(buffer); + IREE_CUDA_IGNORE_ERROR(pools->cuda_symbols, cuMemFree(device_ptr)); + iree_hal_cuda2_memory_pool_track_free(pools, buffer); + + IREE_TRACE_ZONE_END(z0); +} + +iree_status_t iree_hal_cuda2_memory_pools_alloca( + iree_hal_cuda2_memory_pools_t* pools, CUstream stream, + iree_hal_allocator_pool_t pool, iree_hal_buffer_params_t params, + iree_device_size_t allocation_size, + iree_hal_buffer_t** IREE_RESTRICT out_buffer) { + IREE_TRACE_ZONE_BEGIN(z0); + IREE_TRACE_ZONE_APPEND_VALUE(z0, (int64_t)allocation_size); + + iree_hal_buffer_params_canonicalize(¶ms); + + // TODO: more pools and better selection; this is coarsely deciding between + // only device local (variables, constants, transients) and other (staging, + // external) but could use more buffer properties (including usage/export + // flags) to better isolate the different usage patterns and keep the pools + // operating with reasonable limits. We should be using the |pool| arg. + CUmemoryPool memory_pool = + iree_all_bits_set(params.type, IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL) + ? pools->device_local + : pools->other; + + CUdeviceptr device_ptr = 0; + iree_status_t status = IREE_CURESULT_TO_STATUS( + pools->cuda_symbols, + cuMemAllocFromPoolAsync(&device_ptr, (size_t)allocation_size, memory_pool, + stream), + "cuMemAllocFromPoolAsync"); + + // Wrap the allocated CUDA buffer in a HAL buffer. + // NOTE: we don't provide a device allocator because we didn't allocate from + // one and instead we use a release callback to perform the free if the user + // doesn't dealloca the buffer. + iree_hal_buffer_t* buffer = NULL; + if (iree_status_is_ok(status)) { + iree_hal_buffer_release_callback_t release_callback = { + .fn = iree_hal_cuda2_async_buffer_release_callback, + .user_data = pools, + }; + status = iree_hal_cuda2_buffer_wrap( + /*device_allocator=*/NULL, params.type, params.access, params.usage, + allocation_size, /*byte_offset=*/0, + /*byte_length=*/allocation_size, IREE_HAL_CUDA_BUFFER_TYPE_ASYNC, + device_ptr, /*host_ptr=*/NULL, release_callback, pools->host_allocator, + &buffer); + } + + if (iree_status_is_ok(status)) { + // Update statistics (note that it may not yet be accurate). + iree_hal_cuda2_memory_pool_track_alloc(pools, buffer); + *out_buffer = buffer; + } else if (buffer) { + iree_hal_buffer_release(buffer); + } else { + IREE_CUDA_IGNORE_ERROR(pools->cuda_symbols, + cuMemFreeAsync(device_ptr, stream)); + } + + IREE_TRACE_ZONE_END(z0); + return status; +} + +iree_status_t iree_hal_cuda2_memory_pools_dealloca( + iree_hal_cuda2_memory_pools_t* pools, CUstream stream, + iree_hal_buffer_t* buffer) { + IREE_TRACE_ZONE_BEGIN(z0); + IREE_TRACE_ZONE_APPEND_VALUE( + z0, (int64_t)iree_hal_buffer_allocation_size(buffer)); + + // Try to schedule the buffer for freeing. + CUdeviceptr device_ptr = iree_hal_cuda2_buffer_device_pointer(buffer); + iree_status_t status = IREE_CURESULT_TO_STATUS( + pools->cuda_symbols, cuMemFreeAsync(device_ptr, stream), + "cuMemFreeAsync"); + + // Drop the release callback so that we don't try to double-free the buffer. + iree_hal_cuda2_buffer_drop_release_callback(buffer); + + // Update statistics (note that it may not yet be accurate). 
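+  // The free above is stream-ordered: the memory is only returned to the pool
+  // once the stream reaches it, while these counters update immediately.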
+ iree_hal_cuda2_memory_pool_track_free(pools, buffer); + + IREE_TRACE_ZONE_END(z0); + return status; +} diff --git a/experimental/cuda2/memory_pools.h b/experimental/cuda2/memory_pools.h new file mode 100644 index 000000000000..8eccf9ef7105 --- /dev/null +++ b/experimental/cuda2/memory_pools.h @@ -0,0 +1,73 @@ +// Copyright 2023 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef IREE_HAL_DRIVERS_CUDA_MEMORY_POOLS_H_ +#define IREE_HAL_DRIVERS_CUDA_MEMORY_POOLS_H_ + +#include "experimental/cuda2/api.h" +#include "experimental/cuda2/cuda_dynamic_symbols.h" +#include "experimental/cuda2/cuda_headers.h" +#include "iree/base/api.h" +#include "iree/base/internal/atomics.h" +#include "iree/hal/api.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// Retained CUDA memory pools for various allocation types. +typedef struct iree_hal_cuda2_memory_pools_t { + // Used exclusively for DEVICE_LOCAL allocations. + CUmemoryPool device_local; + // Used for any host-visible/host-local memory types. + CUmemoryPool other; + + const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols; + iree_allocator_t host_allocator; + + IREE_STATISTICS(struct { + iree_atomic_int64_t device_bytes_allocated; + iree_atomic_int64_t device_bytes_freed; + iree_atomic_int64_t host_bytes_allocated; + iree_atomic_int64_t host_bytes_freed; + } statistics;) +} iree_hal_cuda2_memory_pools_t; + +// Initializes |out_pools| by configuring new CUDA memory pools. +iree_status_t iree_hal_cuda2_memory_pools_initialize( + iree_allocator_t host_allocator, + const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols, CUdevice cu_device, + const iree_hal_cuda2_memory_pooling_params_t* pooling_params, + iree_hal_cuda2_memory_pools_t* IREE_RESTRICT out_pools); + +// Deinitializes the |pools| and releases the underlying CUDA resources. +void iree_hal_cuda2_memory_pools_deinitialize( + iree_hal_cuda2_memory_pools_t* pools); + +// Merges statistics information from |pools| into |statistics|. +void iree_hal_cuda2_memory_pools_merge_statistics( + iree_hal_cuda2_memory_pools_t* pools, + iree_hal_allocator_statistics_t* statistics); + +// Asynchronously allocates a buffer from an appropriate pool. +// The allocation will be stream-ordered on |stream|. +iree_status_t iree_hal_cuda2_memory_pools_alloca( + iree_hal_cuda2_memory_pools_t* pools, CUstream stream, + iree_hal_allocator_pool_t pool, iree_hal_buffer_params_t params, + iree_device_size_t allocation_size, + iree_hal_buffer_t** IREE_RESTRICT out_buffer); + +// Asynchronously deallocates a buffer from its pool. +// The deallocation will be stream-ordered on |stream|. +iree_status_t iree_hal_cuda2_memory_pools_dealloca( + iree_hal_cuda2_memory_pools_t* pools, CUstream stream, + iree_hal_buffer_t* buffer); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // IREE_HAL_DRIVERS_CUDA_MEMORY_POOLS_H_
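
For context, a minimal sketch of how a device implementation might wire these pieces together and use the queue-ordered allocation path. It is not part of this patch: the surrounding state (base_device, cu_device, cu_stream, cuda_symbols, host_allocator) is assumed to already exist inside a status-returning setup function, the sizes and the IREE_HAL_ALLOCATOR_POOL_DEFAULT selector are illustrative, and teardown (releasing the allocator and calling iree_hal_cuda2_memory_pools_deinitialize) is elided.

  // Describe the stream-ordered pooling behavior for this device.
  iree_hal_cuda2_memory_pooling_params_t pooling_params = {
      .device_local = {.release_threshold = 64 * 1024 * 1024},
      .other = {.release_threshold = 0},
  };

  // Create the CUmemoryPools backing queue-ordered allocations.
  iree_hal_cuda2_memory_pools_t pools;
  IREE_RETURN_IF_ERROR(iree_hal_cuda2_memory_pools_initialize(
      host_allocator, cuda_symbols, cu_device, &pooling_params, &pools));

  // Create the HAL allocator that services synchronous allocations and merges
  // pool statistics when queried.
  iree_hal_allocator_t* device_allocator = NULL;
  IREE_RETURN_IF_ERROR(iree_hal_cuda2_allocator_create(
      base_device, cuda_symbols, cu_device, cu_stream, &pools, host_allocator,
      &device_allocator));

  // Queue-ordered allocation and deallocation on cu_stream.
  iree_hal_buffer_params_t buffer_params = {
      .type = IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL,
      .usage = IREE_HAL_BUFFER_USAGE_TRANSFER | IREE_HAL_BUFFER_USAGE_DISPATCH,
  };
  iree_hal_buffer_t* buffer = NULL;
  IREE_RETURN_IF_ERROR(iree_hal_cuda2_memory_pools_alloca(
      &pools, cu_stream, IREE_HAL_ALLOCATOR_POOL_DEFAULT, buffer_params,
      /*allocation_size=*/1024 * 1024, &buffer));
  // ... record work that uses the buffer on cu_stream ...
  IREE_RETURN_IF_ERROR(
      iree_hal_cuda2_memory_pools_dealloca(&pools, cu_stream, buffer));
  iree_hal_buffer_release(buffer);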