diff --git a/experimental/cuda2/CMakeLists.txt b/experimental/cuda2/CMakeLists.txt index c104bcde8f4e..ca1959516917 100644 --- a/experimental/cuda2/CMakeLists.txt +++ b/experimental/cuda2/CMakeLists.txt @@ -17,7 +17,13 @@ iree_cc_library( "api.h" SRCS "api.h" + "cuda_allocator.c" + "cuda_allocator.h" + "cuda_buffer.c" + "cuda_buffer.h" "cuda_driver.c" + "memory_pools.c" + "memory_pools.h" DEPS ::dynamic_symbols iree::base diff --git a/experimental/cuda2/api.h b/experimental/cuda2/api.h index 565fd50ca972..a56b656b2b7c 100644 --- a/experimental/cuda2/api.h +++ b/experimental/cuda2/api.h @@ -16,6 +16,30 @@ extern "C" { #endif // __cplusplus +//===----------------------------------------------------------------------===// +// iree_hal_cuda_device_t +//===----------------------------------------------------------------------===// + +// Parameters defining a CUmemoryPool. +typedef struct iree_hal_cuda2_memory_pool_params_t { + // Minimum number of bytes to keep in the pool when trimming with + // iree_hal_device_trim. + uint64_t minimum_capacity; + // Soft maximum number of bytes to keep in the pool. + // When more than this is allocated the extra will be freed at the next + // device synchronization in order to remain under the threshold. + uint64_t release_threshold; + // TODO: per-device access permissions array. +} iree_hal_cuda2_memory_pool_params_t; + +// Parameters for each CUmemoryPool used for queue-ordered allocations. +typedef struct iree_hal_cuda2_memory_pooling_params_t { + // Used exclusively for DEVICE_LOCAL allocations. + iree_hal_cuda2_memory_pool_params_t device_local; + // Used for any host-visible/host-local memory types. + iree_hal_cuda2_memory_pool_params_t other; +} iree_hal_cuda2_memory_pooling_params_t; + //===----------------------------------------------------------------------===// // iree_hal_cuda2_driver_t //===----------------------------------------------------------------------===// diff --git a/experimental/cuda2/cuda_allocator.c b/experimental/cuda2/cuda_allocator.c new file mode 100644 index 000000000000..5ec9a396af32 --- /dev/null +++ b/experimental/cuda2/cuda_allocator.c @@ -0,0 +1,623 @@ +// Copyright 2023 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "experimental/cuda2/cuda_allocator.h" + +#include + +#include "experimental/cuda2/cuda_buffer.h" +#include "experimental/cuda2/cuda_dynamic_symbols.h" +#include "experimental/cuda2/cuda_status_util.h" +#include "iree/base/api.h" +#include "iree/base/tracing.h" + +#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_ALLOCATION_TRACKING +static const char* IREE_HAL_CUDA_ALLOCATOR_ID = "CUDA2 unpooled"; +#endif // IREE_TRACING_FEATURE_ALLOCATION_TRACKING + +typedef struct iree_hal_cuda2_allocator_t { + // Abstract resource used for injecting reference counting and vtable; + // must be at offset 0. + iree_hal_resource_t resource; + + // The device that this allocator allocates memory from. + iree_hal_device_t* base_device; + CUdevice device; + + // The CUDA stream that allocations should be used in. + CUstream stream; + + iree_hal_cuda2_memory_pools_t* pools; + + const iree_hal_cuda2_dynamic_symbols_t* symbols; + + iree_allocator_t host_allocator; + + // Whether the GPU and CPU can concurrently access CUDA managed data in a + // coherent way. We would need to explicitly perform flushing and invalidation + // between GPU and CPU if not. 
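+  // Mirrors the CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS device
+  // attribute queried when the allocator is created.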
+ bool supports_concurrent_managed_access; + + IREE_STATISTICS(iree_hal_allocator_statistics_t statistics;) +} iree_hal_cuda2_allocator_t; + +static const iree_hal_allocator_vtable_t iree_hal_cuda2_allocator_vtable; + +static iree_hal_cuda2_allocator_t* iree_hal_cuda2_allocator_cast( + iree_hal_allocator_t* base_value) { + IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda2_allocator_vtable); + return (iree_hal_cuda2_allocator_t*)base_value; +} + +iree_status_t iree_hal_cuda2_allocator_create( + iree_hal_device_t* base_device, + const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols, CUdevice device, + CUstream stream, iree_hal_cuda2_memory_pools_t* pools, + iree_allocator_t host_allocator, iree_hal_allocator_t** out_allocator) { + IREE_ASSERT_ARGUMENT(base_device); + IREE_ASSERT_ARGUMENT(cuda_symbols); + IREE_ASSERT_ARGUMENT(pools); + IREE_ASSERT_ARGUMENT(out_allocator); + IREE_TRACE_ZONE_BEGIN(z0); + + // To support device-local + host-visible memory we need concurrent managed + // access indicating that the host and devices can concurrently access the + // device memory. If we don't have this feature then we fall back to forcing + // all device-local + host-visible memory into host-local + device-visible + // page-locked memory. The compiler tries to avoid this for high-traffic + // buffers except for readback staging buffers. + int supports_concurrent_managed_access = 0; + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, IREE_CURESULT_TO_STATUS( + cuda_symbols, + cuDeviceGetAttribute( + &supports_concurrent_managed_access, + CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, device), + "cuDeviceGetAttribute")); + + IREE_TRACE_ZONE_APPEND_TEXT( + z0, supports_concurrent_managed_access + ? "has CONCURRENT_MANAGED_ACCESS" + : "no CONCURRENT_MANAGED_ACCESS (expect slow accesses on " + "device-local + host-visible memory)"); + + iree_hal_cuda2_allocator_t* allocator = NULL; + iree_status_t status = iree_allocator_malloc( + host_allocator, sizeof(*allocator), (void**)&allocator); + if (iree_status_is_ok(status)) { + iree_hal_resource_initialize(&iree_hal_cuda2_allocator_vtable, + &allocator->resource); + allocator->base_device = base_device; + allocator->device = device; + allocator->stream = stream; + allocator->pools = pools; + allocator->symbols = cuda_symbols; + allocator->host_allocator = host_allocator; + allocator->supports_concurrent_managed_access = + supports_concurrent_managed_access != 0; + *out_allocator = (iree_hal_allocator_t*)allocator; + } + + IREE_TRACE_ZONE_END(z0); + return status; +} + +static void iree_hal_cuda2_allocator_destroy( + iree_hal_allocator_t* IREE_RESTRICT base_allocator) { + IREE_ASSERT_ARGUMENT(base_allocator); + iree_hal_cuda2_allocator_t* allocator = + iree_hal_cuda2_allocator_cast(base_allocator); + IREE_TRACE_ZONE_BEGIN(z0); + + iree_allocator_free(allocator->host_allocator, allocator); + + IREE_TRACE_ZONE_END(z0); +} + +static iree_allocator_t iree_hal_cuda2_allocator_host_allocator( + const iree_hal_allocator_t* IREE_RESTRICT base_allocator) { + iree_hal_cuda2_allocator_t* allocator = + (iree_hal_cuda2_allocator_t*)base_allocator; + return allocator->host_allocator; +} + +static iree_status_t iree_hal_cuda2_allocator_trim( + iree_hal_allocator_t* IREE_RESTRICT base_allocator) { + return iree_ok_status(); +} + +static void iree_hal_cuda2_allocator_query_statistics( + iree_hal_allocator_t* IREE_RESTRICT base_allocator, + iree_hal_allocator_statistics_t* IREE_RESTRICT out_statistics) { + IREE_STATISTICS({ + iree_hal_cuda2_allocator_t* allocator = + 
iree_hal_cuda2_allocator_cast(base_allocator); + memcpy(out_statistics, &allocator->statistics, sizeof(*out_statistics)); + iree_hal_cuda2_memory_pools_merge_statistics(allocator->pools, + out_statistics); + }); +} + +static iree_status_t iree_hal_cuda2_allocator_query_memory_heaps( + iree_hal_allocator_t* IREE_RESTRICT base_allocator, + iree_host_size_t capacity, + iree_hal_allocator_memory_heap_t* IREE_RESTRICT heaps, + iree_host_size_t* IREE_RESTRICT out_count) { + IREE_ASSERT_ARGUMENT(base_allocator); + IREE_ASSERT_ARGUMENT(heaps); + IREE_ASSERT_ARGUMENT(out_count); + + iree_hal_cuda2_allocator_t* allocator = + iree_hal_cuda2_allocator_cast(base_allocator); + + // TODO(benvanik): check CU_DEVICE_ATTRIBUTE_INTEGRATED and return a unified + // set of heaps (likely still a cached and uncached, at minimum). + iree_host_size_t count = 3; + if (allocator->supports_concurrent_managed_access) { + ++count; // device-local | host-visible + } + if (out_count) *out_count = count; + if (capacity < count) { + // NOTE: lightweight as this is hit in normal pre-sizing usage. + return iree_status_from_code(IREE_STATUS_OUT_OF_RANGE); + } + + // Don't think there's a query for these. + // Max allocation size may be much smaller in certain memory types such as + // page-locked memory and it'd be good to enforce that. + const iree_device_size_t max_allocation_size = ~(iree_device_size_t)0; + const iree_device_size_t min_alignment = 64; + + int i = 0; + + // Device-local memory (dispatch resources): + heaps[i++] = (iree_hal_allocator_memory_heap_t){ + .type = IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL, + .allowed_usage = + IREE_HAL_BUFFER_USAGE_TRANSFER | IREE_HAL_BUFFER_USAGE_DISPATCH, + .max_allocation_size = max_allocation_size, + .min_alignment = min_alignment, + }; + + if (allocator->supports_concurrent_managed_access) { + // Device-local managed memory with host mapping support: + heaps[i++] = (iree_hal_allocator_memory_heap_t){ + .type = IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL | + IREE_HAL_MEMORY_TYPE_HOST_VISIBLE | + IREE_HAL_MEMORY_TYPE_HOST_COHERENT, + .allowed_usage = IREE_HAL_BUFFER_USAGE_TRANSFER | + IREE_HAL_BUFFER_USAGE_DISPATCH | + IREE_HAL_BUFFER_USAGE_MAPPING, + .max_allocation_size = max_allocation_size, + .min_alignment = min_alignment, + }; + } + + // Write-combined page-locked host-local memory (upload): + heaps[i++] = (iree_hal_allocator_memory_heap_t){ + .type = IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE | + IREE_HAL_MEMORY_TYPE_HOST_LOCAL | + IREE_HAL_MEMORY_TYPE_HOST_COHERENT, + .allowed_usage = IREE_HAL_BUFFER_USAGE_TRANSFER | + IREE_HAL_BUFFER_USAGE_DISPATCH | + IREE_HAL_BUFFER_USAGE_MAPPING, + .max_allocation_size = max_allocation_size, + .min_alignment = min_alignment, + }; + + // Cached page-locked host-local memory (download): + heaps[i++] = (iree_hal_allocator_memory_heap_t){ + .type = IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE | + IREE_HAL_MEMORY_TYPE_HOST_LOCAL | + IREE_HAL_MEMORY_TYPE_HOST_COHERENT | + IREE_HAL_MEMORY_TYPE_HOST_CACHED, + .allowed_usage = IREE_HAL_BUFFER_USAGE_TRANSFER | + IREE_HAL_BUFFER_USAGE_DISPATCH | + IREE_HAL_BUFFER_USAGE_MAPPING, + .max_allocation_size = max_allocation_size, + .min_alignment = min_alignment, + }; + + IREE_ASSERT(i == count); + return iree_ok_status(); +} + +static iree_hal_buffer_compatibility_t +iree_hal_cuda2_allocator_query_buffer_compatibility( + iree_hal_allocator_t* IREE_RESTRICT base_allocator, + iree_hal_buffer_params_t* IREE_RESTRICT params, + iree_device_size_t* IREE_RESTRICT allocation_size) { + iree_hal_cuda2_allocator_t* allocator = + 
iree_hal_cuda2_allocator_cast(base_allocator); + + // All buffers can be allocated on the heap. + iree_hal_buffer_compatibility_t compatibility = + IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE; + + // Buffers are importable in CUDA under most cases, though performance may + // vary wildly. We don't fully verify that the buffer parameters are + // self-consistent and just look at whether we can get a device pointer. + if (iree_all_bits_set(params->type, IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE)) { + compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_IMPORTABLE; + } + + // Buffers can only be used on the queue if they are device visible. + if (iree_all_bits_set(params->type, IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE)) { + if (iree_any_bit_set(params->usage, IREE_HAL_BUFFER_USAGE_TRANSFER)) { + compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_TRANSFER; + } + if (iree_any_bit_set(params->usage, + IREE_HAL_BUFFER_USAGE_DISPATCH_STORAGE)) { + compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_DISPATCH; + } + } + + // If concurrent managed access is not supported then make device-local + + // host-visible allocations fall back to host-local + device-visible + // page-locked memory. This will be significantly slower for the device to + // access but the compiler only uses this type for readback staging buffers + // and it's better to function than function fast. + if (!allocator->supports_concurrent_managed_access && + iree_all_bits_set(params->type, IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL | + IREE_HAL_MEMORY_TYPE_HOST_VISIBLE)) { + compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_LOW_PERFORMANCE; + params->type &= ~(IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL | + IREE_HAL_MEMORY_TYPE_HOST_VISIBLE); + params->type |= + IREE_HAL_MEMORY_TYPE_HOST_LOCAL | IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE; + } + + // We are now optimal. + params->type &= ~IREE_HAL_MEMORY_TYPE_OPTIMAL; + + // Guard against the corner case where the requested buffer size is 0. The + // application is unlikely to do anything when requesting a 0-byte buffer; but + // it can happen in real world use cases. So we should at least not crash. 
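+  // CUDA rejects zero-byte allocations (cuMemAlloc and friends return
+  // CUDA_ERROR_INVALID_VALUE), so bump the size to a small non-zero value.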
+ if (*allocation_size == 0) *allocation_size = 4; + + return compatibility; +} + +static void iree_hal_cuda2_buffer_free( + const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols, + iree_hal_cuda2_buffer_type_t buffer_type, CUdeviceptr device_ptr, + void* host_ptr) { + IREE_TRACE_ZONE_BEGIN(z0); + switch (buffer_type) { + case IREE_HAL_CUDA_BUFFER_TYPE_DEVICE: { + IREE_TRACE_ZONE_APPEND_TEXT(z0, "cuMemFree"); + IREE_CUDA_IGNORE_ERROR(cuda_symbols, cuMemFree(device_ptr)); + break; + } + case IREE_HAL_CUDA_BUFFER_TYPE_HOST: { + IREE_TRACE_ZONE_APPEND_TEXT(z0, "cuMemFreeHost"); + IREE_CUDA_IGNORE_ERROR(cuda_symbols, cuMemFreeHost(host_ptr)); + break; + } + case IREE_HAL_CUDA_BUFFER_TYPE_HOST_REGISTERED: { + IREE_TRACE_ZONE_APPEND_TEXT(z0, "cuMemHostUnregister"); + IREE_CUDA_IGNORE_ERROR(cuda_symbols, cuMemHostUnregister(host_ptr)); + break; + } + case IREE_HAL_CUDA_BUFFER_TYPE_ASYNC: { + IREE_TRACE_ZONE_APPEND_TEXT(z0, "(ignored; async)"); + break; + } + } + IREE_TRACE_ZONE_END(z0); +} + +static iree_status_t iree_hal_cuda2_allocator_allocate_buffer( + iree_hal_allocator_t* IREE_RESTRICT base_allocator, + const iree_hal_buffer_params_t* IREE_RESTRICT params, + iree_device_size_t allocation_size, iree_const_byte_span_t initial_data, + iree_hal_buffer_t** IREE_RESTRICT out_buffer) { + IREE_ASSERT_ARGUMENT(base_allocator); + IREE_ASSERT_ARGUMENT(params); + IREE_ASSERT_ARGUMENT(out_buffer); + iree_hal_cuda2_allocator_t* allocator = + iree_hal_cuda2_allocator_cast(base_allocator); + + // Coerce options into those required by the current device. + iree_hal_buffer_params_t compat_params = *params; + iree_hal_buffer_compatibility_t compatibility = + iree_hal_cuda2_allocator_query_buffer_compatibility( + base_allocator, &compat_params, &allocation_size); + if (!iree_all_bits_set(compatibility, + IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE)) { +#if IREE_STATUS_MODE + iree_bitfield_string_temp_t temp0, temp1, temp2; + iree_string_view_t memory_type_str = + iree_hal_memory_type_format(params->type, &temp0); + iree_string_view_t usage_str = + iree_hal_buffer_usage_format(params->usage, &temp1); + iree_string_view_t compatibility_str = + iree_hal_buffer_compatibility_format(compatibility, &temp2); + return iree_make_status( + IREE_STATUS_INVALID_ARGUMENT, + "allocator cannot allocate a buffer with the given parameters; " + "memory_type=%.*s, usage=%.*s, compatibility=%.*s", + (int)memory_type_str.size, memory_type_str.data, (int)usage_str.size, + usage_str.data, (int)compatibility_str.size, compatibility_str.data); +#else + return iree_make_status( + IREE_STATUS_INVALID_ARGUMENT, + "allocator cannot allocate a buffer with the given parameters"); +#endif // IREE_STATUS_MODE + } + + iree_status_t status = iree_ok_status(); + iree_hal_cuda2_buffer_type_t buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE; + void* host_ptr = NULL; + CUdeviceptr device_ptr = 0; + IREE_TRACE_ZONE_BEGIN_NAMED(z0, "iree_hal_cuda2_buffer_allocate"); + IREE_TRACE_ZONE_APPEND_VALUE(z0, allocation_size); + if (iree_all_bits_set(compat_params.type, + IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL)) { + if (iree_all_bits_set(compat_params.type, + IREE_HAL_MEMORY_TYPE_HOST_VISIBLE)) { + // Device local + host visible. + buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE; + status = IREE_CURESULT_TO_STATUS( + allocator->symbols, cuMemAllocManaged(&device_ptr, allocation_size, + CU_MEM_ATTACH_GLOBAL)); + if (iree_status_is_ok(status) && + allocator->supports_concurrent_managed_access) { + // Prefetch the buffer to the GPU stream. 
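+        // Without the prefetch the managed pages would migrate to the device
+        // on first touch; prefetching while the allocation is still cold keeps
+        // dispatches from paying that page-fault cost.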
+ status = IREE_CURESULT_TO_STATUS( + allocator->symbols, + cuMemPrefetchAsync(device_ptr, allocation_size, allocator->device, + allocator->stream)); + } + host_ptr = (void*)device_ptr; + } else { + // Device only. + buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE; + status = IREE_CURESULT_TO_STATUS( + allocator->symbols, cuMemAlloc(&device_ptr, allocation_size)); + } + } else { + // Host local cases. + buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_HOST; + unsigned int flags = CU_MEMHOSTALLOC_DEVICEMAP; + if (!iree_all_bits_set(compat_params.type, + IREE_HAL_MEMORY_TYPE_HOST_CACHED)) { + flags |= CU_MEMHOSTALLOC_WRITECOMBINED; + } + status = IREE_CURESULT_TO_STATUS( + allocator->symbols, cuMemHostAlloc(&host_ptr, allocation_size, flags)); + if (iree_status_is_ok(status)) { + status = IREE_CURESULT_TO_STATUS( + allocator->symbols, + cuMemHostGetDevicePointer(&device_ptr, host_ptr, /*flags=*/0)); + } + } + IREE_TRACE_ZONE_END(z0); + + iree_hal_buffer_t* buffer = NULL; + if (iree_status_is_ok(status)) { + status = iree_hal_cuda2_buffer_wrap( + base_allocator, compat_params.type, compat_params.access, + compat_params.usage, allocation_size, + /*byte_offset=*/0, + /*byte_length=*/allocation_size, buffer_type, device_ptr, host_ptr, + iree_hal_buffer_release_callback_null(), + iree_hal_allocator_host_allocator(base_allocator), &buffer); + } + + // Copy the initial contents into the buffer. This may require staging. + if (iree_status_is_ok(status) && + !iree_const_byte_span_is_empty(initial_data)) { + status = iree_hal_device_transfer_range( + allocator->base_device, + iree_hal_make_host_transfer_buffer_span((void*)initial_data.data, + initial_data.data_length), + 0, iree_hal_make_device_transfer_buffer(buffer), 0, + initial_data.data_length, IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT, + iree_infinite_timeout()); + } + + if (iree_status_is_ok(status)) { + IREE_TRACE_ALLOC_NAMED(IREE_HAL_CUDA_ALLOCATOR_ID, + (void*)iree_hal_cuda2_buffer_device_pointer(buffer), + allocation_size); + IREE_STATISTICS(iree_hal_allocator_statistics_record_alloc( + &allocator->statistics, compat_params.type, allocation_size)); + *out_buffer = buffer; + } else { + if (!buffer) { + iree_hal_cuda2_buffer_free(allocator->symbols, buffer_type, device_ptr, + host_ptr); + } else { + iree_hal_buffer_release(buffer); + } + } + return status; +} + +static void iree_hal_cuda2_allocator_deallocate_buffer( + iree_hal_allocator_t* IREE_RESTRICT base_allocator, + iree_hal_buffer_t* IREE_RESTRICT base_buffer) { + IREE_ASSERT_ARGUMENT(base_allocator); + IREE_ASSERT_ARGUMENT(base_buffer); + iree_hal_cuda2_allocator_t* allocator = + iree_hal_cuda2_allocator_cast(base_allocator); + + const iree_hal_cuda2_buffer_type_t buffer_type = + iree_hal_cuda2_buffer_type(base_buffer); + + // WARNING: we may be called from a random thread and need to ensure that we + // have an active CUDA context. Unfortunately CUDA is CUDA and trying to + // change the context here will result in full device synchronization. In the + // future we'll need to do something fairly complex such as having a dedicated + // thread with a persistently bound context that does nothing but free + // buffers. The load on this will be lighter when queue-ordered allocations + // are used or any sort of pooling policy is applied. + // + // WARNING: with CUDA's lazy error propagation it's possible that by the time + // this code is running something else has triggered device loss and we can't + // actually use the context. 
In that case we can't perform the frees and want + // to silently ignore them: whatever the user tries to do next will fail in + // the same way and if we were deallocating this buffer as part of a tear-down + // on failure we don't want to end up dying during cleanup. + iree_hal_cuda2_buffer_free(allocator->symbols, buffer_type, + iree_hal_cuda2_buffer_device_pointer(base_buffer), + iree_hal_cuda2_buffer_host_pointer(base_buffer)); + + switch (buffer_type) { + case IREE_HAL_CUDA_BUFFER_TYPE_DEVICE: + case IREE_HAL_CUDA_BUFFER_TYPE_HOST: { + IREE_TRACE_FREE_NAMED( + IREE_HAL_CUDA_ALLOCATOR_ID, + (void*)iree_hal_cuda2_buffer_device_pointer(base_buffer)); + IREE_STATISTICS(iree_hal_allocator_statistics_record_free( + &allocator->statistics, iree_hal_buffer_memory_type(base_buffer), + iree_hal_buffer_allocation_size(base_buffer))); + break; + } + default: + // Buffer type not tracked. + break; + } + + iree_hal_buffer_destroy(base_buffer); +} + +static iree_status_t iree_hal_cuda2_allocator_import_buffer( + iree_hal_allocator_t* IREE_RESTRICT base_allocator, + const iree_hal_buffer_params_t* IREE_RESTRICT params, + iree_hal_external_buffer_t* IREE_RESTRICT external_buffer, + iree_hal_buffer_release_callback_t release_callback, + iree_hal_buffer_t** IREE_RESTRICT out_buffer) { + IREE_ASSERT_ARGUMENT(base_allocator); + IREE_ASSERT_ARGUMENT(params); + IREE_ASSERT_ARGUMENT(external_buffer); + IREE_ASSERT_ARGUMENT(out_buffer); + iree_hal_cuda2_allocator_t* allocator = + iree_hal_cuda2_allocator_cast(base_allocator); + + // Coerce options into those required by the current device. + iree_hal_buffer_params_t compat_params = *params; + iree_device_size_t allocation_size = external_buffer->size; + iree_hal_buffer_compatibility_t compatibility = + iree_hal_cuda2_allocator_query_buffer_compatibility( + base_allocator, &compat_params, &allocation_size); + if (!iree_all_bits_set(compatibility, + IREE_HAL_BUFFER_COMPATIBILITY_IMPORTABLE)) { +#if IREE_STATUS_MODE + iree_bitfield_string_temp_t temp0, temp1, temp2; + iree_string_view_t memory_type_str = + iree_hal_memory_type_format(params->type, &temp0); + iree_string_view_t usage_str = + iree_hal_buffer_usage_format(params->usage, &temp1); + iree_string_view_t compatibility_str = + iree_hal_buffer_compatibility_format(compatibility, &temp2); + return iree_make_status( + IREE_STATUS_INVALID_ARGUMENT, + "allocator cannot import a buffer with the given parameters; " + "memory_type=%.*s, usage=%.*s, compatibility=%.*s", + (int)memory_type_str.size, memory_type_str.data, (int)usage_str.size, + usage_str.data, (int)compatibility_str.size, compatibility_str.data); +#else + return iree_make_status( + IREE_STATUS_INVALID_ARGUMENT, + "allocator cannot import a buffer with the given parameters"); +#endif // IREE_STATUS_MODE + } + + iree_status_t status = iree_ok_status(); + iree_hal_cuda2_buffer_type_t buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE; + void* host_ptr = NULL; + CUdeviceptr device_ptr = 0; + + switch (external_buffer->type) { + case IREE_HAL_EXTERNAL_BUFFER_TYPE_HOST_ALLOCATION: { + if (iree_all_bits_set(compat_params.type, + IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL)) { + return iree_make_status( + IREE_STATUS_INVALID_ARGUMENT, + "unable to register host allocations as device-local memory"); + } + buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_HOST_REGISTERED; + host_ptr = external_buffer->handle.host_allocation.ptr; + uint32_t register_flags = 0; + if (compat_params.access == IREE_HAL_MEMORY_ACCESS_READ) { + register_flags = CU_MEMHOSTREGISTER_READ_ONLY; + } + if 
(iree_any_bit_set(compat_params.usage, + IREE_HAL_BUFFER_USAGE_DISPATCH_INDIRECT_PARAMS | + IREE_HAL_BUFFER_USAGE_DISPATCH_UNIFORM_READ | + IREE_HAL_BUFFER_USAGE_DISPATCH_STORAGE | + IREE_HAL_BUFFER_USAGE_DISPATCH_IMAGE)) { + register_flags = CU_MEMHOSTREGISTER_DEVICEMAP; + } + status = IREE_CURESULT_TO_STATUS( + allocator->symbols, + cuMemHostRegister(host_ptr, external_buffer->size, register_flags), + "cuMemHostRegister"); + if (iree_status_is_ok(status)) { + status = IREE_CURESULT_TO_STATUS( + allocator->symbols, + cuMemHostGetDevicePointer(&device_ptr, host_ptr, 0), + "cuMemHostGetDevicePointer"); + } + break; + } + case IREE_HAL_EXTERNAL_BUFFER_TYPE_OPAQUE_FD: + case IREE_HAL_EXTERNAL_BUFFER_TYPE_OPAQUE_WIN32: + return iree_make_status(IREE_STATUS_UNIMPLEMENTED, + "handle-based imports not yet implemented"); + default: + return iree_make_status(IREE_STATUS_UNIMPLEMENTED, + "external buffer type not supported"); + } + + iree_hal_buffer_t* buffer = NULL; + if (iree_status_is_ok(status)) { + status = iree_hal_cuda2_buffer_wrap( + base_allocator, compat_params.type, compat_params.access, + compat_params.usage, external_buffer->size, /*byte_offset=*/0, + /*byte_length=*/external_buffer->size, buffer_type, device_ptr, + host_ptr, release_callback, + iree_hal_allocator_host_allocator(base_allocator), &buffer); + } + + if (iree_status_is_ok(status)) { + *out_buffer = buffer; + } else { + if (!buffer) { + iree_hal_cuda2_buffer_free(allocator->symbols, buffer_type, device_ptr, + host_ptr); + } else { + iree_hal_buffer_release(buffer); + } + } + return status; +} + +static iree_status_t iree_hal_cuda2_allocator_export_buffer( + iree_hal_allocator_t* IREE_RESTRICT base_allocator, + iree_hal_buffer_t* IREE_RESTRICT buffer, + iree_hal_external_buffer_type_t requested_type, + iree_hal_external_buffer_flags_t requested_flags, + iree_hal_external_buffer_t* IREE_RESTRICT out_external_buffer) { + return iree_make_status(IREE_STATUS_UNIMPLEMENTED, + "exporting to external buffers not supported"); +} + +static const iree_hal_allocator_vtable_t iree_hal_cuda2_allocator_vtable = { + .destroy = iree_hal_cuda2_allocator_destroy, + .host_allocator = iree_hal_cuda2_allocator_host_allocator, + .trim = iree_hal_cuda2_allocator_trim, + .query_statistics = iree_hal_cuda2_allocator_query_statistics, + .query_memory_heaps = iree_hal_cuda2_allocator_query_memory_heaps, + .query_buffer_compatibility = + iree_hal_cuda2_allocator_query_buffer_compatibility, + .allocate_buffer = iree_hal_cuda2_allocator_allocate_buffer, + .deallocate_buffer = iree_hal_cuda2_allocator_deallocate_buffer, + .import_buffer = iree_hal_cuda2_allocator_import_buffer, + .export_buffer = iree_hal_cuda2_allocator_export_buffer, +}; diff --git a/experimental/cuda2/cuda_allocator.h b/experimental/cuda2/cuda_allocator.h new file mode 100644 index 000000000000..2ff33ea467c0 --- /dev/null +++ b/experimental/cuda2/cuda_allocator.h @@ -0,0 +1,33 @@ +// Copyright 2023 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef EXPERIMENTAL_CUDA2_CUDA_ALLOCATOR_H_ +#define EXPERIMENTAL_CUDA2_CUDA_ALLOCATOR_H_ + +#include "experimental/cuda2/cuda_dynamic_symbols.h" +#include "experimental/cuda2/memory_pools.h" +#include "iree/base/api.h" +#include "iree/hal/api.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// Creates a CUDA memory allocator. 
+// |device| and |stream| will be used for management operations. +// |pools| provides memory pools that may be shared across multiple allocators +// and the pointer must remain valid for the lifetime of the allocator. +iree_status_t iree_hal_cuda2_allocator_create( + iree_hal_device_t* base_device, + const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols, CUdevice device, + CUstream stream, iree_hal_cuda2_memory_pools_t* pools, + iree_allocator_t host_allocator, iree_hal_allocator_t** out_allocator); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // EXPERIMENTAL_CUDA2_CUDA_ALLOCATOR_H_ diff --git a/experimental/cuda2/cuda_buffer.c b/experimental/cuda2/cuda_buffer.c new file mode 100644 index 000000000000..e88c9e3b3de3 --- /dev/null +++ b/experimental/cuda2/cuda_buffer.c @@ -0,0 +1,166 @@ +// Copyright 2023 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "experimental/cuda2/cuda_buffer.h" + +#include +#include +#include + +#include "iree/base/api.h" +#include "iree/base/tracing.h" + +typedef struct iree_hal_cuda2_buffer_t { + iree_hal_buffer_t base; + iree_hal_cuda2_buffer_type_t type; + void* host_ptr; + CUdeviceptr device_ptr; + iree_hal_buffer_release_callback_t release_callback; +} iree_hal_cuda2_buffer_t; + +static const iree_hal_buffer_vtable_t iree_hal_cuda2_buffer_vtable; + +static iree_hal_cuda2_buffer_t* iree_hal_cuda2_buffer_cast( + iree_hal_buffer_t* base_value) { + IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda2_buffer_vtable); + return (iree_hal_cuda2_buffer_t*)base_value; +} + +static const iree_hal_cuda2_buffer_t* iree_hal_cuda2_buffer_const_cast( + const iree_hal_buffer_t* base_value) { + IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda2_buffer_vtable); + return (const iree_hal_cuda2_buffer_t*)base_value; +} + +iree_status_t iree_hal_cuda2_buffer_wrap( + iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type, + iree_hal_memory_access_t allowed_access, + iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size, + iree_device_size_t byte_offset, iree_device_size_t byte_length, + iree_hal_cuda2_buffer_type_t buffer_type, CUdeviceptr device_ptr, + void* host_ptr, iree_hal_buffer_release_callback_t release_callback, + iree_allocator_t host_allocator, iree_hal_buffer_t** out_buffer) { + IREE_ASSERT_ARGUMENT(out_buffer); + IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_cuda2_buffer_t* buffer = NULL; + iree_status_t status = + iree_allocator_malloc(host_allocator, sizeof(*buffer), (void**)&buffer); + if (iree_status_is_ok(status)) { + iree_hal_buffer_initialize(host_allocator, allocator, &buffer->base, + allocation_size, byte_offset, byte_length, + memory_type, allowed_access, allowed_usage, + &iree_hal_cuda2_buffer_vtable, &buffer->base); + buffer->type = buffer_type; + buffer->host_ptr = host_ptr; + buffer->device_ptr = device_ptr; + buffer->release_callback = release_callback; + *out_buffer = &buffer->base; + } + + IREE_TRACE_ZONE_END(z0); + return status; +} + +static void iree_hal_cuda2_buffer_destroy(iree_hal_buffer_t* base_buffer) { + iree_hal_cuda2_buffer_t* buffer = iree_hal_cuda2_buffer_cast(base_buffer); + iree_allocator_t host_allocator = base_buffer->host_allocator; + IREE_TRACE_ZONE_BEGIN(z0); + if (buffer->release_callback.fn) { + buffer->release_callback.fn(buffer->release_callback.user_data, + base_buffer); + } + iree_allocator_free(host_allocator, 
buffer); + IREE_TRACE_ZONE_END(z0); +} + +static iree_status_t iree_hal_cuda2_buffer_map_range( + iree_hal_buffer_t* base_buffer, iree_hal_mapping_mode_t mapping_mode, + iree_hal_memory_access_t memory_access, + iree_device_size_t local_byte_offset, iree_device_size_t local_byte_length, + iree_hal_buffer_mapping_t* mapping) { + IREE_ASSERT_ARGUMENT(base_buffer); + IREE_ASSERT_ARGUMENT(mapping); + iree_hal_cuda2_buffer_t* buffer = iree_hal_cuda2_buffer_cast(base_buffer); + + // TODO(benvanik): add upload/download for unmapped buffers. + IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_memory_type( + iree_hal_buffer_memory_type(base_buffer), + IREE_HAL_MEMORY_TYPE_HOST_VISIBLE)); + IREE_RETURN_IF_ERROR( + iree_hal_buffer_validate_usage(iree_hal_buffer_allowed_usage(base_buffer), + IREE_HAL_BUFFER_USAGE_MAPPING)); + + uint8_t* data_ptr = (uint8_t*)(buffer->host_ptr) + local_byte_offset; + // If we mapped for discard scribble over the bytes. This is not a mandated + // behavior but it will make debugging issues easier. Alternatively for + // heap buffers we could reallocate them such that ASAN yells, but that + // would only work if the entire buffer was discarded. +#ifndef NDEBUG + if (iree_any_bit_set(memory_access, IREE_HAL_MEMORY_ACCESS_DISCARD)) { + memset(data_ptr, 0xCD, local_byte_length); + } +#endif // !NDEBUG + + mapping->contents = iree_make_byte_span(data_ptr, local_byte_length); + return iree_ok_status(); +} + +static iree_status_t iree_hal_cuda2_buffer_unmap_range( + iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset, + iree_device_size_t local_byte_length, iree_hal_buffer_mapping_t* mapping) { + // Nothing to do today. + return iree_ok_status(); +} + +static iree_status_t iree_hal_cuda2_buffer_invalidate_range( + iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset, + iree_device_size_t local_byte_length) { + // Nothing to do today. + return iree_ok_status(); +} + +static iree_status_t iree_hal_cuda2_buffer_flush_range( + iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset, + iree_device_size_t local_byte_length) { + // Nothing to do today. 
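+  // Mappable buffers from this backend are page-locked host memory or managed
+  // memory exposed as host-coherent, so there is no cache maintenance to do.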
+ return iree_ok_status(); +} + +iree_hal_cuda2_buffer_type_t iree_hal_cuda2_buffer_type( + const iree_hal_buffer_t* base_buffer) { + const iree_hal_cuda2_buffer_t* buffer = + iree_hal_cuda2_buffer_const_cast(base_buffer); + return buffer->type; +} + +CUdeviceptr iree_hal_cuda2_buffer_device_pointer( + const iree_hal_buffer_t* base_buffer) { + const iree_hal_cuda2_buffer_t* buffer = + iree_hal_cuda2_buffer_const_cast(base_buffer); + return buffer->device_ptr; +} + +void* iree_hal_cuda2_buffer_host_pointer(const iree_hal_buffer_t* base_buffer) { + const iree_hal_cuda2_buffer_t* buffer = + iree_hal_cuda2_buffer_const_cast(base_buffer); + return buffer->host_ptr; +} + +void iree_hal_cuda2_buffer_drop_release_callback( + iree_hal_buffer_t* base_buffer) { + iree_hal_cuda2_buffer_t* buffer = iree_hal_cuda2_buffer_cast(base_buffer); + buffer->release_callback = iree_hal_buffer_release_callback_null(); +} + +static const iree_hal_buffer_vtable_t iree_hal_cuda2_buffer_vtable = { + .recycle = iree_hal_buffer_recycle, + .destroy = iree_hal_cuda2_buffer_destroy, + .map_range = iree_hal_cuda2_buffer_map_range, + .unmap_range = iree_hal_cuda2_buffer_unmap_range, + .invalidate_range = iree_hal_cuda2_buffer_invalidate_range, + .flush_range = iree_hal_cuda2_buffer_flush_range, +}; diff --git a/experimental/cuda2/cuda_buffer.h b/experimental/cuda2/cuda_buffer.h new file mode 100644 index 000000000000..23cade3c57ad --- /dev/null +++ b/experimental/cuda2/cuda_buffer.h @@ -0,0 +1,66 @@ +// Copyright 2023 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef EXPERIMENTAL_CUDA2_CUDA_BUFFER_H_ +#define EXPERIMENTAL_CUDA2_CUDA_BUFFER_H_ + +#include "experimental/cuda2/cuda_headers.h" +#include "iree/base/api.h" +#include "iree/hal/api.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +typedef enum iree_hal_cuda2_buffer_type_e { + // Device local buffer; allocated with cuMemAlloc/cuMemAllocManaged, freed + // with cuMemFree. + IREE_HAL_CUDA_BUFFER_TYPE_DEVICE = 0, + // Host local buffer; allocated with cuMemHostAlloc, freed with cuMemFreeHost. + IREE_HAL_CUDA_BUFFER_TYPE_HOST, + // Host local buffer; registered with cuMemHostRegister, freed with + // cuMemHostUnregister. + IREE_HAL_CUDA_BUFFER_TYPE_HOST_REGISTERED, + // Device local buffer, allocated with cuMemAllocFromPoolAsync, freed with + // cuMemFree/cuMemFreeAsync. + IREE_HAL_CUDA_BUFFER_TYPE_ASYNC, +} iree_hal_cuda2_buffer_type_t; + +// Wraps a CUDA allocation in an iree_hal_buffer_t. +iree_status_t iree_hal_cuda2_buffer_wrap( + iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type, + iree_hal_memory_access_t allowed_access, + iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size, + iree_device_size_t byte_offset, iree_device_size_t byte_length, + iree_hal_cuda2_buffer_type_t buffer_type, CUdeviceptr device_ptr, + void* host_ptr, iree_hal_buffer_release_callback_t release_callback, + iree_allocator_t host_allocator, iree_hal_buffer_t** out_buffer); + +// Returns the underlying CUDA buffer type of the given |buffer|. +iree_hal_cuda2_buffer_type_t iree_hal_cuda2_buffer_type( + const iree_hal_buffer_t* buffer); + +// Returns the CUDA base device pointer for the given |buffer|. +// +// Note that this is the entire allocated_buffer and must be offset by the +// buffer byte_offset and byte_length when used. 
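+//
+// For example, the device address of the logical buffer contents is:
+//   iree_hal_cuda2_buffer_device_pointer(buffer) +
+//       iree_hal_buffer_byte_offset(buffer)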
+CUdeviceptr iree_hal_cuda2_buffer_device_pointer( + const iree_hal_buffer_t* buffer); + +// Returns the CUDA host pointer for the given |buffer|, if available. +void* iree_hal_cuda2_buffer_host_pointer(const iree_hal_buffer_t* buffer); + +// Drops the release callback so that when the buffer is destroyed no callback +// will be made. This is not thread safe but all callers are expected to be +// holding an allocation and the earliest the buffer could be destroyed is after +// this call returns and the caller has released its reference. +void iree_hal_cuda2_buffer_drop_release_callback(iree_hal_buffer_t* buffer); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // EXPERIMENTAL_CUDA2_CUDA_BUFFER_H_ diff --git a/experimental/cuda2/cuda_dynamic_symbol_table.h b/experimental/cuda2/cuda_dynamic_symbol_table.h index b4aaa93fc750..fb8ff5a8ecd8 100644 --- a/experimental/cuda2/cuda_dynamic_symbol_table.h +++ b/experimental/cuda2/cuda_dynamic_symbol_table.h @@ -49,6 +49,18 @@ IREE_CU_PFN_DECL(cuMemHostAlloc, void**, size_t, unsigned int) IREE_CU_PFN_DECL(cuMemHostRegister, void*, size_t, unsigned int) IREE_CU_PFN_DECL(cuMemHostUnregister, void*) IREE_CU_PFN_DECL(cuMemHostGetDevicePointer, CUdeviceptr*, void*, unsigned int) +IREE_CU_PFN_DECL(cuMemPoolCreate, CUmemoryPool*, const CUmemPoolProps*) +IREE_CU_PFN_DECL(cuMemPoolDestroy, CUmemoryPool) +IREE_CU_PFN_DECL(cuMemPoolSetAccess, CUmemoryPool, const CUmemAccessDesc*, + size_t) +IREE_CU_PFN_DECL(cuMemPoolGetAttribute, CUmemoryPool, CUmemPool_attribute, + void*) +IREE_CU_PFN_DECL(cuMemPoolSetAttribute, CUmemoryPool, CUmemPool_attribute, + void*) +IREE_CU_PFN_DECL(cuMemPoolTrimTo, CUmemoryPool, size_t) +IREE_CU_PFN_DECL(cuMemAllocFromPoolAsync, CUdeviceptr*, size_t, CUmemoryPool, + CUstream) +IREE_CU_PFN_DECL(cuMemFreeAsync, CUdeviceptr dptr, CUstream hStream) IREE_CU_PFN_DECL(cuModuleGetFunction, CUfunction*, CUmodule, const char*) IREE_CU_PFN_DECL(cuModuleLoadDataEx, CUmodule*, const void*, unsigned int, CUjit_option*, void**) diff --git a/experimental/cuda2/memory_pools.c b/experimental/cuda2/memory_pools.c new file mode 100644 index 000000000000..e29c5121c51a --- /dev/null +++ b/experimental/cuda2/memory_pools.c @@ -0,0 +1,278 @@ +// Copyright 2023 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "experimental/cuda2/memory_pools.h" + +#include "experimental/cuda2/cuda_buffer.h" +#include "experimental/cuda2/cuda_dynamic_symbols.h" +#include "experimental/cuda2/cuda_status_util.h" +#include "iree/base/tracing.h" + +// NOTE: these are currently global for all devices; we could make +// device-specific ones by malloc() and leaking (with LSAN note) unique string +// values instead. 
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_ALLOCATION_TRACKING +static const char* IREE_HAL_CUDA_DEVICE_LOCAL_POOL_RESERVED_ID = + "CUDA pool: device-local reserved"; +static const char* IREE_HAL_CUDA_OTHER_POOL_RESERVED_ID = + "CUDA pool: other reserved"; +#endif // IREE_TRACING_FEATURE_ALLOCATION_TRACKING + +static iree_status_t iree_hal_cuda2_create_memory_pool( + const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols, CUdevice cu_device, + iree_hal_cuda2_memory_pool_params_t params, + CUmemoryPool* IREE_RESTRICT out_pool) { + *out_pool = NULL; + + CUmemPoolProps pool_props = { + .allocType = CU_MEM_ALLOCATION_TYPE_PINNED, + // TODO: allow sharing of certain pool memory types by fd/HANDLE. + .handleTypes = CU_MEM_HANDLE_TYPE_NONE, + .location = + { + .type = CU_MEM_LOCATION_TYPE_DEVICE, + .id = cu_device, + }, + .win32SecurityAttributes = NULL, + .reserved = {0}, + }; + + CUmemoryPool pool = NULL; + IREE_CUDA_RETURN_IF_ERROR(cuda_symbols, cuMemPoolCreate(&pool, &pool_props), + "cuMemPoolCreate"); + + iree_status_t status = IREE_CURESULT_TO_STATUS( + cuda_symbols, + cuMemPoolSetAttribute(pool, CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, + ¶ms.release_threshold), + "cuMemPoolSetAttribute"); + + if (iree_status_is_ok(status)) { + *out_pool = pool; + } else { + IREE_CUDA_IGNORE_ERROR(cuda_symbols, cuMemPoolDestroy(pool)); + } + return status; +} + +iree_status_t iree_hal_cuda2_memory_pools_initialize( + iree_allocator_t host_allocator, + const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols, CUdevice cu_device, + const iree_hal_cuda2_memory_pooling_params_t* pooling_params, + iree_hal_cuda2_memory_pools_t* IREE_RESTRICT out_pools) { + IREE_ASSERT_ARGUMENT(cuda_symbols); + IREE_ASSERT_ARGUMENT(pooling_params); + IREE_ASSERT_ARGUMENT(out_pools); + IREE_TRACE_ZONE_BEGIN(z0); + + memset(out_pools, 0, sizeof(*out_pools)); + out_pools->cuda_symbols = cuda_symbols; + out_pools->host_allocator = host_allocator; + + iree_status_t status = iree_ok_status(); + + if (iree_status_is_ok(status)) { + status = iree_hal_cuda2_create_memory_pool(cuda_symbols, cu_device, + pooling_params->device_local, + &out_pools->device_local); + } + + if (iree_status_is_ok(status)) { + status = iree_hal_cuda2_create_memory_pool( + cuda_symbols, cu_device, pooling_params->other, &out_pools->other); + } + + IREE_TRACE_ZONE_END(z0); + return status; +} + +void iree_hal_cuda2_memory_pools_deinitialize( + iree_hal_cuda2_memory_pools_t* pools) { + IREE_TRACE_ZONE_BEGIN(z0); + + if (pools->device_local) { + IREE_CUDA_IGNORE_ERROR(pools->cuda_symbols, + cuMemPoolDestroy(pools->device_local)); + pools->device_local = NULL; + } + + if (pools->other) { + IREE_CUDA_IGNORE_ERROR(pools->cuda_symbols, cuMemPoolDestroy(pools->other)); + pools->other = NULL; + } + + IREE_TRACE_ZONE_END(z0); +} + +static void iree_hal_cuda2_memory_pool_track_alloc( + iree_hal_cuda2_memory_pools_t* pools, iree_hal_buffer_t* buffer) { + bool is_device_local = iree_all_bits_set(iree_hal_buffer_memory_type(buffer), + IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL); + (void)is_device_local; + iree_device_size_t allocation_size = iree_hal_buffer_allocation_size(buffer); + (void)allocation_size; + IREE_TRACE_ALLOC_NAMED( + is_device_local ? IREE_HAL_CUDA_DEVICE_LOCAL_POOL_RESERVED_ID + : IREE_HAL_CUDA_OTHER_POOL_RESERVED_ID, + (void*)iree_hal_cuda2_buffer_device_pointer(buffer), allocation_size); + IREE_STATISTICS({ + iree_atomic_int64_t* bytes_allocated = + is_device_local ? 
&pools->statistics.device_bytes_allocated + : &pools->statistics.host_bytes_allocated; + iree_atomic_fetch_add_int64(bytes_allocated, allocation_size, + iree_memory_order_relaxed); + }); +} + +static void iree_hal_cuda2_memory_pool_track_free( + iree_hal_cuda2_memory_pools_t* pools, iree_hal_buffer_t* buffer) { + bool is_device_local = iree_all_bits_set(iree_hal_buffer_memory_type(buffer), + IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL); + (void)is_device_local; + IREE_TRACE_FREE_NAMED(is_device_local + ? IREE_HAL_CUDA_DEVICE_LOCAL_POOL_RESERVED_ID + : IREE_HAL_CUDA_OTHER_POOL_RESERVED_ID, + (void*)iree_hal_cuda2_buffer_device_pointer(buffer)); + IREE_STATISTICS({ + iree_atomic_int64_t* bytes_freed = + is_device_local ? &pools->statistics.device_bytes_freed + : &pools->statistics.host_bytes_freed; + iree_device_size_t allocation_size = + iree_hal_buffer_allocation_size(buffer); + iree_atomic_fetch_add_int64(bytes_freed, allocation_size, + iree_memory_order_relaxed); + }); +} + +void iree_hal_cuda2_memory_pools_merge_statistics( + iree_hal_cuda2_memory_pools_t* pools, + iree_hal_allocator_statistics_t* statistics) { + IREE_STATISTICS({ + statistics->device_bytes_allocated = iree_atomic_load_int64( + &pools->statistics.device_bytes_allocated, iree_memory_order_relaxed); + statistics->host_bytes_allocated = iree_atomic_load_int64( + &pools->statistics.host_bytes_allocated, iree_memory_order_relaxed); + statistics->device_bytes_freed = iree_atomic_load_int64( + &pools->statistics.device_bytes_freed, iree_memory_order_relaxed); + statistics->host_bytes_freed = iree_atomic_load_int64( + &pools->statistics.host_bytes_freed, iree_memory_order_relaxed); + if (pools->device_local) { + cuuint64_t pool_peak = 0; + IREE_CUDA_IGNORE_ERROR( + pools->cuda_symbols, + cuMemPoolGetAttribute(pools->device_local, + CU_MEMPOOL_ATTR_USED_MEM_HIGH, &pool_peak)); + statistics->device_bytes_peak += (iree_device_size_t)pool_peak; + } + if (pools->other) { + cuuint64_t pool_peak = 0; + IREE_CUDA_IGNORE_ERROR( + pools->cuda_symbols, + cuMemPoolGetAttribute(pools->other, CU_MEMPOOL_ATTR_USED_MEM_HIGH, + &pool_peak)); + statistics->host_bytes_peak += (iree_device_size_t)pool_peak; + } + }); +} + +// NOTE: this is only issued if the buffer is destroyed without having had been +// scheduled for deallocation asynchronously. When a buffer is scheduled we drop +// the release callback so that this isn't called and we don't double-free. 
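+// Because it runs outside of any queue ordering it falls back to a blocking
+// cuMemFree instead of the stream-ordered cuMemFreeAsync path.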
+static void iree_hal_cuda2_async_buffer_release_callback( + void* user_data, iree_hal_buffer_t* buffer) { + iree_hal_cuda2_memory_pools_t* pools = + (iree_hal_cuda2_memory_pools_t*)user_data; + IREE_TRACE_ZONE_BEGIN(z0); + + CUdeviceptr device_ptr = iree_hal_cuda2_buffer_device_pointer(buffer); + IREE_CUDA_IGNORE_ERROR(pools->cuda_symbols, cuMemFree(device_ptr)); + iree_hal_cuda2_memory_pool_track_free(pools, buffer); + + IREE_TRACE_ZONE_END(z0); +} + +iree_status_t iree_hal_cuda2_memory_pools_alloca( + iree_hal_cuda2_memory_pools_t* pools, CUstream stream, + iree_hal_allocator_pool_t pool, iree_hal_buffer_params_t params, + iree_device_size_t allocation_size, + iree_hal_buffer_t** IREE_RESTRICT out_buffer) { + IREE_TRACE_ZONE_BEGIN(z0); + IREE_TRACE_ZONE_APPEND_VALUE(z0, (int64_t)allocation_size); + + iree_hal_buffer_params_canonicalize(¶ms); + + // TODO: more pools and better selection; this is coarsely deciding between + // only device local (variables, constants, transients) and other (staging, + // external) but could use more buffer properties (including usage/export + // flags) to better isolate the different usage patterns and keep the pools + // operating with reasonable limits. We should be using the |pool| arg. + CUmemoryPool memory_pool = + iree_all_bits_set(params.type, IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL) + ? pools->device_local + : pools->other; + + CUdeviceptr device_ptr = 0; + iree_status_t status = IREE_CURESULT_TO_STATUS( + pools->cuda_symbols, + cuMemAllocFromPoolAsync(&device_ptr, (size_t)allocation_size, memory_pool, + stream), + "cuMemAllocFromPoolAsync"); + + // Wrap the allocated CUDA buffer in a HAL buffer. + // NOTE: we don't provide a device allocator because we didn't allocate from + // one and instead we use a release callback to perform the free if the user + // doesn't dealloca the buffer. + iree_hal_buffer_t* buffer = NULL; + if (iree_status_is_ok(status)) { + iree_hal_buffer_release_callback_t release_callback = { + .fn = iree_hal_cuda2_async_buffer_release_callback, + .user_data = pools, + }; + status = iree_hal_cuda2_buffer_wrap( + /*device_allocator=*/NULL, params.type, params.access, params.usage, + allocation_size, /*byte_offset=*/0, + /*byte_length=*/allocation_size, IREE_HAL_CUDA_BUFFER_TYPE_ASYNC, + device_ptr, /*host_ptr=*/NULL, release_callback, pools->host_allocator, + &buffer); + } + + if (iree_status_is_ok(status)) { + // Update statistics (note that it may not yet be accurate). + iree_hal_cuda2_memory_pool_track_alloc(pools, buffer); + *out_buffer = buffer; + } else if (buffer) { + iree_hal_buffer_release(buffer); + } else { + IREE_CUDA_IGNORE_ERROR(pools->cuda_symbols, + cuMemFreeAsync(device_ptr, stream)); + } + + IREE_TRACE_ZONE_END(z0); + return status; +} + +iree_status_t iree_hal_cuda2_memory_pools_dealloca( + iree_hal_cuda2_memory_pools_t* pools, CUstream stream, + iree_hal_buffer_t* buffer) { + IREE_TRACE_ZONE_BEGIN(z0); + IREE_TRACE_ZONE_APPEND_VALUE( + z0, (int64_t)iree_hal_buffer_allocation_size(buffer)); + + // Try to schedule the buffer for freeing. + CUdeviceptr device_ptr = iree_hal_cuda2_buffer_device_pointer(buffer); + iree_status_t status = IREE_CURESULT_TO_STATUS( + pools->cuda_symbols, cuMemFreeAsync(device_ptr, stream), + "cuMemFreeAsync"); + + // Drop the release callback so that we don't try to double-free the buffer. + iree_hal_cuda2_buffer_drop_release_callback(buffer); + + // Update statistics (note that it may not yet be accurate). 
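+  // The free above is stream-ordered: the memory is only returned to the pool
+  // once the stream reaches it, while these counters update immediately.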
+ iree_hal_cuda2_memory_pool_track_free(pools, buffer); + + IREE_TRACE_ZONE_END(z0); + return status; +} diff --git a/experimental/cuda2/memory_pools.h b/experimental/cuda2/memory_pools.h new file mode 100644 index 000000000000..8eccf9ef7105 --- /dev/null +++ b/experimental/cuda2/memory_pools.h @@ -0,0 +1,73 @@ +// Copyright 2023 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef IREE_HAL_DRIVERS_CUDA_MEMORY_POOLS_H_ +#define IREE_HAL_DRIVERS_CUDA_MEMORY_POOLS_H_ + +#include "experimental/cuda2/api.h" +#include "experimental/cuda2/cuda_dynamic_symbols.h" +#include "experimental/cuda2/cuda_headers.h" +#include "iree/base/api.h" +#include "iree/base/internal/atomics.h" +#include "iree/hal/api.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// Retained CUDA memory pools for various allocation types. +typedef struct iree_hal_cuda2_memory_pools_t { + // Used exclusively for DEVICE_LOCAL allocations. + CUmemoryPool device_local; + // Used for any host-visible/host-local memory types. + CUmemoryPool other; + + const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols; + iree_allocator_t host_allocator; + + IREE_STATISTICS(struct { + iree_atomic_int64_t device_bytes_allocated; + iree_atomic_int64_t device_bytes_freed; + iree_atomic_int64_t host_bytes_allocated; + iree_atomic_int64_t host_bytes_freed; + } statistics;) +} iree_hal_cuda2_memory_pools_t; + +// Initializes |out_pools| by configuring new CUDA memory pools. +iree_status_t iree_hal_cuda2_memory_pools_initialize( + iree_allocator_t host_allocator, + const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols, CUdevice cu_device, + const iree_hal_cuda2_memory_pooling_params_t* pooling_params, + iree_hal_cuda2_memory_pools_t* IREE_RESTRICT out_pools); + +// Deinitializes the |pools| and releases the underlying CUDA resources. +void iree_hal_cuda2_memory_pools_deinitialize( + iree_hal_cuda2_memory_pools_t* pools); + +// Merges statistics information from |pools| into |statistics|. +void iree_hal_cuda2_memory_pools_merge_statistics( + iree_hal_cuda2_memory_pools_t* pools, + iree_hal_allocator_statistics_t* statistics); + +// Asynchronously allocates a buffer from an appropriate pool. +// The allocation will be stream-ordered on |stream|. +iree_status_t iree_hal_cuda2_memory_pools_alloca( + iree_hal_cuda2_memory_pools_t* pools, CUstream stream, + iree_hal_allocator_pool_t pool, iree_hal_buffer_params_t params, + iree_device_size_t allocation_size, + iree_hal_buffer_t** IREE_RESTRICT out_buffer); + +// Asynchronously deallocates a buffer from its pool. +// The deallocation will be stream-ordered on |stream|. +iree_status_t iree_hal_cuda2_memory_pools_dealloca( + iree_hal_cuda2_memory_pools_t* pools, CUstream stream, + iree_hal_buffer_t* buffer); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // IREE_HAL_DRIVERS_CUDA_MEMORY_POOLS_H_
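
For context, a minimal sketch of how a device implementation might wire these pieces together and use the queue-ordered allocation path. It is not part of this patch: the surrounding state (base_device, cu_device, cu_stream, cuda_symbols, host_allocator) is assumed to already exist inside a status-returning setup function, the sizes and the IREE_HAL_ALLOCATOR_POOL_DEFAULT selector are illustrative, and teardown (releasing the allocator and calling iree_hal_cuda2_memory_pools_deinitialize) is elided.

  // Describe the stream-ordered pooling behavior for this device.
  iree_hal_cuda2_memory_pooling_params_t pooling_params = {
      .device_local = {.release_threshold = 64 * 1024 * 1024},
      .other = {.release_threshold = 0},
  };

  // Create the CUmemoryPools backing queue-ordered allocations.
  iree_hal_cuda2_memory_pools_t pools;
  IREE_RETURN_IF_ERROR(iree_hal_cuda2_memory_pools_initialize(
      host_allocator, cuda_symbols, cu_device, &pooling_params, &pools));

  // Create the HAL allocator that services synchronous allocations and merges
  // pool statistics when queried.
  iree_hal_allocator_t* device_allocator = NULL;
  IREE_RETURN_IF_ERROR(iree_hal_cuda2_allocator_create(
      base_device, cuda_symbols, cu_device, cu_stream, &pools, host_allocator,
      &device_allocator));

  // Queue-ordered allocation and deallocation on cu_stream.
  iree_hal_buffer_params_t buffer_params = {
      .type = IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL,
      .usage = IREE_HAL_BUFFER_USAGE_TRANSFER | IREE_HAL_BUFFER_USAGE_DISPATCH,
  };
  iree_hal_buffer_t* buffer = NULL;
  IREE_RETURN_IF_ERROR(iree_hal_cuda2_memory_pools_alloca(
      &pools, cu_stream, IREE_HAL_ALLOCATOR_POOL_DEFAULT, buffer_params,
      /*allocation_size=*/1024 * 1024, &buffer));
  // ... record work that uses the buffer on cu_stream ...
  IREE_RETURN_IF_ERROR(
      iree_hal_cuda2_memory_pools_dealloca(&pools, cu_stream, buffer));
  iree_hal_buffer_release(buffer);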