diff --git a/experimental/cuda2/cuda_allocator.c b/experimental/cuda2/cuda_allocator.c new file mode 100644 index 000000000000..4d22a01fe4fe --- /dev/null +++ b/experimental/cuda2/cuda_allocator.c @@ -0,0 +1,582 @@ +// Copyright 2021 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree/hal/drivers/cuda/cuda_allocator.h" + +#include + +#include "iree/base/api.h" +#include "iree/base/tracing.h" +#include "iree/hal/drivers/cuda/cuda_buffer.h" +#include "iree/hal/drivers/cuda/dynamic_symbols.h" +#include "iree/hal/drivers/cuda/status_util.h" + +#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_ALLOCATION_TRACKING +static const char* IREE_HAL_CUDA_ALLOCATOR_ID = "CUDA"; +#endif // IREE_TRACING_FEATURE_ALLOCATION_TRACKING + +typedef struct iree_hal_cuda_allocator_t { + iree_hal_resource_t resource; + iree_hal_device_t* base_device; + iree_hal_cuda_context_wrapper_t* context; + CUdevice device; + CUstream stream; + bool supports_concurrent_managed_access; + + IREE_STATISTICS(iree_hal_allocator_statistics_t statistics;) +} iree_hal_cuda_allocator_t; + +static const iree_hal_allocator_vtable_t iree_hal_cuda_allocator_vtable; + +static iree_hal_cuda_allocator_t* iree_hal_cuda_allocator_cast( + iree_hal_allocator_t* base_value) { + IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda_allocator_vtable); + return (iree_hal_cuda_allocator_t*)base_value; +} + +iree_status_t iree_hal_cuda_allocator_create( + iree_hal_device_t* base_device, iree_hal_cuda_context_wrapper_t* context, + CUdevice device, CUstream stream, iree_hal_allocator_t** out_allocator) { + IREE_ASSERT_ARGUMENT(base_device); + IREE_ASSERT_ARGUMENT(context); + IREE_TRACE_ZONE_BEGIN(z0); + + // To support device-local + host-visible memory we need concurrent managed + // access indicating that the host and devices can concurrently access the + // device memory. If we don't have this feature then we fall back to forcing + // all device-local + host-visible memory into host-local + device-visible + // page-locked memory. The compiler tries to avoid this for high-traffic + // buffers except for readback staging buffers. + int supports_concurrent_managed_access = 0; + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, CU_RESULT_TO_STATUS( + context->syms, + cuDeviceGetAttribute( + &supports_concurrent_managed_access, + CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, device), + "cuDeviceGetAttribute")); + + IREE_TRACE_ZONE_APPEND_TEXT( + z0, supports_concurrent_managed_access + ? 
"has CONCURRENT_MANAGED_ACCESS" + : "no CONCURRENT_MANAGED_ACCESS (expect slow accesses on " + "device-local + host-visible memory)"); + + iree_hal_cuda_allocator_t* allocator = NULL; + iree_status_t status = iree_allocator_malloc( + context->host_allocator, sizeof(*allocator), (void**)&allocator); + if (iree_status_is_ok(status)) { + iree_hal_resource_initialize(&iree_hal_cuda_allocator_vtable, + &allocator->resource); + allocator->base_device = base_device; + allocator->context = context; + allocator->device = device; + allocator->stream = stream; + allocator->supports_concurrent_managed_access = + supports_concurrent_managed_access != 0; + *out_allocator = (iree_hal_allocator_t*)allocator; + } + + IREE_TRACE_ZONE_END(z0); + return status; +} + +static void iree_hal_cuda_allocator_destroy( + iree_hal_allocator_t* IREE_RESTRICT base_allocator) { + iree_hal_cuda_allocator_t* allocator = + iree_hal_cuda_allocator_cast(base_allocator); + iree_allocator_t host_allocator = allocator->context->host_allocator; + IREE_TRACE_ZONE_BEGIN(z0); + + iree_allocator_free(host_allocator, allocator); + + IREE_TRACE_ZONE_END(z0); +} + +static iree_allocator_t iree_hal_cuda_allocator_host_allocator( + const iree_hal_allocator_t* IREE_RESTRICT base_allocator) { + iree_hal_cuda_allocator_t* allocator = + (iree_hal_cuda_allocator_t*)base_allocator; + return allocator->context->host_allocator; +} + +static iree_status_t iree_hal_cuda_allocator_trim( + iree_hal_allocator_t* IREE_RESTRICT base_allocator) { + return iree_ok_status(); +} + +static void iree_hal_cuda_allocator_query_statistics( + iree_hal_allocator_t* IREE_RESTRICT base_allocator, + iree_hal_allocator_statistics_t* IREE_RESTRICT out_statistics) { + IREE_STATISTICS({ + iree_hal_cuda_allocator_t* allocator = + iree_hal_cuda_allocator_cast(base_allocator); + memcpy(out_statistics, &allocator->statistics, sizeof(*out_statistics)); + }); +} + +static iree_status_t iree_hal_cuda_allocator_query_memory_heaps( + iree_hal_allocator_t* IREE_RESTRICT base_allocator, + iree_host_size_t capacity, + iree_hal_allocator_memory_heap_t* IREE_RESTRICT heaps, + iree_host_size_t* IREE_RESTRICT out_count) { + iree_hal_cuda_allocator_t* allocator = + iree_hal_cuda_allocator_cast(base_allocator); + + // TODO(benvanik): check CU_DEVICE_ATTRIBUTE_INTEGRATED and return a unified + // set of heaps (likely still a cached and uncached, at minimum). + iree_host_size_t count = 3; + if (allocator->supports_concurrent_managed_access) { + ++count; // device-local | host-visible + } + if (out_count) *out_count = count; + if (capacity < count) { + // NOTE: lightweight as this is hit in normal pre-sizing usage. + return iree_status_from_code(IREE_STATUS_OUT_OF_RANGE); + } + + // Don't think there's a query for these. + // Max allocation size may be much smaller in certain memory types such as + // page-locked memory and it'd be good to enforce that. 
+ const iree_device_size_t max_allocation_size = ~(iree_device_size_t)0; + const iree_device_size_t min_alignment = 64; + + int i = 0; + + // Device-local memory (dispatch resources): + heaps[i++] = (iree_hal_allocator_memory_heap_t){ + .type = IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL, + .allowed_usage = + IREE_HAL_BUFFER_USAGE_TRANSFER | IREE_HAL_BUFFER_USAGE_DISPATCH, + .max_allocation_size = max_allocation_size, + .min_alignment = min_alignment, + }; + + if (allocator->supports_concurrent_managed_access) { + // Device-local managed memory with host mapping support: + heaps[i++] = (iree_hal_allocator_memory_heap_t){ + .type = IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL | + IREE_HAL_MEMORY_TYPE_HOST_VISIBLE | + IREE_HAL_MEMORY_TYPE_HOST_COHERENT, + .allowed_usage = IREE_HAL_BUFFER_USAGE_TRANSFER | + IREE_HAL_BUFFER_USAGE_DISPATCH | + IREE_HAL_BUFFER_USAGE_MAPPING, + .max_allocation_size = max_allocation_size, + .min_alignment = min_alignment, + }; + } + + // Write-combined page-locked host-local memory (upload): + heaps[i++] = (iree_hal_allocator_memory_heap_t){ + .type = IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE | + IREE_HAL_MEMORY_TYPE_HOST_LOCAL | + IREE_HAL_MEMORY_TYPE_HOST_COHERENT, + .allowed_usage = IREE_HAL_BUFFER_USAGE_TRANSFER | + IREE_HAL_BUFFER_USAGE_DISPATCH | + IREE_HAL_BUFFER_USAGE_MAPPING, + .max_allocation_size = max_allocation_size, + .min_alignment = min_alignment, + }; + + // Cached page-locked host-local memory (download): + heaps[i++] = (iree_hal_allocator_memory_heap_t){ + .type = IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE | + IREE_HAL_MEMORY_TYPE_HOST_LOCAL | + IREE_HAL_MEMORY_TYPE_HOST_COHERENT | + IREE_HAL_MEMORY_TYPE_HOST_CACHED, + .allowed_usage = IREE_HAL_BUFFER_USAGE_TRANSFER | + IREE_HAL_BUFFER_USAGE_DISPATCH | + IREE_HAL_BUFFER_USAGE_MAPPING, + .max_allocation_size = max_allocation_size, + .min_alignment = min_alignment, + }; + + IREE_ASSERT(i == count); + return iree_ok_status(); +} + +static iree_hal_buffer_compatibility_t +iree_hal_cuda_allocator_query_buffer_compatibility( + iree_hal_allocator_t* IREE_RESTRICT base_allocator, + iree_hal_buffer_params_t* IREE_RESTRICT params, + iree_device_size_t* IREE_RESTRICT allocation_size) { + iree_hal_cuda_allocator_t* allocator = + iree_hal_cuda_allocator_cast(base_allocator); + + // All buffers can be allocated on the heap. + iree_hal_buffer_compatibility_t compatibility = + IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE; + + // Buffers are importable in CUDA under most cases, though performance may + // vary wildly. We don't fully verify that the buffer parameters are + // self-consistent and just look at whether we can get a device pointer. + if (iree_all_bits_set(params->type, IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE)) { + compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_IMPORTABLE; + } + + // Buffers can only be used on the queue if they are device visible. + if (iree_all_bits_set(params->type, IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE)) { + if (iree_any_bit_set(params->usage, IREE_HAL_BUFFER_USAGE_TRANSFER)) { + compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_TRANSFER; + } + if (iree_any_bit_set(params->usage, + IREE_HAL_BUFFER_USAGE_DISPATCH_STORAGE)) { + compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_DISPATCH; + } + } + + // If concurrent managed access is not supported then make device-local + + // host-visible allocations fall back to host-local + device-visible + // page-locked memory. 
This will be significantly slower for the device to + // access but the compiler only uses this type for readback staging buffers + // and it's better to function than function fast. + if (!allocator->supports_concurrent_managed_access && + iree_all_bits_set(params->type, IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL | + IREE_HAL_MEMORY_TYPE_HOST_VISIBLE)) { + compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_LOW_PERFORMANCE; + params->type &= ~(IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL | + IREE_HAL_MEMORY_TYPE_HOST_VISIBLE); + params->type |= + IREE_HAL_MEMORY_TYPE_HOST_LOCAL | IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE; + } + + // We are now optimal. + params->type &= ~IREE_HAL_MEMORY_TYPE_OPTIMAL; + + // Guard against the corner case where the requested buffer size is 0. The + // application is unlikely to do anything when requesting a 0-byte buffer; but + // it can happen in real world use cases. So we should at least not crash. + if (*allocation_size == 0) *allocation_size = 4; + + return compatibility; +} + +static void iree_hal_cuda_buffer_free(iree_hal_cuda_context_wrapper_t* context, + iree_hal_cuda_buffer_type_t buffer_type, + CUdeviceptr device_ptr, void* host_ptr) { + IREE_TRACE_ZONE_BEGIN(z0); + switch (buffer_type) { + case IREE_HAL_CUDA_BUFFER_TYPE_DEVICE: { + IREE_TRACE_ZONE_APPEND_TEXT(z0, "cuMemFree"); + CUDA_IGNORE_ERROR(context->syms, cuMemFree(device_ptr)); + break; + } + case IREE_HAL_CUDA_BUFFER_TYPE_HOST: { + IREE_TRACE_ZONE_APPEND_TEXT(z0, "cuMemFreeHost"); + CUDA_IGNORE_ERROR(context->syms, cuMemFreeHost(host_ptr)); + break; + } + case IREE_HAL_CUDA_BUFFER_TYPE_HOST_REGISTERED: { + IREE_TRACE_ZONE_APPEND_TEXT(z0, "cuMemHostUnregister"); + CUDA_IGNORE_ERROR(context->syms, cuMemHostUnregister(host_ptr)); + break; + } + } + IREE_TRACE_ZONE_END(z0); +} + +static iree_status_t iree_hal_cuda_allocator_allocate_buffer( + iree_hal_allocator_t* IREE_RESTRICT base_allocator, + const iree_hal_buffer_params_t* IREE_RESTRICT params, + iree_device_size_t allocation_size, iree_const_byte_span_t initial_data, + iree_hal_buffer_t** IREE_RESTRICT out_buffer) { + iree_hal_cuda_allocator_t* allocator = + iree_hal_cuda_allocator_cast(base_allocator); + + // Coerce options into those required by the current device. 
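Illustrative aside (not part of this patch): the coercion below is what lets a caller ask for device-local + host-visible memory without knowing whether the device supports concurrent managed access. A hedged sketch of the caller side, assuming the iree_hal_allocator_allocate_buffer wrapper of this era (params by value, initial_data span) and a placeholder device_allocator:

  const iree_hal_buffer_params_t params = {
      .type = IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL |
              IREE_HAL_MEMORY_TYPE_HOST_VISIBLE,
      .access = IREE_HAL_MEMORY_ACCESS_ALL,
      .usage = IREE_HAL_BUFFER_USAGE_TRANSFER | IREE_HAL_BUFFER_USAGE_MAPPING,
  };
  iree_hal_buffer_t* buffer = NULL;
  // Without CONCURRENT_MANAGED_ACCESS the allocator rewrites the requested
  // type to host-local + device-visible page-locked memory, as described
  // above, instead of failing the allocation.
  iree_status_t status = iree_hal_allocator_allocate_buffer(
      device_allocator, params, /*allocation_size=*/4096,
      iree_const_byte_span_empty(), &buffer);
  if (iree_status_is_ok(status)) {
    // ... map or transfer into the buffer ...
    iree_hal_buffer_release(buffer);
  }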
+ iree_hal_buffer_params_t compat_params = *params; + iree_hal_buffer_compatibility_t compatibility = + iree_hal_cuda_allocator_query_buffer_compatibility( + base_allocator, &compat_params, &allocation_size); + if (!iree_all_bits_set(compatibility, + IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE)) { +#if IREE_STATUS_MODE + iree_bitfield_string_temp_t temp0, temp1, temp2; + iree_string_view_t memory_type_str = + iree_hal_memory_type_format(params->type, &temp0); + iree_string_view_t usage_str = + iree_hal_buffer_usage_format(params->usage, &temp1); + iree_string_view_t compatibility_str = + iree_hal_buffer_compatibility_format(compatibility, &temp2); + return iree_make_status( + IREE_STATUS_INVALID_ARGUMENT, + "allocator cannot allocate a buffer with the given parameters; " + "memory_type=%.*s, usage=%.*s, compatibility=%.*s", + (int)memory_type_str.size, memory_type_str.data, (int)usage_str.size, + usage_str.data, (int)compatibility_str.size, compatibility_str.data); +#else + return iree_make_status( + IREE_STATUS_INVALID_ARGUMENT, + "allocator cannot allocate a buffer with the given parameters"); +#endif // IREE_STATUS_MODE + } + + iree_status_t status = iree_ok_status(); + iree_hal_cuda_buffer_type_t buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE; + void* host_ptr = NULL; + CUdeviceptr device_ptr = 0; + IREE_TRACE_ZONE_BEGIN_NAMED(z0, "iree_hal_cuda_buffer_allocate"); + IREE_TRACE_ZONE_APPEND_VALUE(z0, allocation_size); + if (iree_all_bits_set(compat_params.type, + IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL)) { + // Device local case. + if (iree_all_bits_set(compat_params.type, + IREE_HAL_MEMORY_TYPE_HOST_VISIBLE)) { + buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE; + status = + CU_RESULT_TO_STATUS(allocator->context->syms, + cuMemAllocManaged(&device_ptr, allocation_size, + CU_MEM_ATTACH_GLOBAL)); + if (iree_status_is_ok(status) && + allocator->supports_concurrent_managed_access) { + // Prefetch the buffer on the GPU device. + status = CU_RESULT_TO_STATUS( + allocator->context->syms, + cuMemPrefetchAsync(device_ptr, allocation_size, allocator->device, + allocator->stream)); + } + host_ptr = (void*)device_ptr; + } else { + // Device only. + buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE; + status = CU_RESULT_TO_STATUS(allocator->context->syms, + cuMemAlloc(&device_ptr, allocation_size)); + } + } else { + buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_HOST; + unsigned int flags = CU_MEMHOSTALLOC_DEVICEMAP; + if (!iree_all_bits_set(compat_params.type, + IREE_HAL_MEMORY_TYPE_HOST_CACHED)) { + flags |= CU_MEMHOSTALLOC_WRITECOMBINED; + } + status = + CU_RESULT_TO_STATUS(allocator->context->syms, + cuMemHostAlloc(&host_ptr, allocation_size, flags)); + if (iree_status_is_ok(status)) { + status = CU_RESULT_TO_STATUS( + allocator->context->syms, + cuMemHostGetDevicePointer(&device_ptr, host_ptr, /*flags=*/0)); + } + } + IREE_TRACE_ZONE_END(z0); + + iree_hal_buffer_t* buffer = NULL; + if (iree_status_is_ok(status)) { + status = iree_hal_cuda_buffer_wrap( + base_allocator, compat_params.type, compat_params.access, + compat_params.usage, allocation_size, + /*byte_offset=*/0, + /*byte_length=*/allocation_size, buffer_type, device_ptr, host_ptr, + iree_hal_buffer_release_callback_null(), &buffer); + } + + // Copy the initial contents into the buffer. This may require staging. 
+ if (iree_status_is_ok(status) && + !iree_const_byte_span_is_empty(initial_data)) { + status = iree_hal_device_transfer_range( + allocator->base_device, + iree_hal_make_host_transfer_buffer_span((void*)initial_data.data, + initial_data.data_length), + 0, iree_hal_make_device_transfer_buffer(buffer), 0, + initial_data.data_length, IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT, + iree_infinite_timeout()); + } + + if (iree_status_is_ok(status)) { + IREE_TRACE_ALLOC_NAMED(IREE_HAL_CUDA_ALLOCATOR_ID, + (void*)iree_hal_cuda_buffer_device_pointer(buffer), + allocation_size); + IREE_STATISTICS(iree_hal_allocator_statistics_record_alloc( + &allocator->statistics, compat_params.type, allocation_size)); + *out_buffer = buffer; + } else { + if (!buffer) { + iree_hal_cuda_buffer_free(allocator->context, buffer_type, device_ptr, + host_ptr); + } else { + iree_hal_buffer_release(buffer); + } + } + return status; +} + +static void iree_hal_cuda_allocator_deallocate_buffer( + iree_hal_allocator_t* IREE_RESTRICT base_allocator, + iree_hal_buffer_t* IREE_RESTRICT base_buffer) { + iree_hal_cuda_allocator_t* allocator = + iree_hal_cuda_allocator_cast(base_allocator); + + const iree_hal_cuda_buffer_type_t buffer_type = + iree_hal_cuda_buffer_type(base_buffer); + + // WARNING: we may be called from a random thread and need to ensure that we + // have an active CUDA context. Unfortunately CUDA is CUDA and trying to + // change the context here will result in full device synchronization. In the + // future we'll need to do something fairly complex such as having a dedicated + // thread with a persistently bound context that does nothing but free + // buffers. The load on this will be lighter when queue-ordered allocations + // are used or any sort of pooling policy is applied. + // + // WARNING: with CUDA's lazy error propagation it's possible that by the time + // this code is running something else has triggered device loss and we can't + // actually use the context. In that case we can't perform the frees and want + // to silently ignore them: whatever the user tries to do next will fail in + // the same way and if we were deallocating this buffer as part of a tear-down + // on failure we don't want to end up dying during cleanup. + iree_hal_cuda_buffer_free(allocator->context, buffer_type, + iree_hal_cuda_buffer_device_pointer(base_buffer), + iree_hal_cuda_buffer_host_pointer(base_buffer)); + + switch (buffer_type) { + case IREE_HAL_CUDA_BUFFER_TYPE_DEVICE: + case IREE_HAL_CUDA_BUFFER_TYPE_HOST: { + IREE_TRACE_FREE_NAMED( + IREE_HAL_CUDA_ALLOCATOR_ID, + (void*)iree_hal_cuda_buffer_device_pointer(base_buffer)); + IREE_STATISTICS(iree_hal_allocator_statistics_record_free( + &allocator->statistics, iree_hal_buffer_memory_type(base_buffer), + iree_hal_buffer_allocation_size(base_buffer))); + break; + } + default: + // Buffer type not tracked. + break; + } + + iree_hal_buffer_destroy(base_buffer); +} + +static iree_status_t iree_hal_cuda_allocator_import_buffer( + iree_hal_allocator_t* IREE_RESTRICT base_allocator, + const iree_hal_buffer_params_t* IREE_RESTRICT params, + iree_hal_external_buffer_t* IREE_RESTRICT external_buffer, + iree_hal_buffer_release_callback_t release_callback, + iree_hal_buffer_t** IREE_RESTRICT out_buffer) { + iree_hal_cuda_allocator_t* allocator = + iree_hal_cuda_allocator_cast(base_allocator); + + // Coerce options into those required by the current device. 
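Illustrative aside (not part of this patch): the import path below is how an existing host allocation gets wrapped without a copy. A sketch of the caller side, with the caveat that the public iree_hal_allocator_import_buffer signature is assumed to mirror the parameters used here; storage and device_allocator are placeholders:

  static uint8_t storage[4096];  // existing host memory to register
  iree_hal_external_buffer_t external_buffer = {
      .type = IREE_HAL_EXTERNAL_BUFFER_TYPE_HOST_ALLOCATION,
      .size = sizeof(storage),
      .handle = {.host_allocation = {.ptr = storage}},
  };
  const iree_hal_buffer_params_t params = {
      .type = IREE_HAL_MEMORY_TYPE_HOST_LOCAL |
              IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE,
      .access = IREE_HAL_MEMORY_ACCESS_ALL,
      .usage = IREE_HAL_BUFFER_USAGE_TRANSFER |
               IREE_HAL_BUFFER_USAGE_DISPATCH_STORAGE,
  };
  iree_hal_buffer_t* buffer = NULL;
  // With a dispatch usage bit set, the registration below applies
  // CU_MEMHOSTREGISTER_DEVICEMAP so the memory is addressable from the device.
  iree_status_t status = iree_hal_allocator_import_buffer(
      device_allocator, params, &external_buffer,
      iree_hal_buffer_release_callback_null(), &buffer);
  if (iree_status_is_ok(status)) iree_hal_buffer_release(buffer);

Note that requesting IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL here is rejected: host registrations can only ever be host-local + device-visible.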
+ iree_hal_buffer_params_t compat_params = *params; + iree_device_size_t allocation_size = external_buffer->size; + iree_hal_buffer_compatibility_t compatibility = + iree_hal_cuda_allocator_query_buffer_compatibility( + base_allocator, &compat_params, &allocation_size); + if (!iree_all_bits_set(compatibility, + IREE_HAL_BUFFER_COMPATIBILITY_IMPORTABLE)) { +#if IREE_STATUS_MODE + iree_bitfield_string_temp_t temp0, temp1, temp2; + iree_string_view_t memory_type_str = + iree_hal_memory_type_format(params->type, &temp0); + iree_string_view_t usage_str = + iree_hal_buffer_usage_format(params->usage, &temp1); + iree_string_view_t compatibility_str = + iree_hal_buffer_compatibility_format(compatibility, &temp2); + return iree_make_status( + IREE_STATUS_INVALID_ARGUMENT, + "allocator cannot import a buffer with the given parameters; " + "memory_type=%.*s, usage=%.*s, compatibility=%.*s", + (int)memory_type_str.size, memory_type_str.data, (int)usage_str.size, + usage_str.data, (int)compatibility_str.size, compatibility_str.data); +#else + return iree_make_status( + IREE_STATUS_INVALID_ARGUMENT, + "allocator cannot import a buffer with the given parameters"); +#endif // IREE_STATUS_MODE + } + + iree_status_t status = iree_ok_status(); + iree_hal_cuda_buffer_type_t buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE; + void* host_ptr = NULL; + CUdeviceptr device_ptr = 0; + + switch (external_buffer->type) { + case IREE_HAL_EXTERNAL_BUFFER_TYPE_HOST_ALLOCATION: { + if (iree_all_bits_set(compat_params.type, + IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL)) { + return iree_make_status( + IREE_STATUS_INVALID_ARGUMENT, + "unable to register host allocations as device-local memory"); + } + buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_HOST_REGISTERED; + host_ptr = external_buffer->handle.host_allocation.ptr; + uint32_t register_flags = 0; + if (compat_params.access == IREE_HAL_MEMORY_ACCESS_READ) { + register_flags = CU_MEMHOSTREGISTER_READ_ONLY; + } + if (iree_any_bit_set(compat_params.usage, + IREE_HAL_BUFFER_USAGE_DISPATCH_INDIRECT_PARAMS | + IREE_HAL_BUFFER_USAGE_DISPATCH_UNIFORM_READ | + IREE_HAL_BUFFER_USAGE_DISPATCH_STORAGE | + IREE_HAL_BUFFER_USAGE_DISPATCH_IMAGE)) { + register_flags = CU_MEMHOSTREGISTER_DEVICEMAP; + } + status = CU_RESULT_TO_STATUS( + allocator->context->syms, + cuMemHostRegister(host_ptr, external_buffer->size, register_flags), + "cuMemHostRegister"); + if (iree_status_is_ok(status)) { + status = CU_RESULT_TO_STATUS( + allocator->context->syms, + cuMemHostGetDevicePointer(&device_ptr, host_ptr, 0), + "cuMemHostGetDevicePointer"); + } + break; + } + case IREE_HAL_EXTERNAL_BUFFER_TYPE_OPAQUE_FD: + case IREE_HAL_EXTERNAL_BUFFER_TYPE_OPAQUE_WIN32: + return iree_make_status(IREE_STATUS_UNIMPLEMENTED, + "handle-based imports not yet implemented"); + default: + return iree_make_status(IREE_STATUS_UNIMPLEMENTED, + "external buffer type not supported"); + } + + iree_hal_buffer_t* buffer = NULL; + if (iree_status_is_ok(status)) { + status = iree_hal_cuda_buffer_wrap( + base_allocator, compat_params.type, compat_params.access, + compat_params.usage, external_buffer->size, + /*byte_offset=*/0, + /*byte_length=*/external_buffer->size, buffer_type, device_ptr, + host_ptr, release_callback, &buffer); + } + + if (iree_status_is_ok(status)) { + *out_buffer = buffer; + } else { + if (!buffer) { + iree_hal_cuda_buffer_free(allocator->context, buffer_type, device_ptr, + host_ptr); + } else { + iree_hal_buffer_release(buffer); + } + } + return status; +} + +static iree_status_t iree_hal_cuda_allocator_export_buffer( + 
iree_hal_allocator_t* IREE_RESTRICT base_allocator, + iree_hal_buffer_t* IREE_RESTRICT buffer, + iree_hal_external_buffer_type_t requested_type, + iree_hal_external_buffer_flags_t requested_flags, + iree_hal_external_buffer_t* IREE_RESTRICT out_external_buffer) { + return iree_make_status(IREE_STATUS_UNAVAILABLE, + "exporting to external buffers not supported"); +} + +static const iree_hal_allocator_vtable_t iree_hal_cuda_allocator_vtable = { + .destroy = iree_hal_cuda_allocator_destroy, + .host_allocator = iree_hal_cuda_allocator_host_allocator, + .trim = iree_hal_cuda_allocator_trim, + .query_statistics = iree_hal_cuda_allocator_query_statistics, + .query_memory_heaps = iree_hal_cuda_allocator_query_memory_heaps, + .query_buffer_compatibility = + iree_hal_cuda_allocator_query_buffer_compatibility, + .allocate_buffer = iree_hal_cuda_allocator_allocate_buffer, + .deallocate_buffer = iree_hal_cuda_allocator_deallocate_buffer, + .import_buffer = iree_hal_cuda_allocator_import_buffer, + .export_buffer = iree_hal_cuda_allocator_export_buffer, +}; diff --git a/experimental/cuda2/cuda_allocator.h b/experimental/cuda2/cuda_allocator.h new file mode 100644 index 000000000000..b2f272895a91 --- /dev/null +++ b/experimental/cuda2/cuda_allocator.h @@ -0,0 +1,28 @@ +// Copyright 2021 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef IREE_HAL_DRIVERS_CUDA_ALLOCATOR_H_ +#define IREE_HAL_DRIVERS_CUDA_ALLOCATOR_H_ + +#include "iree/base/api.h" +#include "iree/hal/api.h" +#include "iree/hal/drivers/cuda/context_wrapper.h" +#include "iree/hal/drivers/cuda/status_util.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// Create a cuda allocator. +iree_status_t iree_hal_cuda_allocator_create( + iree_hal_device_t* base_device, iree_hal_cuda_context_wrapper_t* context, + CUdevice device, CUstream stream, iree_hal_allocator_t** out_allocator); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // IREE_HAL_DRIVERS_CUDA_ALLOCATOR_H_ diff --git a/experimental/cuda2/cuda_buffer.c b/experimental/cuda2/cuda_buffer.c new file mode 100644 index 000000000000..3b9c9e13f9e0 --- /dev/null +++ b/experimental/cuda2/cuda_buffer.c @@ -0,0 +1,161 @@ +// Copyright 2021 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree/hal/drivers/cuda/cuda_buffer.h" + +#include +#include +#include + +#include "iree/base/api.h" +#include "iree/base/tracing.h" + +typedef struct iree_hal_cuda_buffer_t { + iree_hal_buffer_t base; + iree_hal_cuda_buffer_type_t type; + void* host_ptr; + CUdeviceptr device_ptr; + iree_hal_buffer_release_callback_t release_callback; +} iree_hal_cuda_buffer_t; + +static const iree_hal_buffer_vtable_t iree_hal_cuda_buffer_vtable; + +static iree_hal_cuda_buffer_t* iree_hal_cuda_buffer_cast( + iree_hal_buffer_t* base_value) { + IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda_buffer_vtable); + return (iree_hal_cuda_buffer_t*)base_value; +} + +static const iree_hal_cuda_buffer_t* iree_hal_cuda_buffer_const_cast( + const iree_hal_buffer_t* base_value) { + IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda_buffer_vtable); + return (const iree_hal_cuda_buffer_t*)base_value; +} + +iree_status_t iree_hal_cuda_buffer_wrap( + iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type, + iree_hal_memory_access_t allowed_access, + iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size, + iree_device_size_t byte_offset, iree_device_size_t byte_length, + iree_hal_cuda_buffer_type_t buffer_type, CUdeviceptr device_ptr, + void* host_ptr, iree_hal_buffer_release_callback_t release_callback, + iree_hal_buffer_t** out_buffer) { + IREE_ASSERT_ARGUMENT(allocator); + IREE_ASSERT_ARGUMENT(out_buffer); + IREE_TRACE_ZONE_BEGIN(z0); + + iree_allocator_t host_allocator = + iree_hal_allocator_host_allocator(allocator); + iree_hal_cuda_buffer_t* buffer = NULL; + iree_status_t status = + iree_allocator_malloc(host_allocator, sizeof(*buffer), (void**)&buffer); + if (iree_status_is_ok(status)) { + iree_hal_buffer_initialize(host_allocator, allocator, &buffer->base, + allocation_size, byte_offset, byte_length, + memory_type, allowed_access, allowed_usage, + &iree_hal_cuda_buffer_vtable, &buffer->base); + buffer->type = buffer_type; + buffer->host_ptr = host_ptr; + buffer->device_ptr = device_ptr; + buffer->release_callback = release_callback; + *out_buffer = &buffer->base; + } + + IREE_TRACE_ZONE_END(z0); + return status; +} + +static void iree_hal_cuda_buffer_destroy(iree_hal_buffer_t* base_buffer) { + iree_hal_cuda_buffer_t* buffer = iree_hal_cuda_buffer_cast(base_buffer); + iree_allocator_t host_allocator = base_buffer->host_allocator; + IREE_TRACE_ZONE_BEGIN(z0); + if (buffer->release_callback.fn) { + buffer->release_callback.fn(buffer->release_callback.user_data, + base_buffer); + } + iree_allocator_free(host_allocator, buffer); + IREE_TRACE_ZONE_END(z0); +} + +static iree_status_t iree_hal_cuda_buffer_map_range( + iree_hal_buffer_t* base_buffer, iree_hal_mapping_mode_t mapping_mode, + iree_hal_memory_access_t memory_access, + iree_device_size_t local_byte_offset, iree_device_size_t local_byte_length, + iree_hal_buffer_mapping_t* mapping) { + iree_hal_cuda_buffer_t* buffer = iree_hal_cuda_buffer_cast(base_buffer); + + // TODO(benvanik): add upload/download for unmapped buffers. + IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_memory_type( + iree_hal_buffer_memory_type(base_buffer), + IREE_HAL_MEMORY_TYPE_HOST_VISIBLE)); + IREE_RETURN_IF_ERROR( + iree_hal_buffer_validate_usage(iree_hal_buffer_allowed_usage(base_buffer), + IREE_HAL_BUFFER_USAGE_MAPPING)); + + uint8_t* data_ptr = (uint8_t*)(buffer->host_ptr) + local_byte_offset; + // If we mapped for discard scribble over the bytes. 
This is not a mandated + // behavior but it will make debugging issues easier. Alternatively for + // heap buffers we could reallocate them such that ASAN yells, but that + // would only work if the entire buffer was discarded. +#ifndef NDEBUG + if (iree_any_bit_set(memory_access, IREE_HAL_MEMORY_ACCESS_DISCARD)) { + memset(data_ptr, 0xCD, local_byte_length); + } +#endif // !NDEBUG + + mapping->contents = iree_make_byte_span(data_ptr, local_byte_length); + return iree_ok_status(); +} + +static iree_status_t iree_hal_cuda_buffer_unmap_range( + iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset, + iree_device_size_t local_byte_length, iree_hal_buffer_mapping_t* mapping) { + // Nothing to do (today). + return iree_ok_status(); +} + +static iree_status_t iree_hal_cuda_buffer_invalidate_range( + iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset, + iree_device_size_t local_byte_length) { + // Nothing to do. + return iree_ok_status(); +} + +static iree_status_t iree_hal_cuda_buffer_flush_range( + iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset, + iree_device_size_t local_byte_length) { + // Nothing to do. + return iree_ok_status(); +} + +iree_hal_cuda_buffer_type_t iree_hal_cuda_buffer_type( + const iree_hal_buffer_t* base_buffer) { + const iree_hal_cuda_buffer_t* buffer = + iree_hal_cuda_buffer_const_cast(base_buffer); + return buffer->type; +} + +CUdeviceptr iree_hal_cuda_buffer_device_pointer( + const iree_hal_buffer_t* base_buffer) { + const iree_hal_cuda_buffer_t* buffer = + iree_hal_cuda_buffer_const_cast(base_buffer); + return buffer->device_ptr; +} + +void* iree_hal_cuda_buffer_host_pointer(const iree_hal_buffer_t* base_buffer) { + const iree_hal_cuda_buffer_t* buffer = + iree_hal_cuda_buffer_const_cast(base_buffer); + return buffer->host_ptr; +} + +static const iree_hal_buffer_vtable_t iree_hal_cuda_buffer_vtable = { + .recycle = iree_hal_buffer_recycle, + .destroy = iree_hal_cuda_buffer_destroy, + .map_range = iree_hal_cuda_buffer_map_range, + .unmap_range = iree_hal_cuda_buffer_unmap_range, + .invalidate_range = iree_hal_cuda_buffer_invalidate_range, + .flush_range = iree_hal_cuda_buffer_flush_range, +}; diff --git a/experimental/cuda2/cuda_buffer.h b/experimental/cuda2/cuda_buffer.h new file mode 100644 index 000000000000..0b07e8ddea7f --- /dev/null +++ b/experimental/cuda2/cuda_buffer.h @@ -0,0 +1,54 @@ +// Copyright 2021 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef IREE_HAL_DRIVERS_CUDA_BUFFER_H_ +#define IREE_HAL_DRIVERS_CUDA_BUFFER_H_ + +#include "iree/base/api.h" +#include "iree/hal/api.h" +#include "iree/hal/drivers/cuda/cuda_headers.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +typedef enum iree_hal_cuda_buffer_type_e { + // cuMemAlloc/cuMemAllocManaged + cuMemFree + IREE_HAL_CUDA_BUFFER_TYPE_DEVICE = 1u << 0, + // cuMemHostAlloc + cuMemFreeHost + IREE_HAL_CUDA_BUFFER_TYPE_HOST = 1u << 1, + // cuMemHostRegister + cuMemHostUnregister + IREE_HAL_CUDA_BUFFER_TYPE_HOST_REGISTERED = 1u << 2, +} iree_hal_cuda_buffer_type_t; + +// Wraps a CUDA allocation in an iree_hal_buffer_t. 
+iree_status_t iree_hal_cuda_buffer_wrap(
+    iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type,
+    iree_hal_memory_access_t allowed_access,
+    iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size,
+    iree_device_size_t byte_offset, iree_device_size_t byte_length,
+    iree_hal_cuda_buffer_type_t buffer_type, CUdeviceptr device_ptr,
+    void* host_ptr, iree_hal_buffer_release_callback_t release_callback,
+    iree_hal_buffer_t** out_buffer);
+
+// Returns the underlying CUDA buffer type.
+iree_hal_cuda_buffer_type_t iree_hal_cuda_buffer_type(
+    const iree_hal_buffer_t* buffer);
+
+// Returns the CUDA base pointer for the given |buffer|.
+// This points at the base of the entire allocated buffer; callers must add
+// the buffer's byte_offset and stay within byte_length when using it.
+CUdeviceptr iree_hal_cuda_buffer_device_pointer(
+    const iree_hal_buffer_t* buffer);
+
+// Returns the CUDA host pointer for the given |buffer|, if available.
+void* iree_hal_cuda_buffer_host_pointer(const iree_hal_buffer_t* buffer);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_DRIVERS_CUDA_BUFFER_H_
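Illustrative aside (not part of this patch): since iree_hal_cuda_buffer_device_pointer returns the base of the whole allocation, consumers (command buffer recording, descriptor bindings, etc.) are expected to resolve subspans themselves. A minimal sketch of that resolution; the helper name is hypothetical, while the accessors are the ones declared above plus the core iree_hal_buffer_allocated_buffer and iree_hal_buffer_byte_offset queries:

  // Resolves the effective CUDA address for a HAL buffer or a subspan of one.
  static CUdeviceptr iree_hal_cuda_resolve_device_ptr(
      iree_hal_buffer_t* buffer) {
    CUdeviceptr base_ptr = iree_hal_cuda_buffer_device_pointer(
        iree_hal_buffer_allocated_buffer(buffer));
    return base_ptr + iree_hal_buffer_byte_offset(buffer);
  }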