diff --git a/experimental/cuda2/CMakeLists.txt b/experimental/cuda2/CMakeLists.txt
index c104bcde8f4e..83b927b115b8 100644
--- a/experimental/cuda2/CMakeLists.txt
+++ b/experimental/cuda2/CMakeLists.txt
@@ -17,6 +17,10 @@ iree_cc_library(
     "api.h"
   SRCS
     "api.h"
+    "cuda_allocator.c"
+    "cuda_allocator.h"
+    "cuda_buffer.c"
+    "cuda_buffer.h"
     "cuda_driver.c"
   DEPS
     ::dynamic_symbols
diff --git a/experimental/cuda2/cuda_allocator.c b/experimental/cuda2/cuda_allocator.c
index 4d22a01fe4fe..4a0cddd8a41f 100644
--- a/experimental/cuda2/cuda_allocator.c
+++ b/experimental/cuda2/cuda_allocator.c
@@ -1,47 +1,63 @@
-// Copyright 2021 The IREE Authors
+// Copyright 2023 The IREE Authors
 //
 // Licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#include "iree/hal/drivers/cuda/cuda_allocator.h"
+#include "experimental/cuda2/cuda_allocator.h"
 
 #include <stddef.h>
 
+#include "experimental/cuda2/cuda_buffer.h"
+#include "experimental/cuda2/cuda_dynamic_symbols.h"
+#include "experimental/cuda2/cuda_status_util.h"
 #include "iree/base/api.h"
 #include "iree/base/tracing.h"
-#include "iree/hal/drivers/cuda/cuda_buffer.h"
-#include "iree/hal/drivers/cuda/dynamic_symbols.h"
-#include "iree/hal/drivers/cuda/status_util.h"
 
 #if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_ALLOCATION_TRACKING
-static const char* IREE_HAL_CUDA_ALLOCATOR_ID = "CUDA";
+static const char* IREE_HAL_CUDA_ALLOCATOR_ID = "CUDA2";
 #endif  // IREE_TRACING_FEATURE_ALLOCATION_TRACKING
 
-typedef struct iree_hal_cuda_allocator_t {
+typedef struct iree_hal_cuda2_allocator_t {
+  // Abstract resource used for injecting reference counting and vtable;
+  // must be at offset 0.
   iree_hal_resource_t resource;
+
+  // The device that this allocator allocates memory from.
   iree_hal_device_t* base_device;
-  iree_hal_cuda_context_wrapper_t* context;
   CUdevice device;
+
+  // The CUDA stream that allocations should be used in.
   CUstream stream;
+
+  const iree_hal_cuda2_dynamic_symbols_t* symbols;
+
+  iree_allocator_t host_allocator;
+
+  // Whether the GPU and CPU can concurrently access CUDA managed data in a
+  // coherent way. We would need to explicitly perform data migration and
+  // synchronization between GPU and CPU if not.
+  bool supports_concurrent_managed_access;
 
   IREE_STATISTICS(iree_hal_allocator_statistics_t statistics;)
-} iree_hal_cuda_allocator_t;
+} iree_hal_cuda2_allocator_t;
 
-static const iree_hal_allocator_vtable_t iree_hal_cuda_allocator_vtable;
+static const iree_hal_allocator_vtable_t iree_hal_cuda2_allocator_vtable;
 
-static iree_hal_cuda_allocator_t* iree_hal_cuda_allocator_cast(
+static iree_hal_cuda2_allocator_t* iree_hal_cuda2_allocator_cast(
     iree_hal_allocator_t* base_value) {
-  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda_allocator_vtable);
-  return (iree_hal_cuda_allocator_t*)base_value;
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda2_allocator_vtable);
+  return (iree_hal_cuda2_allocator_t*)base_value;
 }
 
-iree_status_t iree_hal_cuda_allocator_create(
-    iree_hal_device_t* base_device, iree_hal_cuda_context_wrapper_t* context,
-    CUdevice device, CUstream stream, iree_hal_allocator_t** out_allocator) {
+iree_status_t iree_hal_cuda2_allocator_create(
+    iree_hal_device_t* base_device,
+    const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols, CUdevice device,
+    CUstream stream, iree_allocator_t host_allocator,
+    iree_hal_allocator_t** out_allocator) {
   IREE_ASSERT_ARGUMENT(base_device);
-  IREE_ASSERT_ARGUMENT(context);
+  IREE_ASSERT_ARGUMENT(cuda_symbols);
+  IREE_ASSERT_ARGUMENT(out_allocator);
   IREE_TRACE_ZONE_BEGIN(z0);
 
   // To support device-local + host-visible memory we need concurrent managed
@@ -52,8 +68,8 @@ iree_status_t iree_hal_cuda_allocator_create(
   // buffers except for readback staging buffers.
   int supports_concurrent_managed_access = 0;
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
-      z0, CU_RESULT_TO_STATUS(
-              context->syms,
+      z0, IREE_CURESULT_TO_STATUS(
+              cuda_symbols,
               cuDeviceGetAttribute(
                   &supports_concurrent_managed_access,
                   CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, device),
@@ -65,16 +81,17 @@
               : "no CONCURRENT_MANAGED_ACCESS (expect slow accesses on "
                 "device-local + host-visible memory)");
 
-  iree_hal_cuda_allocator_t* allocator = NULL;
+  iree_hal_cuda2_allocator_t* allocator = NULL;
   iree_status_t status = iree_allocator_malloc(
-      context->host_allocator, sizeof(*allocator), (void**)&allocator);
+      host_allocator, sizeof(*allocator), (void**)&allocator);
   if (iree_status_is_ok(status)) {
-    iree_hal_resource_initialize(&iree_hal_cuda_allocator_vtable,
+    iree_hal_resource_initialize(&iree_hal_cuda2_allocator_vtable,
                                  &allocator->resource);
     allocator->base_device = base_device;
-    allocator->context = context;
    allocator->device = device;
    allocator->stream = stream;
+    allocator->symbols = cuda_symbols;
+    allocator->host_allocator = host_allocator;
    allocator->supports_concurrent_managed_access =
        supports_concurrent_managed_access != 0;
    *out_allocator = (iree_hal_allocator_t*)allocator;
@@ -84,47 +101,51 @@
   return status;
 }
 
-static void iree_hal_cuda_allocator_destroy(
+static void iree_hal_cuda2_allocator_destroy(
     iree_hal_allocator_t* IREE_RESTRICT base_allocator) {
-  iree_hal_cuda_allocator_t* allocator =
-      iree_hal_cuda_allocator_cast(base_allocator);
-  iree_allocator_t host_allocator = allocator->context->host_allocator;
+  IREE_ASSERT_ARGUMENT(base_allocator);
+  iree_hal_cuda2_allocator_t* allocator =
+      iree_hal_cuda2_allocator_cast(base_allocator);
   IREE_TRACE_ZONE_BEGIN(z0);
 
-  iree_allocator_free(host_allocator, allocator);
+  iree_allocator_free(allocator->host_allocator, allocator);
 
   IREE_TRACE_ZONE_END(z0);
 }
 
-static iree_allocator_t iree_hal_cuda_allocator_host_allocator(
+static iree_allocator_t iree_hal_cuda2_allocator_host_allocator(
     const iree_hal_allocator_t* IREE_RESTRICT base_allocator) {
-  iree_hal_cuda_allocator_t* allocator =
-      (iree_hal_cuda_allocator_t*)base_allocator;
-  return allocator->context->host_allocator;
+  iree_hal_cuda2_allocator_t* allocator =
+      (iree_hal_cuda2_allocator_t*)base_allocator;
+  return allocator->host_allocator;
 }
 
-static iree_status_t iree_hal_cuda_allocator_trim(
+static iree_status_t iree_hal_cuda2_allocator_trim(
     iree_hal_allocator_t* IREE_RESTRICT base_allocator) {
   return iree_ok_status();
 }
 
-static void iree_hal_cuda_allocator_query_statistics(
+static void iree_hal_cuda2_allocator_query_statistics(
     iree_hal_allocator_t* IREE_RESTRICT base_allocator,
     iree_hal_allocator_statistics_t* IREE_RESTRICT out_statistics) {
   IREE_STATISTICS({
-    iree_hal_cuda_allocator_t* allocator =
-        iree_hal_cuda_allocator_cast(base_allocator);
+    iree_hal_cuda2_allocator_t* allocator =
+        iree_hal_cuda2_allocator_cast(base_allocator);
     memcpy(out_statistics, &allocator->statistics, sizeof(*out_statistics));
   });
 }
 
-static iree_status_t iree_hal_cuda_allocator_query_memory_heaps(
+static iree_status_t iree_hal_cuda2_allocator_query_memory_heaps(
     iree_hal_allocator_t* IREE_RESTRICT base_allocator,
     iree_host_size_t capacity,
     iree_hal_allocator_memory_heap_t* IREE_RESTRICT heaps,
     iree_host_size_t* IREE_RESTRICT out_count) {
-  iree_hal_cuda_allocator_t* allocator =
-      iree_hal_cuda_allocator_cast(base_allocator);
+  IREE_ASSERT_ARGUMENT(base_allocator);
+  IREE_ASSERT_ARGUMENT(heaps);
+  IREE_ASSERT_ARGUMENT(out_count);
+
+  iree_hal_cuda2_allocator_t* allocator =
+      iree_hal_cuda2_allocator_cast(base_allocator);
 
   // TODO(benvanik): check CU_DEVICE_ATTRIBUTE_INTEGRATED and return a unified
   // set of heaps (likely still a cached and uncached, at minimum).
@@ -199,12 +220,12 @@ static iree_status_t iree_hal_cuda_allocator_query_memory_heaps(
 }
 
 static iree_hal_buffer_compatibility_t
-iree_hal_cuda_allocator_query_buffer_compatibility(
+iree_hal_cuda2_allocator_query_buffer_compatibility(
     iree_hal_allocator_t* IREE_RESTRICT base_allocator,
     iree_hal_buffer_params_t* IREE_RESTRICT params,
     iree_device_size_t* IREE_RESTRICT allocation_size) {
-  iree_hal_cuda_allocator_t* allocator =
-      iree_hal_cuda_allocator_cast(base_allocator);
+  iree_hal_cuda2_allocator_t* allocator =
+      iree_hal_cuda2_allocator_cast(base_allocator);
 
   // All buffers can be allocated on the heap.
   iree_hal_buffer_compatibility_t compatibility =
@@ -254,42 +275,46 @@ iree_hal_cuda_allocator_query_buffer_compatibility(
   return compatibility;
 }
 
-static void iree_hal_cuda_buffer_free(iree_hal_cuda_context_wrapper_t* context,
-                                      iree_hal_cuda_buffer_type_t buffer_type,
-                                      CUdeviceptr device_ptr, void* host_ptr) {
+static void iree_hal_cuda2_buffer_free(
+    const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols,
+    iree_hal_cuda2_buffer_type_t buffer_type, CUdeviceptr device_ptr,
+    void* host_ptr) {
   IREE_TRACE_ZONE_BEGIN(z0);
   switch (buffer_type) {
     case IREE_HAL_CUDA_BUFFER_TYPE_DEVICE: {
       IREE_TRACE_ZONE_APPEND_TEXT(z0, "cuMemFree");
-      CUDA_IGNORE_ERROR(context->syms, cuMemFree(device_ptr));
+      IREE_CUDA_IGNORE_ERROR(cuda_symbols, cuMemFree(device_ptr));
       break;
     }
     case IREE_HAL_CUDA_BUFFER_TYPE_HOST: {
       IREE_TRACE_ZONE_APPEND_TEXT(z0, "cuMemFreeHost");
-      CUDA_IGNORE_ERROR(context->syms, cuMemFreeHost(host_ptr));
+      IREE_CUDA_IGNORE_ERROR(cuda_symbols, cuMemFreeHost(host_ptr));
       break;
     }
     case IREE_HAL_CUDA_BUFFER_TYPE_HOST_REGISTERED: {
       IREE_TRACE_ZONE_APPEND_TEXT(z0, "cuMemHostUnregister");
-      CUDA_IGNORE_ERROR(context->syms, cuMemHostUnregister(host_ptr));
+      IREE_CUDA_IGNORE_ERROR(cuda_symbols, cuMemHostUnregister(host_ptr));
       break;
     }
   }
   IREE_TRACE_ZONE_END(z0);
 }
 
-static iree_status_t iree_hal_cuda_allocator_allocate_buffer(
+static iree_status_t iree_hal_cuda2_allocator_allocate_buffer(
     iree_hal_allocator_t* IREE_RESTRICT base_allocator,
     const iree_hal_buffer_params_t* IREE_RESTRICT params,
     iree_device_size_t allocation_size, iree_const_byte_span_t initial_data,
     iree_hal_buffer_t** IREE_RESTRICT out_buffer) {
-  iree_hal_cuda_allocator_t* allocator =
-      iree_hal_cuda_allocator_cast(base_allocator);
+  IREE_ASSERT_ARGUMENT(base_allocator);
+  IREE_ASSERT_ARGUMENT(params);
+  IREE_ASSERT_ARGUMENT(out_buffer);
+  iree_hal_cuda2_allocator_t* allocator =
+      iree_hal_cuda2_allocator_cast(base_allocator);
 
   // Coerce options into those required by the current device.
   iree_hal_buffer_params_t compat_params = *params;
   iree_hal_buffer_compatibility_t compatibility =
-      iree_hal_cuda_allocator_query_buffer_compatibility(
+      iree_hal_cuda2_allocator_query_buffer_compatibility(
           base_allocator, &compat_params, &allocation_size);
   if (!iree_all_bits_set(compatibility,
                          IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE)) {
@@ -315,26 +340,25 @@ static iree_status_t iree_hal_cuda_allocator_allocate_buffer(
   }
 
   iree_status_t status = iree_ok_status();
-  iree_hal_cuda_buffer_type_t buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE;
+  iree_hal_cuda2_buffer_type_t buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE;
   void* host_ptr = NULL;
   CUdeviceptr device_ptr = 0;
-  IREE_TRACE_ZONE_BEGIN_NAMED(z0, "iree_hal_cuda_buffer_allocate");
+  IREE_TRACE_ZONE_BEGIN_NAMED(z0, "iree_hal_cuda2_buffer_allocate");
   IREE_TRACE_ZONE_APPEND_VALUE(z0, allocation_size);
 
   if (iree_all_bits_set(compat_params.type,
                         IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL)) {
-    // Device local case.
     if (iree_all_bits_set(compat_params.type,
                           IREE_HAL_MEMORY_TYPE_HOST_VISIBLE)) {
+      // Device local + host visible.
       buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE;
-      status =
-          CU_RESULT_TO_STATUS(allocator->context->syms,
-                              cuMemAllocManaged(&device_ptr, allocation_size,
+      status = IREE_CURESULT_TO_STATUS(
+          allocator->symbols, cuMemAllocManaged(&device_ptr, allocation_size,
                                                 CU_MEM_ATTACH_GLOBAL));
       if (iree_status_is_ok(status) &&
          allocator->supports_concurrent_managed_access) {
-        // Prefetch the buffer on the GPU device.
-        status = CU_RESULT_TO_STATUS(
-            allocator->context->syms,
+        // Prefetch the buffer to the GPU stream.
+        status = IREE_CURESULT_TO_STATUS(
+            allocator->symbols,
             cuMemPrefetchAsync(device_ptr, allocation_size, allocator->device,
                                allocator->stream));
       }
@@ -342,22 +366,22 @@ static iree_status_t iree_hal_cuda_allocator_allocate_buffer(
     } else {
       // Device only.
       buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE;
-      status = CU_RESULT_TO_STATUS(allocator->context->syms,
-                                   cuMemAlloc(&device_ptr, allocation_size));
+      status = IREE_CURESULT_TO_STATUS(
+          allocator->symbols, cuMemAlloc(&device_ptr, allocation_size));
     }
   } else {
+    // Host local cases.
     buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_HOST;
     unsigned int flags = CU_MEMHOSTALLOC_DEVICEMAP;
     if (!iree_all_bits_set(compat_params.type,
                            IREE_HAL_MEMORY_TYPE_HOST_CACHED)) {
       flags |= CU_MEMHOSTALLOC_WRITECOMBINED;
     }
-    status =
-        CU_RESULT_TO_STATUS(allocator->context->syms,
-                            cuMemHostAlloc(&host_ptr, allocation_size, flags));
+    status = IREE_CURESULT_TO_STATUS(
+        allocator->symbols, cuMemHostAlloc(&host_ptr, allocation_size, flags));
     if (iree_status_is_ok(status)) {
-      status = CU_RESULT_TO_STATUS(
-          allocator->context->syms,
+      status = IREE_CURESULT_TO_STATUS(
+          allocator->symbols,
           cuMemHostGetDevicePointer(&device_ptr, host_ptr, /*flags=*/0));
     }
   }
@@ -365,7 +389,7 @@ static iree_status_t iree_hal_cuda_allocator_allocate_buffer(
   iree_hal_buffer_t* buffer = NULL;
   if (iree_status_is_ok(status)) {
-    status = iree_hal_cuda_buffer_wrap(
+    status = iree_hal_cuda2_buffer_wrap(
         base_allocator, compat_params.type, compat_params.access,
         compat_params.usage, allocation_size,
         /*byte_offset=*/0,
@@ -387,15 +411,15 @@ static iree_status_t iree_hal_cuda_allocator_allocate_buffer(
 
   if (iree_status_is_ok(status)) {
     IREE_TRACE_ALLOC_NAMED(IREE_HAL_CUDA_ALLOCATOR_ID,
-                           (void*)iree_hal_cuda_buffer_device_pointer(buffer),
+                           (void*)iree_hal_cuda2_buffer_device_pointer(buffer),
                            allocation_size);
     IREE_STATISTICS(iree_hal_allocator_statistics_record_alloc(
         &allocator->statistics, compat_params.type, allocation_size));
     *out_buffer = buffer;
   } else {
     if (!buffer) {
-      iree_hal_cuda_buffer_free(allocator->context, buffer_type, device_ptr,
-                                host_ptr);
+      iree_hal_cuda2_buffer_free(allocator->symbols, buffer_type, device_ptr,
+                                 host_ptr);
     } else {
       iree_hal_buffer_release(buffer);
     }
@@ -403,14 +427,16 @@
   return status;
 }
 
-static void iree_hal_cuda_allocator_deallocate_buffer(
+static void iree_hal_cuda2_allocator_deallocate_buffer(
     iree_hal_allocator_t* IREE_RESTRICT base_allocator,
     iree_hal_buffer_t* IREE_RESTRICT base_buffer) {
-  iree_hal_cuda_allocator_t* allocator =
-      iree_hal_cuda_allocator_cast(base_allocator);
+  IREE_ASSERT_ARGUMENT(base_allocator);
+  IREE_ASSERT_ARGUMENT(base_buffer);
+  iree_hal_cuda2_allocator_t* allocator =
+      iree_hal_cuda2_allocator_cast(base_allocator);
 
-  const iree_hal_cuda_buffer_type_t buffer_type =
-      iree_hal_cuda_buffer_type(base_buffer);
+  const iree_hal_cuda2_buffer_type_t buffer_type =
+      iree_hal_cuda2_buffer_type(base_buffer);
 
   // WARNING: we may be called from a random thread and need to ensure that we
   // have an active CUDA context. Unfortunately CUDA is CUDA and trying to
@@ -426,16 +452,16 @@
   // to silently ignore them: whatever the user tries to do next will fail in
   // the same way and if we were deallocating this buffer as part of a tear-down
   // on failure we don't want to end up dying during cleanup.
-  iree_hal_cuda_buffer_free(allocator->context, buffer_type,
-                            iree_hal_cuda_buffer_device_pointer(base_buffer),
-                            iree_hal_cuda_buffer_host_pointer(base_buffer));
+  iree_hal_cuda2_buffer_free(allocator->symbols, buffer_type,
+                             iree_hal_cuda2_buffer_device_pointer(base_buffer),
+                             iree_hal_cuda2_buffer_host_pointer(base_buffer));
 
   switch (buffer_type) {
     case IREE_HAL_CUDA_BUFFER_TYPE_DEVICE:
     case IREE_HAL_CUDA_BUFFER_TYPE_HOST: {
       IREE_TRACE_FREE_NAMED(
           IREE_HAL_CUDA_ALLOCATOR_ID,
-          (void*)iree_hal_cuda_buffer_device_pointer(base_buffer));
+          (void*)iree_hal_cuda2_buffer_device_pointer(base_buffer));
       IREE_STATISTICS(iree_hal_allocator_statistics_record_free(
           &allocator->statistics, iree_hal_buffer_memory_type(base_buffer),
           iree_hal_buffer_allocation_size(base_buffer)));
@@ -449,20 +475,24 @@
   iree_hal_buffer_destroy(base_buffer);
 }
 
-static iree_status_t iree_hal_cuda_allocator_import_buffer(
+static iree_status_t iree_hal_cuda2_allocator_import_buffer(
     iree_hal_allocator_t* IREE_RESTRICT base_allocator,
     const iree_hal_buffer_params_t* IREE_RESTRICT params,
     iree_hal_external_buffer_t* IREE_RESTRICT external_buffer,
     iree_hal_buffer_release_callback_t release_callback,
     iree_hal_buffer_t** IREE_RESTRICT out_buffer) {
-  iree_hal_cuda_allocator_t* allocator =
-      iree_hal_cuda_allocator_cast(base_allocator);
+  IREE_ASSERT_ARGUMENT(base_allocator);
+  IREE_ASSERT_ARGUMENT(params);
+  IREE_ASSERT_ARGUMENT(external_buffer);
+  IREE_ASSERT_ARGUMENT(out_buffer);
+  iree_hal_cuda2_allocator_t* allocator =
+      iree_hal_cuda2_allocator_cast(base_allocator);
 
   // Coerce options into those required by the current device.
   iree_hal_buffer_params_t compat_params = *params;
   iree_device_size_t allocation_size = external_buffer->size;
   iree_hal_buffer_compatibility_t compatibility =
-      iree_hal_cuda_allocator_query_buffer_compatibility(
+      iree_hal_cuda2_allocator_query_buffer_compatibility(
          base_allocator, &compat_params, &allocation_size);
   if (!iree_all_bits_set(compatibility,
                          IREE_HAL_BUFFER_COMPATIBILITY_IMPORTABLE)) {
@@ -488,7 +518,7 @@
   }
 
   iree_status_t status = iree_ok_status();
-  iree_hal_cuda_buffer_type_t buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE;
+  iree_hal_cuda2_buffer_type_t buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE;
   void* host_ptr = NULL;
   CUdeviceptr device_ptr = 0;
 
@@ -513,13 +543,13 @@
               IREE_HAL_BUFFER_USAGE_DISPATCH_IMAGE)) {
         register_flags = CU_MEMHOSTREGISTER_DEVICEMAP;
       }
-      status = CU_RESULT_TO_STATUS(
-          allocator->context->syms,
+      status = IREE_CURESULT_TO_STATUS(
+          allocator->symbols,
           cuMemHostRegister(host_ptr, external_buffer->size, register_flags),
           "cuMemHostRegister");
       if (iree_status_is_ok(status)) {
-        status = CU_RESULT_TO_STATUS(
-            allocator->context->syms,
+        status = IREE_CURESULT_TO_STATUS(
+            allocator->symbols,
            cuMemHostGetDevicePointer(&device_ptr, host_ptr, 0),
            "cuMemHostGetDevicePointer");
       }
@@ -528,18 +558,17 @@
     case IREE_HAL_EXTERNAL_BUFFER_TYPE_OPAQUE_FD:
     case IREE_HAL_EXTERNAL_BUFFER_TYPE_OPAQUE_WIN32:
       return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
-                              "handle-based imports not yet implemented");
+                              "unimplemented handle-based imports");
     default:
       return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
-                              "external buffer type not supported");
+                              "unimplemented external buffer type");
   }
 
   iree_hal_buffer_t* buffer = NULL;
   if (iree_status_is_ok(status)) {
-    status = iree_hal_cuda_buffer_wrap(
+    status = iree_hal_cuda2_buffer_wrap(
         base_allocator, compat_params.type, compat_params.access,
-        compat_params.usage, external_buffer->size,
-        /*byte_offset=*/0,
+        compat_params.usage, external_buffer->size, /*byte_offset=*/0,
         /*byte_length=*/external_buffer->size, buffer_type, device_ptr,
         host_ptr, release_callback, &buffer);
   }
 
@@ -548,8 +577,8 @@
     *out_buffer = buffer;
   } else {
     if (!buffer) {
-      iree_hal_cuda_buffer_free(allocator->context, buffer_type, device_ptr,
-                                host_ptr);
+      iree_hal_cuda2_buffer_free(allocator->symbols, buffer_type, device_ptr,
+                                 host_ptr);
     } else {
       iree_hal_buffer_release(buffer);
     }
@@ -557,26 +586,26 @@
   return status;
 }
 
-static iree_status_t iree_hal_cuda_allocator_export_buffer(
+static iree_status_t iree_hal_cuda2_allocator_export_buffer(
     iree_hal_allocator_t* IREE_RESTRICT base_allocator,
     iree_hal_buffer_t* IREE_RESTRICT buffer,
     iree_hal_external_buffer_type_t requested_type,
     iree_hal_external_buffer_flags_t requested_flags,
     iree_hal_external_buffer_t* IREE_RESTRICT out_external_buffer) {
-  return iree_make_status(IREE_STATUS_UNAVAILABLE,
-                          "exporting to external buffers not supported");
+  return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                          "unimplemented exporting to external buffers");
 }
 
-static const iree_hal_allocator_vtable_t iree_hal_cuda_allocator_vtable = {
-    .destroy = iree_hal_cuda_allocator_destroy,
-    .host_allocator = iree_hal_cuda_allocator_host_allocator,
-    .trim = iree_hal_cuda_allocator_trim,
-    .query_statistics = iree_hal_cuda_allocator_query_statistics,
-    .query_memory_heaps = iree_hal_cuda_allocator_query_memory_heaps,
+static const iree_hal_allocator_vtable_t iree_hal_cuda2_allocator_vtable = {
+    .destroy = iree_hal_cuda2_allocator_destroy,
+    .host_allocator = iree_hal_cuda2_allocator_host_allocator,
+    .trim = iree_hal_cuda2_allocator_trim,
+    .query_statistics = iree_hal_cuda2_allocator_query_statistics,
+    .query_memory_heaps = iree_hal_cuda2_allocator_query_memory_heaps,
     .query_buffer_compatibility =
-        iree_hal_cuda_allocator_query_buffer_compatibility,
-    .allocate_buffer = iree_hal_cuda_allocator_allocate_buffer,
-    .deallocate_buffer = iree_hal_cuda_allocator_deallocate_buffer,
-    .import_buffer = iree_hal_cuda_allocator_import_buffer,
-    .export_buffer = iree_hal_cuda_allocator_export_buffer,
+        iree_hal_cuda2_allocator_query_buffer_compatibility,
+    .allocate_buffer = iree_hal_cuda2_allocator_allocate_buffer,
+    .deallocate_buffer = iree_hal_cuda2_allocator_deallocate_buffer,
+    .import_buffer = iree_hal_cuda2_allocator_import_buffer,
+    .export_buffer = iree_hal_cuda2_allocator_export_buffer,
 };
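For readers unfamiliar with CUDA managed memory, the device-local + host-visible path above maps onto the raw driver-API sequence sketched below. This is a minimal standalone sketch, not part of the patch: it assumes device 0, a single stream, and reduces error handling to a macro, but uses only real driver entry points (cuDeviceGetAttribute, cuMemAllocManaged, cuMemPrefetchAsync).

// Minimal sketch of the managed-memory pattern the allocator relies on:
// query concurrent managed access, allocate managed memory, and prefetch
// it to the device so first-touch accesses on the GPU are fast.
#include <stdio.h>
#include <cuda.h>

#define CHECK_CU(expr)                                      \
  do {                                                      \
    CUresult result_ = (expr);                              \
    if (result_ != CUDA_SUCCESS) {                          \
      fprintf(stderr, "%s failed: %d\n", #expr, result_);   \
      return 1;                                             \
    }                                                       \
  } while (0)

int main(void) {
  CHECK_CU(cuInit(0));
  CUdevice device;
  CHECK_CU(cuDeviceGet(&device, 0));
  CUcontext context;
  CHECK_CU(cuCtxCreate(&context, 0, device));
  CUstream stream;
  CHECK_CU(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING));

  // Same attribute query the allocator performs at creation time.
  int concurrent_managed_access = 0;
  CHECK_CU(cuDeviceGetAttribute(&concurrent_managed_access,
                                CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS,
                                device));

  // Device-local + host-visible: managed memory, prefetched to the device
  // when the hardware supports coherent concurrent access.
  CUdeviceptr ptr;
  CHECK_CU(cuMemAllocManaged(&ptr, 1024, CU_MEM_ATTACH_GLOBAL));
  if (concurrent_managed_access) {
    CHECK_CU(cuMemPrefetchAsync(ptr, 1024, device, stream));
  }

  CHECK_CU(cuStreamSynchronize(stream));
  CHECK_CU(cuMemFree(ptr));
  CHECK_CU(cuStreamDestroy(stream));
  CHECK_CU(cuCtxDestroy(context));
  return 0;
}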
diff --git a/experimental/cuda2/cuda_allocator.h b/experimental/cuda2/cuda_allocator.h
index b2f272895a91..1a063e3cd7c1 100644
--- a/experimental/cuda2/cuda_allocator.h
+++ b/experimental/cuda2/cuda_allocator.h
@@ -1,28 +1,30 @@
-// Copyright 2021 The IREE Authors
+// Copyright 2023 The IREE Authors
 //
 // Licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#ifndef IREE_HAL_DRIVERS_CUDA_ALLOCATOR_H_
-#define IREE_HAL_DRIVERS_CUDA_ALLOCATOR_H_
+#ifndef EXPERIMENTAL_CUDA2_CUDA_ALLOCATOR_H_
+#define EXPERIMENTAL_CUDA2_CUDA_ALLOCATOR_H_
 
+#include "experimental/cuda2/cuda_dynamic_symbols.h"
 #include "iree/base/api.h"
 #include "iree/hal/api.h"
-#include "iree/hal/drivers/cuda/context_wrapper.h"
-#include "iree/hal/drivers/cuda/status_util.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif  // __cplusplus
 
-// Create a cuda allocator.
-iree_status_t iree_hal_cuda_allocator_create(
-    iree_hal_device_t* base_device, iree_hal_cuda_context_wrapper_t* context,
-    CUdevice device, CUstream stream, iree_hal_allocator_t** out_allocator);
+// Creates a CUDA allocator that allocates device memory from the given
+// |device| for use in the given |stream|.
+iree_status_t iree_hal_cuda2_allocator_create(
+    iree_hal_device_t* base_device,
+    const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols, CUdevice device,
+    CUstream stream, iree_allocator_t host_allocator,
+    iree_hal_allocator_t** out_allocator);
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
 
-#endif  // IREE_HAL_DRIVERS_CUDA_ALLOCATOR_H_
+#endif  // EXPERIMENTAL_CUDA2_CUDA_ALLOCATOR_H_
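A caller-side sketch of the new create signature, assuming a hypothetical device-setup path that has already loaded the dynamic symbols and created the context and stream; create_device_allocator is an illustrative name, not an IREE API:

#include "experimental/cuda2/cuda_allocator.h"

// Illustrative wrapper; a real device implementation would call this from
// its own creation path once symbols, context, and stream exist.
iree_status_t create_device_allocator(
    iree_hal_device_t* base_device,
    const iree_hal_cuda2_dynamic_symbols_t* symbols, CUdevice cu_device,
    CUstream cu_stream, iree_allocator_t host_allocator,
    iree_hal_allocator_t** out_allocator) {
  // Every CUDA call inside the allocator is routed through |symbols| and
  // every host heap allocation through |host_allocator|; there is no
  // context wrapper indirection anymore.
  return iree_hal_cuda2_allocator_create(base_device, symbols, cu_device,
                                         cu_stream, host_allocator,
                                         out_allocator);
}

Passing the symbol table and host allocator explicitly makes each dependency visible at the call site, which is the point of dropping the old iree_hal_cuda_context_wrapper_t.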
diff --git a/experimental/cuda2/cuda_buffer.c b/experimental/cuda2/cuda_buffer.c
index 3b9c9e13f9e0..d5d4a19ea4e3 100644
--- a/experimental/cuda2/cuda_buffer.c
+++ b/experimental/cuda2/cuda_buffer.c
@@ -1,10 +1,10 @@
-// Copyright 2021 The IREE Authors
+// Copyright 2023 The IREE Authors
 //
 // Licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#include "iree/hal/drivers/cuda/cuda_buffer.h"
+#include "experimental/cuda2/cuda_buffer.h"
 
 #include <stddef.h>
 #include <string.h>
@@ -13,34 +13,34 @@
 #include "iree/base/api.h"
 #include "iree/base/tracing.h"
 
-typedef struct iree_hal_cuda_buffer_t {
+typedef struct iree_hal_cuda2_buffer_t {
   iree_hal_buffer_t base;
-  iree_hal_cuda_buffer_type_t type;
+  iree_hal_cuda2_buffer_type_t type;
   void* host_ptr;
   CUdeviceptr device_ptr;
   iree_hal_buffer_release_callback_t release_callback;
-} iree_hal_cuda_buffer_t;
+} iree_hal_cuda2_buffer_t;
 
-static const iree_hal_buffer_vtable_t iree_hal_cuda_buffer_vtable;
+static const iree_hal_buffer_vtable_t iree_hal_cuda2_buffer_vtable;
 
-static iree_hal_cuda_buffer_t* iree_hal_cuda_buffer_cast(
+static iree_hal_cuda2_buffer_t* iree_hal_cuda2_buffer_cast(
     iree_hal_buffer_t* base_value) {
-  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda_buffer_vtable);
-  return (iree_hal_cuda_buffer_t*)base_value;
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda2_buffer_vtable);
+  return (iree_hal_cuda2_buffer_t*)base_value;
 }
 
-static const iree_hal_cuda_buffer_t* iree_hal_cuda_buffer_const_cast(
+static const iree_hal_cuda2_buffer_t* iree_hal_cuda2_buffer_const_cast(
     const iree_hal_buffer_t* base_value) {
-  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda_buffer_vtable);
-  return (const iree_hal_cuda_buffer_t*)base_value;
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda2_buffer_vtable);
+  return (const iree_hal_cuda2_buffer_t*)base_value;
 }
 
-iree_status_t iree_hal_cuda_buffer_wrap(
+iree_status_t iree_hal_cuda2_buffer_wrap(
     iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type,
     iree_hal_memory_access_t allowed_access,
     iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size,
     iree_device_size_t byte_offset, iree_device_size_t byte_length,
-    iree_hal_cuda_buffer_type_t buffer_type, CUdeviceptr device_ptr,
+    iree_hal_cuda2_buffer_type_t buffer_type, CUdeviceptr device_ptr,
     void* host_ptr, iree_hal_buffer_release_callback_t release_callback,
     iree_hal_buffer_t** out_buffer) {
   IREE_ASSERT_ARGUMENT(allocator);
@@ -49,14 +49,14 @@ iree_status_t iree_hal_cuda_buffer_wrap(
   iree_allocator_t host_allocator =
       iree_hal_allocator_host_allocator(allocator);
-  iree_hal_cuda_buffer_t* buffer = NULL;
+  iree_hal_cuda2_buffer_t* buffer = NULL;
   iree_status_t status =
       iree_allocator_malloc(host_allocator, sizeof(*buffer), (void**)&buffer);
   if (iree_status_is_ok(status)) {
     iree_hal_buffer_initialize(host_allocator, allocator, &buffer->base,
                                allocation_size, byte_offset, byte_length,
                                memory_type, allowed_access, allowed_usage,
-                               &iree_hal_cuda_buffer_vtable, &buffer->base);
+                               &iree_hal_cuda2_buffer_vtable, &buffer->base);
     buffer->type = buffer_type;
     buffer->host_ptr = host_ptr;
     buffer->device_ptr = device_ptr;
@@ -68,8 +68,8 @@ iree_status_t iree_hal_cuda_buffer_wrap(
   return status;
 }
 
-static void iree_hal_cuda_buffer_destroy(iree_hal_buffer_t* base_buffer) {
-  iree_hal_cuda_buffer_t* buffer = iree_hal_cuda_buffer_cast(base_buffer);
+static void iree_hal_cuda2_buffer_destroy(iree_hal_buffer_t* base_buffer) {
+  iree_hal_cuda2_buffer_t* buffer = iree_hal_cuda2_buffer_cast(base_buffer);
   iree_allocator_t host_allocator = base_buffer->host_allocator;
   IREE_TRACE_ZONE_BEGIN(z0);
   if (buffer->release_callback.fn) {
@@ -80,12 +80,14 @@ static void iree_hal_cuda_buffer_destroy(iree_hal_buffer_t* base_buffer) {
   IREE_TRACE_ZONE_END(z0);
 }
 
-static iree_status_t iree_hal_cuda_buffer_map_range(
+static iree_status_t iree_hal_cuda2_buffer_map_range(
     iree_hal_buffer_t* base_buffer, iree_hal_mapping_mode_t mapping_mode,
     iree_hal_memory_access_t memory_access,
     iree_device_size_t local_byte_offset, iree_device_size_t local_byte_length,
     iree_hal_buffer_mapping_t* mapping) {
-  iree_hal_cuda_buffer_t* buffer = iree_hal_cuda_buffer_cast(base_buffer);
+  IREE_ASSERT_ARGUMENT(base_buffer);
+  IREE_ASSERT_ARGUMENT(mapping);
+  iree_hal_cuda2_buffer_t* buffer = iree_hal_cuda2_buffer_cast(base_buffer);
 
   // TODO(benvanik): add upload/download for unmapped buffers.
   IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_memory_type(
@@ -110,52 +112,52 @@ static iree_status_t iree_hal_cuda_buffer_map_range(
   return iree_ok_status();
 }
 
-static iree_status_t iree_hal_cuda_buffer_unmap_range(
+static iree_status_t iree_hal_cuda2_buffer_unmap_range(
     iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset,
     iree_device_size_t local_byte_length, iree_hal_buffer_mapping_t* mapping) {
-  // Nothing to do (today).
+  // Nothing to do today.
   return iree_ok_status();
 }
 
-static iree_status_t iree_hal_cuda_buffer_invalidate_range(
+static iree_status_t iree_hal_cuda2_buffer_invalidate_range(
     iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset,
     iree_device_size_t local_byte_length) {
-  // Nothing to do.
+  // Nothing to do today.
   return iree_ok_status();
 }
 
-static iree_status_t iree_hal_cuda_buffer_flush_range(
+static iree_status_t iree_hal_cuda2_buffer_flush_range(
     iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset,
     iree_device_size_t local_byte_length) {
-  // Nothing to do.
+  // Nothing to do today.
   return iree_ok_status();
 }
 
-iree_hal_cuda_buffer_type_t iree_hal_cuda_buffer_type(
+iree_hal_cuda2_buffer_type_t iree_hal_cuda2_buffer_type(
     const iree_hal_buffer_t* base_buffer) {
-  const iree_hal_cuda_buffer_t* buffer =
-      iree_hal_cuda_buffer_const_cast(base_buffer);
+  const iree_hal_cuda2_buffer_t* buffer =
+      iree_hal_cuda2_buffer_const_cast(base_buffer);
   return buffer->type;
 }
 
-CUdeviceptr iree_hal_cuda_buffer_device_pointer(
+CUdeviceptr iree_hal_cuda2_buffer_device_pointer(
     const iree_hal_buffer_t* base_buffer) {
-  const iree_hal_cuda_buffer_t* buffer =
-      iree_hal_cuda_buffer_const_cast(base_buffer);
+  const iree_hal_cuda2_buffer_t* buffer =
+      iree_hal_cuda2_buffer_const_cast(base_buffer);
   return buffer->device_ptr;
 }
 
-void* iree_hal_cuda_buffer_host_pointer(const iree_hal_buffer_t* base_buffer) {
-  const iree_hal_cuda_buffer_t* buffer =
-      iree_hal_cuda_buffer_const_cast(base_buffer);
+void* iree_hal_cuda2_buffer_host_pointer(const iree_hal_buffer_t* base_buffer) {
+  const iree_hal_cuda2_buffer_t* buffer =
+      iree_hal_cuda2_buffer_const_cast(base_buffer);
   return buffer->host_ptr;
 }
 
-static const iree_hal_buffer_vtable_t iree_hal_cuda_buffer_vtable = {
+static const iree_hal_buffer_vtable_t iree_hal_cuda2_buffer_vtable = {
     .recycle = iree_hal_buffer_recycle,
-    .destroy = iree_hal_cuda_buffer_destroy,
-    .map_range = iree_hal_cuda_buffer_map_range,
-    .unmap_range = iree_hal_cuda_buffer_unmap_range,
-    .invalidate_range = iree_hal_cuda_buffer_invalidate_range,
-    .flush_range = iree_hal_cuda_buffer_flush_range,
+    .destroy = iree_hal_cuda2_buffer_destroy,
+    .map_range = iree_hal_cuda2_buffer_map_range,
+    .unmap_range = iree_hal_cuda2_buffer_unmap_range,
+    .invalidate_range = iree_hal_cuda2_buffer_invalidate_range,
+    .flush_range = iree_hal_cuda2_buffer_flush_range,
 };
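The IREE_HAL_CUDA_BUFFER_TYPE_HOST_REGISTERED type documented in cuda_buffer.h below corresponds to the pinned-host-memory pattern in the driver API, which the import path in cuda_allocator.c above follows. A minimal standalone sketch of that pattern (not IREE code; register_host_memory is an illustrative name):

#include <cuda.h>

// Pins |host_ptr| and resolves the device-visible alias for it. The inverse
// operation (cuMemHostUnregister) is what the allocator issues when freeing
// IREE_HAL_CUDA_BUFFER_TYPE_HOST_REGISTERED buffers.
static CUresult register_host_memory(void* host_ptr, size_t size,
                                     CUdeviceptr* out_device_ptr) {
  CUresult result =
      cuMemHostRegister(host_ptr, size, CU_MEMHOSTREGISTER_DEVICEMAP);
  if (result != CUDA_SUCCESS) return result;
  result = cuMemHostGetDevicePointer(out_device_ptr, host_ptr, /*Flags=*/0);
  // Roll back the registration if the device mapping cannot be resolved.
  if (result != CUDA_SUCCESS) cuMemHostUnregister(host_ptr);
  return result;
}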
diff --git a/experimental/cuda2/cuda_buffer.h b/experimental/cuda2/cuda_buffer.h
index 0b07e8ddea7f..57bde62aab19 100644
--- a/experimental/cuda2/cuda_buffer.h
+++ b/experimental/cuda2/cuda_buffer.h
@@ -1,54 +1,57 @@
-// Copyright 2021 The IREE Authors
+// Copyright 2023 The IREE Authors
 //
 // Licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#ifndef IREE_HAL_DRIVERS_CUDA_BUFFER_H_
-#define IREE_HAL_DRIVERS_CUDA_BUFFER_H_
+#ifndef EXPERIMENTAL_CUDA2_CUDA_BUFFER_H_
+#define EXPERIMENTAL_CUDA2_CUDA_BUFFER_H_
 
+#include "experimental/cuda2/cuda_headers.h"
 #include "iree/base/api.h"
 #include "iree/hal/api.h"
-#include "iree/hal/drivers/cuda/cuda_headers.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif  // __cplusplus
 
-typedef enum iree_hal_cuda_buffer_type_e {
-  // cuMemAlloc/cuMemAllocManaged + cuMemFree
+typedef enum iree_hal_cuda2_buffer_type_e {
+  // Device local buffer; allocated with
+  // cuMemAlloc/cuMemAllocManaged/cuMemAllocAsync, freed with cuMemFree.
   IREE_HAL_CUDA_BUFFER_TYPE_DEVICE = 1u << 0,
-  // cuMemHostAlloc + cuMemFreeHost
+  // Host local buffer; allocated with cuMemHostAlloc, freed with
+  // cuMemFreeHost.
   IREE_HAL_CUDA_BUFFER_TYPE_HOST = 1u << 1,
-  // cuMemHostRegister + cuMemHostUnregister
+  // Host local buffer; registered with cuMemHostRegister, freed with
+  // cuMemHostUnregister.
   IREE_HAL_CUDA_BUFFER_TYPE_HOST_REGISTERED = 1u << 2,
-} iree_hal_cuda_buffer_type_t;
+} iree_hal_cuda2_buffer_type_t;
 
 // Wraps a CUDA allocation in an iree_hal_buffer_t.
-iree_status_t iree_hal_cuda_buffer_wrap(
+iree_status_t iree_hal_cuda2_buffer_wrap(
     iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type,
     iree_hal_memory_access_t allowed_access,
     iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size,
     iree_device_size_t byte_offset, iree_device_size_t byte_length,
-    iree_hal_cuda_buffer_type_t buffer_type, CUdeviceptr device_ptr,
+    iree_hal_cuda2_buffer_type_t buffer_type, CUdeviceptr device_ptr,
     void* host_ptr, iree_hal_buffer_release_callback_t release_callback,
     iree_hal_buffer_t** out_buffer);
 
-// Returns the underlying CUDA buffer type.
-iree_hal_cuda_buffer_type_t iree_hal_cuda_buffer_type(
+// Returns the underlying CUDA buffer type of the given |buffer|.
+iree_hal_cuda2_buffer_type_t iree_hal_cuda2_buffer_type(
     const iree_hal_buffer_t* buffer);
 
-// Returns the CUDA base pointer for the given |buffer|.
-// This is the entire allocated_buffer and must be offset by the buffer
-// byte_offset and byte_length when used.
-CUdeviceptr iree_hal_cuda_buffer_device_pointer(
+// Returns the CUDA base device pointer for the given |buffer|.
+//
+// Note that this is the entire allocated_buffer and must be offset by the
+// buffer byte_offset and byte_length when used.
+CUdeviceptr iree_hal_cuda2_buffer_device_pointer(
    const iree_hal_buffer_t* buffer);
 
 // Returns the CUDA host pointer for the given |buffer|, if available.
-void* iree_hal_cuda_buffer_host_pointer(const iree_hal_buffer_t* buffer);
+void* iree_hal_cuda2_buffer_host_pointer(const iree_hal_buffer_t* buffer);
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
 
-#endif  // IREE_HAL_DRIVERS_CUDA_BUFFER_H_
+#endif  // EXPERIMENTAL_CUDA2_CUDA_BUFFER_H_
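To close, a hypothetical sketch of using iree_hal_cuda2_buffer_wrap to adopt an externally allocated device pointer; the release callback fires when the wrapping buffer is destroyed so the external owner can reclaim the memory. Names outside the IREE/CUDA APIs (wrap_external_device_memory, my_release_fn) are illustrative, and the memory/access/usage flags shown are one plausible choice rather than a requirement:

#include "experimental/cuda2/cuda_buffer.h"
#include "iree/hal/api.h"

// Invoked once the last reference to the wrapping buffer is released.
static void my_release_fn(void* user_data, iree_hal_buffer_t* buffer) {
  // Return the device memory to whatever pool or API originally allocated it.
}

iree_status_t wrap_external_device_memory(
    iree_hal_allocator_t* device_allocator, CUdeviceptr device_ptr,
    iree_device_size_t size, iree_hal_buffer_t** out_buffer) {
  iree_hal_buffer_release_callback_t release_callback = {
      /*.fn=*/my_release_fn,
      /*.user_data=*/NULL,
  };
  return iree_hal_cuda2_buffer_wrap(
      device_allocator, IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL,
      IREE_HAL_MEMORY_ACCESS_ALL, IREE_HAL_BUFFER_USAGE_DEFAULT,
      /*allocation_size=*/size, /*byte_offset=*/0, /*byte_length=*/size,
      IREE_HAL_CUDA_BUFFER_TYPE_DEVICE, device_ptr, /*host_ptr=*/NULL,
      release_callback, out_buffer);
}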