From 85eb21b6e7519798e6351d65b21ddf54a98cbffe Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Tue, 13 Jun 2023 19:55:42 -0400 Subject: [PATCH] [cuda] Port over tracing utilities and use in NCCL channel (#14063) The main change is removing the context wrapper and including CUDA dynamic symbols directly. Progress towards https://github.com/openxla/iree/issues/13245 --- experimental/cuda2/CMakeLists.txt | 2 + experimental/cuda2/api.h | 10 + experimental/cuda2/cuda_device.c | 11 + experimental/cuda2/nccl_channel.c | 26 ++ experimental/cuda2/nccl_channel.h | 2 + .../cuda2/registration/driver_module.c | 7 + experimental/cuda2/tracing.c | 293 ++++++++++++++++++ experimental/cuda2/tracing.h | 122 ++++++++ 8 files changed, 473 insertions(+) create mode 100644 experimental/cuda2/tracing.c create mode 100644 experimental/cuda2/tracing.h diff --git a/experimental/cuda2/CMakeLists.txt b/experimental/cuda2/CMakeLists.txt index 299260975ed1..b9be87d26f9a 100644 --- a/experimental/cuda2/CMakeLists.txt +++ b/experimental/cuda2/CMakeLists.txt @@ -34,6 +34,8 @@ iree_cc_library( "nccl_channel.h" "pipeline_layout.c" "pipeline_layout.h" + "tracing.c" + "tracing.h" DEPS ::dynamic_symbols iree::base diff --git a/experimental/cuda2/api.h b/experimental/cuda2/api.h index 20951b06bb5f..5df17c54918f 100644 --- a/experimental/cuda2/api.h +++ b/experimental/cuda2/api.h @@ -59,6 +59,16 @@ typedef struct iree_hal_cuda2_device_params_t { // transient allocations while also increasing memory consumption. iree_host_size_t arena_block_size; + // Enables tracing of command buffers when IREE tracing is enabled. + // May take advantage of additional extensions for more accurate timing or + // hardware-specific performance counters. + // + // NOTE: tracing has a non-trivial overhead and will skew the timing of + // submissions and introduce false barriers between dispatches. Use this to + // identify slow dispatches and refine from there; be wary of whole-program + // tracing with this enabled. 
+ bool stream_tracing; + // Whether to use async allocations even if reported as available by the // device. Defaults to true when the device supports it. bool async_allocations; diff --git a/experimental/cuda2/cuda_device.c b/experimental/cuda2/cuda_device.c index 36bf8ab8fdf6..e51d3260cac3 100644 --- a/experimental/cuda2/cuda_device.c +++ b/experimental/cuda2/cuda_device.c @@ -19,6 +19,7 @@ #include "experimental/cuda2/nccl_dynamic_symbols.h" #include "experimental/cuda2/nop_executable_cache.h" #include "experimental/cuda2/pipeline_layout.h" +#include "experimental/cuda2/tracing.h" #include "iree/base/internal/arena.h" #include "iree/base/internal/math.h" #include "iree/hal/utils/buffer_transfer.h" @@ -53,6 +54,8 @@ typedef struct iree_hal_cuda2_device_t { // TODO: support multiple streams. CUstream cu_stream; + iree_hal_cuda2_tracing_context_t* tracing_context; + iree_allocator_t host_allocator; // Device memory pools and allocators. @@ -82,6 +85,7 @@ IREE_API_EXPORT void iree_hal_cuda2_device_params_initialize( memset(out_params, 0, sizeof(*out_params)); out_params->arena_block_size = 32 * 1024; out_params->queue_count = 1; + out_params->stream_tracing = false; out_params->async_allocations = true; } @@ -128,7 +132,13 @@ static iree_status_t iree_hal_cuda2_device_create_internal( device->cu_stream = stream; device->host_allocator = host_allocator; + // Enable tracing for the (currently only) stream - no-op if disabled. iree_status_t status = iree_ok_status(); + if (device->params.stream_tracing) { + status = iree_hal_cuda2_tracing_context_allocate( + device->cuda_symbols, device->identifier, stream, &device->block_pool, + host_allocator, &device->tracing_context); + } // Memory pool support is conditional. if (iree_status_is_ok(status) && params->async_allocations) { @@ -237,6 +247,7 @@ static void iree_hal_cuda2_device_destroy(iree_hal_device_t* base_device) { iree_hal_cuda2_memory_pools_deinitialize(&device->memory_pools); // TODO: support multiple streams. 
+ iree_hal_cuda2_tracing_context_free(device->tracing_context); IREE_CUDA_IGNORE_ERROR(device->cuda_symbols, cuStreamDestroy(device->cu_stream)); diff --git a/experimental/cuda2/nccl_channel.c b/experimental/cuda2/nccl_channel.c index e0bf3dc3a636..55ffc52a62b3 100644 --- a/experimental/cuda2/nccl_channel.c +++ b/experimental/cuda2/nccl_channel.c @@ -544,11 +544,29 @@ static iree_status_t iree_hal_cuda2_nccl_submit_batch_entry( iree_status_t iree_hal_cuda2_nccl_submit_batch( const iree_hal_cuda2_nccl_dynamic_symbols_t* symbols, + iree_hal_cuda2_tracing_context_t* tracing_context, const iree_hal_collective_batch_t* batch, CUstream stream) { IREE_ASSERT_ARGUMENT(symbols); IREE_ASSERT_ARGUMENT(batch); IREE_ASSERT_ARGUMENT(stream); + // Begin one zone for each entry in the batch. Each entry will show stacked on + // top of each other and unfortunately use independent CUDA events. We could + // optimize this by changing the tracing context to expose an API with event + // reservation and then zone commit using an existing event. + IREE_TRACE({ + iree_bitfield_string_temp_t string_temp; + for (iree_host_size_t i = 0; i < batch->count; ++i) { + iree_hal_collective_batch_entry_t* entry = &batch->entries[i]; + iree_string_view_t collective_str = + iree_hal_collective_op_format(&entry->op, &string_temp); + IREE_CUDA_TRACE_ZONE_BEGIN_EXTERNAL( + tracing_context, stream, __FILE__, strlen(__FILE__), + (uint32_t)__LINE__, __FUNCTION__, strlen(__FUNCTION__), + collective_str.data, collective_str.size); + } + }); + // Issue all collective operations in the batch as part of a group. // NCCL may be able to fuse or reduce overheads by issuing like this. 
IREE_NCCL_RETURN_IF_ERROR(symbols, ncclGroupStart(), "ncclGroupStart"); @@ -558,6 +576,14 @@ iree_status_t iree_hal_cuda2_nccl_submit_batch( } IREE_NCCL_RETURN_IF_ERROR(symbols, ncclGroupEnd(), "ncclGroupEnd"); + // End all zones we began above - note that these are just simply nested so + // order doesn't matter so long as we end the right number of zones. + IREE_TRACE({ + for (iree_host_size_t i = 0; i < batch->count; ++i) { + IREE_CUDA_TRACE_ZONE_END(tracing_context, stream); + } + }); + return iree_ok_status(); } diff --git a/experimental/cuda2/nccl_channel.h b/experimental/cuda2/nccl_channel.h index a6da61f61e30..ad8bfef68667 100644 --- a/experimental/cuda2/nccl_channel.h +++ b/experimental/cuda2/nccl_channel.h @@ -11,6 +11,7 @@ #include "experimental/cuda2/cuda_dynamic_symbols.h" #include "experimental/cuda2/cuda_headers.h" #include "experimental/cuda2/nccl_dynamic_symbols.h" +#include "experimental/cuda2/tracing.h" #include "iree/base/api.h" #include "iree/hal/api.h" #include "iree/hal/utils/collective_batch.h" @@ -48,6 +49,7 @@ iree_status_t iree_hal_cuda2_nccl_channel_create( // Note that operations in the batch may apply to different channels. 
iree_status_t iree_hal_cuda2_nccl_submit_batch( const iree_hal_cuda2_nccl_dynamic_symbols_t* nccl_symbols, + iree_hal_cuda2_tracing_context_t* tracing_context, const iree_hal_collective_batch_t* batch, CUstream stream); #ifdef __cplusplus diff --git a/experimental/cuda2/registration/driver_module.c b/experimental/cuda2/registration/driver_module.c index 33d93a3929e0..f7d46afb1f88 100644 --- a/experimental/cuda2/registration/driver_module.c +++ b/experimental/cuda2/registration/driver_module.c @@ -17,6 +17,12 @@ IREE_FLAG( bool, cuda_async_allocations, true, "Enables CUDA asynchronous stream-ordered allocations when supported."); +IREE_FLAG( + bool, cuda_tracing, true, + "Enables tracing of stream events when Tracy instrumentation is enabled.\n" + "Severely impacts benchmark timings and should only be used when\n" + "analyzing dispatch timings."); + IREE_FLAG(int32_t, cuda2_default_index, 0, "Specifies the index of the default CUDA device to use"); @@ -84,6 +90,7 @@ static iree_status_t iree_hal_cuda2_driver_factory_try_create( iree_hal_cuda2_device_params_t device_params; iree_hal_cuda2_device_params_initialize(&device_params); + device_params.stream_tracing = FLAG_cuda_tracing; device_params.async_allocations = FLAG_cuda_async_allocations; driver_options.default_device_index = FLAG_cuda2_default_index; diff --git a/experimental/cuda2/tracing.c b/experimental/cuda2/tracing.c new file mode 100644 index 000000000000..6243ad8780d5 --- /dev/null +++ b/experimental/cuda2/tracing.c @@ -0,0 +1,293 @@ +// Copyright 2023 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "experimental/cuda2/tracing.h"
+
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+
+#include "experimental/cuda2/cuda_dynamic_symbols.h"
+#include "experimental/cuda2/cuda_status_util.h"
+
+// Total number of events per tracing context. This translates to the maximum
+// number of outstanding timestamp queries before collection is required.
+// To prevent spilling pages we leave some room for the context structure.
+#define IREE_HAL_CUDA_TRACING_DEFAULT_QUERY_CAPACITY (16 * 1024 - 256)
+
+// State backing one tracing context: a fixed pool of CUDA events used as a
+// ringbuffer of timestamp queries plus the base event they are measured from.
+struct iree_hal_cuda2_tracing_context_t {
+  // CUDA driver entry points used for all event calls made by this file.
+  const iree_hal_cuda2_dynamic_symbols_t* symbols;
+
+  // Values captured at allocation time. |host_allocator| is used to free the
+  // context; |stream| and |block_pool| are only stored by the code in this
+  // file (recording uses the stream passed to each zone call).
+  CUstream stream;
+  iree_arena_block_pool_t* block_pool;
+  iree_allocator_t host_allocator;
+
+  // A unique GPU zone ID allocated from Tracy.
+  // There is a global limit of 255 GPU zones (ID 255 is special).
+  uint8_t id;
+
+  // Base event used for computing relative times for all recorded events.
+  // This is required as CUDA (without CUPTI) only allows for relative timing
+  // between events and we need a stable base event.
+  CUevent base_event;
+
+  // Indices into |event_pool| defining a ringbuffer.
+  // |query_head| is the next slot handed out when inserting a query and
+  // |query_tail| is the oldest slot that has not been collected yet; the
+  // ringbuffer is empty when they are equal.
+  uint32_t query_head;
+  uint32_t query_tail;
+  uint32_t query_capacity;
+
+  // Event pool reused to capture tracing timestamps.
+  CUevent event_pool[IREE_HAL_CUDA_TRACING_DEFAULT_QUERY_CAPACITY];
+};
+
+// Records |base_event| on |stream|, blocks the host until it has completed and
+// then samples the host clock into |out_cpu_timestamp|.
+// |out_gpu_timestamp| is always left at 0 and |out_timestamp_period| at 1.0f:
+// all GPU times reported later are relative to |base_event|.
+// On failure the outputs keep those defaults and the CUDA error is returned.
+static iree_status_t iree_hal_cuda2_tracing_context_initial_calibration(
+    const iree_hal_cuda2_dynamic_symbols_t* symbols, CUstream stream,
+    CUevent base_event, int64_t* out_cpu_timestamp, int64_t* out_gpu_timestamp,
+    float* out_timestamp_period) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  *out_cpu_timestamp = 0;
+  *out_gpu_timestamp = 0;
+  *out_timestamp_period = 1.0f;
+
+  // Record event to the stream; in the absence of a synchronize this may not
+  // flush immediately.
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, IREE_CURESULT_TO_STATUS(symbols, cuEventRecord(base_event, stream)));
+
+  // Force flush the event and wait for it to complete.
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, IREE_CURESULT_TO_STATUS(symbols, cuEventSynchronize(base_event)));
+
+  // Track when we know the event has completed and has a reasonable timestamp.
+  // This may drift from the actual time differential between host/device but is
+  // (maybe?) the best we can do without CUPTI.
+  *out_cpu_timestamp = iree_tracing_time();
+
+  IREE_TRACE_ZONE_END(z0);
+  return iree_ok_status();
+}
+
+// Allocates a tracing context for |stream|: creates every event in the pool,
+// creates and records the base event, and registers a GPU context named
+// |queue_name| with the tracing backend.
+// On success |out_context| receives the new context; on any failure everything
+// created so far is released, |out_context| stays NULL and the error is
+// returned.
+iree_status_t iree_hal_cuda2_tracing_context_allocate(
+    const iree_hal_cuda2_dynamic_symbols_t* symbols,
+    iree_string_view_t queue_name, CUstream stream,
+    iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator,
+    iree_hal_cuda2_tracing_context_t** out_context) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_ASSERT_ARGUMENT(symbols);
+  IREE_ASSERT_ARGUMENT(stream);
+  IREE_ASSERT_ARGUMENT(block_pool);
+  IREE_ASSERT_ARGUMENT(out_context);
+  *out_context = NULL;
+
+  // NOTE(review): the cleanup path below skips NULL events, which assumes the
+  // allocation comes back zero-filled - confirm iree_allocator_malloc
+  // guarantees that.
+  iree_hal_cuda2_tracing_context_t* context = NULL;
+  iree_status_t status =
+      iree_allocator_malloc(host_allocator, sizeof(*context), (void**)&context);
+  if (iree_status_is_ok(status)) {
+    context->symbols = symbols;
+    context->stream = stream;
+    context->block_pool = block_pool;
+    context->host_allocator = host_allocator;
+    context->query_capacity = IREE_ARRAYSIZE(context->event_pool);
+  }
+
+  // Pre-allocate all events in the event pool.
+  if (iree_status_is_ok(status)) {
+    IREE_TRACE_ZONE_BEGIN_NAMED(
+        z_event_pool, "iree_hal_cuda2_tracing_context_allocate_event_pool");
+    IREE_TRACE_ZONE_APPEND_VALUE(z_event_pool,
+                                 (int64_t)context->query_capacity);
+    for (iree_host_size_t i = 0; i < context->query_capacity; ++i) {
+      status = IREE_CURESULT_TO_STATUS(
+          symbols, cuEventCreate(&context->event_pool[i], CU_EVENT_DEFAULT));
+      if (!iree_status_is_ok(status)) break;
+    }
+    IREE_TRACE_ZONE_END(z_event_pool);
+  }
+
+  // Create the initial GPU event and insert it into the stream.
+  // All events we record are relative to this event.
+  int64_t cpu_timestamp = 0;
+  int64_t gpu_timestamp = 0;
+  float timestamp_period = 0.0f;
+  if (iree_status_is_ok(status)) {
+    status = IREE_CURESULT_TO_STATUS(
+        symbols, cuEventCreate(&context->base_event, CU_EVENT_DEFAULT));
+  }
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_cuda2_tracing_context_initial_calibration(
+        symbols, stream, context->base_event, &cpu_timestamp, &gpu_timestamp,
+        &timestamp_period);
+  }
+
+  // Allocate the GPU context and pass initial calibration data.
+  // NOTE(review): registered with the VULKAN context type even though this is
+  // a CUDA stream - presumably no CUDA-specific type is exposed; confirm.
+  if (iree_status_is_ok(status)) {
+    context->id = iree_tracing_gpu_context_allocate(
+        IREE_TRACING_GPU_CONTEXT_TYPE_VULKAN, queue_name.data, queue_name.size,
+        /*is_calibrated=*/false, cpu_timestamp, gpu_timestamp,
+        timestamp_period);
+  }
+
+  if (iree_status_is_ok(status)) {
+    *out_context = context;
+  } else {
+    // Safe with a NULL or partially initialized context.
+    iree_hal_cuda2_tracing_context_free(context);
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Collects any outstanding queries, destroys all events and frees |context|.
+// A NULL |context| is a no-op.
+void iree_hal_cuda2_tracing_context_free(
+    iree_hal_cuda2_tracing_context_t* context) {
+  if (!context) return;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Always perform a collection on shutdown.
+  iree_hal_cuda2_tracing_context_collect(context);
+
+  // Release all events; since collection completed they should all be unused.
+  IREE_TRACE_ZONE_BEGIN_NAMED(z_event_pool,
+                              "iree_hal_cuda2_tracing_context_free_event_pool");
+  for (iree_host_size_t i = 0; i < context->query_capacity; ++i) {
+    if (context->event_pool[i]) {
+      IREE_CUDA_IGNORE_ERROR(context->symbols,
+                             cuEventDestroy(context->event_pool[i]));
+    }
+  }
+  IREE_TRACE_ZONE_END(z_event_pool);
+  if (context->base_event) {
+    IREE_CUDA_IGNORE_ERROR(context->symbols,
+                           cuEventDestroy(context->base_event));
+  }
+
+  // Copy the allocator out first: it lives inside the memory being freed.
+  iree_allocator_t host_allocator = context->host_allocator;
+  iree_allocator_free(host_allocator, context);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Drains the query ringbuffer, reporting each completed event's time relative
+// to the base event to the tracing backend.
+// Events that are still in flight (CUDA_ERROR_NOT_READY) are polled until they
+// complete, so this call blocks until the ringbuffer is empty. Any other
+// failure stops the collection and leaves the remaining queries outstanding.
+// A NULL |context| is a no-op.
+void iree_hal_cuda2_tracing_context_collect(
+    iree_hal_cuda2_tracing_context_t* context) {
+  if (!context) return;
+  if (context->query_tail == context->query_head) {
+    // No outstanding queries.
+    return;
+  }
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  while (context->query_tail != context->query_head) {
+    // Compute the contiguous range of queries ready to be read.
+    // If the ringbuffer wraps around we'll handle that in the next loop.
+    uint32_t try_query_count =
+        context->query_head < context->query_tail
+            ? context->query_capacity - context->query_tail
+            : context->query_head - context->query_tail;
+    IREE_TRACE_ZONE_APPEND_VALUE(z0, (int64_t)try_query_count);
+
+    // Scan and feed the times to tracy, stopping when we hit the first
+    // unavailable query.
+    uint32_t query_base = context->query_tail;
+    uint32_t read_query_count = 0;
+    bool query_failed = false;
+    for (uint32_t i = 0; i < try_query_count; ++i) {
+      // CUDA_ERROR_NOT_READY means the event was recorded but has not retired
+      // yet: stop scanning and poll again from the outer loop. Any other
+      // result is treated as a failure that polling cannot be expected to
+      // clear, so we must not keep spinning on it.
+      uint16_t query_id = (uint16_t)(query_base + i);
+      CUevent query_event = context->event_pool[query_id];
+      CUresult result = context->symbols->cuEventQuery(query_event);
+      if (result != CUDA_SUCCESS) {
+        query_failed = result != CUDA_ERROR_NOT_READY;
+        break;
+      }
+
+      // Calculate context-relative time and notify tracy.
+      // cuEventElapsedTime reports milliseconds; scale to nanoseconds.
+      float relative_millis = 0.0f;
+      IREE_CUDA_IGNORE_ERROR(
+          context->symbols,
+          cuEventElapsedTime(&relative_millis, context->base_event,
+                             query_event));
+      int64_t gpu_timestamp = (int64_t)((double)relative_millis * 1000000.0);
+      iree_tracing_gpu_zone_notify(context->id, query_id, gpu_timestamp);
+
+      read_query_count = i + 1;
+    }
+    IREE_TRACE_ZONE_APPEND_VALUE(z0, (int64_t)read_query_count);
+
+    context->query_tail += read_query_count;
+    if (context->query_tail >= context->query_capacity) {
+      context->query_tail = 0;
+    }
+
+    // Give up instead of hanging (possibly during shutdown) on a hard error.
+    if (query_failed) break;
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Takes the next event from the ringbuffer, records it on |stream| and returns
+// its index in the pool for use as the tracing query ID.
+// Errors from recording the event are ignored.
+static uint16_t iree_hal_cuda2_tracing_context_insert_query(
+    iree_hal_cuda2_tracing_context_t* context, CUstream stream) {
+  // Allocate an event from the pool for use by the query.
+  uint32_t query_id = context->query_head;
+  context->query_head = (context->query_head + 1) % context->query_capacity;
+
+  // TODO: check to see if the read and write heads of the ringbuffer have
+  // overlapped. If they have we could try to collect but it's not guaranteed
+  // that collection will complete (e.g. we may be reserving events for use in
+  // graphs that haven't yet been launched).
+  //
+  // For now overlap is only flagged by the assertion below; if execution
+  // continues past it the tracing results will be inconsistent.
+  IREE_ASSERT_NE(context->query_head, context->query_tail);
+
+  CUevent event = context->event_pool[query_id];
+  IREE_CUDA_IGNORE_ERROR(context->symbols, cuEventRecord(event, stream));
+
+  return query_id;
+}
+
+// TODO: optimize this implementation to reduce the number of events required:
+// today we insert 2 events per zone (one for begin and one for end) but in
+// many cases we could reduce this by inserting events only between zones and
+// using the differences between them.
+
+// Opens a GPU zone at the static source location |src_loc|, timestamped by an
+// event recorded on |stream|. Does nothing when |context| is NULL.
+void iree_hal_cuda2_tracing_zone_begin_impl(
+    iree_hal_cuda2_tracing_context_t* context, CUstream stream,
+    const iree_tracing_location_t* src_loc) {
+  if (context) {
+    const uint16_t begin_query =
+        iree_hal_cuda2_tracing_context_insert_query(context, stream);
+    iree_tracing_gpu_zone_begin(context->id, begin_query, src_loc);
+  }
+}
+
+// Opens a GPU zone whose source location is supplied at runtime, timestamped
+// by an event recorded on |stream|. Does nothing when |context| is NULL.
+void iree_hal_cuda2_tracing_zone_begin_external_impl(
+    iree_hal_cuda2_tracing_context_t* context, CUstream stream,
+    const char* file_name, size_t file_name_length, uint32_t line,
+    const char* function_name, size_t function_name_length, const char* name,
+    size_t name_length) {
+  if (context) {
+    const uint16_t begin_query =
+        iree_hal_cuda2_tracing_context_insert_query(context, stream);
+    iree_tracing_gpu_zone_begin_external(
+        context->id, begin_query, file_name, file_name_length, line,
+        function_name, function_name_length, name, name_length);
+  }
+}
+
+// Closes the most recently opened GPU zone, timestamped by an event recorded
+// on |stream|. Does nothing when |context| is NULL.
+void iree_hal_cuda2_tracing_zone_end_impl(
+    iree_hal_cuda2_tracing_context_t* context, CUstream stream) {
+  if (context) {
+    const uint16_t end_query =
+        iree_hal_cuda2_tracing_context_insert_query(context, stream);
+    iree_tracing_gpu_zone_end(context->id, end_query);
+  }
+}
+
+#else
+
+// Instrumentation is compiled out: no context is ever created and callers
+// receive a NULL context together with an OK status.
+iree_status_t iree_hal_cuda2_tracing_context_allocate(
+    const iree_hal_cuda2_dynamic_symbols_t* symbols,
+    iree_string_view_t queue_name, CUstream stream,
+    iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator,
+    iree_hal_cuda2_tracing_context_t** out_context) {
+  *out_context = NULL;
+  return iree_ok_status();
+}
+
+// Nothing to release when instrumentation is compiled out.
+void iree_hal_cuda2_tracing_context_free(
+    iree_hal_cuda2_tracing_context_t* context) {}
+
+// Nothing to collect when instrumentation is compiled out.
+void iree_hal_cuda2_tracing_context_collect(
+    iree_hal_cuda2_tracing_context_t* context) {}
+
+#endif  // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
diff --git a/experimental/cuda2/tracing.h b/experimental/cuda2/tracing.h
new file mode 100644
index 000000000000..57b6786dc469
--- /dev/null
+++ b/experimental/cuda2/tracing.h
@@ -0,0 +1,122 @@
+// Copyright 2023 The IREE Authors
+// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef EXPERIMENTAL_CUDA2_TRACING_H_ +#define EXPERIMENTAL_CUDA2_TRACING_H_ + +#include "experimental/cuda2/cuda_dynamic_symbols.h" +#include "experimental/cuda2/cuda_headers.h" +#include "iree/base/api.h" +#include "iree/base/internal/arena.h" +#include "iree/base/tracing.h" +#include "iree/hal/api.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// Per-stream CUDA tracing context. +// No-op if IREE tracing is not enabled. +// +// Use the IREE_CUDA_TRACE_* macros to trace a contiguous set of stream +// operations. Unlike the normal tracy macros there are no zone IDs and instead +// each stream gets an ID allocated once and passed to all tracing macros. +// +// Usage: +// IREE_CUDA_TRACE_ZONE_BEGIN(queue->tracing_context, stream); +// cuLaunchKernel(..., stream); +// IREE_CUDA_TRACE_ZONE_END(queue->tracing_context, stream); +// ... +// iree_hal_cuda2_tracing_context_collect(queue->tracing_context); +// +// NOTE: timestamps can have non-trivial side-effecting behavior and may +// introduce serialization in graph execution. +// +// TODO(benvanik): expose CUevent reservation separate from recording. For +// graphs we will need to insert the events but in order to reuse the graphs +// we'll need to reserve and patch new events each graph launch. For now we +// don't instrument graphs. +// +// Thread-compatible: external synchronization is required if using from +// multiple threads (same as with CUstream itself). +typedef struct iree_hal_cuda2_tracing_context_t + iree_hal_cuda2_tracing_context_t; + +// Allocates a tracing context for the given CUDA |stream|. +// Each context must only be used with the stream it was created for. 
+// |queue_name| is the name the context is registered under with the tracing
+// backend. When tracing instrumentation is compiled out no context is created:
+// |out_context| is set to NULL and an OK status is returned.
+iree_status_t iree_hal_cuda2_tracing_context_allocate(
+    const iree_hal_cuda2_dynamic_symbols_t* symbols,
+    iree_string_view_t queue_name, CUstream stream,
+    iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator,
+    iree_hal_cuda2_tracing_context_t** out_context);
+
+// Frees a tracing context and all associated CUDA resources.
+// All submissions using the resources must be completed prior to calling.
+// Performs a final collection before releasing the events.
+// A NULL |context| is a no-op.
+void iree_hal_cuda2_tracing_context_free(
+    iree_hal_cuda2_tracing_context_t* context);
+
+// Collects in-flight timestamp queries from the stream and feeds them to tracy.
+// Must be called frequently (every submission, etc) to drain the backlog;
+// tracing may start failing if the internal ringbuffer is exceeded.
+// The implementation keeps polling events that have not completed yet, so the
+// call can block until the outstanding queries have retired.
+// A NULL |context| is a no-op.
+void iree_hal_cuda2_tracing_context_collect(
+    iree_hal_cuda2_tracing_context_t* context);
+
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+
+// Begins a normal zone derived from the calling |src_loc|.
+// Must be perfectly nested and paired with a corresponding zone end.
+// A NULL |context| is a no-op.
+void iree_hal_cuda2_tracing_zone_begin_impl(
+    iree_hal_cuda2_tracing_context_t* context, CUstream stream,
+    const iree_tracing_location_t* src_loc);
+
+// Begins an external zone using the given source information.
+// The provided strings will be copied into the tracy buffer.
+// A NULL |context| is a no-op.
+void iree_hal_cuda2_tracing_zone_begin_external_impl(
+    iree_hal_cuda2_tracing_context_t* context, CUstream stream,
+    const char* file_name, size_t file_name_length, uint32_t line,
+    const char* function_name, size_t function_name_length, const char* name,
+    size_t name_length);
+
+// Ends the zone most recently begun on |context| by recording an end
+// timestamp on |stream|. A NULL |context| is a no-op.
+void iree_hal_cuda2_tracing_zone_end_impl(
+    iree_hal_cuda2_tracing_context_t* context, CUstream stream);
+
+// Begins a new zone with the parent function name.
+#define IREE_CUDA_TRACE_ZONE_BEGIN(context, stream)                           \
+  static const iree_tracing_location_t TracyConcat(                           \
+      __tracy_source_location, __LINE__) = {NULL, __FUNCTION__,               \
+                                            __FILE__, (uint32_t)__LINE__, 0}; \
+  iree_hal_cuda2_tracing_zone_begin_impl(                                     \
+      context, stream, &TracyConcat(__tracy_source_location, __LINE__));
+
+// Begins an externally defined zone with a dynamic source location.
+// The |file_name|, |function_name|, and optional |name| strings will be copied
+// into the trace buffer and do not need to persist.
+#define IREE_CUDA_TRACE_ZONE_BEGIN_EXTERNAL(                             \
+    context, stream, file_name, file_name_length, line, function_name,   \
+    function_name_length, name, name_length)                             \
+  iree_hal_cuda2_tracing_zone_begin_external_impl(                       \
+      context, stream, file_name, file_name_length, line, function_name, \
+      function_name_length, name, name_length)
+
+// Ends the current zone. Must pair with a preceding _BEGIN on |context|.
+#define IREE_CUDA_TRACE_ZONE_END(context, stream) \
+  iree_hal_cuda2_tracing_zone_end_impl(context, stream)
+
+#else
+
+#define IREE_CUDA_TRACE_ZONE_BEGIN(context, stream)
+#define IREE_CUDA_TRACE_ZONE_BEGIN_EXTERNAL(                           \
+    context, stream, file_name, file_name_length, line, function_name, \
+    function_name_length, name, name_length)
+#define IREE_CUDA_TRACE_ZONE_END(context, stream)
+
+#endif  // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // EXPERIMENTAL_CUDA2_TRACING_H_