From 85eb21b6e7519798e6351d65b21ddf54a98cbffe Mon Sep 17 00:00:00 2001
From: Lei Zhang <antiagainst@google.com>
Date: Tue, 13 Jun 2023 19:55:42 -0400
Subject: [PATCH] [cuda] Port over tracing utilities and use in NCCL channel 
 (#14063)

The main change is removing the context wrapper and including CUDA
dynamic symbols directly.

Progress towards https://github.com/openxla/iree/issues/13245
---
 experimental/cuda2/CMakeLists.txt             |   2 +
 experimental/cuda2/api.h                      |  10 +
 experimental/cuda2/cuda_device.c              |  11 +
 experimental/cuda2/nccl_channel.c             |  26 ++
 experimental/cuda2/nccl_channel.h             |   2 +
 .../cuda2/registration/driver_module.c        |   7 +
 experimental/cuda2/tracing.c                  | 293 ++++++++++++++++++
 experimental/cuda2/tracing.h                  | 122 ++++++++
 8 files changed, 473 insertions(+)
 create mode 100644 experimental/cuda2/tracing.c
 create mode 100644 experimental/cuda2/tracing.h

diff --git a/experimental/cuda2/CMakeLists.txt b/experimental/cuda2/CMakeLists.txt
index 299260975ed1..b9be87d26f9a 100644
--- a/experimental/cuda2/CMakeLists.txt
+++ b/experimental/cuda2/CMakeLists.txt
@@ -34,6 +34,8 @@ iree_cc_library(
     "nccl_channel.h"
     "pipeline_layout.c"
     "pipeline_layout.h"
+    "tracing.c"
+    "tracing.h"
   DEPS
     ::dynamic_symbols
     iree::base
diff --git a/experimental/cuda2/api.h b/experimental/cuda2/api.h
index 20951b06bb5f..5df17c54918f 100644
--- a/experimental/cuda2/api.h
+++ b/experimental/cuda2/api.h
@@ -59,6 +59,16 @@ typedef struct iree_hal_cuda2_device_params_t {
   // transient allocations while also increasing memory consumption.
   iree_host_size_t arena_block_size;
 
+  // Enables tracing of command buffers when IREE tracing is enabled.
+  // May take advantage of additional extensions for more accurate timing or
+  // hardware-specific performance counters.
+  //
+  // NOTE: tracing has a non-trivial overhead and will skew the timing of
+  // submissions and introduce false barriers between dispatches. Use this to
+  // identify slow dispatches and refine from there; be wary of whole-program
+  // tracing with this enabled.
+  bool stream_tracing;
+
   // Whether to use async allocations even if reported as available by the
   // device. Defaults to true when the device supports it.
   bool async_allocations;
diff --git a/experimental/cuda2/cuda_device.c b/experimental/cuda2/cuda_device.c
index 36bf8ab8fdf6..e51d3260cac3 100644
--- a/experimental/cuda2/cuda_device.c
+++ b/experimental/cuda2/cuda_device.c
@@ -19,6 +19,7 @@
 #include "experimental/cuda2/nccl_dynamic_symbols.h"
 #include "experimental/cuda2/nop_executable_cache.h"
 #include "experimental/cuda2/pipeline_layout.h"
+#include "experimental/cuda2/tracing.h"
 #include "iree/base/internal/arena.h"
 #include "iree/base/internal/math.h"
 #include "iree/hal/utils/buffer_transfer.h"
@@ -53,6 +54,8 @@ typedef struct iree_hal_cuda2_device_t {
   // TODO: support multiple streams.
   CUstream cu_stream;
 
+  iree_hal_cuda2_tracing_context_t* tracing_context;
+
   iree_allocator_t host_allocator;
 
   // Device memory pools and allocators.
@@ -82,6 +85,7 @@ IREE_API_EXPORT void iree_hal_cuda2_device_params_initialize(
   memset(out_params, 0, sizeof(*out_params));
   out_params->arena_block_size = 32 * 1024;
   out_params->queue_count = 1;
+  out_params->stream_tracing = false;
   out_params->async_allocations = true;
 }
 
@@ -128,7 +132,13 @@ static iree_status_t iree_hal_cuda2_device_create_internal(
   device->cu_stream = stream;
   device->host_allocator = host_allocator;
 
+  // Enable tracing for the (currently only) stream - no-op if disabled.
   iree_status_t status = iree_ok_status();
+  if (device->params.stream_tracing) {
+    status = iree_hal_cuda2_tracing_context_allocate(
+        device->cuda_symbols, device->identifier, stream, &device->block_pool,
+        host_allocator, &device->tracing_context);
+  }
 
   // Memory pool support is conditional.
   if (iree_status_is_ok(status) && params->async_allocations) {
@@ -237,6 +247,7 @@ static void iree_hal_cuda2_device_destroy(iree_hal_device_t* base_device) {
   iree_hal_cuda2_memory_pools_deinitialize(&device->memory_pools);
 
   // TODO: support multiple streams.
+  iree_hal_cuda2_tracing_context_free(device->tracing_context);
   IREE_CUDA_IGNORE_ERROR(device->cuda_symbols,
                          cuStreamDestroy(device->cu_stream));
 
diff --git a/experimental/cuda2/nccl_channel.c b/experimental/cuda2/nccl_channel.c
index e0bf3dc3a636..55ffc52a62b3 100644
--- a/experimental/cuda2/nccl_channel.c
+++ b/experimental/cuda2/nccl_channel.c
@@ -544,11 +544,29 @@ static iree_status_t iree_hal_cuda2_nccl_submit_batch_entry(
 
 iree_status_t iree_hal_cuda2_nccl_submit_batch(
     const iree_hal_cuda2_nccl_dynamic_symbols_t* symbols,
+    iree_hal_cuda2_tracing_context_t* tracing_context,
     const iree_hal_collective_batch_t* batch, CUstream stream) {
   IREE_ASSERT_ARGUMENT(symbols);
   IREE_ASSERT_ARGUMENT(batch);
   IREE_ASSERT_ARGUMENT(stream);
 
+  // Begin one zone for each entry in the batch. Each entry will show stacked on
+  // top of each other and unfortunately use independent CUDA events. We could
+  // optimize this by changing the tracing context to expose an API with event
+  // reservation and then zone commit using an existing event.
+  IREE_TRACE({
+    iree_bitfield_string_temp_t string_temp;
+    for (iree_host_size_t i = 0; i < batch->count; ++i) {
+      iree_hal_collective_batch_entry_t* entry = &batch->entries[i];
+      iree_string_view_t collective_str =
+          iree_hal_collective_op_format(&entry->op, &string_temp);
+      IREE_CUDA_TRACE_ZONE_BEGIN_EXTERNAL(
+          tracing_context, stream, __FILE__, strlen(__FILE__),
+          (uint32_t)__LINE__, __FUNCTION__, strlen(__FUNCTION__),
+          collective_str.data, collective_str.size);
+    }
+  });
+
   // Issue all collective operations in the batch as part of a group.
   // NCCL may be able to fuse or reduce overheads by issuing like this.
   IREE_NCCL_RETURN_IF_ERROR(symbols, ncclGroupStart(), "ncclGroupStart");
@@ -558,6 +576,14 @@ iree_status_t iree_hal_cuda2_nccl_submit_batch(
   }
   IREE_NCCL_RETURN_IF_ERROR(symbols, ncclGroupEnd(), "ncclGroupEnd");
 
+  // End all zones we began above - note that these are just simply nested so
+  // order doesn't matter so long as we end the right number of zones.
+  IREE_TRACE({
+    for (iree_host_size_t i = 0; i < batch->count; ++i) {
+      IREE_CUDA_TRACE_ZONE_END(tracing_context, stream);
+    }
+  });
+
   return iree_ok_status();
 }
 
diff --git a/experimental/cuda2/nccl_channel.h b/experimental/cuda2/nccl_channel.h
index a6da61f61e30..ad8bfef68667 100644
--- a/experimental/cuda2/nccl_channel.h
+++ b/experimental/cuda2/nccl_channel.h
@@ -11,6 +11,7 @@
 #include "experimental/cuda2/cuda_dynamic_symbols.h"
 #include "experimental/cuda2/cuda_headers.h"
 #include "experimental/cuda2/nccl_dynamic_symbols.h"
+#include "experimental/cuda2/tracing.h"
 #include "iree/base/api.h"
 #include "iree/hal/api.h"
 #include "iree/hal/utils/collective_batch.h"
@@ -48,6 +49,7 @@ iree_status_t iree_hal_cuda2_nccl_channel_create(
 // Note that operations in the batch may apply to different channels.
 iree_status_t iree_hal_cuda2_nccl_submit_batch(
     const iree_hal_cuda2_nccl_dynamic_symbols_t* nccl_symbols,
+    iree_hal_cuda2_tracing_context_t* tracing_context,
     const iree_hal_collective_batch_t* batch, CUstream stream);
 
 #ifdef __cplusplus
diff --git a/experimental/cuda2/registration/driver_module.c b/experimental/cuda2/registration/driver_module.c
index 33d93a3929e0..f7d46afb1f88 100644
--- a/experimental/cuda2/registration/driver_module.c
+++ b/experimental/cuda2/registration/driver_module.c
@@ -17,6 +17,12 @@ IREE_FLAG(
     bool, cuda_async_allocations, true,
     "Enables CUDA asynchronous stream-ordered allocations when supported.");
 
+IREE_FLAG(
+    bool, cuda_tracing, true,
+    "Enables tracing of stream events when Tracy instrumentation is enabled.\n"
+    "Severely impacts benchmark timings and should only be used when\n"
+    "analyzing dispatch timings.");
+
 IREE_FLAG(int32_t, cuda2_default_index, 0,
           "Specifies the index of the default CUDA device to use");
 
@@ -84,6 +90,7 @@ static iree_status_t iree_hal_cuda2_driver_factory_try_create(
 
   iree_hal_cuda2_device_params_t device_params;
   iree_hal_cuda2_device_params_initialize(&device_params);
+  device_params.stream_tracing = FLAG_cuda_tracing;
   device_params.async_allocations = FLAG_cuda_async_allocations;
 
   driver_options.default_device_index = FLAG_cuda2_default_index;
diff --git a/experimental/cuda2/tracing.c b/experimental/cuda2/tracing.c
new file mode 100644
index 000000000000..6243ad8780d5
--- /dev/null
+++ b/experimental/cuda2/tracing.c
@@ -0,0 +1,293 @@
+// Copyright 2023 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "experimental/cuda2/tracing.h"
+
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+
+#include "experimental/cuda2/cuda_dynamic_symbols.h"
+#include "experimental/cuda2/cuda_status_util.h"
+
+// Total number of events per tracing context. This translates to the maximum
+// number of outstanding timestamp queries before collection is required.
+// To prevent spilling pages we leave some room for the context structure.
+#define IREE_HAL_CUDA_TRACING_DEFAULT_QUERY_CAPACITY (16 * 1024 - 256)
+
+struct iree_hal_cuda2_tracing_context_t {
+  const iree_hal_cuda2_dynamic_symbols_t* symbols;
+
+  CUstream stream;
+  iree_arena_block_pool_t* block_pool;
+  iree_allocator_t host_allocator;
+
+  // A unique GPU context ID allocated from Tracy.
+  // There is a global limit of 255 GPU contexts (ID 255 is special).
+  uint8_t id;
+
+  // Base event used for computing relative times for all recorded events.
+  // This is required as CUDA (without CUPTI) only allows for relative timing
+  // between events and we need a stable base event.
+  CUevent base_event;
+
+  // Ringbuffer over |event_pool|: insert at head, collect from tail.
+  uint32_t query_head;
+  uint32_t query_tail;
+  uint32_t query_capacity;
+
+  // Events reused round-robin to capture timestamps; indexed by query ID.
+  CUevent event_pool[IREE_HAL_CUDA_TRACING_DEFAULT_QUERY_CAPACITY];
+};
+
+static iree_status_t iree_hal_cuda2_tracing_context_initial_calibration(
+    const iree_hal_cuda2_dynamic_symbols_t* symbols, CUstream stream,
+    CUevent base_event, int64_t* out_cpu_timestamp, int64_t* out_gpu_timestamp,
+    float* out_timestamp_period) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  *out_cpu_timestamp = 0;
+  *out_gpu_timestamp = 0;
+  *out_timestamp_period = 1.0f;
+
+  // Record event to the stream; in the absence of a synchronize this may not
+  // flush immediately.
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, IREE_CURESULT_TO_STATUS(symbols, cuEventRecord(base_event, stream)));
+
+  // Force flush the event and wait for it to complete.
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, IREE_CURESULT_TO_STATUS(symbols, cuEventSynchronize(base_event)));
+
+  // Track when we know the event has completed and has a reasonable timestamp.
+  // This may drift from the actual time differential between host/device but is
+  // (maybe?) the best we can do without CUPTI.
+  *out_cpu_timestamp = iree_tracing_time();
+
+  IREE_TRACE_ZONE_END(z0);
+  return iree_ok_status();
+}
+
+iree_status_t iree_hal_cuda2_tracing_context_allocate(
+    const iree_hal_cuda2_dynamic_symbols_t* symbols,
+    iree_string_view_t queue_name, CUstream stream,
+    iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator,
+    iree_hal_cuda2_tracing_context_t** out_context) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_ASSERT_ARGUMENT(symbols);
+  IREE_ASSERT_ARGUMENT(stream);
+  IREE_ASSERT_ARGUMENT(block_pool);
+  IREE_ASSERT_ARGUMENT(out_context);
+  *out_context = NULL;
+
+  iree_hal_cuda2_tracing_context_t* context = NULL;
+  iree_status_t status =
+      iree_allocator_malloc(host_allocator, sizeof(*context), (void**)&context);
+  if (iree_status_is_ok(status)) {
+    context->symbols = symbols;
+    context->stream = stream;
+    context->block_pool = block_pool;
+    context->host_allocator = host_allocator;
+    context->query_capacity = IREE_ARRAYSIZE(context->event_pool);
+  }
+
+  // Pre-allocate all events in the event pool.
+  if (iree_status_is_ok(status)) {
+    IREE_TRACE_ZONE_BEGIN_NAMED(
+        z_event_pool, "iree_hal_cuda2_tracing_context_allocate_event_pool");
+    IREE_TRACE_ZONE_APPEND_VALUE(z_event_pool,
+                                 (int64_t)context->query_capacity);
+    for (iree_host_size_t i = 0; i < context->query_capacity; ++i) {
+      status = IREE_CURESULT_TO_STATUS(
+          symbols, cuEventCreate(&context->event_pool[i], CU_EVENT_DEFAULT));
+      if (!iree_status_is_ok(status)) break;
+    }
+    IREE_TRACE_ZONE_END(z_event_pool);
+  }
+
+  // Create the initial GPU event and insert it into the stream.
+  // All events we record are relative to this event.
+  int64_t cpu_timestamp = 0;
+  int64_t gpu_timestamp = 0;
+  float timestamp_period = 0.0f;
+  if (iree_status_is_ok(status)) {
+    status = IREE_CURESULT_TO_STATUS(
+        symbols, cuEventCreate(&context->base_event, CU_EVENT_DEFAULT));
+  }
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_cuda2_tracing_context_initial_calibration(
+        symbols, stream, context->base_event, &cpu_timestamp, &gpu_timestamp,
+        &timestamp_period);
+  }
+
+  // Allocate the GPU context and pass initial calibration data.
+  if (iree_status_is_ok(status)) {
+    context->id = iree_tracing_gpu_context_allocate(
+        IREE_TRACING_GPU_CONTEXT_TYPE_VULKAN, queue_name.data, queue_name.size,
+        /*is_calibrated=*/false, cpu_timestamp, gpu_timestamp,
+        timestamp_period);
+  }
+
+  if (iree_status_is_ok(status)) {
+    *out_context = context;
+  } else {
+    iree_hal_cuda2_tracing_context_free(context);
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+void iree_hal_cuda2_tracing_context_free(
+    iree_hal_cuda2_tracing_context_t* context) {
+  if (!context) return;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Always perform a collection on shutdown.
+  iree_hal_cuda2_tracing_context_collect(context);
+
+  // Release all events; since collection completed they should all be unused.
+  IREE_TRACE_ZONE_BEGIN_NAMED(z_event_pool,
+                              "iree_hal_cuda2_tracing_context_free_event_pool");
+  for (iree_host_size_t i = 0; i < context->query_capacity; ++i) {
+    if (context->event_pool[i]) {
+      IREE_CUDA_IGNORE_ERROR(context->symbols,
+                             cuEventDestroy(context->event_pool[i]));
+    }
+  }
+  IREE_TRACE_ZONE_END(z_event_pool);
+  if (context->base_event) {
+    IREE_CUDA_IGNORE_ERROR(context->symbols,
+                           cuEventDestroy(context->base_event));
+  }
+
+  iree_allocator_t host_allocator = context->host_allocator;
+  iree_allocator_free(host_allocator, context);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+void iree_hal_cuda2_tracing_context_collect(
+    iree_hal_cuda2_tracing_context_t* context) {
+  if (!context) return;
+  if (context->query_tail == context->query_head) {
+    // No outstanding queries.
+    return;
+  }
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  while (context->query_tail != context->query_head) {
+    // Compute the contiguous range of queries between tail and head (or the
+    // end of the pool when wrapped; the remainder is handled next iteration).
+    uint32_t try_query_count =
+        context->query_head < context->query_tail
+            ? context->query_capacity - context->query_tail
+            : context->query_head - context->query_tail;
+    IREE_TRACE_ZONE_APPEND_VALUE(z0, (int64_t)try_query_count);
+
+    // Scan and feed the times to tracy, stopping when we hit the first
+    // unavailable query.
+    uint32_t query_base = context->query_tail;
+    uint32_t read_query_count = 0;
+    for (uint32_t i = 0; i < try_query_count; ++i) {
+      // Ensure the event has completed. Any failure (CUDA_ERROR_NOT_READY or a
+      // deferred error) ends this scan and the outer loop then tries again.
+      uint16_t query_id = (uint16_t)(query_base + i);
+      CUevent query_event = context->event_pool[query_id];
+      CUresult result = context->symbols->cuEventQuery(query_event);
+      if (result != CUDA_SUCCESS) break;
+
+      // Convert base-relative milliseconds to nanoseconds and notify tracy.
+      float relative_millis = 0.0f;
+      IREE_CUDA_IGNORE_ERROR(
+          context->symbols,
+          cuEventElapsedTime(&relative_millis, context->base_event,
+                             query_event));
+      int64_t gpu_timestamp = (int64_t)((double)relative_millis * 1000000.0);
+      iree_tracing_gpu_zone_notify(context->id, query_id, gpu_timestamp);
+
+      read_query_count = i + 1;
+    }
+    IREE_TRACE_ZONE_APPEND_VALUE(z0, (int64_t)read_query_count);
+
+    context->query_tail += read_query_count;
+    if (context->query_tail >= context->query_capacity) {
+      context->query_tail = 0;
+    }
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+static uint16_t iree_hal_cuda2_tracing_context_insert_query(
+    iree_hal_cuda2_tracing_context_t* context, CUstream stream) {
+  // Claim the event at the ring head as this query's ID and advance the head.
+  uint32_t query_id = context->query_head;
+  context->query_head = (context->query_head + 1) % context->query_capacity;
+
+  // TODO: handle the read and write heads of the ringbuffer overlapping. We
+  // could try to collect but it's not guaranteed that collection will complete
+  // (e.g. we may be reserving events for use in graphs not yet launched).
+  //
+  // For now overlap is only caught by the assert below; where that check is
+  // compiled out (presumably release builds) results will be inconsistent.
+  IREE_ASSERT_NE(context->query_head, context->query_tail);
+
+  CUevent event = context->event_pool[query_id];
+  IREE_CUDA_IGNORE_ERROR(context->symbols, cuEventRecord(event, stream));
+
+  return query_id;
+}
+
+// TODO: optimize this implementation to reduce the number of events required:
+// today we insert 2 events per zone (one for begin and one for end) but in
+// many cases we could reduce this by inserting events only between zones and
+// using the differences between them.
+
+void iree_hal_cuda2_tracing_zone_begin_impl(
+    iree_hal_cuda2_tracing_context_t* context, CUstream stream,
+    const iree_tracing_location_t* src_loc) {
+  if (context == NULL) return;  // No context: nothing to record.
+  const uint16_t begin_query =
+      iree_hal_cuda2_tracing_context_insert_query(context, stream);
+  iree_tracing_gpu_zone_begin(context->id, begin_query, src_loc);
+}
+
+void iree_hal_cuda2_tracing_zone_begin_external_impl(
+    iree_hal_cuda2_tracing_context_t* context, CUstream stream,
+    const char* file_name, size_t file_name_length, uint32_t line,
+    const char* function_name, size_t function_name_length, const char* name,
+    size_t name_length) {
+  if (context == NULL) return;  // No context: nothing to record.
+  const uint16_t begin_query =
+      iree_hal_cuda2_tracing_context_insert_query(context, stream);
+  iree_tracing_gpu_zone_begin_external(
+      context->id, begin_query, file_name, file_name_length, line,
+      function_name, function_name_length, name, name_length);
+}
+
+void iree_hal_cuda2_tracing_zone_end_impl(
+    iree_hal_cuda2_tracing_context_t* context, CUstream stream) {
+  if (context == NULL) return;  // No context: nothing to record.
+  const uint16_t end_query =
+      iree_hal_cuda2_tracing_context_insert_query(context, stream);
+  iree_tracing_gpu_zone_end(context->id, end_query);
+}
+
+#else
+
+iree_status_t iree_hal_cuda2_tracing_context_allocate(
+    const iree_hal_cuda2_dynamic_symbols_t* symbols,
+    iree_string_view_t queue_name, CUstream stream,
+    iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator,
+    iree_hal_cuda2_tracing_context_t** out_context) {
+  *out_context = NULL;
+  return iree_ok_status();
+}
+
+void iree_hal_cuda2_tracing_context_free(
+    iree_hal_cuda2_tracing_context_t* context) {}
+
+void iree_hal_cuda2_tracing_context_collect(
+    iree_hal_cuda2_tracing_context_t* context) {}
+
+#endif  // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
diff --git a/experimental/cuda2/tracing.h b/experimental/cuda2/tracing.h
new file mode 100644
index 000000000000..57b6786dc469
--- /dev/null
+++ b/experimental/cuda2/tracing.h
@@ -0,0 +1,122 @@
+// Copyright 2023 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef EXPERIMENTAL_CUDA2_TRACING_H_
+#define EXPERIMENTAL_CUDA2_TRACING_H_
+
+#include "experimental/cuda2/cuda_dynamic_symbols.h"
+#include "experimental/cuda2/cuda_headers.h"
+#include "iree/base/api.h"
+#include "iree/base/internal/arena.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Per-stream CUDA tracing context.
+// No-op if IREE tracing is not enabled.
+//
+// Use the IREE_CUDA_TRACE_* macros to trace a contiguous set of stream
+// operations. Unlike the normal tracy macros there are no zone IDs and instead
+// each stream gets an ID allocated once and passed to all tracing macros.
+//
+// Usage:
+//   IREE_CUDA_TRACE_ZONE_BEGIN(queue->tracing_context, stream);
+//   cuLaunchKernel(..., stream);
+//   IREE_CUDA_TRACE_ZONE_END(queue->tracing_context, stream);
+//   ...
+//   iree_hal_cuda2_tracing_context_collect(queue->tracing_context);
+//
+// NOTE: timestamps can have non-trivial side-effecting behavior and may
+// introduce serialization in graph execution.
+//
+// TODO(benvanik): expose CUevent reservation separate from recording. For
+// graphs we will need to insert the events but in order to reuse the graphs
+// we'll need to reserve and patch new events each graph launch. For now we
+// don't instrument graphs.
+//
+// Thread-compatible: external synchronization is required if using from
+// multiple threads (same as with CUstream itself).
+typedef struct iree_hal_cuda2_tracing_context_t
+    iree_hal_cuda2_tracing_context_t;
+
+// Allocates a tracing context for the given CUDA |stream|.
+// Each context must only be used with the stream it was created for.
+iree_status_t iree_hal_cuda2_tracing_context_allocate(
+    const iree_hal_cuda2_dynamic_symbols_t* symbols,
+    iree_string_view_t queue_name, CUstream stream,
+    iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator,
+    iree_hal_cuda2_tracing_context_t** out_context);
+
+// Frees a tracing context and all associated CUDA resources.
+// All submissions using the resources must be completed prior to calling.
+void iree_hal_cuda2_tracing_context_free(
+    iree_hal_cuda2_tracing_context_t* context);
+
+// Collects in-flight timestamp queries from the stream and feeds them to tracy.
+// Must be called frequently (every submission, etc) to drain the backlog;
+// tracing may start failing if the internal ringbuffer is exceeded.
+void iree_hal_cuda2_tracing_context_collect(
+    iree_hal_cuda2_tracing_context_t* context);
+
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+
+// Begins a normal zone derived from the calling |src_loc|.
+// Must be perfectly nested and paired with a corresponding zone end.
+void iree_hal_cuda2_tracing_zone_begin_impl(
+    iree_hal_cuda2_tracing_context_t* context, CUstream stream,
+    const iree_tracing_location_t* src_loc);
+
+// Begins an external zone using the given source information.
+// The provided strings will be copied into the tracy buffer.
+void iree_hal_cuda2_tracing_zone_begin_external_impl(
+    iree_hal_cuda2_tracing_context_t* context, CUstream stream,
+    const char* file_name, size_t file_name_length, uint32_t line,
+    const char* function_name, size_t function_name_length, const char* name,
+    size_t name_length);
+
+void iree_hal_cuda2_tracing_zone_end_impl(
+    iree_hal_cuda2_tracing_context_t* context, CUstream stream);
+
+// Begins an unnamed zone attributed to the calling function, file, and line.
+#define IREE_CUDA_TRACE_ZONE_BEGIN(context, stream)                           \
+  static const iree_tracing_location_t TracyConcat(                           \
+      __tracy_source_location, __LINE__) = {NULL, __FUNCTION__, __FILE__,     \
+                                            (uint32_t)__LINE__, 0};           \
+  iree_hal_cuda2_tracing_zone_begin_impl(                                     \
+      context, stream, &TracyConcat(__tracy_source_location, __LINE__));
+
+// Begins an externally defined zone with a dynamic source location.
+// The |file_name|, |function_name|, and optional |name| strings will be copied
+// into the trace buffer and do not need to persist.
+#define IREE_CUDA_TRACE_ZONE_BEGIN_EXTERNAL(                             \
+    context, stream, file_name, file_name_length, line, function_name,   \
+    function_name_length, name, name_length)                             \
+  iree_hal_cuda2_tracing_zone_begin_external_impl(                       \
+      context, stream, file_name, file_name_length, line, function_name, \
+      function_name_length, name, name_length)
+
+// Ends the current zone. Unlike CPU zones there is no |zone_id| to pass.
+#define IREE_CUDA_TRACE_ZONE_END(context, stream) \
+  iree_hal_cuda2_tracing_zone_end_impl(context, stream)
+
+#else
+
+#define IREE_CUDA_TRACE_ZONE_BEGIN(context, stream)
+#define IREE_CUDA_TRACE_ZONE_BEGIN_EXTERNAL(                           \
+    context, stream, file_name, file_name_length, line, function_name, \
+    function_name_length, name, name_length)
+#define IREE_CUDA_TRACE_ZONE_END(context, stream)
+
+#endif  // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // EXPERIMENTAL_CUDA2_TRACING_H_