Skip to content

Commit

Permalink
[cuda] Port over tracing utilities and use in NCCL channel (#14063)
Browse files Browse the repository at this point in the history
The main change is removing the context wrapper and including CUDA
dynamic symbols directly.

Progress towards #13245
  • Loading branch information
antiagainst committed Jun 13, 2023
1 parent 85a1a56 commit 85eb21b
Show file tree
Hide file tree
Showing 8 changed files with 473 additions and 0 deletions.
2 changes: 2 additions & 0 deletions experimental/cuda2/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ iree_cc_library(
"nccl_channel.h"
"pipeline_layout.c"
"pipeline_layout.h"
"tracing.c"
"tracing.h"
DEPS
::dynamic_symbols
iree::base
Expand Down
10 changes: 10 additions & 0 deletions experimental/cuda2/api.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,16 @@ typedef struct iree_hal_cuda2_device_params_t {
// transient allocations while also increasing memory consumption.
iree_host_size_t arena_block_size;

// Enables tracing of command buffers when IREE tracing is enabled.
// May take advantage of additional extensions for more accurate timing or
// hardware-specific performance counters.
//
// NOTE: tracing has a non-trivial overhead and will skew the timing of
// submissions and introduce false barriers between dispatches. Use this to
// identify slow dispatches and refine from there; be wary of whole-program
// tracing with this enabled.
bool stream_tracing;

// Whether to use async allocations when the device reports them as available.
// Set to false to avoid them even on devices that support them. Defaults to
// true.
bool async_allocations;
Expand Down
11 changes: 11 additions & 0 deletions experimental/cuda2/cuda_device.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "experimental/cuda2/nccl_dynamic_symbols.h"
#include "experimental/cuda2/nop_executable_cache.h"
#include "experimental/cuda2/pipeline_layout.h"
#include "experimental/cuda2/tracing.h"
#include "iree/base/internal/arena.h"
#include "iree/base/internal/math.h"
#include "iree/hal/utils/buffer_transfer.h"
Expand Down Expand Up @@ -53,6 +54,8 @@ typedef struct iree_hal_cuda2_device_t {
// TODO: support multiple streams.
CUstream cu_stream;

iree_hal_cuda2_tracing_context_t* tracing_context;

iree_allocator_t host_allocator;

// Device memory pools and allocators.
Expand Down Expand Up @@ -82,6 +85,7 @@ IREE_API_EXPORT void iree_hal_cuda2_device_params_initialize(
memset(out_params, 0, sizeof(*out_params));
out_params->arena_block_size = 32 * 1024;
out_params->queue_count = 1;
out_params->stream_tracing = false;
out_params->async_allocations = true;
}

Expand Down Expand Up @@ -128,7 +132,13 @@ static iree_status_t iree_hal_cuda2_device_create_internal(
device->cu_stream = stream;
device->host_allocator = host_allocator;

// Enable tracing for the (currently only) stream - no-op if disabled.
iree_status_t status = iree_ok_status();
if (device->params.stream_tracing) {
status = iree_hal_cuda2_tracing_context_allocate(
device->cuda_symbols, device->identifier, stream, &device->block_pool,
host_allocator, &device->tracing_context);
}

// Memory pool support is conditional.
if (iree_status_is_ok(status) && params->async_allocations) {
Expand Down Expand Up @@ -237,6 +247,7 @@ static void iree_hal_cuda2_device_destroy(iree_hal_device_t* base_device) {
iree_hal_cuda2_memory_pools_deinitialize(&device->memory_pools);

// TODO: support multiple streams.
iree_hal_cuda2_tracing_context_free(device->tracing_context);
IREE_CUDA_IGNORE_ERROR(device->cuda_symbols,
cuStreamDestroy(device->cu_stream));

Expand Down
26 changes: 26 additions & 0 deletions experimental/cuda2/nccl_channel.c
Original file line number Diff line number Diff line change
Expand Up @@ -544,11 +544,29 @@ static iree_status_t iree_hal_cuda2_nccl_submit_batch_entry(

iree_status_t iree_hal_cuda2_nccl_submit_batch(
const iree_hal_cuda2_nccl_dynamic_symbols_t* symbols,
iree_hal_cuda2_tracing_context_t* tracing_context,
const iree_hal_collective_batch_t* batch, CUstream stream) {
IREE_ASSERT_ARGUMENT(symbols);
IREE_ASSERT_ARGUMENT(batch);
IREE_ASSERT_ARGUMENT(stream);

// Begin one zone for each entry in the batch. The zones will show stacked on
// top of each other and, unfortunately, each uses its own independent CUDA
// events. We could optimize this by changing the tracing context to expose an
// API with event reservation and then zone commit using an existing event.
IREE_TRACE({
iree_bitfield_string_temp_t string_temp;
for (iree_host_size_t i = 0; i < batch->count; ++i) {
iree_hal_collective_batch_entry_t* entry = &batch->entries[i];
iree_string_view_t collective_str =
iree_hal_collective_op_format(&entry->op, &string_temp);
IREE_CUDA_TRACE_ZONE_BEGIN_EXTERNAL(
tracing_context, stream, __FILE__, strlen(__FILE__),
(uint32_t)__LINE__, __FUNCTION__, strlen(__FUNCTION__),
collective_str.data, collective_str.size);
}
});

// Issue all collective operations in the batch as part of a group.
// NCCL may be able to fuse or reduce overheads by issuing like this.
IREE_NCCL_RETURN_IF_ERROR(symbols, ncclGroupStart(), "ncclGroupStart");
Expand All @@ -558,6 +576,14 @@ iree_status_t iree_hal_cuda2_nccl_submit_batch(
}
IREE_NCCL_RETURN_IF_ERROR(symbols, ncclGroupEnd(), "ncclGroupEnd");

// End all zones we began above. Note that these are simply nested, so order
// doesn't matter as long as we end the right number of zones.
IREE_TRACE({
for (iree_host_size_t i = 0; i < batch->count; ++i) {
IREE_CUDA_TRACE_ZONE_END(tracing_context, stream);
}
});

return iree_ok_status();
}

Expand Down
2 changes: 2 additions & 0 deletions experimental/cuda2/nccl_channel.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "experimental/cuda2/cuda_dynamic_symbols.h"
#include "experimental/cuda2/cuda_headers.h"
#include "experimental/cuda2/nccl_dynamic_symbols.h"
#include "experimental/cuda2/tracing.h"
#include "iree/base/api.h"
#include "iree/hal/api.h"
#include "iree/hal/utils/collective_batch.h"
Expand Down Expand Up @@ -48,6 +49,7 @@ iree_status_t iree_hal_cuda2_nccl_channel_create(
// Note that operations in the batch may apply to different channels.
iree_status_t iree_hal_cuda2_nccl_submit_batch(
const iree_hal_cuda2_nccl_dynamic_symbols_t* nccl_symbols,
iree_hal_cuda2_tracing_context_t* tracing_context,
const iree_hal_collective_batch_t* batch, CUstream stream);

#ifdef __cplusplus
Expand Down
7 changes: 7 additions & 0 deletions experimental/cuda2/registration/driver_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@ IREE_FLAG(
bool, cuda_async_allocations, true,
"Enables CUDA asynchronous stream-ordered allocations when supported.");

// Command-line toggle for CUDA stream tracing. The driver factory copies this
// value into device_params.stream_tracing after the params have been
// initialized to their defaults.
// NOTE(review): this flag defaults to true while
// iree_hal_cuda2_device_params_initialize() defaults stream_tracing to false;
// presumably intentional because tracing only takes effect in instrumented
// builds - confirm.
IREE_FLAG(
bool, cuda_tracing, true,
"Enables tracing of stream events when Tracy instrumentation is enabled.\n"
"Severely impacts benchmark timings and should only be used when\n"
"analyzing dispatch timings.");

IREE_FLAG(int32_t, cuda2_default_index, 0,
"Specifies the index of the default CUDA device to use");

Expand Down Expand Up @@ -84,6 +90,7 @@ static iree_status_t iree_hal_cuda2_driver_factory_try_create(

iree_hal_cuda2_device_params_t device_params;
iree_hal_cuda2_device_params_initialize(&device_params);
device_params.stream_tracing = FLAG_cuda_tracing;
device_params.async_allocations = FLAG_cuda_async_allocations;

driver_options.default_device_index = FLAG_cuda2_default_index;
Expand Down
Loading

0 comments on commit 85eb21b

Please sign in to comment.