From 90c4c8ea10d3e110115906146f051fe51784560c Mon Sep 17 00:00:00 2001 From: Abhinav Bhatele Date: Sun, 24 Mar 2024 14:30:01 -0700 Subject: [PATCH 1/5] WIP: allgather benchmark --- allgather.cu | 193 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 193 insertions(+) create mode 100644 allgather.cu diff --git a/allgather.cu b/allgather.cu new file mode 100644 index 0000000..41ba72b --- /dev/null +++ b/allgather.cu @@ -0,0 +1,193 @@ +/* \file allgather.c + * Copyright 2024 Parallel Software and Systems Group, University of Maryland. + * See the top-level LICENSE file for details. + * + * SPDX-License-Identifier: MIT + */ + +#include +#include +#include + +#ifdef USE_CUDA + #include + #include +#endif + +#ifdef USE_NCCL + #include "nccl.h" +#elif defined(USE_RCCL) + #include "rccl.h" +#endif + +#define NUM_GPU_DEVICES_PER_NODE 4 +#define NUM_WARMUP_ITERATIONS 5 + +#define MPICHECK(cmd) do { \ + int e = cmd; \ + if( e != MPI_SUCCESS ) { \ + printf("Failed: MPI error %s:%d '%d'\n", \ + __FILE__,__LINE__, e); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +#define CUDA_CHECK(cmd) do { \ + cudaError_t e = cmd; \ + if(e != cudaSuccess) { \ + printf("CUDA error %s:%d: %s\n", \ + __FILE__, __LINE__, cudaGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +#define NCCLCHECK(cmd) do { \ + ncclResult_t r = cmd; \ + if (r!= ncclSuccess) { \ + printf("Failed, NCCL error %s:%d '%s'\n", \ + __FILE__,__LINE__,ncclGetErrorString(r)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +void initializeData(half *data, int size) { + for (int i = 0; i < (size / sizeof(half)); ++i) { + data[i] = __float2half((float)i); + } +} + +int main(int argc, char *argv[]) { + if (argc != 6) { + fprintf(stderr, "Usage: %s \n", argv[0]); + return EXIT_FAILURE; + } + + int num_gpus = atoi(argv[1]); + int min_msg_size = atoi(argv[2]); + int max_msg_size = atoi(argv[3]); + int iterations = atoi(argv[4]); + + if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) { + fprintf(stderr, "Invalid input parameters.\n"); + return EXIT_FAILURE; + } + + int my_rank, num_pes; + + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + MPI_Comm_size(MPI_COMM_WORLD, &num_pes); + + if (num_pes != num_gpus) { + fprintf(stderr, "Number of processes must match number of GPUs.\n"); + MPI_Finalize(); + return EXIT_FAILURE; + } + + // Initialize GPU context + cudaSetDevice((my_rank % NUM_GPU_DEVICES_PER_NODE)); + + int local_data_size = max_msg_size; // Size of local data to be reduced + int global_data_size = local_data_size * num_gpus; // Size of global data + + half *local_data = (half*)malloc(local_data_size); + half *global_data = (half*)malloc(global_data_size); + + // Initialize local data + initializeData(local_data, local_data_size); + + // Allocate memory on GPU + half *d_local_data, *d_global_data; + CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size)); + CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size)); + + // Copy local data to GPU + CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice)); + + #ifdef USE_NCCL + ncclUniqueId nccl_comm_id; + ncclComm_t nccl_comm; + + if (my_rank == 0) { + /* Generates an Id to be used in ncclCommInitRank. */ + ncclGetUniqueId(&nccl_comm_id); + } + + /* distribute nccl_comm_id to all ranks */ + MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE, + 0, MPI_COMM_WORLD)); + + /* Create a new NCCL communicator */ + NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, rank)); + #elif defined(USE_RCCL) + // TODO: fix later + rcclComm_t rccl_comm; + rcclCommInitRank(&comm, num_gpus, 0, rccl_root); + #endif + + // Perform MPI_Iallgather, NCCL allgather, or RCCL allgather + double total_time, start_time; + MPI_Request request; + MPI_Status status; + + // Print benchmark results + if (my_rank == 0) { + printf("Number of GPUs: %d\n", num_gpus); + printf("Message size range: %d - %d\n", min_msg_size, max_msg_size); + printf("Number of iterations: %d\n", iterations); + } + + for (int msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { + total_time = 0.0; + + // warmup iterations + for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) { + #ifdef USE_MPI + MPICHECK(MPI_Iallgather(d_local_data, msg_size, MPI_HALF, + d_global_data, msg_size, MPI_HALF, MPI_COMM_WORLD, &request)); + + MPICHECK(MPI_Wait(&request, &status)); + #elif defined(USE_NCCL) + NCCLCHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_size, ncclHalf, ncclSum, nccl_comm, NULL); + #elif defined(USE_RCCL) + // TODO: fix later + rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL); + #endif + } + + MPI_Barrier(MPI_COMM_WORLD); + start_time = MPI_Wtime(); + for (int i = 0; i < iterations + 5; ++i) { + #ifdef USE_MPI + MPICHECK(MPI_Iallgather(d_local_data, msg_size, MPI_HALF, + d_global_data, msg_size, MPI_HALF, MPI_COMM_WORLD, &request)); + + MPICHECK(MPI_Wait(&request, &status)); + #elif defined(USE_NCCL) + NCCLCHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_size, ncclHalf, ncclSum, nccl_comm, NULL); + #elif defined(USE_RCCL) + // TODO: fix later + rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL); + #endif + } + MPI_Barrier(MPI_COMM_WORLD); + total_time = MPI_Wtime() - start_time; + printf("%d %.6f seconds\n", msg_size, (total_time/iterations)); + } + + // Cleanup + free(local_data); + free(global_data); + CUDA_CHECK(cudaFree(d_local_data)); + CUDA_CHECK(cudaFree(d_global_data)); + + #ifdef USE_NCCL + ncclCommDestroy(nccl_comm); + #elif defined(USE_RCCL) + rcclCommDestroy(rccl_comm); + #endif + + MPI_Finalize(); + return EXIT_SUCCESS; +} + From 12d0c3d222137caa31c75b13312b2f2320fbdfaa Mon Sep 17 00:00:00 2001 From: Abhinav Bhatele Date: Sun, 24 Mar 2024 18:38:25 -0700 Subject: [PATCH 2/5] change to bf16 --- Makefile | 15 +++++++++++++++ README | 9 +++++++++ allgather.cu | 50 +++++++++++++++++++++++++++++--------------------- 3 files changed, 53 insertions(+), 21 deletions(-) create mode 100644 Makefile create mode 100644 README diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..5d75e9d --- /dev/null +++ b/Makefile @@ -0,0 +1,15 @@ +# Copyright 2024 Parallel Software and Systems Group, University of Maryland. +# See the top-level LICENSE file for details. +# +# SPDX-License-Identifier: MIT + +CC = cc +CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_MPI + +all: allgather.x + +allgather.x: allgather.cu + ${CC} ${CFLAGS} -o allgather.x allgather.cu + +clean: + rm -f allgather.x diff --git a/README b/README new file mode 100644 index 0000000..eba2046 --- /dev/null +++ b/README @@ -0,0 +1,9 @@ +Before compiling do these: + +module load PrgEnv-cray cudatoolkit craype-accel-nvidia80 +export CRAY_ACCEL_TARGET=nvidia80 + +When running do these: + +module load cudatoolkit +export MPICH_GPU_SUPPORT_ENABLED=1 diff --git a/allgather.cu b/allgather.cu index 41ba72b..ea43599 100644 --- a/allgather.cu +++ b/allgather.cu @@ -11,7 +11,7 @@ #ifdef USE_CUDA #include - #include + #include #endif #ifdef USE_NCCL @@ -20,7 +20,6 @@ #include "rccl.h" #endif -#define NUM_GPU_DEVICES_PER_NODE 4 #define NUM_WARMUP_ITERATIONS 5 #define MPICHECK(cmd) do { \ @@ -50,14 +49,14 @@ } \ } while(0) -void initializeData(half *data, int size) { - for (int i = 0; i < (size / sizeof(half)); ++i) { - data[i] = __float2half((float)i); +void initializeData(nv_bfloat16 *data, int size) { + for (int i = 0; i < (size / sizeof(nv_bfloat16)); ++i) { + data[i] = __float2bfloat16((float)i); } } int main(int argc, char *argv[]) { - if (argc != 6) { + if (argc != 5) { fprintf(stderr, "Usage: %s \n", argv[0]); return EXIT_FAILURE; } @@ -85,26 +84,34 @@ int main(int argc, char *argv[]) { } // Initialize GPU context - cudaSetDevice((my_rank % NUM_GPU_DEVICES_PER_NODE)); + int num_gpus_per_node; + cudaGetDeviceCount(&num_gpus_per_node); + cudaSetDevice((my_rank % num_gpus_per_node)); int local_data_size = max_msg_size; // Size of local data to be reduced int global_data_size = local_data_size * num_gpus; // Size of global data - half *local_data = (half*)malloc(local_data_size); - half *global_data = (half*)malloc(global_data_size); + nv_bfloat16 *local_data = (nv_bfloat16*)malloc(local_data_size); + nv_bfloat16 *global_data = (nv_bfloat16*)malloc(global_data_size); // Initialize local data initializeData(local_data, local_data_size); // Allocate memory on GPU - half *d_local_data, *d_global_data; + nv_bfloat16 *d_local_data, *d_global_data; CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size)); CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size)); // Copy local data to GPU CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice)); - #ifdef USE_NCCL + #ifdef USE_MPI + // create 2-byte datatype (send raw, un-interpreted bytes) + MPI_Datatype mpi_type_bfloat16; + MPI_Type_contiguous(2, MPI_BYTE, &mpi_type_bfloat16); + MPI_Type_commit(&mpi_type_bfloat16); + + #elif USE_NCCL ncclUniqueId nccl_comm_id; ncclComm_t nccl_comm; @@ -119,6 +126,7 @@ int main(int argc, char *argv[]) { /* Create a new NCCL communicator */ NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, rank)); + #elif defined(USE_RCCL) // TODO: fix later rcclComm_t rccl_comm; @@ -136,19 +144,18 @@ int main(int argc, char *argv[]) { printf("Message size range: %d - %d\n", min_msg_size, max_msg_size); printf("Number of iterations: %d\n", iterations); } + fflush(NULL); for (int msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { - total_time = 0.0; - // warmup iterations for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) { #ifdef USE_MPI - MPICHECK(MPI_Iallgather(d_local_data, msg_size, MPI_HALF, - d_global_data, msg_size, MPI_HALF, MPI_COMM_WORLD, &request)); + MPICHECK(MPI_Iallgather(d_local_data, msg_size, mpi_type_bfloat16, + d_global_data, msg_size, mpi_type_bfloat16, MPI_COMM_WORLD, &request)); MPICHECK(MPI_Wait(&request, &status)); #elif defined(USE_NCCL) - NCCLCHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_size, ncclHalf, ncclSum, nccl_comm, NULL); + NCCLCHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_size, ncclHalf, nccl_comm, NULL); #elif defined(USE_RCCL) // TODO: fix later rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL); @@ -157,14 +164,14 @@ int main(int argc, char *argv[]) { MPI_Barrier(MPI_COMM_WORLD); start_time = MPI_Wtime(); - for (int i = 0; i < iterations + 5; ++i) { + for (int i = 0; i < iterations; ++i) { #ifdef USE_MPI - MPICHECK(MPI_Iallgather(d_local_data, msg_size, MPI_HALF, - d_global_data, msg_size, MPI_HALF, MPI_COMM_WORLD, &request)); + MPICHECK(MPI_Iallgather(d_local_data, msg_size, mpi_type_bfloat16, + d_global_data, msg_size, mpi_type_bfloat16, MPI_COMM_WORLD, &request)); MPICHECK(MPI_Wait(&request, &status)); #elif defined(USE_NCCL) - NCCLCHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_size, ncclHalf, ncclSum, nccl_comm, NULL); + NCCLCHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_size, ncclHalf, nccl_comm, NULL); #elif defined(USE_RCCL) // TODO: fix later rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL); @@ -172,7 +179,8 @@ int main(int argc, char *argv[]) { } MPI_Barrier(MPI_COMM_WORLD); total_time = MPI_Wtime() - start_time; - printf("%d %.6f seconds\n", msg_size, (total_time/iterations)); + if (my_rank == 0) + printf("%d %.6f seconds\n", msg_size, total_time); } // Cleanup From 29beb46e201fe56c4e33b500542125a7a02fef2c Mon Sep 17 00:00:00 2001 From: Abhinav Bhatele Date: Sun, 24 Mar 2024 19:04:30 -0700 Subject: [PATCH 3/5] fix bug in MPI portion --- allgather.cu | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/allgather.cu b/allgather.cu index ea43599..cb11cc8 100644 --- a/allgather.cu +++ b/allgather.cu @@ -72,6 +72,8 @@ int main(int argc, char *argv[]) { } int my_rank, num_pes; + int num_gpus_per_node; + int msg_count; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); @@ -84,7 +86,6 @@ int main(int argc, char *argv[]) { } // Initialize GPU context - int num_gpus_per_node; cudaGetDeviceCount(&num_gpus_per_node); cudaSetDevice((my_rank % num_gpus_per_node)); @@ -147,11 +148,13 @@ int main(int argc, char *argv[]) { fflush(NULL); for (int msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { + msg_count = msg_size / sizeof(nv_bfloat16); + // warmup iterations for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) { #ifdef USE_MPI - MPICHECK(MPI_Iallgather(d_local_data, msg_size, mpi_type_bfloat16, - d_global_data, msg_size, mpi_type_bfloat16, MPI_COMM_WORLD, &request)); + MPICHECK(MPI_Iallgather(d_local_data, msg_count, mpi_type_bfloat16, + d_global_data, msg_count, mpi_type_bfloat16, MPI_COMM_WORLD, &request)); MPICHECK(MPI_Wait(&request, &status)); #elif defined(USE_NCCL) @@ -166,8 +169,8 @@ int main(int argc, char *argv[]) { start_time = MPI_Wtime(); for (int i = 0; i < iterations; ++i) { #ifdef USE_MPI - MPICHECK(MPI_Iallgather(d_local_data, msg_size, mpi_type_bfloat16, - d_global_data, msg_size, mpi_type_bfloat16, MPI_COMM_WORLD, &request)); + MPICHECK(MPI_Iallgather(d_local_data, msg_count, mpi_type_bfloat16, + d_global_data, msg_count, mpi_type_bfloat16, MPI_COMM_WORLD, &request)); MPICHECK(MPI_Wait(&request, &status)); #elif defined(USE_NCCL) @@ -180,7 +183,7 @@ int main(int argc, char *argv[]) { MPI_Barrier(MPI_COMM_WORLD); total_time = MPI_Wtime() - start_time; if (my_rank == 0) - printf("%d %.6f seconds\n", msg_size, total_time); + printf("%d %.6f seconds\n", msg_size, (total_time / iterations)); } // Cleanup From 621bf3ed67eb7d1a3c871e686bbe5478ebba720a Mon Sep 17 00:00:00 2001 From: Abhinav Bhatele Date: Sun, 24 Mar 2024 19:20:44 -0700 Subject: [PATCH 4/5] fix nccl allgather --- Makefile | 7 +++++-- allgather.cu | 26 +++++++++++++------------- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/Makefile b/Makefile index 5d75e9d..5e0777e 100644 --- a/Makefile +++ b/Makefile @@ -4,12 +4,15 @@ # SPDX-License-Identifier: MIT CC = cc -CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_MPI +INC = -I/global/common/software/nersc9/nccl/2.19.4/include +CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL +LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/plugin/lib + all: allgather.x allgather.x: allgather.cu - ${CC} ${CFLAGS} -o allgather.x allgather.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allgather.x allgather.cu clean: rm -f allgather.x diff --git a/allgather.cu b/allgather.cu index cb11cc8..1d2346a 100644 --- a/allgather.cu +++ b/allgather.cu @@ -22,7 +22,7 @@ #define NUM_WARMUP_ITERATIONS 5 -#define MPICHECK(cmd) do { \ +#define MPI_CHECK(cmd) do { \ int e = cmd; \ if( e != MPI_SUCCESS ) { \ printf("Failed: MPI error %s:%d '%d'\n", \ @@ -40,11 +40,11 @@ } \ } while(0) -#define NCCLCHECK(cmd) do { \ - ncclResult_t r = cmd; \ - if (r!= ncclSuccess) { \ - printf("Failed, NCCL error %s:%d '%s'\n", \ - __FILE__,__LINE__,ncclGetErrorString(r)); \ +#define NCCL_CHECK(cmd) do { \ + ncclResult_t e = cmd; \ + if (e != ncclSuccess) { \ + printf("NCCL error %s:%d %s\n", \ + __FILE__, __LINE__, ncclGetErrorString(e)); \ exit(EXIT_FAILURE); \ } \ } while(0) @@ -126,7 +126,7 @@ int main(int argc, char *argv[]) { 0, MPI_COMM_WORLD)); /* Create a new NCCL communicator */ - NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, rank)); + NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank)); #elif defined(USE_RCCL) // TODO: fix later @@ -153,12 +153,12 @@ int main(int argc, char *argv[]) { // warmup iterations for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) { #ifdef USE_MPI - MPICHECK(MPI_Iallgather(d_local_data, msg_count, mpi_type_bfloat16, + MPI_CHECK(MPI_Iallgather(d_local_data, msg_count, mpi_type_bfloat16, d_global_data, msg_count, mpi_type_bfloat16, MPI_COMM_WORLD, &request)); - MPICHECK(MPI_Wait(&request, &status)); + MPI_CHECK(MPI_Wait(&request, &status)); #elif defined(USE_NCCL) - NCCLCHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_size, ncclHalf, nccl_comm, NULL); + NCCL_CHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, nccl_comm, NULL)); #elif defined(USE_RCCL) // TODO: fix later rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL); @@ -169,12 +169,12 @@ int main(int argc, char *argv[]) { start_time = MPI_Wtime(); for (int i = 0; i < iterations; ++i) { #ifdef USE_MPI - MPICHECK(MPI_Iallgather(d_local_data, msg_count, mpi_type_bfloat16, + MPI_CHECK(MPI_Iallgather(d_local_data, msg_count, mpi_type_bfloat16, d_global_data, msg_count, mpi_type_bfloat16, MPI_COMM_WORLD, &request)); - MPICHECK(MPI_Wait(&request, &status)); + MPI_CHECK(MPI_Wait(&request, &status)); #elif defined(USE_NCCL) - NCCLCHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_size, ncclHalf, nccl_comm, NULL); + NCCL_CHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, nccl_comm, NULL)); #elif defined(USE_RCCL) // TODO: fix later rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL); From 418ddf62ed3de72b354c54f539a6a58b1ad3543e Mon Sep 17 00:00:00 2001 From: Abhinav Bhatele Date: Sun, 24 Mar 2024 19:26:31 -0700 Subject: [PATCH 5/5] fix linking --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 5e0777e..df453b4 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ CC = cc INC = -I/global/common/software/nersc9/nccl/2.19.4/include CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL -LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/plugin/lib +LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl all: allgather.x