From 90c4c8ea10d3e110115906146f051fe51784560c Mon Sep 17 00:00:00 2001
From: Abhinav Bhatele <bhatele@llnl.gov>
Date: Sun, 24 Mar 2024 14:30:01 -0700
Subject: [PATCH 1/5] WIP: allgather benchmark

---
 allgather.cu | 193 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 193 insertions(+)
 create mode 100644 allgather.cu
diff --git a/allgather.cu b/allgather.cu
new file mode 100644
index 0000000..41ba72b
--- /dev/null
+++ b/allgather.cu
@@ -0,0 +1,193 @@
+/* \file allgather.c
+ * Copyright 2024 Parallel Software and Systems Group, University of Maryland.
+ * See the top-level LICENSE file for details.
+ * 
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <mpi.h>
+
+#ifdef USE_CUDA
+  #include <cuda_runtime.h>
+  #include <cuda_fp16.h>
+#endif
+
+#ifdef USE_NCCL
+  #include "nccl.h"
+#elif defined(USE_RCCL)
+  #include "rccl.h"
+#endif
+
+#define NUM_GPU_DEVICES_PER_NODE	4
+#define NUM_WARMUP_ITERATIONS		5
+
+#define MPICHECK(cmd) do {                          \
+  int e = cmd;                                      \
+  if( e != MPI_SUCCESS ) {                          \
+    printf("Failed: MPI error %s:%d '%d'\n",        \
+        __FILE__,__LINE__, e);                      \
+    exit(EXIT_FAILURE);                             \
+  }                                                 \
+} while(0)
+
+#define CUDA_CHECK(cmd) do {                        \
+  cudaError_t e = cmd;                              \
+  if(e != cudaSuccess) {                            \
+    printf("CUDA error  %s:%d: %s\n",               \
+        __FILE__, __LINE__, cudaGetErrorString(e)); \
+    exit(EXIT_FAILURE);                             \
+  }                                                 \
+} while(0)
+
+#define NCCLCHECK(cmd) do {                         \
+  ncclResult_t r = cmd;                             \
+  if (r!= ncclSuccess) {                            \
+    printf("Failed, NCCL error %s:%d '%s'\n",       \
+        __FILE__,__LINE__,ncclGetErrorString(r));   \
+    exit(EXIT_FAILURE);                             \
+  }                                                 \
+} while(0)
+
+void initializeData(half *data, int size) {
+    for (int i = 0; i < (size / sizeof(half)); ++i) {
+        data[i] = __float2half((float)i);
+    }
+}
+
+int main(int argc, char *argv[]) {
+    if (argc != 6) {
+        fprintf(stderr, "Usage: %s <num_gpus> <min_msg_size> <max_msg_size> <iterations>\n", argv[0]);
+        return EXIT_FAILURE;
+    }
+
+    int num_gpus = atoi(argv[1]);
+    int min_msg_size = atoi(argv[2]);
+    int max_msg_size = atoi(argv[3]);
+    int iterations = atoi(argv[4]);
+
+    if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) {
+        fprintf(stderr, "Invalid input parameters.\n");
+        return EXIT_FAILURE;
+    }
+
+    int my_rank, num_pes;
+
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &num_pes);
+
+    if (num_pes != num_gpus) {
+        fprintf(stderr, "Number of processes must match number of GPUs.\n");
+        MPI_Finalize();
+        return EXIT_FAILURE;
+    }
+
+    // Initialize GPU context
+    cudaSetDevice((my_rank % NUM_GPU_DEVICES_PER_NODE));
+
+    int local_data_size = max_msg_size; // Size of local data to be reduced
+    int global_data_size = local_data_size * num_gpus; // Size of global data
+
+    half *local_data = (half*)malloc(local_data_size);
+    half *global_data = (half*)malloc(global_data_size);
+
+    // Initialize local data
+    initializeData(local_data, local_data_size);
+
+    // Allocate memory on GPU
+    half *d_local_data, *d_global_data;
+    CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size));
+    CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size));
+
+    // Copy local data to GPU
+    CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice));
+
+    #ifdef USE_NCCL
+    ncclUniqueId nccl_comm_id;
+    ncclComm_t nccl_comm;
+
+    if (my_rank == 0) {
+        /* Generates an Id to be used in ncclCommInitRank. */
+        ncclGetUniqueId(&nccl_comm_id);
+    }
+
+    /* distribute nccl_comm_id to all ranks */
+    MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE,
+                        0, MPI_COMM_WORLD));
+
+    /* Create a new NCCL communicator */
+    NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, rank));
+    #elif defined(USE_RCCL)
+    // TODO: fix later
+    rcclComm_t rccl_comm;
+    rcclCommInitRank(&comm, num_gpus, 0, rccl_root);
+    #endif
+
+    // Perform MPI_Iallgather, NCCL allgather, or RCCL allgather
+    double total_time, start_time;
+    MPI_Request request;
+    MPI_Status status;
+
+    // Print benchmark results
+    if (my_rank == 0) {
+        printf("Number of GPUs: %d\n", num_gpus);
+        printf("Message size range: %d - %d\n", min_msg_size, max_msg_size);
+        printf("Number of iterations: %d\n", iterations);
+    }
+
+    for (int msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) {
+        total_time = 0.0;
+
+	// warmup iterations
+	for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) {
+            #ifdef USE_MPI
+	    MPICHECK(MPI_Iallgather(d_local_data, msg_size, MPI_HALF,
+		d_global_data, msg_size, MPI_HALF, MPI_COMM_WORLD, &request));
+                
+            MPICHECK(MPI_Wait(&request, &status));
+            #elif defined(USE_NCCL)
+            NCCLCHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_size, ncclHalf, ncclSum, nccl_comm, NULL);
+            #elif defined(USE_RCCL)
+	    // TODO: fix later
+            rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL);
+            #endif
+        }
+
+        MPI_Barrier(MPI_COMM_WORLD);
+        start_time = MPI_Wtime();
+	for (int i = 0; i < iterations + 5; ++i) {
+            #ifdef USE_MPI
+            MPICHECK(MPI_Iallgather(d_local_data, msg_size, MPI_HALF,
+                d_global_data, msg_size, MPI_HALF, MPI_COMM_WORLD, &request));
+
+            MPICHECK(MPI_Wait(&request, &status));
+            #elif defined(USE_NCCL)
+            NCCLCHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_size, ncclHalf, ncclSum, nccl_comm, NULL);
+            #elif defined(USE_RCCL)
+            // TODO: fix later
+            rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL);
+            #endif
+        }
+        MPI_Barrier(MPI_COMM_WORLD);
+        total_time = MPI_Wtime() - start_time;
+        printf("%d %.6f seconds\n", msg_size, (total_time/iterations));
+    }
+
+    // Cleanup
+    free(local_data);
+    free(global_data);
+    CUDA_CHECK(cudaFree(d_local_data));
+    CUDA_CHECK(cudaFree(d_global_data));
+
+    #ifdef USE_NCCL
+    ncclCommDestroy(nccl_comm);
+    #elif defined(USE_RCCL)
+    rcclCommDestroy(rccl_comm);
+    #endif
+
+    MPI_Finalize();
+    return EXIT_SUCCESS;
+}
+

From 12d0c3d222137caa31c75b13312b2f2320fbdfaa Mon Sep 17 00:00:00 2001
From: Abhinav Bhatele <bhatele@llnl.gov>
Date: Sun, 24 Mar 2024 18:38:25 -0700
Subject: [PATCH 2/5] change to bf16

---
 Makefile     | 15 +++++++++++++++
 README       |  9 +++++++++
 allgather.cu | 50 +++++++++++++++++++++++++++++---------------------
 3 files changed, 53 insertions(+), 21 deletions(-)
 create mode 100644 Makefile
 create mode 100644 README

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..5d75e9d
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,15 @@
+# Copyright 2024 Parallel Software and Systems Group, University of Maryland.
+# See the top-level LICENSE file for details.
+# 
+# SPDX-License-Identifier: MIT
+
+CC	= cc
+CFLAGS	= -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_MPI
+
+all: allgather.x
+
+allgather.x: allgather.cu 
+	${CC} ${CFLAGS} -o allgather.x allgather.cu 
+
+clean: 
+	rm -f allgather.x 
diff --git a/README b/README
new file mode 100644
index 0000000..eba2046
--- /dev/null
+++ b/README
@@ -0,0 +1,9 @@
+Before compiling do these:
+
+module load PrgEnv-cray cudatoolkit craype-accel-nvidia80
+export CRAY_ACCEL_TARGET=nvidia80
+
+When running do these:
+
+module load cudatoolkit
+export MPICH_GPU_SUPPORT_ENABLED=1
diff --git a/allgather.cu b/allgather.cu
index 41ba72b..ea43599 100644
--- a/allgather.cu
+++ b/allgather.cu
@@ -11,7 +11,7 @@
 
 #ifdef USE_CUDA
   #include <cuda_runtime.h>
-  #include <cuda_fp16.h>
+  #include <cuda_bf16.h>
 #endif
 
 #ifdef USE_NCCL
@@ -20,7 +20,6 @@
   #include "rccl.h"
 #endif
 
-#define NUM_GPU_DEVICES_PER_NODE	4
 #define NUM_WARMUP_ITERATIONS		5
 
 #define MPICHECK(cmd) do {                          \
@@ -50,14 +49,14 @@
   }                                                 \
 } while(0)
 
-void initializeData(half *data, int size) {
-    for (int i = 0; i < (size / sizeof(half)); ++i) {
-        data[i] = __float2half((float)i);
+void initializeData(nv_bfloat16 *data, int size) {
+    for (int i = 0; i < (size / sizeof(nv_bfloat16)); ++i) {
+        data[i] = __float2bfloat16((float)i);
     }
 }
 
 int main(int argc, char *argv[]) {
-    if (argc != 6) {
+    if (argc != 5) {
         fprintf(stderr, "Usage: %s <num_gpus> <min_msg_size> <max_msg_size> <iterations>\n", argv[0]);
         return EXIT_FAILURE;
     }
@@ -85,26 +84,34 @@ int main(int argc, char *argv[]) {
     }
 
     // Initialize GPU context
-    cudaSetDevice((my_rank % NUM_GPU_DEVICES_PER_NODE));
+    int num_gpus_per_node;
+    cudaGetDeviceCount(&num_gpus_per_node);
+    cudaSetDevice((my_rank % num_gpus_per_node));
 
     int local_data_size = max_msg_size; // Size of local data to be reduced
     int global_data_size = local_data_size * num_gpus; // Size of global data
 
-    half *local_data = (half*)malloc(local_data_size);
-    half *global_data = (half*)malloc(global_data_size);
+    nv_bfloat16 *local_data = (nv_bfloat16*)malloc(local_data_size);
+    nv_bfloat16 *global_data = (nv_bfloat16*)malloc(global_data_size);
 
     // Initialize local data
     initializeData(local_data, local_data_size);
 
     // Allocate memory on GPU
-    half *d_local_data, *d_global_data;
+    nv_bfloat16 *d_local_data, *d_global_data;
     CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size));
     CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size));
 
     // Copy local data to GPU
     CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice));
 
-    #ifdef USE_NCCL
+    #ifdef USE_MPI
+    // create 2-byte datatype (send raw, un-interpreted bytes)
+    MPI_Datatype mpi_type_bfloat16;
+    MPI_Type_contiguous(2, MPI_BYTE, &mpi_type_bfloat16);
+    MPI_Type_commit(&mpi_type_bfloat16);
+
+    #elif USE_NCCL
     ncclUniqueId nccl_comm_id;
     ncclComm_t nccl_comm;
 
@@ -119,6 +126,7 @@ int main(int argc, char *argv[]) {
 
     /* Create a new NCCL communicator */
     NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, rank));
+
     #elif defined(USE_RCCL)
     // TODO: fix later
     rcclComm_t rccl_comm;
@@ -136,19 +144,18 @@ int main(int argc, char *argv[]) {
         printf("Message size range: %d - %d\n", min_msg_size, max_msg_size);
         printf("Number of iterations: %d\n", iterations);
     }
+    fflush(NULL);
 
     for (int msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) {
-        total_time = 0.0;
-
 	// warmup iterations
 	for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) {
             #ifdef USE_MPI
-	    MPICHECK(MPI_Iallgather(d_local_data, msg_size, MPI_HALF,
-		d_global_data, msg_size, MPI_HALF, MPI_COMM_WORLD, &request));
+	    MPICHECK(MPI_Iallgather(d_local_data, msg_size, mpi_type_bfloat16,
+		d_global_data, msg_size, mpi_type_bfloat16, MPI_COMM_WORLD, &request));
                 
             MPICHECK(MPI_Wait(&request, &status));
             #elif defined(USE_NCCL)
-            NCCLCHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_size, ncclHalf, ncclSum, nccl_comm, NULL);
+            NCCLCHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_size, ncclHalf, nccl_comm, NULL);
             #elif defined(USE_RCCL)
 	    // TODO: fix later
             rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL);
@@ -157,14 +164,14 @@ int main(int argc, char *argv[]) {
 
         MPI_Barrier(MPI_COMM_WORLD);
         start_time = MPI_Wtime();
-	for (int i = 0; i < iterations + 5; ++i) {
+	for (int i = 0; i < iterations; ++i) {
             #ifdef USE_MPI
-            MPICHECK(MPI_Iallgather(d_local_data, msg_size, MPI_HALF,
-                d_global_data, msg_size, MPI_HALF, MPI_COMM_WORLD, &request));
+            MPICHECK(MPI_Iallgather(d_local_data, msg_size, mpi_type_bfloat16,
+                d_global_data, msg_size, mpi_type_bfloat16, MPI_COMM_WORLD, &request));
 
             MPICHECK(MPI_Wait(&request, &status));
             #elif defined(USE_NCCL)
-            NCCLCHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_size, ncclHalf, ncclSum, nccl_comm, NULL);
+            NCCLCHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_size, ncclHalf, nccl_comm, NULL);
             #elif defined(USE_RCCL)
             // TODO: fix later
             rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL);
@@ -172,7 +179,8 @@ int main(int argc, char *argv[]) {
         }
         MPI_Barrier(MPI_COMM_WORLD);
         total_time = MPI_Wtime() - start_time;
-        printf("%d %.6f seconds\n", msg_size, (total_time/iterations));
+	if (my_rank == 0)
+	    printf("%d %.6f seconds\n", msg_size, total_time);
     }
 
     // Cleanup

From 29beb46e201fe56c4e33b500542125a7a02fef2c Mon Sep 17 00:00:00 2001
From: Abhinav Bhatele <bhatele@llnl.gov>
Date: Sun, 24 Mar 2024 19:04:30 -0700
Subject: [PATCH 3/5] fix bug in MPI portion

---
 allgather.cu | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/allgather.cu b/allgather.cu
index ea43599..cb11cc8 100644
--- a/allgather.cu
+++ b/allgather.cu
@@ -72,6 +72,8 @@ int main(int argc, char *argv[]) {
     }
 
     int my_rank, num_pes;
+    int num_gpus_per_node;
+    int msg_count;
 
     MPI_Init(&argc, &argv);
     MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
@@ -84,7 +86,6 @@ int main(int argc, char *argv[]) {
     }
 
     // Initialize GPU context
-    int num_gpus_per_node;
     cudaGetDeviceCount(&num_gpus_per_node);
     cudaSetDevice((my_rank % num_gpus_per_node));
 
@@ -147,11 +148,13 @@ int main(int argc, char *argv[]) {
     fflush(NULL);
 
     for (int msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) {
+	msg_count = msg_size / sizeof(nv_bfloat16);
+
 	// warmup iterations
 	for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) {
             #ifdef USE_MPI
-	    MPICHECK(MPI_Iallgather(d_local_data, msg_size, mpi_type_bfloat16,
-		d_global_data, msg_size, mpi_type_bfloat16, MPI_COMM_WORLD, &request));
+	    MPICHECK(MPI_Iallgather(d_local_data, msg_count, mpi_type_bfloat16,
+		d_global_data, msg_count, mpi_type_bfloat16, MPI_COMM_WORLD, &request));
                 
             MPICHECK(MPI_Wait(&request, &status));
             #elif defined(USE_NCCL)
@@ -166,8 +169,8 @@ int main(int argc, char *argv[]) {
         start_time = MPI_Wtime();
 	for (int i = 0; i < iterations; ++i) {
             #ifdef USE_MPI
-            MPICHECK(MPI_Iallgather(d_local_data, msg_size, mpi_type_bfloat16,
-                d_global_data, msg_size, mpi_type_bfloat16, MPI_COMM_WORLD, &request));
+            MPICHECK(MPI_Iallgather(d_local_data, msg_count, mpi_type_bfloat16,
+                d_global_data, msg_count, mpi_type_bfloat16, MPI_COMM_WORLD, &request));
 
             MPICHECK(MPI_Wait(&request, &status));
             #elif defined(USE_NCCL)
@@ -180,7 +183,7 @@ int main(int argc, char *argv[]) {
         MPI_Barrier(MPI_COMM_WORLD);
         total_time = MPI_Wtime() - start_time;
 	if (my_rank == 0)
-	    printf("%d %.6f seconds\n", msg_size, total_time);
+	    printf("%d %.6f seconds\n", msg_size, (total_time / iterations));
     }
 
     // Cleanup

From 621bf3ed67eb7d1a3c871e686bbe5478ebba720a Mon Sep 17 00:00:00 2001
From: Abhinav Bhatele <bhatele@llnl.gov>
Date: Sun, 24 Mar 2024 19:20:44 -0700
Subject: [PATCH 4/5] fix nccl allgather

---
 Makefile     |  7 +++++--
 allgather.cu | 26 +++++++++++++-------------
 2 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/Makefile b/Makefile
index 5d75e9d..5e0777e 100644
--- a/Makefile
+++ b/Makefile
@@ -4,12 +4,15 @@
 # SPDX-License-Identifier: MIT
 
 CC	= cc
-CFLAGS	= -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_MPI
+INC	= -I/global/common/software/nersc9/nccl/2.19.4/include
+CFLAGS	= -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL
+LDFLAGS	= -L/global/common/software/nersc9/nccl/2.19.4/plugin/lib
+
 
 all: allgather.x
 
 allgather.x: allgather.cu 
-	${CC} ${CFLAGS} -o allgather.x allgather.cu 
+	${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allgather.x allgather.cu
 
 clean: 
 	rm -f allgather.x 
diff --git a/allgather.cu b/allgather.cu
index cb11cc8..1d2346a 100644
--- a/allgather.cu
+++ b/allgather.cu
@@ -22,7 +22,7 @@
 
 #define NUM_WARMUP_ITERATIONS		5
 
-#define MPICHECK(cmd) do {                          \
+#define MPI_CHECK(cmd) do {                          \
   int e = cmd;                                      \
   if( e != MPI_SUCCESS ) {                          \
     printf("Failed: MPI error %s:%d '%d'\n",        \
@@ -40,11 +40,11 @@
   }                                                 \
 } while(0)
 
-#define NCCLCHECK(cmd) do {                         \
-  ncclResult_t r = cmd;                             \
-  if (r!= ncclSuccess) {                            \
-    printf("Failed, NCCL error %s:%d '%s'\n",       \
-        __FILE__,__LINE__,ncclGetErrorString(r));   \
+#define NCCL_CHECK(cmd) do {                         \
+  ncclResult_t e = cmd;                             \
+  if (e != ncclSuccess) {                           \
+    printf("NCCL error %s:%d %s\n",                 \
+        __FILE__, __LINE__, ncclGetErrorString(e)); \
     exit(EXIT_FAILURE);                             \
   }                                                 \
 } while(0)
@@ -126,7 +126,7 @@ int main(int argc, char *argv[]) {
                         0, MPI_COMM_WORLD));
 
     /* Create a new NCCL communicator */
-    NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, rank));
+    NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank));
 
     #elif defined(USE_RCCL)
     // TODO: fix later
@@ -153,12 +153,12 @@ int main(int argc, char *argv[]) {
 	// warmup iterations
 	for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) {
             #ifdef USE_MPI
-	    MPICHECK(MPI_Iallgather(d_local_data, msg_count, mpi_type_bfloat16,
+	    MPI_CHECK(MPI_Iallgather(d_local_data, msg_count, mpi_type_bfloat16,
 		d_global_data, msg_count, mpi_type_bfloat16, MPI_COMM_WORLD, &request));
                 
-            MPICHECK(MPI_Wait(&request, &status));
+            MPI_CHECK(MPI_Wait(&request, &status));
             #elif defined(USE_NCCL)
-            NCCLCHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_size, ncclHalf, nccl_comm, NULL);
+            NCCL_CHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, nccl_comm, NULL));
             #elif defined(USE_RCCL)
 	    // TODO: fix later
             rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL);
@@ -169,12 +169,12 @@ int main(int argc, char *argv[]) {
         start_time = MPI_Wtime();
 	for (int i = 0; i < iterations; ++i) {
             #ifdef USE_MPI
-            MPICHECK(MPI_Iallgather(d_local_data, msg_count, mpi_type_bfloat16,
+            MPI_CHECK(MPI_Iallgather(d_local_data, msg_count, mpi_type_bfloat16,
                 d_global_data, msg_count, mpi_type_bfloat16, MPI_COMM_WORLD, &request));
 
-            MPICHECK(MPI_Wait(&request, &status));
+            MPI_CHECK(MPI_Wait(&request, &status));
             #elif defined(USE_NCCL)
-            NCCLCHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_size, ncclHalf, nccl_comm, NULL);
+            NCCL_CHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, nccl_comm, NULL));
             #elif defined(USE_RCCL)
             // TODO: fix later
             rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL);

From 418ddf62ed3de72b354c54f539a6a58b1ad3543e Mon Sep 17 00:00:00 2001
From: Abhinav Bhatele <bhatele@llnl.gov>
Date: Sun, 24 Mar 2024 19:26:31 -0700
Subject: [PATCH 5/5] fix linking

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 5e0777e..df453b4 100644
--- a/Makefile
+++ b/Makefile
@@ -6,7 +6,7 @@
 CC	= cc
 INC	= -I/global/common/software/nersc9/nccl/2.19.4/include
 CFLAGS	= -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL
-LDFLAGS	= -L/global/common/software/nersc9/nccl/2.19.4/plugin/lib
+LDFLAGS	= -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl
 
 
 all: allgather.x