NCCL only multi-gpu multi-node training without MPI #426

Open · wants to merge 1 commit into base: master
18 changes: 8 additions & 10 deletions Makefile
@@ -188,23 +188,21 @@ else
endif
endif

# Check if OpenMPI and NCCL are available, include them if so, for multi-GPU training
# Check if NCCL is available for multi-GPU training
ifeq ($(NO_MULTI_GPU), 1)
$(info → Multi-GPU (OpenMPI + NCCL) is manually disabled)
$(info → Multi-GPU is manually disabled)
else
ifneq ($(OS), Windows_NT)
# Detect if running on macOS or Linux
ifeq ($(SHELL_UNAME), Darwin)
$(info ✗ Multi-GPU on CUDA on Darwin is not supported, skipping OpenMPI + NCCL support)
else ifeq ($(shell [ -d /usr/lib/x86_64-linux-gnu/openmpi/lib/ ] && [ -d /usr/lib/x86_64-linux-gnu/openmpi/include/ ] && echo "exists"), exists)
$(info ✓ OpenMPI found, OK to train with multiple GPUs)
NVCC_INCLUDES += -I/usr/lib/x86_64-linux-gnu/openmpi/include
NVCC_LDFLAGS += -L/usr/lib/x86_64-linux-gnu/openmpi/lib/
NVCC_LDLIBS += -lmpi -lnccl
$(info ✗ Multi-GPU on CUDA on Darwin is not supported, skipping NCCL support)
else ifeq ($(shell dpkg -l | grep -q nccl && echo "exists"), exists)
$(info ✓ NCCL found, OK to train with multiple GPUs)
NVCC_LDLIBS += -lnccl
NVCC_FLAGS += -DMULTI_GPU
else
$(info ✗ OpenMPI is not found, disabling multi-GPU support)
$(info ---> On Linux you can try install OpenMPI with `sudo apt install openmpi-bin openmpi-doc libopenmpi-dev`)
$(info ✗ NCCL is not found, disabling multi-GPU support)
$(info ---> On Linux you can try install NCCL with `sudo apt install libnccl2 libnccl-dev`)
endif
endif
endif
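The Makefile now detects NCCL via `dpkg` and, when found, links `-lnccl` and defines `MULTI_GPU`. For context, here is a minimal generic sketch of how such a compile-time flag usually gates NCCL-specific code; it is illustrative only and not this repository's source.

```c
// Illustrative only: the usual pattern for code guarded by -DMULTI_GPU.
#include <stdio.h>
#ifdef MULTI_GPU
#include <nccl.h>   // only needed in builds where the Makefile found NCCL
#endif

int main(void) {
#ifdef MULTI_GPU
    printf("built with NCCL multi-GPU support\n");
#else
    printf("single-GPU build\n");
#endif
    return 0;
}
```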
8 changes: 5 additions & 3 deletions README.md
@@ -128,19 +128,21 @@ sudo apt-get -y install libcudnn9-dev-cuda-12

On top of this you need the [cuDNN frontend](https://github.com/NVIDIA/cudnn-frontend/tree/main), but this is just header files. Simply clone the repo to your disk. The Makefile currently looks for it in either your home directory or the current directory. If you have put it elsewhere, add `CUDNN_FRONTEND_PATH=/path/to/your/cudnn-frontend/include` to the `make` command-line.

**multi-GPU training**. As of April 26, 2024 there is now also support for multi-GPU training using MPI and NCCL. Make sure you install MPI, e.g. on Linux:
**multi-GPU training**. Support for multi-GPU training is available using NCCL. Make sure you download and install [NCCL](https://docs.nvidia.com/deeplearning/nccl/install-guide/index.html), e.g. on Linux:

```bash
sudo apt install openmpi-bin openmpi-doc libopenmpi-dev
sudo apt install libnccl2 libnccl-dev
```

and then:

```bash
make train_gpt2cu
mpirun -np <number of GPUs> ./train_gpt2cu
mpirun -np <number of GPUs> bash -c './train_gpt2cu -pn <number of GPUs> -pr $OMPI_COMM_WORLD_RANK'
```
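For reference, here is a minimal, hypothetical sketch of how a launched binary can pick up the per-rank `-pn`/`-pr` values that the command above supplies; the actual flag handling in `train_gpt2cu` may differ.

```c
// Hypothetical flag parsing; -pr receives $OMPI_COMM_WORLD_RANK from the launch command.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(int argc, char *argv[]) {
    int num_processes = 1;  // -pn: total number of GPUs/processes
    int process_rank  = 0;  // -pr: this process's rank
    for (int i = 1; i + 1 < argc; i += 2) {
        if (strcmp(argv[i], "-pn") == 0) { num_processes = atoi(argv[i + 1]); }
        else if (strcmp(argv[i], "-pr") == 0) { process_rank = atoi(argv[i + 1]); }
    }
    printf("process %d of %d\n", process_rank, num_processes);
    return 0;
}
```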

**multi-node training**. For a SLURM-enabled cluster, use the sample script in [scripts/run_gpt2_124M.sbatch](scripts/run_gpt2_124M.sbatch).

## experiments / sweeps

Just as an example process to sweep learning rates on a machine with 4 GPUs on TinyStories. Run a shell script `sweep.sh` (after you of course `chmod u+x sweep.sh`):
3 changes: 1 addition & 2 deletions dev/cuda/Makefile
@@ -11,7 +11,6 @@ endif
# Compiler flags
CFLAGS = -O3 --use_fast_math
NVCCFLAGS = -lcublas -lcublasLt -std=c++17
MPI_PATHS = -I/usr/lib/x86_64-linux-gnu/openmpi/include -L/usr/lib/x86_64-linux-gnu/openmpi/lib/

# Default rule for our CUDA files
%: %.cu
@@ -52,7 +51,7 @@ global_norm: global_norm.cu

# NCCL communication kernels
nccl_all_reduce: nccl_all_reduce.cu
$(NVCC) -lmpi -lnccl $(NVCCFLAGS) $(MPI_PATHS) nccl_all_reduce.cu -o nccl_all_reduce
$(NVCC) -lnccl $(NVCCFLAGS) nccl_all_reduce.cu -o nccl_all_reduce

# Run all targets
run_all: all
120 changes: 40 additions & 80 deletions dev/cuda/nccl_all_reduce.cu
@@ -5,17 +5,16 @@ Fills a vector with 1s on the first GPU, 2s on the second, etc.
Then aggregates the values in the resulting vectors.

Compile example:
nvcc -lmpi -lnccl -I/usr/lib/x86_64-linux-gnu/openmpi/include -L/usr/lib/x86_64-linux-gnu/openmpi/lib/ -lcublas -lcublasLt nccl_all_reduce.cu -o nccl_all_reduce
nvcc -lnccl -lcublas -lcublasLt nccl_all_reduce.cu -o nccl_all_reduce

Run on 2 local GPUs (set -np to a different value to change GPU count):
mpirun -np 2 ./nccl_all_reduce
mpirun -np 2 bash -c './nccl_all_reduce $OMPI_COMM_WORLD_RANK'

*/

#include "common.h"
#include <assert.h>
#include <cuda_runtime.h>
#include <mpi.h>
#include <nccl.h>
#include <stdint.h>
#include <stdio.h>
@@ -31,99 +30,52 @@ void nccl_check(ncclResult_t status, const char *file, int line) {
}
#define ncclCheck(err) (nccl_check(err, __FILE__, __LINE__))

void mpi_check(int status, const char *file, int line) {
if (status != MPI_SUCCESS) {
char mpi_error[4096];
int mpi_error_len = 0;
assert(MPI_Error_string(status, &mpi_error[0], &mpi_error_len) ==
MPI_SUCCESS);
printf("[MPI ERROR] at file %s:%d:\n%.*s\n", file, line, mpi_error_len,
mpi_error);
exit(EXIT_FAILURE);
}
}
#define mpiCheck(err) (mpi_check(err, __FILE__, __LINE__))

// Sets a vector to a predefined value
__global__ void set_vector(float *data, int N, float value) {
int i = blockIdx.x * blockDim.x + threadIdx.x;

// Check for out-of-bounds access
if (i < N) {
data[i] = value;
}
}

size_t cdiv(size_t a, size_t b) { return (a + b - 1) / b; }

// Parameters specific to training on multiple GPUs.
typedef struct {
int process_rank; // Rank of this process among all MPI processes on all hosts. 0 if no multi-GPU.
int process_rank; // Rank of this process among all processes on all hosts. 0 if no multi-GPU.
int num_processes; // Total number of processes on all hosts. 1 if no multi-GPU.
int local_device_idx; // This process GPU index on current machine. 0 if no multi-GPU.
int device_idx; // This process GPU index on current machine. 0 if no multi-GPU.
ncclComm_t nccl_comm; // NCCL communication primitive, used for collective mutli-GPU work.
} MultiGpuConfig;

// Determine which GPU this process should use.
// Processes on the same machines use different GPU indicies. Processes on other machines don't.
// Copied from NCCL examples: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/examples.html#example-2-one-device-per-process-or-thread
int multi_gpu_get_local_device_idx(int process_rank, int num_processes) {
char hostname[1024];
hostname[1023] = '\0';
// All processes on the same machine will share the same hostname.
gethostname(hostname, 1023);
for (int i=0; i < 1024; i++) {
if (hostname[i] == '.') {
hostname[i] = '\0';
break;
}
}
uint64_t hostname_hash = 5381;
for (int c = 0; hostname[c] != '\0'; c++){ hostname_hash = ((hostname_hash << 5) + hostname_hash) ^ hostname[c]; }

// Distribute all hostname hashes to all processes.
uint64_t* all_hostsname_hashes = (uint64_t*)malloc(num_processes * sizeof(uint64_t));
all_hostsname_hashes[process_rank] = hostname_hash;
mpiCheck(MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, all_hostsname_hashes, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD));

// Identify which GPU we need to use.
int local_device_idx = 0;
for (int current_process = 0; current_process < num_processes; ++current_process) {
if (current_process == process_rank) {
// Found my gpu, local_device_idx now has my target GPU index.
break;
}
if (all_hostsname_hashes[current_process] == all_hostsname_hashes[process_rank]) {
// This process ID runs on the same machine, but it's not me, skip this GPU
local_device_idx++;
}
}

free(all_hostsname_hashes);
return local_device_idx;
}

MultiGpuConfig multi_gpu_config_init(int *argc, char ***argv) {
// Initialize MPI.
MultiGpuConfig multi_gpu_config_init(int num_processes, int process_rank, int gpus_per_node, char *dfs_path) {
MultiGpuConfig result;
mpiCheck(MPI_Init(argc, argv));
mpiCheck(MPI_Comm_rank(MPI_COMM_WORLD, &result.process_rank));
mpiCheck(MPI_Comm_size(MPI_COMM_WORLD, &result.num_processes));
result.local_device_idx = multi_gpu_get_local_device_idx(result.process_rank, result.num_processes);
printf("[Process rank %d] Using GPU %d\n", result.process_rank, result.local_device_idx);
cudaCheck(cudaSetDevice(result.local_device_idx));
ncclUniqueId nccl_id;
if (result.process_rank == 0) {

result.process_rank = process_rank;
result.num_processes = num_processes;
result.device_idx = process_rank % gpus_per_node;

FILE* idFile;
static char filename[256];
snprintf(filename, sizeof(filename), "%s/ncclUniqueId.dat", dfs_path);

if (result.process_rank == 0) { // Generate the NCCL unique ID at rank 0 and write it to a file
ncclCheck(ncclGetUniqueId(&nccl_id));
idFile = fopen(filename, "wb");
assert(idFile != NULL);
fwrite(&nccl_id, sizeof(nccl_id), 1, idFile);
fclose(idFile);
} else { // Other ranks wait until the file is available and read the unique ID
do {
usleep(1000000);
idFile = fopen(filename, "rb");
if (idFile != NULL) break;
} while (idFile == NULL);
fread(&nccl_id, sizeof(nccl_id), 1, idFile);
fclose(idFile);
}
mpiCheck(MPI_Bcast((void *)&nccl_id, sizeof(nccl_id), MPI_BYTE, 0, MPI_COMM_WORLD));

printf("ProcessID:%d, NumProcess::%d, DeviceId:%d\n", result.process_rank, result.num_processes, result.device_idx);
cudaCheck(cudaSetDevice(result.device_idx));
ncclCheck(ncclCommInitRank(&result.nccl_comm, result.num_processes, nccl_id, result.process_rank));
return result;
}

void multi_gpu_config_free(const MultiGpuConfig* multi_gpu_config) {
ncclCommDestroy(multi_gpu_config->nccl_comm);
mpiCheck(MPI_Finalize());
}

float get_mean(float *arr, size_t size, int process_rank) {
@@ -134,12 +86,20 @@ float get_mean(float *arr, size_t size, int process_rank) {
return sum / size;
}

int main(int argc, char **argv) {
// CUDA kernel to set each element of the array to a specific value
__global__ void set_vector(float *array, float value, size_t num_elements) {
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < num_elements) {
array[idx] = value;
}
}

int main(int argc, char *argv[]) {
// Some constants
const size_t all_reduce_buffer_size = 32 * 1024 * 1024;
const size_t threads_per_block = 1024;

MultiGpuConfig multi_gpu_config = multi_gpu_config_init(&argc, &argv);
MultiGpuConfig multi_gpu_config = multi_gpu_config_init(2, atoi(argv[1]), 8, ".");

// Allocating buffers on each of the devices.
float *all_reduce_buffer;
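Putting the pieces together: the diff above removes every MPI call and instead bootstraps NCCL by passing the `ncclUniqueId` through a file that all ranks can read. Below is a condensed, self-contained sketch of that flow. It paraphrases dev/cuda/nccl_all_reduce.cu rather than copying it, and the file name, one-GPU-per-rank device mapping, and plain-shell launch are assumptions called out in the comments, not the PR's exact code.

```c
/*
Sketch of the MPI-free NCCL bootstrap: rank 0 writes the ncclUniqueId to a
file on a path every rank can see (a shared filesystem for multi-node), the
other ranks poll until that file holds a complete id, then everyone calls
ncclCommInitRank and performs an all-reduce. Error handling is minimal.

Compile:  nvcc -lnccl nccl_bootstrap_sketch.cu -o nccl_bootstrap_sketch
Run two ranks on one machine (delete any stale ncclUniqueId.dat first):
          ./nccl_bootstrap_sketch 0 2 & ./nccl_bootstrap_sketch 1 2
*/
#include <assert.h>
#include <cuda_runtime.h>
#include <nccl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char *argv[]) {
    assert(argc == 3);
    int rank = atoi(argv[1]);                   // this process's rank
    int nranks = atoi(argv[2]);                 // total number of ranks
    const char *id_file = "ncclUniqueId.dat";   // must live on a shared path for multi-node

    cudaSetDevice(rank);                        // assumes one GPU per rank on this machine

    ncclUniqueId id;
    if (rank == 0) {
        // Rank 0 generates the unique id and publishes it through the file.
        ncclGetUniqueId(&id);
        FILE *f = fopen(id_file, "wb");
        assert(f != NULL);
        fwrite(&id, sizeof(id), 1, f);
        fclose(f);
    } else {
        // Other ranks poll until the file exists and a full id can be read.
        size_t got = 0;
        while (got != 1) {
            FILE *f = fopen(id_file, "rb");
            if (f != NULL) {
                got = fread(&id, sizeof(id), 1, f);
                fclose(f);
            }
            if (got != 1) usleep(100000);
        }
    }

    ncclComm_t comm;
    ncclCommInitRank(&comm, nranks, id, rank);

    // Each rank contributes (rank + 1); after the sum all-reduce every GPU
    // holds 1 + 2 + ... + nranks.
    float *buf;
    cudaMalloc(&buf, sizeof(float));
    float val = (float)(rank + 1);
    cudaMemcpy(buf, &val, sizeof(float), cudaMemcpyHostToDevice);
    ncclAllReduce(buf, buf, 1, ncclFloat, ncclSum, comm, 0);
    cudaMemcpy(&val, buf, sizeof(float), cudaMemcpyDeviceToHost);
    printf("rank %d: all-reduced value = %f\n", rank, val);

    cudaFree(buf);
    ncclCommDestroy(comm);
    return 0;
}
```

Compared to an MPI_Bcast of the id, the file hand-off trades a small startup delay (the polling loop) for having no launcher-side dependency beyond a path every node can read, which is what enables the SLURM multi-node setup referenced in the README change.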