Updated torchrun instructions (#2096)
* Updated torchrun instructions

* Update examples/README.md

Co-authored-by: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>

* Update examples/README.md

Co-authored-by: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>

* Update examples/README.md

Co-authored-by: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>

* Update examples/README.md

Co-authored-by: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>

* Update README.md for torchrun instructions

* Added SLURM scripts and updated README

* Update examples/Slurm/submit-multinode.sh

Co-authored-by: Zach Mueller <muellerzr@gmail.com>

* Update examples/Slurm/submit-multiGPU.sh

Co-authored-by: Zach Mueller <muellerzr@gmail.com>

* Update examples/README.md

Co-authored-by: Zach Mueller <muellerzr@gmail.com>

* Update examples/README.md

Co-authored-by: Zach Mueller <muellerzr@gmail.com>

* final details

* modified argument parser

* modified slurm multigpu script

* modified multinode slurm script

* Added accelerate multiline issue

* Update examples/README.md

Co-authored-by: Zach Mueller <muellerzr@gmail.com>

* fixed readme command

* added --main_process_port specification to readme

* Revert "modified argument parser"

This reverts commit c3bef5c.

---------

Co-authored-by: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>
Co-authored-by: Zach Mueller <muellerzr@gmail.com>
3 people committed Nov 20, 2023
1 parent 35b0206 commit 427ef8b
Showing 3 changed files with 95 additions and 26 deletions.
53 changes: 27 additions & 26 deletions examples/README.md
@@ -64,28 +64,25 @@ To run it in each of these various modes, use the following commands:
 accelerate config # This will create a config file on your server
 accelerate launch ./nlp_example.py # This will run the script on your server
 ```
-* With traditional PyTorch launcher (`torch.distributed.launch` can be used with older versions of PyTorch)
+* With traditional PyTorch launcher (`python -m torch.distributed.run` can be used instead of `torchrun`)
 ```bash
-python -m torchrun --nproc_per_node 2 --use_env ./nlp_example.py
+torchrun --nproc_per_node 2 ./nlp_example.py
 ```
 - multi GPUs, multi node (several machines, using PyTorch distributed mode)
 * With Accelerate config and launcher, on each machine:
 ```bash
 accelerate config # This will create a config file on each server
 accelerate launch ./nlp_example.py # This will run the script on each server
 ```
-* With PyTorch launcher only (`torch.distributed.launch` can be used in older versions of PyTorch)
+* With PyTorch launcher only (`python -m torch.distributed.run` can be used instead of `torchrun`). Run this command on each node:
 ```bash
-python -m torchrun --nproc_per_node 2 \
-    --use_env \
-    --node_rank 0 \
-    --master_addr master_node_ip_address \
-    ./nlp_example.py # On the first server
-python -m torchrun --nproc_per_node 2 \
-    --use_env \
-    --node_rank 1 \
-    --master_addr master_node_ip_address \
-    ./nlp_example.py # On the second server
+# `torchrun` is equivalent to `python -m torch.distributed.run`;
+# `--rdzv_id` can be any unique job id shared by all the nodes
+torchrun \
+    --nproc_per_node 2 \
+    --nnodes 2 \
+    --rdzv_id 2299 \
+    --rdzv_backend c10d \
+    --rdzv_endpoint master_node_ip_address:29500 \
+    ./nlp_example.py
 ```
 - (multi) TPUs
 * With Accelerate config and launcher
@@ -152,28 +149,25 @@ To run it in each of these various modes, use the following commands:
 accelerate config --config_file config.yaml # This will create a config file on your server to `config.yaml`
 accelerate launch --config_file config.yaml ./cv_example.py --data_dir path_to_data # This will run the script on your server
 ```
-* With traditional PyTorch launcher (`torch.distributed.launch` can be used with older versions of PyTorch)
+* With traditional PyTorch launcher (`python -m torch.distributed.run` can be used instead of `torchrun`)
 ```bash
-python -m torchrun --nproc_per_node 2 --use_env ./cv_example.py --data_dir path_to_data
+torchrun --nproc_per_node 2 ./cv_example.py --data_dir path_to_data
 ```
 - multi GPUs, multi node (several machines, using PyTorch distributed mode)
 * With Accelerate config and launcher, on each machine:
 ```bash
 accelerate config --config_file config.yaml # This will create a config file on your server to `config.yaml`
 accelerate launch --config_file config.yaml ./cv_example.py --data_dir path_to_data # This will run the script on each server
 ```
-* With PyTorch launcher only (`torch.distributed.launch` can be used with older versions of PyTorch)
+* With PyTorch launcher only (`python -m torch.distributed.run` can be used instead of `torchrun`). Run this command on each node:
 ```bash
-python -m torchrun --nproc_per_node 2 \
-    --use_env \
-    --node_rank 0 \
-    --master_addr master_node_ip_address \
-    ./cv_example.py --data_dir path_to_data # On the first server
-python -m torchrun --nproc_per_node 2 \
-    --use_env \
-    --node_rank 1 \
-    --master_addr master_node_ip_address \
-    ./cv_example.py --data_dir path_to_data # On the second server
+# `torchrun` is equivalent to `python -m torch.distributed.run`;
+# `--rdzv_id` can be any unique job id shared by all the nodes
+torchrun \
+    --nproc_per_node 2 \
+    --nnodes 2 \
+    --rdzv_id 2299 \
+    --rdzv_backend c10d \
+    --rdzv_endpoint master_node_ip_address:29500 \
+    ./cv_example.py --data_dir path_to_data
 ```
 - (multi) TPUs
 * With Accelerate config and launcher
@@ -206,6 +200,13 @@ with `pip install runhouse`, and you can refer to
 for hardware setup instructions, or this
 [Colab tutorial](https://colab.research.google.com/drive/1qVwYyLTCPYPSdz9ZX7BZl9Qm0A3j7RJe) for a more in-depth walkthrough.

+## SLURM Scripts
+In [/slurm/submit_multigpu.sh](./slurm/submit_multigpu.sh) and [/slurm/submit_multinode.sh](./slurm/submit_multinode.sh) we present two scripts for running the examples on a machine with the [SLURM](https://slurm.schedmd.com/documentation.html) workload manager.
+
+In [/slurm/submit_multigpu.sh](./slurm/submit_multigpu.sh) the only parameter in the launcher that needs to be modified is `--num_processes`, which determines the number of GPUs we will use. Here, the environment variable `$GPUS_PER_NODE` (set to match the `--gres=gpu:4` request) indicates that we want to utilize all the GPUs available on the node we have requested.
+
+In [/slurm/submit_multinode.sh](./slurm/submit_multinode.sh) we must specify the number of nodes that will take part in the training (`--num_machines`), how many GPUs we will use in total (`--num_processes`), the rendezvous [`backend`](https://pytorch.org/docs/stable/elastic/run.html#note-on-rendezvous-backend), the address of the master node (`--main_process_ip`), and its port (`--main_process_port`).
+
 ## Finer Examples

 While the first two scripts are extremely barebones when it comes to what you can do with accelerate, more advanced features are documented in two other locations.
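For reference, the multi-node `torchrun` command added above also has an Accelerate-native equivalent that needs no config file. A minimal sketch, assuming two nodes with two GPUs each; the IP address is a placeholder and `--machine_rank` must be set per node:

```bash
# Run the same command on every node, changing only --machine_rank
# (0 on the master node, 1 on the other); 192.168.1.10 stands in for
# the master node's real IP address.
accelerate launch \
    --multi_gpu \
    --num_machines 2 \
    --num_processes 4 \
    --machine_rank 0 \
    --main_process_ip 192.168.1.10 \
    --main_process_port 29500 \
    ./nlp_example.py
```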
27 changes: 27 additions & 0 deletions examples/slurm/submit_multigpu.sh
@@ -0,0 +1,27 @@
#!/bin/bash

#SBATCH --job-name=multigpu
#SBATCH -D .
#SBATCH --output=O-%x.%j
#SBATCH --error=E-%x.%j
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1 # number of MP tasks
#SBATCH --gres=gpu:4 # number of GPUs per node
#SBATCH --cpus-per-task=160 # number of cores per task
#SBATCH --time=01:59:00 # maximum execution time (HH:MM:SS)

#######################
### Set environment ###
#######################
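# activateEnviroment.sh is expected to prepare the Python environment
# (e.g. load modules or activate a conda/virtual env); it is user-provided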
source activateEnviroment.sh
export GPUS_PER_NODE=4
######################

export SCRIPT=/accelerate/examples/complete_nlp_example.py
export SCRIPT_ARGS=" \
--mixed_precision fp16 \
--output_dir /accelerate/examples/output \
--with_tracking \
"

accelerate launch --num_processes $GPUS_PER_NODE $SCRIPT $SCRIPT_ARGS
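As a usage sketch for the script above (assuming it is submitted from the `examples` directory and that `activateEnviroment.sh` exists):

```bash
# Submit the single-node, multi-GPU job to the scheduler
sbatch slurm/submit_multigpu.sh
# Check its state; stdout and stderr land in O-multigpu.<jobid> and
# E-multigpu.<jobid>, per the --output/--error patterns above
squeue -u $USER
```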
41 changes: 41 additions & 0 deletions examples/slurm/submit_multinode.sh
@@ -0,0 +1,41 @@
#!/bin/bash

#SBATCH --job-name=multinode
#SBATCH -D .
#SBATCH --output=O-%x.%j
#SBATCH --error=E-%x.%j
#SBATCH --nodes=4 # number of nodes
#SBATCH --ntasks-per-node=1 # number of MP tasks
#SBATCH --gres=gpu:4 # number of GPUs per node
#SBATCH --cpus-per-task=160 # number of cores per task
#SBATCH --time=01:59:00 # maximum execution time (HH:MM:SS)

#######################
### Set environment ###
#######################
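# activateEnviroment.sh is expected to prepare the Python environment
# (e.g. load modules or activate a conda/virtual env); it is user-provided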
source activateEnviroment.sh
export GPUS_PER_NODE=4
######################

######################
#### Set network #####
######################
head_node_ip=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
######################

export LAUNCHER="accelerate launch \
--num_processes $((SLURM_NNODES * GPUS_PER_NODE)) \
--num_machines $SLURM_NNODES \
--rdzv_backend c10d \
--main_process_ip $head_node_ip \
--main_process_port 29500 \
"
export SCRIPT="/accelerate/examples/complete_nlp_example.py"
export SCRIPT_ARGS=" \
--mixed_precision fp16 \
--output_dir /accelerate/examples/output \
"

# This step is necessary because accelerate launch does not handle multiline arguments properly
export CMD="$LAUNCHER $SCRIPT $SCRIPT_ARGS"
srun $CMD
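
To make the expansion concrete: with the defaults above (4 nodes × 4 GPUs per node) and a hypothetical head-node address of 10.0.0.1, the command `srun` executes on each node resolves to roughly:

```bash
accelerate launch \
    --num_processes 16 \
    --num_machines 4 \
    --rdzv_backend c10d \
    --main_process_ip 10.0.0.1 \
    --main_process_port 29500 \
    /accelerate/examples/complete_nlp_example.py \
    --mixed_precision fp16 \
    --output_dir /accelerate/examples/output
```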
