[deepspeed] test on one node 2 gpus max (#11237)
* test on one node 2 gpus max

* fix the other place

* refactor

* fix

* cleanup

* more exact version
stas00 committed Apr 14, 2021
1 parent 25e1af3 commit 83206ca
Showing 2 changed files with 11 additions and 6 deletions.
2 changes: 1 addition & 1 deletion setup.py
@@ -90,7 +90,7 @@
     "cookiecutter==1.7.2",
     "dataclasses",
     "datasets",
-    "deepspeed>0.3.13",
+    "deepspeed>=0.3.14",
     "docutils==0.16.0",
     "fairscale>0.3",
     "faiss-cpu",
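The pin moves from the exclusive bound deepspeed>0.3.13 to the inclusive deepspeed>=0.3.14, so the tests below can rely on 0.3.14 behavior. As a minimal sketch (assuming deepspeed is installed and Python 3.8+ for importlib.metadata), the pin can be checked at runtime like this:

    from importlib.metadata import version
    from packaging.version import Version

    # fail fast if the installed deepspeed predates the pinned minimum
    assert Version(version("deepspeed")) >= Version("0.3.14"), "need deepspeed>=0.3.14"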
15 changes: 10 additions & 5 deletions tests/deepspeed/test_deepspeed.py
@@ -595,8 +595,7 @@ def run_trainer(

         ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config_{stage}.json".split()
         script = [f"{self.examples_dir_str}/seq2seq/run_translation.py"]
-        num_gpus = get_gpu_count() if distributed else 1
-        launcher = f"deepspeed --num_gpus {num_gpus}".split()
+        launcher = self.get_launcher(distributed)

         cmd = launcher + script + args + ds_args
         # keep for quick debug
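For illustration, the command handed to execute_subprocess_async is just the concatenation of these token lists. A hypothetical rendering with made-up paths (the real values come from the test-case helper attributes):

    # hypothetical values; the real paths come from self.test_file_dir_str etc.
    launcher = "deepspeed --num_nodes 1 --num_gpus 2".split()
    script = ["examples/seq2seq/run_translation.py"]
    args = ["--output_dir", "/tmp/zero2", "--do_train"]
    ds_args = "--deepspeed tests/deepspeed/ds_config_zero2.json".split()

    cmd = launcher + script + args + ds_args
    print(" ".join(cmd))
    # deepspeed --num_nodes 1 --num_gpus 2 examples/seq2seq/run_translation.py \
    #   --output_dir /tmp/zero2 --do_train --deepspeed tests/deepspeed/ds_config_zero2.json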
@@ -629,15 +628,21 @@ def test_clm(self, stage):
             --block_size 128
         """.split()

-        distributed = True
         ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config_{stage}.json".split()
         script = [f"{self.examples_dir_str}/language-modeling/run_clm.py"]
-        num_gpus = get_gpu_count() if distributed else 1
-        launcher = f"deepspeed --num_gpus {num_gpus}".split()
+        launcher = self.get_launcher(distributed=True)

         cmd = launcher + script + args + ds_args
         # keep for quick debug
         # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
         execute_subprocess_async(cmd, env=self.get_env())

         return output_dir
+
+    def get_launcher(self, distributed=False):
+        # 1. explicitly set --num_nodes=1 in case these tests end up run on a multi-node setup,
+        #    which they are not equipped to handle
+        # 2. for now test with at most 2 gpus, since some quality tests may give different
+        #    results with more gpus because we use very little data
+        num_gpus = min(2, get_gpu_count()) if distributed else 1
+        return f"deepspeed --num_nodes 1 --num_gpus {num_gpus}".split()
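A standalone sketch of the new launcher logic, with a stand-in gpu_count parameter playing the role of get_gpu_count() from transformers.testing_utils, to make the capping behavior visible:

    def get_launcher(distributed=False, gpu_count=4):
        # gpu_count is a hypothetical stand-in for get_gpu_count()
        # cap at 2 gpus so low-data quality tests stay reproducible
        num_gpus = min(2, gpu_count) if distributed else 1
        # pin --num_nodes 1 so a multi-node environment cannot widen the job
        return f"deepspeed --num_nodes 1 --num_gpus {num_gpus}".split()

    print(get_launcher(distributed=True))   # ['deepspeed', '--num_nodes', '1', '--num_gpus', '2']
    print(get_launcher(distributed=False))  # ['deepspeed', '--num_nodes', '1', '--num_gpus', '1']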
