🐛 Describe the bug
GPU info: 3 nodes, 4 GPUs per node (GeForce RTX 2080 Ti)
pp: 3
tp: 2
dp: 2
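For context, here is a rough sketch of how these sizes map onto the ColossalAI config and the per-node launch. The config-dict layout, the `mode='1d'` tensor-parallel mode, and the exact torchrun flags are assumptions based on the ColossalAI-Examples GPT scripts, not copies of my actual files:

```python
# Sketch only (assumed layout, not my exact config file).
# World size = 3 nodes x 4 GPUs = 12, so dp = 12 / (pp * tp) = 2 follows automatically.
parallel = dict(
    pipeline=3,                      # pp: 3
    tensor=dict(size=2, mode='1d'),  # tp: 2 (mode is an assumption)
)

# Each of the 3 nodes is started with something like:
#   torchrun --nnodes=3 --nproc_per_node=4 --node_rank=<0|1|2> \
#            --master_addr=<master ip> --master_port=<port> test_gpt.py --from_cpt <checkpoint path>
```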
I used train_test.py from the ColossalAI-Examples project and got a checkpoint file, which I now want to load for testing, like this:

```python
trainer = Trainer(engine=engine, logger=logger, timer=timier)
last_epoch = 0
if len(args.from_cpt) > 0 & os.path.exists(args.from_cpt):
    last_epoch = load_checkpoint(args.from_cpt, model, _, _, False)
```
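For readability, here is the same loading logic written out with an explicit `and` and with named objects in place of the `_` placeholders. The import path is taken from the traceback below; `optimizer` and `lr_scheduler` are illustrative names only, not identifiers from my script:

```python
import os
from colossalai.utils.checkpointing import load_checkpoint  # module path taken from the traceback below

last_epoch = 0
if len(args.from_cpt) > 0 and os.path.exists(args.from_cpt):
    # Positional order inferred from the call above:
    # (checkpoint path, model, optimizer, lr_scheduler, strict)
    last_epoch = load_checkpoint(args.from_cpt, model, optimizer, lr_scheduler, False)
```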
However, loading the checkpoint fails with the following error:
```
Traceback (most recent call last):
File "/workspace/ColossalAI-Examples/language/gpt/test_gpt.py", line 150, in
main()
File "/workspace/ColossalAI-Examples/language/gpt/test_gpt.py", line 128, in main
last_epoch = load_checkpoint(args.from_cpt, model, _, _, False)
File "/opt/conda/lib/python3.9/site-packages/colossalai/utils/checkpointing.py", line 276, in load_checkpoint
raise e
File "/opt/conda/lib/python3.9/site-packages/colossalai/utils/checkpointing.py", line 263, in load_checkpoint
broadcast_model(model)
File "/opt/conda/lib/python3.9/site-packages/colossalai/utils/checkpointing.py", line 200, in broadcast_model
dist.broadcast(p, src_rank, group=group)
File "/opt/conda/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py", line 1197, in broadcast
work = group.broadcast([tensor], opts)
RuntimeError: [1] is setting up NCCL communicator and retreiving ncclUniqueId from [0] via c10d key-value store by key '0', but store->get('0') got error: Connection reset by peer
Traceback (most recent call last):
File "/workspace/ColossalAI-Examples/language/gpt/test_gpt.py", line 150, in
main()
File "/workspace/ColossalAI-Examples/language/gpt/test_gpt.py", line 128, in main
last_epoch = load_checkpoint(args.from_cpt, model, _, _, False)
File "/opt/conda/lib/python3.9/site-packages/colossalai/utils/checkpointing.py", line 279, in load_checkpoint
state_dict = broadcast_state_dict(state_dict, ParallelMode.MODEL)
File "/opt/conda/lib/python3.9/site-packages/colossalai/utils/checkpointing.py", line 23, in broadcast_state_dict
dist.broadcast_object_list(state_dict, src=src_rank, group=gpc.get_cpu_group(parallel_mode))
File "/opt/conda/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py", line 1877, in broadcast_object_list
broadcast(object_sizes_tensor, src=src, group=group)
File "/opt/conda/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py", line 1201, in broadcast
work.wait()
RuntimeError: [/opt/conda/conda-bld/pytorch_1659484806139/work/third_party/gloo/gloo/transport/tcp/pair.cc:598] Connection closed by peer [10.20.0.205]:32579
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3555 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3556 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3557 closing signal SIGTERM
Traceback (most recent call last):
File "/workspace/ColossalAI-Examples/language/gpt/test_gpt.py", line 150, in
main()
File "/workspace/ColossalAI-Examples/language/gpt/test_gpt.py", line 128, in main
last_epoch = load_checkpoint(args.from_cpt, model, _, _, False)
File "/opt/conda/lib/python3.9/site-packages/colossalai/utils/checkpointing.py", line 279, in load_checkpoint
state_dict = broadcast_state_dict(state_dict, ParallelMode.MODEL)
File "/opt/conda/lib/python3.9/site-packages/colossalai/utils/checkpointing.py", line 23, in broadcast_state_dict
dist.broadcast_object_list(state_dict, src=src_rank, group=gpc.get_cpu_group(parallel_mode))
File "/opt/conda/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py", line 1877, in broadcast_object_list
broadcast(object_sizes_tensor, src=src, group=group)
File "/opt/conda/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py", line 1201, in broadcast
work.wait()
RuntimeError: [/opt/conda/conda-bld/pytorch_1659484806139/work/third_party/gloo/gloo/transport/tcp/pair.cc:598] Connection closed by peer [10.20.0.205]:62621
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 3 (pid: 3558) of binary: /opt/conda/bin/python
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 2 (pid: 14339) of binary: /opt/conda/bin/python
Traceback (most recent call last):
File "/opt/conda/bin/torchrun", line 33, in
sys.exit(load_entry_point('torch==1.12.1', 'console_scripts', 'torchrun')())
File "/opt/conda/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/errors/init.py", line 345, in wrapper
return f(*args, **kwargs)
File "/opt/conda/lib/python3.9/site-packages/torch/distributed/run.py", line 761, in main
run(args)
File "/opt/conda/lib/python3.9/site-packages/torch/distributed/run.py", line 752, in run
elastic_launch(
File "/opt/conda/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 131, in call
Traceback (most recent call last):
File "/opt/conda/bin/torchrun", line 33, in
sys.exit(load_entry_point('torch==1.12.1', 'console_scripts', 'torchrun')())
File "/opt/conda/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/errors/init.py", line 345, in wrapper
return f(*args, **kwargs)
File "/opt/conda/lib/python3.9/site-packages/torch/distributed/run.py", line 761, in main
return launch_agent(self._config, self._entrypoint, list(args))
File "/opt/conda/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
```
Environment
```
CONDA_DEFAULT_ENV="base"
CONDA_PROMPT_MODIFIER="(base) "
CONDA_PYTHON_EXE="/opt/conda/bin/python"
CONDA_SHLVL="1"
CUDA_HOME="/usr/local/cuda"
CUDA_VERSION="11.6.1"
DATA="/workspace/gpt/traindata/train_data.json"
LD_LIBRARY_PATH="/root/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64"
LESSOPEN="| /usr/bin/lesspipe %s"
LIBRARY_PATH="/usr/local/cuda/lib64/stubs"
NVARCH="x86_64"
NV_CUDA_COMPAT_PACKAGE="cuda-compat-11-6"
NV_CUDA_CUDART_DEV_VERSION="11.6.55-1"
NV_CUDA_CUDART_VERSION="11.6.55-1"
NV_CUDA_LIB_VERSION="11.6.1-1"
NV_CUDNN_PACKAGE_DEV="libcudnn8-dev=8.4.0.27-1+cuda11.6"
NV_CUDNN_PACKAGE="libcudnn8=8.4.0.27-1+cuda11.6"
NV_CUDNN_PACKAGE_NAME="libcudnn8"
NV_CUDNN_VERSION="8.4.0.27"
NVIDIA_DRIVER_CAPABILITIES="compute,utility"
NVIDIA_REQUIRE_CUDA="cuda>=11.6 brand=tesla,driver>=418,driver<419 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 brand=quadrortx,driver>=470,driver<471"
NV_LIBCUBLAS_DEV_PACKAGE="libcublas-dev-11-6=11.8.1.74-1"
NVIDIA_VISIBLE_DEVICES="all"
NV_LIBCUBLAS_DEV_PACKAGE_NAME="libcublas-dev-11-6"
NV_LIBCUBLAS_DEV_VERSION="11.8.1.74-1"
NV_LIBCUBLAS_PACKAGE="libcublas-11-6=11.8.1.74-1"
NV_LIBCUBLAS_PACKAGE_NAME="libcublas-11-6"
NV_LIBCUBLAS_VERSION="11.8.1.74-1"
NV_LIBCUSPARSE_DEV_VERSION="11.7.2.112-1"
NV_LIBCUSPARSE_VERSION="11.7.2.112-1"
NV_LIBNCCL_DEV_PACKAGE="libnccl-dev=2.12.7-1+cuda11.6"
NV_LIBNCCL_DEV_PACKAGE_NAME="libnccl-dev"
NV_LIBNCCL_DEV_PACKAGE_VERSION="2.12.7-1"
NV_LIBNCCL_PACKAGE="libnccl2=2.12.7-1+cuda11.6"
NV_LIBNCCL_PACKAGE_NAME="libnccl2"
NV_LIBNCCL_PACKAGE_VERSION="2.12.7-1"
NV_LIBNPP_DEV_PACKAGE="libnpp-dev-11-6=11.6.2.112-1"
NV_LIBNPP_DEV_VERSION="11.6.2.112-1"
NV_LIBNPP_PACKAGE="libnpp-11-6=11.6.2.112-1"
NV_LIBNPP_VERSION="11.6.2.112-1"
NCCL_VERSION="2.12.7-1"
NV_NVML_DEV_VERSION="11.6.55-1"
NV_NVPROF_DEV_PACKAGE="cuda-nvprof-11-6=11.6.112-1"
NV_NVPROF_VERSION="11.6.112-1"
NV_NVTX_VERSION="11.6.112-1"
PATH="/opt/conda/bin:/opt/conda/condabin:/opt/conda/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
```