-
Notifications
You must be signed in to change notification settings - Fork 4.5k
Description
🐛 Describe the bug
I tried examples/language/gpt/experiments/pipeline_parallel/run.sh and examples/language/gpt/titans/run.sh, but neither of them works. The following are the error messages:
For pipeline_parallel/run.sh, I got:
Traceback (most recent call last):
File "/root/miniconda3/lib/python3.8/site-packages/torch/distributed/rpc/internal.py", line 206, in _run_function
result = python_udf.func(*python_udf.args, **python_udf.kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/torch/distributed/rpc/rref_proxy.py", line 11, in _local_invoke
return getattr(rref.local_value(), func_name)(*args, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/colossalai/pipeline/rpc/_pipeline_base.py", line 230, in sync_global_worker_rrefs
self._initialize_partition()
File "/root/miniconda3/lib/python3.8/site-packages/colossalai/pipeline/rpc/_pipeline_base.py", line 185, in _initialize_partition
self.module_partition: nn.Module = partition_fn(*partition_args).to(device)
File "/ossfs/workspace/ColossalAi/examples/language/gpt/experiments/pipeline_parallel/train_gpt_pp.py", line 74, in partition
module = create_partition_module(pp_rank, stage_num, model, data_kwargs)
File "/ossfs/workspace/ColossalAi/examples/language/gpt/experiments/pipeline_parallel/train_gpt_pp.py", line 61, in create_partition_module
graph = tracer.trace(root=model, meta_args=meta_args)
File "/root/miniconda3/lib/python3.8/site-packages/colossalai/fx/tracer/tracer.py", line 397, in trace
self.graph = super().trace(root, concrete_args=concrete_args)
File "/root/miniconda3/lib/python3.8/site-packages/torch/fx/_symbolic_trace.py", line 739, in trace
(self.create_arg(fn(*args)),),
File "/ossfs/workspace/ColossalAi/examples/language/gpt/experiments/pipeline_parallel/model_zoo.py", line 29, in forward
return self.model(input_ids=input_ids, attention_mask=attention_mask, use_cache=not self.checkpoint)[0]
File "/root/miniconda3/lib/python3.8/site-packages/torch/fx/_symbolic_trace.py", line 717, in module_call_wrapper
return self.call_module(mod, forward, args, kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/colossalai/fx/tracer/tracer.py", line 195, in call_module
return forward(*args, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/torch/fx/_symbolic_trace.py", line 710, in forward
return _orig_module_call(mod, *args, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/transformers/models/gpt2/modeling_gpt2.py", line 1043, in forward
transformer_outputs = self.transformer(
File "/root/miniconda3/lib/python3.8/site-packages/torch/fx/_symbolic_trace.py", line 717, in module_call_wrapper
return self.call_module(mod, forward, args, kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/colossalai/fx/tracer/tracer.py", line 195, in call_module
return forward(*args, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/torch/fx/_symbolic_trace.py", line 710, in forward
return _orig_module_call(mod, *args, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/transformers/models/gpt2/modeling_gpt2.py", line 887, in forward
outputs = block(
File "/root/miniconda3/lib/python3.8/site-packages/torch/fx/_symbolic_trace.py", line 717, in module_call_wrapper
return self.call_module(mod, forward, args, kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/colossalai/fx/tracer/tracer.py", line 195, in call_module
return forward(*args, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/torch/fx/_symbolic_trace.py", line 710, in forward
return _orig_module_call(mod, *args, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/transformers/models/gpt2/modeling_gpt2.py", line 388, in forward
target model.transformer.h.0.attn.c_attn.bias: Could not compute metadata for get_attr target model.transformer.h.0.attn.c_attn.bias: Could not compute metadata for get_attr target model.transformer.h.0.attn.c_attn.bias: Could not compute metadata for get_attr target model.transformer.h.0.attn.c_attn.bias: Could not compute metadata for get_attr target model.transformer.h.0.attn.c_attn.bias: Could not compute metadata for get_attr target model.transformer.h.0.attn.c_attn.bias: Could not compute metadata for get_attr target model.transformer.h.0.attn.c_attn.bias: Could not compute metadata for get_attr target model.transformer.h.0.attn.c_attn.bias: Could not compute metadata for get_attr target model.transformer.h.0.attn.c_attn.bias: Could not compute metadata for get_attr target model.transformer.h.0.attn.c_attn.bias: Could not compute metadata for get_attr target model.transformer.h.0.attn.c_attn.bias: Could not compute metadata for get_attr target model.transformer.h.0.attn.c_attn.bias: Could not compute metadata for get_attr target model.transformer.h.0.attn.c_attn.bias: Could not compute metadata for get_attr target model.transformer.h.0.attn.c_attn.bias: Could not compute metadata for get_attr target model.transformer.h.0.attn.c_attn.bias: Could not compute metadata for get_attr target model.transformer.h.0.attn.c_attn.bias: Could not compute metadata for get_attr target model.transformer.h.0.attn.c_attn.bias: maximum recursion depth exceeded while calling a Python object
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/root/miniconda3/lib/python3.8/site-packages/torch/distributed/rpc/internal.py", line 206, in _run_function
result = python_udf.func(*python_udf.args, **python_udf.kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/torch/distributed/rpc/rref_proxy.py", line 11, in _local_invoke
return getattr(rref.local_value(), func_name)(*args, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/colossalai/pipeline/rpc/_pipeline_base.py", line 230, in sync_global_worker_rrefs
self._initialize_partition()
File "/root/miniconda3/lib/python3.8/site-packages/colossalai/pipeline/rpc/_pipeline_base.py", line 185, in _initialize_partition
self.module_partition: nn.Module = partition_fn(*partition_args).to(device)
File "/ossfs/workspace/ColossalAi/examples/language/gpt/experiments/pipeline_parallel/train_gpt_pp.py", line 74, in partition
module = create_partition_module(pp_rank, stage_num, model, data_kwargs)
File "/ossfs/workspace/ColossalAi/examples/language/gpt/experiments/pipeline_parallel/train_gpt_pp.py", line 61, in create_partition_module
graph = tracer.trace(root=model, meta_args=meta_args)
File "/root/miniconda3/lib/python3.8/site-packages/colossalai/fx/tracer/tracer.py", line 397, in trace
self.graph = super().trace(root, concrete_args=concrete_args)
File "/root/miniconda3/lib/python3.8/site-packages/torch/fx/_symbolic_trace.py", line 739, in trace
(self.create_arg(fn(*args)),),
File "/ossfs/workspace/ColossalAi/examples/language/gpt/experiments/pipeline_parallel/model_zoo.py", line 29, in forward
return self.model(input_ids=input_ids, attention_mask=attention_mask, use_cache=not self.checkpoint)[0]
File "/root/miniconda3/lib/python3.8/site-packages/torch/fx/_symbolic_trace.py", line 717, in module_call_wrapper
return self.call_module(mod, forward, args, kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/colossalai/fx/tracer/tracer.py", line 195, in call_module
return forward(*args, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/torch/fx/_symbolic_trace.py", line 710, in forward
return _orig_module_call(mod, *args, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/transformers/models/gpt2/modeling_gpt2.py", line 1043, in forward
transformer_outputs = self.transformer(
File "/root/miniconda3/lib/python3.8/site-packages/torch/fx/_symbolic_trace.py", line 717, in module_call_wrapper
return self.call_module(mod, forward, args, kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/colossalai/fx/tracer/tracer.py", line 195, in call_module
return forward(*args, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/torch/fx/_symbolic_trace.py", line 710, in forward
return _orig_module_call(mod, *args, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/transformers/models/gpt2/modeling_gpt2.py", line 887, in forward
outputs = block(
File "/root/miniconda3/lib/python3.8/site-packages/torch/fx/_symbolic_trace.py", line 717, in module_call_wrapper
return self.call_module(mod, forward, args, kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/colossalai/fx/tracer/tracer.py", line 195, in call_module
return forward(*args, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/torch/fx/_symbolic_trace.py", line 710, in forward
return _orig_module_call(mod, *args, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/transformers/models/gpt2/modeling_gpt2.py", line 388, in forward
attn_outputs = self.attn(
File "/root/miniconda3/lib/python3.8/site-packages/torch/fx/_symbolic_trace.py", line 717, in module_call_wrapper
return self.call_module(mod, forward, args, kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/colossalai/fx/tracer/tracer.py", line 195, in call_module
return forward(*args, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/torch/fx/_symbolic_trace.py", line 710, in forward
return _orig_module_call(mod, *args, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/transformers/models/gpt2/modeling_gpt2.py", line 310, in forward
query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2)
File "/root/miniconda3/lib/python3.8/site-packages/torch/fx/_symbolic_trace.py", line 717, in module_call_wrapper
return self.call_module(mod, forward, args, kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/colossalai/fx/tracer/tracer.py", line 195, in call_module
return forward(*args, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/torch/fx/_symbolic_trace.py", line 710, in forward
return _orig_module_call(mod, *args, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/transformers/pytorch_utils.py", line 115, in forward
x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight)
File "/root/miniconda3/lib/python3.8/site-packages/torch/fx/_symbolic_trace.py", line 705, in module_getattr_wrapper
return self.getattr(attr, attr_val, parameter_proxy_cache)
File "/root/miniconda3/lib/python3.8/site-packages/torch/fx/_symbolic_trace.py", line 484, in getattr
maybe_parameter_proxy = maybe_get_proxy_for_attr(
File "/root/miniconda3/lib/python3.8/site-packages/torch/fx/_symbolic_trace.py", line 478, in maybe_get_proxy_for_attr
val_proxy = self.create_proxy("get_attr", n, (), {}, **kwargs) # type: ignore[arg-type]
File "/root/miniconda3/lib/python3.8/site-packages/colossalai/fx/tracer/tracer.py", line 142, in create_proxy
meta_out = self._meta_data_computing(
File "/root/miniconda3/lib/python3.8/site-packages/colossalai/fx/tracer/tracer.py", line 308, in _meta_data_computing
raise RuntimeError(f"Could not compute metadata for {kind} target {target}: {e}")
RuntimeError: Could not compute metadata attn_outputs = self.attn(
File "/root/miniconda3/lib/python3.8/site-packages/torch/fx/_symbolic_trace.py", line 717, in module_call_wrapper
return self.call_module(mod, forward, args, kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/colossalai/fx/tracer/tracer.py", line 195, in call_module
return forward(*args, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/torch/fx/_symbolic_trace.py", line 710, in forward
return _orig_module_call(mod, *args, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/transformers/models/gpt2/modeling_gpt2.py", line 310, in forward
query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2)
File "/root/miniconda3/lib/python3.8/site-packages/torch/fx/_symbolic_trace.py", line 717, in module_call_wrapper
return self.call_module(mod, forward, args, kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/colossalai/fx/tracer/tracer.py", line 195, in call_module
return forward(*args, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/torch/fx/_symbolic_trace.py", line 710, in forward
return _orig_module_call(mod, *args, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/transformers/pytorch_utils.py", line 115, in forward
x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight)
File "/root/miniconda3/lib/python3.8/site-packages/torch/fx/_symbolic_trace.py", line 705, in module_getattr_wrapper
return self.getattr(attr, attr_val, parameter_proxy_cache)
File "/root/miniconda3/lib/python3.8/site-packages/torch/fx/_symbolic_trace.py", line 484, in getattr
maybe_parameter_proxy = maybe_get_proxy_for_attr(
File "/root/miniconda3/lib/python3.8/site-packages/torch/fx/_symbolic_trace.py", line 478, in maybe_get_proxy_for_attr
val_proxy = self.create_proxy("get_attr", n, (), {}, **kwargs) # type: ignore[arg-type]
File "/root/miniconda3/lib/python3.8/site-packages/colossalai/fx/tracer/tracer.py", line 142, in create_proxy
meta_out = self._meta_data_computing(
File "/root/miniconda3/lib/python3.8/site-packages/colossalai/fx/tracer/tracer.py", line 308, in _meta_data_computing
raise RuntimeError(f"Could not compute metadata for {kind} target {target}: {e}")
RuntimeError: Could not compute metadata for get_attr target model.transformer.h.0.attn.c_attn.bias: Could not compute metadata for get_attr target model.transformer.h.0.attn.c_attn.bias: Could not compute metadata for get_attr target model.transformer.h.0.attn.c_attn.bias: Could not compute metadata for get_attr target
... repeat * 100
model.transformer.h.0.attn.c_attn.bias: Could not compute metadata for get_attr del.transformer.h.0.attn.c_attn.bias: Could not compute metadata for get_attr target model.transformer.h.0.attn.c_attn.bias: Could not compute metadata for get_attr target model.transformer.h.0.attn.c_attn.bias: Could not compute metadata for get_attr target
model.transformer.h.0.attn.c_attn.bias: Could not compute metadata for get_attr target
... repeat * 100
model.transformer.h.0.attn.c_attn.bias: maximum recursion depth exceeded while calling a Python object
target model.transformer.h.0.attn.c_attn.bias: Could not compute metadata for get_attr target model.transformer.h.0.attn.c_attn.bias: Could not compute metadata for get_attr target
... repeat * 100
model.transformer.h.0.attn.c_attn.bias: maximum recursion depth exceeded while calling a Python object
For titans/run.sh, I got:
sh run.sh
/root/miniconda3/lib/python3.8/site-packages/torch/library.py:130: UserWarning: Overriding a previously registered kernel for the same operator and the same dispatch key
operator: aten::index.Tensor(Tensor self, Tensor?[] indices) -> Tensor
registered at aten/src/ATen/RegisterSchema.cpp:6
dispatch key: Meta
previous kernel: registered at ../aten/src/ATen/functorch/BatchRulesScatterOps.cpp:1053
new kernel: registered at /dev/null:219 (Triggered internally at ../aten/src/ATen/core/dispatch/OperatorEntry.cpp:150.)
self.m.impl(name, dispatch_key, fn)
/bin/bash: line 0: fg: no job control
Error: failed to run torchrun --nproc_per_node=2 --nnodes=1 --node_rank=0 --rdzv_backend=c10d --rdzv_endpoint=127.0.0.1:29500 --rdzv_id=colossalai-default-job train_gpt.py --config ./configs/gpt2_small_zero3_pp1d.py --from_torch --use_dummy_dataset on 127.0.0.1, is localhost: True, exception: Encountered a bad command exit code!
Command: 'cd /ossfs/workspace/ColossalAi/examples/language/gpt/titans && export AS="/root/miniconda3/bin/x86_64-conda-linux-gnu-as" LC_PAPER="en_US.UTF-8" LDFLAGS="-Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,-rpath,/root/miniconda3//lib -Wl,-rpath-link,/root/miniconda3//lib -L/root/miniconda3//lib" AR="/root/miniconda3/bin/x86_64-conda-linux-gnu-ar" POD_NAMESPACE="kubemaker" BIZ_ID="132704^pai_alipay^85816978^2023-02-26" CONDA_BACKUP_RANLIB="/root/miniconda3//bin/x86_64-conda-linux-gnu-ranlib" CONDA_BACKUP_DEBUG_CXXFLAGS="-fvisibility-inlines-hidden -std=c++17 -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /root/miniconda3//include" CUDA_PATH="/root/miniconda3" LC_ADDRESS="en_US.UTF-8" AISTUDIO_JCS_SUB_JOB_ID="1" GCC_NM="/root/miniconda3/bin/x86_64-conda-linux-gnu-gcc-nm" LC_MONETARY="en_US.UTF-8" HADOOP_LOGS="/home/hadoop/hadoop-data/hadoop-logs" HOSTNAME="kmaker-49-011036210011" THEIA_MINI_BROWSER_HOST_PATTERN="{{hostname}}" CONDA_MIRRORS="mirrors.bfsu.edu.cn" PIP_NO_CACHE_DIR="1" GIT__CAN_USE_NO_OPTIONAL_LOCKS="true" JUPYTER_WORK_DIR="/root" ENV_ARGO_WORKFLOW_NAME="aistudio-85816978" HOST="x86_64-conda-linux-gnu" TERM="xterm-color" ENV_GREY_IMAGE_TYPE="false" KUBERNETES_PORT_443_TCP_PORT="443" KUBERNETES_PORT="tcp://172.16.0.1:443" ARGO_DEADLINE="2023-03-19T07:46:41Z" SHELL="/bin/bash" NM="/root/miniconda3/bin/x86_64-conda-linux-gnu-nm" HADOOP_HOME="/hadoop-client-dev/bin/hadoop" ALIPAY_APP_ZONE="GZ00G" AISTUDIO_TASK_ROOT_PATH="/home/admin" CONDA_BACKUP_CXXFILT="/root/miniconda3//bin/x86_64-conda-linux-gnu-c++filt" CPPFLAGS="-DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /root/miniconda3//include" LEGACY_CONTAINER_SIZE_CPU_COUNT="2" CONDA_BACKUP_AR="/root/miniconda3//bin/x86_64-conda-linux-gnu-ar" CONDA_BACKUP_GXX="/root/miniconda3//bin/x86_64-conda-linux-gnu-g++" 
JUPYTER_NOTEBOOK_DIR="/ossfs" CONDA_SHLVL="1" CONDA_BACKUP_AS="/root/miniconda3//bin/x86_64-conda-linux-gnu-as" CONDA_BACKUP_CONDA_BUILD_SYSROOT="/root/miniconda3//x86_64-conda-linux-gnu/sysroot" JUPYTER_RUNTIME_DIR="/root/.local/share/jupyter/runtime" AISTUDIO_JOB_NAME="2H2BC7II" KUBERNETES_SERVICE_PORT="6443" ULOGFS_ENABLED="true" CONDA_PROMPT_MODIFIER="(base) " SIZE="/root/miniconda3/bin/x86_64-conda-linux-gnu-size" LC_NUMERIC="en_US.UTF-8" SYSTEMCTL_SKIP_REDIRECT="1" ENV_ODPS_ACCESS_KEY="" THEIA_WEBVIEW_EXTERNAL_ENDPOINT="{{hostname}}" KUBERNETES_SERVICE_HOST="apiserver.sigma-stl.svc.stl.alipay.com" POD_NAME="aistudio-85816978-2502438577" IDE_COMMON_OSS_BUCKET="dmsint" ENV_ODPS_ACCESS_ID="LTAIQH9JObhyEydB" CONDA_BACKUP_LD="/root/miniconda3//bin/x86_64-conda-linux-gnu-ld" CONDA_BACKUP_STRIP="/root/miniconda3//bin/x86_64-conda-linux-gnu-strip" CXX_FOR_BUILD="/root/miniconda3/bin/x86_64-conda-linux-gnu-c++" AISTUDIO_INNER_ZONE="prod" LC_ALL="en_US.UTF-8" D2_CYCTIME="20230227154641" CUDA_HOME="/root/miniconda3" PYPI_MIRRORS="mirrors.bfsu.edu.cn/pypi/web" ENV_SIGMA_APP_NAME="kmaker" CONDA_BACKUP_DEBUG_CPPFLAGS="-D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /root/miniconda3//include" LC_TELEPHONE="en_US.UTF-8" LD_LIBRARY_PATH="/usr/lib64::/lib:/lib64:/usr/lib64:/usr/lib:/root/miniconda3/lib/:/root/miniconda3/lib/python3.6/site-packages/aistudio_common/reader/libs/:/opt/taobao/java/jre/lib/amd64/server/:/usr/local/cuda/lib64:/usr/local/lib" NVIDIA_VISIBLE_DEVICES="GPU-fbacfc74-9e5e-4d0c-aeaa-a1825b3d527e,GPU-f9bdac76-fd45-104b-dee4-2e212f380492" SIGMA_MAX_PROCESSORS_LIMIT="2" DefaultRoute="11.36.211.253" ILOGTAIL_PODNAME="aistudio-85816978-2502438577" CONDA_BACKUP_SIZE="/root/miniconda3//bin/x86_64-conda-linux-gnu-size" CONDA_BACKUP_GCC_NM="/root/miniconda3//bin/x86_64-conda-linux-gnu-gcc-nm" ALIPAY_APP_APPNAME="kmaker" EXECUTIONRECORD_ID="85816978" CONDA_BACKUP_HOST="x86_64-conda-linux-gnu" AISTUDIO_NAMESPACE="workflow_15090068" ENV_EXPERIMENT_TYPE_ENUM="K8S_CONTAINER" 
BATCH_SIZE="32" CONDA_EXE="/root/miniconda3/bin/conda" USER_NAME="aaron.hx" CONDA_BACKUP_GPROF="/root/miniconda3//bin/x86_64-conda-linux-gnu-gprof" CONDA_BACKUP_CXX_FOR_BUILD="/root/miniconda3//bin/x86_64-conda-linux-gnu-c++" USERNUMBER="132704" VSCODE_API_VERSION="1.53.2" TF_JNI_OPTS="-XX:ErrorFile=/tmp/hs_err_pid_%p.log" NVIDIA_DRIVER_CAPABILITIES="all" CONDA_BACKUP_BUILD="x86_64-conda-linux-gnu" DATA="/data/scratch/gpt_data/small-gpt-dataset.json" CXXFLAGS="-fvisibility-inlines-hidden -std=c++17 -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /root/miniconda3//include" ENV_ARGO_NODE_NAME="aistudio-85816978-2502438577" ODPS_PROJECT="ant_p13n_dev" LINKB_APP_NAME="15090068" LD_GOLD="/root/miniconda3/bin/x86_64-conda-linux-gnu-ld.gold" WORKFLOW_ID="15090068" CONDA_BUILD_SYSROOT="/root/miniconda3/x86_64-conda-linux-gnu/sysroot" STRINGS="/root/miniconda3/bin/x86_64-conda-linux-gnu-strings" AISTUDIO_SITE_ENUM="INTERNAL" CONDA_BACKUP_LDFLAGS="-Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,-rpath,/root/miniconda3//lib -Wl,-rpath-link,/root/miniconda3//lib -L/root/miniconda3//lib" CONDA_BACKUP_host_alias="x86_64-conda-linux-gnu" CPP="/root/miniconda3/bin/x86_64-conda-linux-gnu-cpp" pouch_container_image="reg.docker.alibaba-inc.com/aii/aistudio:3350123-20230221212038_nydus_v2" CXXFILT="/root/miniconda3/bin/x86_64-conda-linux-gnu-c++filt" ali_run_mode="alipay_container" ZHENJIN_HTTP_PREFIX="http://cmps-model.cn-hangzhou.alipay.aliyun-inc.com/264991" ULOGFS_ZCLEAN_ENABLE="true" CONDA_BACKUP_CXX="/root/miniconda3//bin/x86_64-conda-linux-gnu-c++" 
PATH="/root/miniconda3/bin:/root/miniconda3/condabin:/usr/local/cuda/bin:/root/.tnvm/versions/alinode/v5.20.3/bin:/root/coreutils/bin:/root/miniconda3/bin:/root/.tnvm/versions/alinode/v5.20.3/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/X11R6/bin:/opt/satools:/opt/taobao/java/bin:/opt/odpscmd_public/bin:/root/apache-maven-3.6.3/bin" CONDA_BACKUP_CXXFLAGS="-fvisibility-inlines-hidden -std=c++17 -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /root/miniconda3//include" LC_MESSAGES="en_US.UTF-8" ACCELERATE_DISABLE_RICH="1" ODPS_ALIYUN_ID="bs3yocnlmjdg@aliyun.com" ENV_CODE_NAME="dev_container" ILOGTAIL_ENV="{"Appname":"kmaker","LogAppname":"","Idcname":"stl","Apppath":"/home/admin/logs","Taglist":{"POD_NAME":"aistudio-85816978-2502438577","aistudio":"aistudio-85816978","app":"argo","component":"workflow","kubemaker":"aistudio-85816978"}}" JUPYTER_SERVICE_PORT="8080" DEBUG_CXXFLAGS="-fvisibility-inlines-hidden -std=c++17 -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /root/miniconda3//include" BUILD="x86_64-conda-linux-gnu" LD="/root/miniconda3/bin/x86_64-conda-linux-gnu-ld" CONDA_PREFIX="/root/miniconda3" LC_IDENTIFICATION="en_US.UTF-8" LC_COLLATE="en_US.UTF-8" ARGO_PROGRESS_FILE="/var/run/argo/progress" PWD="/ossfs/workspace/ColossalAi/examples/language/gpt/titans" AISTUDIO_PROXY_ADDR="https://aistudioproxy.alipay.com/proxy/workflow_15090068:8080" STRIP="/root/miniconda3/bin/x86_64-conda-linux-gnu-strip" JAVA_HOME="/opt/taobao/java" npm_config_user="root" pouchSupportCgroup="true" ENV_ODPS_PROJECT_NAME="ant_p13n_dev" GPUNUM="2" CONDA_BACKUP_CC_FOR_BUILD="/root/miniconda3//bin/x86_64-conda-linux-gnu-cc" 
CMAKE_ARGS="-DCMAKE_LINKER=/root/miniconda3/bin/x86_64-conda-linux-gnu-ld -DCMAKE_STRIP=/root/miniconda3/bin/x86_64-conda-linux-gnu-strip" ELFEDIT="/root/miniconda3/bin/x86_64-conda-linux-gnu-elfedit" EDITOR="vim" pouch_container_id="f3591b81c52317c948382e9463490ea0e3b43ba0a3ad660f7371749003d55f31" CONDA_BACKUP_OBJCOPY="/root/miniconda3//bin/x86_64-conda-linux-gnu-objcopy" GCC_RANLIB="/root/miniconda3/bin/x86_64-conda-linux-gnu-gcc-ranlib" LANG="en_US.UTF-8" LOCAL_GIT_DIRECTORY="/root/miniconda3" io_alibaba_pouch_snapshotter="rafs" LAUNCH_CONTAINER_MODE="dev_container" LC_MEASUREMENT="en_US.UTF-8" IDE_SERVICE_PORT="8088" CONDA_BACKUP_OBJDUMP="/root/miniconda3//bin/x86_64-conda-linux-gnu-objdump" SN="5caf8b60-a482-4065-bbee-09a0b0621a94" JUPYTER_CONFIG_DIR="/root/.jupyter" ALIPAY_POD_NAME="aistudio-85816978-2502438577" ENV_TYPE="prod" ali_runtime_type="runc" CONDA_BACKUP__CONDA_PYTHON_SYSCONFIGDATA_NAME="_sysconfigdata_x86_64_conda_cos7_linux_gnu" ULOGFS_STREAM_ENABLED="true" ALIPAY_POD_NAMESPACE="kubemaker" ODPS_ENDPOINT="http://service.odps.aliyun-inc.com/api" CXX="/root/miniconda3/bin/x86_64-conda-linux-gnu-c++" CC_FOR_BUILD="/root/miniconda3/bin/x86_64-conda-linux-gnu-cc" ENDPOINT="http://service.odps.aliyun-inc.com/api" OBJCOPY="/root/miniconda3/bin/x86_64-conda-linux-gnu-objcopy" PANGU_CLUSTER_NAME="pangu1_analyze_sata_em14_online" ARGO_TEMPLATE="" CONDA_BACKUP_ELFEDIT="/root/miniconda3//bin/x86_64-conda-linux-gnu-elfedit" CONDA_BACKUP_GCC_AR="/root/miniconda3//bin/x86_64-conda-linux-gnu-gcc-ar" TOKENIZERS_PARALLELISM="false" TF_CPP_MIN_LOG_LEVEL="2" SHLVL="5" HOME="/root" LESSCHARSET="utf-8" ALIPAY_APP_ENV="prod" LANGUAGE="en_us" CONDA_BACKUP_NM="/root/miniconda3//bin/x86_64-conda-linux-gnu-nm" CONDA_BACKUP_build_alias="x86_64-conda-linux-gnu" ENV_ODPS_ALIYUN_ID="bs3yocnlmjdg@aliyun.com" KUBERNETES_PORT_443_TCP_PROTO="tcp" USE_LOCAL_GIT="true" DEBUG_CPPFLAGS="-D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /root/miniconda3//include" CFLAGS="-march=nocona 
-mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /root/miniconda3//include" KUBERNETES_SERVICE_PORT_HTTPS="443" _CONDA_PYTHON_SYSCONFIGDATA_NAME="_sysconfigdata_x86_64_conda_cos7_linux_gnu" GCC="/root/miniconda3/bin/x86_64-conda-linux-gnu-gcc" AISTUDIO_DOMAIN_CODE="experience.l2domain" ENV_AISTUDIO_HOST="aistudio.alipay.com" CONDA_BACKUP_DEBUG_CFLAGS="-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /root/miniconda3//include" JUPYTER_DATA_DIR="/root/.local/share/jupyter" IMAGE_TYPE="3350123" BASH_ENV="/root/.bashrc" CONDA_BACKUP_GCC="/root/miniconda3//bin/x86_64-conda-linux-gnu-gcc" CONDA_BACKUP_CMAKE_ARGS="-DCMAKE_LINKER=/root/miniconda3//bin/x86_64-conda-linux-gnu-ld -DCMAKE_STRIP=/root/miniconda3//bin/x86_64-conda-linux-gnu-strip" ADDR2LINE="/root/miniconda3/bin/x86_64-conda-linux-gnu-addr2line" PYTHONPATH="/root/miniconda3/lib/python3.8/site-packages/aistudio_notebook/public:/root/miniconda3/lib/python3.8/site-packages/aistudio_notebook/public:/root/miniconda3/lib/python3.8/site-packages/aistudio_notebook/public:/root/miniconda3/lib/python3.8/site-packages/aistudio_notebook/public:/root/miniconda3/lib/python3.8/site-packages/aistudio_notebook/public:" RequestedIP="11.36.210.11" ENV_BIZDATE="20230226154640" ENV_GROUP_ID="5211" ARGO_PROGRESS_FILE_TICK_DURATION="3s" CONDA_PYTHON_EXE="/root/miniconda3/bin/python" ARGO_INCLUDE_SCRIPT_OUTPUT="false" IS_DEV_CONTAINER="true" TPDEGREE="2" build_alias="x86_64-conda-linux-gnu" CLASSPATH="/root/miniconda3/lib/python3.6/site-packages/aistudio_common/reader/libs/penrose-1.0-SNAPSHOT-jar-with-dependencies.jar" LC_CTYPE="en_US.UTF-8" CONDA_BACKUP_GCC_RANLIB="/root/miniconda3//bin/x86_64-conda-linux-gnu-gcc-ranlib" CONDA_BACKUP_CMAKE_PREFIX_PATH="/root/miniconda3/:/root/miniconda3//x86_64-conda-linux-gnu/sysroot/usr" OMP_NUM_THREADS="1" 
AJDK_MAX_PROCESSORS_LIMIT="2" ARGO_PROGRESS_PATCH_TICK_DURATION="1m0s" AISTUDIO_JOB_TYPE="launchContainer" NODE_LOG_DIR="/logs/alinode" ENABLE_NODE_LOG="YES" CONDA_DEFAULT_ENV="base" CONDA_BACKUP_CC="/root/miniconda3//bin/x86_64-conda-linux-gnu-cc" DEBUG_CFLAGS="-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /root/miniconda3//include" RANLIB="/root/miniconda3/bin/x86_64-conda-linux-gnu-ranlib" ANTB_BUILD_PLATFORM="AISTUDIO" PROMPT_COMMAND="printf "\033]0;%s@%s:%s\007" "${USER}" "${HOSTNAME%%.*}" "${PWD/#$HOME/~}";sh /etc/sysconfig/bash-prompt-history" IPYTHON_PROFILE_PATH="/root/.ipython/profile_default" CONDA_BACKUP_STRINGS="/root/miniconda3//bin/x86_64-conda-linux-gnu-strings" CMAKE_PREFIX_PATH="/root/miniconda3:/root/miniconda3/x86_64-conda-linux-gnu/sysroot/usr" CC="/root/miniconda3/bin/x86_64-conda-linux-gnu-cc" AISTUDIO_COMMON_PATH="/root/miniconda3/lib/python3.6/site-packages/aistudio_common" ENV_ODPS_ENDPOINT="http://service.odps.aliyun-inc.com/api" KUBERNETES_PORT_443_TCP_ADDR="172.16.0.1" CONDA_BACKUP_LD_GOLD="/root/miniconda3//bin/x86_64-conda-linux-gnu-ld.gold" host_alias="x86_64-conda-linux-gnu" READELF="/root/miniconda3/bin/x86_64-conda-linux-gnu-readelf" ARGO_CONTAINER_NAME="main" AISTUDIO_JCS_JOB_ID="stl##aistudio-85816978##kmaker" VISUAL_DATA_PATH="pangu://pangu1_analyze_sata_em14_online/pai/aistudio/checkpoint/aistudio-85816978" DefaultMask="255.255.252.0" ODPS_ACCESS_KEY="" ALIPAY_SIGMA_CPUMODE="cpushare" KUBERNETES_PORT_443_TCP="tcp://172.16.0.1:443" CONDA_BACKUP_ADDR2LINE="/root/miniconda3//bin/x86_64-conda-linux-gnu-addr2line" CONDA_BACKUP_READELF="/root/miniconda3//bin/x86_64-conda-linux-gnu-readelf" CONDA_BACKUP_CPP="/root/miniconda3//bin/x86_64-conda-linux-gnu-cpp" GCC_AR="/root/miniconda3/bin/x86_64-conda-linux-gnu-gcc-ar" OBJDUMP="/root/miniconda3/bin/x86_64-conda-linux-gnu-objdump" LC_TIME="en_US.UTF-8" 
container="placeholder" ODPS_ACCESS_ID="LTAIQH9JObhyEydB" WORKFLOW_API_PARAM_FILE="/ossfs/.param.conf" CONDA_BACKUP_CPPFLAGS="-DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /root/miniconda3//include" CONDA_BACKUP_CFLAGS="-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /root/miniconda3//include" GPROF="/root/miniconda3/bin/x86_64-conda-linux-gnu-gprof" ENABLE_AISTUDIO_READER_SLEEP="true" GIT_EXEC_PATH="/root/miniconda3/libexec/git-core" GXX="/root/miniconda3/bin/x86_64-conda-linux-gnu-g++" LC_NAME="en_US.UTF-8" POD_IP="11.36.210.11" _="/root/miniconda3/bin/colossalai" && torchrun --nproc_per_node=2 --nnodes=1 --node_rank=0 --rdzv_backend=c10d --rdzv_endpoint=127.0.0.1:29500 --rdzv_id=colossalai-default-job train_gpt.py --config ./configs/gpt2_small_zero3_pp1d.py --from_torch --use_dummy_dataset'
Exit code: 1
Stdout: already printed
Stderr: already printed
====== Training on All Nodes =====
127.0.0.1: failure
====== Stopping All Nodes =====
127.0.0.1: finish
Could you help with the issues?
Environment
2 * v100 32G
pytorch 1.13 + cu117
python 3.8.13