Is there an existing issue for this bug?
- I have searched the existing issues
The bug has not been fixed in the latest main branch
- I have checked the latest main branch
Do you feel comfortable sharing a concise (minimal) script that reproduces the error? :)
Yes, I will share a minimal reproducible script.
🐛 Describe the bug

```
[rank104]: Traceback (most recent call last):
[rank104]:   File "lora_finetune.py", line 481, in <module>
[rank104]:     train(args)
[rank104]:   File "lora_finetune.py", line 308, in train
[rank104]:     outputs = booster.execute_pipeline(
[rank104]:   File "/opt/conda/lib/python3.8/site-packages/colossalai/booster/booster.py", line 221, in execute_pipeline
[rank104]:     return self.plugin.execute_pipeline(data_iter, model, criterion, optimizer, return_loss, return_outputs)
[rank104]:   File "/opt/conda/lib/python3.8/site-packages/colossalai/booster/plugin/hybrid_parallel_plugin.py", line 1409, in execute_pipeline
[rank104]:     outputs = self.scheduler.forward_backward_step(
[rank104]:   File "/opt/conda/lib/python3.8/site-packages/colossalai/pipeline/schedule/one_f_one_b.py", line 472, in forward_backward_step
[rank104]:     result = self.run_forward_backward(model, data_iter, criterion, optimizer, return_loss, return_outputs)
[rank104]:   File "/opt/conda/lib/python3.8/site-packages/colossalai/pipeline/schedule/one_f_one_b.py", line 416, in run_forward_backward
[rank104]:     input_obj_grad = self.backward_step(optimizer, input_obj, output_obj, output_obj_grad)
[rank104]:   File "/opt/conda/lib/python3.8/site-packages/colossalai/pipeline/schedule/one_f_one_b.py", line 314, in backward_step
[rank104]:     optimizer.backward_by_grad(tensors_to_backward[0], grads_to_backward[0])
[rank104]:   File "/opt/conda/lib/python3.8/site-packages/colossalai/booster/plugin/hybrid_parallel_plugin.py", line 832, in backward_by_grad
[rank104]:     super().backward_by_grad(tensor, grad, inputs=inputs, retain_graph=retain_graph)
[rank104]:   File "/opt/conda/lib/python3.8/site-packages/colossalai/zero/low_level/low_level_optim.py", line 479, in backward_by_grad
[rank104]:     torch.autograd.backward(
[rank104]:   File "/opt/conda/lib/python3.8/site-packages/torch/autograd/__init__.py", line 289, in backward
[rank104]:     _engine_run_backward(
[rank104]:   File "/opt/conda/lib/python3.8/site-packages/torch/autograd/graph.py", line 768, in _engine_run_backward
[rank104]:     return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
[rank104]:   File "/opt/conda/lib/python3.8/site-packages/torch/autograd/function.py", line 306, in apply
[rank104]:     return user_fn(self, *args)
[rank104]:   File "/opt/conda/lib/python3.8/site-packages/torch/utils/checkpoint.py", line 296, in backward
[rank104]:     outputs = ctx.run_function(*detached_inputs)
[rank104]:   File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
[rank104]:     return self._call_impl(*args, **kwargs)
[rank104]:   File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
[rank104]:     return forward_call(*args, **kwargs)
[rank104]:   File "/root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py", line 1228, in forward
[rank104]:     hidden_states = self.mlp(hidden_states)
[rank104]:   File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
[rank104]:     return self._call_impl(*args, **kwargs)
[rank104]:   File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
[rank104]:     return forward_call(*args, **kwargs)
[rank104]:   File "/opt/conda/lib/python3.8/site-packages/colossalai/shardformer/modeling/deepseek_v3.py", line 81, in forward
[rank104]:     y = self.moe_forward(hidden_states, topk_idx, topk_weight).view(*orig_shape)
[rank104]:   File "/opt/conda/lib/python3.8/site-packages/colossalai/shardformer/modeling/deepseek_v3.py", line 100, in moe_forward
[rank104]:     gathered_tokens, _ = all_to_all_uneven(sorted_tokens, input_split_sizes, output_splits, self.ep_group)
[rank104]:   File "/opt/conda/lib/python3.8/site-packages/colossalai/moe/_operation.py", line 452, in all_to_all_uneven
[rank104]:     return AllToAllUneven.apply(inputs, input_split_sizes, output_split_sizes, group, overlap, fp8_communication)
[rank104]:   File "/opt/conda/lib/python3.8/site-packages/torch/autograd/function.py", line 574, in apply
[rank104]:     return super().apply(*args, **kwargs)  # type: ignore[misc]
[rank104]:   File "/opt/conda/lib/python3.8/site-packages/colossalai/moe/_operation.py", line 428, in forward
[rank104]:     return _all_to_all(
[rank104]:   File "/opt/conda/lib/python3.8/site-packages/colossalai/moe/_operation.py", line 395, in _all_to_all
[rank104]:     outputs = torch.empty(outputs_shape, dtype=inputs.dtype, device=inputs.device)
[rank104]: RuntimeError: Trying to create tensor with negative dimension -2058873370790320781: [-2058873370790320781, 7168]
```
Observed with EP=16, PP=3.
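For context on the failure mode: `_all_to_all` allocates its receive buffer directly from the output split sizes, so any corrupted size reaches `torch.empty` unchecked. Below is a minimal sketch of how such a split-size exchange over the EP group typically works; the function and variable names are illustrative, not ColossalAI's actual internals:

```python
import torch
import torch.distributed as dist

def exchange_split_sizes(input_split_sizes, group):
    """Each EP rank tells every other rank how many tokens it will send.

    Illustrative only: this mirrors the bookkeeping that must happen
    before all_to_all_uneven can allocate its receive buffer, not
    ColossalAI's actual code.
    """
    send = torch.tensor(input_split_sizes, dtype=torch.long, device="cuda")
    recv = torch.empty_like(send)
    dist.all_to_all_single(recv, send, group=group)
    output_split_sizes = recv.tolist()
    # A value like -2058873370790320781 here means the exchange read
    # garbage (e.g. ranks entering the collective out of order, or a
    # desynchronized pipeline schedule); torch.empty would then fail
    # with exactly the "negative dimension" RuntimeError shown above.
    assert all(s >= 0 for s in output_split_sizes), output_split_sizes
    return output_split_sizes
```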
In addition, the following error occurs intermittently:
```
[rank106]: Traceback (most recent call last):
[rank106]:   File "lora_finetune.py", line 481, in <module>
[rank106]:     train(args)
[rank106]:   File "lora_finetune.py", line 308, in train
[rank106]:     outputs = booster.execute_pipeline(
[rank106]:   File "/opt/conda/lib/python3.8/site-packages/colossalai/booster/booster.py", line 221, in execute_pipeline
[rank106]:     return self.plugin.execute_pipeline(data_iter, model, criterion, optimizer, return_loss, return_outputs)
[rank106]:   File "/opt/conda/lib/python3.8/site-packages/colossalai/booster/plugin/hybrid_parallel_plugin.py", line 1409, in execute_pipeline
[rank106]:     outputs = self.scheduler.forward_backward_step(
[rank106]:   File "/opt/conda/lib/python3.8/site-packages/colossalai/pipeline/schedule/one_f_one_b.py", line 472, in forward_backward_step
[rank106]:     result = self.run_forward_backward(model, data_iter, criterion, optimizer, return_loss, return_outputs)
[rank106]:   File "/opt/conda/lib/python3.8/site-packages/colossalai/pipeline/schedule/one_f_one_b.py", line 416, in run_forward_backward
[rank106]:     input_obj_grad = self.backward_step(optimizer, input_obj, output_obj, output_obj_grad)
[rank106]:   File "/opt/conda/lib/python3.8/site-packages/colossalai/pipeline/schedule/one_f_one_b.py", line 314, in backward_step
[rank106]:     optimizer.backward_by_grad(tensors_to_backward[0], grads_to_backward[0])
[rank106]:   File "/opt/conda/lib/python3.8/site-packages/colossalai/booster/plugin/hybrid_parallel_plugin.py", line 832, in backward_by_grad
[rank106]:     super().backward_by_grad(tensor, grad, inputs=inputs, retain_graph=retain_graph)
[rank106]:   File "/opt/conda/lib/python3.8/site-packages/colossalai/zero/low_level/low_level_optim.py", line 479, in backward_by_grad
[rank106]:     torch.autograd.backward(
[rank106]:   File "/opt/conda/lib/python3.8/site-packages/torch/autograd/__init__.py", line 289, in backward
[rank106]:     _engine_run_backward(
[rank106]:   File "/opt/conda/lib/python3.8/site-packages/torch/autograd/graph.py", line 768, in _engine_run_backward
[rank106]:     return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
[rank106]:   File "/opt/conda/lib/python3.8/site-packages/torch/autograd/function.py", line 306, in apply
[rank106]:     return user_fn(self, *args)
[rank106]:   File "/opt/conda/lib/python3.8/site-packages/torch/utils/checkpoint.py", line 296, in backward
[rank106]:     outputs = ctx.run_function(*detached_inputs)
[rank106]:   File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
[rank106]:     return self._call_impl(*args, **kwargs)
[rank106]:   File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
[rank106]:     return forward_call(*args, **kwargs)
[rank106]:   File "/root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py", line 1228, in forward
[rank106]:     hidden_states = self.mlp(hidden_states)
[rank106]:   File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
[rank106]:     return self._call_impl(*args, **kwargs)
[rank106]:   File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
[rank106]:     return forward_call(*args, **kwargs)
[rank106]:   File "/opt/conda/lib/python3.8/site-packages/colossalai/shardformer/modeling/deepseek_v3.py", line 81, in forward
[rank106]:     y = self.moe_forward(hidden_states, topk_idx, topk_weight).view(*orig_shape)
[rank106]:   File "/opt/conda/lib/python3.8/site-packages/colossalai/shardformer/modeling/deepseek_v3.py", line 130, in moe_forward
[rank106]:     tokens_for_this_expert = DPGradScalerIn.apply(tokens_for_this_expert, self.moe_dp_size, activate_experts[i])
[rank106]:   File "/opt/conda/lib/python3.8/site-packages/torch/autograd/function.py", line 574, in apply
[rank106]:     return super().apply(*args, **kwargs)  # type: ignore[misc]
[rank106]:   File "/opt/conda/lib/python3.8/site-packages/colossalai/moe/_operation.py", line 343, in forward
[rank106]:     assert activated_experts != 0, f"shouldn't be called when no expert is activated"
[rank106]: AssertionError: shouldn't be called when no expert is activated
```
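The assertion in `DPGradScalerIn.forward` fires when the scaler is applied for an expert whose recorded activation count is zero. A hedged sketch of the dispatch pattern the assertion presumes (names are assumptions, not the actual code in `shardformer/modeling/deepseek_v3.py`):

```python
import torch

def dispatch_to_local_experts(sorted_tokens, tokens_per_expert, experts):
    # Illustrative dispatch loop: experts that received zero tokens must
    # be skipped before any per-expert gradient scaling is applied,
    # otherwise a scaler like DPGradScalerIn sees an activation count of
    # zero and raises the AssertionError shown above.
    outputs, offset = [], 0
    for i, expert in enumerate(experts):
        n_tokens = int(tokens_per_expert[i])
        if n_tokens == 0:
            continue  # nothing routed here; applying the scaler would assert
        chunk = sorted_tokens[offset : offset + n_tokens]
        outputs.append(expert(chunk))
        offset += n_tokens
    if outputs:
        return torch.cat(outputs, dim=0)
    return sorted_tokens.new_empty((0, sorted_tokens.shape[-1]))
```

That it fires only intermittently suggests the activation counts and the routed token counts occasionally disagree across ranks, which would be consistent with the desynchronization seen in the first traceback.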
Environment
torch 2.4.0

```
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Mon_Apr__3_17:16:06_PDT_2023
Cuda compilation tools, release 12.1, V12.1.105
Build cuda_12.1.r12.1/compiler.32688072_0
```
96 × A100 GPUs; EP=16, PP=3/6; LoRA training.
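For reference, a minimal sketch of the parallel layout described above, assuming ColossalAI's `MoeHybridParallelPlugin`. The EP/PP values come from this report; the remaining arguments (precision, microbatch size, ZeRO stage, which the low-level ZeRO frames in the tracebacks suggest) are assumptions and may differ from the actual `lora_finetune.py`:

```python
import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin import MoeHybridParallelPlugin

# EP=16, PP=3 as reported; zero_stage, precision, and microbatch_size
# are assumptions for illustration.
colossalai.launch_from_torch()
plugin = MoeHybridParallelPlugin(
    tp_size=1,
    pp_size=3,
    ep_size=16,
    zero_stage=1,
    precision="bf16",
    microbatch_size=1,
)
booster = Booster(plugin=plugin)
# model, optimizer, dataloader, etc. would then be wrapped via booster.boost(...)
```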