You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
I have read the README and searched the existing issues.
System Info
llamafactory version: 0.8.2.dev0
Platform: Windows-10-10.0.19045-SP0
Python version: 3.10.14
PyTorch version: 2.0.0+cu117 (GPU)
Transformers version: 4.41.2
Datasets version: 2.18.0
Accelerate version: 0.31.0
PEFT version: 0.11.1
TRL version: 0.9.4
GPU type: NVIDIA GeForce RTX 3090
Reproduction
python src/webui.py
然后正常训练,就产生如下内容
Traceback (most recent call last): File "E:\LLaMA-Factory-main\src\llamafactory\launcher.py", line 15, in <module>
from llamafactory.train.tuner import run_exp
File "E:\LLaMA-Factory-main\src\llamafactory\__init__.py", line 17, in <module>
from .cli import VERSION
File "E:\LLaMA-Factory-main\src\llamafactory\cli.py", line 21, in <module>
from . import launcher
File "E:\LLaMA-Factory-main\src\llamafactory\launcher.py", line 15, in <module>
from llamafactory.train.tuner import run_exp
File "E:\LLaMA-Factory-main\src\llamafactory\train\tuner.py", line 20, in <module>
from ..data import get_template_and_fix_tokenizer
File "E:\LLaMA-Factory-main\src\llamafactory\data\__init__.py", line 16, in <module>
from .data_utils import Role, split_dataset
File "E:\LLaMA-Factory-main\src\llamafactory\data\data_utils.py", line 20, in <module>
from ..extras.logging import get_logger
File "E:\LLaMA-Factory-main\src\llamafactory\extras\logging.py", line 20, in <module>
from .constants import RUNNING_LOG
File "E:\LLaMA-Factory-main\src\llamafactory\extras\constants.py", line 19, in <module>
from peft.utils import SAFETENSORS_WEIGHTS_NAME as SAFE_ADAPTER_WEIGHTS_NAME
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\peft\__init__.py", line 22, in <module>
from .auto import (
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\peft\auto.py", line 32, in <module>
from .mapping import MODEL_TYPE_TO_PEFT_MODEL_MAPPING
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\peft\mapping.py", line 22, in <module>
from .mixed_model import PeftMixedModel
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\peft\mixed_model.py", line 26, in <module>
from peft.tuners.mixed import COMPATIBLE_TUNER_TYPES
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\peft\tuners\__init__.py", line 22, in <module>
from .loha import LoHaConfig, LoHaModel
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\peft\tuners\loha\__init__.py", line 16, in <module>
from .layer import Conv2d, Linear, LoHaLayer
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\peft\tuners\loha\layer.py", line 302, in <module>
class HadaWeight(torch.autograd.Function):
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\peft\tuners\loha\layer.py", line 304, in HadaWeight
def forward(ctx, w1a, w1b, w2a, w2b, scale=torch.tensor(1)):
D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\peft\tuners\loha\layer.py:304: UserWarning: Failed to initialize NumPy: _ARRAY_API not found (Triggered internally at ..\torch\csrc\utils\tensor_numpy.cpp:84.)
def forward(ctx, w1a, w1b, w2a, w2b, scale=torch.tensor(1)):
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.0 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.
If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.
Traceback (most recent call last): File "E:\LLaMA-Factory-main\src\llamafactory\launcher.py", line 15, in
from llamafactory.train.tuner import run_exp
File "E:\LLaMA-Factory-main\src\llamafactory_init_.py", line 17, in
from .cli import VERSION
File "E:\LLaMA-Factory-main\src\llamafactory\cli.py", line 21, in
from . import launcher
File "E:\LLaMA-Factory-main\src\llamafactory\launcher.py", line 15, in
from llamafactory.train.tuner import run_exp
File "E:\LLaMA-Factory-main\src\llamafactory\train\tuner.py", line 20, in
from ..data import get_template_and_fix_tokenizer
File "E:\LLaMA-Factory-main\src\llamafactory\data_init_.py", line 16, in
from .data_utils import Role, split_dataset
File "E:\LLaMA-Factory-main\src\llamafactory\data\data_utils.py", line 20, in
from ..extras.logging import get_logger
File "E:\LLaMA-Factory-main\src\llamafactory\extras\logging.py", line 20, in
from .constants import RUNNING_LOG
File "E:\LLaMA-Factory-main\src\llamafactory\extras\constants.py", line 19, in
from peft.utils import SAFETENSORS_WEIGHTS_NAME as SAFE_ADAPTER_WEIGHTS_NAME
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\peft_init_.py", line 22, in
from .auto import (
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\peft\auto.py", line 32, in
from .mapping import MODEL_TYPE_TO_PEFT_MODEL_MAPPING
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\peft\mapping.py", line 22, in
from .mixed_model import PeftMixedModel
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\peft\mixed_model.py", line 26, in
from peft.tuners.mixed import COMPATIBLE_TUNER_TYPES
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\peft\tuners_init_.py", line 22, in
from .loha import LoHaConfig, LoHaModel
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\peft\tuners\loha_init_.py", line 16, in
from .layer import Conv2d, Linear, LoHaLayer
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\peft\tuners\loha\layer.py", line 302, in
class HadaWeight(torch.autograd.Function):
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\peft\tuners\loha\layer.py", line 304, in HadaWeight
def forward(ctx, w1a, w1b, w2a, w2b, scale=torch.tensor(1)):
D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\peft\tuners\loha\layer.py:304: UserWarning: Failed to initialize NumPy: _ARRAY_API not found (Triggered internally at ..\torch\csrc\utils\tensor_numpy.cpp:84.)
def forward(ctx, w1a, w1b, w2a, w2b, scale=torch.tensor(1)):
[W ..\torch\csrc\distributed\c10d\socket.cpp:601] [c10d] The client socket has failed to connect to [PC-20221025YWFX]:22582 (system error: 10049 - 在其上下文中,该请求的地址无效。).
[W ..\torch\csrc\distributed\c10d\socket.cpp:601] [c10d] The client socket has failed to connect to [PC-20221025YWFX]:22582 (system error: 10049 - 在其上下文中,该请求的地址无效。).
Traceback (most recent call last):
File "E:\LLaMA-Factory-main\src\llamafactory\launcher.py", line 23, in <module>
launch()
File "E:\LLaMA-Factory-main\src\llamafactory\launcher.py", line 19, in launch
run_exp()
File "E:\LLaMA-Factory-main\src\llamafactory\train\tuner.py", line 41, in run_exp
model_args, data_args, training_args, finetuning_args, generating_args = get_train_args(args)
File "E:\LLaMA-Factory-main\src\llamafactory\hparams\parser.py", line 151, in get_train_args
model_args, data_args, training_args, finetuning_args, generating_args = _parse_train_args(args)
File "E:\LLaMA-Factory-main\src\llamafactory\hparams\parser.py", line 137, in _parse_train_args
return _parse_args(parser, args)
File "E:\LLaMA-Factory-main\src\llamafactory\hparams\parser.py", line 61, in _parse_args
return parser.parse_yaml_file(os.path.abspath(sys.argv[1]))
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\transformers\hf_argparser.py", line 423, in parse_yaml_file
outputs = self.parse_dict(yaml.safe_load(Path(yaml_file).read_text()), allow_extra_keys=allow_extra_keys)
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\transformers\hf_argparser.py", line 374, in parse_dict
obj = dtype(**inputs)
File "<string>", line 133, in __init__
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\transformers\training_args.py", line 1641, in __post_init__
and (self.device.type == "cpu" and not is_torch_greater_or_equal_than_2_3)
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\transformers\training_args.py", line 2149, in device
return self._setup_devices
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\transformers\utils\generic.py", line 59, in __get__
cached = self.fget(obj)
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\transformers\training_args.py", line 2081, in _setup_devices
self.distributed_state = PartialState(
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\accelerate\state.py", line 212, in __init__
torch.distributed.init_process_group(backend=self.backend, **kwargs)
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\torch\distributed\distributed_c10d.py", line 895, in init_process_group
default_pg = _new_process_group_helper(
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\torch\distributed\distributed_c10d.py", line 998, in _new_process_group_helper
raise RuntimeError("Distributed package doesn't have NCCL " "built in")
RuntimeError: Distributed package doesn't have NCCL built in
[W ..\torch\csrc\distributed\c10d\socket.cpp:601] [c10d] The client socket has failed to connect to [PC-20221025YWFX]:22582 (system error: 10049 - 在其上下文中,该请求的地址无效。).
[W ..\torch\csrc\distributed\c10d\socket.cpp:601] [c10d] The client socket has failed to connect to [PC-20221025YWFX]:22582 (system error: 10049 - 在其上下文中,该请求的地址无效。).
Traceback (most recent call last):
File "E:\LLaMA-Factory-main\src\llamafactory\launcher.py", line 23, in
launch()
File "E:\LLaMA-Factory-main\src\llamafactory\launcher.py", line 19, in launch
run_exp()
File "E:\LLaMA-Factory-main\src\llamafactory\train\tuner.py", line 41, in run_exp
model_args, data_args, training_args, finetuning_args, generating_args = get_train_args(args)
File "E:\LLaMA-Factory-main\src\llamafactory\hparams\parser.py", line 151, in get_train_args
model_args, data_args, training_args, finetuning_args, generating_args = _parse_train_args(args)
File "E:\LLaMA-Factory-main\src\llamafactory\hparams\parser.py", line 137, in _parse_train_args
return _parse_args(parser, args)
File "E:\LLaMA-Factory-main\src\llamafactory\hparams\parser.py", line 61, in _parse_args
return parser.parse_yaml_file(os.path.abspath(sys.argv[1]))
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\transformers\hf_argparser.py", line 423, in parse_yaml_file
outputs = self.parse_dict(yaml.safe_load(Path(yaml_file).read_text()), allow_extra_keys=allow_extra_keys)
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\transformers\hf_argparser.py", line 374, in parse_dict
obj = dtype(**inputs)
File "", line 133, in init
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\transformers\training_args.py", line 1641, in post_init
and (self.device.type == "cpu" and not is_torch_greater_or_equal_than_2_3)
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\transformers\training_args.py", line 2149, in device
return self._setup_devices
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\transformers\utils\generic.py", line 59, in get
cached = self.fget(obj)
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\transformers\training_args.py", line 2081, in _setup_devices
self.distributed_state = PartialState(
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\accelerate\state.py", line 212, in init
torch.distributed.init_process_group(backend=self.backend, **kwargs)
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\torch\distributed\distributed_c10d.py", line 895, in init_process_group
default_pg = _new_process_group_helper(
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\torch\distributed\distributed_c10d.py", line 998, in _new_process_group_helper
raise RuntimeError("Distributed package doesn't have NCCL " "built in")
RuntimeError: Distributed package doesn't have NCCL built in
[W ..\torch\csrc\distributed\c10d\socket.cpp:601] [c10d] The client socket has failed to connect to [PC-20221025YWFX]:22582 (system error: 10049 - 在其上下文中,该请求的地址无效。).
[W ..\torch\csrc\distributed\c10d\socket.cpp:601] [c10d] The client socket has failed to connect to [PC-20221025YWFX]:22582 (system error: 10049 - 在其上下文中,该请求的地址无效。).
Traceback (most recent call last):
File "E:\LLaMA-Factory-main\src\llamafactory\launcher.py", line 23, in
launch()
File "E:\LLaMA-Factory-main\src\llamafactory\launcher.py", line 19, in launch
run_exp()
File "E:\LLaMA-Factory-main\src\llamafactory\train\tuner.py", line 41, in run_exp
model_args, data_args, training_args, finetuning_args, generating_args = get_train_args(args)
File "E:\LLaMA-Factory-main\src\llamafactory\hparams\parser.py", line 151, in get_train_args
model_args, data_args, training_args, finetuning_args, generating_args = _parse_train_args(args)
File "E:\LLaMA-Factory-main\src\llamafactory\hparams\parser.py", line 137, in _parse_train_args
return _parse_args(parser, args)
File "E:\LLaMA-Factory-main\src\llamafactory\hparams\parser.py", line 61, in _parse_args
return parser.parse_yaml_file(os.path.abspath(sys.argv[1]))
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\transformers\hf_argparser.py", line 423, in parse_yaml_file
outputs = self.parse_dict(yaml.safe_load(Path(yaml_file).read_text()), allow_extra_keys=allow_extra_keys)
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\transformers\hf_argparser.py", line 374, in parse_dict
obj = dtype(**inputs)
File "", line 133, in init
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\transformers\training_args.py", line 1641, in post_init
and (self.device.type == "cpu" and not is_torch_greater_or_equal_than_2_3)
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\transformers\training_args.py", line 2149, in device
return self._setup_devices
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\transformers\utils\generic.py", line 59, in get
cached = self.fget(obj)
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\transformers\training_args.py", line 2081, in _setup_devices
self.distributed_state = PartialState(
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\accelerate\state.py", line 212, in init
torch.distributed.init_process_group(backend=self.backend, **kwargs)
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\torch\distributed\distributed_c10d.py", line 895, in init_process_group
default_pg = _new_process_group_helper(
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\torch\distributed\distributed_c10d.py", line 998, in _new_process_group_helper
raise RuntimeError("Distributed package doesn't have NCCL " "built in")
RuntimeError: Distributed package doesn't have NCCL built in
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 15212) of binary: D:\Users\Administrator\anaconda3\envs\llama_factory\python.exe
Traceback (most recent call last):
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\runpy.py", line 86, in _run_code
exec(code, run_globals)
File "D:\Users\Administrator\anaconda3\envs\llama_factory\Scripts\torchrun.exe\__main__.py", line 7, in <module>
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\torch\distributed\elastic\multiprocessing\errors\__init__.py", line 346, in wrapper
return f(*args, **kwargs)
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\torch\distributed\run.py", line 794, in main
run(args)
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\torch\distributed\run.py", line 785, in run
elastic_launch(
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\torch\distributed\launcher\api.py", line 134, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\torch\distributed\launcher\api.py", line 250, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
Reminder
System Info
llamafactory version: 0.8.2.dev0
Reproduction
python src/webui.py
然后正常训练,就产生如下内容
Traceback (most recent call last): File "E:\LLaMA-Factory-main\src\llamafactory\launcher.py", line 15, in
from llamafactory.train.tuner import run_exp
File "E:\LLaMA-Factory-main\src\llamafactory_init_.py", line 17, in
from .cli import VERSION
File "E:\LLaMA-Factory-main\src\llamafactory\cli.py", line 21, in
from . import launcher
File "E:\LLaMA-Factory-main\src\llamafactory\launcher.py", line 15, in
from llamafactory.train.tuner import run_exp
File "E:\LLaMA-Factory-main\src\llamafactory\train\tuner.py", line 20, in
from ..data import get_template_and_fix_tokenizer
File "E:\LLaMA-Factory-main\src\llamafactory\data_init_.py", line 16, in
from .data_utils import Role, split_dataset
File "E:\LLaMA-Factory-main\src\llamafactory\data\data_utils.py", line 20, in
from ..extras.logging import get_logger
File "E:\LLaMA-Factory-main\src\llamafactory\extras\logging.py", line 20, in
from .constants import RUNNING_LOG
File "E:\LLaMA-Factory-main\src\llamafactory\extras\constants.py", line 19, in
from peft.utils import SAFETENSORS_WEIGHTS_NAME as SAFE_ADAPTER_WEIGHTS_NAME
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\peft_init_.py", line 22, in
from .auto import (
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\peft\auto.py", line 32, in
from .mapping import MODEL_TYPE_TO_PEFT_MODEL_MAPPING
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\peft\mapping.py", line 22, in
from .mixed_model import PeftMixedModel
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\peft\mixed_model.py", line 26, in
from peft.tuners.mixed import COMPATIBLE_TUNER_TYPES
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\peft\tuners_init_.py", line 22, in
from .loha import LoHaConfig, LoHaModel
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\peft\tuners\loha_init_.py", line 16, in
from .layer import Conv2d, Linear, LoHaLayer
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\peft\tuners\loha\layer.py", line 302, in
class HadaWeight(torch.autograd.Function):
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\peft\tuners\loha\layer.py", line 304, in HadaWeight
def forward(ctx, w1a, w1b, w2a, w2b, scale=torch.tensor(1)):
D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\peft\tuners\loha\layer.py:304: UserWarning: Failed to initialize NumPy: _ARRAY_API not found (Triggered internally at ..\torch\csrc\utils\tensor_numpy.cpp:84.)
def forward(ctx, w1a, w1b, w2a, w2b, scale=torch.tensor(1)):
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.0 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.
If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.
Traceback (most recent call last): File "E:\LLaMA-Factory-main\src\llamafactory\launcher.py", line 15, in
from llamafactory.train.tuner import run_exp
File "E:\LLaMA-Factory-main\src\llamafactory_init_.py", line 17, in
from .cli import VERSION
File "E:\LLaMA-Factory-main\src\llamafactory\cli.py", line 21, in
from . import launcher
File "E:\LLaMA-Factory-main\src\llamafactory\launcher.py", line 15, in
from llamafactory.train.tuner import run_exp
File "E:\LLaMA-Factory-main\src\llamafactory\train\tuner.py", line 20, in
from ..data import get_template_and_fix_tokenizer
File "E:\LLaMA-Factory-main\src\llamafactory\data_init_.py", line 16, in
from .data_utils import Role, split_dataset
File "E:\LLaMA-Factory-main\src\llamafactory\data\data_utils.py", line 20, in
from ..extras.logging import get_logger
File "E:\LLaMA-Factory-main\src\llamafactory\extras\logging.py", line 20, in
from .constants import RUNNING_LOG
File "E:\LLaMA-Factory-main\src\llamafactory\extras\constants.py", line 19, in
from peft.utils import SAFETENSORS_WEIGHTS_NAME as SAFE_ADAPTER_WEIGHTS_NAME
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\peft_init_.py", line 22, in
from .auto import (
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\peft\auto.py", line 32, in
from .mapping import MODEL_TYPE_TO_PEFT_MODEL_MAPPING
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\peft\mapping.py", line 22, in
from .mixed_model import PeftMixedModel
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\peft\mixed_model.py", line 26, in
from peft.tuners.mixed import COMPATIBLE_TUNER_TYPES
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\peft\tuners_init_.py", line 22, in
from .loha import LoHaConfig, LoHaModel
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\peft\tuners\loha_init_.py", line 16, in
from .layer import Conv2d, Linear, LoHaLayer
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\peft\tuners\loha\layer.py", line 302, in
class HadaWeight(torch.autograd.Function):
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\peft\tuners\loha\layer.py", line 304, in HadaWeight
def forward(ctx, w1a, w1b, w2a, w2b, scale=torch.tensor(1)):
D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\peft\tuners\loha\layer.py:304: UserWarning: Failed to initialize NumPy: _ARRAY_API not found (Triggered internally at ..\torch\csrc\utils\tensor_numpy.cpp:84.)
def forward(ctx, w1a, w1b, w2a, w2b, scale=torch.tensor(1)):
[W ..\torch\csrc\distributed\c10d\socket.cpp:601] [c10d] The client socket has failed to connect to [PC-20221025YWFX]:22582 (system error: 10049 - 在其上下文中,该请求的地址无效。).
[W ..\torch\csrc\distributed\c10d\socket.cpp:601] [c10d] The client socket has failed to connect to [PC-20221025YWFX]:22582 (system error: 10049 - 在其上下文中,该请求的地址无效。).
Traceback (most recent call last):
File "E:\LLaMA-Factory-main\src\llamafactory\launcher.py", line 23, in
launch()
File "E:\LLaMA-Factory-main\src\llamafactory\launcher.py", line 19, in launch
run_exp()
File "E:\LLaMA-Factory-main\src\llamafactory\train\tuner.py", line 41, in run_exp
model_args, data_args, training_args, finetuning_args, generating_args = get_train_args(args)
File "E:\LLaMA-Factory-main\src\llamafactory\hparams\parser.py", line 151, in get_train_args
model_args, data_args, training_args, finetuning_args, generating_args = _parse_train_args(args)
File "E:\LLaMA-Factory-main\src\llamafactory\hparams\parser.py", line 137, in _parse_train_args
return _parse_args(parser, args)
File "E:\LLaMA-Factory-main\src\llamafactory\hparams\parser.py", line 61, in _parse_args
return parser.parse_yaml_file(os.path.abspath(sys.argv[1]))
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\transformers\hf_argparser.py", line 423, in parse_yaml_file
outputs = self.parse_dict(yaml.safe_load(Path(yaml_file).read_text()), allow_extra_keys=allow_extra_keys)
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\transformers\hf_argparser.py", line 374, in parse_dict
obj = dtype(**inputs)
File "", line 133, in init
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\transformers\training_args.py", line 1641, in post_init
and (self.device.type == "cpu" and not is_torch_greater_or_equal_than_2_3)
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\transformers\training_args.py", line 2149, in device
return self._setup_devices
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\transformers\utils\generic.py", line 59, in get
cached = self.fget(obj)
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\transformers\training_args.py", line 2081, in _setup_devices
self.distributed_state = PartialState(
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\accelerate\state.py", line 212, in init
torch.distributed.init_process_group(backend=self.backend, **kwargs)
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\torch\distributed\distributed_c10d.py", line 895, in init_process_group
default_pg = _new_process_group_helper(
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\torch\distributed\distributed_c10d.py", line 998, in _new_process_group_helper
raise RuntimeError("Distributed package doesn't have NCCL " "built in")
RuntimeError: Distributed package doesn't have NCCL built in
[W ..\torch\csrc\distributed\c10d\socket.cpp:601] [c10d] The client socket has failed to connect to [PC-20221025YWFX]:22582 (system error: 10049 - 在其上下文中,该请求的地址无效。).
[W ..\torch\csrc\distributed\c10d\socket.cpp:601] [c10d] The client socket has failed to connect to [PC-20221025YWFX]:22582 (system error: 10049 - 在其上下文中,该请求的地址无效。).
Traceback (most recent call last):
File "E:\LLaMA-Factory-main\src\llamafactory\launcher.py", line 23, in
launch()
File "E:\LLaMA-Factory-main\src\llamafactory\launcher.py", line 19, in launch
run_exp()
File "E:\LLaMA-Factory-main\src\llamafactory\train\tuner.py", line 41, in run_exp
model_args, data_args, training_args, finetuning_args, generating_args = get_train_args(args)
File "E:\LLaMA-Factory-main\src\llamafactory\hparams\parser.py", line 151, in get_train_args
model_args, data_args, training_args, finetuning_args, generating_args = _parse_train_args(args)
File "E:\LLaMA-Factory-main\src\llamafactory\hparams\parser.py", line 137, in _parse_train_args
return _parse_args(parser, args)
File "E:\LLaMA-Factory-main\src\llamafactory\hparams\parser.py", line 61, in _parse_args
return parser.parse_yaml_file(os.path.abspath(sys.argv[1]))
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\transformers\hf_argparser.py", line 423, in parse_yaml_file
outputs = self.parse_dict(yaml.safe_load(Path(yaml_file).read_text()), allow_extra_keys=allow_extra_keys)
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\transformers\hf_argparser.py", line 374, in parse_dict
obj = dtype(**inputs)
File "", line 133, in init
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\transformers\training_args.py", line 1641, in post_init
and (self.device.type == "cpu" and not is_torch_greater_or_equal_than_2_3)
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\transformers\training_args.py", line 2149, in device
return self._setup_devices
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\transformers\utils\generic.py", line 59, in get
cached = self.fget(obj)
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\transformers\training_args.py", line 2081, in _setup_devices
self.distributed_state = PartialState(
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\accelerate\state.py", line 212, in init
torch.distributed.init_process_group(backend=self.backend, **kwargs)
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\torch\distributed\distributed_c10d.py", line 895, in init_process_group
default_pg = _new_process_group_helper(
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\torch\distributed\distributed_c10d.py", line 998, in _new_process_group_helper
raise RuntimeError("Distributed package doesn't have NCCL " "built in")
RuntimeError: Distributed package doesn't have NCCL built in
[W ..\torch\csrc\distributed\c10d\socket.cpp:601] [c10d] The client socket has failed to connect to [PC-20221025YWFX]:22582 (system error: 10049 - 在其上下文中,该请求的地址无效。).
[W ..\torch\csrc\distributed\c10d\socket.cpp:601] [c10d] The client socket has failed to connect to [PC-20221025YWFX]:22582 (system error: 10049 - 在其上下文中,该请求的地址无效。).
Traceback (most recent call last):
File "E:\LLaMA-Factory-main\src\llamafactory\launcher.py", line 23, in
launch()
File "E:\LLaMA-Factory-main\src\llamafactory\launcher.py", line 19, in launch
run_exp()
File "E:\LLaMA-Factory-main\src\llamafactory\train\tuner.py", line 41, in run_exp
model_args, data_args, training_args, finetuning_args, generating_args = get_train_args(args)
File "E:\LLaMA-Factory-main\src\llamafactory\hparams\parser.py", line 151, in get_train_args
model_args, data_args, training_args, finetuning_args, generating_args = _parse_train_args(args)
File "E:\LLaMA-Factory-main\src\llamafactory\hparams\parser.py", line 137, in _parse_train_args
return _parse_args(parser, args)
File "E:\LLaMA-Factory-main\src\llamafactory\hparams\parser.py", line 61, in _parse_args
return parser.parse_yaml_file(os.path.abspath(sys.argv[1]))
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\transformers\hf_argparser.py", line 423, in parse_yaml_file
outputs = self.parse_dict(yaml.safe_load(Path(yaml_file).read_text()), allow_extra_keys=allow_extra_keys)
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\transformers\hf_argparser.py", line 374, in parse_dict
obj = dtype(**inputs)
File "", line 133, in init
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\transformers\training_args.py", line 1641, in post_init
and (self.device.type == "cpu" and not is_torch_greater_or_equal_than_2_3)
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\transformers\training_args.py", line 2149, in device
return self._setup_devices
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\transformers\utils\generic.py", line 59, in get
cached = self.fget(obj)
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\transformers\training_args.py", line 2081, in _setup_devices
self.distributed_state = PartialState(
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\accelerate\state.py", line 212, in init
torch.distributed.init_process_group(backend=self.backend, **kwargs)
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\torch\distributed\distributed_c10d.py", line 895, in init_process_group
default_pg = _new_process_group_helper(
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\torch\distributed\distributed_c10d.py", line 998, in _new_process_group_helper
raise RuntimeError("Distributed package doesn't have NCCL " "built in")
RuntimeError: Distributed package doesn't have NCCL built in
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 15212) of binary: D:\Users\Administrator\anaconda3\envs\llama_factory\python.exe
Traceback (most recent call last):
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\runpy.py", line 196, in _run_module_as_main
return run_code(code, main_globals, None,
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\runpy.py", line 86, in run_code
exec(code, run_globals)
File "D:\Users\Administrator\anaconda3\envs\llama_factory\Scripts\torchrun.exe_main.py", line 7, in
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\torch\distributed\elastic\multiprocessing\errors_init.py", line 346, in wrapper
return f(*args, **kwargs)
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\torch\distributed\run.py", line 794, in main
run(args)
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\torch\distributed\run.py", line 785, in run
elastic_launch(
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\torch\distributed\launcher\api.py", line 134, in call
return launch_agent(self._config, self._entrypoint, list(args))
File "D:\Users\Administrator\anaconda3\envs\llama_factory\lib\site-packages\torch\distributed\launcher\api.py", line 250, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
E:\LLaMA-Factory-main\src\llamafactory\launcher.py FAILED
Failures:
[1]:
time : 2024-06-18_18:16:32
host : PC-20221025YWFX
rank : 1 (local_rank: 1)
exitcode : 1 (pid: 16608)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[2]:
time : 2024-06-18_18:16:32
host : PC-20221025YWFX
rank : 2 (local_rank: 2)
exitcode : 1 (pid: 2128)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
Root Cause (first observed failure):
[0]:
time : 2024-06-18_18:16:32
host : PC-20221025YWFX
rank : 0 (local_rank: 0)
exitcode : 1 (pid: 15212)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
Expected behavior
No response
Others
No response
The text was updated successfully, but these errors were encountered: