Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rework network interfaces args in horovod.run and horovodrun #3506

Merged
merged 3 commits into from Apr 27, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
13 changes: 9 additions & 4 deletions horovod/runner/__init__.py
Expand Up @@ -112,7 +112,9 @@ def run(
use_gloo=None,
use_mpi=None,
mpi_args=None,
# network_interface is deprecated, use network_interfaces instead
network_interface=None,
network_interfaces=None,
executable=None,
# np is deprecated, use num_proc instead
np=None,
Expand Down Expand Up @@ -174,13 +176,16 @@ def run(
:param use_mpi: Run Horovod using the MPI controller. This will
be the default if Horovod was built with MPI support.
:param mpi_args: Extra arguments for the MPI controller. This is only used when use_mpi is True.
:param network_interface: Network interfaces to use for communication separated by comma. If
not specified, Horovod will find the common NICs among all the
workers and use those; example, eth0,eth1.
:param network_interfaces: List of network interfaces to use for communication. If not specified,
Horovod will find the common NICs among all the workers and use those.
Example: ["eth0", "eth1"].
:param executable: Optional executable to run when launching the workers. Defaults to `sys.executable`.
:return: Return a list which contains values return by all Horovod processes.
The index of the list corresponds to the rank of each Horovod process.
"""
if network_interface:
network_interfaces = network_interface.split(',')
warnings.warn('network_interface is deprecated, use network_interfaces instead', DeprecationWarning)
if np is not None:
num_proc = np
warnings.warn('np is deprecated, use num_proc instead', DeprecationWarning)
Expand Down Expand Up @@ -224,7 +229,7 @@ def wrapped_func():
hargs.verbose = verbose
hargs.use_gloo = use_gloo
hargs.use_mpi = use_mpi
hargs.nics = network_interface
hargs.nics = set(network_interfaces) if network_interfaces else None
hargs.run_func = wrapped_func
hargs.executable = executable

Expand Down
2 changes: 1 addition & 1 deletion horovod/runner/common/util/settings.py
Expand Up @@ -42,7 +42,7 @@ def __init__(self, num_proc=None, verbose=0, ssh_port=None, ssh_identity_file=No
:param run_func_mode: whether it is run function mode
:type run_func_mode: boolean
:param nics: specify the NICs to be used for tcp network communication.
:type nics: string
:type nics: Iterable[str]
:param elastic: enable elastic auto-scaling and fault tolerance mode
:type elastic: boolean
:param prefix_output_with_timestamp: shows timestamp in stdout/stderr forwarding on the driver
Expand Down
58 changes: 53 additions & 5 deletions horovod/runner/launch.py
Expand Up @@ -240,6 +240,49 @@ def __call__(self, parser, args, values, option_string=None):
return StoreOverrideBoolAction


def make_nic_action(deprecated):
# This is an append Action that splits the values on ','
# where that splitting can be marked deprecated, depending on the actual command line argument used
class NicAction(argparse.Action):
def __init__(self,
option_strings,
dest,
default=None,
type=None,
choices=None,
required=False,
help=None):
super(NicAction, self).__init__(
option_strings=option_strings,
dest=dest,
nargs=1,
default=default,
type=type,
choices=choices,
required=required,
help=help)

def __call__(self, parser, args, values, option_string=None):
if ',' in values[0]:
if deprecated:
# providing a list of interfaces is deprecated, so we warn but split it anyway
warnings.warn(f'Providing multiple interfaces via --network-interface is deprecated. '
f'Please use --network-interfaces instead, or use '
f'multiple --network-interface arguments with a '
f'single network interface each.',
DeprecationWarning)
values = values[0].split(',')

# union the existing dest nics with the new ones
items = getattr(args, self.dest, None)
items = set() if items is None else items
items = items.union(values)

setattr(args, self.dest, items)

return NicAction


def parse_args():
override_args = set()

Expand Down Expand Up @@ -273,10 +316,18 @@ def parse_args():
'HOROVOD_START_TIMEOUT can also be used to '
'specify the initialization timeout.')

parser.add_argument('--network-interface', action='store', dest='nics',
parser.add_argument('--network-interfaces', action=make_nic_action(deprecated=False), dest='nics',
help='Network interfaces that can be used for communication separated by '
'comma. If not specified, Horovod will find the common NICs among all '
maxhgerlach marked this conversation as resolved.
Show resolved Hide resolved
'the workers and use it; example, --network-interface "eth0,eth1".')
'the workers and use it. Also see --network-interface. '
'Example: --network-interfaces "eth0,eth1".')

# once deprecated use of --network-interface is removed, replace action=NicAction with action='append'
parser.add_argument('--network-interface', action=make_nic_action(deprecated=True), dest='nics',
help='Network interface that can be used for communication. Use multiple times '
'to configure multiple interfaces. If not specified, Horovod will find the'
'common NICs among all the workers and use it. Also see --network-interfaces. '
'Example: --network-interface eth0 --network-interface eth1')

parser.add_argument('--output-filename', action='store',
help='For Gloo, writes stdout / stderr of all processes to a filename of the form '
Expand Down Expand Up @@ -756,9 +807,6 @@ def _run(args):
# Set hosts to localhost if not specified
args.hosts = f'localhost:{args.num_proc}'

# Convert nics into set
args.nics = set(args.nics.split(',')) if args.nics else None

if _is_elastic(args):
return _run_elastic(args)
else:
Expand Down