Skip to content

Commit

Permalink
Rework network interfaces args in horovod.run and horovodrun
Browse files Browse the repository at this point in the history
  • Loading branch information
EnricoMi committed Apr 6, 2022
1 parent 133ef07 commit 6f383c0
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 10 deletions.
13 changes: 9 additions & 4 deletions horovod/runner/__init__.py
Expand Up @@ -112,7 +112,9 @@ def run(
use_gloo=None,
use_mpi=None,
mpi_args=None,
# network_interface is deprecated, use network_interfaces instead
network_interface=None,
network_interfaces=None,
executable=None,
# np is deprecated, use num_proc instead
np=None,
Expand Down Expand Up @@ -174,13 +176,16 @@ def run(
:param use_mpi: Run Horovod using the MPI controller. This will
be the default if Horovod was built with MPI support.
:param mpi_args: Extra arguments for the MPI controller. This is only used when use_mpi is True.
:param network_interface: Network interfaces to use for communication separated by comma. If
not specified, Horovod will find the common NICs among all the
workers and use those; example, eth0,eth1.
:param network_interfaces: List of network interfaces to use for communication. If not specified,
Horovod will find the common NICs among all the workers and use those.
Example: ["eth0", "eth1"].
:param executable: Optional executable to run when launching the workers. Defaults to `sys.executable`.
:return: Return a list which contains values return by all Horovod processes.
The index of the list corresponds to the rank of each Horovod process.
"""
if network_interface:
network_interfaces = network_interface.split(',')
warnings.warn('network_interface is deprecated, use network_interfaces instead', DeprecationWarning)
if np is not None:
num_proc = np
warnings.warn('np is deprecated, use num_proc instead', DeprecationWarning)
Expand Down Expand Up @@ -224,7 +229,7 @@ def wrapped_func():
hargs.verbose = verbose
hargs.use_gloo = use_gloo
hargs.use_mpi = use_mpi
hargs.nics = network_interface
hargs.nics = set(network_interfaces) if network_interfaces else None
hargs.run_func = wrapped_func
hargs.executable = executable

Expand Down
2 changes: 1 addition & 1 deletion horovod/runner/common/util/settings.py
Expand Up @@ -42,7 +42,7 @@ def __init__(self, num_proc=None, verbose=0, ssh_port=None, ssh_identity_file=No
:param run_func_mode: whether it is run function mode
:type run_func_mode: boolean
:param nics: specify the NICs to be used for tcp network communication.
:type nics: string
:type nics: Iterable[str]
:param elastic: enable elastic auto-scaling and fault tolerance mode
:type elastic: boolean
:param prefix_output_with_timestamp: shows timestamp in stdout/stderr forwarding on the driver
Expand Down
58 changes: 53 additions & 5 deletions horovod/runner/launch.py
Expand Up @@ -240,6 +240,49 @@ def __call__(self, parser, args, values, option_string=None):
return StoreOverrideBoolAction


def make_nic_action(deprecated):
# This is an append Action that splits the values on ','
# where that splitting can be marked deprecated, depending on the actual command line argument used
class NicAction(argparse.Action):
def __init__(self,
option_strings,
dest,
default=None,
type=None,
choices=None,
required=False,
help=None):
super(NicAction, self).__init__(
option_strings=option_strings,
dest=dest,
nargs=1,
default=default,
type=type,
choices=choices,
required=required,
help=help)

def __call__(self, parser, args, values, option_string=None):
if ',' in values[0]:
if deprecated:
# providing a list of interfaces is deprecated, so we warn but split it anyway
warnings.warn(f'Providing multiple interfaces via --network-interface is deprecated. '
f'Please use --network-interfaces instead, or use '
f'multiple --network-interface arguments with a '
f'single network interface each.',
DeprecationWarning)
values = values[0].split(',')

# union the existing dest nics with the new ones
items = getattr(args, self.dest, None)
items = set() if items is None else items
items = items.union(values)

setattr(args, self.dest, items)

return NicAction


def parse_args():
override_args = set()

Expand Down Expand Up @@ -273,10 +316,18 @@ def parse_args():
'HOROVOD_START_TIMEOUT can also be used to '
'specify the initialization timeout.')

parser.add_argument('--network-interface', action='store', dest='nics',
parser.add_argument('--network-interfaces', action=make_nic_action(deprecated=False), dest='nics',
help='Network interfaces that can be used for communication separated by '
'comma. If not specified, Horovod will find the common NICs among all '
'the workers and use it; example, --network-interface "eth0,eth1".')
'the workers and use it. Also see --network-interface. '
'Example: --network-interfaces "eth0,eth1".')

# once deprecated use of --network-interface is removed, replace action=NicAction with action='append'
parser.add_argument('--network-interface', action=make_nic_action(deprecated=True), dest='nics',
help='Network interface that can be used for communication. Use multiple times '
'to configure multiple interfaces. If not specified, Horovod will find the'
'common NICs among all the workers and use it. Also see --network-interfaces. '
'Example: --network-interface eth0 --network-interface eth1')

parser.add_argument('--output-filename', action='store',
help='For Gloo, writes stdout / stderr of all processes to a filename of the form '
Expand Down Expand Up @@ -756,9 +807,6 @@ def _run(args):
# Set hosts to localhost if not specified
args.hosts = f'localhost:{args.num_proc}'

# Convert nics into set
args.nics = set(args.nics.split(',')) if args.nics else None

if _is_elastic(args):
return _run_elastic(args)
else:
Expand Down

0 comments on commit 6f383c0

Please sign in to comment.