Skip to content
Permalink
Browse files

Build Gloo separately for each plugin (#1209)

* Build Gloo separately for each plugin

Signed-off-by: Sihan Zeng <zsh@uber.com>

* fix format

Signed-off-by: Sihan Zeng <zsh@uber.com>
  • Loading branch information...
zsh-thu authored and alsrgv committed Jul 11, 2019
1 parent 35b55c5 commit 958695e7343ce470ad3b0d9df1967b5af3bd6ec3
Showing with 46 additions and 14 deletions.
  1. +46 −14 setup.py
@@ -22,6 +22,7 @@
import sys
import textwrap
import traceback
from copy import deepcopy
from distutils.errors import CompileError, DistutilsError, \
DistutilsPlatformError, LinkError
from distutils.sysconfig import customize_compiler
@@ -775,6 +776,7 @@ def build_tf_extension(build_ext, options):
tensorflow_mpi_lib.extra_compile_args = options['COMPILE_FLAGS'] + \
tf_compile_flags
tensorflow_mpi_lib.extra_link_args = options['LINK_FLAGS'] + tf_link_flags

tensorflow_mpi_lib.library_dirs = options['LIBRARY_DIRS']
tensorflow_mpi_lib.libraries = options['LIBRARIES']

@@ -826,6 +828,8 @@ def build_tf_extension(build_ext, options):
try:
with env(CC=cc_compiler, CXX=cxx_compiler, CFLAGS=cflags, CPPFLAGS=cppflags,
LDSHARED=ldshared):
if options['BUILD_GLOO']:
build_cmake(build_ext, gloo_lib, 'tf', tensorflow_mpi_lib)
customize_compiler(build_ext.compiler)
build_ext.build_extension(tensorflow_mpi_lib)
finally:
@@ -899,7 +903,15 @@ def is_mx_cuda():
return False


def build_mx_extension(build_ext, options):
def build_mx_extension(build_ext, global_options):
# Work on a copy of the options so that other plugins do not pick up
# libraries that were compiled with this plugin's compiler
options = deepcopy(global_options)

# First build gloo
if options['BUILD_GLOO']:
build_cmake(build_ext, gloo_lib, 'mxnet', options=options)

check_mx_version()
mx_compile_flags, mx_link_flags = get_mx_flags(
build_ext, options['COMPILE_FLAGS'])
@@ -1036,13 +1048,21 @@ def __exit__(self, type, value, traceback):
os.rename(file + '.protected', file)


def build_torch_extension(build_ext, options, torch_version):
def build_torch_extension(build_ext, global_options, torch_version):
# Work on a copy of the options so that other plugins do not pick up
# libraries that were compiled with this plugin's compiler
options = deepcopy(global_options)

have_cuda = is_torch_cuda()
if not have_cuda and check_macro(options['MACROS'], 'HAVE_CUDA'):
raise DistutilsPlatformError(
'Horovod build with GPU support was requested, but this PyTorch '
'installation does not support CUDA.')

# Build gloo
if options['BUILD_GLOO']:
build_cmake(build_ext, gloo_lib, 'torch', options=options)

# Update HAVE_CUDA to mean that PyTorch supports CUDA. Internally, we will be checking
# HOROVOD_GPU_(ALLREDUCE|ALLGATHER|BROADCAST) to decide whether we should use GPU
# version or transfer tensors to CPU memory for those operations.
@@ -1183,17 +1203,29 @@ def build_torch_extension_v2(build_ext, options, torch_version):
try:
with env(CC=cc_compiler, CXX=cxx_compiler, CFLAGS=cflags, CPPFLAGS=cppflags,
LDSHARED=ldshared):
if options['BUILD_GLOO']:
build_cmake(build_ext, gloo_lib, 'torchv2', ext)
customize_compiler(build_ext.compiler)
build_ext.build_extension(torch_mpi_lib_v2)
finally:
# Revert to the default compiler settings
customize_compiler(build_ext.compiler)


def build_cmake(build_ext, ext, output_dir, options):
def build_cmake(build_ext, ext, prefix, plugin_ext=None, options=None):
cmake_bin = 'cmake'

# Statically linked archive files go into the provided output directory
# All statically linked libraries will be placed here
lib_output_dir = os.path.abspath(os.path.join(build_ext.build_temp, 'lib', prefix))
if not os.path.exists(lib_output_dir):
os.makedirs(lib_output_dir)

if plugin_ext:
plugin_ext.library_dirs += [lib_output_dir]

if options:
options['LIBRARY_DIRS'] += [lib_output_dir]

extdir = os.path.abspath(
os.path.dirname(build_ext.get_ext_fullpath(ext.name)))
config = 'Debug' if build_ext.debug else 'Release'
@@ -1202,7 +1234,7 @@ def build_cmake(build_ext, ext, output_dir, options):
'-DCMAKE_BUILD_TYPE=' + config,
'-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{}={}'.format(config.upper(), extdir),
'-DCMAKE_ARCHIVE_OUTPUT_DIRECTORY_{}={}'.format(config.upper(),
output_dir),
lib_output_dir),
]
cmake_build_args = [
'--config', config,
@@ -1223,8 +1255,12 @@ def build_cmake(build_ext, ext, output_dir, options):
except OSError as e:
raise RuntimeError('CMake failed: {}'.format(str(e)))

# Add the library so other extensions will link against it during compilation
options['LIBRARIES'] += [ext.name]
# Add the library so the plugin will link against it during compilation
if plugin_ext:
plugin_ext.libraries += [ext.name]

if options:
options['LIBRARIES'] += [ext.name]


# run the customize_compiler
@@ -1233,23 +1269,19 @@ def build_extensions(self):
options = get_common_options(self)
built_plugins = []

# All statically linked libraries will be placed here
lib_output_dir = os.path.abspath(os.path.join(self.build_temp, 'lib'))
if not os.path.exists(lib_output_dir):
os.makedirs(lib_output_dir)
options['LIBRARY_DIRS'] += [lib_output_dir]

if is_mac:
print('INFO: Submodule Gloo cannot compile on MacOS, skip compiling '
'Gloo.')
options['BUILD_GLOO'] = False
elif not have_cmake:
# TODO: install cmake in local environment after entry point issue
# has some updates.
# https://github.com/scikit-build/cmake-python-distributions/issues/80
print('INFO: Submodule Gloo cannot compile without CMake, '
'skip compiling Gloo.')
options['BUILD_GLOO'] = False
else:
build_cmake(self, gloo_lib, lib_output_dir, options)
options['BUILD_GLOO'] = True

# If PyTorch is installed, it must be imported before TensorFlow, otherwise
# we may get an error: dlopen: cannot load any more object with static TLS

0 comments on commit 958695e

Please sign in to comment.
You can’t perform that action at this time.