Skip to content

Commit

Permalink
common: fix rdma fork problem (bytedance#192)
Browse files Browse the repository at this point in the history
* tools: add new launcher and improve sdist

* add manifest

* add bpslaunch

* common: fix rdma-fork problem

* update pslite submodule

* setup.py: improvement

* setup: fix rdma cpp flag

* setup: add warning for rdma header detection
  • Loading branch information
ymjiang authored and jasperzhong committed Jan 28, 2020
1 parent 7eba0ae commit 0fa57c5
Show file tree
Hide file tree
Showing 7 changed files with 32 additions and 5 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ lib64/
parts/
sdist/
var/
bin/
wheels/
*.egg-info/
.installed.cfg
Expand Down
2 changes: 1 addition & 1 deletion 3rdparty/ps-lite
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
include */*
recursive-include * *.cc *.h
graft 3rdparty/ps-lite
1 change: 1 addition & 0 deletions byteps/common/global.h
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ class BytePSGlobal {
// Atomically increments joined_thread_cnt by one.
static void ReportThreadFinish() { ++joined_thread_cnt; }
static bool IsAllThreadFinish(int total_thread_num);
static std::atomic_int joined_thread_cnt;
// Returns RoundUp(x, _pagesize), i.e. x rounded by the class's RoundUp helper
// using _pagesize as the second argument.
// NOTE(review): both the parameter and the result are int; a caller passing a
// size_t larger than INT_MAX would be narrowed on the way in -- confirm that
// such sizes cannot occur.
static int RoundUpToPageSize(int x) {
  const int rounded = RoundUp(x, _pagesize);
  return rounded;
}

private:
static std::mutex _init_mutex;
Expand Down
1 change: 1 addition & 0 deletions byteps/common/shared_memory.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ namespace common {

void* BytePSSharedMemory::openSharedMemory(const std::string& prefix,
uint64_t key, size_t size) {
size = BytePSGlobal::RoundUpToPageSize(size);
std::string shm_name(prefix);
shm_name += std::to_string(key);
int shm_fd = shm_open(shm_name.c_str(), O_CREAT | O_RDWR, 0666);
Expand Down
6 changes: 5 additions & 1 deletion launcher/launch.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def worker(local_rank, local_size, command):
os.makedirs(trace_path)
subprocess.check_call(command, env=my_env, stdout=sys.stdout, stderr=sys.stderr, shell=True)

if __name__ == "__main__":
def launch_bps():
print("BytePS launching " + os.environ["DMLC_ROLE"])
sys.stdout.flush()
check_env()
Expand All @@ -67,3 +67,7 @@ def worker(local_rank, local_size, command):

else:
import byteps.server


if __name__ == "__main__":
launch_bps()
25 changes: 22 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import os
import sys
import re
import shutil
from shutil import rmtree
import textwrap
import shlex
Expand Down Expand Up @@ -227,6 +228,13 @@ def get_link_flags(build_ext):

raise DistutilsPlatformError(last_err)

def has_rdma_header():
    """Report whether the RDMA header ``<rdma/rdma_cma.h>`` can be preprocessed.

    The check pipes a one-line ``#include`` into ``cpp`` and treats a zero exit
    status of that shell pipeline as "header available". When the check fails,
    a warning is emitted saying RDMA will be disabled for compilation.

    The outcome is cached on the function object after the first call: this
    function is consulted several times during a single build, and re-running
    the shell/preprocessor probe each time cannot yield a different answer.

    Returns:
        bool: True if the probe command exited with status 0, False otherwise.
    """
    cached = getattr(has_rdma_header, '_cached_result', None)
    if cached is not None:
        return cached

    # A constant command string (no external input), so shell=True is safe here.
    ret_code = subprocess.call(
        "echo '#include <rdma/rdma_cma.h>' | cpp -H -o /dev/null 2>/dev/null", shell=True)
    found = (ret_code == 0)
    if not found:
        import warnings
        warnings.warn("\n\n No RDMA header file detected. Will disable RDMA for compilation! \n\n")

    # Remember the answer so later calls skip the subprocess and the warning.
    has_rdma_header._cached_result = found
    return found

def get_common_options(build_ext):
cpp_flags = get_cpp_flags(build_ext)
Expand Down Expand Up @@ -270,7 +278,9 @@ def get_common_options(build_ext):

# RDMA and NUMA libs
LIBRARIES += ['numa']
if int(os.environ.get('BYTEPS_USE_RDMA', 0)):

# auto-detect rdma
if has_rdma_header():
LIBRARIES += ['rdmacm', 'ibverbs', 'rt']

# ps-lite
Expand Down Expand Up @@ -305,7 +315,9 @@ def build_server(build_ext, options):
server_lib.extra_link_args = options['LINK_FLAGS']
server_lib.extra_objects = options['EXTRA_OBJECTS']
server_lib.library_dirs = options['LIBRARY_DIRS']
if int(os.environ.get('BYTEPS_USE_RDMA', 0)):

# auto-detect rdma
if has_rdma_header():
server_lib.libraries = ['rdmacm', 'ibverbs', 'rt']
else:
server_lib.libraries = []
Expand Down Expand Up @@ -791,7 +803,7 @@ def build_extensions(self):
make_option = ""
if os.environ.get('CI', 'false') == 'false':
make_option += "-j "
if int(os.environ.get('BYTEPS_USE_RDMA', 0)):
if has_rdma_header():
make_option += "USE_RDMA=1 "

make_process = subprocess.Popen('make ' + make_option,
Expand Down Expand Up @@ -866,6 +878,12 @@ def build_extensions(self):


# Where the magic happens:

# Stage the launcher as bin/bpslaunch before setup() runs: when
# launcher/launch.py exists, make sure the bin directory is present and copy
# the launcher into it under the name bpslaunch.
_launcher_src = 'launcher/launch.py'
_bin_dir = 'bin'
if os.path.exists(_launcher_src):
    if not os.path.exists(_bin_dir):
        os.mkdir(_bin_dir)
    shutil.copyfile(_launcher_src, 'bin/bpslaunch')

setup(
name=NAME,
version=about['__version__'],
Expand Down Expand Up @@ -902,4 +920,5 @@ def build_extensions(self):
# which is undesirable. Luckily, `install` action will install cffi before executing build,
# so it's only necessary for `build*` or `bdist*` actions.
setup_requires=[],
scripts=['bin/bpslaunch']
)

0 comments on commit 0fa57c5

Please sign in to comment.