Skip to content

Commit

Permalink
Provide the timeout value in useful places
Browse files: browse the repository at this point in the history
- Also adds spark_require_list to dev_require_list so that
    pip install -e .[dev]
  also installs the Spark packages.
- Also pins h5py<3 in spark_require_list (see https://github.com/h5py/h5py/issues/1732).

Signed-off-by: Enrico Minack <github@enrico.minack.dev>
  • Loading branch information
EnricoMi committed Nov 11, 2020
1 parent 5daa029 commit a3e7574
Show file tree
Hide file tree
Showing 5 changed files with 27 additions and 19 deletions.
2 changes: 1 addition & 1 deletion horovod/common/gloo/gloo_context.cc
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ std::shared_ptr<gloo::Context> Rendezvous(const std::string& prefix,
store.reset(new MemoryStore());
}
LOG(DEBUG) << prefix << " rendezvous started for rank=" << rank << ", size=" << size
<< ", dev={" << dev->str() << "}";
<< ", dev={" << dev->str() << "}, timeout=" << timeout;

auto context = std::make_shared<gloo::rendezvous::Context>(rank, size);
context->setTimeout(timeout);
Expand Down
2 changes: 1 addition & 1 deletion horovod/common/gloo/http_store.cc
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ void HTTPStore::wait(const std::vector<std::string>& keys,
const auto elapsed = std::chrono::duration_cast<std::chrono::seconds>(
std::chrono::steady_clock::now() - start);
if (timeout != gloo::kNoTimeout && elapsed > timeout) {
GLOO_THROW_IO_EXCEPTION(GLOO_ERROR_MSG("Wait timeout for key(s): ",
GLOO_THROW_IO_EXCEPTION(GLOO_ERROR_MSG("Wait timeout after ", timeout, " seconds for key(s): ",
::gloo::MakeString(keys)));
}
/* sleep override */
Expand Down
2 changes: 1 addition & 1 deletion horovod/common/gloo/memory_store.cc
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ void MemoryStore::wait(const std::vector<std::string>& keys,
auto now = std::chrono::steady_clock::now();
const auto elapsed = std::chrono::duration_cast<std::chrono::seconds>(now - start);
if (timeout != gloo::kNoTimeout && elapsed > timeout) {
GLOO_THROW_IO_EXCEPTION(GLOO_ERROR_MSG("Wait timeout for key(s): ",
GLOO_THROW_IO_EXCEPTION(GLOO_ERROR_MSG("Wait timeout after ", timeout, " seconds for key(s): ",
::gloo::MakeString(keys)));
}
std::this_thread::sleep_for(std::chrono::milliseconds(10));
Expand Down
9 changes: 8 additions & 1 deletion horovod/runner/common/util/timeout.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

class Timeout(object):
def __init__(self, timeout, message):
self._timeout = timeout
self._timeout_at = time.time() + timeout
self._message = message

Expand All @@ -29,4 +30,10 @@ def timed_out(self):

def check_time_out_for(self, activity):
if self.timed_out():
raise Exception(self._message.format(activity=activity))
raise Exception(
'{}{} Timeout after {} seconds.'.format(
self._message.format(activity=activity),
'.' if not self._message.rstrip().endswith('.') else '',
self._timeout
)
)
31 changes: 16 additions & 15 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,20 +96,6 @@ def build_extensions(self):
# python packages required to use horovod in general
require_list = ['cloudpickle', 'psutil', 'pyyaml', 'dataclasses;python_version<"3.7"']

# python packages required / recommended to develop horovod
# e.g., set of framework versions pinned for development, keep in sync with Dockerfile.test.cpu
# NOTE: do not use versions with +cpu or +gpu here as users would need to add --find-links to pip
dev_require_list = ['tensorflow-cpu==1.15.0',
'keras==2.2.4',
'torch==1.2.0',
'torchvision==0.4.0',
'mxnet==1.5.0',
'pyspark==2.4.7']

# python packages required only to run tests
# Pin h5py: https://github.com/h5py/h5py/issues/1732
test_require_list = ['mock', 'pytest', 'pytest-forked', 'parameterized', 'h5py<3']

# framework dependencies
tensorflow_require_list = ['tensorflow']
tensorflow_cpu_require_list = ['tensorflow-cpu']
Expand All @@ -119,7 +105,8 @@ def build_extensions(self):
mxnet_require_list = ['mxnet>=1.4.1']
pyspark_require_list = ['pyspark>=2.3.2;python_version<"3.8"',
'pyspark>=3.0.0;python_version>="3.8"']
spark_require_list = ['h5py>=2.9', 'numpy', 'petastorm>=0.9.0,!=0.9.3', 'pyarrow>=0.15.0'] + \
# Pin h5py: https://github.com/h5py/h5py/issues/1732
spark_require_list = ['h5py<3', 'numpy', 'petastorm>=0.9.0,!=0.9.3', 'pyarrow>=0.15.0'] + \
pyspark_require_list
ray_require_list = ['ray']

Expand All @@ -131,6 +118,20 @@ def build_extensions(self):
mxnet_require_list + \
spark_require_list

# python packages required / recommended to develop horovod
# e.g., set of framework versions pinned for development, keep in sync with Dockerfile.test.cpu
# NOTE: do not use versions with +cpu or +gpu here as users would need to add --find-links to pip
dev_require_list = ['tensorflow-cpu==1.15.0',
'keras==2.2.4',
'torch==1.2.0',
'torchvision==0.4.0',
'mxnet==1.5.0',
'pyspark==2.4.7'] + spark_require_list

# python packages required only to run tests
# Pin h5py: https://github.com/h5py/h5py/issues/1732
test_require_list = ['mock', 'pytest', 'pytest-forked', 'parameterized', 'h5py<3']

# Skip cffi if pytorch extension explicitly disabled
if not os.environ.get('HOROVOD_WITHOUT_PYTORCH'):
require_list.append('cffi>=1.4.0')
Expand Down

0 comments on commit a3e7574

Please sign in to comment.