Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MXNet: use MXEnginePushAsync C API to push horovod operations #985

Merged
merged 6 commits into from Apr 19, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
40 changes: 20 additions & 20 deletions .buildkite/gen-pipeline.sh
Expand Up @@ -8,26 +8,26 @@ repository=823773083436.dkr.ecr.us-east-1.amazonaws.com/buildkite

# list of all the tests
tests=( \
test-cpu-openmpi-py2_7-tf1_1_0-keras2_0_0-torch0_4_0-mxnet1_4_0-pyspark2_1_2 \
test-cpu-openmpi-py3_5-tf1_1_0-keras2_0_0-torch0_4_0-mxnet1_4_0-pyspark2_1_2 \
test-cpu-openmpi-py3_6-tf1_1_0-keras2_0_0-torch0_4_0-mxnet1_4_0-pyspark2_1_2 \
test-cpu-openmpi-py2_7-tf1_6_0-keras2_1_2-torch0_4_1-mxnet1_4_0-pyspark2_3_2 \
test-cpu-openmpi-py3_5-tf1_6_0-keras2_1_2-torch0_4_1-mxnet1_4_0-pyspark2_3_2 \
test-cpu-openmpi-py3_6-tf1_6_0-keras2_1_2-torch0_4_1-mxnet1_4_0-pyspark2_3_2 \
test-cpu-openmpi-py2_7-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_0-pyspark2_4_0 \
test-cpu-openmpi-py3_5-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_0-pyspark2_4_0 \
test-cpu-openmpi-py3_6-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_0-pyspark2_4_0 \
test-cpu-openmpi-py2_7-tfhead-kerashead-torchhead-mxnet1_4_0-pyspark2_4_0 \
test-cpu-openmpi-py3_6-tfhead-kerashead-torchhead-mxnet1_4_0-pyspark2_4_0 \
test-cpu-mpich-py2_7-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_0-pyspark2_4_0 \
test-gpu-openmpi-py2_7-tf1_6_0-keras2_1_2-torch0_4_1-mxnet1_4_0-pyspark2_3_2 \
test-gpu-openmpi-py3_5-tf1_6_0-keras2_1_2-torch0_4_1-mxnet1_4_0-pyspark2_3_2 \
test-gpu-openmpi-py2_7-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_0-pyspark2_4_0 \
test-gpu-openmpi-py3_5-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_0-pyspark2_4_0 \
test-gpu-openmpi-py2_7-tfhead-kerashead-torchhead-mxnet1_4_0-pyspark2_4_0 \
test-gpu-openmpi-py3_6-tfhead-kerashead-torchhead-mxnet1_4_0-pyspark2_4_0 \
test-gpu-mpich-py2_7-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_0-pyspark2_4_0 \
test-mixed-openmpi-py2_7-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_0-pyspark2_4_0 \
test-cpu-openmpi-py2_7-tf1_1_0-keras2_0_0-torch0_4_0-mxnet1_5_0-pyspark2_1_2 \
test-cpu-openmpi-py3_5-tf1_1_0-keras2_0_0-torch0_4_0-mxnet1_5_0-pyspark2_1_2 \
test-cpu-openmpi-py3_6-tf1_1_0-keras2_0_0-torch0_4_0-mxnet1_5_0-pyspark2_1_2 \
test-cpu-openmpi-py2_7-tf1_6_0-keras2_1_2-torch0_4_1-mxnet1_5_0-pyspark2_3_2 \
test-cpu-openmpi-py3_5-tf1_6_0-keras2_1_2-torch0_4_1-mxnet1_5_0-pyspark2_3_2 \
test-cpu-openmpi-py3_6-tf1_6_0-keras2_1_2-torch0_4_1-mxnet1_5_0-pyspark2_3_2 \
test-cpu-openmpi-py2_7-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_5_0-pyspark2_4_0 \
test-cpu-openmpi-py3_5-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_5_0-pyspark2_4_0 \
test-cpu-openmpi-py3_6-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_5_0-pyspark2_4_0 \
test-cpu-openmpi-py2_7-tfhead-kerashead-torchhead-mxnethead-pyspark2_4_0 \
test-cpu-openmpi-py3_6-tfhead-kerashead-torchhead-mxnethead-pyspark2_4_0 \
test-cpu-mpich-py2_7-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_5_0-pyspark2_4_0 \
test-gpu-openmpi-py2_7-tf1_6_0-keras2_1_2-torch0_4_1-mxnet1_5_0-pyspark2_3_2 \
test-gpu-openmpi-py3_5-tf1_6_0-keras2_1_2-torch0_4_1-mxnet1_5_0-pyspark2_3_2 \
test-gpu-openmpi-py2_7-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_5_0-pyspark2_4_0 \
test-gpu-openmpi-py3_5-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_5_0-pyspark2_4_0 \
test-gpu-openmpi-py2_7-tfhead-kerashead-torchhead-mxnethead-pyspark2_4_0 \
test-gpu-openmpi-py3_6-tfhead-kerashead-torchhead-mxnethead-pyspark2_4_0 \
test-gpu-mpich-py2_7-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_5_0-pyspark2_4_0 \
test-mixed-openmpi-py2_7-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_5_0-pyspark2_4_0 \
)

build_test() {
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile.test.cpu
Expand Up @@ -9,7 +9,7 @@ ARG PYTHON_VERSION=2.7
ARG TENSORFLOW_PACKAGE=tensorflow==1.12.0
ARG KERAS_PACKAGE=keras==2.2.2
ARG PYTORCH_PACKAGE=torch==1.0.0
ARG MXNET_PACKAGE=mxnet==1.4.0.post0
ARG MXNET_PACKAGE=mxnet==1.5.0b20190412
ARG PYSPARK_PACKAGE=pyspark==2.4.0

# Set default shell to /bin/bash
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile.test.gpu
Expand Up @@ -11,7 +11,7 @@ ARG PYTHON_VERSION=2.7
ARG TENSORFLOW_PACKAGE=tensorflow-gpu==1.12.0
ARG KERAS_PACKAGE=keras==2.2.2
ARG PYTORCH_PACKAGE=torch==1.0.0
ARG MXNET_PACKAGE=mxnet-cu90==1.4.0.post0
ARG MXNET_PACKAGE=mxnet-cu90==1.5.0b20190412
ARG PYSPARK_PACKAGE=pyspark==2.4.0
ARG HOROVOD_BUILD_FLAGS=HOROVOD_GPU_ALLREDUCE=NCCL
ARG HOROVOD_MIXED_INSTALL=0
Expand Down
46 changes: 24 additions & 22 deletions docker-compose.test.yml
Expand Up @@ -5,7 +5,7 @@ services:
context: .
dockerfile: Dockerfile.test.cpu
privileged: true
test-cpu-openmpi-py2_7-tf1_1_0-keras2_0_0-torch0_4_0-mxnet1_4_0-pyspark2_1_2:
test-cpu-openmpi-py2_7-tf1_1_0-keras2_0_0-torch0_4_0-mxnet1_5_0-pyspark2_1_2:
extends: test-cpu-base
build:
args:
Expand All @@ -15,7 +15,7 @@ services:
KERAS_PACKAGE: keras==2.0.0
PYTORCH_PACKAGE: torch==0.4.0
PYSPARK_PACKAGE: pyspark==2.1.2
test-cpu-openmpi-py3_5-tf1_1_0-keras2_0_0-torch0_4_0-mxnet1_4_0-pyspark2_1_2:
test-cpu-openmpi-py3_5-tf1_1_0-keras2_0_0-torch0_4_0-mxnet1_5_0-pyspark2_1_2:
extends: test-cpu-base
build:
args:
Expand All @@ -25,7 +25,7 @@ services:
KERAS_PACKAGE: keras==2.0.0
PYTORCH_PACKAGE: torch==0.4.0
PYSPARK_PACKAGE: pyspark==2.1.2
test-cpu-openmpi-py3_6-tf1_1_0-keras2_0_0-torch0_4_0-mxnet1_4_0-pyspark2_1_2:
test-cpu-openmpi-py3_6-tf1_1_0-keras2_0_0-torch0_4_0-mxnet1_5_0-pyspark2_1_2:
extends: test-cpu-base
build:
args:
Expand All @@ -35,7 +35,7 @@ services:
KERAS_PACKAGE: keras==2.0.0
PYTORCH_PACKAGE: torch==0.4.0
PYSPARK_PACKAGE: pyspark==2.1.2
test-cpu-openmpi-py2_7-tf1_6_0-keras2_1_2-torch0_4_1-mxnet1_4_0-pyspark2_3_2:
test-cpu-openmpi-py2_7-tf1_6_0-keras2_1_2-torch0_4_1-mxnet1_5_0-pyspark2_3_2:
extends: test-cpu-base
build:
args:
Expand All @@ -45,7 +45,7 @@ services:
KERAS_PACKAGE: keras==2.1.2
PYTORCH_PACKAGE: torch==0.4.1
PYSPARK_PACKAGE: pyspark==2.3.2
test-cpu-openmpi-py3_5-tf1_6_0-keras2_1_2-torch0_4_1-mxnet1_4_0-pyspark2_3_2:
test-cpu-openmpi-py3_5-tf1_6_0-keras2_1_2-torch0_4_1-mxnet1_5_0-pyspark2_3_2:
extends: test-cpu-base
build:
args:
Expand All @@ -55,7 +55,7 @@ services:
KERAS_PACKAGE: keras==2.1.2
PYTORCH_PACKAGE: torch==0.4.1
PYSPARK_PACKAGE: pyspark==2.3.2
test-cpu-openmpi-py3_6-tf1_6_0-keras2_1_2-torch0_4_1-mxnet1_4_0-pyspark2_3_2:
test-cpu-openmpi-py3_6-tf1_6_0-keras2_1_2-torch0_4_1-mxnet1_5_0-pyspark2_3_2:
extends: test-cpu-base
build:
args:
Expand All @@ -65,7 +65,7 @@ services:
KERAS_PACKAGE: keras==2.1.2
PYTORCH_PACKAGE: torch==0.4.1
PYSPARK_PACKAGE: pyspark==2.3.2
test-cpu-openmpi-py2_7-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_0-pyspark2_4_0:
test-cpu-openmpi-py2_7-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_5_0-pyspark2_4_0:
extends: test-cpu-base
build:
args:
Expand All @@ -75,7 +75,7 @@ services:
KERAS_PACKAGE: keras==2.2.2
PYTORCH_PACKAGE: torch==1.0.0
PYSPARK_PACKAGE: pyspark==2.4.0
test-cpu-openmpi-py3_5-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_0-pyspark2_4_0:
test-cpu-openmpi-py3_5-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_5_0-pyspark2_4_0:
extends: test-cpu-base
build:
args:
Expand All @@ -85,7 +85,7 @@ services:
KERAS_PACKAGE: keras==2.2.2
PYTORCH_PACKAGE: torch==1.0.0
PYSPARK_PACKAGE: pyspark==2.4.0
test-cpu-openmpi-py3_6-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_0-pyspark2_4_0:
test-cpu-openmpi-py3_6-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_5_0-pyspark2_4_0:
extends: test-cpu-base
build:
args:
Expand All @@ -95,7 +95,7 @@ services:
KERAS_PACKAGE: keras==2.2.2
PYTORCH_PACKAGE: torch==1.0.0
PYSPARK_PACKAGE: pyspark==2.4.0
test-cpu-openmpi-py2_7-tfhead-kerashead-torchhead-mxnet1_4_0-pyspark2_4_0:
test-cpu-openmpi-py2_7-tfhead-kerashead-torchhead-mxnethead-pyspark2_4_0:
extends: test-cpu-base
build:
args:
Expand All @@ -105,7 +105,8 @@ services:
KERAS_PACKAGE: git+https://github.com/keras-team/keras.git
PYTORCH_PACKAGE: torch-nightly
PYSPARK_PACKAGE: pyspark==2.4.0
test-cpu-openmpi-py3_6-tfhead-kerashead-torchhead-mxnet1_4_0-pyspark2_4_0:
MXNET_PACKAGE: mxnet --pre
yuxihu marked this conversation as resolved.
Show resolved Hide resolved
test-cpu-openmpi-py3_6-tfhead-kerashead-torchhead-mxnethead-pyspark2_4_0:
extends: test-cpu-base
build:
args:
Expand All @@ -115,7 +116,8 @@ services:
KERAS_PACKAGE: git+https://github.com/keras-team/keras.git
PYTORCH_PACKAGE: torch-nightly
PYSPARK_PACKAGE: pyspark==2.4.0
test-cpu-mpich-py2_7-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_0-pyspark2_4_0:
MXNET_PACKAGE: mxnet --pre
yuxihu marked this conversation as resolved.
Show resolved Hide resolved
test-cpu-mpich-py2_7-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_5_0-pyspark2_4_0:
extends: test-cpu-base
build:
args:
Expand All @@ -136,7 +138,7 @@ services:
environment:
- CUDA_VISIBLE_DEVICES
privileged: true
test-gpu-openmpi-py2_7-tf1_6_0-keras2_1_2-torch0_4_1-mxnet1_4_0-pyspark2_3_2:
test-gpu-openmpi-py2_7-tf1_6_0-keras2_1_2-torch0_4_1-mxnet1_5_0-pyspark2_3_2:
extends: test-gpu-base
build:
args:
Expand All @@ -147,7 +149,7 @@ services:
KERAS_PACKAGE: keras==2.1.2
PYTORCH_PACKAGE: torch==0.4.1
PYSPARK_PACKAGE: pyspark==2.3.2
test-gpu-openmpi-py3_5-tf1_6_0-keras2_1_2-torch0_4_1-mxnet1_4_0-pyspark2_3_2:
test-gpu-openmpi-py3_5-tf1_6_0-keras2_1_2-torch0_4_1-mxnet1_5_0-pyspark2_3_2:
extends: test-gpu-base
build:
args:
Expand All @@ -158,7 +160,7 @@ services:
KERAS_PACKAGE: keras==2.1.2
PYTORCH_PACKAGE: torch==0.4.1
PYSPARK_PACKAGE: pyspark==2.3.2
test-gpu-openmpi-py2_7-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_0-pyspark2_4_0:
test-gpu-openmpi-py2_7-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_5_0-pyspark2_4_0:
extends: test-gpu-base
build:
args:
Expand All @@ -168,7 +170,7 @@ services:
KERAS_PACKAGE: keras==2.2.2
PYTORCH_PACKAGE: torch==1.0.0
PYSPARK_PACKAGE: pyspark==2.4.0
test-gpu-openmpi-py3_5-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_0-pyspark2_4_0:
test-gpu-openmpi-py3_5-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_5_0-pyspark2_4_0:
extends: test-gpu-base
build:
args:
Expand All @@ -178,7 +180,7 @@ services:
KERAS_PACKAGE: keras==2.2.2
PYTORCH_PACKAGE: torch==1.0.0
PYSPARK_PACKAGE: pyspark==2.4.0
test-gpu-openmpi-py2_7-tfhead-kerashead-torchhead-mxnet1_4_0-pyspark2_4_0:
test-gpu-openmpi-py2_7-tfhead-kerashead-torchhead-mxnethead-pyspark2_4_0:
extends: test-gpu-base
build:
args:
Expand All @@ -190,9 +192,9 @@ services:
TENSORFLOW_PACKAGE: tf-nightly-gpu==1.14.1-dev20190408
KERAS_PACKAGE: git+https://github.com/keras-team/keras.git
PYTORCH_PACKAGE: torch-nightly
MXNET_PACKAGE: mxnet-cu100==1.4.0.post0
MXNET_PACKAGE: mxnet-cu100 --pre
yuxihu marked this conversation as resolved.
Show resolved Hide resolved
PYSPARK_PACKAGE: pyspark==2.4.0
test-gpu-openmpi-py3_6-tfhead-kerashead-torchhead-mxnet1_4_0-pyspark2_4_0:
test-gpu-openmpi-py3_6-tfhead-kerashead-torchhead-mxnethead-pyspark2_4_0:
extends: test-gpu-base
build:
args:
Expand All @@ -204,9 +206,9 @@ services:
TENSORFLOW_PACKAGE: tf-nightly-gpu==1.14.1-dev20190408
KERAS_PACKAGE: git+https://github.com/keras-team/keras.git
PYTORCH_PACKAGE: torch-nightly
MXNET_PACKAGE: mxnet-cu100==1.4.0.post0
MXNET_PACKAGE: mxnet-cu100 --pre
yuxihu marked this conversation as resolved.
Show resolved Hide resolved
PYSPARK_PACKAGE: pyspark==2.4.0
test-gpu-mpich-py2_7-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_0-pyspark2_4_0:
test-gpu-mpich-py2_7-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_5_0-pyspark2_4_0:
extends: test-gpu-base
build:
args:
Expand All @@ -216,7 +218,7 @@ services:
KERAS_PACKAGE: keras==2.2.2
PYTORCH_PACKAGE: torch==1.0.0
PYSPARK_PACKAGE: pyspark==2.4.0
test-mixed-openmpi-py2_7-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_0-pyspark2_4_0:
test-mixed-openmpi-py2_7-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_5_0-pyspark2_4_0:
extends: test-gpu-base
build:
args:
Expand Down
6 changes: 6 additions & 0 deletions horovod/mxnet/adapter.cc
Expand Up @@ -84,6 +84,12 @@ MXTemporaryBuffer<T>::MXTemporaryBuffer(int device, int dtype)
this->tensor_ = TensorUtil::New(device, dtype);
}

// Constructs a temporary buffer around an existing tensor pointer instead of
// creating a new one. The MXTensor<T> base is initialized with nullptr and
// tensor_ is then set to `tensor`.
// The destructor of this class passes tensor_ to TensorUtil::Free, so the
// pointer given here is released when this object is destroyed.
// NOTE(review): callers presumably must not free `tensor` themselves after
// handing it over -- confirm against call sites.
template <class T>
MXTemporaryBuffer<T>::MXTemporaryBuffer(T* tensor)
: MXTensor<T>(nullptr) {
this->tensor_ = tensor;
}

// Releases the held tensor by passing tensor_ to TensorUtil::Free.
// This runs for every instance, regardless of which constructor set tensor_.
template <class T> MXTemporaryBuffer<T>::~MXTemporaryBuffer() {
TensorUtil::Free(this->tensor_);
}
Expand Down
1 change: 1 addition & 0 deletions horovod/mxnet/adapter.h
Expand Up @@ -51,6 +51,7 @@ template <class T> class MXTensor : public Tensor {
// MXTensor<T> subclass that can be built either from a device and dtype or
// from an existing T*. It declares its own destructor and a tensor() accessor;
// the definitions are not part of this header excerpt.
template <class T> class MXTemporaryBuffer : public MXTensor<T> {
public:
// Builds the buffer from a device and a dtype, both passed as ints.
MXTemporaryBuffer(int device, int dtype);
// Builds the buffer from an existing tensor pointer.
// NOTE(review): single-argument and not `explicit`, so a T* converts
// implicitly to MXTemporaryBuffer<T> -- confirm this is intended.
MXTemporaryBuffer(T* tensor);
~MXTemporaryBuffer();
// Accessor returning a T*.
virtual T* tensor() const;
};
Expand Down