Skip to content

Commit

Permalink
Merge branch 'horovod:master' into spark-torch-gradient-accumulation
Browse files (browse the repository at this point in the history)
Signed-off-by: Li Jiang <bnujli@gmail.com>
  • Loading branch information
thinkall committed Sep 9, 2022
2 parents 04fa540 + 1fbdab5 commit ac55b99
Show file tree
Hide file tree
Showing 6 changed files with 137 additions and 368 deletions.
5 changes: 2 additions & 3 deletions .buildkite/gen-pipeline.sh
Original file line number Diff line number Diff line change
Expand Up @@ -82,10 +82,9 @@ build_test() {
echo " env:"
echo " COMPOSE_HTTP_TIMEOUT: 300"
echo " plugins:"
echo " - docker-compose#v3.5.0:"
echo " - docker-compose#v3.10.0:"
echo " build: ${test}"
echo " image-repository: ${repository}"
echo " cache-from: ${test}:${repository}:${BUILDKITE_PIPELINE_SLUG}-${test}-latest"
echo " config: docker-compose.test.yml"
echo " push-retries: 5"
echo " - ecr#v1.2.0:"
Expand All @@ -110,7 +109,7 @@ run_test() {
echo " env:"
echo " COMPOSE_HTTP_TIMEOUT: 300"
echo " plugins:"
echo " - docker-compose#v3.5.0:"
echo " - docker-compose#v3.10.0:"
echo " run: ${test}"
echo " volumes: \"./artifacts:/artifacts\""
echo " config: docker-compose.test.yml"
Expand Down
82 changes: 17 additions & 65 deletions .github/gen-workflow-ci.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,18 +42,29 @@ def main():
pipeline = yaml.load(proc.stdout, Loader=Loader)
steps = pipeline.get('steps', [])

images = [plugin['docker-compose#v3.5.0']['build']
docker_compose_plugins = {plugin_name
for step in steps if isinstance(step, dict) and 'label' in step
and step['label'].startswith(':docker: Build ')
for plugins in step['plugins']
for plugin_name in plugins.keys() if plugin_name.startswith('docker-compose#')}
if len(docker_compose_plugins) == 0:
raise RuntimeError('No docker-compose plugin found')
if len(docker_compose_plugins) > 1:
raise RuntimeError('Multiple docker-compose plugins found')

docker_compose_plugin = list(docker_compose_plugins)[0]
images = [plugin[docker_compose_plugin]['build']
for step in steps if isinstance(step, dict) and 'label' in step
and step['label'].startswith(':docker: Build ')
for plugin in step['plugins'] if 'docker-compose#v3.5.0' in plugin]
for plugin in step['plugins'] if docker_compose_plugin in plugin]

cpu_tests = [(re.sub(r' \(test-.*', '', re.sub(':[^:]*: ', '', step['label'])),
step['command'],
step['timeout_in_minutes'],
plugin['docker-compose#v3.5.0']['run'])
plugin[docker_compose_plugin]['run'])
for step in steps if isinstance(step, dict) and 'label' in step and 'command' in step
and not step['label'].startswith(':docker: Build ') and '-cpu-' in step['label']
for plugin in step['plugins'] if 'docker-compose#v3.5.0' in plugin]
for plugin in step['plugins'] if docker_compose_plugin in plugin]

# we need to distinguish the two oneccl variants of some tests
cpu_tests = [(label + (' [ONECCL OFI]' if 'mpirun_command_ofi' in command else (' [ONECCL MPI]' if 'mpirun_command_mpi' in command else '')),
Expand Down Expand Up @@ -335,54 +346,10 @@ def build_and_test_images(id: str,
f' - name: Setup docker-compose\n'
f' run: pip install docker-compose\n'
f'\n'
f' - name: Configure AWS credentials\n'
f' id: aws\n'
f' uses: aws-actions/configure-aws-credentials@v1\n'
f' # AWS credentials are used to authenticate against AWS ECR to pull and push test images\n'
f' # We can only authenticate when running on Horovod repo (not a fork)\n'
f' if: >\n'
f' github.repository == \'horovod/horovod\' &&\n'
f' ( github.event_name != \'pull_request\' || github.event.pull_request.head.repo.full_name == github.repository )\n'
f' continue-on-error: true\n'
f' with:\n'
f' aws-access-key-id: ${{{{ secrets.AWS_ACCESS_KEY_ID }}}}\n'
f' aws-secret-access-key: ${{{{ secrets.AWS_SECRET_ACCESS_KEY }}}}\n'
f' aws-region: us-east-1\n'
f'\n'
f' - name: Login to Amazon ECR\n'
f' id: ecr\n'
f' if: steps.aws.outcome == \'success\'\n'
f' continue-on-error: true\n'
f' uses: aws-actions/amazon-ecr-login@v1\n'
f'\n'
f' - name: Add cache_from to docker-compose YAML\n'
f' if: steps.ecr.outcome == \'success\'\n'
f' run: |\n'
f' cat > docker-compose.test.override.yml <<EOF\n'
f' version: \'2.3\'\n'
f' services:\n'
f' ${{{{ matrix.image }}}}:\n'
f' build:\n'
f' cache_from:\n'
f' - ${{{{ steps.ecr.outputs.registry }}}}/buildkite:horovod-${{{{ matrix.image }}}}-latest\n'
f' EOF\n'
f' cat docker-compose.test.override.yml\n'
f' shell: bash\n'
f'\n'
f' - name: Pull latest test image\n'
f' if: steps.ecr.outcome == \'success\'\n'
f' continue-on-error: true\n'
f' run: |\n'
f' docker pull ${{{{ steps.ecr.outputs.registry }}}}/buildkite:horovod-${{{{ matrix.image }}}}-latest\n'
f' env:\n'
f' DOCKER_BUILDKIT: 1\n'
f'\n'
f' - name: Build\n'
f' id: build\n'
f' run: |\n'
f' override_yaml=""\n'
f' if [ -e docker-compose.test.override.yml ]; then override_yaml="-f docker-compose.test.override.yml"; fi\n'
f' .github/timeout-and-retry.sh ${{{{ matrix.build_timeout }}}}m 3 10 docker-compose -f docker-compose.test.yml $override_yaml build --pull ${{{{ matrix.image }}}}\n'
f' .github/timeout-and-retry.sh ${{{{ matrix.build_timeout }}}}m 3 10 docker-compose -f docker-compose.test.yml build ${{{{ matrix.image }}}}\n'
f' env:\n'
f' COMPOSE_DOCKER_CLI_BUILD: 1\n'
f' DOCKER_BUILDKIT: 1\n'
Expand All @@ -403,22 +370,7 @@ def build_and_test_images(id: str,
f' if: always() && contains(matrix.image, \'-cpu-\')\n'
f' with:\n'
f' name: Unit Test Results - ${{{{ matrix.image }}}}\n'
f' path: artifacts/${{{{ matrix.image }}}}/**/*.xml\n'
f'\n'
f' - name: Push test image\n'
f' # We push test image to AWS ECR on push to Horovod master (not a fork)\n'
f' if: >\n'
f' github.event_name == \'push\' &&\n'
f' github.ref == \'refs/heads/master\' &&\n'
f' github.repository == \'horovod/horovod\' &&\n'
f' steps.ecr.outcome == \'success\'\n'
f' continue-on-error: true\n'
f' run: |\n'
f' docker image ls | head\n'
f' docker tag horovod_${{{{ matrix.image }}}} ${{{{ steps.ecr.outputs.registry }}}}/buildkite:horovod-${{{{ matrix.image }}}}-latest\n'
f' docker push ${{{{ steps.ecr.outputs.registry }}}}/buildkite:horovod-${{{{ matrix.image }}}}-latest\n'
f' docker image ls | head\n'
f' shell: bash\n')
f' path: artifacts/${{{{ matrix.image }}}}/**/*.xml\n')

def build_and_test_macos(id: str, name: str, needs: List[str], attempts: int = 3) -> str:
if 'init-workflow' not in needs:
Expand Down
183 changes: 3 additions & 180 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -429,54 +429,10 @@ jobs:
- name: Setup docker-compose
run: pip install docker-compose

- name: Configure AWS credentials
id: aws
uses: aws-actions/configure-aws-credentials@v1
# AWS credentials are used to authenticate against AWS ECR to pull and push test images
# We can only authenticate when running on Horovod repo (not a fork)
if: >
github.repository == 'horovod/horovod' &&
( github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository )
continue-on-error: true
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: us-east-1

- name: Login to Amazon ECR
id: ecr
if: steps.aws.outcome == 'success'
continue-on-error: true
uses: aws-actions/amazon-ecr-login@v1

- name: Add cache_from to docker-compose YAML
if: steps.ecr.outcome == 'success'
run: |
cat > docker-compose.test.override.yml <<EOF
version: '2.3'
services:
${{ matrix.image }}:
build:
cache_from:
- ${{ steps.ecr.outputs.registry }}/buildkite:horovod-${{ matrix.image }}-latest
EOF
cat docker-compose.test.override.yml
shell: bash

- name: Pull latest test image
if: steps.ecr.outcome == 'success'
continue-on-error: true
run: |
docker pull ${{ steps.ecr.outputs.registry }}/buildkite:horovod-${{ matrix.image }}-latest
env:
DOCKER_BUILDKIT: 1

- name: Build
id: build
run: |
override_yaml=""
if [ -e docker-compose.test.override.yml ]; then override_yaml="-f docker-compose.test.override.yml"; fi
.github/timeout-and-retry.sh ${{ matrix.build_timeout }}m 3 10 docker-compose -f docker-compose.test.yml $override_yaml build --pull ${{ matrix.image }}
.github/timeout-and-retry.sh ${{ matrix.build_timeout }}m 3 10 docker-compose -f docker-compose.test.yml build ${{ matrix.image }}
env:
COMPOSE_DOCKER_CLI_BUILD: 1
DOCKER_BUILDKIT: 1
Expand Down Expand Up @@ -2459,21 +2415,6 @@ jobs:
name: Unit Test Results - ${{ matrix.image }}
path: artifacts/${{ matrix.image }}/**/*.xml

- name: Push test image
# We push test image to AWS ECR on push to Horovod master (not a fork)
if: >
github.event_name == 'push' &&
github.ref == 'refs/heads/master' &&
github.repository == 'horovod/horovod' &&
steps.ecr.outcome == 'success'
continue-on-error: true
run: |
docker image ls | head
docker tag horovod_${{ matrix.image }} ${{ steps.ecr.outputs.registry }}/buildkite:horovod-${{ matrix.image }}-latest
docker push ${{ steps.ecr.outputs.registry }}/buildkite:horovod-${{ matrix.image }}-latest
docker image ls | head
shell: bash

build-and-test-heads:
name: "Build and Test heads (${{ matrix.image }})"
needs: [init-workflow, build-and-test]
Expand Down Expand Up @@ -2567,54 +2508,10 @@ jobs:
- name: Setup docker-compose
run: pip install docker-compose

- name: Configure AWS credentials
id: aws
uses: aws-actions/configure-aws-credentials@v1
# AWS credentials are used to authenticate against AWS ECR to pull and push test images
# We can only authenticate when running on Horovod repo (not a fork)
if: >
github.repository == 'horovod/horovod' &&
( github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository )
continue-on-error: true
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: us-east-1

- name: Login to Amazon ECR
id: ecr
if: steps.aws.outcome == 'success'
continue-on-error: true
uses: aws-actions/amazon-ecr-login@v1

- name: Add cache_from to docker-compose YAML
if: steps.ecr.outcome == 'success'
run: |
cat > docker-compose.test.override.yml <<EOF
version: '2.3'
services:
${{ matrix.image }}:
build:
cache_from:
- ${{ steps.ecr.outputs.registry }}/buildkite:horovod-${{ matrix.image }}-latest
EOF
cat docker-compose.test.override.yml
shell: bash

- name: Pull latest test image
if: steps.ecr.outcome == 'success'
continue-on-error: true
run: |
docker pull ${{ steps.ecr.outputs.registry }}/buildkite:horovod-${{ matrix.image }}-latest
env:
DOCKER_BUILDKIT: 1

- name: Build
id: build
run: |
override_yaml=""
if [ -e docker-compose.test.override.yml ]; then override_yaml="-f docker-compose.test.override.yml"; fi
.github/timeout-and-retry.sh ${{ matrix.build_timeout }}m 3 10 docker-compose -f docker-compose.test.yml $override_yaml build --pull ${{ matrix.image }}
.github/timeout-and-retry.sh ${{ matrix.build_timeout }}m 3 10 docker-compose -f docker-compose.test.yml build ${{ matrix.image }}
env:
COMPOSE_DOCKER_CLI_BUILD: 1
DOCKER_BUILDKIT: 1
Expand Down Expand Up @@ -4597,21 +4494,6 @@ jobs:
name: Unit Test Results - ${{ matrix.image }}
path: artifacts/${{ matrix.image }}/**/*.xml

- name: Push test image
# We push test image to AWS ECR on push to Horovod master (not a fork)
if: >
github.event_name == 'push' &&
github.ref == 'refs/heads/master' &&
github.repository == 'horovod/horovod' &&
steps.ecr.outcome == 'success'
continue-on-error: true
run: |
docker image ls | head
docker tag horovod_${{ matrix.image }} ${{ steps.ecr.outputs.registry }}/buildkite:horovod-${{ matrix.image }}-latest
docker push ${{ steps.ecr.outputs.registry }}/buildkite:horovod-${{ matrix.image }}-latest
docker image ls | head
shell: bash

build-mins:
name: "Build mins (${{ matrix.image }})"
needs: [init-workflow, build-and-test]
Expand Down Expand Up @@ -4671,54 +4553,10 @@ jobs:
- name: Setup docker-compose
run: pip install docker-compose

- name: Configure AWS credentials
id: aws
uses: aws-actions/configure-aws-credentials@v1
# AWS credentials are used to authenticate against AWS ECR to pull and push test images
# We can only authenticate when running on Horovod repo (not a fork)
if: >
github.repository == 'horovod/horovod' &&
( github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository )
continue-on-error: true
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: us-east-1

- name: Login to Amazon ECR
id: ecr
if: steps.aws.outcome == 'success'
continue-on-error: true
uses: aws-actions/amazon-ecr-login@v1

- name: Add cache_from to docker-compose YAML
if: steps.ecr.outcome == 'success'
run: |
cat > docker-compose.test.override.yml <<EOF
version: '2.3'
services:
${{ matrix.image }}:
build:
cache_from:
- ${{ steps.ecr.outputs.registry }}/buildkite:horovod-${{ matrix.image }}-latest
EOF
cat docker-compose.test.override.yml
shell: bash

- name: Pull latest test image
if: steps.ecr.outcome == 'success'
continue-on-error: true
run: |
docker pull ${{ steps.ecr.outputs.registry }}/buildkite:horovod-${{ matrix.image }}-latest
env:
DOCKER_BUILDKIT: 1

- name: Build
id: build
run: |
override_yaml=""
if [ -e docker-compose.test.override.yml ]; then override_yaml="-f docker-compose.test.override.yml"; fi
.github/timeout-and-retry.sh ${{ matrix.build_timeout }}m 3 10 docker-compose -f docker-compose.test.yml $override_yaml build --pull ${{ matrix.image }}
.github/timeout-and-retry.sh ${{ matrix.build_timeout }}m 3 10 docker-compose -f docker-compose.test.yml build ${{ matrix.image }}
env:
COMPOSE_DOCKER_CLI_BUILD: 1
DOCKER_BUILDKIT: 1
Expand All @@ -4731,21 +4569,6 @@ jobs:
name: Unit Test Results - ${{ matrix.image }}
path: artifacts/${{ matrix.image }}/**/*.xml

- name: Push test image
# We push test image to AWS ECR on push to Horovod master (not a fork)
if: >
github.event_name == 'push' &&
github.ref == 'refs/heads/master' &&
github.repository == 'horovod/horovod' &&
steps.ecr.outcome == 'success'
continue-on-error: true
run: |
docker image ls | head
docker tag horovod_${{ matrix.image }} ${{ steps.ecr.outputs.registry }}/buildkite:horovod-${{ matrix.image }}-latest
docker push ${{ steps.ecr.outputs.registry }}/buildkite:horovod-${{ matrix.image }}-latest
docker image ls | head
shell: bash

build-and-test-macos:
name: "Build and Test macOS (${{ matrix.image }}-macos)"
needs: [init-workflow, build-and-test]
Expand Down
4 changes: 3 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,9 @@ def build_extensions(self):
'pyspark>=3.0.0;python_version>="3.8"']
spark_require_list = ['numpy', 'petastorm>=0.12.0', 'pyarrow>=0.15.0', 'fsspec>=2021.07.0']
# https://github.com/ray-project/ray/pull/17465
ray_require_list = ['ray', 'aioredis<2']
# google-api-core>=2.9.0 depends on protobuf<5.0.0dev,>=3.20.1, which conflicts with
# tensorflow protobuf~=3.20 and pytorch-lightning protobuf<3.20,>=3.9.2
ray_require_list = ['ray', 'aioredis<2', 'google-api-core<2.9.0']
pytorch_spark_require_list = pytorch_require_list + \
spark_require_list + \
pyspark_require_list + \
Expand Down

0 comments on commit ac55b99

Please sign in to comment.