diff --git a/.github/workflows/build-docker-images-release.yml b/.github/workflows/build-docker-images-release.yml index efb6a95da6c..8d5f2194b78 100644 --- a/.github/workflows/build-docker-images-release.yml +++ b/.github/workflows/build-docker-images-release.yml @@ -21,7 +21,7 @@ jobs: version-cpu: name: "Latest Accelerate CPU [version]" - runs-on: [self-hosted, intel-cpu, 8-cpu, ci] + runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] needs: get-version steps: - name: Set up Docker Buildx @@ -41,7 +41,7 @@ jobs: version-cuda: name: "Latest Accelerate GPU [version]" - runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, daily-ci] + runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] needs: get-version steps: - name: Set up Docker Buildx diff --git a/.github/workflows/build_docker_images.yml b/.github/workflows/build_docker_images.yml index 75b9fb9eefe..59f3e4dda61 100644 --- a/.github/workflows/build_docker_images.yml +++ b/.github/workflows/build_docker_images.yml @@ -11,9 +11,19 @@ concurrency: cancel-in-progress: false jobs: + clean-storage: + name: "Clean docker image storage" + runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + steps: + - name: Clean storage + run: | + docker image prune --all -f --filter "until=48h" + docker system prune --all -f --filter "until=48h" + latest-cpu: name: "Latest Accelerate CPU [dev]" - runs-on: [self-hosted, intel-cpu, 8-cpu, ci] + runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + needs: clean-storage steps: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v2 @@ -31,7 +41,8 @@ jobs: latest-cuda: name: "Latest Accelerate GPU [dev]" - runs-on: [self-hosted, nvidia-gpu, t4, daily-ci] + runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + needs: clean-storage steps: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v2 diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index a06cae176c7..09e64e060e3 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -13,7 +13,7 @@ env: jobs: run_all_tests_single_gpu: - runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, daily-ci] + runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] env: CUDA_VISIBLE_DEVICES: "0" TEST_TYPE: "single_gpu" @@ -22,25 +22,23 @@ jobs: options: --gpus all --shm-size "16gb" defaults: run: + working-directory: accelerate/ shell: bash steps: - name: Update clone & pip install run: | source activate accelerate - git clone https://github.com/huggingface/accelerate; - cd accelerate; - git checkout ${{ github.sha }}; + git config --global --add safe.directory '*' + git fetch && git checkout ${{ github.sha }} pip install -e . --no-deps pip install pytest-reportlog tabulate - name: Run test on GPUs - working-directory: accelerate run: | source activate accelerate make test - name: Run examples on GPUs - working-directory: accelerate if: always() run: | source activate accelerate @@ -48,14 +46,13 @@ jobs: make test_examples - name: Generate Report - working-directory: accelerate if: always() run: | pip install slack_sdk tabulate python utils/log_reports.py >> $GITHUB_STEP_SUMMARY run_all_tests_multi_gpu: - runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, daily-ci] + runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] env: CUDA_VISIBLE_DEVICES: "0,1" TEST_TYPE: "multi_gpu" @@ -64,19 +61,18 @@ jobs: options: --gpus all --shm-size "16gb" defaults: run: + working-directory: accelerate/ shell: bash steps: - name: Update clone run: | source activate accelerate - git clone https://github.com/huggingface/accelerate; - cd accelerate; - git checkout ${{ github.sha }}; + git config --global --add safe.directory '*' + git fetch && git checkout ${{ github.sha }} pip install -e . --no-deps pip install pytest-reportlog tabulate - name: Run core and big modeling tests on GPUs - working-directory: accelerate run: | source activate accelerate make test_core @@ -84,14 +80,12 @@ jobs: make test_cli - name: Run Integration tests on GPUs - working-directory: accelerate if: always() run: | source activate accelerate make test_integrations - name: Run examples on GPUs - working-directory: accelerate if: always() run: | source activate accelerate @@ -99,7 +93,6 @@ jobs: make test_examples - name: Generate Report - working-directory: accelerate if: always() run: | pip install slack_sdk tabulate diff --git a/.github/workflows/run_merge_tests.yml b/.github/workflows/run_merge_tests.yml index 42bf11e7516..7dacab8c508 100644 --- a/.github/workflows/run_merge_tests.yml +++ b/.github/workflows/run_merge_tests.yml @@ -10,7 +10,7 @@ env: jobs: run_all_tests_single_gpu: - runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, push-ci] + runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] env: CUDA_VISIBLE_DEVICES: "0" container: @@ -18,81 +18,72 @@ jobs: options: --gpus all --shm-size "16gb" defaults: run: + working-directory: accelerate/ shell: bash steps: - - name: Install accelerate + - name: Update clone & pip install run: | - source activate accelerate; - git clone https://github.com/huggingface/accelerate; - cd accelerate; - git checkout ${{ github.sha }}; - pip install -e .[testing,test_trackers] -U; - pip install pytest-reportlog tabulate ; + source activate accelerate + git config --global --add safe.directory '*' + git fetch && git checkout ${{ github.sha }} + pip install -e .[testing,test_trackers] -U + pip install pytest-reportlog tabulate - - name: Run CLI tests (use make cli) - working-directory: accelerate + - name: Run CLI tests run: | - source activate accelerate; + source activate accelerate make test_cli - name: Run test on GPUs - working-directory: accelerate if: always() run: | - source activate accelerate; + source activate accelerate make test - name: Run examples on GPUs - working-directory: accelerate if: always() run: | - source activate accelerate; - pip uninstall comet_ml -y; + source activate accelerate + pip uninstall comet_ml -y make test_examples - name: Generate Report - working-directory: accelerate if: always() run: | - pip install tabulate; + pip install tabulate python utils/log_reports.py >> $GITHUB_STEP_SUMMARY run_all_tests_multi_gpu: - runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, push-ci] - env: - CUDA_VISIBLE_DEVICES: 0,1 + runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] container: image: huggingface/accelerate-gpu:latest options: --gpus all --shm-size "16gb" defaults: run: + working-directory: accelerate/ shell: bash steps: - name: Update clone run: | - source activate accelerate; - git clone https://github.com/huggingface/accelerate; - cd accelerate; - git checkout ${{ github.sha }}; - pip install -e .[testing,test_trackers] -U; + source activate accelerate + git config --global --add safe.directory '*' + git fetch && git checkout ${{ github.sha }} + pip install -e .[testing,test_trackers] -U pip install pytest-reportlog tabulate - name: Run test on GPUs - working-directory: accelerate run: | - source activate accelerate; + source activate accelerate make test - name: Run examples on GPUs - working-directory: accelerate if: always() run: | - source activate accelerate; - pip uninstall comet_ml -y; + source activate accelerate + pip uninstall comet_ml -y make test_examples - name: Generate Report - working-directory: accelerate if: always() run: | - source activate accelerate; - python utils/log_reports.py >> $GITHUB_STEP_SUMMARY + pip install tabulate + python utils/log_reports.py >> $GITHUB_STEP_SUMMARY \ No newline at end of file diff --git a/.github/workflows/self_hosted_integration_tests.yml b/.github/workflows/self_hosted_integration_tests.yml index cd82295e4e2..94e50e61ff3 100644 --- a/.github/workflows/self_hosted_integration_tests.yml +++ b/.github/workflows/self_hosted_integration_tests.yml @@ -25,7 +25,7 @@ jobs: container: image: huggingface/accelerate-gpu:latest options: --gpus all --shm-size "16gb" - runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, push-ci] + runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] strategy: fail-fast: false matrix: @@ -34,22 +34,22 @@ jobs: "0,1" ] steps: - - name: Install transformers - run: | + - name: Update accelerate clone and pip install + working-directory: accelerate/ + run: source activate accelerate; - git clone https://github.com/huggingface/transformers --depth 1; - cd transformers; - pip install .[torch,deepspeed-testing]; - cd ..; + git config --global --add safe.directory '*'; + git checkout main && git fetch && git checkout ${{ github.sha }}; + pip install -e .; - - name: Install accelerate + - name: Update transformers clone & pip install + working-directory: transformers/ run: | - source activate accelerate; - git clone https://github.com/huggingface/accelerate; - cd accelerate; - git checkout ${{ github.sha }} ; - pip install -e .[testing]; - cd ..; + source activate accelerate + git config --global --add safe.directory '*' + git checkout main && git pull + pip install .[torch,deepspeed-testing] + pip uninstall comet_ml wandb -y - name: Show installed libraries run: | @@ -89,20 +89,20 @@ jobs: container: image: huggingface/accelerate-gpu:latest options: --gpus all --shm-size "16gb" - runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, push-ci] + runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] strategy: fail-fast: false steps: - - name: Install accelerate + - name: Update accelerate clone and pip install + working-directory: accelerate/ run: source activate accelerate; - git clone https://github.com/huggingface/accelerate; - cd accelerate; - git checkout ${{ github.sha }}; - pip install -e .[testing]; - cd .. + git config --global --add safe.directory '*'; + git checkout main && git fetch && git checkout ${{ github.sha }}; + pip install -e .; - - name: Install skorch + - name: Update skorch clone & pip install + working-directory: skorch/ run: | source activate accelerate git config --global --add safe.directory '*'