diff --git a/.github/workflows/model_jobs.yml b/.github/workflows/model_jobs.yml index 7e30cde735fa..5da145c2b006 100644 --- a/.github/workflows/model_jobs.yml +++ b/.github/workflows/model_jobs.yml @@ -12,9 +12,6 @@ on: slice_id: required: true type: number - runner_map: - required: false - type: string docker: required: true type: string @@ -54,10 +51,12 @@ jobs: matrix: folders: ${{ fromJson(inputs.folder_slices)[inputs.slice_id] }} runs-on: - group: ${{ fromJson(inputs.runner_map)[matrix.folders][inputs.machine_type] }} + group: '${{ inputs.machine_type }}' container: image: ${{ inputs.docker }} options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + outputs: + machine_type: ${{ steps.set_machine_type.outputs.machine_type }} steps: - name: Echo input and matrix info shell: bash @@ -111,6 +110,7 @@ jobs: run: pip freeze - name: Set `machine_type` for report and artifact names + id: set_machine_type working-directory: /transformers shell: bash run: | @@ -126,6 +126,7 @@ jobs: echo "$machine_type" echo "machine_type=$machine_type" >> $GITHUB_ENV + echo "machine_type=$machine_type" >> $GITHUB_OUTPUT - name: Run all tests on GPU working-directory: /transformers @@ -159,5 +160,5 @@ jobs: job: run_models_gpu report_repo_id: ${{ inputs.report_repo_id }} gpu_name: ${{ inputs.runner_type }} - machine_type: ${{ inputs.machine_type }} + machine_type: ${{ needs.run_models_gpu.outputs.machine_type }} secrets: inherit diff --git a/.github/workflows/self-scheduled-caller.yml b/.github/workflows/self-scheduled-caller.yml index 78c7f3c60f23..01f5a0a48bdd 100644 --- a/.github/workflows/self-scheduled-caller.yml +++ b/.github/workflows/self-scheduled-caller.yml @@ -88,6 +88,7 @@ jobs: job: run_trainer_and_fsdp_gpu slack_report_channel: "#transformers-ci-daily-training" docker: huggingface/transformers-all-latest-gpu + runner_type: "a10" ci_event: Daily CI report_repo_id: hf-internal-testing/transformers_daily_ci commit_sha: ${{ github.sha }} diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index a5dbc9d59a82..7129b1867fc4 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -68,7 +68,6 @@ jobs: outputs: folder_slices: ${{ steps.set-matrix.outputs.folder_slices }} slice_ids: ${{ steps.set-matrix.outputs.slice_ids }} - runner_map: ${{ steps.set-matrix.outputs.runner_map }} quantization_matrix: ${{ steps.set-matrix-quantization.outputs.quantization_matrix }} steps: - name: Update clone @@ -95,7 +94,6 @@ jobs: if [ "${{ inputs.job }}" = "run_models_gpu" ]; then echo "folder_slices=$(python3 ../utils/split_model_tests.py --models '${{ inputs.models }}' --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT - echo "runner_map=$(python3 ../utils/get_runner_map.py)" >> $GITHUB_OUTPUT elif [ "${{ inputs.job }}" = "run_trainer_and_fsdp_gpu" ]; then echo "folder_slices=[['trainer'], ['fsdp']]" >> $GITHUB_OUTPUT echo "slice_ids=[0, 1]" >> $GITHUB_OUTPUT @@ -119,14 +117,13 @@ jobs: strategy: fail-fast: false matrix: - machine_type: [single-gpu, multi-gpu] + machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache] slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }} uses: ./.github/workflows/model_jobs.yml with: folder_slices: ${{ needs.setup.outputs.folder_slices }} machine_type: ${{ matrix.machine_type }} slice_id: ${{ matrix.slice_id }} - runner_map: ${{ needs.setup.outputs.runner_map }} docker: ${{ inputs.docker }} commit_sha: ${{ inputs.commit_sha || github.sha }} runner_type: ${{ inputs.runner_type }} @@ -147,9 +144,10 @@ jobs: folder_slices: ${{ needs.setup.outputs.folder_slices }} machine_type: ${{ matrix.machine_type }} slice_id: ${{ matrix.slice_id }} - runner_map: ${{ needs.setup.outputs.runner_map }} docker: ${{ inputs.docker }} commit_sha: ${{ inputs.commit_sha || github.sha }} + runner_type: ${{ inputs.runner_type }} + report_repo_id: ${{ inputs.report_repo_id }} report_name_prefix: run_trainer_and_fsdp_gpu secrets: inherit diff --git a/utils/get_runner_map.py b/utils/get_runner_map.py deleted file mode 100644 index 7b36651165bc..000000000000 --- a/utils/get_runner_map.py +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright 2025 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This script is used to get a map containing the information of runners to use in GitHub Actions workflow files. -This is meant to be a temporary file that helps us to switch progressively from T4 to A10 runners. - -The data is stored in a Hub repository [hf-internal-testing/transformers_daily_ci](https://huggingface.co/datasets/hf-internal-testing/transformers_daily_ci/blob/main/runner_map.json). -Currently, in that file, we specify the models for which we want to run the tests with T4 runners to avoid many test failures showing on the CI reports. -We will work on the tests toward to use A10 for all CI jobs. -""" - -import os - -import requests - - -if __name__ == "__main__": - # T4 - t4_runners = { - "single-gpu": "aws-g4dn-4xlarge-cache", - "multi-gpu": "aws-g4dn-12xlarge-cache", - } - - # A10 - a10_runners = { - "single-gpu": "aws-g5-4xlarge-cache", - "multi-gpu": "aws-g5-12xlarge-cache", - } - - tests = os.getcwd() - model_tests = os.listdir(os.path.join(tests, "models")) - d1 = sorted(filter(os.path.isdir, os.listdir(tests))) - d2 = sorted(filter(os.path.isdir, [f"models/{x}" for x in model_tests])) - d1.remove("models") - d = d2 + d1 - - response = requests.get( - "https://huggingface.co/datasets/hf-internal-testing/transformers_daily_ci/resolve/main/runner_map.json" - ) - # The models that we want to run with T4 runners - jobs_using_t4 = response.json() - - runner_map = {} - for key in d: - modified_key = key - if modified_key.startswith("models/"): - modified_key = key[len("models/") :] - if modified_key in jobs_using_t4: - runner_map[key] = t4_runners - else: - runner_map[key] = a10_runners - - print(runner_map)