Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions .github/workflows/model_jobs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,6 @@ on:
slice_id:
required: true
type: number
runner_map:
required: false
type: string
docker:
required: true
type: string
Expand Down Expand Up @@ -54,10 +51,12 @@ jobs:
matrix:
folders: ${{ fromJson(inputs.folder_slices)[inputs.slice_id] }}
runs-on:
group: ${{ fromJson(inputs.runner_map)[matrix.folders][inputs.machine_type] }}
group: '${{ inputs.machine_type }}'
container:
image: ${{ inputs.docker }}
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
outputs:
machine_type: ${{ steps.set_machine_type.outputs.machine_type }}
steps:
- name: Echo input and matrix info
shell: bash
Expand Down Expand Up @@ -111,6 +110,7 @@ jobs:
run: pip freeze

- name: Set `machine_type` for report and artifact names
id: set_machine_type
working-directory: /transformers
shell: bash
run: |
Expand All @@ -126,6 +126,7 @@ jobs:

echo "$machine_type"
echo "machine_type=$machine_type" >> $GITHUB_ENV
echo "machine_type=$machine_type" >> $GITHUB_OUTPUT

- name: Run all tests on GPU
working-directory: /transformers
Expand Down Expand Up @@ -159,5 +160,5 @@ jobs:
job: run_models_gpu
report_repo_id: ${{ inputs.report_repo_id }}
gpu_name: ${{ inputs.runner_type }}
machine_type: ${{ inputs.machine_type }}
machine_type: ${{ needs.run_models_gpu.outputs.machine_type }}
secrets: inherit
1 change: 1 addition & 0 deletions .github/workflows/self-scheduled-caller.yml
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ jobs:
job: run_trainer_and_fsdp_gpu
slack_report_channel: "#transformers-ci-daily-training"
docker: huggingface/transformers-all-latest-gpu
runner_type: "a10"
ci_event: Daily CI
report_repo_id: hf-internal-testing/transformers_daily_ci
commit_sha: ${{ github.sha }}
Expand Down
8 changes: 3 additions & 5 deletions .github/workflows/self-scheduled.yml
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,6 @@ jobs:
outputs:
folder_slices: ${{ steps.set-matrix.outputs.folder_slices }}
slice_ids: ${{ steps.set-matrix.outputs.slice_ids }}
runner_map: ${{ steps.set-matrix.outputs.runner_map }}
quantization_matrix: ${{ steps.set-matrix-quantization.outputs.quantization_matrix }}
steps:
- name: Update clone
Expand All @@ -95,7 +94,6 @@ jobs:
if [ "${{ inputs.job }}" = "run_models_gpu" ]; then
echo "folder_slices=$(python3 ../utils/split_model_tests.py --models '${{ inputs.models }}' --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
echo "runner_map=$(python3 ../utils/get_runner_map.py)" >> $GITHUB_OUTPUT
elif [ "${{ inputs.job }}" = "run_trainer_and_fsdp_gpu" ]; then
echo "folder_slices=[['trainer'], ['fsdp']]" >> $GITHUB_OUTPUT
echo "slice_ids=[0, 1]" >> $GITHUB_OUTPUT
Expand All @@ -119,14 +117,13 @@ jobs:
strategy:
fail-fast: false
matrix:
machine_type: [single-gpu, multi-gpu]
machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
uses: ./.github/workflows/model_jobs.yml
with:
folder_slices: ${{ needs.setup.outputs.folder_slices }}
machine_type: ${{ matrix.machine_type }}
slice_id: ${{ matrix.slice_id }}
runner_map: ${{ needs.setup.outputs.runner_map }}
docker: ${{ inputs.docker }}
commit_sha: ${{ inputs.commit_sha || github.sha }}
runner_type: ${{ inputs.runner_type }}
Expand All @@ -147,9 +144,10 @@ jobs:
folder_slices: ${{ needs.setup.outputs.folder_slices }}
machine_type: ${{ matrix.machine_type }}
slice_id: ${{ matrix.slice_id }}
runner_map: ${{ needs.setup.outputs.runner_map }}
docker: ${{ inputs.docker }}
commit_sha: ${{ inputs.commit_sha || github.sha }}
runner_type: ${{ inputs.runner_type }}
report_repo_id: ${{ inputs.report_repo_id }}
report_name_prefix: run_trainer_and_fsdp_gpu
secrets: inherit

Expand Down
65 changes: 0 additions & 65 deletions utils/get_runner_map.py

This file was deleted.

Loading