From b88ea9ff313c78e06b8b96458603317e62c279d9 Mon Sep 17 00:00:00 2001 From: ydshieh Date: Wed, 20 Aug 2025 10:10:24 +0200 Subject: [PATCH 1/2] fix nightly ci --- .github/workflows/check_failed_tests.yml | 5 ++- .github/workflows/model_jobs.yml | 5 ++- .github/workflows/self-nightly-caller.yml | 37 ++++++++------------- .github/workflows/self-scheduled-caller.yml | 11 ++++-- .github/workflows/self-scheduled.yml | 21 ++++++++---- .github/workflows/slack-report.yml | 10 +++++- utils/notification_service.py | 6 ++-- 7 files changed, 55 insertions(+), 40 deletions(-) diff --git a/.github/workflows/check_failed_tests.yml b/.github/workflows/check_failed_tests.yml index adafd5c046bd..2214ef6c4387 100644 --- a/.github/workflows/check_failed_tests.yml +++ b/.github/workflows/check_failed_tests.yml @@ -21,6 +21,9 @@ on: report_repo_id: required: true type: string + commit_sha: + required: false + type: string env: @@ -87,7 +90,7 @@ jobs: - name: Update clone working-directory: /transformers if: ${{ env.process == 'true' }} - run: git fetch && git checkout ${{ github.sha }} + run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }} - name: Get target commit working-directory: /transformers/utils diff --git a/.github/workflows/model_jobs.yml b/.github/workflows/model_jobs.yml index b7818d798d5b..6ca019e7c93f 100644 --- a/.github/workflows/model_jobs.yml +++ b/.github/workflows/model_jobs.yml @@ -18,6 +18,9 @@ on: docker: required: true type: string + commit_sha: + required: false + type: string report_name_prefix: required: false default: run_models_gpu @@ -70,7 +73,7 @@ jobs: - name: Update clone working-directory: /transformers - run: git fetch && git checkout ${{ github.sha }} + run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }} - name: Reinstall transformers in edit mode (remove the one installed during docker image build) working-directory: /transformers diff --git a/.github/workflows/self-nightly-caller.yml b/.github/workflows/self-nightly-caller.yml index 5538e2d56e74..97fda8841fc4 100644 --- a/.github/workflows/self-nightly-caller.yml +++ b/.github/workflows/self-nightly-caller.yml @@ -1,43 +1,32 @@ -name: Self-hosted runner (nightly-ci) - +name: Nvidia CI with nightly torch on: repository_dispatch: - schedule: - - cron: "17 2 * * *" + # triggered when the daily scheduled Nvidia CI is completed. + # This way, we can compare the results more easily. + workflow_run: + workflows: ["Nvidia CI"] + branches: ["main"] + types: [completed] push: branches: - - run_nightly_ci* + - run_ci_with_nightly_torch* jobs: - build_nightly_ci_images: - name: Build Nightly CI Docker Images - if: (github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_nightly_ci')) + build_nightly_torch_ci_images: + name: Build CI Docker Images with nightly torch uses: ./.github/workflows/build-nightly-ci-docker-images.yml secrets: inherit model-ci: name: Model CI - needs: [build_nightly_ci_images] + needs: [build_nightly_torch_ci_images] uses: ./.github/workflows/self-scheduled.yml with: job: run_models_gpu slack_report_channel: "#transformers-ci-past-future" - runner: ci docker: huggingface/transformers-all-latest-torch-nightly-gpu ci_event: Nightly CI - secrets: inherit - - deepspeed-ci: - name: DeepSpeed CI - needs: [build_nightly_ci_images] - uses: ./.github/workflows/self-scheduled.yml - with: - job: run_torch_cuda_extensions_gpu - slack_report_channel: "#transformers-ci-past-future" - runner: ci - # test deepspeed nightly build with the latest release torch - docker: huggingface/transformers-pytorch-deepspeed-latest-gpu - ci_event: Nightly CI - working-directory-prefix: /workspace + report_repo_id: hf-internal-testing/transformers_daily_ci_with_torch_nightly + commit_sha: ${{ github.event.workflow_run.head_sha || github.sha }} secrets: inherit diff --git a/.github/workflows/self-scheduled-caller.yml b/.github/workflows/self-scheduled-caller.yml index 88eee91a3bc2..d709c562251e 100644 --- a/.github/workflows/self-scheduled-caller.yml +++ b/.github/workflows/self-scheduled-caller.yml @@ -1,5 +1,4 @@ -name: Self-hosted runner (scheduled) - +name: Nvidia CI on: repository_dispatch: @@ -7,7 +6,7 @@ on: - cron: "17 2 * * *" push: branches: - - run_scheduled_ci* + - run_nvidia_ci* workflow_dispatch: inputs: prev_workflow_run_id: @@ -54,6 +53,7 @@ jobs: docker: huggingface/transformers-all-latest-gpu ci_event: Daily CI report_repo_id: hf-internal-testing/transformers_daily_ci + commit_sha: ${{ github.sha }} secrets: inherit torch-pipeline: @@ -65,6 +65,7 @@ jobs: docker: huggingface/transformers-pytorch-gpu ci_event: Daily CI report_repo_id: hf-internal-testing/transformers_daily_ci + commit_sha: ${{ github.sha }} secrets: inherit example-ci: @@ -76,6 +77,7 @@ jobs: docker: huggingface/transformers-all-latest-gpu ci_event: Daily CI report_repo_id: hf-internal-testing/transformers_daily_ci + commit_sha: ${{ github.sha }} secrets: inherit trainer-fsdp-ci: @@ -87,6 +89,7 @@ jobs: docker: huggingface/transformers-all-latest-gpu ci_event: Daily CI report_repo_id: hf-internal-testing/transformers_daily_ci + commit_sha: ${{ github.sha }} secrets: inherit deepspeed-ci: @@ -99,6 +102,7 @@ jobs: ci_event: Daily CI working-directory-prefix: /workspace report_repo_id: hf-internal-testing/transformers_daily_ci + commit_sha: ${{ github.sha }} secrets: inherit quantization-ci: @@ -110,4 +114,5 @@ jobs: docker: huggingface/transformers-quantization-latest-gpu ci_event: Daily CI report_repo_id: hf-internal-testing/transformers_daily_ci + commit_sha: ${{ github.sha }} secrets: inherit diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index 4b482c28fbb7..b5f6685d30e7 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -1,4 +1,4 @@ -name: Self-hosted runner (scheduled) +name: Nvidia CI (job definitions) # Note that each job's dependencies go into a corresponding docker file. # @@ -28,6 +28,9 @@ on: report_repo_id: required: true type: string + commit_sha: + required: false + type: string env: @@ -46,8 +49,8 @@ env: jobs: setup: - if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu", "run_quantization_torch_gpu"]'), inputs.job) name: Setup + if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu", "run_quantization_torch_gpu"]'), inputs.job) strategy: matrix: machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache] @@ -119,6 +122,7 @@ jobs: slice_id: ${{ matrix.slice_id }} runner_map: ${{ needs.setup.outputs.runner_map }} docker: ${{ inputs.docker }} + commit_sha: ${{ inputs.commit_sha || github.sha }} secrets: inherit run_trainer_and_fsdp_gpu: @@ -137,6 +141,7 @@ jobs: slice_id: ${{ matrix.slice_id }} runner_map: ${{ needs.setup.outputs.runner_map }} docker: ${{ inputs.docker }} + commit_sha: ${{ inputs.commit_sha || github.sha }} report_name_prefix: run_trainer_and_fsdp_gpu secrets: inherit @@ -155,7 +160,7 @@ jobs: steps: - name: Update clone working-directory: /transformers - run: git fetch && git checkout ${{ github.sha }} + run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }} - name: Reinstall transformers in edit mode (remove the one installed during docker image build) working-directory: /transformers @@ -223,7 +228,7 @@ jobs: steps: - name: Update clone working-directory: /transformers - run: git fetch && git checkout ${{ github.sha }} + run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }} - name: Reinstall transformers in edit mode (remove the one installed during docker image build) working-directory: /transformers @@ -292,7 +297,7 @@ jobs: steps: - name: Update clone working-directory: ${{ inputs.working-directory-prefix }}/transformers - run: git fetch && git checkout ${{ github.sha }} + run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }} - name: Reinstall transformers in edit mode (remove the one installed during docker image build) working-directory: ${{ inputs.working-directory-prefix }}/transformers @@ -400,7 +405,7 @@ jobs: - name: Update clone working-directory: /transformers - run: git fetch && git checkout ${{ github.sha }} + run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }} - name: Reinstall transformers in edit mode (remove the one installed during docker image build) working-directory: /transformers @@ -464,6 +469,7 @@ jobs: uses: actions/checkout@v4 with: fetch-depth: 2 + ref: ${{ inputs.commit_sha || github.sha }} - name: Install transformers run: pip install transformers @@ -518,6 +524,7 @@ jobs: quantization_matrix: ${{ needs.setup.outputs.quantization_matrix }} ci_event: ${{ inputs.ci_event }} report_repo_id: ${{ inputs.report_repo_id }} + commit_sha: ${{ inputs.commit_sha || github.sha }} secrets: inherit @@ -528,7 +535,7 @@ jobs: uses: ./.github/workflows/check_failed_tests.yml with: docker: ${{ inputs.docker }} - start_sha: ${{ github.sha }} + start_sha: ${{ inputs.commit_sha || github.sha }} job: ${{ inputs.job }} slack_report_channel: ${{ inputs.slack_report_channel }} ci_event: ${{ inputs.ci_event }} diff --git a/.github/workflows/slack-report.yml b/.github/workflows/slack-report.yml index 5ef749469645..88da4e38d4dc 100644 --- a/.github/workflows/slack-report.yml +++ b/.github/workflows/slack-report.yml @@ -24,6 +24,10 @@ on: report_repo_id: required: true type: string + commit_sha: + required: false + type: string + env: TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }} @@ -41,6 +45,10 @@ jobs: echo "Setup status: ${{ inputs.setup_status }}" - uses: actions/checkout@v4 + with: + fetch-depth: 2 + ref: ${{ inputs.commit_sha || github.sha }} + - uses: actions/download-artifact@v4 - name: Prepare some setup values @@ -67,7 +75,7 @@ jobs: SLACK_REPORT_CHANNEL: ${{ inputs.slack_report_channel }} ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }} CI_EVENT: ${{ inputs.ci_event }} - CI_SHA: ${{ github.sha }} + CI_SHA: ${{ inputs.commit_sha || github.sha }} CI_TEST_JOB: ${{ inputs.job }} SETUP_STATUS: ${{ inputs.setup_status }} REPORT_REPO_ID: ${{ inputs.report_repo_id }} diff --git a/utils/notification_service.py b/utils/notification_service.py index 33abd1790c71..b6d4e8b84c72 100644 --- a/utils/notification_service.py +++ b/utils/notification_service.py @@ -669,7 +669,7 @@ def payload(self) -> str: "text": { "type": "mrkdwn", # TODO: We should NOT assume it's always Nvidia CI, but it's the case at this moment. - "text": f"*There are {nb_new_failed_tests} failed tests unique to {'this run' if not is_amd_daily_ci_workflow else 'AMD'}*\n\n(compared to Nvidia CI: )", + "text": f"*There are {nb_new_failed_tests} failed tests unique to this run*\n\n(compared to{' Nvidia CI ' if is_scheduled_ci_run else ' '}run: )", }, "accessory": { "type": "button", @@ -1406,13 +1406,13 @@ def pop_default(l: list[Any], i: int, default: Any) -> Any: is_scheduled_ci_run = os.environ.get("GITHUB_EVENT_NAME") == "schedule" # For AMD workflow runs: the different AMD CI callers (MI210/MI250/MI300, etc.) are triggered by `workflow_run` # event of `.github/workflows/self-scheduled-amd-caller.yml`. - if is_amd_daily_ci_workflow: + if os.environ.get("GITHUB_EVENT_NAME") == "workflow_run": # Get the path to the file on the runner that contains the full event webhook payload. event_payload_path = os.environ.get("GITHUB_EVENT_PATH") # Load the event payload with open(event_payload_path) as fp: event_payload = json.load(fp) - # The event that triggers the `workflow_run` event. + # The event that triggers the original `workflow_run`. if "workflow_run" in event_payload: is_scheduled_ci_run = event_payload["workflow_run"]["event"] == "schedule" From c2167ecc970f04ee192c392351c10085f0dd7d9b Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Wed, 20 Aug 2025 15:59:28 +0200 Subject: [PATCH 2/2] Apply suggestions from code review Co-authored-by: ivarflakstad <69173633+ivarflakstad@users.noreply.github.com> --- .github/workflows/self-nightly-caller.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/self-nightly-caller.yml b/.github/workflows/self-nightly-caller.yml index 97fda8841fc4..6192c8039fc9 100644 --- a/.github/workflows/self-nightly-caller.yml +++ b/.github/workflows/self-nightly-caller.yml @@ -20,7 +20,7 @@ jobs: model-ci: name: Model CI - needs: [build_nightly_torch_ci_images] + needs: build_nightly_torch_ci_images uses: ./.github/workflows/self-scheduled.yml with: job: run_models_gpu