huggingface · ydshieh · Aug 20, 2025 · Aug 20, 2025 · Aug 20, 2025 · ydshieh
diff --git a/.github/workflows/check_failed_tests.yml b/.github/workflows/check_failed_tests.yml
@@ -21,6 +21,9 @@ on:
       report_repo_id:
         required: true
         type: string
+      commit_sha:  # Commit to check out in the "Update clone" step; falls back to github.sha when empty.
+        required: false
+        type: string
 
 
 env:
@@ -87,7 +90,7 @@ jobs:
       - name: Update clone
         working-directory: /transformers
         if: ${{ env.process == 'true' }}
-        run: git fetch && git checkout ${{ github.sha }}
+        run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
 
       - name: Get target commit
         working-directory: /transformers/utils

diff --git a/.github/workflows/model_jobs.yml b/.github/workflows/model_jobs.yml
@@ -18,6 +18,9 @@ on:
       docker:
         required: true
         type: string
+      commit_sha:  # Commit to check out in the "Update clone" step; falls back to github.sha when empty.
+        required: false
+        type: string
       report_name_prefix:
         required: false
         default: run_models_gpu
@@ -70,7 +73,7 @@ jobs:
 
       - name: Update clone
         working-directory: /transformers
-        run: git fetch && git checkout ${{ github.sha }}
+        run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
 
       - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
         working-directory: /transformers

diff --git a/.github/workflows/self-nightly-caller.yml b/.github/workflows/self-nightly-caller.yml
@@ -1,43 +1,32 @@
-name: Self-hosted runner (nightly-ci)
-
+name: Nvidia CI with nightly torch
 
 on:
   repository_dispatch:
-  schedule:
-    - cron: "17 2 * * *"
+  # Triggered when an `Nvidia CI` run on `main` (e.g. the daily scheduled one) completes,
+  # so that both runs test the same commit and their results are easy to compare.
+  workflow_run:
+    workflows: ["Nvidia CI"]
+    branches: ["main"]
+    types: [completed]
   push:
     branches:
-      - run_nightly_ci*
+      - run_ci_with_nightly_torch*
 
 jobs:
-  build_nightly_ci_images:
-    name: Build Nightly CI Docker Images
-    if: (github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_nightly_ci'))
+  build_nightly_torch_ci_images:
+    name: Build CI Docker Images with nightly torch
     uses: ./.github/workflows/build-nightly-ci-docker-images.yml
     secrets: inherit
 
   model-ci:
     name: Model CI
-    needs: [build_nightly_ci_images]
+    needs: build_nightly_torch_ci_images
     uses: ./.github/workflows/self-scheduled.yml
     with:
       job: run_models_gpu
       slack_report_channel: "#transformers-ci-past-future"
-      runner: ci
       docker: huggingface/transformers-all-latest-torch-nightly-gpu
       ci_event: Nightly CI
-    secrets: inherit
-
-  deepspeed-ci:
-    name: DeepSpeed CI
-    needs: [build_nightly_ci_images]
-    uses: ./.github/workflows/self-scheduled.yml
-    with:
-      job: run_torch_cuda_extensions_gpu
-      slack_report_channel: "#transformers-ci-past-future"
-      runner: ci
-      # test deepspeed nightly build with the latest release torch
-      docker: huggingface/transformers-pytorch-deepspeed-latest-gpu
-      ci_event: Nightly CI
-      working-directory-prefix: /workspace
+      report_repo_id: hf-internal-testing/transformers_daily_ci_with_torch_nightly
+      commit_sha: ${{ github.event.workflow_run.head_sha || github.sha }}
     secrets: inherit
diff --git a/.github/workflows/self-scheduled-caller.yml b/.github/workflows/self-scheduled-caller.yml
@@ -1,13 +1,12 @@
-name: Self-hosted runner (scheduled)
-
+name: Nvidia CI
 
 on:
   repository_dispatch:
   schedule:
     - cron: "17 2 * * *"
   push:
     branches:
-      - run_scheduled_ci*
+      - run_nvidia_ci*
   workflow_dispatch:
     inputs:
       prev_workflow_run_id:
@@ -54,6 +53,7 @@ jobs:
       docker: huggingface/transformers-all-latest-gpu
       ci_event: Daily CI
       report_repo_id: hf-internal-testing/transformers_daily_ci
+      commit_sha: ${{ github.sha }}
     secrets: inherit
 
   torch-pipeline:
@@ -65,6 +65,7 @@ jobs:
       docker: huggingface/transformers-pytorch-gpu
       ci_event: Daily CI
       report_repo_id: hf-internal-testing/transformers_daily_ci
+      commit_sha: ${{ github.sha }}
     secrets: inherit
 
   example-ci:
@@ -76,6 +77,7 @@ jobs:
       docker: huggingface/transformers-all-latest-gpu
       ci_event: Daily CI
       report_repo_id: hf-internal-testing/transformers_daily_ci
+      commit_sha: ${{ github.sha }}
     secrets: inherit
 
   trainer-fsdp-ci:
@@ -87,6 +89,7 @@ jobs:
       docker: huggingface/transformers-all-latest-gpu
       ci_event: Daily CI
       report_repo_id: hf-internal-testing/transformers_daily_ci
+      commit_sha: ${{ github.sha }}
     secrets: inherit
 
   deepspeed-ci:
@@ -99,6 +102,7 @@ jobs:
       ci_event: Daily CI
       working-directory-prefix: /workspace
       report_repo_id: hf-internal-testing/transformers_daily_ci
+      commit_sha: ${{ github.sha }}
     secrets: inherit
 
   quantization-ci:
@@ -110,4 +114,5 @@ jobs:
       docker: huggingface/transformers-quantization-latest-gpu
       ci_event: Daily CI
       report_repo_id: hf-internal-testing/transformers_daily_ci
+      commit_sha: ${{ github.sha }}
     secrets: inherit
diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml
@@ -1,4 +1,4 @@
-name: Self-hosted runner (scheduled)
+name: Nvidia CI (job definitions)
 
 # Note that each job's dependencies go into a corresponding docker file.
 #
@@ -28,6 +28,9 @@ on:
       report_repo_id:
         required: true
         type: string
+      commit_sha:  # Commit to check out and report on; falls back to github.sha when empty.
+        required: false
+        type: string
 
 
 env:
@@ -46,8 +49,8 @@ env:
 
 jobs:
   setup:
-    if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu", "run_quantization_torch_gpu"]'), inputs.job)
     name: Setup
+    if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu", "run_quantization_torch_gpu"]'), inputs.job)
     strategy:
       matrix:
         machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
@@ -119,6 +122,7 @@ jobs:
       slice_id: ${{ matrix.slice_id }}
       runner_map: ${{ needs.setup.outputs.runner_map }}
       docker: ${{ inputs.docker }}
+      commit_sha: ${{ inputs.commit_sha || github.sha }}
     secrets: inherit
 
   run_trainer_and_fsdp_gpu:
@@ -137,6 +141,7 @@ jobs:
       slice_id: ${{ matrix.slice_id }}
       runner_map: ${{ needs.setup.outputs.runner_map }}
       docker: ${{ inputs.docker }}
+      commit_sha: ${{ inputs.commit_sha || github.sha }}
       report_name_prefix: run_trainer_and_fsdp_gpu
     secrets: inherit
 
@@ -155,7 +160,7 @@ jobs:
     steps:
       - name: Update clone
         working-directory: /transformers
-        run: git fetch && git checkout ${{ github.sha }}
+        run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
 
       - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
         working-directory: /transformers
@@ -223,7 +228,7 @@ jobs:
     steps:
       - name: Update clone
         working-directory: /transformers
-        run: git fetch && git checkout ${{ github.sha }}
+        run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
 
       - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
         working-directory: /transformers
@@ -292,7 +297,7 @@ jobs:
     steps:
       - name: Update clone
         working-directory: ${{ inputs.working-directory-prefix }}/transformers
-        run: git fetch && git checkout ${{ github.sha }}
+        run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
 
       - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
         working-directory: ${{ inputs.working-directory-prefix }}/transformers
@@ -400,7 +405,7 @@ jobs:
 
       - name: Update clone
         working-directory: /transformers
-        run: git fetch && git checkout ${{ github.sha }}
+        run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
 
       - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
         working-directory: /transformers
@@ -464,6 +469,7 @@ jobs:
         uses: actions/checkout@v4
         with:
           fetch-depth: 2
+          ref: ${{ inputs.commit_sha || github.sha }}
 
       - name: Install transformers
         run: pip install transformers
@@ -518,6 +524,7 @@ jobs:
       quantization_matrix: ${{ needs.setup.outputs.quantization_matrix }}
       ci_event: ${{ inputs.ci_event }}
       report_repo_id: ${{ inputs.report_repo_id }}
+      commit_sha: ${{ inputs.commit_sha || github.sha }}
 
     secrets: inherit
 
@@ -528,7 +535,7 @@ jobs:
     uses: ./.github/workflows/check_failed_tests.yml
     with:
       docker: ${{ inputs.docker }}
-      start_sha: ${{ github.sha }}
+      start_sha: ${{ inputs.commit_sha || github.sha }}
       job: ${{ inputs.job }}
       slack_report_channel: ${{ inputs.slack_report_channel }}
       ci_event: ${{ inputs.ci_event }}

diff --git a/.github/workflows/slack-report.yml b/.github/workflows/slack-report.yml
@@ -24,6 +24,10 @@ on:
       report_repo_id:
         required: true
         type: string
+      commit_sha:  # Commit used for the checkout ref and as CI_SHA; falls back to github.sha when empty.
+        required: false
+        type: string
+
 
 env:
   TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}
@@ -41,6 +45,10 @@ jobs:
           echo "Setup status: ${{ inputs.setup_status }}"
 
       - uses: actions/checkout@v4
+        with:
+          fetch-depth: 2
+          ref: ${{ inputs.commit_sha || github.sha }}
+
       - uses: actions/download-artifact@v4
 
       - name: Prepare some setup values
@@ -67,7 +75,7 @@ jobs:
           SLACK_REPORT_CHANNEL: ${{ inputs.slack_report_channel }}
           ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
           CI_EVENT: ${{ inputs.ci_event }}
-          CI_SHA: ${{ github.sha }}
+          CI_SHA: ${{ inputs.commit_sha || github.sha }}
           CI_TEST_JOB: ${{ inputs.job }}
           SETUP_STATUS: ${{ inputs.setup_status }}
           REPORT_REPO_ID: ${{ inputs.report_repo_id }}

diff --git a/utils/notification_service.py b/utils/notification_service.py
@@ -669,7 +669,7 @@ def payload(self) -> str:
                         "text": {
                             "type": "mrkdwn",
                             # TODO: We should NOT assume it's always Nvidia CI, but it's the case at this moment.
-                            "text": f"*There are {nb_new_failed_tests} failed tests unique to {'this run' if not is_amd_daily_ci_workflow else 'AMD'}*\n\n(compared to Nvidia CI: <https://github.com/huggingface/transformers/actions/runs/{prev_workflow_run_id}|{prev_workflow_run_id}>)",
+                            "text": f"*There are {nb_new_failed_tests} failed tests unique to this run*\n\n(compared to{' Nvidia CI ' if is_scheduled_ci_run else ' '}run: <https://github.com/huggingface/transformers/actions/runs/{prev_workflow_run_id}|{prev_workflow_run_id}>)",
                         },
                         "accessory": {
                             "type": "button",
@@ -1406,13 +1406,13 @@ def pop_default(l: list[Any], i: int, default: Any) -> Any:
     is_scheduled_ci_run = os.environ.get("GITHUB_EVENT_NAME") == "schedule"
     # For AMD workflow runs: the different AMD CI callers (MI210/MI250/MI300, etc.) are triggered by `workflow_run`
     #  event of `.github/workflows/self-scheduled-amd-caller.yml`.
-    if is_amd_daily_ci_workflow:
+    if os.environ.get("GITHUB_EVENT_NAME") == "workflow_run":
         # Get the path to the file on the runner that contains the full event webhook payload.
         event_payload_path = os.environ.get("GITHUB_EVENT_PATH")
         # Load the event payload
         with open(event_payload_path) as fp:
             event_payload = json.load(fp)
-            # The event that triggers the `workflow_run` event.
+            # The event that triggered the upstream workflow run (the run whose completion fired this `workflow_run` event).
             if "workflow_run" in event_payload:
                 is_scheduled_ci_run = event_payload["workflow_run"]["event"] == "schedule"