Run torchbench on CI (#373)

* Run torchbench on CI * Run torchbench on CI #2 * Run torchbench on CI #3 * Run torchbench on CI #4 * Run torchbench on CI #5 * Run torchbench on CI #6 * Run torchbench on CI #7 * Change #1 * Change #2 * Run torchbench on CI #8 * Run torchbench on CI #9 * Install dep * Change #11 * Change #12 * Change #13 * Change #14 * Change #15 * more fixes * more fixes * more fixes * more fixes * more fixes * more fixes * more fixes * More fixes * Update dependencies * other fixes * more fixes * more fixes * more fixes * more fixes #2 * more fixes #2 * more fixes #2 * more fixes * other fixes * other fixes * other fixes * more fixes * more fixes * more fixes * more fixes * more fixes * more fixes * additional changes * other fixes * other fixes * additional changes * other fixes * other fixes * other fixes * other fixes * other fixes * Address PR comments * more fixes * more fixes * more fixes * more fixes * more fixes * more fixes * more fixes * more fixes * more fixes * more fixes * more fixes * more fixes * more fixes * more fixes * more fixes * more fixes * more fixes * more fixes * more fixes * more fixes * more fixes * more fixes * more fixes
kulinseth · Mar 29, 2023 · 956c9c5 · 956c9c5
1 parent 2c6d325
commit 956c9c5
Show file tree

Hide file tree

Showing 2 changed files with 162 additions and 23 deletions.
diff --git a/.github/scripts/run_torchbench.py b/.github/scripts/run_torchbench.py
@@ -1,14 +1,10 @@
 """
-Generate a torchbench test report from a file containing the PR body.
+Generate a torchbench test report for the benchmarks listed in GLOBAL_PR_LIST.
 Currently, only supports running tests on specified model names
 
 Testing environment:
-- Intel Xeon 8259CL @ 2.50 GHz, 24 Cores with disabled Turbo and HT
-- Nvidia Tesla T4
-- Nvidia Driver 470.82.01
-- Python 3.8
-- CUDA 11.3
 """
+
 # Known issues:
 # 1. Does not reuse the build artifact in other CI workflows
 # 2. CI jobs are serialized because there is only one worker
@@ -22,6 +18,46 @@
 
 from typing import List, Tuple
 
+# Benchmark test IDs that every run executes. extract_models_from_pr() copies this
+# list into its model list as-is; the names are not checked against TorchBench.
+GLOBAL_PR_LIST = [
+    "test_train[alexnet-mps-eager]",
+    "test_train[dcgan-mps-eager]",
+    "test_train[hf_Bert-mps-eager]",
+    "test_train[mnasnet1_0-mps-eager]",
+    "test_train[mobilenet_v2-mps-eager]",
+    "test_train[pytorch_unet-mps-eager]",
+    "test_train[resnet18-mps-eager]",
+    "test_train[resnet50-mps-eager]",
+    "test_train[resnext50_32x4d-mps-eager]",
+    "test_train[shufflenet_v2_x1_0-mps-eager]",
+    "test_train[timm_efficientnet-mps-eager]",
+    "test_train[timm_nfnet-mps-eager]",
+    "test_train[timm_regnet-mps-eager]",
+    "test_train[timm_resnest-mps-eager]",
+    "test_train[timm_vision_transformer-mps-eager]",
+    "test_train[timm_vovnet-mps-eager]",
+    "test_train[soft_actor_critic-mps-eager]",
+    "test_train[hf_DistilBert-mps-eager]",
+    "test_train[hf_Bart-mps-eager]",
+    "test_train[hf_Albert-mps-eager]",
+    "test_train[hf_GPT2-mps-eager]",
+    "test_train[lennard_jones-mps-eager]",
+    "test_train[pytorch_stargan-mps-eager]",
+    "test_train[pytorch_struct-mps-eager]",
+    "test_train[timm_vision_transformer_large-mps-eager]",
+    "test_train[functorch_dp_cifar10-mps-eager]",
+    "test_train[squeezenet1_1-mps-eager]",
+    "test_train[hf_T5_base-mps-eager]",
+    "test_train[hf_T5_large-mps-eager]",
+    "test_train[densenet121-mps-eager]",
+    "test_train[phlippe_resnet-mps-eager]",
+    "test_train[phlippe_densenet-mps-eager]",
+    "test_train[tts_angular-mps-eager]",
+    "test_train[DALLE2_pytorch-mps-eager]",
+    "test_train[functorch_maml_omniglot-mps-eager]",
+    "test_train[demucs-mps-eager]",
+    "test_train[vgg16-mps-eager]"
+]
+
 TORCHBENCH_CONFIG_NAME = "config.yaml"
 TORCHBENCH_USERBENCHMARK_CONFIG_NAME = "ub-config.yaml"
 MAGIC_PREFIX = "RUN_TORCHBENCH:"
@@ -78,7 +114,14 @@ def find_current_branch(repo_path: str) -> str:
 
 def deploy_torchbench_config(output_dir: str, config: str, config_name: str = TORCHBENCH_CONFIG_NAME) -> None:
     # Create test dir if needed
-    pathlib.Path(output_dir).mkdir(exist_ok=True)
+    # pathlib.Path(output_dir).mkdir(exist_ok=True)
+    try:
+        pathlib.Path(output_dir).mkdir(parents=True, exist_ok=False)
+    except FileExistsError:
+        print("Folder is already there")
+    else:
+        print("Folder was created")
+
     # TorchBench config file name
     config_path = os.path.join(output_dir, config_name)
     with open(config_path, "w") as fp:
@@ -100,23 +143,9 @@ def is_valid_ub_dir(ub_path: str) -> bool:
 def extract_models_from_pr(torchbench_path: str, prbody_file: str) -> Tuple[List[str], List[str]]:
     model_list = []
     userbenchmark_list = []
-    pr_list = []
-    with open(prbody_file, "r") as pf:
-        lines = map(lambda x: x.strip(), pf.read().splitlines())
-        magic_lines = list(filter(lambda x: x.startswith(MAGIC_PREFIX), lines))
-        if magic_lines:
-            # Only the first magic line will be recognized.
-            pr_list = list(map(lambda x: x.strip(), magic_lines[0][len(MAGIC_PREFIX):].split(",")))
-    valid_models = get_valid_models(torchbench_path)
-    valid_ubs = get_valid_userbenchmarks(torchbench_path)
+    pr_list = GLOBAL_PR_LIST
     for pr_bm in pr_list:
-        if pr_bm in valid_models or pr_bm == "ALL":
-            model_list.append(pr_bm)
-        elif pr_bm in valid_ubs:
-            userbenchmark_list.append(pr_bm)
-        else:
-            print(f"The model or benchmark {pr_bm} you specified does not exist in TorchBench suite. Please double check.")
-            exit(-1)
+        model_list.append(pr_bm)
     # Shortcut: if pr_list is ["ALL"], run all the model tests
     if "ALL" in model_list:
         model_list = ["ALL"]

diff --git a/.github/workflows/run_torchbench.yml b/.github/workflows/run_torchbench.yml
@@ -0,0 +1,110 @@
+# Runs .github/scripts/run_torchbench.py against a TorchBench checkout for pull
+# requests that carry the `ciflow/torchbench` label, then uploads the results.
+name: TorchBench CI
+on:
+  pull_request:
+    types: [labeled, opened, synchronize, reopened]
+  workflow_dispatch:
+
+env:
+  PYTHON_VERSION: "3.9"
+  # NOTE(review): the comment below no longer sits next to the setting it describes.
+  # Confirm which pinned version it refers to, or remove it:
+  # must be consistent with https://github.com/pytorch/benchmark/blob/main/requirements.txt#L19
+  PR_NUM: ${{ github.event.number }}
+  PR_BODY: ${{ github.event.pull_request.body }}
+  PR_BASE_SHA: ${{ github.event.pull_request.base.sha }}
+  PR_HEAD_SHA: ${{ github.event.pull_request.head.sha }}
+  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }}
+  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }}
+
+jobs:
+  run-torchbench:
+    runs-on: [macos-torchbench]
+    # Set to 6 hours
+    timeout-minutes: 360
+    # NOTE(review): workflow_dispatch events carry no pull_request payload, so this
+    # condition appears to be false for manual runs and the job is skipped - confirm intended.
+    if: contains(github.event.pull_request.labels.*.name, 'ciflow/torchbench')
+    steps:
+
+      - name: Clean up disk space before running MacOS workflow
+        uses: pytorch/test-infra/.github/actions/check-disk-space@main
+
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+
+      - name: Setup miniconda
+        uses: pytorch/test-infra/.github/actions/setup-miniconda@main
+        with:
+          # Reuse the workflow-level value: a bare 3.9 is parsed by YAML as a float,
+          # which would silently turn a future "3.10" into 3.1.
+          python-version: ${{ env.PYTHON_VERSION }}
+          environment-file: .github/requirements/conda-env-${{ runner.os }}-${{ runner.arch }}
+
+      - name: Create conda environment and install deps
+        env:
+          ENV_NAME: conda-test-env-${{ github.run_id }}
+          PY_VERS: 3.9
+        shell: arch -arch arm64 bash {0}
+        run: |
+          # shellcheck disable=SC1090
+          set -ex
+          ${CONDA_RUN} pip install boto3
+          ${CONDA_RUN} conda install -y pytest tabulate gitpython git-lfs tqdm psutil
+          ${CONDA_RUN} conda install -yq -c conda-forge spacy sentencepiece transformers
+          ${CONDA_RUN} pip3 install --pre torch torchtext torchvision torchaudio torchdata --extra-index-url https://download.pytorch.org/whl/nightly/cpu
+
+      - name: Setup TorchBench branch
+        env:
+          ENV_NAME: conda-test-env-${{ github.run_id }}
+          PY_VERS: 3.9
+        shell: arch -arch arm64 bash {0}
+        run: |
+          # shellcheck disable=SC1090
+          set -ex
+          PR_BODY_FILE=/tmp/pr-body.txt
+          echo "$PR_BODY" > ${PR_BODY_FILE}
+          ${CONDA_RUN} python3 .github/scripts/run_torchbench.py --pr-body "${PR_BODY_FILE}" set-torchbench-branch
+
+      - name: Checkout TorchBench
+        uses: malfet/checkout@silent-checkout
+        with:
+          repository: razarmehr/benchmark
+          path: benchmark
+          lfs: false
+          ref: ${{ env.TORCHBENCH_BRANCH }}
+
+      - name: Run TorchBench
+        env:
+          ENV_NAME: conda-test-env-${{ github.run_id }}
+          PY_VERS: 3.9
+        shell: arch -arch arm64 bash {0}
+        run: |
+          # shellcheck disable=SC1090
+          set -x
+          # pushd pytorch
+          # NOTE(review): despite the name, this is the current tip of origin/master,
+          # not `git merge-base`; PR_BASE_SHA is defined above but not referenced in this workflow.
+          PR_MERGE_BASE=$(git rev-parse origin/master)
+          # popd
+          PR_BODY_FILE=/tmp/pr-body.txt
+          echo "$PR_BODY" > ${PR_BODY_FILE}
+          # shellcheck source=/dev/null
+          ${CONDA_RUN} python3 .github/scripts/run_torchbench.py \
+                  --pr-body "$PR_BODY_FILE" \
+                  run \
+                  --pytorch-path "${PWD}" \
+                  --torchbench-path "${PWD}"/benchmark \
+                  --pr-num "$PR_NUM" \
+                  --pr-base-sha "$PR_MERGE_BASE" \
+                  --pr-head-sha "$PR_HEAD_SHA"
+
+      - name: Remove conda environment and cleanup
+        # Run even when an earlier step failed so the PR body does not linger in /tmp
+        # on the runner; -f keeps the step green if the file was never written.
+        if: always()
+        run: |
+          rm -f /tmp/pr-body.txt
+
+      - name: Upload artifact
+        uses: actions/upload-artifact@v3
+        with:
+          name: TorchBench result
+          path: ~/.torchbench/bisection/pr${{ github.event.number }}
+
+      - name: Clean up disk space
+        if: always()
+        continue-on-error: true
+        uses: pytorch/test-infra/.github/actions/check-disk-space@main
+
+# One run per PR (or per commit SHA for non-PR events); a newer run cancels the older one.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true