Merge pull request bigscience-workshop#18 from OpenGPTX/feature/add_k8s_action_runner

Feature/add k8s action runner
mali-git committed Sep 19, 2022
2 parents 688337e + 78e7875 commit d89b00e
Showing 6 changed files with 130 additions and 223 deletions.
209 changes: 15 additions & 194 deletions .github/workflows/main.yml
@@ -7,205 +7,26 @@ on:
- "**.py"

jobs:

# GPU sizes and types that we could use:
# g4dn.12xlarge 4x 16GB T4 (CC 7.5) (low availability)
# p3.8xlarge 4x 16GB V100 (CC 7.0) (very low availability)

# Unfit:
# g3.16xlarge 4x 8GB Tesla M60 (CC 5.2) (not supported by cuda-11)
# p2.8xlarge 8x 12GB K80 (CC 3.7 not supported by cuda-11)

start-runner:
name: Start self-hosted EC2 runner
runs-on: ubuntu-latest
outputs:
label: ${{ steps.start-ec2-runner.outputs.label }}
ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
steps:
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v1
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: us-east-1
# don't use the following subnets as p3.8xlarge is not supported there:
# - subnet-06576a4b # us-east-1d
# - subnet-859322b4 # us-east-1e
# - subnet-47cfad21 # us-east-1b
- name: Try to start EC2 runner (a)
id: try-us-east-1a
uses: machulav/ec2-github-runner@v2
continue-on-error: true
with:
mode: start
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
ec2-image-id: ami-0ad997818d90480f2
ec2-instance-type: g4dn.12xlarge
security-group-id: sg-f2a4e2fc
subnet-id: subnet-b7533b96 # us-east-1c
aws-resource-tags: > # optional, requires additional permissions
[
{"Key": "Name", "Value": "ec2-github-runner"},
{"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
]
- name: Try to start EC2 runner (b)
id: try-us-east-1b
if: steps.try-us-east-1a.outcome == 'failure'
uses: machulav/ec2-github-runner@v2
continue-on-error: true
with:
mode: start
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
ec2-image-id: ami-0ad997818d90480f2
ec2-instance-type: g4dn.12xlarge
security-group-id: sg-f2a4e2fc
subnet-id: subnet-a396b2ad # us-east-1f
aws-resource-tags: > # optional, requires additional permissions
[
{"Key": "Name", "Value": "ec2-github-runner"},
{"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
]
- name: Try to start EC2 runner (c)
id: try-us-east-1c
if: steps.try-us-east-1b.outcome == 'failure'
uses: machulav/ec2-github-runner@v2
continue-on-error: true
with:
mode: start
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
ec2-image-id: ami-0ad997818d90480f2
ec2-instance-type: g4dn.12xlarge
security-group-id: sg-f2a4e2fc
subnet-id: subnet-df0f6180 # us-east-1a
aws-resource-tags: > # optional, requires additional permissions
[
{"Key": "Name", "Value": "ec2-github-runner"},
{"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
]
- name: Try to start EC2 runner (a-2)
id: try-us-east-1a-2
if: steps.try-us-east-1c.outcome == 'failure'
uses: machulav/ec2-github-runner@v2
continue-on-error: true
with:
mode: start
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
ec2-image-id: ami-0ad997818d90480f2
ec2-instance-type: p3.8xlarge
security-group-id: sg-f2a4e2fc
subnet-id: subnet-b7533b96 # us-east-1c
aws-resource-tags: > # optional, requires additional permissions
[
{"Key": "Name", "Value": "ec2-github-runner"},
{"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
]
- name: Try to start EC2 runner (b-2)
id: try-us-east-1b-2
if: steps.try-us-east-1a-2.outcome == 'failure'
uses: machulav/ec2-github-runner@v2
continue-on-error: true
with:
mode: start
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
ec2-image-id: ami-0ad997818d90480f2
ec2-instance-type: p3.8xlarge
security-group-id: sg-f2a4e2fc
subnet-id: subnet-a396b2ad # us-east-1f
aws-resource-tags: > # optional, requires additional permissions
[
{"Key": "Name", "Value": "ec2-github-runner"},
{"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
]
- name: Try to start EC2 runner (c-2)
id: try-us-east-1c-2
if: steps.try-us-east-1b-2.outcome == 'failure'
uses: machulav/ec2-github-runner@v2
with:
mode: start
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
ec2-image-id: ami-0ad997818d90480f2
ec2-instance-type: p3.8xlarge
security-group-id: sg-f2a4e2fc
subnet-id: subnet-df0f6180 # us-east-1a
aws-resource-tags: > # optional, requires additional permissions
[
{"Key": "Name", "Value": "ec2-github-runner"},
{"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
]
- name: See if any of 3 sub-regions had the resource
id: start-ec2-runner
run: |
if [ "${{ steps.try-us-east-1a.outcome }}" = "success" ]; then
echo "::set-output name=label::${{ steps.try-us-east-1a.outputs.label }}"
echo "::set-output name=ec2-instance-id::${{ steps.try-us-east-1a.outputs.ec2-instance-id }}"
fi
if [ "${{ steps.try-us-east-1b.outcome }}" = "success" ]; then
echo "::set-output name=label::${{ steps.try-us-east-1b.outputs.label }}"
echo "::set-output name=ec2-instance-id::${{ steps.try-us-east-1b.outputs.ec2-instance-id }}"
fi
if [ "${{ steps.try-us-east-1c.outcome }}" = "success" ]; then
echo "::set-output name=label::${{ steps.try-us-east-1c.outputs.label }}"
echo "::set-output name=ec2-instance-id::${{ steps.try-us-east-1c.outputs.ec2-instance-id }}"
fi
if [ "${{ steps.try-us-east-1a-2.outcome }}" = "success" ]; then
echo "::set-output name=label::${{ steps.try-us-east-1a-2.outputs.label }}"
echo "::set-output name=ec2-instance-id::${{ steps.try-us-east-1a-2.outputs.ec2-instance-id }}"
fi
if [ "${{ steps.try-us-east-1b-2.outcome }}" = "success" ]; then
echo "::set-output name=label::${{ steps.try-us-east-1b-2.outputs.label }}"
echo "::set-output name=ec2-instance-id::${{ steps.try-us-east-1b-2.outputs.ec2-instance-id }}"
fi
if [ "${{ steps.try-us-east-1c-2.outcome }}" = "success" ]; then
echo "::set-output name=label::${{ steps.try-us-east-1c-2.outputs.label }}"
echo "::set-output name=ec2-instance-id::${{ steps.try-us-east-1c-2.outputs.ec2-instance-id }}"
fi
do-the-job:
name: Do the job on the runner
needs: start-runner # required to start the main job when the runner is ready
# need to figure out how to cancel the previous build if a new push was made while the old test is still running
# concurrency: # cancel previous build on a new push
# group: ${{ github.ref }} # https://docs.github.com/en/actions/reference/context-and-expression-syntax-for-github-actions#github-context
# cancel-in-progress: true
runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
runs-on: self-hosted
steps:
- name: NVIDIA-SMI
run: nvidia-smi

- name: Checkout
uses: actions/checkout@v2

- name: Install Dependencies
uses: actions/checkout@v3

# - name: Install Dependencies
# run: |
# pip install --upgrade pip
# pip install -r requirements.txt
# pip install pytest-timeout
#
# - name: Run tests
# run: pytest --timeout=600 tests

- name: Full check
run: |
pip install --upgrade pip
pip install -r requirements.txt
pip install pytest-timeout
- name: Run tests
run: pytest --timeout=600 tests

stop-runner:
name: Stop self-hosted EC2 runner
needs:
- start-runner # required to get output from the start-runner job
- do-the-job # required to wait when the main job is done
runs-on: ubuntu-latest
if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs
steps:
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v1
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: us-east-1
- name: Stop EC2 runner
uses: machulav/ec2-github-runner@v2
with:
mode: stop
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
label: ${{ needs.start-runner.outputs.label }}
ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
export PATH=/opt/conda/lib/python3.8/site-packages/torch_tensorrt/bin:/opt/conda/bin:/usr/local/mpi/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/ucx/bin:/opt/tensorrt/bin
pytest tests/test_training_debug.py
29 changes: 0 additions & 29 deletions .github/workflows/test_runner.yml

This file was deleted.

21 changes: 21 additions & 0 deletions k8s/README.md
@@ -0,0 +1,21 @@
# Setup and configuration

Runners are controlled and spawned only by the [Actions-Runner-Controller](https://github.com/actions-runner-controller/actions-runner-controller) (ARC), so they will not show up in GitHub's Runners settings while idle.

The main backend software can only be installed by the cluster's admins.
However, users in the `project-ns-opengptx` namespace can configure the controller using normal k8s deployment YAML, as described in the section below.

Authentication for the runners is done via a GitHub App, as described in the ARC repo.
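
For reference, ARC's documentation registers the GitHub App credentials as a Kubernetes secret read by the controller; a minimal sketch (cluster admins only, with the app ID, installation ID, and key path as placeholders) looks like:

```bash
# Sketch of the GitHub-App secret ARC's controller expects -- see the ARC repo
# for the authoritative steps; all values below are placeholders.
kubectl create secret generic controller-manager \
  --namespace actions-runner-controller \
  --from-literal=github_app_id=${APP_ID} \
  --from-literal=github_app_installation_id=${INSTALLATION_ID} \
  --from-file=github_app_private_key=${PRIVATE_KEY_FILE_PATH}
```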


# Deployment files for running GitHub Actions on the k8s cluster

`arc_runner_deployment.yaml` deploys runners managed by [Actions-Runner-Controller](https://github.com/actions-runner-controller/actions-runner-controller) (ARC). These runners are only created when needed, so they do not permanently block resources on the cluster.

`unmanaged_runner_deployment.yaml` is the simplest way to deploy a runner.
However, this is not recommended for runners with GPU access, because such runners permanently occupy GPUs on the cluster.

To deploy a runner:
```bash
kubectl apply -f arc_runner_deployment.yaml
```
```
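
As a sanity check (a hedged suggestion, assuming the `project-ns-opengptx` namespace and resource names from the manifests in this directory), the ARC resources and any spawned runner pods can be inspected with:

```bash
# List the runner deployment, autoscaler, and runner pods created by ARC.
kubectl -n project-ns-opengptx get runnerdeployments,horizontalrunnerautoscalers,runners
kubectl -n project-ns-opengptx get pods
```
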
55 changes: 55 additions & 0 deletions k8s/arc_runner_deployment.yaml
@@ -0,0 +1,55 @@
# runnerdeployment.yaml
apiVersion: actions.summerwind.dev/v1alpha1
kind: RunnerDeployment
metadata:
name: action-runner-deployment
namespace: project-ns-opengptx
spec:
replicas: 0
template:
spec:
repository: OpenGPTX/bigscience_megatron_deepspeed
ephemeral: true
dockerEnabled: false

env:
- name: RUNNER_ASSETS_DIR
value: "/actions-runner"

image: hub.cc-asp.fraunhofer.de/dockerhub_proxy_cache/malteos/obmd:22.08-py3-runner
imagePullPolicy: Always
resources:
requests:
cpu: 3 #<-- same value for requests and limits
memory: "10Gi"
nvidia.com/gpu: 1 #Assign the same values for GPU requests and limits.
limits:
cpu: 3 #<-- same value for requests and limits
memory: "10Gi"
nvidia.com/gpu: 1 #Assign integer values. GPU has no fraction values.

tolerations:
- key: "nvidia.com"
operator: "Equal"
value: "a100"
effect: "NoSchedule"

---
apiVersion: actions.summerwind.dev/v1alpha1
kind: HorizontalRunnerAutoscaler
metadata:
name: example-runner-autoscaler
namespace: project-ns-opengptx
spec:
minReplicas: 0
maxReplicas: 1

scaleDownDelaySecondsAfterScaleOut: 120
scaleTargetRef:
kind: RunnerDeployment
name: action-runner-deployment

metrics:
- type: TotalNumberOfQueuedAndInProgressWorkflowRuns
repositoryNames:
- OpenGPTX/bigscience_megatron_deepspeed
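
A hedged usage note (not part of the committed manifest): the autoscaler above scales the RunnerDeployment between 0 and 1 replicas based on queued and in-progress workflow runs for the listed repository, and keeps a scaled-out runner for 120 s before scaling back down. Assuming the names from the manifest, its behaviour can be watched with:

```bash
# Watch the autoscaler and the runner objects while a workflow is queued.
kubectl -n project-ns-opengptx get horizontalrunnerautoscalers -w
kubectl -n project-ns-opengptx get runners -w
```
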
39 changes: 39 additions & 0 deletions k8s/unmanaged_runner_deployment.yaml
@@ -0,0 +1,39 @@
apiVersion: v1
kind: Pod
metadata:
name: unmanaged-github-runner
namespace: project-ns-opengptx
spec:
containers:
- name: runner-container
image: hub.cc-asp.fraunhofer.de/dockerhub_proxy_cache/library/nginx:latest #<-- Your docker image
env:
- name: REPO_URL
value: "https://github.com/OpenGPTX/bigscience_megatron_deepspeed"
- name: REG_TOKEN
value: "ADMDER2UAJZ57H57SOUUXXXXXXXXX" # Setting > Actions > Runners > New self-hosted runner

command: ["/bin/sh", "-c"]
args: # <-- Commands to run in the container; overrides the image's default entrypoint
- useradd -m runner && cd /home/runner;
mkdir actions-runner && cd actions-runner;
curl -o actions-runner-linux-x64-2.295.0.tar.gz -L https://github.com/actions/runner/releases/download/v2.295.0/actions-runner-linux-x64-2.295.0.tar.gz;
echo "a80c1ab58be3cd4920ac2e51948723af33c2248b434a8a20bd9b3891ca4000b6 actions-runner-linux-x64-2.295.0.tar.gz" | shasum -a 256 -c;
tar xzf ./actions-runner-linux-x64-2.295.0.tar.gz;
su runner -c "cd /home/runner/actions-runner && ./config.sh --unattended --ephemeral --url $(REPO_URL) --token $(REG_TOKEN); ./run.sh"

resources:
requests:
cpu: 3 #<-- same value for requests and limits
memory: "5Gi"
nvidia.com/gpu: 1 #Assign the same values for GPU requests and limits.
limits:
cpu: 3 #<-- same value for requests and limits
memory: "5Gi"
nvidia.com/gpu: 1 #Assign integer values. GPU has no fraction values.

tolerations:
- key: "nvidia.com"
operator: "Equal"
value: "a100"
effect: "NoSchedule"
Empty file added tests/__init__.py
