Merge pull request bigscience-workshop#18 from OpenGPTX/feature/add_k8s_action_runner

Feature/add k8s action runner
mali-git committed Sep 19, 2022
2 parents 688337e + 78e7875 commit d89b00e
Showing 6 changed files with 130 additions and 223 deletions.
209 changes: 15 additions & 194 deletions .github/workflows/main.yml
@@ -7,205 +7,26 @@ on:
- "**.py"

jobs:

# GPU sizes and types that we could use:
# g4dn.12xlarge 4x 16GB T4 (CC 7.5) (low availability)
# p3.8xlarge 4x 16GB V100 (CC 7.0) (very low availability)

# Unfit:
# g3.16xlarge 4x 8GB Tesla M60 (CC 5.2) (not supported by cuda-11)
# p2.8xlarge 8x 12GB K80 (CC 3.7 not supported by cuda-11)

start-runner:
name: Start self-hosted EC2 runner
runs-on: ubuntu-latest
outputs:
label: ${{ steps.start-ec2-runner.outputs.label }}
ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
steps:
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v1
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: us-east-1
# don't use the following subnets as p3.8xlarge is not supported there:
# - subnet-06576a4b # us-east-1d
# - subnet-859322b4 # us-east-1e
# - subnet-47cfad21 # us-east-1b
- name: Try to start EC2 runner (a)
id: try-us-east-1a
uses: machulav/ec2-github-runner@v2
continue-on-error: true
with:
mode: start
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
ec2-image-id: ami-0ad997818d90480f2
ec2-instance-type: g4dn.12xlarge
security-group-id: sg-f2a4e2fc
subnet-id: subnet-b7533b96 # us-east-1c
aws-resource-tags: > # optional, requires additional permissions
[
{"Key": "Name", "Value": "ec2-github-runner"},
{"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
]
- name: Try to start EC2 runner (b)
id: try-us-east-1b
if: steps.try-us-east-1a.outcome == 'failure'
uses: machulav/ec2-github-runner@v2
continue-on-error: true
with:
mode: start
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
ec2-image-id: ami-0ad997818d90480f2
ec2-instance-type: g4dn.12xlarge
security-group-id: sg-f2a4e2fc
subnet-id: subnet-a396b2ad # us-east-1f
aws-resource-tags: > # optional, requires additional permissions
[
{"Key": "Name", "Value": "ec2-github-runner"},
{"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
]
- name: Try to start EC2 runner (c)
id: try-us-east-1c
if: steps.try-us-east-1b.outcome == 'failure'
uses: machulav/ec2-github-runner@v2
continue-on-error: true
with:
mode: start
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
ec2-image-id: ami-0ad997818d90480f2
ec2-instance-type: g4dn.12xlarge
security-group-id: sg-f2a4e2fc
subnet-id: subnet-df0f6180 # us-east-1a
aws-resource-tags: > # optional, requires additional permissions
[
{"Key": "Name", "Value": "ec2-github-runner"},
{"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
]
- name: Try to start EC2 runner (a-2)
id: try-us-east-1a-2
if: steps.try-us-east-1c.outcome == 'failure'
uses: machulav/ec2-github-runner@v2
continue-on-error: true
with:
mode: start
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
ec2-image-id: ami-0ad997818d90480f2
ec2-instance-type: p3.8xlarge
security-group-id: sg-f2a4e2fc
subnet-id: subnet-b7533b96 # us-east-1c
aws-resource-tags: > # optional, requires additional permissions
[
{"Key": "Name", "Value": "ec2-github-runner"},
{"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
]
- name: Try to start EC2 runner (b-2)
id: try-us-east-1b-2
if: steps.try-us-east-1a-2.outcome == 'failure'
uses: machulav/ec2-github-runner@v2
continue-on-error: true
with:
mode: start
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
ec2-image-id: ami-0ad997818d90480f2
ec2-instance-type: p3.8xlarge
security-group-id: sg-f2a4e2fc
subnet-id: subnet-a396b2ad # us-east-1f
aws-resource-tags: > # optional, requires additional permissions
[
{"Key": "Name", "Value": "ec2-github-runner"},
{"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
]
- name: Try to start EC2 runner (c-2)
id: try-us-east-1c-2
if: steps.try-us-east-1b-2.outcome == 'failure'
uses: machulav/ec2-github-runner@v2
with:
mode: start
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
ec2-image-id: ami-0ad997818d90480f2
ec2-instance-type: p3.8xlarge
security-group-id: sg-f2a4e2fc
subnet-id: subnet-df0f6180 # us-east-1a
aws-resource-tags: > # optional, requires additional permissions
[
{"Key": "Name", "Value": "ec2-github-runner"},
{"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
]
- name: See if any of 3 sub-regions had the resource
id: start-ec2-runner
run: |
if [ "${{ steps.try-us-east-1a.outcome }}" = "success" ]; then
echo "::set-output name=label::${{ steps.try-us-east-1a.outputs.label }}"
echo "::set-output name=ec2-instance-id::${{ steps.try-us-east-1a.outputs.ec2-instance-id }}"
fi
if [ "${{ steps.try-us-east-1b.outcome }}" = "success" ]; then
echo "::set-output name=label::${{ steps.try-us-east-1b.outputs.label }}"
echo "::set-output name=ec2-instance-id::${{ steps.try-us-east-1b.outputs.ec2-instance-id }}"
fi
if [ "${{ steps.try-us-east-1c.outcome }}" = "success" ]; then
echo "::set-output name=label::${{ steps.try-us-east-1c.outputs.label }}"
echo "::set-output name=ec2-instance-id::${{ steps.try-us-east-1c.outputs.ec2-instance-id }}"
fi
if [ "${{ steps.try-us-east-1a-2.outcome }}" = "success" ]; then
echo "::set-output name=label::${{ steps.try-us-east-1a-2.outputs.label }}"
echo "::set-output name=ec2-instance-id::${{ steps.try-us-east-1a-2.outputs.ec2-instance-id }}"
fi
if [ "${{ steps.try-us-east-1b-2.outcome }}" = "success" ]; then
echo "::set-output name=label::${{ steps.try-us-east-1b-2.outputs.label }}"
echo "::set-output name=ec2-instance-id::${{ steps.try-us-east-1b-2.outputs.ec2-instance-id }}"
fi
if [ "${{ steps.try-us-east-1c-2.outcome }}" = "success" ]; then
echo "::set-output name=label::${{ steps.try-us-east-1c-2.outputs.label }}"
echo "::set-output name=ec2-instance-id::${{ steps.try-us-east-1c-2.outputs.ec2-instance-id }}"
fi
do-the-job:
name: Do the job on the runner
needs: start-runner # required to start the main job when the runner is ready
# need to figure out how to cancel the previous build if a new push was made while the old test is still running
# concurrency: # cancel previous build on a new push
# group: ${{ github.ref }} # https://docs.github.com/en/actions/reference/context-and-expression-syntax-for-github-actions#github-context
# cancel-in-progress: true
runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
runs-on: self-hosted
steps:
- name: NVIDIA-SMI
run: nvidia-smi

- name: Checkout
uses: actions/checkout@v2

- name: Install Dependencies
uses: actions/checkout@v3

# - name: Install Dependencies
# run: |
# pip install --upgrade pip
# pip install -r requirements.txt
# pip install pytest-timeout
#
# - name: Run tests
# run: pytest --timeout=600 tests

- name: Full check
run: |
pip install --upgrade pip
pip install -r requirements.txt
pip install pytest-timeout
- name: Run tests
run: pytest --timeout=600 tests

stop-runner:
name: Stop self-hosted EC2 runner
needs:
- start-runner # required to get output from the start-runner job
- do-the-job # required to wait when the main job is done
runs-on: ubuntu-latest
if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs
steps:
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v1
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: us-east-1
- name: Stop EC2 runner
uses: machulav/ec2-github-runner@v2
with:
mode: stop
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
label: ${{ needs.start-runner.outputs.label }}
ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
export PATH=/opt/conda/lib/python3.8/site-packages/torch_tensorrt/bin:/opt/conda/bin:/usr/local/mpi/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/ucx/bin:/opt/tensorrt/bin
pytest tests/test_training_debug.py
29 changes: 0 additions & 29 deletions .github/workflows/test_runner.yml

This file was deleted.

21 changes: 21 additions & 0 deletions k8s/README.md
@@ -0,0 +1,21 @@
# Setup and configuration

Runners are controlled and spawned only by the [Actions-Runner-Controller](https://github.com/actions-runner-controller/actions-runner-controller) (ARC), so they will not show up in GitHub's Runners settings while idle.

The main backend software can only be installed by the cluster's admins.
However, users in the `project-ns-opengptx` namespace can configure the controller using normal k8s deployment YAML, as described in the section below.

Authentication for the runners is done via a GitHub App, as described in the ARC repo.
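
For reference, ARC's documentation registers the GitHub App credentials as a Kubernetes secret read by the controller; a minimal sketch (cluster admins only, with the app ID, installation ID, and key path as placeholders) looks like:

```bash
# Sketch of the GitHub-App secret ARC's controller expects -- see the ARC repo
# for the authoritative steps; all values below are placeholders.
kubectl create secret generic controller-manager \
  --namespace actions-runner-controller \
  --from-literal=github_app_id=${APP_ID} \
  --from-literal=github_app_installation_id=${INSTALLATION_ID} \
  --from-file=github_app_private_key=${PRIVATE_KEY_FILE_PATH}
```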


# Deployment files for running GitHub Actions on the k8s cluster

`arc_runner_deployment.yaml` deploys runners managed by [Actions-Runner-Controller](https://github.com/actions-runner-controller/actions-runner-controller) (ARC). These runners are only created when needed, so they do not permanently block resources on the cluster.

`unmanaged_runner_deployment.yaml` is the simplest way to deploy a runner.
However, this is not recommended for runners with GPU access, because such runners permanently occupy GPUs on the cluster.

To deploy a runner:
```bash
kubectl apply -f arc_runner_deployment.yaml
```
```
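
As a sanity check (a hedged suggestion, assuming the `project-ns-opengptx` namespace and resource names from the manifests in this directory), the ARC resources and any spawned runner pods can be inspected with:

```bash
# List the runner deployment, autoscaler, and runner pods created by ARC.
kubectl -n project-ns-opengptx get runnerdeployments,horizontalrunnerautoscalers,runners
kubectl -n project-ns-opengptx get pods
```
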
55 changes: 55 additions & 0 deletions k8s/arc_runner_deployment.yaml
@@ -0,0 +1,55 @@
# runnerdeployment.yaml
apiVersion: actions.summerwind.dev/v1alpha1
kind: RunnerDeployment
metadata:
name: action-runner-deployment
namespace: project-ns-opengptx
spec:
replicas: 0
template:
spec:
repository: OpenGPTX/bigscience_megatron_deepspeed
ephemeral: true
dockerEnabled: false

env:
- name: RUNNER_ASSETS_DIR
value: "/actions-runner"

image: hub.cc-asp.fraunhofer.de/dockerhub_proxy_cache/malteos/obmd:22.08-py3-runner
imagePullPolicy: Always
resources:
requests:
cpu: 3 #<-- same value for requests and limits
memory: "10Gi"
nvidia.com/gpu: 1 #Assign the same values for GPU requests and limits.
limits:
cpu: 3 #<-- same value for requests and limits
memory: "10Gi"
nvidia.com/gpu: 1 #Assign integer values. GPU has no fraction values.

tolerations:
- key: "nvidia.com"
operator: "Equal"
value: "a100"
effect: "NoSchedule"

---
apiVersion: actions.summerwind.dev/v1alpha1
kind: HorizontalRunnerAutoscaler
metadata:
name: example-runner-autoscaler
namespace: project-ns-opengptx
spec:
minReplicas: 0
maxReplicas: 1

scaleDownDelaySecondsAfterScaleOut: 120
scaleTargetRef:
kind: RunnerDeployment
name: action-runner-deployment

metrics:
- type: TotalNumberOfQueuedAndInProgressWorkflowRuns
repositoryNames:
- OpenGPTX/bigscience_megatron_deepspeed
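
A hedged usage note (not part of the committed manifest): the autoscaler above scales the RunnerDeployment between 0 and 1 replicas based on queued and in-progress workflow runs for the listed repository, and keeps a scaled-out runner for 120 s before scaling back down. Assuming the names from the manifest, its behaviour can be watched with:

```bash
# Watch the autoscaler and the runner objects while a workflow is queued.
kubectl -n project-ns-opengptx get horizontalrunnerautoscalers -w
kubectl -n project-ns-opengptx get runners -w
```
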
39 changes: 39 additions & 0 deletions k8s/unmanaged_runner_deployment.yaml
@@ -0,0 +1,39 @@
apiVersion: v1
kind: Pod
metadata:
name: unmanaged-github-runner
namespace: project-ns-opengptx
spec:
containers:
- name: runner-container
image: hub.cc-asp.fraunhofer.de/dockerhub_proxy_cache/library/nginx:latest #<-- Your docker image
env:
- name: REPO_URL
value: "https://github.com/OpenGPTX/bigscience_megatron_deepspeed"
- name: REG_TOKEN
value: "ADMDER2UAJZ57H57SOUUXXXXXXXXX" # Setting > Actions > Runners > New self-hosted runner

command: ["/bin/sh", "-c"]
args: # <-- Commands to run in the container; overrides the image's default entrypoint
- useradd -m runner && cd /home/runner;
mkdir actions-runner && cd actions-runner;
curl -o actions-runner-linux-x64-2.295.0.tar.gz -L https://github.com/actions/runner/releases/download/v2.295.0/actions-runner-linux-x64-2.295.0.tar.gz;
echo "a80c1ab58be3cd4920ac2e51948723af33c2248b434a8a20bd9b3891ca4000b6 actions-runner-linux-x64-2.295.0.tar.gz" | shasum -a 256 -c;
tar xzf ./actions-runner-linux-x64-2.295.0.tar.gz;
su runner -c "cd /home/runner/actions-runner && ./config.sh --unattended --ephemeral --url $(REPO_URL) --token $(REG_TOKEN); ./run.sh"

resources:
requests:
cpu: 3 #<-- same value for requests and limits
memory: "5Gi"
nvidia.com/gpu: 1 #Assign the same values for GPU requests and limits.
limits:
cpu: 3 #<-- same value for requests and limits
memory: "5Gi"
nvidia.com/gpu: 1 #Assign integer values. GPU has no fraction values.

tolerations:
- key: "nvidia.com"
operator: "Equal"
value: "a100"
effect: "NoSchedule"
Empty file added tests/__init__.py
