-
Notifications
You must be signed in to change notification settings - Fork 348
CML cloud runner #108
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
CML cloud runner #108
Changes from all commits
Commits
Show all changes
17 commits
Select commit
Hold shift + click to select a range
20f9773
code wip
DavidGOrtega 168cf68
TIMEOUT_TIMER
DavidGOrtega ab9ec9f
docker entrypoint
DavidGOrtega b74827b
setInterval
DavidGOrtega 1f60175
RUNNER_IDLE_TIMEOUT seconds
DavidGOrtega ddf59cd
gitlab and docker runner
DavidGOrtega aee4949
backslashes
DavidGOrtega 10ad3df
merge master
DavidGOrtega 6ed39f8
remove docker publish test
DavidGOrtega c1a3474
remove dockermachine in docker gpu
DavidGOrtega cc7d542
cml self-hosted runner
DavidGOrtega dd7bb8e
cloud runner
DavidGOrtega 6593bee
deploy tag runner for testing
DavidGOrtega df463db
set back if dockerfile
DavidGOrtega 5a36bf9
docker repos
DavidGOrtega 049f5f7
Merge branch 'master' of https://github.com/iterative/cml into cml-ru…
DavidGOrtega f8cb6cb
bump version
DavidGOrtega File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,170 @@ | ||
| #!/usr/bin/env node | ||
|
|
||
| const { spawn } = require('child_process'); | ||
| const { exec, randid } = require('../src/utils'); | ||
| const { URL } = require('url'); | ||
|
|
||
// Runner settings, read once from the environment. Entries without a default
// (RUNNER_PATH, DOCKER_MACHINE, RUNNER_REPO) are undefined when not set.
// Values that do come from the environment are always strings, so
// RUNNER_IDLE_TIMEOUT is only a number when the default is used.
const {
  RUNNER_PATH,

  DOCKER_MACHINE,
  RUNNER_REPO,
  RUNNER_IDLE_TIMEOUT = 5 * 60,
  RUNNER_LABELS = '',
  RUNNER_NAME = randid(),
  RUNNER_EXECUTOR = 'shell',
  RUNNER_RUNTIME = '',
  RUNNER_IMAGE = 'dvcorg/cml:latest'
} = process.env;

// Fail with an actionable message instead of the opaque "Invalid URL" that
// `new URL(undefined)` would raise.
if (!RUNNER_REPO) {
  throw new Error(
    'RUNNER_REPO is needed to start the runner. Set it to the URL of the repository the runner will serve.'
  );
}

// Split the repository URL into the server origin and the repository path.
const { protocol, host, pathname } = new URL(RUNNER_REPO);
const RUNNER_REPO_ORIGIN = `${protocol}//${host}`;

// Expose the repository path (without its leading slash) under both the
// GitHub and the GitLab variable names, plus the GitLab API base URL.
const REPO_PATH = pathname.substring(1);
process.env.CI_PROJECT_PATH = REPO_PATH;
process.env.GITHUB_REPOSITORY = REPO_PATH;
process.env.CI_API_V4_URL = `${RUNNER_REPO_ORIGIN}/api/v4/`;

// Any origin other than github.com is handled as a GitLab instance.
const IS_GITHUB = RUNNER_REPO_ORIGIN === 'https://github.com';

// Mutable state shared by run() and shutdown().
let TIMEOUT_TIMER = 0; // ticks (one per second) spent without a running job
let JOB_RUNNING = false;
let RUNNER_TOKEN;
let GITLAB_CI_TOKEN;

// Pick the driver that matches the detected forge.
const { get_runner_token, register_runner } = IS_GITHUB
  ? require('../src/github')
  : require('../src/gitlab');
|
|
||
/**
 * Removes the docker machine named by DOCKER_MACHINE, if one is configured.
 *
 * Failures are logged rather than propagated, so the returned promise
 * does not reject because of a failed removal.
 */
const shutdown_docker_machine = async () => {
  console.log('Shutting down docker machine');

  // Nothing to remove when no machine name was provided.
  if (!DOCKER_MACHINE) return;

  try {
    // `echo y` answers the confirmation prompt of `docker-machine rm`.
    const output = await exec(`echo y | docker-machine rm ${DOCKER_MACHINE}`);
    console.log(output);
  } catch (err) {
    console.log(err.message);
  }
};
|
|
||
/**
 * Unregisters the runner, removes the docker machine (if any) and terminates
 * the process.
 *
 * This function is also installed as a process signal handler; in that case
 * Node.js passes the signal name (e.g. 'SIGTERM') as the first argument.
 * A signal name is not a failure, so it must not produce exit code 1.
 *
 * @param {Error|string} [error] - The failure that triggered the shutdown, or
 *   the signal name when invoked as a signal handler.
 * @returns {Promise<void>} Never settles in practice: the process exits with
 *   code 1 when a real error was given or cleanup failed, and 0 otherwise.
 */
const shutdown = async error => {
  // Distinguish "called by a signal" from "called with a failure".
  const is_signal = typeof error === 'string' && /^SIG[A-Z0-9]+$/.test(error);
  const failure = is_signal ? undefined : error;

  try {
    console.log('Unregistering runner');

    try {
      if (IS_GITHUB) {
        console.log(
          await exec(
            `${RUNNER_PATH}/config.sh remove --token "${RUNNER_TOKEN}"`
          )
        );
      } else {
        console.log(await exec(`gitlab-runner verify --delete`));
        console.log(
          await exec(
            `gitlab-runner unregister --url "${RUNNER_REPO_ORIGIN}" --token "${GITLAB_CI_TOKEN}" `
          )
        );
      }
    } catch (err) {
      // Unregistering is best effort (the runner may never have been
      // registered), but report the reason instead of hiding it.
      console.log(
        `Failed to unregister runner: ${(err && err.message) || err}`
      );
    }

    await shutdown_docker_machine();

    // Route the original failure through the handler below so that it is
    // logged and reflected in the exit code.
    if (failure) throw failure;

    return process.exit(0);
  } catch (err) {
    console.error(err);
    return process.exit(1);
  }
};
|
|
||
// Run the shutdown routine when the process is asked to terminate.
['SIGTERM', 'SIGINT', 'SIGQUIT'].forEach(signal => {
  process.on(signal, shutdown);
});
|
|
||
/**
 * Registers a runner against the configured repository, starts it as a child
 * process and shuts everything down once the runner has been idle too long.
 *
 * GitHub: idleness is tracked here by watching the runner's stdout for job
 * start/completion messages and counting idle seconds.
 * GitLab: the runner is started with `--wait-timeout`; shutdown is triggered
 * when its JSON log reports that it has not received a job.
 *
 * @returns {Promise<void>} Resolves once the runner process has been spawned
 *   and the watchers are installed.
 * @throws {Error} When no runner token could be obtained, or when a
 *   non-shell executor is requested for GitHub.
 */
const run = async () => {
  RUNNER_TOKEN = await get_runner_token();
  if (!RUNNER_TOKEN) {
    throw new Error(
      'RUNNER_TOKEN is needed to start the runner. Are you setting a runner?'
    );
  }

  if (IS_GITHUB && RUNNER_EXECUTOR !== 'shell') {
    throw new Error('Github only supports shell executor');
  }

  console.log(`Starting runner with ${RUNNER_EXECUTOR} executor`);

  let command;
  if (IS_GITHUB) {
    console.log('Registering Github runner');
    console.log(
      await exec(
        `${RUNNER_PATH}/config.sh --url "${RUNNER_REPO}" --token "${RUNNER_TOKEN}" --name "${RUNNER_NAME}" --labels "${RUNNER_LABELS}" --work "_work"`
      )
    );

    command = `${RUNNER_PATH}/run.sh`;
  } else {
    console.log('Registering Gitlab runner');
    const runner = await register_runner({
      tags: RUNNER_LABELS,
      token: RUNNER_TOKEN
    });

    // Kept so that shutdown() can unregister this runner later.
    GITLAB_CI_TOKEN = runner.token;

    // Use the origin of RUNNER_REPO rather than a fixed gitlab.com URL so
    // that the runner talks to the same server it is unregistered from.
    command = `gitlab-runner --log-format="json" run-single \
      --url "${RUNNER_REPO_ORIGIN}/" \
      --token "${runner.token}" \
      --executor "${RUNNER_EXECUTOR}" \
      --docker-runtime "${RUNNER_RUNTIME}" \
      --docker-image "${RUNNER_IMAGE}" \
      --wait-timeout ${RUNNER_IDLE_TIMEOUT} \
      --name "${RUNNER_NAME}" \
      --request-concurrency 1 \
      --limit 1`;
  }

  const proc = spawn(command, { shell: true });

  proc.stderr.on('data', data => {
    if (!data) return;

    const text = data.toString('utf8');
    console.log(text);

    if (IS_GITHUB) return;

    // One chunk may carry several JSON log lines, so inspect each line on
    // its own; parsing the whole chunk at once would fail in that case.
    const is_idle = text.split('\n').some(line => {
      try {
        const { msg } = JSON.parse(line);
        return (
          typeof msg === 'string' &&
          msg.includes('runner has not received a job')
        );
      } catch (err) {
        // Not a JSON log line (blank or plain-text output): ignore it.
        return false;
      }
    });

    if (is_idle) shutdown();
  });

  proc.stdout.on('data', data => {
    if (!data) return;

    console.log(data.toString('utf8'));

    if (!IS_GITHUB) return;

    if (data.includes('Running job')) {
      JOB_RUNNING = true;
      TIMEOUT_TIMER = 0;
    }

    if (data.includes('Job') && data.includes('completed with result')) {
      JOB_RUNNING = false;
    }
  });

  // Ticks once per second; idle seconds only accumulate while no job runs.
  // RUNNER_IDLE_TIMEOUT may be a string from the environment; `>=` compares
  // it numerically.
  const watcher = setInterval(() => {
    if (IS_GITHUB && TIMEOUT_TIMER >= RUNNER_IDLE_TIMEOUT) {
      shutdown();
      clearInterval(watcher);
    }

    if (!JOB_RUNNING) TIMEOUT_TIMER++;
  }, 1000);
};
|
|
||
| run().catch(err => { | ||
| shutdown(err); | ||
| }); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,27 @@ | ||
# Name of the dvcorg base image; overridable at build time so that this one
# Dockerfile can be built on top of different base images.
ARG DOCKER_FROM=cml

FROM dvcorg/${DOCKER_FROM}:latest as base

LABEL maintainer="dvc.org"

# DOCKER, DOCKER MACHINE, GITLAB RUNNER AND GITHUB RUNNER
ENV RUNNER_PATH=/home/runner
ENV RUNNER_ALLOW_RUNASROOT=1

# -p keeps the build working when the base image already has this directory.
RUN mkdir -p ${RUNNER_PATH}
WORKDIR ${RUNNER_PATH}

# Installs docker, docker-compose, docker-machine, gitlab-runner and the
# GitHub Actions runner. The installer script and the runner tarball are
# removed once used so they do not stay in the image layer.
RUN curl -fsSL https://get.docker.com -o get-docker.sh && sh get-docker.sh && \
    rm -f get-docker.sh && \
    curl -L "https://github.com/docker/compose/releases/download/1.24.1/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose && \
    chmod +x /usr/local/bin/docker-compose && \
    curl -L https://github.com/docker/machine/releases/download/v0.16.2/docker-machine-`uname -s`-`uname -m` >/tmp/docker-machine && \
    chmod +x /tmp/docker-machine && mv /tmp/docker-machine /usr/local/bin/docker-machine && \
    wget -O /usr/local/bin/gitlab-runner https://gitlab-runner-downloads.s3.amazonaws.com/latest/binaries/gitlab-runner-linux-amd64 && \
    chmod +x /usr/local/bin/gitlab-runner && \
    gitlab-runner install --user=root --working-directory=${RUNNER_PATH} && \
    wget https://github.com/actions/runner/releases/download/v2.263.0/actions-runner-linux-x64-2.263.0.tar.gz && \
    tar xzf actions-runner-linux-x64-2.263.0.tar.gz && \
    rm -f actions-runner-linux-x64-2.263.0.tar.gz && \
    ./bin/installdependencies.sh && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

CMD ["cml-cloud-runner-entrypoint"]
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why do we need `DOCKER_FROM`? Why don't we use that in other images like `cml-gpu-py3`?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The same question applies to `buildargs`.
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We build the images using Dockerfile-cloud-runner, so to avoid repeating four Dockerfiles we parameterize FROM with args coming from `buildargs` in the plugin. According to the specs, they have to come from the action ENV.
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
no success in resource allocation. what should happen?
The job will fail and the workflow won't run.
job never finishes? how we can handle this properly?
Very interesting question. Are we speaking about the deploy job or the train?
In both cases the workflow will time out, since the whole workflow has a limited time. However, a never-ending training process will end up with a machine working forever. In any circumstance the user should see that the workflow did not succeed properly.
This makes me think about the next iteration, where we can actually add a check that the machine has been cleaned up.
is it possible that the training is done but the machine is still working?
No, they have an idle mechanism: if no jobs are handled within RUNNER_IDLE_TIMEOUT seconds, they shut themselves down.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
👍