From 949bed7e187e1233f86ee0f8571cf4b2d576f6d2 Mon Sep 17 00:00:00 2001 From: Ankush Agarwal Date: Wed, 26 Sep 2018 20:11:01 -0700 Subject: [PATCH] Add a e2e-test for katib This is run as part of kfctl e2e test --- scripts/kfctl.sh | 25 ++++----- scripts/util.sh | 2 +- testing/test_deploy.py | 51 +++++++++++++++++++ .../workflows/components/workflows.libsonnet | 22 ++++++-- 4 files changed, 84 insertions(+), 16 deletions(-) diff --git a/scripts/kfctl.sh b/scripts/kfctl.sh index 1c645af7532..cd9b5376ac2 100755 --- a/scripts/kfctl.sh +++ b/scripts/kfctl.sh @@ -35,7 +35,7 @@ createEnv() { # Namespace where kubeflow is deployed echo K8S_NAMESPACE=${K8S_NAMESPACE:-"kubeflow"} >> ${ENV_FILE} - case "$PLATFORM" in + case "$PLATFORM" in minikube) echo KUBEFLOW_CLOUD=minikube >> ${ENV_FILE} echo MOUNT_LOCAL=${MOUNT_LOCAL} >> ${ENV_FILE} @@ -52,29 +52,29 @@ createEnv() { echo PROJECT must be set either using environment variable PROJECT echo or by setting the default project in gcloud exit 1 - fi - + fi + # Name of the deployment DEPLOYMENT_NAME=${DEPLOYMENT_NAME:-"kubeflow"} echo DEPLOYMENT_NAME="${DEPLOYMENT_NAME}" >> ${ENV_FILE} - + # Kubeflow directories echo KUBEFLOW_DM_DIR=${KUBEFLOW_DM_DIR:-"$(pwd)/gcp_config"} >> ${ENV_FILE} echo KUBEFLOW_SECRETS_DIR=${KUBEFLOW_SECRETS_DIR:-"$(pwd)/secrets"} >> ${ENV_FILE} echo KUBEFLOW_K8S_MANIFESTS_DIR="$(pwd)/k8s_specs" >> ${ENV_FILE} - + # Name of the K8s context to create. echo KUBEFLOW_K8S_CONTEXT=${DEPLOYMENT_NAME} >> ${ENV_FILE} - + # GCP Zone # The default should be a zone that supports Haswell. 
ZONE=${ZONE:-$(gcloud config get-value compute/zone 2>/dev/null)} echo ZONE=${ZONE:-"us-east1-d"} >> ${ENV_FILE} - + # Email for cert manager EMAIL=${EMAIL:-$(gcloud config get-value account 2>/dev/null)} echo EMAIL=${EMAIL} >> ${ENV_FILE} - + # GCP Static IP Name echo KUBEFLOW_IP_NAME=${KUBEFLOW_IP_NAME:-"${DEPLOYMENT_NAME}-ip"} >> ${ENV_FILE} # Name of the endpoint @@ -82,13 +82,13 @@ createEnv() { echo KUBEFLOW_ENDPOINT_NAME=${KUBEFLOW_ENDPOINT_NAME} >> ${ENV_FILE} # Complete hostname echo KUBEFLOW_HOSTNAME=${KUBEFLOW_HOSTNAME:-"${KUBEFLOW_ENDPOINT_NAME}.endpoints.${PROJECT}.cloud.goog"} >> ${ENV_FILE} - + echo CONFIG_FILE=${CONFIG_FILE:-"cluster-kubeflow.yaml"} >> ${ENV_FILE} - + if [ -z "${PROJECT_NUMBER}" ]; then PROJECT_NUMBER=$(gcloud projects describe ${PROJECT} --format='value(project_number)') fi - + echo PROJECT_NUMBER=${PROJECT_NUMBER} >> ${ENV_FILE} ;; *) @@ -210,6 +210,7 @@ ksApply () { ks apply default -c centraldashboard ks apply default -c tf-job-operator ks apply default -c argo + ks apply default -c katib ks apply default -c spartakus popd @@ -247,7 +248,7 @@ if [ "${COMMAND}" == "generate" ]; then if [ "${PLATFORM}" == "minikube" ]; then create_local_fs_mount_spec if ${MOUNT_LOCAL}; then - ks param set jupyterhub disks "local-notebooks" + ks param set jupyterhub disks "local-notebooks" ks param set jupyterhub notebookUid `id -u` ks param set jupyterhub notebookGid `id -g` ks param set jupyterhub accessLocalFs true diff --git a/scripts/util.sh b/scripts/util.sh index 49eb43e7e3d..5b309cadee2 100644 --- a/scripts/util.sh +++ b/scripts/util.sh @@ -66,7 +66,7 @@ function createKsApp() { ks generate tf-job-operator tf-job-operator ks generate argo argo - + ks generate katib katib # Enable collection of anonymous usage metrics # To disable metrics collection. Remove the spartakus component. 
def _deployment_pod_counts(output, deployment_name):
  """Extract the pod-count columns for one deployment from kubectl output.

  Scans the tabular output of ``kubectl get deployment`` for the row whose
  first column is `deployment_name`. Matching on the name (instead of
  counting newlines) keeps the parse correct whether or not the output ends
  with a trailing newline.

  Args:
    output: Text returned by the kubectl invocation.
    deployment_name: Name of the deployment whose row is wanted.

  Returns:
    A 4-tuple of strings taken from columns 1-4 of the row (desired, current,
    up-to-date, available), or None when no row with at least five columns
    matches.
  """
  for line in output.splitlines():
    columns = line.split()
    # Require the four count columns so a short row cannot raise IndexError.
    if len(columns) >= 5 and columns[0] == deployment_name:
      return tuple(columns[1:5])
  return None


def test_successful_deployment(deployment_name, retries=20, sleep_seconds=5):
  """Poll kubectl until all pods of a deployment are up to date and available.

  Runs ``kubectl get deployment <deployment_name>`` up to `retries` times and
  succeeds as soon as the desired, current, up-to-date and available counts
  of the deployment's row are all equal.

  NOTE(review): the parse assumes the column order
  NAME DESIRED CURRENT UP-TO-DATE AVAILABLE; confirm against the kubectl
  version used by the test worker.

  Args:
    deployment_name: Name of the deployment to check.
    retries: Maximum number of kubectl invocations before giving up.
    sleep_seconds: Seconds to wait between two consecutive attempts.

  Returns:
    True once the deployment reports all pods as available.

  Raises:
    RuntimeError: if the deployment is still not ready after `retries`
      attempts.
  """
  # TODO use the python kubernetes library to get deployment status
  # This is using kubectl right now
  for attempt in range(retries):
    if attempt:
      # Only wait between attempts; never after the final failed one.
      logging.info("Sleeping %s seconds and retrying..", sleep_seconds)
      time.sleep(sleep_seconds)

    try:
      output = util.run(["kubectl", "get", "deployment", deployment_name])
    except subprocess.CalledProcessError as e:
      # The deployment may not exist yet; treat as "not ready" and retry.
      logging.error(e)
      continue

    logging.info("output = \n%s", output)
    counts = _deployment_pod_counts(output, deployment_name)
    if counts is None:
      continue

    desired_pods, current_pods, uptodate_pods, available_pods = counts
    logging.info("desired_pods %s", desired_pods)
    logging.info("current_pods %s", current_pods)
    logging.info("uptodate_pods %s", uptodate_pods)
    logging.info("available_pods %s", available_pods)
    if desired_pods == current_pods == uptodate_pods == available_pods:
      return True

  raise RuntimeError('Deployment failed: ' + deployment_name)


def test_katib(args):  # pylint: disable=unused-argument
  """Verify that every Katib deployment comes up successfully.

  Checks each deployment in turn with test_successful_deployment; the first
  one that fails to become ready raises and aborts the test.

  Args:
    args: Parsed command line arguments; not read by this function.
  """
  katib_deployments = (
    'vizier-core',
    'vizier-db',
    'vizier-suggestion-grid',
    'vizier-suggestion-random',
    'studyjob-controller',
    'modeldb-backend',
    'modeldb-db',
    'modeldb-frontend',
  )
  for deployment_name in katib_deployments:
    test_successful_deployment(deployment_name)