Create a Python script to deploy Kubeflow on GCP via Deployment Manager.
* The script replaces our bash script.
* A Python script should make it easier to make our tests more robust
  by adding retries and error handling.

Related to kubeflow#836: verify Kubeflow deploys correctly with Deployment Manager.
jlewi committed May 24, 2018
1 parent 6628f93 commit 4c99f72
Showing 2 changed files with 179 additions and 16 deletions.
171 changes: 171 additions & 0 deletions testing/deploy_kubeflow_gcp.py
@@ -0,0 +1,171 @@
"""Deploy Kubeflow on GCP using deployment manager and the bootstrapper."""
import argparse
import datetime
import json
import logging
import os
import requests
import time
import yaml

from googleapiclient import discovery
from googleapiclient import errors
from oauth2client.client import GoogleCredentials

from kubernetes.config import kube_config
import deploy_utils
from kubeflow.testing import test_helper
from kubeflow.testing import util # pylint: disable=no-name-in-module

def parse_args():
  parser = argparse.ArgumentParser()
  parser.add_argument(
    "--project", required=True, type=str,
    help="The project to deploy in.")

  parser.add_argument(
    "--name", required=True, type=str,
    help="The name for the deployment.")

  parser.add_argument(
    "--config", required=True, type=str,
    help="The path to the YAML file for the deployment config to use.")

  parser.add_argument(
    "--imports", default="", type=str,
    help=("Comma-separated list of files to import as part of the deployment "
          "manager manifest"))

  # parse_known_args ignores unrecognized flags (e.g. extras passed through
  # by the test harness) rather than erroring out.
  args, _ = parser.parse_known_args()
  return args

def wait_for_operation(client,
                       project,
                       op_id,
                       timeout=datetime.timedelta(hours=1),
                       polling_interval=datetime.timedelta(seconds=5)):
  """Wait for the specified operation to complete.

  Args:
    client: Client for the API that owns the operation.
    project: The project that owns the operation.
    op_id: Operation id.
    timeout: A datetime.timedelta expressing the amount of time to wait before
      giving up.
    polling_interval: A datetime.timedelta to represent the amount of time to
      wait between requests polling for the operation status.

  Returns:
    op: The final operation.

  Raises:
    TimeoutError: if we timeout waiting for the operation to complete.
  """
  endtime = datetime.datetime.now() + timeout
  while True:
    op = client.operations().get(
      project=project, operation=op_id).execute()

    status = op.get("status", "")
    # TODO: handle statuses other than DONE (e.g. PENDING, RUNNING).
    if status == "DONE":
      return op
    if datetime.datetime.now() > endtime:
      raise TimeoutError(
        "Timed out waiting for op: {0} to complete.".format(op_id))
    time.sleep(polling_interval.total_seconds())

  # The linter complains if we don't have a return here even though it's
  # unreachable.
  return None
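# Note: a Deployment Manager operation can reach status DONE and still carry
# an "error" field; a stricter success check would also inspect
# op.get("error") on the returned operation.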

def deploy_kubeflow_gcp(test_case):
  """Deploy Kubeflow."""
  args = parse_args()
  test_dir = test_case.test_suite.test_dir
  project = args.project
  deployment_name = args.name
  credentials = GoogleCredentials.get_application_default()
  deploy = discovery.build("deploymentmanager", "v2", credentials=credentials)

  deployments = deploy.deployments()

  # Guard against args.imports being empty; "".split(",") yields [""], which
  # would cause a spurious open("") below.
  import_files = []
  if args.imports:
    import_files = args.imports.split(",")

  imports = []

  with open(args.config) as hf:
    content = hf.read()

  for f in import_files:
    with open(f) as hf:
      name = os.path.basename(f)
      imports.append({
        "name": name,
        "content": hf.read(),
      })


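  # The deployments.insert body follows the Deployment Manager v2 API schema:
  # the top-level config is passed inline as a string, and every template the
  # config references (e.g. the cluster jinja file) must be supplied as a
  # named import alongside it.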
  body = {
    "name": deployment_name,
    "target": {
      "config": {
        "content": content,
      },
      "imports": imports,
    },
  }

  response = None
  try:
    logging.info("Creating deployment %s in project %s", deployment_name,
                 project)
    response = deployments.insert(project=project, body=body).execute()
  except errors.HttpError as e:
    logging.error("Got exception %s", e)
    if not e.content:
      raise
    try:
      error_content = json.loads(e.content)
    except ValueError:
      logging.error("Could not parse content %s as json", e.content)
      raise

    code = error_content.get("error", {}).get("code")
    if code == requests.codes.CONFLICT:
      logging.info("Deployment %s already exists", deployment_name)
    else:
      raise

  if response:
    op_id = response["name"]
  else:
    # Get the deployment and make sure it's up.
    d = deployments.get(project=project, deployment=deployment_name).execute()
    op_id = d.get("operation", {}).get("name")
    if not op_id:
      raise ValueError("Could not get operation name.")

  logging.info("Wait for deployment; operation %s", op_id)
  final_status = wait_for_operation(deploy, project, op_id)

  logging.info("Deployment status:\n%s", json.dumps(final_status,
                                                    sort_keys=True,
                                                    indent=2,
                                                    separators=(',', ': ')))

  if final_status.get("status") != "DONE":
    logging.error("Deployment operation isn't done.")
    raise RuntimeError("Deployment operation isn't done.")

def main():
  test_case = test_helper.TestCase(
    name='deploy_kubeflow_gcp', test_func=deploy_kubeflow_gcp)
  test_suite = test_helper.init(
    name='deploy_kubeflow_gcp', test_cases=[test_case])
  test_suite.run()


if __name__ == "__main__":
  main()
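The commit message gives retries as a motivation for moving to Python, and wait_for_operation above only polls. A minimal sketch of how transient API errors could additionally be retried — the helper name and backoff values are illustrative, not part of this commit:

def wait_with_retries(client, project, op_id, retries=3):
  # Illustrative sketch (not part of this commit): retry the polling loop
  # when the API raises a transient HttpError, backing off exponentially
  # between attempts.
  for attempt in range(retries):
    try:
      return wait_for_operation(client, project, op_id)
    except errors.HttpError as e:
      if attempt == retries - 1:
        raise
      logging.warning("Retrying after HttpError: %s", e)
      time.sleep(2 ** attempt)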
24 changes: 8 additions & 16 deletions testing/workflows/components/gke_deploy.libsonnet
@@ -239,22 +239,14 @@
   [], // no sidecars
 ),
 buildTemplate("create-deployment", [
-  "bash",
-  "-c",
-  std.join(
-    " ",
-    commonCommands + [
-      "&&",
-      "gcloud",
-      "deployment-manager",
-      "--project=" + project,
-      "deployments",
-      "create",
-      deployName,
-      "--config=" + configDir + "/cluster-kubeflow.yaml",
-    ]
-  ),
-]), // create-deployment
+  "python",
+  "-m",
+  "testing.deploy_kubeflow_gcp",
+  "--project=" + project,
+  "--name=" + deployName,
+  "--config=" + configDir + "/cluster-kubeflow.yaml",
+  "--imports=" + configDir + "/cluster.jinja",
+]), // create-deployment
 // Setup and teardown using GKE.
 buildTemplate("delete-deployment", [
   "bash",

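Passing the Python module invocation as an argument list, rather than a single bash -c string, sidesteps shell quoting and lets the workflow template hand the flags to the script directly.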