Create python scripts for deploying Kubeflow on GCP via deployment ma…

…nager. * The scripts replaces our bash commands * For teardown we want to add retries to better handle INTERNAL_ERRORS with deployment manager that are causing the test to be flaky. Related to #836 verify Kubeflow deployed correctly with deployment manager.
kubeflow · May 24, 2018 · 3fa104e · 3fa104e
1 parent 01ccdff
commit 3fa104e
Show file tree

Hide file tree

Showing 7 changed files with 356 additions and 39 deletions.
diff --git a/docs/gke/configs/cluster.jinja b/docs/gke/configs/cluster.jinja
@@ -180,6 +180,11 @@ e.g. creating namespaces, service accounts, stateful set to run the bootstrapper
         value: >
           $.concat("Bearer ", $.googleOauth2AccessToken())
     descriptorUrl: https://$(ref.{{ CLUSTER_NAME }}.endpoint)/swaggerapi/{{ endpoint }}
+
+
+  metadata:
+      {# Set policy to abandon to avoid RESOURCE_NOT_FOUND_ERRORS on delete. #}
+      deletePolicy: ABANDON
 {% endfor %}
 
 {# Enable the resource manager API. This is needed below to get IAM policy.

diff --git a/testing/deploy_kubeflow.py b/testing/deploy_kubeflow.py
@@ -4,6 +4,9 @@
 
 import yaml
 from kubernetes.config import kube_config
+# TODO(jlewi): We should be using absolute imports always.
+# So it should be from testing import deploy_utils because testing
+# is the top level python package.
 from . import deploy_utils
 from kubeflow.testing import test_helper
 from kubeflow.testing import util  # pylint: disable=no-name-in-module

diff --git a/testing/deploy_kubeflow_gcp.py b/testing/deploy_kubeflow_gcp.py
@@ -0,0 +1,129 @@
+"""Deploy Kubeflow on GCP using deployment manager and the bootstrapper."""
+import argparse
+import datetime
+import json
+import logging
+import os
+import requests
+import time
+
+from googleapiclient import discovery
+from googleapiclient import errors
+from oauth2client.client import GoogleCredentials
+
+from testing import deploy_utils
+from kubeflow.testing import test_helper
+
+def parse_args():
+  parser = argparse.ArgumentParser()
+  parser.add_argument(
+    "--project", required=True, type=str, 
+    help="The project to deploy in.")
+
+  parser.add_argument(
+    "--name", required=True, type=str, 
+    help="The name for the deployment.")
+
+  parser.add_argument(
+    "--config", required=True, type=str, 
+    help="The path to the YAML file for the deployment config to use.")
+
+  parser.add_argument(
+    "--imports", default="", type=str, 
+    help=("Comma separated list of files to import as part of the deployment "
+          "manager manifest"))
+
+  args, _ = parser.parse_known_args()
+  return args
+
+def deploy_kubeflow_gcp(_):
+  """Deploy Kubeflow."""
+  args = parse_args()
+  project = args.project
+  deployment_name = args.name
+  credentials = GoogleCredentials.get_application_default()
+  deploy = discovery.build("deploymentmanager", "v2", credentials=credentials)
+
+  deployments = deploy.deployments()
+
+  import_files = []
+
+  if args.imports:
+    import_files = args.imports.split(",")
+
+  imports = []
+
+  with open(args.config) as hf:
+    content = hf.read()
+
+  for f in import_files:    
+    with open(f) as hf:
+      name = os.path.basename(f)
+      imports.append({
+        "name": name,
+        "content": hf.read(),
+      })
+
+
+  body = {
+    "name": deployment_name,
+    "target": {
+      "config": {
+        "content": content,
+      },
+      "imports": imports,
+    },  
+  }
+
+  response = None
+  try: 
+    logging.info("Creating deployment %s in project %s", deployment_name, 
+                 project)
+    response = deployments.insert(project=project, body=body).execute()
+  except errors.HttpError as e:
+    logging.error("Got exception %s", e)
+    if not e.content:
+      raise
+
+    try:
+      content = json.loads(e.content)
+    except ValueError:
+      logging.error("Could not parse content %s as json", e.content)
+
+    code = content.get("error", {}).get("code")
+    if code == requests.codes.CONFLICT:
+      logging.info("Deployment %s already exists", deployment_name)
+    else:
+      raise
+
+  if response:
+    op_id = response["name"]
+
+  else:
+    # Get the deployment and make sure its up
+    d = deployments.get(project=project, deployment=deployment_name).execute()
+    op_id = d.get("operation", {}).get("name")
+    if not op_id:
+      raise ValueError("Could not get operation name.")
+
+  logging.info("Wait for deployment; operation %s", op_id)
+  final_status = deploy_utils.wait_for_operation(deploy, project, op_id)
+
+  logging.info("Deployment status\n%s:", json.dumps(final_status, 
+                                                    sort_keys=True,
+                                                    indent=2, 
+                                                    separators=(',', ': ')))
+
+  if final_status.get("status") != "DONE":
+    logging.error("Deployment operation isn't done.")
+    raise RuntimeError("Deployment operation isn't done.")
+
+def main():
+  test_case = test_helper.TestCase(
+    name='deploy_kubeflow_gcp', test_func=deploy_kubeflow_gcp)
+  test_suite = test_helper.init(
+    name='deploy_kubeflow_gcp', test_cases=[test_case])
+  test_suite.run()
+
+if __name__ == "__main__":
+  main()
diff --git a/testing/deploy_utils.py b/testing/deploy_utils.py
@@ -128,3 +128,52 @@ def setup_kubeflow_ks_app(dir, namespace, github_token, api_client):
   os.symlink(source, target_dir)
 
   return app_dir
+
+def log_operation_status(operation):
+  """A callback to use with wait_for_operation."""
+  name = operation.get("name", "")
+  status = operation.get("status", "")
+  logging.info("Operation %s status %s", name, status)
+
+def wait_for_operation(client,
+                       project,
+                       op_id,
+                       timeout=datetime.timedelta(hours=1),
+                       polling_interval=datetime.timedelta(seconds=5),
+                       status_callback=log_operation_status):
+  """Wait for the specified operation to complete.
+
+  Args:
+    client: Client for the API that owns the operation.
+    project: project
+    op_id: Operation id.
+    timeout: A datetime.timedelta expressing the amount of time to wait before
+      giving up.
+    polling_interval: A datetime.timedelta to represent the amount of time to
+      wait between requests polling for the operation status.
+
+  Returns:
+    op: The final operation.
+
+  Raises:
+    TimeoutError: if we timeout waiting for the operation to complete.
+  """
+  endtime = datetime.datetime.now() + timeout
+  while True:
+    op = client.operations().get(
+      project=project, operation=op_id).execute()
+
+    if status_callback:
+      status_callback(op)
+
+    status = op.get("status", "")
+    # Need to handle other status's
+    if status == "DONE":
+      return op
+    if datetime.datetime.now() > endtime:
+      raise TimeoutError(
+        "Timed out waiting for op: {0} to complete.".format(op_id))
+    time.sleep(polling_interval.total_seconds())
+
+  # Linter complains if we don't have a return here even though its unreachable.
+  return None
diff --git a/testing/teardown_kubeflow_gcp.py b/testing/teardown_kubeflow_gcp.py
@@ -0,0 +1,151 @@
+"""Deploy Kubeflow on GCP using deployment manager and the bootstrapper."""
+import argparse
+import datetime
+import json
+import logging
+import requests
+from retrying import retry
+import time
+
+from googleapiclient import discovery
+from googleapiclient import errors
+from oauth2client.client import GoogleCredentials
+
+from kubeflow.testing import test_helper
+from testing import deploy_utils
+
+def parse_args():
+  parser = argparse.ArgumentParser()
+  parser.add_argument(
+    "--project", required=True, type=str, 
+    help="The project to deploy in.")
+
+  parser.add_argument(
+    "--name", required=True, type=str, 
+    help="The name for the deployment.")
+
+  args, _ = parser.parse_known_args()
+  return args
+
+def wait_for_operation(client,
+                       project,
+                       op_id,
+                       timeout=datetime.timedelta(hours=1),
+                       polling_interval=datetime.timedelta(seconds=5)):
+  """Wait for the specified operation to complete.
+
+  Args:
+    client: Client for the API that owns the operation.
+    project: project
+    op_id: Operation id.
+    timeout: A datetime.timedelta expressing the amount of time to wait before
+      giving up.
+    polling_interval: A datetime.timedelta to represent the amount of time to
+      wait between requests polling for the operation status.
+
+  Returns:
+    op: The final operation.
+
+  Raises:
+    TimeoutError: if we timeout waiting for the operation to complete.
+  """
+  endtime = datetime.datetime.now() + timeout
+  while True:
+    op = client.operations().get(
+        project=project, operation=op_id).execute()
+
+    status = op.get("status", "")
+    # Need to handle other status's
+    if status == "DONE":
+      return op
+    if datetime.datetime.now() > endtime:
+      raise TimeoutError(
+        "Timed out waiting for op: {0} to complete.".format(op_id))
+    time.sleep(polling_interval.total_seconds())
+
+  # Linter complains if we don't have a return here even though its unreachable.
+  return None
+
+@retry(stop_max_attempt_number=3)
+def teardown_kubeflow_gcp(_):
+  """Teardown Kubeflow deployment."""
+  args = parse_args()
+  project = args.project
+  deployment_name = args.name
+  credentials = GoogleCredentials.get_application_default()
+  deploy = discovery.build("deploymentmanager", "v2", credentials=credentials)
+
+  deployments = deploy.deployments()
+
+  response = None
+  try: 
+    logging.info("Deleting deployment %s in project %s", deployment_name, 
+                 project)
+    response = deployments.delete(project=project, 
+                                  deployment=deployment_name).execute()
+  except errors.HttpError as e:
+    logging.error("Got exception %s", e)
+    if not e.content:
+      raise
+
+    try:
+      content = json.loads(e.content)
+    except ValueError:
+      logging.error("Could not parse content %s as json", e.content)
+
+    code = content.get("error", {}).get("code")
+    if code == requests.codes.not_found:
+      logging.info("Deployment %s does not exist", deployment_name)
+      return
+    elif code == requests.codes.conflict:
+      logging.warning("Deployment %s return error 409 when trying to delete. "
+                      "One possible cause is deletion is already in progress", 
+                      deployment_name)
+    else:
+      raise
+
+  if not response:
+    # An operation was most likely already in progress. Lets get that operation.
+    d = deployments.get(project=project, deployment=deployment_name).execute()
+    op_id = d.get("operation", {}).get("name")
+    if not op_id:
+      raise ValueError("Could not get operation name.")    
+  else:
+    op_id = response["name"]
+
+  logging.info("Wait for deployment; operation %s", op_id)
+  final_status = deploy_utils.wait_for_operation(deploy, project, op_id)
+
+  op_errors = final_status.get("error", {}).get("errors", [])
+
+  if op_errors:    
+    logging.error("Deployment operation had errors\n%s:", json.dumps(final_status, 
+                                                      sort_keys=True,
+                                                      indent=2, 
+                                                      separators=(',', ': ')))
+
+    raise RuntimeError("Deployment operation had errors.")
+
+  if final_status.get("status") != "DONE":
+    logging.error("Deployment operation isn't done.")
+    raise RuntimeError("Deployment operation isn't done.")
+
+  if final_status.get("operationType", "").lower() != "delete":
+    # Its possible that if an operation was already in progress then the
+    # operation we just waited for was not a delete operation.
+    # We wanted to wait for that operation to finish and then raise an error
+    # so that the delete will be retried.
+    message = ("Operation {0} is type {1} which is not a delete "
+               "operation.").format(op_id, final_status.get("operationType"))
+    logging.error(message)
+    raise ValueError(message)
+
+def main():
+  test_case = test_helper.TestCase(
+    name='teardown_kubeflow_gcp', test_func=teardown_kubeflow_gcp)
+  test_suite = test_helper.init(
+    name='deploy_kubeflow_gcp', test_cases=[test_case])
+  test_suite.run()
+
+if __name__ == "__main__":
+  main()