# TF on GKE

This notebook shows how to run the [TensorFlow CIFAR10 sample](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10_estimator) on GKE using TfJobs.

In [2]:
%%javascript
// Fetches a script from kmahelona.github.io and runs it in the notebook front end.
// NOTE(review): judging by the file name this presumably renders a table of
// contents - confirm. The URL is remote and unversioned, so the cell needs
// network access and executes whatever that host currently serves.
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')

<IPython.core.display.Javascript object>

## Requirements

To run this notebook you must have the following installed:
  * gcloud
  * kubectl
  * helm
  * kubernetes python client library
  
There is a Docker image based on Datalab suitable for running this notebook.

You can start that container as follows

```
docker run --name=gke-datalab -p "127.0.0.1:8081:8080" \
    -v "${HOME}:/content/datalab/home" \
    -v /var/run/docker.sock:/var/run/docker.sock -d  -e "PROJECT_ID=" \
    gcr.io/tf-on-k8s-dogfood/gke-datalab:v20171025-28df43b-dirty
```
  * The Docker socket (`/var/run/docker.sock`) is mounted into the container so that Docker images can be built from inside it.

## Preliminaries

In [42]:
from __future__ import print_function

import datetime
import logging
import os
from pprint import pprint
import StringIO
import subprocess
import time

from googleapiclient import discovery
from googleapiclient import errors
import kubernetes
from kubernetes import client as k8s_client
from kubernetes import config as k8s_config
from kubernetes.client.rest import ApiException
from oauth2client.client import GoogleCredentials
import yaml

# Set the root logger to INFO so the logging.info() calls in this notebook are emitted.
logging.getLogger().setLevel(logging.INFO)

# Identifiers of the TfJob custom resource. Group and version are joined into
# the apiVersion of the job body; group, version and plural are passed to the
# CustomObjectsApi calls that create and read the job; kind is the body's kind.
TF_JOB_GROUP = "mlkube.io"
TF_JOB_VERSION = "v1beta1"
TF_JOB_PLURAL = "tfjobs"
TF_JOB_KIND = "TfJob"

Change **project** to a project you have access to.
* GKE should be enabled for that project
* Optionally, change the cluster name

In [4]:
# GCP project, zone and cluster name used when creating the GKE cluster and
# when fetching its credentials with gcloud.
project="cloud-ml-dev"
zone="us-east1-d"
cluster_name="gke-tf-example"

# Client for the "container" API, version "v1"; passed to create_cluster as `gke`.
gke = discovery.build("container", "v1")

### Some Utility Functions

In [5]:
def run(command, cwd=None):
  """Log a command line and execute it, failing on a non-zero exit code.

  Args:
    command: The command as a list of strings (program followed by its
      arguments).
    cwd: Optional working directory for the child process.

  Raises:
    subprocess.CalledProcessError: if the command exits with a non-zero status.
  """
  command_line = " ".join(command)
  logging.info("Running: %s", command_line)
  subprocess.check_call(command, cwd=cwd)

class TimeoutError(Exception):
  """Raised when an operation does not complete within the allowed time."""

def wait_for_operation(client,
                       project,
                       zone,
                       op_id,
                       timeout=datetime.timedelta(hours=1),
                       polling_interval=datetime.timedelta(seconds=5)):
  """Poll an operation until its status is "DONE" or the timeout elapses.

  Args:
    client: Client for the API that owns the operation.
    project: Id of the project the operation belongs to.
    zone: Zone of the operation. Pass a falsy value (e.g. None) for a global
      operation; client.globalOperations() is then queried instead of
      client.projects().zones().operations().
    op_id: Operation id.
    timeout: A datetime.timedelta expressing the amount of time to wait before
      giving up.
    polling_interval: A datetime.timedelta to represent the amount of time to
      wait between requests polling for the operation status.

  Returns:
    op: The final operation, i.e. the first response whose status is "DONE".

  Raises:
    TimeoutError: if the operation is still not done once the timeout has
      elapsed. The deadline is only checked after a poll, so at least one
      request is always made.
  """
  endtime = datetime.datetime.now() + timeout
  while True:
    if zone:
      request = client.projects().zones().operations().get(
          projectId=project, zone=zone, operationId=op_id)
    else:
      request = client.globalOperations().get(project=project,
                                              operation=op_id)
    op = request.execute()

    # Only "DONE" ends the wait. Every other status (or a missing one) is
    # treated as "still in progress" and polled again until the deadline.
    status = op.get("status", "")
    if status == "DONE":
      return op
    if datetime.datetime.now() > endtime:
      raise TimeoutError("Timed out waiting for op: {0} to complete.".format(
          op_id))
    # Pause before the next poll; relies on the module-level `time` import.
    time.sleep(polling_interval.total_seconds())


## GKE Cluster Setup

* The instructions below create a **CPU** cluster
* To create a GKE cluster with GPUs sign up for the [GKE GPU Alpha](https://goo.gl/forms/ef7eh2x00hV3hahx1)
* TODO(jlewi): Update code once GPUs are in beta.

In [6]:
def create_cluster(gke, name, project, zone,
                   initial_node_count=1, machine_type="n1-standard-8"):
  """Create a GKE cluster and wait for the creation operation to finish.

  An HTTP 409 response is logged and otherwise ignored, so calling this for
  a cluster that already exists does not fail.

  Args:
    gke: Client for GKE.
    name: Name of the cluster to create.
    project: Id of the project to create the cluster in.
    zone: Zone to create the cluster in.
    initial_node_count: Number of nodes the cluster is created with.
    machine_type: Machine type of the cluster nodes.

  Raises:
    errors.HttpError: for any HTTP error whose status is not 409.
    TimeoutError: propagated from wait_for_operation if the creation
      operation does not finish in time.
  """
  cluster_request = {
      "cluster": {
          "name": name,
          "description": "A GKE cluster for TF.",
          "initialNodeCount": initial_node_count,
          "nodeConfig": {
              "machineType": machine_type,
          },
      }
  }
  request = gke.projects().zones().clusters().create(body=cluster_request,
                                                     projectId=project,
                                                     zone=zone)

  try:
    logging.info("Creating cluster; project=%s, zone=%s, name=%s", project,
                 zone, name)
    response = request.execute()
    logging.info("Response %s", response)
    # The create call returns an operation; block until it reports DONE.
    create_op = wait_for_operation(gke, project, zone, response["name"])
    logging.info("Cluster creation done.\n %s", create_op)

  except errors.HttpError as e:
    logging.error("Exception occured creating cluster: %s, status: %s",
                  e, e.resp["status"])
    # Status appears to be a string. 409 is tolerated so that re-running
    # against an existing cluster is not an error; anything else propagates.
    if e.resp["status"] != '409':
      raise

# Create the cluster (an existing one is tolerated), then fetch its
# credentials with gcloud.
create_cluster(gke, cluster_name, project, zone)

logging.info("Configuring kubectl")
get_credentials_command = [
    "gcloud",
    "--project=" + project,
    "container",
    "clusters",
    "--zone=" + zone,
    "get-credentials",
    cluster_name,
]
run(get_credentials_command)


INFO:root:Creating cluster; project=cloud-ml-dev, zone=us-east1-d, name=gke-tf-example
ERROR:root:Exception occured creating cluster: <HttpError 409 when requesting https://container.googleapis.com/v1/projects/cloud-ml-dev/zones/us-east1-d/clusters?alt=json returned "The resource "projects/cloud-ml-dev/zones/us-east1-d/clusters/gke-tf-example" already exists.">, status: 409
INFO:root:Configuring kubectl
INFO:root:Running: gcloud --project=cloud-ml-dev container clusters --zone=us-east1-d get-credentials gke-tf-example


### Install the Operator

In [19]:
# Run `helm init` through the run() helper (raises if helm exits non-zero).
# NOTE(review): with Helm v2 this presumably installs Tiller into the cluster
# that kubectl currently points at - confirm against the Helm version in use.
run(["helm", "init"])

INFO:root:Running: helm init


In [20]:
# URL of the operator chart archive handed to `helm install`.
CHART="https://storage.googleapis.com/tf-on-k8s-dogfood-releases/latest/tf-job-operator-chart-latest.tgz"
# Install the chart via run(), passing "-n tf-job", "--wait" and "--replace".
# NOTE(review): presumably "-n" is the release name here (Helm v2 syntax) - confirm.
run(["helm", "install", CHART, "-n", "tf-job", "--wait", "--replace"])

INFO:root:Running: helm install https://storage.googleapis.com/tf-on-k8s-dogfood-releases/latest/tf-job-operator-chart-latest.tgz -n tf-job --wait --replace


### Create a TfJob

In [50]:
# Build API clients from the local kubeconfig.
k8s_config.load_kube_config()
api_client = k8s_client.ApiClient()
crd_api = k8s_client.CustomObjectsApi(api_client)

namespace = "default"
job_name = "tf-job-example"

# Envelope of the TfJob custom object; the spec is filled in from YAML below.
body = {
    'apiVersion': TF_JOB_GROUP + "/" + TF_JOB_VERSION,
    'kind': TF_JOB_KIND,
    'metadata': {
        'name': job_name,
        'namespace': namespace,
    },
}

# Replica specs: one MASTER and one WORKER with an explicit pod template, plus
# two PS replicas that are given no template.
spec = """
  replicaSpecs:
    - replicas: 1
      tfReplicaType: MASTER
      template:
        spec:
          containers:
            - image: gcr.io/tf-on-k8s-dogfood/tf_sample:dc944ff
              name: tensorflow
          restartPolicy: OnFailure
    - replicas: 1
      tfReplicaType: WORKER
      template:
        spec:
          containers:
            - image: gcr.io/tf-on-k8s-dogfood/tf_sample:dc944ff
              name: tensorflow
          restartPolicy: OnFailure
    - replicas: 2
      tfReplicaType: PS
"""
spec_buffer = StringIO.StringIO(spec)
# safe_load only builds plain Python objects, which is all this spec needs.
# Bare yaml.load() without a Loader is deprecated and rejected by newer PyYAML.
body['spec'] = yaml.safe_load(spec_buffer)

try:
    # Create the TfJob resource and show what the API server stored.
    api_response = crd_api.create_namespaced_custom_object(
        TF_JOB_GROUP, TF_JOB_VERSION, namespace, TF_JOB_PLURAL, body)
    pprint(api_response)
except ApiException as e:
    # Best effort: report the failure (e.g. when re-running the cell) instead
    # of aborting the notebook.
    print(
        "Exception when calling CustomObjectsApi->create_namespaced_custom_object: %s\n" %
        e)

{u'apiVersion': u'mlkube.io/v1beta1',
 u'kind': u'TfJob',
 u'metadata': {u'clusterName': u'',
               u'creationTimestamp': u'2017-10-25T21:41:32Z',
               u'deletionGracePeriodSeconds': None,
               u'deletionTimestamp': None,
               u'name': u'tf-job-example',
               u'namespace': u'default',
               u'resourceVersion': u'85669',
               u'selfLink': u'/apis/mlkube.io/v1beta1/namespaces/default/tfjobs/tf-job-example',
               u'uid': u'4441fb92-b9cd-11e7-af95-42010a8e019d'},
 u'spec': {u'replicaSpecs': [{u'replicas': 1,
                              u'template': {u'spec': {u'containers': [{u'image': u'gcr.io/tf-on-k8s-dogfood/tf_sample:dc944ff',
                                                                       u'name': u'tensorflow'}],
                                                      u'restartPolicy': u'OnFailure'}},
                              u'tfReplicaType': u'MASTER'},
                             {u'replica

### Get status of job

In [51]:
# Load the kubeconfig again and build fresh API clients (the same three calls
# as in the cell that creates the job).
k8s_config.load_kube_config()
api_client = k8s_client.ApiClient()
crd_api = k8s_client.CustomObjectsApi(api_client)

# Fetch the TfJob called job_name from `namespace`; both names are set in the
# cell that creates the job, so that cell has to run first.
results = crd_api.get_namespaced_custom_object(TF_JOB_GROUP, TF_JOB_VERSION, namespace, TF_JOB_PLURAL, job_name)

pprint(results)

{u'apiVersion': u'mlkube.io/v1beta1',
 u'kind': u'TfJob',
 u'metadata': {u'clusterName': u'',
               u'creationTimestamp': u'2017-10-25T21:41:32Z',
               u'generation': 0,
               u'name': u'tf-job-example',
               u'namespace': u'default',
               u'resourceVersion': u'85720',
               u'selfLink': u'/apis/mlkube.io/v1beta1/namespaces/default/tfjobs/tf-job-example',
               u'uid': u'4441fb92-b9cd-11e7-af95-42010a8e019d'},
 u'spec': {u'RuntimeId': u'xxjf',
           u'replicaSpecs': [{u'IsDefaultPS': False,
                              u'replicas': 1,
                              u'template': {u'metadata': {u'creationTimestamp': None},
                                            u'spec': {u'containers': [{u'image': u'gcr.io/tf-on-k8s-dogfood/tf_sample:dc944ff',
                                                                       u'name': u'tensorflow',
                                                                       u'reso