# Sample for Kubeflow TFJob SDK

This is a sample for Kubeflow TFJob SDK `kubeflow-tfjob`.

The notebook shows how to use the Kubeflow TFJob SDK to create, get, wait for, check, and delete a TFJob.

In [1]:
from kubernetes.client import V1PodTemplateSpec
from kubernetes.client import V1ObjectMeta
from kubernetes.client import V1PodSpec
from kubernetes.client import V1Container

from kubeflow.tfjob import constants
from kubeflow.tfjob import utils
from kubeflow.tfjob import V1ReplicaSpec
from kubeflow.tfjob import V1TFJob
from kubeflow.tfjob import V1TFJobSpec
from kubeflow.tfjob import TFJobClient

Define the namespace in which the TFJob will be created. The function below returns the current namespace when the SDK is running inside the cluster; otherwise the TFJob is deployed to the `default` namespace.

In [2]:
# Target namespace for the TFJob: the SDK's current in-cluster namespace,
# or "default" when not running inside a cluster.
namespace = utils.get_default_target_namespace()

### Define TFJob

The demo creates only a single TFJob worker to run the MNIST sample.

In [3]:
# Command line executed inside the training container.
mnist_command = [
    "python",
    "/var/tf_mnist/mnist_with_summaries.py",
    "--log_dir=/train/logs",
    "--learning_rate=0.01",
    "--batch_size=150",
]

# The single container that runs the MNIST training script.
container = V1Container(
    name="tensorflow",
    image="gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0",
    command=mnist_command,
)

# Pod template wrapping the training container.
pod_template = V1PodTemplateSpec(spec=V1PodSpec(containers=[container]))

# One worker replica with restart policy "Never".
worker = V1ReplicaSpec(
    replicas=1,
    restart_policy="Never",
    template=pod_template,
)

# The TFJob resource itself: named "mnist", with only a "Worker" replica spec.
tfjob_metadata = V1ObjectMeta(name="mnist", namespace=namespace)
tfjob_spec = V1TFJobSpec(
    clean_pod_policy="None",
    tf_replica_specs={"Worker": worker},
)
tfjob = V1TFJob(
    api_version="kubeflow.org/v1",
    kind="TFJob",
    metadata=tfjob_metadata,
    spec=tfjob_spec,
)

### Create TFJob

In [4]:
# Client used by every following cell to manage the TFJob.
tfjob_client = TFJobClient()

# Submit the TFJob defined above; the created resource is shown as the cell output.
tfjob_client.create(tfjob, namespace=namespace)

{'apiVersion': 'kubeflow.org/v1',
 'kind': 'TFJob',
 'metadata': {'creationTimestamp': '2019-12-17T05:40:26Z',
  'generation': 1,
  'name': 'mnist',
  'namespace': 'default',
  'resourceVersion': '13585452',
  'selfLink': '/apis/kubeflow.org/v1/namespaces/default/tfjobs/mnist',
  'uid': 'b9faefd7-208f-11ea-9e34-00000a1001ee'},
 'spec': {'cleanPodPolicy': 'None',
  'tfReplicaSpecs': {'Worker': {'replicas': 1,
    'restartPolicy': 'Never',
    'template': {'spec': {'containers': [{'command': ['python',
         '/var/tf_mnist/mnist_with_summaries.py',
         '--log_dir=/train/logs',
         '--learning_rate=0.01',
         '--batch_size=150'],
        'image': 'gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0',
        'name': 'tensorflow'}]}}}}}}

### Get the created TFJob 

In [5]:
# Fetch the "mnist" TFJob; the result now also carries a status section.
tfjob_client.get("mnist", namespace=namespace)

{'apiVersion': 'kubeflow.org/v1',
 'kind': 'TFJob',
 'metadata': {'creationTimestamp': '2019-12-17T05:40:26Z',
  'generation': 1,
  'name': 'mnist',
  'namespace': 'default',
  'resourceVersion': '13585464',
  'selfLink': '/apis/kubeflow.org/v1/namespaces/default/tfjobs/mnist',
  'uid': 'b9faefd7-208f-11ea-9e34-00000a1001ee'},
 'spec': {'cleanPodPolicy': 'None',
  'tfReplicaSpecs': {'Worker': {'replicas': 1,
    'restartPolicy': 'Never',
    'template': {'spec': {'containers': [{'command': ['python',
         '/var/tf_mnist/mnist_with_summaries.py',
         '--log_dir=/train/logs',
         '--learning_rate=0.01',
         '--batch_size=150'],
        'image': 'gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0',
        'name': 'tensorflow'}]}}}}},
 'status': {'conditions': [{'lastTransitionTime': '2019-12-17T05:40:26Z',
    'lastUpdateTime': '2019-12-17T05:40:26Z',
    'message': 'TFJob mnist is created.',
    'reason': 'TFJobCreated',
    'status': 'True',
    'type': 'Created'}],
  'r

### Get the TFJob status to check whether the TFJob has started

In [6]:
# Current status of the "mnist" TFJob as a string (e.g. 'Running').
tfjob_client.get_job_status("mnist", namespace=namespace)

'Running'

### Wait for the specified job to finish

In [7]:
# Wait for the "mnist" TFJob to finish; the resulting TFJob is the cell output.
tfjob_client.wait_for_job("mnist", namespace=namespace)

{'apiVersion': 'kubeflow.org/v1',
 'kind': 'TFJob',
 'metadata': {'creationTimestamp': '2019-12-17T05:40:26Z',
  'generation': 1,
  'name': 'mnist',
  'namespace': 'default',
  'resourceVersion': '13586024',
  'selfLink': '/apis/kubeflow.org/v1/namespaces/default/tfjobs/mnist',
  'uid': 'b9faefd7-208f-11ea-9e34-00000a1001ee'},
 'spec': {'cleanPodPolicy': 'None',
  'tfReplicaSpecs': {'Worker': {'replicas': 1,
    'restartPolicy': 'Never',
    'template': {'spec': {'containers': [{'command': ['python',
         '/var/tf_mnist/mnist_with_summaries.py',
         '--log_dir=/train/logs',
         '--learning_rate=0.01',
         '--batch_size=150'],
        'image': 'gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0',
        'name': 'tensorflow'}]}}}}},
 'status': {'completionTime': '2019-12-17T05:42:19Z',
  'conditions': [{'lastTransitionTime': '2019-12-17T05:40:26Z',
    'lastUpdateTime': '2019-12-17T05:40:26Z',
    'message': 'TFJob mnist is created.',
    'reason': 'TFJobCreated',
    'st

### Check if the TFJob succeeded

In [8]:
# True when the "mnist" TFJob has succeeded.
tfjob_client.if_job_succeeded("mnist", namespace=namespace)

True

### Delete the TFJob

In [9]:
# Clean up: delete the "mnist" TFJob; the API status response is the cell output.
tfjob_client.delete("mnist", namespace=namespace)

{'kind': 'Status',
 'apiVersion': 'v1',
 'metadata': {},
 'status': 'Success',
 'details': {'name': 'mnist',
  'group': 'kubeflow.org',
  'kind': 'tfjobs',
  'uid': 'b9faefd7-208f-11ea-9e34-00000a1001ee'}}