# Sample for Kubeflow PyTorchJob SDK

This is a sample for the Kubeflow PyTorchJob SDK, `kubeflow-pytorchjob`.

This notebook shows how to use the Kubeflow PyTorchJob SDK to create, get, wait for, check, and delete a PyTorchJob.

In [1]:
from kubernetes.client import V1PodTemplateSpec
from kubernetes.client import V1ObjectMeta
from kubernetes.client import V1PodSpec
from kubernetes.client import V1Container
from kubernetes.client import V1ResourceRequirements

from kubeflow.pytorchjob import constants
from kubeflow.pytorchjob import utils
from kubeflow.pytorchjob import V1ReplicaSpec
from kubeflow.pytorchjob import V1PyTorchJob
from kubeflow.pytorchjob import V1PyTorchJobSpec
from kubeflow.pytorchjob import PyTorchJobClient

Define the namespace in which the PyTorchJob will be created. If no namespace is specified, the function below returns the namespace the SDK is currently running in when executed inside the cluster; otherwise it falls back to the `default` namespace.

In [2]:
# Target namespace for the PyTorchJob. Per the SDK helper described in the text
# for this cell, this is the namespace the SDK is running in when inside the
# cluster, and `default` otherwise. Later cells reuse this value.
namespace = utils.get_default_target_namespace()

### Define PyTorchJob

The demo creates a PyTorchJob with one master replica and one worker replica to run the MNIST sample.

In [3]:
def _single_replica_spec(pod_container):
    """Build a one-replica spec whose pod runs `pod_container` and restarts on failure."""
    pod_template = V1PodTemplateSpec(
        spec=V1PodSpec(containers=[pod_container])
    )
    return V1ReplicaSpec(
        replicas=1,
        restart_policy="OnFailure",
        template=pod_template
    )


# Container used by every replica: the distributed MNIST test image, launched
# with the gloo backend.
container = V1Container(
    name="pytorch",
    image="gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0",
    args=["--backend", "gloo"]
)

# The master and the worker are configured identically; each gets its own
# replica spec object built around the shared container definition.
master = _single_replica_spec(container)
worker = _single_replica_spec(container)

replica_specs = {"Master": master, "Worker": worker}

# Note: clean_pod_policy is the string "None", not Python's None.
job_spec = V1PyTorchJobSpec(
    clean_pod_policy="None",
    pytorch_replica_specs=replica_specs
)

pytorchjob = V1PyTorchJob(
    api_version="kubeflow.org/v1",
    kind="PyTorchJob",
    metadata=V1ObjectMeta(name="pytorch-dist-mnist-gloo", namespace=namespace),
    spec=job_spec
)

### Create PyTorchJob

In [4]:
# Build the SDK client, submit the PyTorchJob object defined earlier, and show
# the resource returned by the create call as this cell's output.
pytorch_client = PyTorchJobClient()
created_job = pytorch_client.create(pytorchjob)
created_job

{'apiVersion': 'kubeflow.org/v1',
 'kind': 'PyTorchJob',
 'metadata': {'creationTimestamp': '2019-12-18T02:22:07Z',
  'generation': 1,
  'name': 'pytorch-dist-mnist-gloo',
  'namespace': 'default',
  'resourceVersion': '13983940',
  'selfLink': '/apis/kubeflow.org/v1/namespaces/default/pytorchjobs/pytorch-dist-mnist-gloo',
  'uid': '3055f681-213d-11ea-9e34-00000a1001ee'},
 'spec': {'cleanPodPolicy': 'None',
  'pytorchReplicaSpecs': {'Master': {'replicas': 1,
    'restartPolicy': 'OnFailure',
    'template': {'spec': {'containers': [{'args': ['--backend', 'gloo'],
        'image': 'gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0',
        'name': 'pytorch'}]}}},
   'Worker': {'replicas': 1,
    'restartPolicy': 'OnFailure',
    'template': {'spec': {'containers': [{'args': ['--backend', 'gloo'],
        'image': 'gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0',
        'name': 'pytorch'}]}}}}}}

### Get the created PyTorchJob 

In [5]:
# Fetch the job from the namespace it was created in. Passing `namespace`
# explicitly (as the status/wait/succeeded cells do) keeps this lookup correct
# even if the namespace cell is edited to a non-default value.
pytorch_client.get('pytorch-dist-mnist-gloo', namespace=namespace)

{'apiVersion': 'kubeflow.org/v1',
 'kind': 'PyTorchJob',
 'metadata': {'creationTimestamp': '2019-12-18T02:22:07Z',
  'generation': 1,
  'name': 'pytorch-dist-mnist-gloo',
  'namespace': 'default',
  'resourceVersion': '13983953',
  'selfLink': '/apis/kubeflow.org/v1/namespaces/default/pytorchjobs/pytorch-dist-mnist-gloo',
  'uid': '3055f681-213d-11ea-9e34-00000a1001ee'},
 'spec': {'cleanPodPolicy': 'None',
  'pytorchReplicaSpecs': {'Master': {'replicas': 1,
    'restartPolicy': 'OnFailure',
    'template': {'spec': {'containers': [{'args': ['--backend', 'gloo'],
        'image': 'gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0',
        'name': 'pytorch'}]}}},
   'Worker': {'replicas': 1,
    'restartPolicy': 'OnFailure',
    'template': {'spec': {'containers': [{'args': ['--backend', 'gloo'],
        'image': 'gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0',
        'name': 'pytorch'}]}}}}},
 'status': {'conditions': [{'lastTransitionTime': '2019-12-18T02:22:07Z',
    'lastUpdateTim

### Get the PyTorchJob status to check whether the PyTorchJob has started

In [6]:
# Ask the client for the job's current status string and display it.
job_status = pytorch_client.get_job_status('pytorch-dist-mnist-gloo', namespace=namespace)
job_status

'Created'

### Wait for the specified PyTorchJob to finish

In [7]:
# Wait for the job to finish, then display the job object the client returns.
finished_job = pytorch_client.wait_for_job('pytorch-dist-mnist-gloo', namespace=namespace)
finished_job

{'apiVersion': 'kubeflow.org/v1',
 'kind': 'PyTorchJob',
 'metadata': {'creationTimestamp': '2019-12-18T02:22:07Z',
  'generation': 1,
  'name': 'pytorch-dist-mnist-gloo',
  'namespace': 'default',
  'resourceVersion': '13985828',
  'selfLink': '/apis/kubeflow.org/v1/namespaces/default/pytorchjobs/pytorch-dist-mnist-gloo',
  'uid': '3055f681-213d-11ea-9e34-00000a1001ee'},
 'spec': {'cleanPodPolicy': 'None',
  'pytorchReplicaSpecs': {'Master': {'replicas': 1,
    'restartPolicy': 'OnFailure',
    'template': {'spec': {'containers': [{'args': ['--backend', 'gloo'],
        'image': 'gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0',
        'name': 'pytorch'}]}}},
   'Worker': {'replicas': 1,
    'restartPolicy': 'OnFailure',
    'template': {'spec': {'containers': [{'args': ['--backend', 'gloo'],
        'image': 'gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0',
        'name': 'pytorch'}]}}}}},
 'status': {'completionTime': '2019-12-18T02:27:50Z',
  'conditions': [{'lastTransitionTime'

### Check if the PyTorchJob succeeded

In [8]:
# Boolean check: did the job end in the succeeded state?
job_succeeded = pytorch_client.is_job_succeeded('pytorch-dist-mnist-gloo', namespace=namespace)
job_succeeded

True

### Delete the PyTorchJob

In [9]:
# Delete the job from the namespace it was created in. Passing `namespace`
# explicitly (as the status/wait/succeeded cells do) prevents the delete from
# targeting a different namespace when the namespace cell has been customized.
pytorch_client.delete('pytorch-dist-mnist-gloo', namespace=namespace)

{'kind': 'Status',
 'apiVersion': 'v1',
 'metadata': {},
 'status': 'Success',
 'details': {'name': 'pytorch-dist-mnist-gloo',
  'group': 'kubeflow.org',
  'kind': 'pytorchjobs',
  'uid': '3055f681-213d-11ea-9e34-00000a1001ee'}}