# Create a TF Job from a Jupyter Notebook Script

## Configuration Parameters for the TF Job

In [103]:
# Notebook to be converted to a TF Job.
# NOTE: every cell of that notebook that needs to be converted to the TFJob MUST have
# "# fairing:include-cell" at the beginning of the cell.
jupyter_notebook_name = 'tf_model_trainer.ipynb'

# Additional files used by the notebook.
# If a library that is not present in the base image is being used, list it in a requirements.txt file
# and add that requirements.txt file to this list (the list is handed to the preprocessor as its input_files).
additional_files =['../utilities/common_utilities.py', 
                 '../utilities/evaluation_utilities.py', 
                 '../utilities/modeldb_tf_utilities.py',
                 '../utilities/from_tfrecords.py',
                 '../utilities/to_tfrecords.py',
                 '../utilities/google_utils.py',
                 '../utilities/model_utilities.py',
                 '../model_configs/text_based_tfrecord_config.yaml',
                 '../model_configs/text_based_config.yaml']


# Cluster configuration
num_cpu = 6 # Number of CPUs requested for the TFJob worker
cpu_memory=40  # Memory requested, in GiB (rendered as "<value>Gi" in the job spec)
num_gpu = 0  # Number of GPUs to be allocated
run_id = '5'  # Identifier of this run; embedded in additional_tag below
additional_tag = f'nont_{run_id}'  # Extra tag that becomes part of the TFJob name

## Setting up TF Job in a Kubernetes Pod

### Imports

In [104]:
from kubeflow import fairing 
from kubeflow.fairing.preprocessors.converted_notebook import ConvertNotebookPreprocessorWithFire
from kubeflow.fairing.builders import append
from kubeflow.fairing.deployers import job
from kubeflow.fairing import constants
from kubeflow.fairing.builders import cluster
import sys
sys.path.append('../utilities/')
import common_utilities
import os
import json
import time

### Creating TFJob Image

In [105]:
# Detect the GCP project and the current Kubernetes namespace through fairing's backend helpers.
GCP_PROJECT = fairing.backends.gcp.guess_project_name()
NAMESPACE = fairing.backends.utils.get_current_k8s_namespace()
# Show both detected values, one per line.
print(GCP_PROJECT, NAMESPACE, sep="\n")

zulilymodeltraining
rmenon


In [106]:
# Set up the Google Container Registry (GCR) location used for storing the output containers.
# os.path.splitext strips only the final extension, so notebook names that contain additional dots
# (or relative paths such as "../nb.ipynb") keep their full stem instead of being cut at the first ".".
base_name = os.path.splitext(jupyter_notebook_name)[0]
PROJECT_NAME = 'p13n-model-training'
IMAGE_NAME = base_name  # The generated TFJob image will start with this name
DOCKER_REGISTRY = f'gcr.io/{GCP_PROJECT}/{NAMESPACE}/{PROJECT_NAME}'
# Base image to use for creating the TFJob image.
# (The f-string already interpolates GCP_PROJECT; no additional .format() call is needed.)
BASE_IMAGE = f'gcr.io/{GCP_PROJECT}/kubeflow-notebooks/jupyter_tensorflow:v1.3.0_tensorflow-2.5.0-v3'
# Override fairing's Kaniko executor image (latest image as of June 2021)
constants.constants.KANIKO_IMAGE = "gcr.io/kaniko-project/executor:v1.6.0"

In [107]:
# Converts the notebook given in the notebook_file parameter.
# All the code in the cells with fairing comments is written to a python file with the same name as the
# ipython notebook. Later on, while building the docker image, this python file is added to the image.
preprocessor = ConvertNotebookPreprocessorWithFire(notebook_file = jupyter_notebook_name)

# Merge the additional files into whatever the preprocessor already tracks instead of overwriting it.
# `or ()` covers an unset/empty input_files, and the set union keeps this cell idempotent on re-runs.
extra_input_files = {os.path.normpath(f) for f in additional_files}
preprocessor.input_files = set(preprocessor.input_files or ()) | extra_input_files
# Last expression of the cell: the list of files returned by preprocess() is shown as the cell output.
preprocessor.preprocess()

[I 211210 05:32:11 converted_notebook:191] Converting tf_model_trainer.ipynb to tf_model_trainer.py
[I 211210 05:32:11 converted_notebook:194] Creating entry point for the class name None


[PosixPath('tf_model_trainer.py'),
 '../model_configs/text_based_tfrecord_config.yaml',
 '../utilities/from_tfrecords.py',
 '../utilities/evaluation_utilities.py',
 '../utilities/google_utils.py',
 '../utilities/common_utilities.py',
 '../utilities/to_tfrecords.py',
 '../utilities/modeldb_tf_utilities.py',
 '../utilities/model_utilities.py',
 '../model_configs/text_based_config.yaml']

In [108]:
# Build the base image from inside the Kubernetes cluster (the build runs as a pod).
st = time.time()

# Source of the docker build context handed to the builder.
build_context_source = cluster.gcs_context.GCSContextSource(gcp_project=GCP_PROJECT)
# Required so that the builder pod runs with the correct serviceAccount.
builder_pod_mutators = [fairing.backends.gcp.add_gcp_credentials_if_exists]

# The automatically generated Dockerfile is used here.
# To use a custom Dockerfile instead, additionally pass dockerfile_path="Dockerfile".
cluster_builder = cluster.cluster.ClusterBuilder(
    registry=DOCKER_REGISTRY,
    base_image=BASE_IMAGE,
    image_name=IMAGE_NAME,
    preprocessor=preprocessor,
    pod_spec_mutators=builder_pod_mutators,
    context_source=build_context_source,
    cleanup=True,
)
cluster_builder.build()

# Tag of the image that was just built; later cells use it as their starting image.
cluster_image = cluster_builder.image_tag
print("----------------------------------------------------------\n")
print(cluster_image)
print("This process took {}secs".format(time.time() - st))

[I 211210 05:32:11 cluster:46] Building image using cluster builder.
[I 211210 05:32:11 base:107] Creating docker context: /tmp/fairing_context_9oyn15ga
[I 211210 05:32:11 converted_notebook:191] Converting tf_model_trainer.ipynb to tf_model_trainer.py
[I 211210 05:32:11 converted_notebook:194] Creating entry point for the class name None
[W 211210 05:32:11 gcp:65] Not able to find gcp credentials secret: user-gcp-sa
[W 211210 05:32:11 gcp:67] Trying workload identity service account: default-editor
[W 211210 05:32:11 manager:298] Waiting for fairing-builder-hcc9n-cbthm to start...
[W 211210 05:32:11 manager:298] Waiting for fairing-builder-hcc9n-cbthm to start...
[W 211210 05:32:11 manager:298] Waiting for fairing-builder-hcc9n-cbthm to start...
[W 211210 05:32:12 manager:298] Waiting for fairing-builder-hcc9n-cbthm to start...
[I 211210 05:32:13 manager:303] Pod started running True


E1210 05:32:17.622770       1 aws_credentials.go:77] while getting AWS credentials NoCredentialProviders: no valid providers in chain. Deprecated.
	For verbose messaging see aws.Config.CredentialsChainVerboseErrors
[36mINFO[0m[0004] Retrieving image manifest gcr.io/zulilymodeltraining/kubeflow-notebooks/jupyter_tensorflow:v1.3.0_tensorflow-2.5.0-v3
[36mINFO[0m[0004] Retrieving image gcr.io/zulilymodeltraining/kubeflow-notebooks/jupyter_tensorflow:v1.3.0_tensorflow-2.5.0-v3 from registry gcr.io
[36mINFO[0m[0004] Retrieving image manifest gcr.io/zulilymodeltraining/kubeflow-notebooks/jupyter_tensorflow:v1.3.0_tensorflow-2.5.0-v3
[36mINFO[0m[0004] Returning cached image manifest
[36mINFO[0m[0004] Built cross stage deps: map[]
[36mINFO[0m[0004] Retrieving image manifest gcr.io/zulilymodeltraining/kubeflow-notebooks/jupyter_tensorflow:v1.3.0_tensorflow-2.5.0-v3
[36mINFO[0m[0004] Returning cached image manifest
[36mINFO[0m[0004] Retrieving image manifest gcr.io/zulilymodeltra

[W 211210 05:35:03 cluster:106] Cleaning up job fairing-builder-hcc9n...


----------------------------------------------------------

gcr.io/zulilymodeltraining/rmenon/p13n-model-training/tf_model_trainer:CB9695A6
This process took 171.94618797302246secs


In [109]:
# This step adds the additional files that are required, as a layer on top of the cluster-built image.
st = time.time()
# NOTE(review): builder.build() appears to run the notebook conversion again (see the log output of
# this cell), so this explicit call is likely redundant; it is kept to preserve the existing behavior.
preprocessor.preprocess()
# Always start from the image produced by the cluster builder. Reading `cluster_image` here would make
# the cell non-idempotent: the variable is overwritten below, so a re-run would stack another layer
# on top of the previously appended image.
builder = append.append.AppendBuilder(registry=DOCKER_REGISTRY,
                                      image_name=IMAGE_NAME,
                                      base_image=cluster_builder.image_tag,
                                      preprocessor=preprocessor)

# If this step fails, run `gcloud auth configure-docker` in a terminal
builder.build()
# From here on `cluster_image` refers to the final (appended) image that the TFJob will run.
cluster_image = builder.image_tag
print("----------------------------------------------------------\n")
print(cluster_image)
print("This process took {}secs".format(time.time() - st))

[I 211210 05:35:03 converted_notebook:191] Converting tf_model_trainer.ipynb to tf_model_trainer.py
[I 211210 05:35:03 converted_notebook:194] Creating entry point for the class name None
[W 211210 05:35:03 append:50] Building image using Append builder...
[I 211210 05:35:03 base:107] Creating docker context: /tmp/fairing_context_yfc0y71a
[I 211210 05:35:03 converted_notebook:191] Converting tf_model_trainer.ipynb to tf_model_trainer.py
[I 211210 05:35:03 converted_notebook:194] Creating entry point for the class name None
[W 211210 05:35:03 base:88] tf_model_trainer.py already exists in Fairing context, skipping...
[I 211210 05:35:03 docker_creds_:234] Loading Docker credentials for repository 'gcr.io/zulilymodeltraining/rmenon/p13n-model-training/tf_model_trainer:CB9695A6'
[I 211210 05:35:03 docker_creds_:152] Invoking 'docker-credential-gcloud' to obtain Docker credentials.
[I 211210 05:35:04 docker_creds_:175] Successfully obtained Docker credentials.
[W 211210 05:35:04 append:54] 

----------------------------------------------------------

gcr.io/zulilymodeltraining/rmenon/p13n-model-training/tf_model_trainer:88FC693C
This process took 4.318068265914917secs


### Creating a TF Job config and launching in Kubernetes pod

#### Helper function

In [110]:
def tf_job_trial_spec(script_name: str, tf_job_image: str, cpu_request: float, memory_request: float, num_gpu: int = 0, additional_tag: str = '') -> dict:
    """
    Create the trial spec (manifest) of a single-worker TensorFlow job (TFJob).

    Read more about TFJobs here: https://www.kubeflow.org/docs/components/training/tftraining/
    The spec defines exactly one worker replica, i.e. no distributed training is configured.

    The name of the TFJob instance is built as
    f"{script_name}_{num_gpu}gpu_{cpu_request}c_{memory_request}gb-{additional_tag}"
    (the "-{additional_tag}" suffix is omitted when additional_tag is empty). Every "." and "_" in
    that string is then replaced by "-" and the result is lower-cased, because Kubernetes object
    names may only contain lowercase alphanumeric characters and "-".

    :param script_name: Name of the jupyter script that was converted to a TFJob (without the ipynb
        extension). The container runs "/app/{script_name}.py", and the name is also used as the
        prefix of the TFJob instance name (see above).
    :param tf_job_image: Name of the TF Job image to be deployed
    :param cpu_request: Number of CPUs requested (also used as the CPU limit)
    :param memory_request: GiB of memory requested (also used as the memory limit)
    :param num_gpu: Number of GPUs requested; GPU entries are only added when this is > 0
    :param additional_tag: Additional tag appended to the name of the created TFJob instance
    :return: The TFJob manifest as a nested dict
    """
    # Requests and limits are intentionally identical; quantities are rendered as strings.
    resource_requests = {
        "cpu": f"{cpu_request}",
        "memory": f"{memory_request}Gi",
    }
    resource_limits = dict(resource_requests)
    if num_gpu > 0:
        resource_limits["nvidia.com/gpu"] = f"{num_gpu}"
        resource_requests["nvidia.com/gpu"] = f"{num_gpu}"
    resources_dict = {"requests": resource_requests, "limits": resource_limits}

    tag_suffix = f"-{additional_tag}" if additional_tag else ""
    job_name = f"{script_name}_{num_gpu}gpu_{cpu_request}c_{memory_request}gb{tag_suffix}"
    # Normalize to a valid Kubernetes object name: no dots or underscores, lowercase only.
    job_name = job_name.replace(".", "-").replace("_", "-").lower()

    worker_container = {
        "name": "tensorflow",
        "command": ["python", f"/app/{script_name}.py"],
        "image": tf_job_image,
        "imagePullPolicy": "Always",
        "resources": resources_dict,
        "workingDir": "/app/",
    }

    trial_spec = {
        "apiVersion": "kubeflow.org/v1",
        "kind": "TFJob",
        "metadata": {
            "name": job_name
        },
        "spec": {
            # https://www.kubeflow.org/docs/components/training/tftraining/#tensorflow-logs
            "cleanPodPolicy": "All",
            "tfReplicaSpecs": {
                "Worker": {
                    "replicas": 1,
                    "template": {
                        "metadata": {
                            "annotations": {
                                "sidecar.istio.io/inject": "false"
                            }
                        },
                        "spec": {
                            "serviceAccountName": "default-editor",
                            "containers": [worker_container],
                            "restartPolicy": "OnFailure"
                        }
                    }
                }
            }
        }
    }
    return trial_spec

#### Creating a TF Job config and launching in Kubernetes pod

In [111]:
# Random suffix to differentiate between experiment runs (extra precaution against over-writing
# an existing job that has the same configuration).
rand_hash = common_utilities.random_alphanumeric_str()
# Build the tag in a new variable rather than re-assigning `additional_tag`: with the re-assignment,
# every re-run of this cell would append one more random suffix to the job name.
job_tag = f"{additional_tag}_{rand_hash}"

trial_spec = tf_job_trial_spec(script_name=base_name, tf_job_image=cluster_image, cpu_request=num_cpu, memory_request=cpu_memory,
                               num_gpu=num_gpu, additional_tag=job_tag)
print("Here's the spec that will be deployed: ")
trial_spec

Here's the spec that will be deployed: 


{'apiVersion': 'kubeflow.org/v1',
 'kind': 'TFJob',
 'metadata': {'name': 'tf-model-trainer-0gpu-6c-40gb-nont-5-uegyfa6c'},
 'spec': {'cleanPodPolicy': 'All',
  'tfReplicaSpecs': {'Worker': {'replicas': 1,
    'template': {'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'}},
     'spec': {'serviceAccountName': 'default-editor',
      'containers': [{'name': 'tensorflow',
        'command': ['python', '/app/tf_model_trainer.py'],
        'image': 'gcr.io/zulilymodeltraining/rmenon/p13n-model-training/tf_model_trainer:88FC693C',
        'imagePullPolicy': 'Always',
        'resources': {'requests': {'cpu': '6', 'memory': '40Gi'},
         'limits': {'cpu': '6', 'memory': '40Gi'}},
        'workingDir': '/app/'}],
      'restartPolicy': 'OnFailure'}}}}}}

In [112]:
# NOTE(review): this import sits outside the notebook's import cell; consider moving it there.
from kubeflow.tfjob.api import tf_job_client as tf_job_client_module
# Create a TFJob client and submit the spec built above to the namespace detected earlier.
tf_job_client = tf_job_client_module.TFJobClient()
# Last expression of the cell: the value returned by create() is shown as the cell output.
tf_job_client.create(trial_spec, namespace=NAMESPACE)

{'apiVersion': 'kubeflow.org/v1',
 'kind': 'TFJob',
 'metadata': {'creationTimestamp': '2021-12-10T05:35:07Z',
  'generation': 1,
  'managedFields': [{'apiVersion': 'kubeflow.org/v1',
    'fieldsType': 'FieldsV1',
    'fieldsV1': {'f:spec': {'.': {},
      'f:tfReplicaSpecs': {'.': {},
       'f:Worker': {'.': {},
        'f:replicas': {},
        'f:template': {'.': {},
         'f:metadata': {'.': {},
          'f:annotations': {'.': {}, 'f:sidecar.istio.io/inject': {}}},
         'f:spec': {'.': {},
          'f:containers': {},
          'f:restartPolicy': {},
          'f:serviceAccountName': {}}}}}}},
    'manager': 'Swagger-Codegen',
    'operation': 'Update',
    'time': '2021-12-10T05:35:07Z'}],
  'name': 'tf-model-trainer-0gpu-6c-40gb-nont-5-uegyfa6c',
  'namespace': 'rmenon',
  'resourceVersion': '329786761',
  'selfLink': '/apis/kubeflow.org/v1/namespaces/rmenon/tfjobs/tf-model-trainer-0gpu-6c-40gb-nont-5-uegyfa6c',
  'uid': 'c5bfc232-2301-4882-b6a5-a0f64e4e4b74'},
 'spec':

## Helpful commands to monitor Kubernetes job

Run these commands in a terminal:
* Get the list of TF Jobs in the cluster: **kubectl get tfjob**
* Get the list of all pods running TF Jobs: **kubectl get po**
* Follow the logs of a particular pod: **kubectl logs -f \<POD_NAME\>**
    * Find the pod name using the *kubectl get po* command
* Delete completed TF Jobs: **kubectl delete tfjob \<JOB_NAME\>**
    * Find the job name using the *kubectl get tfjob* command