In [1]:
!pip install tensorflow-datasets --user

You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
%%writefile fairing-tfjob-pvc.yaml
# Volume claim that the fairing TFJob mounts at /app/data; the training code
# writes the exported model underneath that mount point.
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
  # Must match the pvc_name passed to the deployer's mounting_pvc mutator.
  name: fairing-tfjob-data-pvc
spec:
  accessModes:
    # NOTE(review): the job runs two workers that both mount this claim;
    # ReadWriteOnce may prevent that if they land on different nodes - confirm
    # against the cluster's storage class.
    - ReadWriteOnce
  resources:
    requests:
      storage: 100Mi

Overwriting fairing-tfjob-pvc.yaml


In [3]:
# Create (or update) the PersistentVolumeClaim described in fairing-tfjob-pvc.yaml.
!kubectl apply -f fairing-tfjob-pvc.yaml

persistentvolumeclaim/fairing-tfjob-data-pvc unchanged


In [4]:
#fairing:include-cell
from __future__ import absolute_import, division, print_function, unicode_literals

import os
import tensorflow as tf
import tensorflow_datasets as tfds

def build_and_compile_model():
    """Create the MNIST classifier and compile it for training.

    The network flattens a 28x28x1 input, applies a 128-unit ReLU dense
    layer followed by dropout (rate 0.2), and ends in a 10-unit dense layer
    that emits raw logits.

    Returns:
        A compiled ``tf.keras`` Sequential model using the Adam optimizer,
        sparse categorical cross-entropy on logits, and an accuracy metric.
    """
    layers = tf.keras.layers
    network = tf.keras.models.Sequential([
        layers.Flatten(input_shape=(28, 28, 1)),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(10),
    ])

    # The last layer has no activation, so the loss must interpret logits.
    logits_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    network.compile(
        loss=logits_loss,
        optimizer=tf.keras.optimizers.Adam(),
        metrics=['accuracy'],
    )
    return network


@tfds.decode.make_decoder(output_dtype=tf.float32)
def decode_image(example, feature):
    """Decode one serialized image and return it as float32 divided by 255.

    Args:
        example: The serialized value handed to the decoder.
        feature: The feature connector whose ``decode_example`` performs the
            actual decoding.

    Returns:
        The decoded image cast to ``tf.float32`` and divided by 255.
    """
    decoded = feature.decode_example(example)
    as_float = tf.cast(decoded, dtype=tf.float32)
    return as_float / 255


def _is_chief_worker():
    """Tell whether this process is the chief of the training cluster.

    Reads the ``TF_CONFIG`` environment variable. The process counts as chief
    when no task is configured (single-process run), when its task type is
    ``chief``, or when it is worker 0 in a cluster that declares no dedicated
    chief.
    """
    import json

    tf_config = json.loads(os.environ.get('TF_CONFIG') or '{}')
    task = tf_config.get('task') or {}
    task_type = task.get('type')
    if task_type is None or task_type == 'chief':
        return True

    cluster = tf_config.get('cluster') or {}
    return (task_type == 'worker'
            and int(task.get('index', 0)) == 0
            and 'chief' not in cluster)


def train():
    """Train the MNIST classifier with a multi-worker strategy and export it.

    Steps performed:
      1. Download and prepare MNIST under ``./mnist``.
      2. Build cached, repeated, batched train/eval datasets with automatic
         sharding turned off.
      3. Build the model inside the distribution strategy scope and fit it
         for 10 epochs.
      4. Evaluate on 10 batches of the test split and print the accuracy.
      5. Save the model. Only the chief writes to ``/app/data/models/1``;
         every other worker still runs the save, but into a temporary
         directory that is discarded, so workers never write to the same
         destination concurrently.
    """
    import tempfile

    print("TensorFlow version: ", tf.__version__)

    BATCH_SIZE = 64

    strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
    mnist = tfds.builder('mnist', data_dir='./mnist')
    mnist.download_and_prepare()

    mnist_train, mnist_test = mnist.as_dataset(
        split=['train', 'test'],
        decoders={'image': decode_image()},
        as_supervised=True)
    train_input_dataset = mnist_train.cache().repeat().shuffle(
        buffer_size=50000).batch(BATCH_SIZE)
    eval_input_dataset = mnist_test.cache().repeat().batch(BATCH_SIZE)

    # Turn off automatic dataset sharding for both pipelines.
    options = tf.data.Options()
    options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
    train_input_dataset = train_input_dataset.with_options(options)
    eval_input_dataset = eval_input_dataset.with_options(options)

    print("Training...")

    # Model creation and compilation must happen inside the strategy scope.
    with strategy.scope():
        multi_worker_model = build_and_compile_model()

    # The training dataset repeats forever, so an epoch is bounded explicitly.
    num_train_examples = mnist.info.splits['train'].num_examples
    train_steps = num_train_examples // BATCH_SIZE
    train_epochs = 10

    multi_worker_model.fit(train_input_dataset, epochs=train_epochs, steps_per_epoch=train_steps)

    # Evaluate the model on test set (eval dataset repeats, hence the step cap)
    score = multi_worker_model.evaluate(eval_input_dataset, steps=10)
    print('Test accuracy: ', score[1])

    model_base_path = '/app/data/models'
    version = 1
    model_path = os.path.join(model_base_path, str(version))

    if _is_chief_worker():
        multi_worker_model.save(model_path)
    else:
        # Non-chief workers still execute the save, but into scratch space
        # that is removed afterwards, keeping the shared volume single-writer.
        with tempfile.TemporaryDirectory() as scratch_dir:
            multi_worker_model.save(os.path.join(scratch_dir, str(version)))

In [5]:
%%writefile Dockerfile
# Base image providing TensorFlow 2.1.0 on Python 3.
FROM tensorflow/tensorflow:2.1.0-py3

# Dataset library imported by the training code.
RUN pip install tensorflow-datasets==2.0.0

WORKDIR /app
# Python script that fairing converts from this notebook; the source path is
# resolved against the build context fairing uploads.
COPY /app/fairing_notebook_cluster_tfjob.py /app

Overwriting Dockerfile


In [6]:
def fairing_run():
    """Build the training image in-cluster and launch it as a TFJob.

    Configures Kubeflow Fairing with:
      * the ``notebook`` preprocessor restricted to cells selected by
        ``FilterIncludeCell``;
      * the ``cluster`` builder, which uses ``Dockerfile`` and a MinIO
        context source to build ``fairing-notebook-cluster-tfjob``;
      * the ``tfjob`` deployer with two workers, log streaming, no cleanup,
        and the ``fairing-tfjob-data-pvc`` claim mounted at ``/app/data``.

    MinIO credentials are taken from the ``MINIO_ACCESS_KEY`` and
    ``MINIO_SECRET_KEY`` environment variables when set; otherwise the
    previous built-in values are used, so existing setups keep working.
    """
    import os
    import uuid
    from kubeflow import fairing
    from kubeflow.fairing.kubernetes import utils as k8s_utils
    from kubeflow.fairing.builders.cluster.minio_context import MinioContextSource
    from kubeflow.fairing.preprocessors.converted_notebook import FilterIncludeCell

    CONTAINER_REGISTRY = 'kangwoo'

    namespace = 'admin'
    # A short random suffix keeps repeated runs from colliding on job name.
    job_name = f'fairing-notebook-cluster-tfjob-{uuid.uuid4().hex[:4]}'

    s3_endpoint = 'minio-service.kubeflow.svc.cluster.local:9000'
    minio_endpoint = "http://"+s3_endpoint
    # Prefer credentials supplied through the environment over literals
    # committed to the notebook; defaults preserve the original behaviour.
    minio_username = os.environ.get('MINIO_ACCESS_KEY', "minio")
    minio_key = os.environ.get('MINIO_SECRET_KEY', "minio123")
    minio_region = "us-east-1"

    minio_context_source = MinioContextSource(
        endpoint_url=minio_endpoint,
        minio_secret=minio_username,
        minio_secret_key=minio_key,
        region_name=minio_region)
    fairing.config.set_preprocessor('notebook', notebook_preprocessor=FilterIncludeCell)
    fairing.config.set_builder(
        'cluster',
        registry=CONTAINER_REGISTRY,
        image_name="fairing-notebook-cluster-tfjob",
        dockerfile_path="Dockerfile",
        context_source=minio_context_source)

    fairing.config.set_deployer(
        'tfjob',
        namespace=namespace,
        job_name=job_name,
        cleanup=False,
        stream_log=True,
        worker_count=2,
        pod_spec_mutators=[
            k8s_utils.mounting_pvc(pvc_name='fairing-tfjob-data-pvc',
                                   pvc_mount_path='/app/data')])

    fairing.config.run()

In [7]:
#fairing:include-cell
if __name__ == '__main__':
    # FAIRING_RUNTIME present -> run the training itself; absent -> submit
    # the job through fairing from the notebook.
    inside_fairing_runtime = os.getenv('FAIRING_RUNTIME') is not None
    entry_point = train if inside_fairing_runtime else fairing_run
    entry_point()

[I 200501 01:31:45 config:134] Using preprocessor: <kubeflow.fairing.preprocessors.converted_notebook.ConvertNotebookPreprocessor object at 0x7face85aebe0>
[I 200501 01:31:45 config:136] Using builder: <kubeflow.fairing.builders.cluster.cluster.ClusterBuilder object at 0x7facc93a99e8>
[I 200501 01:31:45 config:138] Using deployer: <kubeflow.fairing.deployers.tfjob.tfjob.TfJob object at 0x7fac51f256d8>
[I 200501 01:31:45 cluster:46] Building image using cluster builder.
[I 200501 01:31:45 base:107] Creating docker context: /tmp/fairing_context_xbg3wfjm
[I 200501 01:31:45 converted_notebook:127] Converting fairing_notebook_cluster_tfjob.ipynb to fairing_notebook_cluster_tfjob.py
[W 200501 01:31:45 manager:296] Waiting for fairing-builder-428k5-wqf4b to start...
[W 200501 01:31:45 manager:296] Waiting for fairing-builder-428k5-wqf4b to start...
[W 200501 01:31:45 manager:296] Waiting for fairing-builder-428k5-wqf4b to start...
[I 200501 01:31:47 manager:302] Pod started running True


[36mINFO[0m[0003] Resolved base name tensorflow/tensorflow:2.1.0-py3 to tensorflow/tensorflow:2.1.0-py3
[36mINFO[0m[0003] Resolved base name tensorflow/tensorflow:2.1.0-py3 to tensorflow/tensorflow:2.1.0-py3
[36mINFO[0m[0003] Downloading base image tensorflow/tensorflow:2.1.0-py3
[36mINFO[0m[0005] Error while retrieving image from cache: getting file info: stat /cache/sha256:14ec674cefd622aa9d45f07485500da254acaf8adfef80bd0f279db03c735689: no such file or directory
[36mINFO[0m[0005] Downloading base image tensorflow/tensorflow:2.1.0-py3
[36mINFO[0m[0007] Built cross stage deps: map[]
[36mINFO[0m[0007] Downloading base image tensorflow/tensorflow:2.1.0-py3
[36mINFO[0m[0008] Error while retrieving image from cache: getting file info: stat /cache/sha256:14ec674cefd622aa9d45f07485500da254acaf8adfef80bd0f279db03c735689: no such file or directory
[36mINFO[0m[0008] Downloading base image tensorflow/tensorflow:2.1.0-py3
[36mINFO[0m[0010] Unpacking rootfs as cmd RUN pip inst

[W 200501 01:33:47 job:101] The tfjob fairing-notebook-cluster-tfjob-3f70 launched.
[W 200501 01:33:47 manager:296] Waiting for fairing-notebook-cluster-tfjob-3f70-worker-0 to start...
[W 200501 01:33:47 manager:296] Waiting for fairing-notebook-cluster-tfjob-3f70-worker-0 to start...
[W 200501 01:33:47 manager:296] Waiting for fairing-notebook-cluster-tfjob-3f70-worker-0 to start...
[I 200501 01:33:59 manager:302] Pod started running True


2020-05-01 01:33:59.758060: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libnvinfer.so.6'; dlerror: libnvinfer.so.6: cannot open shared object file: No such file or directory
2020-05-01 01:33:59.758122: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libnvinfer_plugin.so.6'; dlerror: libnvinfer_plugin.so.6: cannot open shared object file: No such file or directory
2020-05-01 01:33:59.758130: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:30] Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
2020-05-01 01:34:00.325597: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2020-05-01 01:34:00.325619: E tensorflow/stream_executor/cuda