In [1]:
%%writefile tf_dist.py

from __future__ import absolute_import, division, print_function, unicode_literals
import os
import json
import tensorflow_datasets as tfds
import tensorflow as tf
# Create the distribution strategy before any other TensorFlow objects are built.
# NOTE(review): presumably the strategy must exist before other TF ops run in a
# multi-worker job; confirm against the TensorFlow distributed-training docs.
strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()

# Number of elements held in the shuffle buffer of the training dataset.
BUFFER_SIZE = 10000
# Number of examples per batch for a single worker.
BATCH_SIZE = 64
# Scale MNIST pixel values from the [0, 255] range to the [0., 1.] range

def scale(image, label):
    """Convert an image to float32 and divide it by 255; pass the label through.

    Args:
        image: image tensor to normalise.
        label: label associated with the image, returned unchanged.

    Returns:
        A `(scaled_image, label)` tuple.
    """
    normalized = tf.cast(image, tf.float32) / 255
    return normalized, label
# Load MNIST as (image, label) pairs; `info` carries the dataset metadata.
datasets, info = tfds.load(name='mnist',
                           with_info=True,
                           as_supervised=True)

# Normalise and shuffle the training split. Batching is left to the training
# section further down, which applies the global batch size, so no batched
# dataset is built here.
train_datasets_unbatched = datasets['train'].map(scale).shuffle(BUFFER_SIZE)

def build_and_compile_cnn_model():
    """Build and compile a small CNN classifier for 28x28x1 images.

    Architecture: a 32-filter, kernel-size-3 ReLU convolution, max pooling,
    flatten, a 64-unit ReLU dense layer and a 10-unit softmax output layer.
    The model is compiled with sparse categorical cross-entropy loss, SGD
    (learning rate 0.001) and an accuracy metric.

    Returns:
        The compiled `tf.keras.Sequential` model.
    """
    layers = tf.keras.layers
    cnn = tf.keras.Sequential([
        layers.Conv2D(32, 3, activation='relu', input_shape=(28, 28, 1)),
        layers.MaxPooling2D(),
        layers.Flatten(),
        layers.Dense(64, activation='relu'),
        layers.Dense(10, activation='softmax'),
    ])
    sgd = tf.keras.optimizers.SGD(learning_rate=0.001)
    cnn.compile(
        loss=tf.keras.losses.sparse_categorical_crossentropy,
        optimizer=sgd,
        metrics=['accuracy'])
    return cnn

def _cluster_size(raw_tf_config, default):
    """Return the number of training tasks listed in a TF_CONFIG JSON string.

    Counts the addresses under the 'chief' and 'worker' entries of the
    'cluster' mapping.

    Args:
        raw_tf_config: contents of the TF_CONFIG environment variable, or
            None / empty string when it is not set.
        default: value returned when the string is missing, is not valid
            JSON, has an unexpected shape, or lists no chief/worker tasks.

    Returns:
        The number of chief and worker tasks, or `default`.
    """
    try:
        cluster = json.loads(raw_tf_config or '{}').get('cluster', {})
        size = sum(len(cluster.get(role, [])) for role in ('chief', 'worker'))
    except (ValueError, AttributeError, TypeError):
        return default
    return size or default


# Derive the worker count from the cluster description in TF_CONFIG instead of
# hard-coding it: a fixed value goes stale as soon as the job is launched with
# a different number of replicas. Falls back to 2 when TF_CONFIG is absent.
NUM_WORKERS = _cluster_size(os.environ.get('TF_CONFIG'), default=2)
# Here the batch size scales up by number of workers since
# `tf.data.Dataset.batch` expects the global batch size: the per-worker
# BATCH_SIZE multiplied by the number of workers.
GLOBAL_BATCH_SIZE = BATCH_SIZE * NUM_WORKERS
train_datasets = train_datasets_unbatched.batch(GLOBAL_BATCH_SIZE)
# Model creation and compilation must happen inside the strategy scope.
with strategy.scope():
    multi_worker_model = build_and_compile_cnn_model()
multi_worker_model.fit(x=train_datasets, epochs=3)

Writing tf_dist.py


In [8]:
%%writefile requirements.txt
tensorflow_datasets

Overwriting requirements.txt


## Fairing

In [2]:
import fairing
from fairing.preprocessors.base import BasePreProcessor
from fairing.preprocessors.function import FunctionPreProcessor

from fairing.builders.append.append import AppendBuilder
from fairing.deployers.tfjob.tfjob import TfJob
from fairing.builders.docker.docker import DockerBuilder

In [4]:
# Google Container Registry (GCR) location used for storing the output containers.
# Any Docker container registry can be used instead of GCR.
GCP_PROJECT = fairing.cloud.gcp.guess_project_name()
DOCKER_REGISTRY = 'gcr.io/{project}/fairing-job'.format(project=GCP_PROJECT)

In [3]:
from kubernetes.client.models.v1_resource_requirements import V1ResourceRequirements
def add_gpu(kube_manager, pod_spec, namespace):
    """Pod-spec mutator that sets a limit of one NVIDIA GPU on the first container.

    The first container's `resources` attribute is replaced wholesale with a
    new `V1ResourceRequirements` carrying only the GPU limit.

    Args:
        kube_manager: unused; part of the mutator call signature.
        pod_spec: pod spec whose first container is modified in place.
        namespace: unused; part of the mutator call signature.
    """
    gpu_limits = {"nvidia.com/gpu": "1"}
    first_container = pod_spec.containers[0]
    first_container.resources = V1ResourceRequirements(limits=gpu_limits)

In [9]:
# Package the training script and its requirements file into the build
# context and run the script with `python`.
# NOTE(review): the build log below reports the resulting command as
# ['python', 'tf_dist.py', '/app/tf_dist.py'] -- confirm the extra trailing
# script path is intended.
preprocessor = BasePreProcessor(
    command=["python", "tf_dist.py"],
    input_files=["tf_dist.py", "requirements.txt"],
)

In [10]:
# Build the training image on top of the TensorFlow GPU base image, then
# generate the pod spec that the deployer will submit.
BASE_IMAGE = "tensorflow/tensorflow:2.0.0b0-gpu"
builder = DockerBuilder(
    registry=DOCKER_REGISTRY,
    base_image=BASE_IMAGE,
    preprocessor=preprocessor,
)
builder.build()
pod_spec = builder.generate_pod_spec()

Building image using docker
Docker command: ['python', 'tf_dist.py', '/app/tf_dist.py']
Creating docker context: /tmp/fairing_context_nmtkun45
Building docker image gcr.io/caip-dexter-bugbash/fairing-job/fairing-job:BC00C163...
Build output: Step 1/7 : FROM tensorflow/tensorflow:2.0.0b0-gpu
Build output: 
Build output: ---> b53afcb4b0f4
Build output: Step 2/7 : WORKDIR /app/
Build output: 
Build output: ---> Using cache
Build output: ---> 98b149411154
Build output: Step 3/7 : ENV FAIRING_RUNTIME 1
Build output: 
Build output: ---> Using cache
Build output: ---> 8eaad7315849
Build output: Step 4/7 : COPY /app//requirements.txt /app/
Build output: 
Build output: ---> Using cache
Build output: ---> 53b4fd790eb7
Build output: Step 5/7 : RUN if [ -e requirements.txt ];then pip install --no-cache -r requirements.txt; fi
Build output: 
Build output: ---> Using cache
Build output: ---> 850752aba54f
Build output: Step 6/7 : COPY /app/ /app/
Build output: 
Build output: ---> e1122f3a4e3a
Build o

In [12]:
# Submit the pod spec as a TFJob in the "kubeflow" namespace with three worker
# replicas and no chief; add_gpu is registered as a pod-spec mutator.
tfjob_settings = {
    "namespace": "kubeflow",
    "pod_spec_mutators": [add_gpu],
    "worker_count": 3,
    "chief_count": 0,
}
deployer = TfJob(**tfjob_settings)
deployer.deploy(pod_spec)

Training job fairing-tfjob-vfpcb launched.
Waiting for fairing-tfjob-vfpcb-worker-0 to start...
Waiting for fairing-tfjob-vfpcb-worker-0 to start...
Waiting for fairing-tfjob-vfpcb-worker-0 to start...
Pod started running True


2019-06-14 01:05:21.032687: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcuda.so.1
2019-06-14 01:05:21.087147: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1006] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-06-14 01:05:21.088446: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1640] Found device 0 with properties:
name: Tesla V100-SXM2-16GB major: 7 minor: 0 memoryClockRate(GHz): 1.53
pciBusID: 0000:00:04.0
2019-06-14 01:05:21.089365: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcudart.so.10.0
2019-06-14 01:05:21.093204: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcublas.so.10.0
2019-06-14 01:05:21.096435: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcuf

Dl Completed...: 0 url [00:00, ? url/s]
Dl Size...: 0 MiB [00:00, ? MiB/s][A

Dl Completed...:   0%|          | 0/1 [00:00<?, ? url/s]
Dl Size...: 0 MiB [00:00, ? MiB/s][A

Dl Completed...:   0%|          | 0/2 [00:00<?, ? url/s]
Dl Size...: 0 MiB [00:00, ? MiB/s][A

Dl Completed...:   0%|          | 0/3 [00:00<?, ? url/s]
Dl Size...: 0 MiB [00:00, ? MiB/s][A

Dl Completed...:   0%|          | 0/4 [00:00<?, ? url/s]
Dl Size...: 0 MiB [00:00, ? MiB/s][A

Dl Completed...:   0%|          | 0/4 [00:00<?, ? url/s]
Dl Size...: 0 MiB [00:00, ? MiB/s][A

Dl Completed...:  25%|██▌       | 1/4 [00:00<00:00,  9.18 url/s]
Dl Size...: 0 MiB [00:00, ? MiB/s][A

Dl Completed...:  25%|██▌       | 1/4 [00:00<00:00,  9.18 url/s]
Dl Size...:   0%|          | 0/1 [00:00<?, ? MiB/s][A

Dl Completed...:  25%|██▌       | 1/4 [00:00<00:00,  9.18 url/s]
Dl Size...:   0%|          | 0/1 [00:00<?, ? MiB/s][A

Extraction completed...:   0%|          | 0/1 [00:00<?, ? file/s][A[A

Dl Completed...:  25%|

2019-06-14 01:28:06.841210: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcublas.so.10.0
2019-06-14 01:28:09.578914: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcudnn.so.7
[1mDownloading and preparing dataset mnist (11.06 MiB) to /root/tensorflow_datasets/mnist/1.0.0...[0m



[1mDataset mnist downloaded and prepared to /root/tensorflow_datasets/mnist/1.0.0. Subsequent calls will reuse this data.[0m
Train on None steps
Epoch 1/3
      9/Unknown - 6s 779ms/step - loss: 2.2921 - accuracy: 0.146
     16/Unknown - 7s 407ms/step - loss: 2.2909 - accuracy: 0.154
     24/Unknown - 7s 283ms/step - loss: 2.2903 - accuracy: 0.151
     32/Unknown - 7s 222ms/step - loss: 2.2887 - accuracy: 0.155
     40/Unknown - 7s 184ms/step - loss: 2.2866 - accuracy: 0.162
     48/Unknown - 8s 162ms/step - loss: 2.2847 - accuracy: 0.170
     55/Unknown - 8s 143ms/step - loss: 2.2829 - accuracy:

'fairing-tfjob-vfpcb'