In [1]:
import fairing
from fairing import TrainJob
from fairing.backends import KubeflowGKEBackend
from fairing.kubernetes.utils import get_resource_mutator

## Executing a python file

In [9]:
%%writefile train.py
# Minimal script body: prints one line so the job's log output is easy to recognise.
print("hello world!")

Overwriting train.py


In [10]:
# Submit the train.py file written above as a job on the Kubeflow GKE backend.
# The value returned by submit() is shown as the cell output.
job = TrainJob(
    "train.py",
    backend=KubeflowGKEBackend(),
)
job.submit()

Using default docker registry: gcr.io/caip-dexter-bugbash/fairing-job
Using default base docker image: registry.hub.docker.com/library/python:3.7.2
Using builder: <class 'fairing.builders.append.append.AppendBuilder'>
Building the docker image.
Building image using Append builder...
Creating docker context: /tmp/fairing_context_5xhzxv7o
Loading Docker credentials for repository 'registry.hub.docker.com/library/python:3.7.2'
Image successfully built in 3.302749749999748s.
Pushing image gcr.io/caip-dexter-bugbash/fairing-job/fairing-job:AF73294F...
Loading Docker credentials for repository 'gcr.io/caip-dexter-bugbash/fairing-job/fairing-job:AF73294F'
Invoking 'docker-credential-gcloud' to obtain Docker credentials.
Successfully obtained Docker credentials.
Uploading gcr.io/caip-dexter-bugbash/fairing-job/fairing-job:AF73294F
Layer sha256:7d3cdae560215b5913fcb23622a9f98b73109c422da51fff1d9eab3c53f0756b exists, skipping
Layer sha256:d54db43011fd116b8cb6d9e49e268cee1fa6212f152b30cbfa7f3c4c6

hello world!


'fairing-job-pjmq5'

## Executing a python function

In [12]:
def train():
    """Print a fixed message identifying this as the simple training job.

    NOTE: a later cell in this notebook defines another function named `train`;
    once that cell has run, it replaces this one in the kernel.
    """
    message = "simple train job!"
    print(message)

In [21]:
# Call train() directly in the notebook kernel; the next cell submits the same function as a job.
train()

CPU count: 12
Memory: {} 32.0


In [22]:
# Submit the in-notebook `train` function (rather than a file) as the job entry point.
job = TrainJob(
    train,
    backend=KubeflowGKEBackend(),
)
job.submit()

Using default docker registry: gcr.io/caip-dexter-bugbash/fairing-job
Using default base docker image: registry.hub.docker.com/library/python:3.7.2
Using builder: <class 'fairing.builders.append.append.AppendBuilder'>
Building the docker image.
Building image using Append builder...
Creating docker context: /tmp/fairing_context_5fuymnhx
/Users/cartick/Documents/workspace/fairing/venv/lib/python3.7/site-packages/fairing/__init__.py already exists in Fairing context, skipping...
Loading Docker credentials for repository 'registry.hub.docker.com/library/python:3.7.2'
Image successfully built in 1.7307556389996535s.
Pushing image gcr.io/caip-dexter-bugbash/fairing-job/fairing-job:C6D1D8A1...
Loading Docker credentials for repository 'gcr.io/caip-dexter-bugbash/fairing-job/fairing-job:C6D1D8A1'
Invoking 'docker-credential-gcloud' to obtain Docker credentials.
Successfully obtained Docker credentials.
Uploading gcr.io/caip-dexter-bugbash/fairing-job/fairing-job:C6D1D8A1
Layer sha256:7d3cdae5

CPU count: 8
Memory: {} 29.450111389160156


'fairing-job-pq8jh'

## Executing a complete notebook

In [14]:
%%writefile requirements.txt
papermill
jupyter

Overwriting requirements.txt


In [15]:
# Submit a whole notebook as the job entry point. requirements.txt (written above, listing
# papermill and jupyter) is passed along as an input file.
job = TrainJob(
    "train.ipynb",
    backend=KubeflowGKEBackend(),
    input_files=["requirements.txt"],
)
job.submit()

Using default docker registry: gcr.io/caip-dexter-bugbash/fairing-job
Using default base docker image: registry.hub.docker.com/library/python:3.7.2
Using builder: <class 'fairing.builders.docker.docker.DockerBuilder'>
Building the docker image.
Building image using docker
Docker command: ['papermill', 'train.ipynb', 'fairing_output_notebook.ipynb', '--log-output']
Creating docker context: /tmp/fairing_context_3mycavj9
Building docker image gcr.io/caip-dexter-bugbash/fairing-job/fairing-job:28265AFC...
Build output: Step 1/7 : FROM registry.hub.docker.com/library/python:3.7.2
Build output: 
Build output: ---> 2053ca75899e
Build output: Step 2/7 : WORKDIR /app/
Build output: 
Build output: ---> Using cache
Build output: ---> 2c7f8a216fa8
Build output: Step 3/7 : ENV FAIRING_RUNTIME 1
Build output: 
Build output: ---> Using cache
Build output: ---> 13640ae5453f
Build output: Step 4/7 : COPY /app//requirements.txt /app/
Build output: 
Build output: ---> Using cache
Build output: ---> 3161c

Input Notebook:  train.ipynb
Output Notebook: fairing_output_notebook.ipynb
Executing notebook with kernel: python3
Executing Cell 1---------------------------------------
training in notebook!

Ending Cell 1------------------------------------------


Cleaning up job fairing-job-2qj98...


'fairing-job-2qj98'

## Executing with a large number of CPUs and a large amount of memory
Your Kubernetes cluster must have a node pool that can satisfy these resource requests. For example, to schedule a job with 90 CPUs and 600GB of memory, you need a node pool created with the n1-highmem-96 machine type (96 vCPUs, 624GB of memory) in GCP.

In [16]:
import multiprocessing
import os
def train():
    """Print the CPU count and total physical memory (in GiB) of the machine running this code.

    Relies on `os.sysconf`, so it only works on platforms that expose the
    SC_PAGE_SIZE and SC_PHYS_PAGES configuration values.
    """
    print("CPU count: {}".format(multiprocessing.cpu_count()))
    # Total RAM = page size * number of physical pages; dividing by 1024**3 converts bytes to GiB.
    total_memory_gib = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES') / (1024.**3)
    # Substitute the value with str.format: passing it as a second print() argument
    # left the literal "{}" placeholder in the output.
    print("Memory: {}".format(total_memory_gib))
# Run once in the notebook kernel to print the values for the local machine.
train()

CPU count: 12
Memory: {} 32.0


In [17]:
# Submit `train` with a pod-spec mutator built by get_resource_mutator(cpu=90, memory=600),
# i.e. the 90-CPU / 600GB request described in the text above.
job = TrainJob(
    train,
    backend=KubeflowGKEBackend(),
    pod_spec_mutators=[get_resource_mutator(cpu=90, memory=600)],
)
job.submit()

Using default docker registry: gcr.io/caip-dexter-bugbash/fairing-job
Using default base docker image: registry.hub.docker.com/library/python:3.7.2
Using builder: <class 'fairing.builders.append.append.AppendBuilder'>
Building the docker image.
Building image using Append builder...
Creating docker context: /tmp/fairing_context_desyj_od
/Users/cartick/Documents/workspace/fairing/venv/lib/python3.7/site-packages/fairing/__init__.py already exists in Fairing context, skipping...
Loading Docker credentials for repository 'registry.hub.docker.com/library/python:3.7.2'
Image successfully built in 1.739382934000787s.
Pushing image gcr.io/caip-dexter-bugbash/fairing-job/fairing-job:4C228EA7...
Loading Docker credentials for repository 'gcr.io/caip-dexter-bugbash/fairing-job/fairing-job:4C228EA7'
Invoking 'docker-credential-gcloud' to obtain Docker credentials.
Successfully obtained Docker credentials.
Uploading gcr.io/caip-dexter-bugbash/fairing-job/fairing-job:4C228EA7
Layer sha256:7d3cdae56

CPU count: 96
Memory: {} 614.1527938842773


'fairing-job-k2d5p'

## Executing with a GPU 

In [18]:
from kubernetes.client.models.v1_resource_requirements import V1ResourceRequirements
def add_gpu(kube_manager, pod_spec, namespace):
    """Pod-spec mutator that puts a one-GPU limit on the first container.

    Replaces `pod_spec.containers[0].resources` with a new V1ResourceRequirements
    whose only content is the limit `nvidia.com/gpu: "1"`. `kube_manager` and
    `namespace` are accepted but not used here.
    """
    gpu_only = V1ResourceRequirements(limits={"nvidia.com/gpu": "1"})
    pod_spec.containers[0].resources = gpu_only

In [19]:
%%writefile gpu_test.py
import subprocess
def train():
    """Run `nvidia-smi` and print its report.

    Raises whatever `subprocess.check_output` raises if the command is missing
    or exits with a non-zero status.
    """
    # universal_newlines=True makes check_output return text instead of bytes, so the
    # table prints as readable lines rather than a b'...' repr on Python 3.
    print(subprocess.check_output(["nvidia-smi"], universal_newlines=True))
train()

Overwriting gpu_test.py


In [20]:
# Submit gpu_test.py on a GPU-enabled TensorFlow base image; the add_gpu mutator
# sets the nvidia.com/gpu limit on the job's first container.
job = TrainJob(
    "gpu_test.py",
    base_docker_image="tensorflow/tensorflow:2.0.0b0-gpu",
    backend=KubeflowGKEBackend(),
    pod_spec_mutators=[add_gpu],
)
job.submit()

Using default docker registry: gcr.io/caip-dexter-bugbash/fairing-job
Using builder: <class 'fairing.builders.append.append.AppendBuilder'>
Building the docker image.
Building image using Append builder...
Creating docker context: /tmp/fairing_context_k2b58s0h
Loading Docker credentials for repository 'tensorflow/tensorflow:2.0.0b0-gpu'
Image successfully built in 2.1183637859994633s.
Pushing image gcr.io/caip-dexter-bugbash/fairing-job/fairing-job:BA655542...
Loading Docker credentials for repository 'gcr.io/caip-dexter-bugbash/fairing-job/fairing-job:BA655542'
Invoking 'docker-credential-gcloud' to obtain Docker credentials.
Successfully obtained Docker credentials.
Uploading gcr.io/caip-dexter-bugbash/fairing-job/fairing-job:BA655542
Layer sha256:5788fbc934ee33c1d7f95fb77587eabf716f2e5884b2c02c8a9c329b3629dc39 exists, skipping
Layer sha256:5138351ecc1c8cb2633c072222751a972edc349ff9e025ba217a64ff0f41dd6e exists, skipping
Layer sha256:e059dd98ac7cff88cacd4e01f2f1d56af872618aac98b0aff8

Fri Jun 14 02:16:44 2019
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 410.79       Driver Version: 410.79       CUDA Version: 10.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    25W / 300W |      0MiB / 16130MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No running processes found                                                 |
+-------------

'fairing-job-tjq9k'

## MNIST on GPU

In [5]:
%%writefile mnist.py
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow_datasets as tfds
import tensorflow as tf

# Buffer size passed to .shuffle() when building the training pipeline below.
BUFFER_SIZE = 10000
# Number of examples per batch, passed to .batch() below.
BATCH_SIZE = 64

def scale(image, label):
    """Cast `image` to float32 and divide it by 255; `label` is returned unchanged.

    Returns:
        An `(image, label)` tuple holding the rescaled image and the original label.
    """
    as_float = tf.cast(image, tf.float32)
    return as_float / 255, label

# Load MNIST through tensorflow_datasets, asking for the dataset info as well.
# as_supervised=True presumably yields (image, label) pairs, which is what scale() expects.
datasets, info = tfds.load(name='mnist', with_info=True, as_supervised=True)

# Training split: rescale every example, shuffle with a BUFFER_SIZE buffer, then batch.
train_datasets_unbatched = (
    datasets['train']
    .map(scale)
    .shuffle(BUFFER_SIZE)
)
train_datasets = train_datasets_unbatched.batch(BATCH_SIZE)

def build_and_compile_cnn_model():
    """Create and compile the small convolutional classifier used for MNIST.

    The network stacks Conv2D(32, 3) -> MaxPooling2D -> Flatten -> Dense(64) -> Dense(10, softmax)
    on 28x28x1 inputs, and is compiled with sparse categorical cross-entropy loss,
    SGD (learning rate 0.001) and the accuracy metric.

    Returns:
        The compiled `tf.keras.Sequential` model.
    """
    keras = tf.keras
    layer_stack = [
        keras.layers.Conv2D(32, 3, activation='relu', input_shape=(28, 28, 1)),
        keras.layers.MaxPooling2D(),
        keras.layers.Flatten(),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dense(10, activation='softmax'),
    ]
    cnn = keras.Sequential(layer_stack)
    cnn.compile(
        loss=keras.losses.sparse_categorical_crossentropy,
        optimizer=keras.optimizers.SGD(learning_rate=0.001),
        metrics=['accuracy'],
    )
    return cnn

# Build the model, then train it on the batched training dataset for 3 epochs.
single_worker_model = build_and_compile_cnn_model()
single_worker_model.fit(x=train_datasets, epochs=3)

Overwriting mnist.py


In [6]:
%%writefile requirements.txt
tensorflow_datasets

Overwriting requirements.txt


In [8]:
# Submit mnist.py on the GPU-enabled TensorFlow base image with a one-GPU limit (add_gpu).
# requirements.txt (listing tensorflow_datasets) is passed along as an input file.
job = TrainJob(
    "mnist.py",
    base_docker_image="tensorflow/tensorflow:2.0.0b0-gpu",
    backend=KubeflowGKEBackend(),
    pod_spec_mutators=[add_gpu],
    input_files=["requirements.txt"],
)
job.submit()

Using default docker registry: gcr.io/caip-dexter-bugbash/fairing-job
Using builder: <class 'fairing.builders.docker.docker.DockerBuilder'>
Building the docker image.
Building image using docker
Docker command: ['python', '/app/mnist.py']
Creating docker context: /tmp/fairing_context_i7oq20xv
Building docker image gcr.io/caip-dexter-bugbash/fairing-job/fairing-job:CEDDE73E...
Build output: Step 1/7 : FROM tensorflow/tensorflow:2.0.0b0-gpu
Build output: 
Push output: Pulling from tensorflow/tensorflow None
Push output: Pulling fs layer None
Push output: Pulling fs layer None
Push output: Pulling fs layer None
Push output: Pulling fs layer None
Push output: Pulling fs layer None
Push output: Pulling fs layer None
Push output: Pulling fs layer None
Push output: Pulling fs layer None
Push output: Pulling fs layer None
Push output: Pulling fs layer None
Push output: Pulling fs layer None
Push output: Pulling fs layer None
Push output: Pulling fs layer None
Push output: Pulling fs layer None

Push output: Downloading [==>                                                ]  7.503MB/138.6MB
Push output: Downloading [=>                                                 ]  26.27MB/665.9MB
Push output: Downloading [===>                                               ]  8.584MB/138.6MB
Push output: Downloading [==>                                                ]  27.88MB/665.9MB
Push output: Downloading [===>                                               ]  9.653MB/138.6MB
Push output: Downloading [==>                                                ]  29.49MB/665.9MB
Push output: Downloading [===>                                               ]  10.72MB/138.6MB
Push output: Downloading [==>                                                ]  31.11MB/665.9MB
Push output: Downloading [====>                                              ]   11.8MB/138.6MB
Push output: Pull complete None
Push output: Downloading [==>                                                ]  32.73MB/665.9MB
Push out

Push output: Downloading [=====>                                             ]  70.27MB/665.9MB
Push output: Pull complete None
Push output: Downloading [=====>                                             ]  72.96MB/665.9MB
Push output: Downloading [=====>                                             ]  76.16MB/665.9MB
Push output: Downloading [=====>                                             ]  78.85MB/665.9MB






Push output: Verifying Checksum None
Push output: Download complete None
Push output: Extracting [>                                                  ]  557.1kB/665.9MB
Push output: Extracting [>                                                  ]  4.456MB/665.9MB
Push output: Extracting [>                                                  ]  8.913MB/665.9MB
Push output: Extracting [>                                                  ]  10.03MB/665.9MB
Push output: Downloading [>                                                  ]  33.27kB/3.289MB
Push output: Extracting [>                                                  ]  11.14MB/665.9MB
Push output: Verifying Checksum None
Push output: Download complete None
Push output: Extracting [>                                                  ]  12.81MB/665.9MB
Push output: Extracting [=>                                                 ]  13.37MB/665.9MB
Push output: Extracting [=>                                                 ]  16.15MB/665.9M

Push output: Extracting [===>                                               ]  40.11MB/665.9MB
Push output: Downloading [>                                                  ]  8.555MB/754.6MB
Push output: Extracting [===>                                               ]  43.45MB/665.9MB
Push output: Downloading [>                                                  ]   10.7MB/754.6MB
Push output: Extracting [===>                                               ]  45.68MB/665.9MB
Push output: Downloading [>                                                  ]  12.84MB/754.6MB
Push output: Extracting [===>                                               ]  47.35MB/665.9MB
Push output: Downloading [>                                                  ]  14.99MB/754.6MB
Push output: Downloading [=>                                                 ]  17.68MB/754.6MB
Push output: Verifying Checksum None
Push output: Download complete None
Push output: Extracting [===>                                      









Push output: Pull complete None


Push output: Download complete None
Push output: Download complete None
Push output: Extracting [>                                                  ]  557.1kB/138.6MB
Push output: Extracting [=>                                                 ]  4.456MB/138.6MB
Push output: Extracting [====>                                              ]  12.81MB/138.6MB
Push output: Pull complete None
Push output: Extracting [>                                                  ]  557.1kB/64.04MB
Push output: Extracting [=====>                                             ]  6.685MB/64.04MB
Push output: Pull complete None
Push output: Extracting [>                                                  ]  65.54kB/3.289MB
Push output: Extracting [====>                                              ]  327.7kB/3.289MB
Push output: Pull complete None
Push output: Pull complete None
Push output: Extracting [>                                                  ]  557.1kB/754.6MB
Push output: Extracting [>              



Push output: Pull complete None
Push output: Pull complete None
Push output: Pull complete None
Push output: Digest: sha256:2ffc4a7d72669397ee9033d10b738f2179140f60b2c3ace9dfd55f8c68a3e59f None
Push output: Status: Downloaded newer image for tensorflow/tensorflow:2.0.0b0-gpu None
Build output: ---> b53afcb4b0f4
Build output: Step 2/7 : WORKDIR /app/
Build output: 
Build output: ---> Running in eb93c76b6225
Build output: ---> 98b149411154
Build output: Step 3/7 : ENV FAIRING_RUNTIME 1
Build output: 
Build output: ---> Running in 3cc036d658a8
Build output: ---> 8eaad7315849
Build output: Step 4/7 : COPY /app//requirements.txt /app/
Build output: 
Build output: ---> 53b4fd790eb7
Build output: Step 5/7 : RUN if [ -e requirements.txt ];then pip install --no-cache -r requirements.txt; fi
Build output: 
Build output: ---> Running in 17e11a3642e3
Build output: [91mDEPRECATION: Python 2.7 will reach the end of its life on January 1st, 2020. Please upgrade your Python as Python 2.7 won't be mai

Build output: Collecting googleapis-common-protos (from tensorflow-metadata->tensorflow_datasets->-r requirements.txt (line 1))
Build output: Downloading https://files.pythonhosted.org/packages/eb/ee/e59e74ecac678a14d6abefb9054f0bbcb318a6452a30df3776f133886d7d/googleapis-common-protos-1.6.0.tar.gz
Build output: Collecting typing>=3.6.4; python_version < "3.5" (from promise->tensorflow_datasets->-r requirements.txt (line 1))
Build output: Downloading https://files.pythonhosted.org/packages/cc/3e/29f92b7aeda5b078c86d14f550bf85cff809042e3429ace7af6193c3bc9f/typing-3.6.6-py2-none-any.whl
Build output: Building wheels for collected packages: dill, bz2file, functools32, future, psutil, googleapis-common-protos
Build output: Building wheel for dill (setup.py): started
Build output: Building wheel for dill (setup.py): finished with status 'done'
Build output: Stored in directory: /tmp/pip-ephem-wheel-cache-yMJreB/wheels/5b/d7/0f/e58eae695403de585269f4e4a94e0cd6ca60ec0c202936fa4a
Build output: 

Dl Completed...: 0 url [00:00, ? url/s]
Dl Size...: 0 MiB [00:00, ? MiB/s][A

Dl Completed...:   0%|          | 0/1 [00:00<?, ? url/s]
Dl Size...: 0 MiB [00:00, ? MiB/s][A

Dl Completed...:   0%|          | 0/2 [00:00<?, ? url/s]
Dl Size...: 0 MiB [00:00, ? MiB/s][A

Dl Completed...:   0%|          | 0/3 [00:00<?, ? url/s]
Dl Size...: 0 MiB [00:00, ? MiB/s][A

Dl Completed...:   0%|          | 0/4 [00:00<?, ? url/s]
Dl Size...: 0 MiB [00:00, ? MiB/s][A

Dl Completed...:   0%|          | 0/4 [00:00<?, ? url/s]
Dl Size...: 0 MiB [00:00, ? MiB/s][A

Dl Completed...:  25%|██▌       | 1/4 [00:00<00:00,  9.85 url/s]
Dl Size...: 0 MiB [00:00, ? MiB/s][A

Dl Completed...:  25%|██▌       | 1/4 [00:00<00:00,  9.85 url/s]
Dl Size...: 0 MiB [00:00, ? MiB/s][A

Dl Completed...:  25%|██▌       | 1/4 [00:00<00:00,  9.85 url/s]
Dl Size...:   0%|          | 0/1 [00:00<?, ? MiB/s][A

Dl Completed...:  50%|█████     | 2/4 [00:00<00:00,  9.85 url/s]
Dl Size...:   0%|          | 0/1 [00:00<?, ? Mi

1 examples [00:00,  1.93 examples/s]302 examples [00:00,  2.76 examples/s]613 examples [00:00,  3.94 examples/s]946 examples [00:00,  5.63 examples/s]1282 examples [00:00,  8.03 examples/s]1622 examples [00:01, 11.46 examples/s]1961 examples [00:01, 16.35 examples/s]2299 examples [00:01, 23.31 examples/s]2630 examples [00:01, 33.20 examples/s]2959 examples [00:01, 47.23 examples/s]3290 examples [00:01, 67.06 examples/s]3633 examples [00:01, 95.00 examples/s]3961 examples [00:01, 133.89 examples/s]4281 examples [00:01, 187.68 examples/s]4613 examples [00:01, 261.77 examples/s]4948 examples [00:02, 361.82 examples/s]5277 examples [00:02, 493.57 examples/s]5614 examples [00:02, 663.41 examples/s]5944 examples [00:02, 872.30 examples/s]6273 examples [00:02, 1113.08 examples/s]6603 examples [00:02, 1389.03 examples/s]6941 examples [00:02, 1686.83 examples/s]7271 examples [00:02, 1961.84 examples/s]7605 examples [00:02, 2238.29 examples/s]7933 examples [00:02, 2374.1

Reading...: 0 examples [00:00, ? examples/s][A
                                            [A
Writing...:   0%|          | 0/6000 [00:00<?, ? examples/s][A
                                                           [A
Reading...: 0 examples [00:00, ? examples/s][A
                                            [A
Writing...:   0%|          | 0/6000 [00:00<?, ? examples/s][A
Shuffling...:  60%|██████    | 6/10 [00:00<00:00, 14.50 shard/s]
Reading...: 0 examples [00:00, ? examples/s][A
                                            [A
Writing...:   0%|          | 0/6000 [00:00<?, ? examples/s][A
                                                           [A
Reading...: 0 examples [00:00, ? examples/s][A
                                            [A
Writing...:   0%|          | 0/6000 [00:00<?, ? examples/s][A
Shuffling...:  80%|████████  | 8/10 [00:00<00:00, 12.90 shard/s]
Reading...: 0 examples [00:00, ? examples/s][A
                                            [A
Writing...: 

    785/Unknown - 20s 26ms/step - loss: 2.1899 - accuracy: 0.2971    786/Unknown - 20s 26ms/step - loss: 2.1896 - accuracy: 0.2974    787/Unknown - 20s 26ms/step - loss: 2.1894 - accuracy: 0.2976    788/Unknown - 20s 26ms/step - loss: 2.1892 - accuracy: 0.2980    789/Unknown - 20s 26ms/step - loss: 2.1890 - accuracy: 0.2982    790/Unknown - 20s 26ms/step - loss: 2.1887 - accuracy: 0.2984    791/Unknown - 20s 26ms/step - loss: 2.1885 - accuracy: 0.2988    792/Unknown - 20s 26ms/step - loss: 2.1883 - accuracy: 0.2990

Epoch 2/3
 16/938 [..............................] - ETA: 2:21 - loss: 1.9118 - accuracy: 0.5535
 34/938 [>.............................] - ETA: 1:13 - loss: 1.9081 - accuracy: 0.558
 52/938 [>.............................] - ETA: 52s - loss: 1.8979 - accuracy: 0.5658
 70/938 [=>............................] - ETA: 42s - loss: 1.8897 - accuracy: 0.568
 88/938 [=>............................] - ETA: 36s - loss: 1.8811 - accuracy: 0.573
105/938 [==>...........................] - ETA: 32s - loss: 1.8726 - accuracy: 0.577
123/938 [==>...........................] - ETA: 29s - loss: 1.8618 - accuracy: 0.583
139/938 [===>..........................] - ETA: 27s - loss: 1.8566 - accuracy: 0.583
157/938 [====>.........................] - ETA: 25s - loss: 1.8497 - accuracy: 0.586
174/938 [====>.........................] - ETA: 23s - loss: 1.8405 - accuracy: 0.589
192/938 [=====>........................] - ETA: 22s - loss: 1.8325 - accuracy: 0.591
209/938 [=====>........................] - ETA: 21s

Cleaning up job fairing-job-d9rt9...


'fairing-job-d9rt9'