In [4]:
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Trains and Evaluates the MNIST network using a feed dictionary."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import os
import sys
import time
import random
import logging

from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
from tensorflow.examples.tutorials.mnist import mnist

import fairing
from fairing import builders
from fairing.training import kubeflow

# Image settings handed to fairing's AppendBuilder: the repository passed as
# `repository`, the image passed as `base_image`, and the notebook file passed
# as `notebook_file`.
DOCKER_REPOSITORY_NAME = 'gcr.io/mrick-gcp'
BASE_IMAGE = 'gcr.io/kubeflow-images-public/fairing:v0.0.1'
NOTEBOOK_FILE = '/home/jovyan/work/demo.ipynb'

# Register the builder once, at import time, before any decorated class is used.
fairing.config.set_builder(
    builders.AppendBuilder(
        base_image=BASE_IMAGE,
        notebook_file=NOTEBOOK_FILE,
        repository=DOCKER_REPOSITORY_NAME,
    )
)

# Directory handed to input_data.read_data_sets().
INPUT_DATA_DIR = '/tmp/tensorflow/mnist/input_data/'

# Training-loop settings.
MAX_STEPS = 2000      # number of iterations of the training loop
BATCH_SIZE = 100      # examples per step; also fixes the placeholder shapes
LEARNING_RATE = 0.3   # passed to mnist.training()

# Layer sizes passed to mnist.inference().
HIDDEN_1 = 128
HIDDEN_2 = 32

# HACK: ideally every instance of the job would get its own unique subpath.
# That is not available here, so HOSTNAME is appended to the log directory
# instead to keep the instances apart.
LOG_DIR = os.path.join(
    os.environ.get('TEST_TMPDIR', '/tmp'),
    'tensorflow/mnist/logs/fully_connected_feed/',
    os.environ.get('HOSTNAME', ''),
)
MODEL_DIR = os.path.join(LOG_DIR, 'model.ckpt')

@kubeflow.DistributedTraining(worker_count=3, ps_count=1, namespace='kubeflow')
class MyModel(object):
    """Fully connected MNIST classifier trained through a feed dictionary.

    The class is decorated with ``kubeflow.DistributedTraining`` using three
    workers, one parameter server and the ``kubeflow`` namespace.
    NOTE(review): the graph built in ``train`` contains no cluster or device
    placement logic, so each replica presumably trains its own independent
    copy of the model -- confirm this is intended.
    """

    def train(self):
        """Build the graph, run MAX_STEPS training steps and save a checkpoint.

        Reads the data sets from INPUT_DATA_DIR, builds the network with
        ``mnist.inference`` / ``mnist.loss`` / ``mnist.training`` and feeds
        batches of BATCH_SIZE examples from the training split. Every 100
        steps the loss is printed and the merged summaries are written to
        LOG_DIR. After the last step the variables are saved to MODEL_DIR.

        The summary writer and the session are closed before returning, also
        when training raises, so buffered events reach disk and the session's
        resources are released.
        """
        self.data_sets = input_data.read_data_sets(INPUT_DATA_DIR)
        self.images_placeholder = tf.placeholder(
            tf.float32, shape=(BATCH_SIZE, mnist.IMAGE_PIXELS))
        # `(BATCH_SIZE)` is just an int; the trailing comma makes the intended
        # one-element shape tuple explicit.
        self.labels_placeholder = tf.placeholder(tf.int32, shape=(BATCH_SIZE,))

        logits = mnist.inference(self.images_placeholder,
                                 HIDDEN_1,
                                 HIDDEN_2)

        self.loss = mnist.loss(logits, self.labels_placeholder)
        self.train_op = mnist.training(self.loss, LEARNING_RATE)
        self.summary = tf.summary.merge_all()
        init = tf.global_variables_initializer()
        saver = tf.train.Saver()

        self.sess = tf.Session()
        self.summary_writer = None
        try:
            self.summary_writer = tf.summary.FileWriter(LOG_DIR, self.sess.graph)
            self.sess.run(init)

            data_set = self.data_sets.train
            for step in xrange(MAX_STEPS):
                images_feed, labels_feed = data_set.next_batch(
                    BATCH_SIZE, fake_data=False)
                feed_dict = {
                    self.images_placeholder: images_feed,
                    self.labels_placeholder: labels_feed,
                }

                _, loss_value = self.sess.run([self.train_op, self.loss],
                                              feed_dict=feed_dict)
                if step % 100 == 0:
                    print("At step {}, loss = {}".format(step, loss_value))
                    summary_str = self.sess.run(self.summary, feed_dict=feed_dict)
                    self.summary_writer.add_summary(summary_str, step)
                    self.summary_writer.flush()

            # The Saver was previously created but never used, so the trained
            # weights were thrown away; persist them next to the event files.
            saver.save(self.sess, MODEL_DIR)
        finally:
            # Release resources even if a training step raised.
            if self.summary_writer is not None:
                self.summary_writer.close()
            self.sess.close()

# Instantiate the decorated model and call train(). The recorded output of this
# cell shows an image being pushed and a training job being launched, followed
# by the job's log lines.
model = MyModel()
model.train()

Running...
Uploading gcr.io/mrick-gcp/fairing-job:0b24740a7175bcd031aa2a26ddcede9599206b866bab39f7bb17b0d44c834c19
Pushed image gcr.io/mrick-gcp/fairing-job:0b24740a7175bcd031aa2a26ddcede9599206b866bab39f7bb17b0d44c834c19
Training(s) launched.
Waiting for job to start...


b'Instructions for updating:'
b'Use the retry module or similar alternatives.'
b'Instructions for updating:'
b'Please use alternatives such as official/mnist/dataset.py from tensorflow/models.'
b'From /app/demo.py:67: read_data_sets (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.'
b'Instructions for updating:'
b'Please use alternatives such as official/mnist/dataset.py from tensorflow/models.'
b'Instructions for updating:'
b'Please write your own downloading logic.'
b'From /opt/conda/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/mnist.py:260: maybe_download (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.'
b'Instructions for updating:'
b'Please write your own downloading logic.'
b'Instructions for updating:'
b'Please use urllib or similar directly.'
b'From /opt/conda/lib/python3.6/site-packages/tensorflow/contrib/learn/python/l