# Between-graph replication

In [1]:
import numpy as np
import tensorflow as tf
from keras.datasets import mnist

Using TensorFlow backend.


In [2]:
# Display the installed TensorFlow version; the cells below use the 1.x graph API
# (tf.placeholder, tf.layers, tf.Session).
tf.__version__

'1.11.0'

In [3]:
# Fetch MNIST through Keras, then unpack each split into its image and label arrays.
train_split, test_split = mnist.load_data()
x_train, y_train = train_split
x_test, y_test = test_split

In [4]:
# Report the shape of the image array of each split.
for caption, images in (("Train Set: {}", x_train), ("Test Set: {}", x_test)):
    print(caption.format(images.shape))

Train Set: (60000, 28, 28)
Test Set: (10000, 28, 28)


In [5]:
# Report the shape of the label array of each split.
for caption, labels in (("Train Set: {}", y_train), ("Test Set: {}", y_test)):
    print(caption.format(labels.shape))

Train Set: (60000,)
Test Set: (10000,)


In [6]:
# Divide every pixel by 255.0 (presumably mapping 0-255 intensities into [0, 1] -- confirm
# against the loader). This rebinds x_train / x_test, so run the cell only once per load.
PIXEL_DIVISOR = 255.0
x_train, x_test = x_train / PIXEL_DIVISOR, x_test / PIXEL_DIVISOR

In [7]:
# One-hot encode the integer class labels: label k selects row k of the 10x10 identity.
identity_rows = np.eye(10)
y_train, y_test = identity_rows[y_train], identity_rows[y_test]

# If axis 1 is not the class axis (e.g. labels arrived as an (N, 1) column and the
# lookup produced (N, 1, 10)), take index 0 along axis 1 to get back to (N, 10).
if y_train.shape[1] != 10:
    y_train, y_test = y_train[:, 0], y_test[:, 0]

In [8]:
# Confirm the label arrays now carry one column per class.
for caption, labels in (("Train Set: {}", y_train), ("Test Set: {}", y_test)):
    print(caption.format(labels.shape))

Train Set: (60000, 10)
Test Set: (10000, 10)


In [9]:
# Cluster layout: job name -> list of "host:port" task addresses.
# Two parameter-server tasks (ps/task:0, ps/task:1) and one worker task.
job_addresses = {
    "ps": ["172.17.0.2:2223", "172.17.0.3:2224"],
    "worker": ["172.17.0.1:2222"],
}
cluster = tf.train.ClusterSpec(job_addresses)

In [10]:
# Graph inputs, both with a variable-size batch axis:
#   x -- 28x28 images, y -- 10-column one-hot label rows.
x = tf.placeholder(dtype=tf.float32, shape=(None, 28, 28), name="x")
y = tf.placeholder(dtype=tf.int8, shape=(None, 10), name="y")

## Sub Graph on Parameter Server 1

In [11]:
with tf.device("/job:ps/task:0"):
    # Settings shared by both convolutions: 5x5 kernels, "same" padding, ReLU activation.
    conv_options = dict(kernel_size=[5, 5], padding="same", activation=tf.nn.relu)
    # Settings shared by both pooling layers: 2x2 windows moved with stride 2.
    pool_options = dict(pool_size=[2, 2], strides=2)

    # Append a channel axis of size 1 so each image is (28, 28, 1) for conv2d.
    input_layer = tf.reshape(x, [-1, 28, 28, 1])

    # Stage 1: 32-filter convolution followed by max pooling.
    conv1 = tf.layers.conv2d(inputs=input_layer, filters=32, **conv_options)
    pool1 = tf.layers.max_pooling2d(inputs=conv1, **pool_options)

    # Stage 2: 64-filter convolution followed by max pooling.
    conv2 = tf.layers.conv2d(inputs=pool1, filters=64, **conv_options)
    pool2 = tf.layers.max_pooling2d(inputs=conv2, **pool_options)

## Sub Graph on Parameter Server 2

In [12]:
with tf.device("/job:ps/task:1"):
    # Dropout switch. tf.layers.dropout defaults to training=False, in which case it
    # returns its input unchanged -- so without an explicit flag the dropout layer
    # below never drops anything. The flag defaults to False (evaluation behaviour,
    # identical to what this graph did before); feed `is_training: True` on training
    # steps to actually apply dropout.
    is_training = tf.placeholder_with_default(False, shape=[], name="is_training")

    # Flatten the pooled feature maps: two stride-2 poolings take 28 -> 14 -> 7,
    # and the last convolution has 64 filters, hence 7 * 7 * 64 features per image.
    pool2_flat = tf.reshape(tensor=pool2, shape=[-1, 7 * 7 * 64])

    # Fully connected hidden layer.
    dense = tf.layers.dense(
        inputs=pool2_flat,
        units=1024,
        activation=tf.nn.relu,
    )

    # Drop 40% of the hidden activations, only while `is_training` is True.
    dropout = tf.layers.dropout(
        inputs=dense,
        rate=0.4,
        training=is_training,
    )

    # Logits layer: one unnormalised score per class.
    logits = tf.layers.dense(
        inputs=dropout,
        units=10,
    )

## Sub Graph on Worker Node

In [13]:
with tf.device("/job:worker/task:0"):
    # Loss: y holds one-hot rows, so argmax along axis 1 recovers the integer class
    # index that the sparse cross-entropy expects.
    loss = tf.losses.sparse_softmax_cross_entropy(
        labels=tf.argmax(y, axis=1),
        logits=logits,
    )

    # Training op: plain SGD. No global step exists in this graph before this point,
    # so tf.train.get_global_step() would return None and minimize() would silently
    # skip the step counter; get_or_create_global_step() makes sure there is one to
    # increment on every update.
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
    train_op = optimizer.minimize(
        loss=loss,
        global_step=tf.train.get_or_create_global_step(),
    )

    # Accuracy: fraction of rows whose highest-scoring class matches the label.
    correct_pred = tf.equal(tf.argmax(logits, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [14]:
# EPOCHS: number of iterations of the training loop (each iteration trains on one
#         randomly drawn mini-batch, so this counts steps rather than dataset passes).
# BATCH_SIZE: number of samples per mini-batch, also used as the test chunk size.
EPOCHS, BATCH_SIZE = 5000, 16

In [15]:
# Train through the worker's gRPC endpoint and report progress every 100 steps.
with tf.Session("grpc://172.17.0.1:2222") as sess:
    sess.run(tf.global_variables_initializer())

    num_test = x_test.shape[0]

    for i in range(EPOCHS):
        # Draw a random mini-batch; indices are sampled with replacement.
        idx = np.random.randint(0, x_train.shape[0], size=BATCH_SIZE)

        x_batch = x_train[idx]
        y_batch = y_train[idx]
        train_feed = {x: x_batch, y: y_batch}

        sess.run(train_op, feed_dict=train_feed)

        if i % 100 == 0:
            # Metrics on the batch just trained on, measured after the update.
            # Evaluated only on reporting steps: running this extra forward pass on
            # every step cost a second graph execution whose result was discarded
            # 99 times out of 100.
            LOSS, ACC = sess.run([loss, accuracy], feed_dict=train_feed)

            # Test accuracy over the whole test set, in chunks of BATCH_SIZE.
            # Each chunk's accuracy is weighted by its sample count so a shorter
            # final chunk does not get the same weight as a full one.
            correct = 0.0
            for j in range(0, num_test, BATCH_SIZE):
                x_chunk = x_test[j:j + BATCH_SIZE]
                y_chunk = y_test[j:j + BATCH_SIZE]
                chunk_acc = sess.run(accuracy, feed_dict={x: x_chunk, y: y_chunk})
                correct += float(chunk_acc) * x_chunk.shape[0]
            TEST_ACC = correct / num_test

            # NOTE: "Epochs" is the step index and "Val_Acc" is the accuracy on the
            # current training batch; the labels are kept so the log format is stable.
            print("Epochs: {:4d}   Loss:{:.6f}   Val_Acc:{:.3f}%   Test_Acc:{:.3f}%".format(i, LOSS, ACC*100, 100*TEST_ACC))

Epochs:    0   Loss:2.327526   Val_Acc:0.000%   Test_Acc:12.120%
Epochs:  100   Loss:2.292347   Val_Acc:18.750%   Test_Acc:20.800%
Epochs:  200   Loss:2.253204   Val_Acc:31.250%   Test_Acc:29.460%
Epochs:  300   Loss:2.251566   Val_Acc:43.750%   Test_Acc:38.220%
Epochs:  400   Loss:2.218656   Val_Acc:31.250%   Test_Acc:44.970%
Epochs:  500   Loss:2.236017   Val_Acc:31.250%   Test_Acc:52.180%
Epochs:  600   Loss:2.205187   Val_Acc:37.500%   Test_Acc:57.170%
Epochs:  700   Loss:2.097551   Val_Acc:62.500%   Test_Acc:60.250%
Epochs:  800   Loss:2.106690   Val_Acc:43.750%   Test_Acc:59.100%
Epochs:  900   Loss:1.979638   Val_Acc:62.500%   Test_Acc:63.400%
Epochs: 1000   Loss:1.865477   Val_Acc:81.250%   Test_Acc:68.010%
Epochs: 1100   Loss:1.688928   Val_Acc:75.000%   Test_Acc:74.300%
Epochs: 1200   Loss:1.665369   Val_Acc:68.750%   Test_Acc:77.260%
Epochs: 1300   Loss:1.516363   Val_Acc:68.750%   Test_Acc:75.700%
Epochs: 1400   Loss:1.198220   Val_Acc:87.500%   Test_Acc:79.090%
Epochs: 150