# Multi GPU training

Ref: http://python.usyiyi.cn/documents/effective-tf/10.html

- 레퍼런스는 많은데... 이게 제일 깔끔해 보여서 일단 이런 방식으로 시도.

Check:

- 학습이 잘 되는지 => OK
- Single GPU 랑 속도 차이가 얼마나 나는지 => OK
- TODO: tb logging 은 잘 되는지 => 기본적으론 잘 되겠지만 좀 처리를 해 줘야 할 수도 있음

etc

## Single GPU MNIST

In [1]:
import tensorflow as tf
import numpy as np
from tensorflow.examples.tutorials.mnist import input_data
import tensorflow.contrib.slim as slim
import collections
import time

In [2]:
# Display the installed TensorFlow version (the notebook echoes the value of
# the last expression in a cell).
tf.__version__

'1.5.0'

In [3]:
# Load MNIST from the local 'MNIST_data/' directory with one-hot encoded labels.
# `mnist` is a module-level global that train() reads directly.
mnist = input_data.read_data_sets('MNIST_data/', one_hot=True)

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


In [4]:
def prepare_graph():
    """Start from a clean default graph and create the model's input placeholders.

    Returns:
        A ``(X, y)`` pair of float32 placeholders with shapes ``[None, 784]``
        and ``[None, 10]`` respectively.
    """
    # Drop whatever was built by a previous cell so graphs do not accumulate.
    tf.reset_default_graph()

    images = tf.placeholder(tf.float32, shape=[None, 784])
    labels = tf.placeholder(tf.float32, shape=[None, 10])
    return images, labels

In [5]:
def build_graph(X, y):
    """Build the CNN classifier and its loss / accuracy tensors.

    The input is reshaped to ``[-1, 28, 28, 1]`` and passed through three
    (conv2d with 128 filters -> 2x2 max-pool) stages, then flattened and fed
    to a 10-unit fully connected layer without activation.

    Args:
        X: input tensor that can be reshaped to ``[-1, 28, 28, 1]``.
        y: label tensor compared against the logits via ``argmax`` on axis 1
            and used as ``onehot_labels`` for the cross-entropy loss.

    Returns:
        A ``(loss, accuracy)`` tuple of tensors.
    """
    net = tf.reshape(X, [-1, 28, 28, 1])

    # Kernel sizes of the three conv stages; each one is followed by a 2x2 max-pool.
    for kernel in ([5, 5], [3, 3], [3, 3]):
        net = slim.conv2d(net, 128, kernel_size=kernel)
        net = slim.max_pool2d(net, kernel_size=[2, 2])

    logits = slim.fully_connected(slim.flatten(net), 10, activation_fn=None)
    # Not returned; kept so the graph contains the same ops as before.
    prob = tf.nn.softmax(logits)

    predicted = tf.argmax(logits, axis=1)
    expected = tf.argmax(y, axis=1)
    hits = tf.cast(tf.equal(predicted, expected), tf.float32)
    accuracy = tf.reduce_mean(hits)

    loss = tf.losses.softmax_cross_entropy(onehot_labels=y, logits=logits)

    return loss, accuracy

In [6]:
def train(X, y, train_op, loss, accuracy):
    """Train on MNIST and print train/test metrics and timings after every epoch.

    Reads the module-level ``mnist`` dataset and the ``batch_size`` /
    ``epoch_n`` hyperparameters, so those must be defined before calling.

    Args:
        X: input placeholder, fed with image batches from ``mnist``.
        y: label placeholder, fed with label batches from ``mnist``.
        train_op: op run once per training batch to update the model.
        loss: loss tensor; fetched per batch and averaged per epoch.
        accuracy: accuracy tensor; fetched per batch and averaged per epoch.

    Returns:
        None. Results are only printed.
    """
    config = tf.ConfigProto(log_device_placement=True)
    config.gpu_options.allow_growth = True
    # NOTE(review): the original author remarked that the device placement log
    # did not show up in the notebook output.

    n_iter = mnist.train.num_examples // batch_size
    # A larger test batch size is fine in a multi-GPU environment.
    test_batch_size = 1024
    # Only full batches are evaluated; a trailing partial batch is skipped.
    n_test_iter = mnist.test.num_examples // test_batch_size

    # Use the session as a context manager so it is closed (and its resources
    # released) when training finishes or fails; previously it was never closed,
    # which leaks a session every time this is called from another cell.
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())

        for epoch in range(epoch_n):
            # ---- training pass ----
            st = time.time()

            avg_loss = 0.
            avg_acc = 0.
            for _ in range(n_iter):
                batch_x, batch_y = mnist.train.next_batch(batch_size)
                _, cur_acc, cur_loss = sess.run([train_op, accuracy, loss], {X: batch_x, y: batch_y})
                avg_acc += cur_acc
                avg_loss += cur_loss

            avg_acc /= n_iter
            avg_loss /= n_iter

            train_elapsed = time.time() - st

            # ---- evaluation pass (no train_op, so weights are not updated) ----
            st = time.time()

            test_acc = 0.
            test_loss = 0.
            for _ in range(n_test_iter):
                batch_x, batch_y = mnist.test.next_batch(test_batch_size)
                cur_acc, cur_loss = sess.run([accuracy, loss], {X: batch_x, y: batch_y})
                test_acc += cur_acc
                test_loss += cur_loss
            test_acc /= n_test_iter
            test_loss /= n_test_iter

            test_elapsed = time.time() - st

            print("[{:2}/{}] (train) acc: {:.2%}, loss: {:.3f} | (test) acc: {:.2%}, loss: {:.3f} | {:.2f}s, {:.2f}s".
                  format(epoch+1, epoch_n, avg_acc, avg_loss, test_acc, test_loss, train_elapsed, test_elapsed))

In [7]:
# hyperparams (module-level globals read by train())
# NOTE: only batch sizes that are a multiple of the number of GPUs are
# considered (English rendering of the string literal below).
''' [!] batch_size 는 n_gpu 의 배수인 경우만 고려함. '''
epoch_n = 10  # number of training epochs
batch_size = 1024  # training mini-batch size

In [8]:
# Single-GPU baseline: build one copy of the model and train it with Adam
# (default optimizer settings).
X, y = prepare_graph()
loss, accuracy = build_graph(X, y)
train_op = tf.train.AdamOptimizer().minimize(loss)

train(X, y, train_op, loss, accuracy)

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See tf.nn.softmax_cross_entropy_with_logits_v2.

[ 1/10] (train) acc: 83.63%, loss: 0.607 | (test) acc: 95.88%, loss: 0.132 | 3.99s, 0.20s
[ 2/10] (train) acc: 96.87%, loss: 0.105 | (test) acc: 98.05%, loss: 0.065 | 3.10s, 0.18s
[ 3/10] (train) acc: 98.05%, loss: 0.064 | (test) acc: 98.61%, loss: 0.047 | 3.10s, 0.18s
[ 4/10] (train) acc: 98.45%, loss: 0.051 | (test) acc: 98.71%, loss: 0.044 | 3.11s, 0.18s
[ 5/10] (train) acc: 98.61%, loss: 0.044 | (test) acc: 98.95%, loss: 0.037 | 3.11s, 0.18s
[ 6/10] (train) acc: 98.91%, loss: 0.034 | (test) acc: 98.91%, loss: 0.034 | 3.11s, 0.18s
[ 7/10] (train) acc: 99.05%, loss: 0.030 | (test) acc: 98.89%, loss: 0.035 | 3.12s, 0.18s
[ 8/10] (train) acc: 99.25%, loss: 0.024 | (test) acc: 99.13%, loss: 0.028 | 3.19s, 0.18s
[ 9/10] (train) acc: 99.30%, loss: 0.022 | (test) acc: 99.02%, loss: 0.031 | 3.27s, 0.18s
[

## Multi-GPU MNIST

In [9]:
def make_parallel(fn, num_gpus, **kwargs):
    """Build one replica of ``fn`` per GPU and stack the replicas' outputs.

    Each keyword tensor is cut into ``num_gpus`` pieces with ``tf.split`` and
    the i-th piece of every input is handed to the replica placed on GPU ``i``.
    Variables are created by the first replica and reused by all later ones.

    Args:
        fn: model builder function; called as ``fn(**inputs)`` once per GPU.
        num_gpus: number of GPUs (and therefore replicas / input splits).
        **kwargs: inputs of the model builder, e.g. ``X=X, y=y``.

    Returns:
        The per-GPU return values of ``fn`` packed into a single tensor with
        ``tf.convert_to_tensor``, one row per GPU, e.g. for a builder that
        returns ``(loss, acc)``::

            [[loss, acc],
             [loss, acc],
             ...]
    """
    shards = {name: tf.split(tensor, num_gpus) for name, tensor in kwargs.items()}

    replicas = []
    for gpu_idx in range(num_gpus):
        device = tf.DeviceSpec(device_type="GPU", device_index=gpu_idx)
        # Only the first replica creates variables; the rest share them.
        share_vars = gpu_idx > 0
        with tf.device(device), tf.variable_scope(tf.get_variable_scope(), reuse=share_vars):
            replica_inputs = {name: parts[gpu_idx] for name, parts in shards.items()}
            replicas.append(fn(**replica_inputs))

    return tf.convert_to_tensor(replicas)

In [10]:
# Multi-GPU run with 4 GPUs: build one model replica per GPU, then average the
# per-GPU (loss, accuracy) rows into a single pair.
X, y = prepare_graph()
parallel_tensors = make_parallel(build_graph, 4, X=X, y=y) # loss, acc
integrated_tensors = tf.reduce_mean(parallel_tensors, axis=0)
loss = integrated_tensors[0]
accuracy = integrated_tensors[1]
# English rendering of the author's note in the string literal below:
# colocate_gradients_with_ops must be enabled so that gradients are computed on
# the same device as the original ops. Without it, all gradients are computed
# on the default device (gpu:0), so there is no speed-up (it gets slower).
'''
[!] colocate_gradients_with_ops 옵션을 켜 줘야 gradient 를 계산할 때 original ops 와 같은 디바이스에서 계산함.
만약 이걸 키지 않으면 gradient 는 전부 default device 인 gpu:0 에서 하게 되어서 속도가 안 빨라짐 (오히려 더 느려짐;)
'''
train_op = tf.train.AdamOptimizer().minimize(loss, colocate_gradients_with_ops=True)

train(X, y, train_op, loss, accuracy)

[ 1/10] (train) acc: 82.05%, loss: 0.633 | (test) acc: 95.81%, loss: 0.140 | 2.52s, 0.10s
[ 2/10] (train) acc: 96.58%, loss: 0.111 | (test) acc: 98.07%, loss: 0.066 | 0.95s, 0.07s
[ 3/10] (train) acc: 97.91%, loss: 0.066 | (test) acc: 98.29%, loss: 0.054 | 1.01s, 0.06s
[ 4/10] (train) acc: 98.47%, loss: 0.050 | (test) acc: 98.70%, loss: 0.041 | 0.95s, 0.06s
[ 5/10] (train) acc: 98.73%, loss: 0.042 | (test) acc: 98.78%, loss: 0.035 | 0.91s, 0.06s
[ 6/10] (train) acc: 98.91%, loss: 0.035 | (test) acc: 99.00%, loss: 0.030 | 0.94s, 0.06s
[ 7/10] (train) acc: 99.12%, loss: 0.029 | (test) acc: 98.87%, loss: 0.034 | 0.95s, 0.07s
[ 8/10] (train) acc: 99.19%, loss: 0.027 | (test) acc: 99.14%, loss: 0.026 | 0.96s, 0.07s
[ 9/10] (train) acc: 99.28%, loss: 0.023 | (test) acc: 99.05%, loss: 0.029 | 0.94s, 0.07s
[10/10] (train) acc: 99.33%, loss: 0.022 | (test) acc: 98.94%, loss: 0.031 | 0.94s, 0.10s


### Performance test with 2 GPUs

In [11]:
# Multi-GPU run with 2 GPUs, for comparing speed against the other runs:
# build one model replica per GPU and average the per-GPU (loss, accuracy) rows.
X, y = prepare_graph()
parallel_tensors = make_parallel(build_graph, 2, X=X, y=y) # loss, acc
integrated_tensors = tf.reduce_mean(parallel_tensors, axis=0)
loss = integrated_tensors[0]
accuracy = integrated_tensors[1]
# colocate_gradients_with_ops=True asks the optimizer to place each gradient op
# on the same device as the forward op it belongs to.
train_op = tf.train.AdamOptimizer().minimize(loss, colocate_gradients_with_ops=True)

train(X, y, train_op, loss, accuracy)

[ 1/10] (train) acc: 82.50%, loss: 0.628 | (test) acc: 95.88%, loss: 0.138 | 1.87s, 0.14s
[ 2/10] (train) acc: 96.72%, loss: 0.108 | (test) acc: 97.43%, loss: 0.076 | 1.64s, 0.11s
[ 3/10] (train) acc: 97.94%, loss: 0.067 | (test) acc: 98.72%, loss: 0.046 | 1.69s, 0.11s
[ 4/10] (train) acc: 98.46%, loss: 0.049 | (test) acc: 98.56%, loss: 0.047 | 1.74s, 0.11s
[ 5/10] (train) acc: 98.67%, loss: 0.043 | (test) acc: 98.99%, loss: 0.035 | 1.68s, 0.11s
[ 6/10] (train) acc: 98.93%, loss: 0.035 | (test) acc: 98.67%, loss: 0.042 | 1.70s, 0.10s
[ 7/10] (train) acc: 99.12%, loss: 0.029 | (test) acc: 98.96%, loss: 0.029 | 1.72s, 0.10s
[ 8/10] (train) acc: 99.25%, loss: 0.024 | (test) acc: 98.93%, loss: 0.033 | 1.75s, 0.11s
[ 9/10] (train) acc: 99.20%, loss: 0.026 | (test) acc: 99.06%, loss: 0.028 | 1.75s, 0.10s
[10/10] (train) acc: 99.44%, loss: 0.019 | (test) acc: 99.31%, loss: 0.025 | 1.67s, 0.11s
