# Multi GPU training

Two versions:

1. optimizer 의 `colocate_gradients_with_ops=True` 옵션을 사용
2. 위 방법을 가속하기 위해 NCCL 라이브러리 활용

## Single GPU MNIST

In [1]:
import tensorflow as tf
import numpy as np
from tensorflow.examples.tutorials.mnist import input_data
import tensorflow.contrib.slim as slim
import pandas as pd
import collections
import time

  from ._conv import register_converters as _register_converters


In [2]:
# Display the TensorFlow version used for this run.
tf.__version__

'1.6.0'

In [3]:
# Load MNIST from 'MNIST_data/' with one-hot encoded labels; `train()` reads this module-level object.
mnist = input_data.read_data_sets('MNIST_data/', one_hot=True)

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


In [4]:
def prepare_graph():
    """Reset the default graph, fix the random seeds and create the input placeholders.

    Returns:
        A pair ``(X, y)`` of float32 placeholders with shapes ``[None, 784]``
        and ``[None, 10]`` respectively.
    """
    seed = 777

    # Start from an empty default graph before seeding it.
    tf.reset_default_graph()
    np.random.seed(seed)
    tf.set_random_seed(seed)

    # inputs
    images = tf.placeholder(tf.float32, shape=[None, 784])
    labels = tf.placeholder(tf.float32, shape=[None, 10])
    return images, labels

In [5]:
def build_graph(X, y):
    """Build the CNN classifier together with its loss and accuracy.

    Args:
        X: input batch; reshaped here to ``[-1, 28, 28, 1]``.
        y: labels, used as ``onehot_labels`` for the loss and arg-maxed along
            axis 1 for the accuracy.

    Returns:
        ``(loss, accuracy)``: the softmax cross-entropy loss and the mean of the
        per-example correctness.
    """
    net = tf.reshape(X, [-1, 28, 28, 1])

    # Three conv -> conv -> 2x2 max-pool stages with 128 filters each;
    # the first stage uses 5x5 kernels, the other two use 3x3 kernels.
    for kernel in ([5, 5], [3, 3], [3, 3]):
        net = slim.conv2d(net, 128, kernel_size=kernel)
        net = slim.conv2d(net, 128, kernel_size=kernel)
        net = slim.max_pool2d(net, kernel_size=[2, 2])

    features = slim.flatten(net)
    logits = slim.fully_connected(features, 10, activation_fn=None)
    # Class probabilities; only loss and accuracy are returned to the caller.
    prob = tf.nn.softmax(logits)

    predicted = tf.argmax(logits, axis=1)
    expected = tf.argmax(y, axis=1)
    hits = tf.cast(tf.equal(predicted, expected), tf.float32)
    accuracy = tf.reduce_mean(hits)
    loss = tf.losses.softmax_cross_entropy(onehot_labels=y, logits=logits)

    return loss, accuracy

In [6]:
def train(X, y, train_op, loss, accuracy, n_epoch, batch_size, test_batch_size, sync_op=None):
    """Train on MNIST for ``n_epoch`` epochs, evaluating on the test split after each one.

    Reads the module-level ``mnist`` dataset and prints one summary line per epoch.

    Args:
        X, y: placeholders fed with the image and label batches.
        train_op: op run once per training batch.
        loss, accuracy: tensors fetched for both training and test batches.
        n_epoch: number of epochs to run.
        batch_size: training batch size.
        test_batch_size: evaluation batch size (it can be made larger in a
            multi-GPU setup).
        sync_op: optional op run once right after variable initialization;
            required by the NCCL variant.

    Returns:
        ``(avg_train_time, avg_test_time)``: average seconds per epoch. The
        first (warm-up) epoch is excluded unless it is the only epoch.

    Raises:
        ValueError: if a batch size is larger than its dataset split, which
            would leave no full batch to run.
    """
    n_iter = mnist.train.num_examples // batch_size
    n_test_iter = mnist.test.num_examples // test_batch_size
    if n_iter == 0 or n_test_iter == 0:
        raise ValueError(
            "batch_size and test_batch_size must not exceed the size of their dataset split")

    config = tf.ConfigProto(log_device_placement=True)
    config.gpu_options.allow_growth = True
    # NOTE: the device placement log was observed not to show up in the notebook output.

    train_times = []
    test_times = []

    # The context manager closes the session so that its resources are released
    # before the next benchmark builds a new graph.
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())

        # Compare against None explicitly instead of relying on the truthiness of a graph object.
        if sync_op is not None:
            sess.run(sync_op)

        for epoch in range(n_epoch):
            st = time.time()

            avg_loss = 0.
            avg_acc = 0.
            for _ in range(n_iter):
                batch_x, batch_y = mnist.train.next_batch(batch_size)
                _, cur_acc, cur_loss = sess.run([train_op, accuracy, loss], {X: batch_x, y: batch_y})
                avg_acc += cur_acc
                avg_loss += cur_loss

            avg_acc /= n_iter
            avg_loss /= n_iter

            train_elapsed = time.time() - st
            st = time.time()

            test_acc = 0.
            test_loss = 0.
            for _ in range(n_test_iter):
                batch_x, batch_y = mnist.test.next_batch(test_batch_size)
                cur_acc, cur_loss = sess.run([accuracy, loss], {X: batch_x, y: batch_y})
                test_acc += cur_acc
                test_loss += cur_loss
            test_acc /= n_test_iter
            test_loss /= n_test_iter

            test_elapsed = time.time() - st

            train_times.append(train_elapsed)
            test_times.append(test_elapsed)
            print("[{:2}/{}] (train) acc: {:.2%}, loss: {:.3f} | (test) acc: {:.2%}, loss: {:.3f} | {:.2f}s, {:.2f}s".
                  format(epoch+1, n_epoch, avg_acc, avg_loss, test_acc, test_loss, train_elapsed, test_elapsed))

    # Skip the warm-up epoch (index 0); fall back to it when it is the only
    # epoch so that n_epoch == 1 no longer divides by zero.
    timed_train = train_times[1:] or train_times
    timed_test = test_times[1:] or test_times
    total_train_time = sum(timed_train) / len(timed_train) if timed_train else 0.
    total_test_time = sum(timed_test) / len(timed_test) if timed_test else 0.
    print("Average time\t TRAIN {:.3f}\t TEST {:.3f}".format(total_train_time, total_test_time))

    return total_train_time, total_test_time

In [7]:
# Hyperparameters.
# [!] Only batch sizes that are a multiple of n_gpu are considered.
n_epoch = 10
train_batch_size = test_batch_size = 1024

# Timing table for the shared-weight runs, indexed by the number of GPUs.
stats = pd.DataFrame(columns=["#GPUs", "train", "test"]).set_index("#GPUs")

In [8]:
# no parallel
X, y = prepare_graph()
loss, accuracy = build_graph(X, y)
train_op = tf.train.AdamOptimizer().minimize(loss)

avg_train, avg_test = train(X, y, train_op, loss, accuracy, n_epoch, train_batch_size, test_batch_size)
stats.loc[1] = [avg_train, avg_test]

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See tf.nn.softmax_cross_entropy_with_logits_v2.

[ 1/10] (train) acc: 80.23%, loss: 0.638 | (test) acc: 97.09%, loss: 0.095 | 9.31s, 0.46s
[ 2/10] (train) acc: 97.63%, loss: 0.076 | (test) acc: 98.80%, loss: 0.037 | 7.18s, 0.41s
[ 3/10] (train) acc: 98.59%, loss: 0.043 | (test) acc: 98.83%, loss: 0.037 | 7.21s, 0.42s
[ 4/10] (train) acc: 98.97%, loss: 0.032 | (test) acc: 99.08%, loss: 0.028 | 7.22s, 0.41s
[ 5/10] (train) acc: 99.24%, loss: 0.024 | (test) acc: 98.96%, loss: 0.034 | 7.24s, 0.41s
[ 6/10] (train) acc: 99.46%, loss: 0.017 | (test) acc: 98.93%, loss: 0.035 | 7.24s, 0.41s
[ 7/10] (train) acc: 99.47%, loss: 0.017 | (test) acc: 99.22%, loss: 0.023 | 7.25s, 0.41s
[ 8/10] (train) acc: 99.57%, loss: 0.014 | (test) acc: 99.15%, loss: 0.031 | 7.25s, 0.41s
[ 9/10] (train) acc: 99.64%, loss: 0.012 | (test) acc: 98.95%, loss: 0.038 | 7.25s, 0.41s
[

## Multi GPUs MNIST

In [9]:
def make_parallel(model_builder, num_gpus, **kwargs):
    """Replicate a model over several GPUs with shared weights.

    Every keyword input is split into ``num_gpus`` pieces with ``tf.split`` and
    piece ``i`` is fed to the replica built under ``GPU:i``. The first replica
    creates the variables; the following ones are built with ``reuse=True``.

    Args:
        model_builder: model builder function, called with the split inputs as
            keyword arguments.
        num_gpus: number of replicas (and of input splits).
        **kwargs: inputs of the model builder; e.g. ``X=X, y=y``.

    Returns:
        The return values of ``model_builder`` for every replica, converted to a
        single tensor with one row per GPU, e.g.
        ``[[loss, acc], [loss, acc], ...]``.
    """
    shards = {name: tf.split(tensor, num_gpus) for name, tensor in kwargs.items()}

    def build_replica(index):
        # Place the replica on its own GPU; every replica after the first reuses the variables.
        device = tf.DeviceSpec(device_type="GPU", device_index=index)
        with tf.device(device):
            with tf.variable_scope(tf.get_variable_scope(), reuse=index > 0):
                replica_inputs = {name: pieces[index] for name, pieces in shards.items()}
                return model_builder(**replica_inputs)  # e.g. (loss, accuracy)

    return tf.convert_to_tensor([build_replica(index) for index in range(num_gpus)])

In [10]:
def performance_test(n_gpu):
    """Benchmark shared-weight data-parallel training on ``n_gpu`` GPUs.

    The average train/test times returned by ``train`` are stored in row
    ``n_gpu`` of the module-level ``stats`` table.
    """
    X, y = prepare_graph()

    # One row of (loss, accuracy) per GPU; average over the GPUs.
    per_gpu_metrics = make_parallel(build_graph, n_gpu, X=X, y=y)
    mean_metrics = tf.reduce_mean(per_gpu_metrics, axis=0)
    loss, accuracy = mean_metrics[0], mean_metrics[1]

    # [!] colocate_gradients_with_ops must be enabled so that each gradient is
    # computed on the same device as its original op. Without it all gradients
    # are computed on the default device gpu:0, which gives no speed-up (it even
    # gets slower).
    optimizer = tf.train.AdamOptimizer()
    train_op = optimizer.minimize(loss, colocate_gradients_with_ops=True)

    timings = train(X, y, train_op, loss, accuracy, n_epoch, train_batch_size, test_batch_size)
    stats.loc[n_gpu] = list(timings)

### Performance test with 2/4/8 GPUs

In [11]:
# Shared-weight (colocate_gradients_with_ops) benchmark on 2 GPUs.
performance_test(2)

[ 1/10] (train) acc: 83.26%, loss: 0.514 | (test) acc: 97.54%, loss: 0.082 | 5.48s, 0.29s
[ 2/10] (train) acc: 97.84%, loss: 0.069 | (test) acc: 98.56%, loss: 0.048 | 3.87s, 0.24s
[ 3/10] (train) acc: 98.75%, loss: 0.040 | (test) acc: 98.67%, loss: 0.040 | 3.87s, 0.21s
[ 4/10] (train) acc: 99.16%, loss: 0.028 | (test) acc: 98.74%, loss: 0.040 | 3.87s, 0.22s
[ 5/10] (train) acc: 99.28%, loss: 0.023 | (test) acc: 98.77%, loss: 0.038 | 3.93s, 0.25s
[ 6/10] (train) acc: 99.41%, loss: 0.018 | (test) acc: 99.08%, loss: 0.028 | 3.92s, 0.23s
[ 7/10] (train) acc: 99.52%, loss: 0.015 | (test) acc: 98.91%, loss: 0.034 | 3.90s, 0.23s
[ 8/10] (train) acc: 99.61%, loss: 0.012 | (test) acc: 99.23%, loss: 0.030 | 3.93s, 0.23s
[ 9/10] (train) acc: 99.63%, loss: 0.011 | (test) acc: 99.26%, loss: 0.027 | 3.93s, 0.23s
[10/10] (train) acc: 99.69%, loss: 0.010 | (test) acc: 99.18%, loss: 0.035 | 3.89s, 0.23s
Average time	 TRAIN 3.902	 TEST 0.231


In [12]:
# Shared-weight (colocate_gradients_with_ops) benchmark on 4 GPUs.
performance_test(4)

[ 1/10] (train) acc: 81.60%, loss: 0.605 | (test) acc: 97.21%, loss: 0.087 | 4.61s, 0.34s
[ 2/10] (train) acc: 97.99%, loss: 0.066 | (test) acc: 98.62%, loss: 0.042 | 2.32s, 0.25s
[ 3/10] (train) acc: 98.66%, loss: 0.043 | (test) acc: 99.02%, loss: 0.030 | 2.31s, 0.25s
[ 4/10] (train) acc: 99.11%, loss: 0.028 | (test) acc: 99.01%, loss: 0.029 | 2.33s, 0.25s
[ 5/10] (train) acc: 99.18%, loss: 0.024 | (test) acc: 99.02%, loss: 0.034 | 2.30s, 0.25s
[ 6/10] (train) acc: 99.43%, loss: 0.019 | (test) acc: 99.16%, loss: 0.027 | 2.33s, 0.22s
[ 7/10] (train) acc: 99.58%, loss: 0.013 | (test) acc: 99.28%, loss: 0.024 | 2.32s, 0.24s
[ 8/10] (train) acc: 99.60%, loss: 0.013 | (test) acc: 98.98%, loss: 0.033 | 2.27s, 0.25s
[ 9/10] (train) acc: 99.62%, loss: 0.012 | (test) acc: 99.02%, loss: 0.027 | 2.31s, 0.24s
[10/10] (train) acc: 99.66%, loss: 0.010 | (test) acc: 99.11%, loss: 0.032 | 2.30s, 0.24s
Average time	 TRAIN 2.311	 TEST 0.242


In [13]:
# Shared-weight (colocate_gradients_with_ops) benchmark on 8 GPUs.
performance_test(8)

[ 1/10] (train) acc: 82.07%, loss: 0.588 | (test) acc: 97.80%, loss: 0.076 | 5.89s, 0.35s
[ 2/10] (train) acc: 97.99%, loss: 0.066 | (test) acc: 98.18%, loss: 0.054 | 2.18s, 0.18s
[ 3/10] (train) acc: 98.62%, loss: 0.045 | (test) acc: 98.99%, loss: 0.032 | 2.18s, 0.16s
[ 4/10] (train) acc: 98.95%, loss: 0.032 | (test) acc: 99.02%, loss: 0.033 | 2.19s, 0.17s
[ 5/10] (train) acc: 99.33%, loss: 0.021 | (test) acc: 98.74%, loss: 0.041 | 2.17s, 0.16s
[ 6/10] (train) acc: 99.40%, loss: 0.018 | (test) acc: 98.97%, loss: 0.034 | 2.16s, 0.17s
[ 7/10] (train) acc: 99.56%, loss: 0.014 | (test) acc: 99.11%, loss: 0.031 | 2.17s, 0.15s
[ 8/10] (train) acc: 99.57%, loss: 0.013 | (test) acc: 98.95%, loss: 0.036 | 2.16s, 0.17s
[ 9/10] (train) acc: 99.59%, loss: 0.013 | (test) acc: 99.12%, loss: 0.031 | 2.17s, 0.14s
[10/10] (train) acc: 99.79%, loss: 0.007 | (test) acc: 99.05%, loss: 0.032 | 2.18s, 0.16s
Average time	 TRAIN 2.173	 TEST 0.162


## Current processing time

average of 9 epochs (epochs 2–10; the first, warm-up epoch is excluded)

- train/test batch size: 1024

In [14]:
def df_print(df):
    """Return a styled copy of a timing table with speed-up columns added.

    Args:
        df: DataFrame with ``'train'`` and ``'test'`` columns whose index holds
            the GPU count; it must contain a row labelled ``1``, which is used
            as the baseline. The input is not modified.

    Returns:
        A pandas ``Styler`` over the copy, with ``'train %'`` and ``'test %'``
        columns holding ``baseline / value`` formatted as percentages.
    """
    table = df.copy()

    # Speed-up relative to the single-GPU row, per timing column. The baseline
    # is looked up explicitly by label with .loc rather than by chained indexing.
    for column in ('train', 'test'):
        baseline = table.loc[1, column]
        table[column + ' %'] = baseline / table[column]

    # Pretty-print: seconds with 3 decimals, speed-ups as whole percentages.
    return table.style.format({
        'train': '{:.3f}',
        'test': '{:.3f}',
        'train %': '{:.0%}',
        'test %': '{:.0%}'
    })

In [15]:
# Show the shared-weight timings with speed-ups relative to the 1-GPU row.
df_print(stats)

Unnamed: 0_level_0,train,test,train %,test %
#GPUs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,7.234,0.412,100%,100%
2,3.902,0.231,185%,178%
4,2.311,0.242,313%,170%
8,2.173,0.162,333%,254%


## 개선: NCCL

현재 multi-gpu training 프로세스는 다음과 같음:

1. 각 gpu 에 네트워크를 할당
2. input data 를 각 gpu 별로 분배 (DataParallel)
3. 각 gpu 마다 weight 를 gpu:0 에서 읽어옴
    - `make_parallel` 코드를 보면 0 번 gpu 외에는 `reuse=True` 로 들어감
    - 즉, 매 step 마다 gpu:0 에서 weight 를 읽어오게 됨
4. 각 gpu마다 2번에서 분배받은 input data 에 대해 forward/backward(gradients) 계산 
5. gradients 를 gpu:0 으로 모아서 평균내어 weight update
    - 이렇게 업데이트한 weight 는 gpu:0 에만 저장되지만 다음 스텝에서 다른 gpu들은 3번 과정에서 다시 gpu:0 에서 읽어오므로 괜찮음

여기서 문제가 되는건 결국 gpu:0 으로 gradients 를 모으고 다시 재배포하는 과정으로, NCCL 을 사용하면 이 과정을 가속할 수 있음

1. gpu 마다 네트워크를 할당할 때 `reuse=False` 로 하여 각 gpu 마다 weight 를 독립적으로 유지
2. NCCL 의 all_sum 함수를 이용하여 gradient 를 모아서 평균내고 각 gpu 로 재배포
    - 결국 gradients 를 모으고 다시 재배포하는 것은 동일하나 이것을 NCCL 을 통해 함으로써 가속
    - NCCL 의 all_sum 함수를 활용하기 위해 weight 를 업데이트 후 재배포하는 것이 아니라 gradient 만 모아서 재배포하고 각 gpu 에서 알아서 업데이트
    
### 참고

- [Tensorpack과 Multigpu를 활용한 빠른 트레이닝 코드 작성하기](http://openresearch.ai/t/tensorpack-multigpu/45)
- [NCCL을 이용한 Efficient한 Tensorflow MultiGPU Training 코드 작성하기](http://openresearch.ai/t/nccl-efficient-tensorflow-multigpu-training/159)

In [16]:
# Superseded by `synchronize_gpus()`; kept commented out for reference only.
# def get_post_init_ops():
#     """
#     Copy values of variables on GPU 0 to other GPUs.
#     """
#     # literally all variables, because it's better to sync optimizer-internal variables as well
#     all_vars = tf.global_variables() + tf.local_variables()
#     var_by_name = dict([(v.name, v) for v in all_vars])
#     post_init_ops = []
#     for v in all_vars:
#         if not v.name.startswith('tower'):
#             continue
#         if v.name.startswith('tower0'):
#             # no need for copy to tower0
#             continue
#         # in this trainer, the master name doesn't have the towerx/ prefix
#         split_name = v.name.split('/')
#         prefix = split_name[0]
#         realname = '/'.join(split_name[1:])
#         if prefix in realname:
#             # logger.warning("variable {} has its prefix {} appears multiple times in its name!".format(v.name, prefix))
#             pass
#         copy_from = var_by_name.get(v.name.replace(prefix, 'tower0'))
#         if copy_from is not None:
#             post_init_ops.append(v.assign(copy_from.read_value()))
#         else:
#             # logger.warning("Cannot find {} in the graph!".format(realname))
#             pass
#     print("'sync_variables_from_main_tower' includes {} operations.".format(len(post_init_ops)))
#     return tf.group(*post_init_ops, name='sync_variables_from_main_tower')

In [17]:
def make_parallel_without_weight_sharing(model_builder, optimizers, n_gpu, **kwargs):
    """Build one independent model replica per GPU (no weight sharing) and its gradients.

    Each replica lives in its own ``tower{i}`` variable scope on ``GPU:i`` and
    its gradients are obtained with ``optimizers[i].compute_gradients``.

    Args:
        model_builder: model builder function, called with the split inputs as
            keyword arguments; the first element of its return value is used as
            the loss.
        optimizers: one optimizer per GPU.
        n_gpu: number of towers (K).
        **kwargs: inputs of the model builder; e.g. ``X=X, y=y``.

    Returns:
        out_list: [K]; the return value of ``model_builder`` for every tower,
            e.g. ``(loss, accuracy)``.
        grad_list: [K, N]; the gradients of every tower.
        var_list: [K, N]; the variables matching ``grad_list``.
    """
    # According to the reference this split is expensive and a separate input
    # queue per GPU is preferable; this may be why the 8-GPU result falls short.
    in_splits = {k: tf.split(v, n_gpu) for k, v in kwargs.items()}

    out_list = []
    grad_list = []
    var_list = []
    for i in range(n_gpu):
        scope = "tower{}".format(i)
        with tf.device(tf.DeviceSpec(device_type="GPU", device_index=i)):
            # reuse=False keeps the weights of every GPU independent.
            with tf.variable_scope(scope, reuse=False):
                inputs = {k: v[i] for k, v in in_splits.items()}
                # Call the builder that was passed in rather than a hard-coded one.
                ret = model_builder(**inputs)  # e.g. (loss, accuracy)
                out_list.append(ret)

                # Restrict the gradients to this tower's own variables; without
                # var_list they would be computed for the other towers' variables
                # as well. The trailing "/" keeps e.g. "tower1" from also
                # matching "tower10".
                variables = slim.get_variables(scope=scope + "/")
                grads_vars = optimizers[i].compute_gradients(ret[0], var_list=variables)

        # Separate the (gradient, variable) pairs into two parallel lists.
        grad_list.append([grad for grad, _ in grads_vars])
        var_list.append([var for _, var in grads_vars])

    return out_list, grad_list, var_list

In [18]:
from tensorflow.contrib import nccl

def reduce_average_gradients(grad_list):
    """Average the gradients over all GPUs and hand the result back to every GPU.

    The reduction is done with ``nccl.all_sum``; each summed tensor is then
    divided by the number of GPUs on the device it lives on.

    Args:
        grad_list: [K, N]; K == n_gpu, N gradients per tower in matching order.

    Returns:
        new_grad_list: [K, N]; averaged gradients. For a single GPU the input is
        returned unchanged.

    Raises:
        ValueError: if the towers do not all have the same number of gradients.
    """
    n_gpu = len(grad_list)
    if n_gpu == 1:
        return grad_list

    # zip() below silently truncates to the shortest tower, so check the
    # invariant explicitly instead of dropping gradients.
    if len({len(tower) for tower in grad_list}) > 1:
        raise ValueError("every tower must provide the same number of gradients")

    # The same gradient has to be summed over all towers, i.e. we need
    # grad_list[:, i]. These are plain lists, so transpose [K, N] -> [N, K]
    # with zip(*grad_list).
    new_grad_list = []
    for grads_per_var in zip(*grad_list):
        # grads_per_var holds the gradient of one variable from every tower;
        # all_sum adds them up and returns one summed tensor per GPU.
        summed = nccl.all_sum(grads_per_var)

        # The tensors are sums, so turn them into averages on their own device.
        averaged = []
        for tensor in summed:
            with tf.device(tensor.device):
                averaged.append(tensor / float(n_gpu))
        new_grad_list.append(averaged)

    # Transpose back from [N, K] to [K, N].
    return list(zip(*new_grad_list))

In [19]:
def apply_gradients(grad_list, var_list, optimizers, n_gpu):
    """Create the op that applies every tower's gradients to that tower's variables.

    Args:
        grad_list: [K, N]; gradients per tower.
        var_list: [K, N]; variables per tower, matching ``grad_list``.
        optimizers: one optimizer per GPU.
        n_gpu: number of towers (K).

    Returns:
        A single op grouping the per-tower train ops.
    """
    train_ops = []
    for i in range(n_gpu):
        grads_vars = list(zip(grad_list[i], var_list[i]))

        with tf.device(tf.DeviceSpec(device_type="GPU", device_index=i)):
            # The collection filter matches name prefixes, so the trailing "/"
            # keeps e.g. "tower1" from also selecting the ops of "tower10".
            scope = "tower{}/".format(i)
            # Needed when the model has batch normalization (its update ops).
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, scope=scope)
            with tf.control_dependencies(update_ops):
                train_ops.append(optimizers[i].apply_gradients(grads_vars))

    return tf.group(*train_ops)

In [20]:
# weights 의 초기값 동기화가 필요함
def synchronize_gpus(n_gpu):
    """Create the op that copies the variables of tower0 to every other tower.

    The initial weights must be identical on all GPUs. Variables are paired by
    their position in each tower's variable list.

    Args:
        n_gpu: number of towers.

    Returns:
        A single op grouping all the assignments.

    Raises:
        ValueError: if a tower does not have as many variables as tower0.
    """
    # Local variables are ignored for now; none are expected to exist.
    assert tf.local_variables() == [], "local variables are not synchronized"

    # The scope filter matches name prefixes, so the trailing "/" keeps e.g.
    # "tower1" from also collecting the variables of "tower10".
    towers = [slim.get_variables('tower{}/'.format(i)) for i in range(n_gpu)]

    copy_ops = []
    for i in range(1, n_gpu):
        # zip() would silently skip the surplus variables of the longer list.
        if len(towers[i]) != len(towers[0]):
            raise ValueError(
                "tower{} has {} variables but tower0 has {}".format(i, len(towers[i]), len(towers[0])))
        # tower[i] <= tower[0]
        copy_ops += [tf.assign(ref, value) for ref, value in zip(towers[i], towers[0])]

    print("# of copy_ops = {}".format(len(copy_ops)))
    return tf.group(*copy_ops)

In [21]:
def performance_test(n_gpu):
    """Benchmark the NCCL variant (independent weights per GPU) on ``n_gpu`` GPUs.

    The average train/test times returned by ``train`` are stored in row
    ``n_gpu`` of the module-level ``stats_nccl`` table.
    """
    X, y = prepare_graph()

    # A single shared optimizer would share its internal variables between the
    # towers. Adam, for example, keeps beta1_power and beta2_power, which are
    # updated on every apply_gradients call. Use one optimizer per GPU instead.
    optimizers = []
    for _ in range(n_gpu):
        optimizers.append(tf.train.AdamOptimizer())

    # Build the network and the gradient ops on every GPU.
    tower_outputs, tower_grads, tower_vars = make_parallel_without_weight_sharing(
        build_graph, optimizers, n_gpu, X=X, y=y)

    # Loss / accuracy averaged over the towers.
    mean_metrics = tf.reduce_mean(tf.convert_to_tensor(tower_outputs), axis=0)
    loss, accuracy = mean_metrics[0], mean_metrics[1]

    # Combine the gradients scattered over the GPUs (reduce-average via NCCL),
    # then apply the averaged gradients on every GPU.
    averaged_grads = reduce_average_gradients(tower_grads)
    train_op = apply_gradients(averaged_grads, tower_vars, optimizers, n_gpu)

    # Op that makes all towers start from the same weights.
    sync_op = synchronize_gpus(n_gpu)

    timings = train(X, y, train_op, loss, accuracy, n_epoch,
                    train_batch_size, test_batch_size, sync_op=sync_op)

    # Record the timings.
    stats_nccl.loc[n_gpu] = list(timings)

In [22]:
# Timing table for the NCCL runs, indexed by the number of GPUs.
stats_nccl = pd.DataFrame(columns=["#GPUs", "train", "test"]).set_index("#GPUs")

In [23]:
# NCCL-variant benchmark on 1 GPU; this row is the baseline of the stats_nccl table.
performance_test(1)

# of copy_ops = 0
[ 1/10] (train) acc: 81.12%, loss: 0.629 | (test) acc: 97.41%, loss: 0.084 | 7.48s, 0.47s
[ 2/10] (train) acc: 97.85%, loss: 0.069 | (test) acc: 98.67%, loss: 0.044 | 7.31s, 0.41s
[ 3/10] (train) acc: 98.64%, loss: 0.045 | (test) acc: 98.86%, loss: 0.033 | 7.38s, 0.42s
[ 4/10] (train) acc: 99.06%, loss: 0.030 | (test) acc: 99.11%, loss: 0.030 | 7.39s, 0.41s
[ 5/10] (train) acc: 99.21%, loss: 0.024 | (test) acc: 99.10%, loss: 0.030 | 7.34s, 0.41s
[ 6/10] (train) acc: 99.38%, loss: 0.020 | (test) acc: 98.90%, loss: 0.035 | 7.33s, 0.43s
[ 7/10] (train) acc: 99.54%, loss: 0.015 | (test) acc: 99.18%, loss: 0.025 | 7.37s, 0.41s
[ 8/10] (train) acc: 99.63%, loss: 0.012 | (test) acc: 99.12%, loss: 0.032 | 7.34s, 0.41s
[ 9/10] (train) acc: 99.67%, loss: 0.010 | (test) acc: 99.21%, loss: 0.025 | 7.33s, 0.41s
[10/10] (train) acc: 99.76%, loss: 0.008 | (test) acc: 99.12%, loss: 0.033 | 7.34s, 0.41s
Average time	 TRAIN 7.348	 TEST 0.415


In [24]:
# NCCL-variant benchmark on 2 GPUs.
performance_test(2)

# of copy_ops = 42
[ 1/10] (train) acc: 80.57%, loss: 0.634 | (test) acc: 97.32%, loss: 0.085 | 4.18s, 0.29s
[ 2/10] (train) acc: 97.72%, loss: 0.073 | (test) acc: 98.36%, loss: 0.048 | 3.88s, 0.22s
[ 3/10] (train) acc: 98.74%, loss: 0.041 | (test) acc: 98.98%, loss: 0.032 | 3.87s, 0.23s
[ 4/10] (train) acc: 99.06%, loss: 0.031 | (test) acc: 98.99%, loss: 0.031 | 3.87s, 0.23s
[ 5/10] (train) acc: 99.17%, loss: 0.027 | (test) acc: 98.98%, loss: 0.030 | 3.87s, 0.23s
[ 6/10] (train) acc: 99.36%, loss: 0.020 | (test) acc: 99.12%, loss: 0.026 | 3.87s, 0.23s
[ 7/10] (train) acc: 99.56%, loss: 0.014 | (test) acc: 98.97%, loss: 0.034 | 3.87s, 0.23s
[ 8/10] (train) acc: 99.62%, loss: 0.013 | (test) acc: 99.15%, loss: 0.024 | 3.88s, 0.23s
[ 9/10] (train) acc: 99.64%, loss: 0.011 | (test) acc: 99.06%, loss: 0.032 | 3.90s, 0.23s
[10/10] (train) acc: 99.67%, loss: 0.010 | (test) acc: 99.27%, loss: 0.023 | 3.87s, 0.23s
Average time	 TRAIN 3.877	 TEST 0.229


In [25]:
# NCCL-variant benchmark on 4 GPUs.
performance_test(4)

# of copy_ops = 126
[ 1/10] (train) acc: 81.52%, loss: 0.591 | (test) acc: 97.59%, loss: 0.081 | 2.71s, 0.26s
[ 2/10] (train) acc: 97.93%, loss: 0.068 | (test) acc: 98.82%, loss: 0.039 | 2.21s, 0.15s
[ 3/10] (train) acc: 98.77%, loss: 0.042 | (test) acc: 98.71%, loss: 0.040 | 2.21s, 0.14s
[ 4/10] (train) acc: 99.04%, loss: 0.030 | (test) acc: 98.88%, loss: 0.032 | 2.20s, 0.13s
[ 5/10] (train) acc: 99.26%, loss: 0.024 | (test) acc: 99.07%, loss: 0.028 | 2.19s, 0.14s
[ 6/10] (train) acc: 99.33%, loss: 0.021 | (test) acc: 98.78%, loss: 0.038 | 2.20s, 0.15s
[ 7/10] (train) acc: 99.45%, loss: 0.017 | (test) acc: 99.33%, loss: 0.023 | 2.21s, 0.14s
[ 8/10] (train) acc: 99.60%, loss: 0.013 | (test) acc: 98.76%, loss: 0.036 | 2.23s, 0.15s
[ 9/10] (train) acc: 99.71%, loss: 0.009 | (test) acc: 99.12%, loss: 0.031 | 2.21s, 0.14s
[10/10] (train) acc: 99.70%, loss: 0.010 | (test) acc: 99.07%, loss: 0.033 | 2.21s, 0.15s
Average time	 TRAIN 2.208	 TEST 0.143


In [26]:
# NCCL-variant benchmark on 8 GPUs.
performance_test(8)

# of copy_ops = 294
[ 1/10] (train) acc: 81.01%, loss: 0.611 | (test) acc: 97.20%, loss: 0.086 | 2.47s, 0.32s
[ 2/10] (train) acc: 97.76%, loss: 0.071 | (test) acc: 98.46%, loss: 0.049 | 1.46s, 0.10s
[ 3/10] (train) acc: 98.75%, loss: 0.041 | (test) acc: 98.77%, loss: 0.034 | 1.47s, 0.09s
[ 4/10] (train) acc: 99.13%, loss: 0.029 | (test) acc: 98.94%, loss: 0.034 | 1.46s, 0.10s
[ 5/10] (train) acc: 99.24%, loss: 0.024 | (test) acc: 99.03%, loss: 0.027 | 1.44s, 0.09s
[ 6/10] (train) acc: 99.40%, loss: 0.019 | (test) acc: 99.07%, loss: 0.032 | 1.40s, 0.10s
[ 7/10] (train) acc: 99.61%, loss: 0.013 | (test) acc: 99.29%, loss: 0.023 | 1.44s, 0.08s
[ 8/10] (train) acc: 99.59%, loss: 0.013 | (test) acc: 98.60%, loss: 0.046 | 1.46s, 0.10s
[ 9/10] (train) acc: 99.58%, loss: 0.013 | (test) acc: 99.10%, loss: 0.033 | 1.46s, 0.09s
[10/10] (train) acc: 99.71%, loss: 0.009 | (test) acc: 99.00%, loss: 0.034 | 1.45s, 0.10s
Average time	 TRAIN 1.448	 TEST 0.095


In [27]:
# Shared-weight timings again, for side-by-side comparison with the NCCL table.
df_print(stats)

Unnamed: 0_level_0,train,test,train %,test %
#GPUs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,7.234,0.412,100%,100%
2,3.902,0.231,185%,178%
4,2.311,0.242,313%,170%
8,2.173,0.162,333%,254%


In [28]:
# NCCL timings with speed-ups relative to the 1-GPU NCCL row.
df_print(stats_nccl)

Unnamed: 0_level_0,train,test,train %,test %
#GPUs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,7.348,0.415,100%,100%
2,3.877,0.229,190%,181%
4,2.208,0.143,333%,290%
8,1.448,0.095,507%,435%


## Performance analysis

- 유의미한 성능개선
- 8 GPU 인데 왜 800% 가 아니라 여전히 507% 에 그치는가?
    1. 네트워크 크기
    2. input data split
    3. feed_dict
- 위 세가지 수정해서 변화를 봐야 함
    1. 네트워크 크기를 늘리면 유의미한 성능향상이 관측됨
- 기타 개선가능사항
    - gradient average 를 안 하고 LR 을 /n_gpu 해도 됨