# CPUとGPUの速度比較

In [1]:
# GPU確認 (例:Tesla K80,11441MiB)
!nvidia-smi

Thu Oct 11 05:29:07 2018       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 390.46                 Driver Version: 390.46                    |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla K80           On   | 00009349:00:00.0 Off |                  Off |
| N/A   54C    P0    57W / 149W |   7621MiB / 12206MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

In [2]:
# CPU確認 (例: CPU(s):6,Core(s) per socket:6,Model name:Intel(R) Xeon(R) CPU E5-2690 v3 @ 2.60GHz)
!lscpu

Architecture:          x86_64
CPU op-mode(s):        32-bit, 64-bit
Byte Order:            Little Endian
CPU(s):                6
On-line CPU(s) list:   0-5
Thread(s) per core:    1
Core(s) per socket:    6
Socket(s):             1
NUMA node(s):          1
Vendor ID:             GenuineIntel
CPU family:            6
Model:                 63
Model name:            Intel(R) Xeon(R) CPU E5-2690 v3 @ 2.60GHz
Stepping:              2
CPU MHz:               2596.994
BogoMIPS:              5193.98
Hypervisor vendor:     Microsoft
Virtualization type:   full
L1d cache:             32K
L1i cache:             32K
L2 cache:              256K
L3 cache:              30720K
NUMA node0 CPU(s):     0-5
Flags:                 fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology cpuid pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c r

In [3]:
%matplotlib inline
import numpy as np

import chainer
import chainer.links as L
import chainer.functions as F
from chainer import training
from chainer.training import extensions
from chainer.datasets import get_mnist
train, test = get_mnist(ndim=3)

  from ._conv import register_converters as _register_converters


In [4]:
class CNN(chainer.Chain):

    def __init__(self, n_mid=100, n_out=10):
        super().__init__()
        with self.init_scope():
            self.conv1 = L.Convolution2D(in_channels=1, out_channels=3, ksize=3, stride=1, pad=1)
            self.fc1 = L.Linear(None, n_mid)
            self.fc2 = L.Linear(None, n_out)

    def __call__(self, x):
        h = F.relu(self.conv1(x))
        h = F.max_pooling_2d(h, 3, 3)
        h = self.fc1(h)
        h = self.fc2(h)
        return h

In [5]:
import random

def reset_seed(seed=0):
    random.seed(seed)
    np.random.seed(seed)
    if chainer.cuda.available:
        chainer.cuda.cupy.random.seed(seed)

In [6]:
reset_seed(0)
model = L.Classifier(CNN())

optimizer = chainer.optimizers.Adam()
optimizer.setup(model)

<chainer.optimizers.adam.Adam at 0x7ff3a14c2748>

In [7]:
epoch = 10
batchsize = 4096

In [9]:
%%time

# CPUで実行 (gpu_id =0)
gpu_id = 0

train_iter = chainer.iterators.SerialIterator(train, batchsize)
test_iter = chainer.iterators.SerialIterator(test, batchsize, repeat=False, shuffle=False)



updater = training.StandardUpdater(train_iter, optimizer, device=gpu_id)
trainer = training.Trainer(updater, (epoch, 'epoch'), out='chainer_log/cpu_gpu/')
trainer.extend(extensions.Evaluator(test_iter, model, device=gpu_id))
trainer.extend(extensions.LogReport(trigger=(1, 'epoch')))
trainer.extend(extensions.PrintReport(['epoch', 'main/accuracy', 'validation/main/accuracy', 'main/loss', 'validation/main/loss', 'elapsed_time']), trigger=(1, 'epoch'))

trainer.run()

epoch       main/accuracy  validation/main/accuracy  main/loss   validation/main/loss  elapsed_time
[J1           0.908073       0.917768                  0.310181    0.276991              0.61195       
[J2           0.911344       0.922069                  0.298212    0.266436              1.10112       
[J3           0.913975       0.921907                  0.290048    0.2603                1.54908       
[J4           0.915918       0.925356                  0.282858    0.253641              2.02476       
[J5           0.918327       0.927163                  0.276306    0.248119              2.50212       
[J6           0.91966        0.928058                  0.272294    0.24308               2.95065       
[J7           0.921501       0.929528                  0.265435    0.238806              3.43077       
[J8           0.921989       0.931118                  0.262856    0.233969              3.90934       
[J9           0.923584       0.932425                  0.25

In [10]:
%%time

# CPUで実行 (gpu_id =-1)
gpu_id =-1

reset_seed(0)
model = L.Classifier(CNN())

optimizer = chainer.optimizers.Adam()
optimizer.setup(model)

train_iter = chainer.iterators.SerialIterator(train, batchsize)
test_iter = chainer.iterators.SerialIterator(test, batchsize, repeat=False, shuffle=False)

updater = training.StandardUpdater(train_iter, optimizer, device=gpu_id)

trainer = training.Trainer(updater, (epoch, 'epoch'), out='chainer_log/cpu_gpu/')
trainer.extend(extensions.Evaluator(test_iter, model, device=gpu_id))
trainer.extend(extensions.LogReport(trigger=(1, 'epoch')))
trainer.extend(extensions.PrintReport(['epoch', 'main/accuracy', 'validation/main/accuracy', 'main/loss', 'validation/main/loss', 'elapsed_time']), trigger=(1, 'epoch'))

trainer.run()

epoch       main/accuracy  validation/main/accuracy  main/loss   validation/main/loss  elapsed_time
[J1           0.435368       0.734087                  1.81212     1.21593               13.7002       
[J2           0.764225       0.825155                  0.982017    0.710497              27.2574       
[J3           0.828945       0.865682                  0.650916    0.519692              39.9986       
[J4           0.863395       0.887997                  0.506171    0.424333              53.5172       
[J5           0.878337       0.900983                  0.432325    0.372082              66.9656       
[J6           0.888114       0.906815                  0.388988    0.341479              79.6434       
[J7           0.894775       0.912148                  0.360426    0.318067              93.2092       
[J8           0.901497       0.915028                  0.337094    0.301773              106.835       
[J9           0.905814       0.919161                  0.32