### We check the speed differences between single/multicore CPU and GPU

In [None]:
%matplotlib inline
import torch
import numpy as np
# Report which CUDA device this notebook will use, or warn when no GPU is present.
if not torch.cuda.is_available():
    print('This tutorial expect a GPU')
else:
    torch.cuda.set_device(0)  # make device 0 the current device
    num_gpus    = torch.cuda.device_count()
    # current_device() yields the index of the selected device; device_count()
    # would report how many GPUs exist, not which one is active.
    current_gpu = torch.cuda.current_device()
    # print(...) with a single pre-formatted string behaves the same on Python 2 and 3
    print("Current device index: {}. Total number of devices: {}".format(current_gpu, num_gpus))
    print(torch.cuda.get_device_name(0))

### Synchronize
CUDA operations are performed asynchronously, i.e. the Python code may continue before the operation has finished. Calling torch.cuda.synchronize() forces the program flow to wait for completion.

In [None]:
# The simple Jupyter %timeit line magic works here, but it adapts the number of
# test loops to the apparent speed of the statement and may pick too many loops.
# Without torch.cuda.synchronize() the operation looks quicker than it is,
# so the GPU statement below synchronizes as part of the timed code.
# This example may therefore run for a few minutes.
size=8192
A=torch.randn(size,size)
B=torch.randn(size,size)
# Time the matrix product on the CPU tensors
%timeit A.mm(B)

# Move both operands to the GPU before timing the same product there
A=A.cuda()
B=B.cuda()
# synchronize() makes the timed statement wait until the product has finished
%timeit A.mm(B);torch.cuda.synchronize()

In [None]:
%%timeit
# Timing a whole cell with %%timeit behaves similarly to the %timeit line magic.
# A and B are not created in this cell; they must already exist in the kernel.
A.mm(B)
torch.cuda.synchronize()
# Recorded results for this cell:
# w/o torch.cuda.synchronize(): 1000 loops, best of 3: 121 ms per loop
# with torch.cuda.synchronize():   1 loop, best of 3: 121 ms per loop

## If you run this on a multicore machine you can benefit from multi-threaded MKL

E.g. on a machine with 2 Intel Xeon E5-2600v4 CPUs, i.e. 20 cores:

* numpy

 * 1 thread</br>
 Dotted two 4096x4096 matrices in 3.45 s.</br>
 Dotted two vectors of length 524288 in 0.23 ms.</br>
 SVD of a 2048x1024 matrix in 1.49 s.</br>
 Eigendecomposition of a 2048x2048 matrix in 8.95 s.</br>

 * 40 threads</br>
 Dotted two 4096x4096 matrices in 0.29 s.</br>
 Dotted two vectors of length 524288 in 0.03 ms.</br>
 SVD of a 2048x1024 matrix in 0.42 s.</br>
 Eigendecomposition of a 2048x2048 matrix in 3.93 s.</br>

* torch - cpu
 * 1 thread</br>
 Dotted two 4096x4096 matrices in 1.74 s.</br>
 Dotted two vectors of length 524288 in 0.12 ms.</br>
 SVD of a 2048x1024 matrix in 2.81 s.</br>
 Eigendecomposition of a 2048x2048 matrix in 2.15 s.</br>
 
 * 40 threads</br>
 Dotted two 4096x4096 matrices in 0.13 s.</br>
 Dotted two vectors of length 524288 in 0.01 ms.</br>
 SVD of a 2048x1024 matrix in 0.81 s.</br>
 Eigendecomposition of a 2048x2048 matrix in 1.49 s.</br>

* torch - cuda float32
 * Tesla P100-PCIE-16GB</br>
 Dotted two 4096x4096 matrices in 0.02 s.</br>
 Dotted two vectors of length 524288 in 0.05 ms.</br>
 SVD of a 2048x1024 matrix in 0.32 s.</br>
 Eigendecomposition of a 2048x2048 matrix in 1.40 s.</br></br>
 Dotted two 8192x8192 matrices in 0.12 s.</br>
 Dotted two vectors of length 1048576 in 0.06 ms.</br>
 SVD of a 4096x2048 matrix in 1.40 s.</br>
 Eigendecomposition of a 4096x4096 matrix in 6.86 s.</br>
 * torch - cuda float64</br>
 Dotted two 8192x8192 matrices in 0.24 s.</br>
 Dotted two vectors of length 1048576 in 0.07 ms.</br>
 SVD of a 4096x2048 matrix in 2.36 s.</br>
 Eigendecomposition of a 4096x4096 matrix in 12.61 s.</br>

Get the same numbers for your AWS machine

In [None]:
# basic linear algebra operation with numpy
import mkl
mkl.set_num_threads(1)
#mkl.set_num_threads(40)  # run this if you have more cores available for yourself
import numpy as np
from time import time

# Benchmark of basic linear algebra operations with numpy.
# Each section repeats one operation N times and prints the mean wall-clock
# time per call. All output goes through print(...) with a single
# pre-formatted string, which behaves the same on Python 2 and Python 3.

# Let's take the randomness out of random numbers (for reproducibility)
np.random.seed(0)

size = 4096
# Derived dimensions, computed once with integer division so the array shapes
# and the printed messages always agree.
half, quarter = size // 2, size // 4
vec_len = size * 128

# Operands are drawn in a fixed order so the seeded values stay the same.
A, B = np.random.random((size, size)), np.random.random((size, size))
C, D = np.random.random((vec_len,)), np.random.random((vec_len,))
E = np.random.random((half, quarter))
F = np.random.random((half, half))
F = np.dot(F, F.T)  # F * F^T is symmetric, the form the Cholesky step needs
G = np.random.random((half, half))

# Matrix multiplication
N = 20
t = time()
for _ in range(N):
    np.dot(A, B)
delta = time() - t
print('Dotted two %dx%d matrices in %0.2f s.' % (size, size, delta / N))
del A, B  # release the large operands before the next section

# Vector multiplication
N = 5000
t = time()
for _ in range(N):
    np.dot(C, D)
delta = time() - t
print('Dotted two vectors of length %d in %0.2f ms.' % (vec_len, 1e3 * delta / N))
del C, D

# Singular Value Decomposition (SVD)
N = 3
t = time()
for _ in range(N):
    np.linalg.svd(E, full_matrices=True)
delta = time() - t
print("SVD of a %dx%d matrix in %0.2f s." % (half, quarter, delta / N))
del E

# Cholesky Decomposition
N = 3
t = time()
for _ in range(N):
    np.linalg.cholesky(F)
delta = time() - t
print("Cholesky decomposition of a %dx%d matrix in %0.2f s." % (half, half, delta / N))

# Eigendecomposition
N = 3  # set explicitly rather than inheriting the value from the section above
t = time()
for _ in range(N):
    np.linalg.eig(G)
delta = time() - t
print("Eigendecomposition of a %dx%d matrix in %0.2f s." % (half, half, delta / N))

In [None]:
# basic linear algebra operation with pytorch CPU
import mkl
mkl.set_num_threads(1)
#mkl.set_num_threads(40)
import torch
from time import time

# Benchmark of basic linear algebra operations with PyTorch on the CPU.
# Each section repeats one operation N times and prints the mean wall-clock
# time per call. All output goes through print(...) with a single
# pre-formatted string, which behaves the same on Python 2 and Python 3.

# Let's take the randomness out of random numbers (for reproducibility)
torch.manual_seed(0)

size = 4096
# Derived dimensions, computed once with integer division so the tensor shapes
# and the printed messages always agree.
half, quarter = size // 2, size // 4
vec_len = size * 128

# Operands are drawn in a fixed order so the seeded values stay the same.
A, B = torch.rand((size, size)), torch.rand((size, size))
C, D = torch.rand((vec_len,)), torch.rand((vec_len,))
E = torch.rand((half, quarter))
# F is prepared as F * F^T but is not benchmarked in this cell; it is still
# drawn so that G receives the same random values as before.
F = torch.rand((half, half))
F = torch.mm(F, F.t())
G = torch.rand((half, half))

# Matrix multiplication
N = 20
t = time()
for _ in range(N):
    torch.mm(A, B)
delta = time() - t
print('Dotted two %dx%d matrices in %0.2f s.' % (size, size, delta / N))
del A, B  # release the large operands before the next section

# Vector multiplication
N = 5000
t = time()
for _ in range(N):
    torch.dot(C, D)
delta = time() - t
print('Dotted two vectors of length %d in %0.2f ms.' % (vec_len, 1e3 * delta / N))
del C, D

# Singular Value Decomposition (SVD)
N = 3
t = time()
for _ in range(N):
    torch.svd(E)
delta = time() - t
print("SVD of a %dx%d matrix in %0.2f s." % (half, quarter, delta / N))
del E


# Eigendecomposition
N = 3  # set explicitly rather than inheriting the value from the SVD section
t = time()
for _ in range(N):
    torch.eig(G)
delta = time() - t
print("Eigendecomposition of a %dx%d matrix in %0.2f s." % (half, half, delta / N))

In [None]:
# basic linear algebra operation with pytorch CUDA
import mkl
mkl.set_num_threads(1)
import torch
from time import time

# Benchmark of basic linear algebra operations with PyTorch on a CUDA device.
# Each section moves its operands to the GPU, repeats one operation N times
# (synchronizing after every call) and prints the mean wall-clock time per call.
# All output goes through print(...) with a single pre-formatted string, which
# behaves the same on Python 2 and Python 3.

# Let's take the randomness out of random numbers (for reproducibility)
torch.manual_seed(0)


dtype = torch.float32
#dtype=torch.float64
#size = 8192
size = 4096
# Derived dimensions, computed once with integer division so the tensor shapes
# and the printed messages always agree.
half, quarter = size // 2, size // 4
vec_len = size * 128

# Operands are created on the CPU in a fixed order (keeps the seeded values
# stable) and moved to the GPU right before the section that uses them.
A, B = torch.rand((size, size), dtype=dtype), torch.rand((size, size), dtype=dtype)
C, D = torch.rand((vec_len,), dtype=dtype), torch.rand((vec_len,), dtype=dtype)
E = torch.rand((half, quarter), dtype=dtype)
# F is prepared as F * F^T but is not benchmarked in this cell; it is still
# drawn so that G receives the same random values as before.
F = torch.rand((half, half), dtype=dtype)
F = torch.mm(F, F.t())
G = torch.rand((half, half), dtype=dtype)

# Select GPU 0 as the current device. A bare torch.cuda.device(0) expression
# only constructs a context-manager object and selects nothing.
torch.cuda.set_device(0)
print(torch.cuda.get_device_name(0))

A = A.cuda()
B = B.cuda()
torch.cuda.synchronize()  # make sure the transfers are done before timing starts

# Same text as the Python 2 statement `print B.is_cuda, A.is_cuda` produced
print("%s %s" % (B.is_cuda, A.is_cuda))

# Matrix multiplication
N = 20
t = time()
for _ in range(N):
    torch.mm(A, B)
    # cuda operations are asynchronous - synchronize() waits until the operation is finished
    torch.cuda.synchronize()
delta = time() - t
print('Dotted two %dx%d matrices in %0.2f s.' % (size, size, delta / N))
del A, B  # release the large operands before the next section

C = C.cuda()
D = D.cuda()
torch.cuda.synchronize()


# Vector multiplication
N = 5000
t = time()
for _ in range(N):
    torch.dot(C, D)
    torch.cuda.synchronize()
delta = time() - t
print('Dotted two vectors of length %d in %0.2f ms.' % (vec_len, 1e3 * delta / N))
del C, D

E = E.cuda()
torch.cuda.synchronize()


# Singular Value Decomposition (SVD)
N = 3
t = time()
for _ in range(N):
    torch.svd(E)
    torch.cuda.synchronize()
delta = time() - t
print("SVD of a %dx%d matrix in %0.2f s." % (half, quarter, delta / N))
del E

G = G.cuda()
torch.cuda.synchronize()

# Eigendecomposition
N = 3  # set explicitly rather than inheriting the value from the SVD section
t = time()
for _ in range(N):
    torch.eig(G)
    torch.cuda.synchronize()
delta = time() - t
print("Eigendecomposition of a %dx%d matrix in %0.2f s." % (half, half, delta / N))