# Verify NVIDIA/CUDA device

In [None]:
smi = !which nvidia-smi
if(smi):
    !nvidia-smi
else:
    print('No NVIDIA device found. Are you sure you are on a GPU node?')

## Verify device with PyTorch

In [None]:
import torch

# Report what PyTorch can see: its version, and every visible CUDA device.
print('Torch version:', torch.__version__)
use_cuda = torch.cuda.is_available()
if use_cuda:
    device_count = torch.cuda.device_count()
    print('Number of CUDA Devices:', device_count)
    # Describe every device rather than only device 0, so multi-GPU nodes
    # are reported completely.
    for idx in range(device_count):
        props = torch.cuda.get_device_properties(idx)
        print('CUDA Device %d Name:' % idx, props.name)
        print('CUDA Device %d Total Memory [GB]:' % idx, props.total_memory/1e9)
    print('CUDA version:', torch.version.cuda)
else:
    print('No CUDA-GPU available from Torch perspective.')

## Verify device with TensorFlow

In [None]:
import tensorflow as tf
from tensorflow.python.client import device_lib

# Report the TensorFlow build and the devices it can see.
print("TF version:", tf.__version__)
# Use the public tf.config API to enumerate devices. Unlike
# device_lib.list_local_devices(), it does not initialise the GPUs, so it does
# not reserve GPU memory that the later cells of this notebook need.
print("TF detected devices:", tf.config.list_physical_devices())
print("Is TF built with CUDA?", tf.test.is_built_with_cuda())
print("List GPUs:", tf.config.list_physical_devices('GPU'))

# Testing PyTorch

This section only tests memory allocation on the host and on the device. Feel free to add more tests!

In [None]:
import torch

# Two small integer tensors; both are created on the host (CPU) by default.
t1 = torch.tensor([
    [1,2],
    [3,4]
])

t2 = torch.tensor([
    [5,6],
    [7,8]
])

print("Tensors are allocated on: ", t1.device, " and ", t2.device)

# Only move t2 to the GPU when one is actually usable; an unconditional
# .to('cuda') raises on a node without a CUDA device.
if torch.cuda.is_available():
    t2 = t2.to('cuda')
    print("Tensors are allocated on: ", t1.device, " and ", t2.device)
else:
    print('No CUDA-GPU available from Torch perspective; skipping the device transfer.')

# Testing TensorFlow

Taken from here: https://gist.github.com/j-min/baae1aa56e861cab9831b3722755ae6d

## Matrix operations

In [None]:
import numpy as np
from time import perf_counter
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

# define matrix operation A^n + B^n
def matpow(M, n):
    """
    Build the op for the matrix power M^n.

    M is multiplied by itself n-1 times through recursive tf.matmul calls.

    Args:
        M: square matrix (tensor or placeholder) accepted by tf.matmul.
        n: integer exponent. Values <= 1 return M unchanged.

    Returns:
        M itself when n <= 1, otherwise the tf.matmul result for M^n.
    """
    # Base case is n <= 1 (not n < 1): M^1 is M. Stopping one level later
    # would add an extra multiplication and yield M^(n+1).
    if n <= 1:
        return M
    return tf.matmul(M, matpow(M, n - 1))
    
# Benchmark inputs: matrix dimension, exponent, and two random
# size x size float32 matrices (A is drawn first, then B).
size = 1000
n = 10
A = np.random.rand(size, size).astype(np.float32)
B = np.random.rand(size, size).astype(np.float32)

### GPU multiplication

In [None]:
c1 = []
c2 = []

# Build both matrix powers on the first GPU.
with tf.device('/gpu:0'):
    a = tf.placeholder(tf.float32, [size, size])
    b = tf.placeholder(tf.float32, [size, size])
    c1.append(matpow(a, n))
    c2.append(matpow(b, n))

# Add the results on the CPU. Both lists must be included: summing only c1
# would leave the B branch out of the fetched op, so it would never run.
# The result is not called `sum`, to avoid shadowing the Python builtin.
with tf.device('/cpu:0'):
    mat_sum = tf.add_n(c1 + c2)

# Let TensorFlow grow its GPU allocation on demand instead of reserving
# (nearly) all device memory, which would starve the later cells that use
# the GPU from the same kernel process.
session_config = tf.ConfigProto(log_device_placement=True)
session_config.gpu_options.allow_growth = True

# Timing covers session creation as well as the run itself.
start_time = perf_counter()
with tf.Session(config=session_config) as sess:
    # Run the op.
    sess.run(mat_sum, {a:A, b:B})
stop_time = perf_counter()

print('GPU time: %g s' % (stop_time-start_time))

### CPU multiplication

In [None]:
# This cell is intentionally left commented out.
# With "Run All" it would block the notebook for ~5 minutes,
# which is pointless because this notebook is meant to test GPU usage, not the CPU.
# Uncomment it if you really want to see the CPU timing.

#c1 = []
#c2 = []
#with tf.device('/cpu:0'):
#    a = tf.placeholder(tf.float32, [size, size])
#    b = tf.placeholder(tf.float32, [size, size])
#    c1.append(matpow(a, n))
#    c2.append(matpow(b, n))
#with tf.device('/cpu:0'):
#  sum = tf.add_n(c1)
#start_time = perf_counter()
#with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as sess:
#    sess.run(sum, {a:A, b:B})
#stop_time = perf_counter()
#print('CPU time: %g s' % (stop_time-start_time))

# Testing Numba 

Taken from here: https://github.com/keipertk/pygpu-workshop

In [None]:
import numpy as np
from time import perf_counter
from numba import vectorize

In [None]:
@vectorize(['float32(float32, float32)'], target='cuda')
def add_vec(v1, v2):
    """Element-wise float32 addition, compiled by Numba for the CUDA target."""
    return v1 + v2


# Number of elements per input vector (2**22).
N=1<<22

a = np.ones(N, dtype=np.float32)
b = np.ones(N, dtype=np.float32)

# Warm-up call on a tiny slice so that any one-time start-up cost of the
# first launch is not charged to the measurement below.
add_vec(a[:1], b[:1])

# No output buffer is pre-allocated: add_vec returns a fresh array.
start_time = perf_counter()
c = add_vec(a,b)
stop_time = perf_counter()

print(c)
print('Elapsed time with target CUDA: %g s' % (stop_time-start_time))

# Drop the large arrays so they do not linger in the kernel's memory.
del a, b, c

In [None]:
@vectorize(['float32(float32, float32)'], target='parallel')
def add_vec(v1, v2):
    """Element-wise float32 addition, compiled by Numba for the parallel CPU target."""
    return v1 + v2


# Number of elements per input vector (2**22).
N=1<<22

a = np.ones(N, dtype=np.float32)
b = np.ones(N, dtype=np.float32)

# Warm-up call on a tiny slice so that any one-time start-up cost of the
# first launch is not charged to the measurement below.
add_vec(a[:1], b[:1])

# No output buffer is pre-allocated: add_vec returns a fresh array.
start_time = perf_counter()
c = add_vec(a,b)
stop_time = perf_counter()

print(c)
print('Elapsed time with parallel: %g s' % (stop_time-start_time))

# Drop the large arrays so they do not linger in the kernel's memory.
del a, b, c

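Note that the CUDA target does not necessarily give the best performance: the input arrays live on the host, so they must be copied to the device and the result copied back over the PCIe bus, and that transfer is part of the measured time.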

# Testing CuPy

Taken from: https://www.geeksforgeeks.org/python-cupy/

In [None]:
import cupy as cp
import numpy as np
import time

In [None]:
# NumPy and CPU Runtime
# perf_counter is used instead of time.time(): it is the clock meant for
# measuring short intervals.
s = time.perf_counter()
x_cpu = np.ones((1000, 1000, 10))
e = time.perf_counter()
print("Time consumed by numpy: ", e - s)

# CuPy and GPU Runtime
s = time.perf_counter()
x_gpu = cp.ones((1000, 1000, 10))
# CuPy runs GPU work asynchronously: wait for the stream to finish before
# stopping the clock, otherwise only the launch time is measured.
cp.cuda.Stream.null.synchronize()
e = time.perf_counter()
print("\nTime consumed by cupy: ", e - s)