# CPU/GPU Comparisons

In [1]:
import tensorflow as tf
# Enable on-demand GPU memory allocation ("memory growth") for every visible GPU
# and report how many physical/logical GPUs TensorFlow sees.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
  try:
    # Currently, memory growth needs to be the same across GPUs
    for gpu in gpus:
      tf.config.experimental.set_memory_growth(gpu, True)
    # Use the stable API, matching tf.config.list_physical_devices above.
    logical_gpus = tf.config.list_logical_devices('GPU')
  except RuntimeError as e:
    # Memory growth must be set before GPUs have been initialized
    print(e)
  else:
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    # Only claim on-demand allocation when memory growth was actually enabled.
    print("TF will attempt to allocate only as much GPU memory as needed for the runtime allocations")
else:
  print("No GPU found; memory growth was not configured")

1 Physical GPUs, 1 Logical GPUs
TF will attempt to allocate only as much GPU memory as needed for the runtime allocations


## Matrix Multiplication Example

In [4]:
import time

def measure(x, steps):
    """Time `steps` chained matrix multiplications of `x` with itself.

    Args:
        x: Tensor passed to `tf.matmul(x, x)`; each product is fed into the
            next multiplication (so `x` is assumed to be square).
        steps: Number of multiplications performed inside the timed region.

    Returns:
        Elapsed time in seconds (float) for the multiplication loop plus the
        final copy of the result to host memory.
    """
    # TensorFlow initializes a GPU the first time it's used, exclude from timing.
    tf.matmul(x, x)
    # perf_counter() is monotonic and high-resolution, so the measured interval
    # cannot be skewed (or go negative) if the system clock is adjusted.
    start = time.perf_counter()
    for _ in range(steps):
        x = tf.matmul(x, x)
    # tf.matmul can return before completing the matrix multiplication
    # (e.g., can return after enqueuing the operation on a CUDA stream).
    # The x.numpy() call below will ensure that all enqueued operations
    # have completed (and will also copy the result to host memory,
    # so we're including a little more than just the matmul operation time).
    _ = x.numpy()
    end = time.perf_counter()
    return end - start

# Benchmark configuration: size of the square matrix and number of chained
# multiplications performed per device.
shape = (200, 200)
steps = 10000
print("Time to multiply a {} matrix by itself {} times:".format(shape, steps))

# Run on CPU:
with tf.device("/cpu:0"):
    cpu_time1 = measure(tf.random.normal(shape), steps)
    print("CPU: {} secs".format(cpu_time1))

# Run on GPU, if available:
if tf.config.list_physical_devices("GPU"):
    # Device indices are zero-based: "/gpu:0" is the first (and, on a
    # single-GPU machine, the only) GPU.
    with tf.device("/gpu:0"):
        gpu_time1 = measure(tf.random.normal(shape), steps)
        print("GPU: {} secs".format(gpu_time1))
    # The speedup is only defined when a GPU timing was actually taken.
    print('GPU speedup over CPU: {}x'.format(int(cpu_time1/gpu_time1)))
else:
    print("GPU: not found")

Time to multiply a (200, 200) matrix by itself 10000 times:
CPU: 3.5832245349884033 secs
GPU: 0.9370095729827881 secs
GPU speedup over CPU: 3x


## Spatial Convolution Over Images Example
Compares the time needed to execute a [conv2d](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Conv2D) operation on the CPU versus the GPU. This code is adapted from [this TF1 gist](https://gist.github.com/pdmack/890aa355eefb92072fc1776b7e959696).

In [5]:
import timeit
# Make tf.function-decorated functions execute eagerly instead of as traced graphs.
# NOTE(review): presumably needed because conv_fn builds a new Conv2D layer on
# every call -- confirm before removing.
tf.config.run_functions_eagerly(True)

@tf.function
def conv_fn():
    """Convolve a random image batch with a fresh Conv2D layer and sum the result.

    Returns:
        The `tf.reduce_sum` of the convolution output over a random-normal
        input of shape (100, 100, 100, 3), using 32 filters with kernel size 7.
    """
    images = tf.random.normal((100, 100, 100, 3))
    conv_layer = tf.keras.layers.Conv2D(filters=32, kernel_size=7)
    return tf.reduce_sum(conv_layer(images))

# Warm up: run once outside the timed region.
conv_fn()

print("30 loops of convolve 32x7x7x3 filter over random 100x100x100x3 images:")
with tf.device("/cpu:0"):
    # .numpy() copies the scalar result to the host, which forces any
    # asynchronously dispatched device work to finish inside the timed call.
    cpu_time2 = timeit.timeit(lambda: conv_fn().numpy(), number=30)
    print("CPU:", cpu_time2)
if tf.config.list_physical_devices("GPU"):
    with tf.device("/gpu:0"):
        gpu_time2 = timeit.timeit(lambda: conv_fn().numpy(), number=30)
        print("GPU:", gpu_time2)
    # The speedup is only defined when a GPU timing was actually taken.
    print('GPU speedup over CPU: {}x'.format(int(cpu_time2/gpu_time2)))
else:
    print("GPU: not found")

30 loops of convolve 32x7x7x3 filter over random 100x100x100x3 images:
CPU: 2.4716558000000077
GPU: 0.6768585000000087
GPU speedup over CPU: 3x
