# GPU vs. CPU Time Testing

In this notebook, we compare the speed at which the CPU and the GPU complete a matrix multiplication of the same random arrays.

In [5]:
# examples from https://www.tensorflow.org/guide/gpu

"""
If a TensorFlow operation has both CPU and GPU implementations, by default, the GPU device is prioritized when the 
operation is assigned. For example, tf.matmul has both CPU and GPU kernels and on a system with devices CPU:0 and 
GPU:0, the GPU:0 device is selected to run tf.matmul unless you explicitly request to run it on another device.

If a TensorFlow operation has no corresponding GPU implementation, then the operation falls back to the CPU device. 
For example, since tf.cast only has a CPU kernel, on a system with devices CPU:0 and GPU:0, the CPU:0 device is 
selected to run tf.cast, even if requested to run on the GPU:0 device.
"""

# Matrices had to be in float32 format (test with float64?)
# Make sure CPU time is faster than GPU time 

import os
# Set log level to 2 to supress INFO and WARNING messages
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 
import tensorflow as tf
import numpy as np

print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

tf.debugging.set_log_device_placement(True)

array_a = np.random.rand(2000,3000).astype(np.float32)
array_b = np.random.rand(3000,2000).astype(np.float32)

# Create some tensors
a = tf.constant(array_a)
b = tf.constant(array_b)
c = tf.matmul(a, b)

#a = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
#b = tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
#c = tf.matmul(a, b)

print(c)

Num GPUs Available:  1
Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op MatMul in device /job:localhost/replica:0/task:0/device:GPU:0
tf.Tensor(
[[761.879   749.5151  734.01904 ... 743.2405  739.74866 758.67645]
 [747.34094 734.46704 718.1686  ... 744.4136  739.31195 759.34924]
 [745.00195 742.61707 724.45233 ... 756.0435  750.24335 753.20526]
 ...
 [741.04034 735.8122  724.4512  ... 743.69775 745.3959  760.00024]
 [751.2343  738.1212  730.97296 ... 763.30334 757.3506  766.07104]
 [744.0852  744.8742  733.9725  ... 754.34326 752.95764 767.7954 ]], shape=(2000, 2000), dtype=float32)


In [2]:
from tensorflow.python.platform import build_info as tf_build_info
print(tf_build_info.build_info['cuda_version'])
print(tf_build_info.build_info['cudnn_version'])

11.8
8


In [3]:
%%time

# Place tensors on the CPU
with tf.device('/CPU:0'):
  #a = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
  #b = tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
  a = tf.constant(array_a)
  b = tf.constant(array_b)
  c = tf.matmul(a, b)    

# Run on the CPU
print(c)

Executing op _MklMatMul in device /job:localhost/replica:0/task:0/device:CPU:0
tf.Tensor(
[[746.32587 743.61707 749.63763 ... 739.4483  749.2776  741.8144 ]
 [744.794   737.8059  749.718   ... 734.6162  747.1352  733.5072 ]
 [754.58905 750.89606 766.2672  ... 748.67426 757.71484 749.5813 ]
 ...
 [765.20435 753.2423  763.5108  ... 747.1719  760.8779  745.8576 ]
 [748.8022  744.7076  756.3685  ... 728.6443  746.9789  738.2501 ]
 [753.1909  740.11896 753.98145 ... 737.8197  755.7354  740.23474]], shape=(2000, 2000), dtype=float32)
CPU times: user 387 ms, sys: 56.9 ms, total: 444 ms
Wall time: 113 ms


2025-04-28 18:40:50.039102: I tensorflow/core/common_runtime/placer.cc:125] a: (_Arg): /job:localhost/replica:0/task:0/device:CPU:0
2025-04-28 18:40:50.039126: I tensorflow/core/common_runtime/placer.cc:125] b: (_Arg): /job:localhost/replica:0/task:0/device:CPU:0
2025-04-28 18:40:50.039132: I tensorflow/core/common_runtime/placer.cc:125] MatMul: (MatMul): /job:localhost/replica:0/task:0/device:CPU:0
2025-04-28 18:40:50.039136: I tensorflow/core/common_runtime/placer.cc:125] product_RetVal: (_Retval): /job:localhost/replica:0/task:0/device:CPU:0
2025-04-28 18:40:50.044634: I tensorflow/core/common_runtime/placer.cc:125] a: (_Arg): /job:localhost/replica:0/task:0/device:CPU:0
2025-04-28 18:40:50.044654: I tensorflow/core/common_runtime/placer.cc:125] b: (_Arg): /job:localhost/replica:0/task:0/device:CPU:0
2025-04-28 18:40:50.044660: I tensorflow/core/common_runtime/placer.cc:125] _MklMatMul: (_MklMatMul): /job:localhost/replica:0/task:0/device:CPU:0
2025-04-28 18:40:50.044663: I tensorfl

In [4]:
%%time

# Place tensors on the GPU
with tf.device('/GPU:0'):
  #a = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
  #b = tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
  a = tf.constant(array_a)
  b = tf.constant(array_b)
  c = tf.matmul(a, b) 
    
# Run on the GPU
#for i in range(2000):
#  c = tf.matmul(a, b)
print(c)

Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op MatMul in device /job:localhost/replica:0/task:0/device:GPU:0
tf.Tensor(
[[746.32556 743.61743 749.63763 ... 739.44836 749.2778  741.8149 ]
 [744.7941  737.8058  749.71783 ... 734.61664 747.1351  733.50757]
 [754.589   750.8961  766.26733 ... 748.6742  757.71497 749.58124]
 ...
 [765.20435 753.2423  763.511   ... 747.17175 760.8783  745.8574 ]
 [748.8022  744.70776 756.3685  ... 728.6443  746.979   738.25   ]
 [753.1911  740.1188  753.9816  ... 737.8197  755.7355  740.2344 ]], shape=(2000, 2000), dtype=float32)
CPU times: user 36.3 ms, sys: 19.9 ms, total: 56.2 ms
Wall time: 54.6 ms


In [1]:
# Cut this out, rename this notebook "something something time test"

# Table this until we do more with dask bursting of GPU servers

"""
If developing on a system with a single GPU, you can simulate multiple GPUs with virtual devices. 
This enables easy testing of multi-GPU setups without requiring additional resources.
"""
import tensorflow as tf
gpus = tf.config.list_physical_devices('GPU')
print(gpus)
if gpus:
  # Create 4 virtual GPUs with 1GB memory each
  try:
    print('trying 1')
    tf.config.set_logical_device_configuration(
        gpus[0],
        [tf.config.LogicalDeviceConfiguration(memory_limit=1024),
         tf.config.LogicalDeviceConfiguration(memory_limit=1024),
         tf.config.LogicalDeviceConfiguration(memory_limit=1024),
         tf.config.LogicalDeviceConfiguration(memory_limit=1024)])
    print('trying 2')
    logical_gpus = tf.config.list_logical_devices('GPU')
    print(len(gpus), "Physical GPU,", len(logical_gpus), "Logical GPUs")
    tf.debugging.set_log_device_placement(True)
    gpus = tf.config.list_logical_devices('GPU')
    strategy = tf.distribute.MirroredStrategy(gpus)
    with strategy.scope():
        inputs = tf.keras.layers.Input(shape=(1,))
        predictions = tf.keras.layers.Dense(1)(inputs)
        model = tf.keras.models.Model(inputs=inputs, outputs=predictions)
        model.compile(loss='mse',
                    optimizer=tf.keras.optimizers.SGD(learning_rate=0.2))
    print("Success.")
  except RuntimeError as e:
    # Virtual devices must be set before GPUs have been initialized
    print("Error:",e)

2025-04-15 14:44:20.861680: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-15 14:44:20.906330: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-15 14:44:20.906348: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-15 14:44:20.906353: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-15 14:44:20.913384: I tensorflow/core/platform/cpu_feature_g

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
trying 1
trying 2
1 Physical GPU, 4 Logical GPUs
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3')


2025-04-15 14:44:23.481294: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-04-15 14:44:23.514528: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-04-15 14:44:23.514766: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

Success.


In [2]:
#tf.debugging.set_log_device_placement(True)

gpus = tf.config.list_logical_devices('GPU')
if gpus:
  # Replicate your computation on multiple GPUs
  c = []
  for gpu in gpus:
    print("Name: ",gpu.name)
    with tf.device(gpu.name):
      a = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
      b = tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
      c.append(tf.matmul(a, b))

  with tf.device('/GPU:0'):
    matmul_sum = tf.add_n(c)

  print(matmul_sum)

Name:  /device:GPU:0
Name:  /device:GPU:1
Name:  /device:GPU:2
Name:  /device:GPU:3
tf.Tensor(
[[ 88. 112.]
 [196. 256.]], shape=(2, 2), dtype=float32)
