## GPU data parallelism
* Worker = is a whole computer with CPU and all (node)
* device or accelerator is a GPU or TPU
* XLA_GPU = Accelerated Linear Algebra

In [2]:
import tensorflow as tf
import multiprocessing as mp
import os, sys
import inspect

currentdir = os.path.dirname(
    os.path.abspath(inspect.getfile(inspect.currentframe())))
#parentdir = os.path.dirname(currentdir)
sys.path.insert(0, currentdir)
%load_ext autoreload
%autoreload 2
import gpu_wit as gw

In [3]:
# List devices
# If this cell is run the next cell won't work properly
gpus = tf.config.list_physical_devices('XLA_GPU')[0]
print(gpus)
tf.config.list_logical_devices('GPU')

PhysicalDevice(name='/physical_device:XLA_GPU:0', device_type='XLA_GPU')


[LogicalDevice(name='/device:GPU:0', device_type='GPU')]

In [2]:
gpus = tf.config.list_physical_devices('GPU')
tf.debugging.set_log_device_placement(True)
if gpus:
  # Create 2 virtual GPUs with 1GB memory each
  try:
    tf.config.set_logical_device_configuration(
        gpus[0],
        [tf.config.LogicalDeviceConfiguration(memory_limit=1024),
         tf.config.LogicalDeviceConfiguration(memory_limit=1024)])
    logical_gpus = tf.config.list_logical_devices('GPU')
    print(len(gpus), "Physical GPU,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Virtual devices must be set before GPUs have been initialized
    print(e)

1 Physical GPU, 2 Logical GPUs


In [8]:
tf.debugging.set_log_device_placement(True)
#with tf.device('/device:GPU:1'):
with tf.device('/gpu:0'):
    a = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
    b = tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
    c = tf.matmul(a, b)
print(c)

tf.Tensor(
[[22. 28.]
 [49. 64.]], shape=(2, 2), dtype=float32)


In [None]:
gw.para_exec()

In [3]:
def mat_mul(gpu):
    with tf.device(gpu):
        a = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
        b = tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
        c = tf.matmul(a, b)
mat_mul('/gpu:1')

Executing op MatMul in device /job:localhost/replica:0/task:0/device:GPU:1


In [None]:
# Does not end
gpu_lst = [f"/gpu:{k}" for k in range(2)]
with mp.Pool(2) as pool:
    print(pool.map(mat_mul, gpu_lst))

In [12]:
def mp_wrap():
    gpu_lst = [f"/gpu:{k}" for k in range(2)]
    with mp.Pool(2) as pool:
        result = pool.apply_async(mat_mul, ('/gpu:0',)) 
        print('ya salio de aqui.')
        print(result.get(timeout=10))

mp_wrap()

ya salio de aqui.


TimeoutError: 

In [None]:
tf.config.list_logical_devices('GPU')

In [19]:
with tf.device('/device:GPU:0'):       # Run nodes with GPU 0
#with tf.device(gpu.name):       # Run nodes with GPU 0
    m1 = tf.constant([[3, 5]])
    m2 = tf.constant([[2],[4]])
    product = tf.matmul(m1, m2)
product

<tf.Tensor: shape=(1, 1), dtype=int32, numpy=array([[26]], dtype=int32)>

In [7]:
# This works for parallel training
# I couldn't find an easy working example of inference on multiple GPUs.
mirror = tf.distribute.MirroredStrategy()
mirror.scope()

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)


<tensorflow.python.distribute.distribute_lib._CurrentDistributionContext at 0x7f6858572070>

In [None]:
# Another way of listing GPU
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

In [1]:
isinstance('hola', str)

True