## Multi GPU training

In [None]:
import tensorflow as tf

## Listing Physical Devices

In [None]:
# Enumerate the GPUs TensorFlow can see. Later cells iterate over / index this
# list, so the variable name (spelled as-is) must stay in sync with them.
physical_devies = tf.config.list_physical_devices("GPU")

## Limiting GPU visibility 

### Set env vars:
* CUDA_DEVICE_ORDER=PCI_BUS_ID 
* CUDA_VISIBLE_DEVICES=0,1

## Limiting RAM usage per GPU with logical GPU

In [None]:
# Expose every detected GPU as a single logical device created with
# memory_limit=2048, which caps how much of that GPU TensorFlow will use.
for gpu in physical_devies:
    capped_device = tf.config.LogicalDeviceConfiguration(memory_limit=2048)
    tf.config.set_logical_device_configuration(gpu, [capped_device])

## Memory use growth

* Set env var: `TF_FORCE_GPU_ALLOW_GROWTH=true`

### or in code

In [None]:
# In-code alternative to the environment variable: switch on memory growth
# for each detected GPU, one device at a time.
for gpu in physical_devies:
    tf.config.experimental.set_memory_growth(device=gpu, enable=True)

## Splitting GPU into multiple logical GPUs for e.g. distributed training tests

In [None]:
# Carve the first physical GPU into two logical devices, each created with
# memory_limit=2048, so multi-device code can be tried on a single card.
first_slice = tf.config.LogicalDeviceConfiguration(memory_limit=2048)   # -> "/gpu:0"
second_slice = tf.config.LogicalDeviceConfiguration(memory_limit=2048)  # -> "/gpu:1"
tf.config.set_logical_device_configuration(
    physical_devies[0],
    [first_slice, second_slice],
)

## Variables placement in devices

In [None]:
# Float variable created without an explicit device; TensorFlow picks the placement.
a = tf.Variable([1., 2., 3.])

In [5]:
# Show which device the float variable ended up on.
a.device

'/job:localhost/replica:0/task:0/device:GPU:0'

## No kernel for int32 ops so var placed on CPU

In [6]:
# Same values as integer literals, with no dtype given.
b = tf.Variable([1, 2, 3])

In [7]:
# Show where the integer variable was placed.
b.device

'/job:localhost/replica:0/task:0/device:CPU:0'

## tf.int8 and tf.int16 have GPU kernels

In [23]:
# Same integers, this time with an explicit 16-bit integer dtype.
b2 = tf.Variable([1, 2, 3], dtype=tf.int16)

In [24]:
# Show where the int16 variable was placed.
b2.device

'/job:localhost/replica:0/task:0/device:GPU:0'

## Forcing variable placement with a device context

In [8]:
# Pin the variable to the CPU by creating it inside an explicit device context.
with tf.device('/cpu:0'):
    # The values must be passed as ONE list: tf.Variable's 2nd and 3rd positional
    # parameters are `trainable` and `validate_shape`, not further elements.
    c = tf.Variable([1., 2., 3.])

In [9]:
# Confirm the variable lives on the device requested by the context manager.
c.device

'/job:localhost/replica:0/task:0/device:CPU:0'

## Disabling device placement fallback with an explicit exception:

In [None]:
# Turn soft placement off so that a placement TensorFlow cannot honour is
# reported as an error instead of silently falling back to another device.
tf.config.set_soft_device_placement(False)

## Control over intra-op and inter-op thread pools

In [None]:
# Both setters take the pool size as a required argument (0 lets TensorFlow
# pick a value) and must be called before TensorFlow initializes its runtime.
# The sizes below are example values only; tune them for the host machine.
tf.config.threading.set_inter_op_parallelism_threads(2)  # pool for running independent ops concurrently
tf.config.threading.set_intra_op_parallelism_threads(4)  # pool for parallelism inside a single op

## Prefetching to device

In [None]:
# `device` is a required argument. The call returns a dataset transformation that
# is meant to be passed to `Dataset.apply(...)` as the final step of the input
# pipeline, e.g.:
#   dataset = dataset.apply(tf.data.experimental.prefetch_to_device("/gpu:0"))
tf.data.experimental.prefetch_to_device("/gpu:0")

## Model parallelism: MESH TF
* https://github.com/tensorflow/mesh

## Data parallelism - strategies operating on different minibatches

* Mirrored Strategy: Identical replicas, AllReduce for gradient mean and update sync
* Centralized parameters server and GPU workers, allows for async updates

## Bandwidth saturation limits the number of GPUs useful in parallel

## Strategy with more central parameters server can reduce server strain

## Massive parallelism: PipeDream and more recent Pathways

## Reducing data load by reducing model precision from tf.float32 to tf.float16

## Distribution strategies API

In [None]:
# Create the mirrored (synchronous data-parallel) strategy; no devices are
# listed explicitly here, so the strategy's default device selection applies.
strategy = tf.distribute.MirroredStrategy()

In [None]:
# Build and compile the model inside the strategy scope so that its variables
# are created under the strategy. The `[...]` arguments are placeholders for
# the real layer list and compile options.
with strategy.scope():
    model = tf.keras.Sequential([...])
    model.compile([...])

In [None]:
# Crucially: batch_size should be divisible
# by the number of replicas so that each replica
# receives an equally sized share of every batch
batch_size = 100

# NOTE: X_train / y_train / X_valid / y_valid are not defined in the visible
# cells; they must be prepared before this cell is run.
model.fit(X_train, y_train, epochs=10,
    validation_data=(X_valid, y_valid), batch_size=batch_size)

In [None]:
# Weights created under the strategy scope are mirrored variables, not plain ones.
type(model.weights[0]) # -> tensorflow.python.distribute.values.MirroredVariable

## Running saved model on multiple GPUs:


In [None]:
# Load the previously saved model inside the strategy scope so that it is
# restored under the distribution strategy rather than on a single device.
with strategy.scope():
    model = tf.keras.models.load_model("my_mirrored_model")

## Specifying which GPUs to use with the strategy

In [None]:
# Restrict the strategy to an explicit list of devices instead of its default selection.
strategy = tf.distribute.MirroredStrategy(devices=["/gpu:0", "/gpu:1"])

## Other reduction strategies: set *cross_device_ops* to
* tf.distribute.HierarchicalCopyAllReduce
* tf.distribute.ReductionToOneDevice
* The default (NCCL-based) option is tf.distribute.NcclAllReduce

## Data parallelism with centralized parameters
* strategy = tf.distribute.experimental.CentralStorageStrategy()

## TF Cluster: a group of TF processes working in parallel (also can be distributed)
* Each TF process in the cluster is a *task* or *TF Server*
* Each task has an IP address, a port, and a type (role/job): worker, chief, ps, or evaluator

## Cluster spec

In [None]:
# Maps each job name to the addresses of its tasks; a task's index is its
# position in that job's list (see the trailing comments).
cluster_spec = {
    "worker": [
        "machine-a.example.com:2222", # /job:worker/task:0
        "machine-b.example.com:2222"  # /job:worker/task:1
    ],
    "ps": ["machine-a.example.com:2221"] # /job:ps/task:0
}

## Data specifying the task

In [None]:
# This should be placed outside Python code so the same code
# can be used for all task servers

# Imported here because no earlier visible cell brings `os` or `json` into scope.
import json
import os

# TF_CONFIG carries the cluster layout plus this process's own job type and
# index, serialized as a JSON string.
os.environ['TF_CONFIG'] = json.dumps({
    "cluster": cluster_spec,
    "task": {"type": "worker", "index": 0}
})

## Triggering the task requires running the same script on all servers

## With *MultiWorkerMirroredStrategy* all workers must perform the same steps to ensure proper sync

In [None]:
# Template for the script that is run, unchanged, on every task of the cluster.
# `[...]` marks placeholders to be replaced with real code / arguments.
import tempfile
import tensorflow as tf

# The strategy is created first, before any other TensorFlow work in the script.
strategy = tf.distribute.MultiWorkerMirroredStrategy()
# Presumably resolves this task's type and id from TF_CONFIG -- confirm against the TF docs.
resolver = tf.distribute.cluster_resolver.TFConfigClusterResolver()

print(f"Starting task {resolver.task_type} #{resolver.task_id}")

[...] # Dataset preparation

# Model construction and compilation happen inside the strategy scope.
with strategy.scope():
    model = tf.keras.Sequential([...])
    model.compile([...])

# X_train / y_train / X_valid / y_valid come from the dataset-preparation placeholder above.
model.fit(X_train, y_train, validation_data=(X_valid, y_valid), epochs=10)

if resolver.task_id == 0: # This node is the chief, it saves the model
    model.save('my_mnist_multiworker_model', save_format='tf')
else:
    # Formal steps needed to keep all workers in sync with chief:
    # every worker performs the same save call, but into a throwaway directory.
    tmpdir = tempfile.mkdtemp()
    model.save(tmpdir, save_format='tf')
    tf.io.gfile.rmtree(tmpdir) # Nothing is kept on the workers
    
    

## Changing AllReduce strategy: TF runs heuristics to select the best algorithm but explicit choice can be made as follows:

In [None]:
# Override TensorFlow's automatic choice of collective implementation by
# requesting NCCL explicitly through the strategy's communication options.
nccl_backend = tf.distribute.experimental.CollectiveCommunication.NCCL
nccl_options = tf.distribute.experimental.CommunicationOptions(
    implementation=nccl_backend,
)
strategy = tf.distribute.MultiWorkerMirroredStrategy(
    communication_options=nccl_options,
)

## Async data parallelism is used with *ParameterServerStrategy*
* Add one or more param servers
* Configure TF_CONFIG properly for each task

## TPU strategy (run just after importing tf)

In [None]:
# Run this right after importing TensorFlow, before any other TF work.
resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
# Connect this process to the TPU cluster before initializing it, following
# the setup sequence shown in the official TPUStrategy examples.
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
# tf.distribute.TPUStrategy is the stable replacement for the deprecated
# tf.distribute.experimental.TPUStrategy.
strategy = tf.distribute.TPUStrategy(resolver)