In [1]:
import os
import torch
# Report the PyTorch build string, followed by the CUDA and cuDNN versions
# that this PyTorch build reports for itself.
print(torch.__version__)
print(torch.version.cuda)
print(torch.backends.cudnn.version())

1.10.2+cu102
10.2
7605


## GPU properties

In [2]:
# Enable a GPU first (notebook menu: Runtime -> Change Runtime Type -> choose a GPU type),
# then query the driver for the visible GPUs and their current memory usage.
! nvidia-smi

Tue Mar 15 16:02:15 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0 Off |                  N/A |
| N/A   58C    P0    N/A /  N/A |    357MiB /  4096MiB |      9%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
# Check whether PyTorch can use CUDA in this environment.
print(torch.cuda.is_available())

True


In [4]:
# Number of CUDA devices visible to PyTorch.
print(torch.cuda.device_count())

1


In [5]:
# Index of the CUDA device PyTorch currently uses by default.
print(torch.cuda.current_device())

0


In [6]:
# Allocate 10 million random values directly on the GPU, then re-check memory usage.
# NOTE(review): the first CUDA allocation presumably also creates the CUDA context, so the
# increase reported by nvidia-smi can be far larger than the tensor itself -- confirm.
a = torch.randn(10000000,device='cuda')
! nvidia-smi

Tue Mar 15 16:02:38 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0 Off |                  N/A |
| N/A   59C    P0    N/A /  N/A |   1053MiB /  4096MiB |     25%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [7]:
# Drop the only reference to the tensor, then ask PyTorch to release the cached GPU
# memory it no longer uses so that the change becomes visible in nvidia-smi.
del a 
torch.cuda.empty_cache()
! nvidia-smi

Tue Mar 15 16:02:42 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0 Off |                  N/A |
| N/A   59C    P0    N/A /  N/A |   1034MiB /  4096MiB |     14%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Initialization by torch.distributed.init_process_group()

In [19]:
import torch.distributed as dist
from torch.multiprocessing import Process


def print_rank():
    """Print a greeting that names this process's rank and the world size."""
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    print(f"Hello from process {rank} (out of {world_size})!\n")


def init_process(rank, size, fn, backend="gloo", master_addr="127.0.0.1", master_port="20951"):
    """Initialize the distributed environment, run ``fn``, then tear it down.

    Args:
        rank: Rank assigned to the calling process.
        size: Total number of processes taking part (the world size).
        fn: Zero-argument callable executed once the process group exists.
        backend: Name of the torch.distributed backend to use.
        master_addr: Address every process uses to rendezvous.
        master_port: Port every process uses to rendezvous; change it if the
            default is already taken by another run on the same machine.

    Raises:
        Whatever ``fn`` raises; the process group is destroyed first.
    """
    os.environ["MASTER_ADDR"] = master_addr
    # Environment values must be strings, so accept an int port as well.
    os.environ["MASTER_PORT"] = str(master_port)
    dist.init_process_group(backend, rank=rank, world_size=size)
    try:
        fn()
    finally:
        # Release the group's resources even when fn() fails.
        dist.destroy_process_group()


def main(fn, size=4):
    """Run ``fn`` in ``size`` separate processes, one per rank, and wait for all of them.

    Each process is started with ``init_process(rank, size, fn)`` as its target.

    Args:
        fn: Zero-argument callable handed to ``init_process`` in every worker.
        size: Number of processes to launch (ranks ``0 .. size - 1``).

    Raises:
        RuntimeError: If at least one worker process ended with a non-zero
            exit code, so that failures are not silently ignored.
    """
    processes = []
    for rank in range(size):
        p = Process(target=init_process, args=(rank, size, fn))
        p.start()
        processes.append(p)

    for p in processes:
        p.join()

    # A crashed worker only prints its traceback; surface it to the caller too.
    failed = {rank: p.exitcode for rank, p in enumerate(processes) if p.exitcode != 0}
    if failed:
        raise RuntimeError(f"Worker processes failed (rank -> exit code): {failed}")


main(print_rank, size=4)


Hello from process 0 (out of 4)!
Hello from process 2 (out of 4)!
Hello from process 3 (out of 4)!



Hello from process 1 (out of 4)!



Q1: Which method is used to launch multiple processes?  
Q2: After initialization, which functions in torch.distributed return the rank of the current process and the world size?

## Communication: broadcast

In [28]:
def broadcast():
    """Broadcast rank 0's tensor to every process in the world.

    Each process starts with a scalar tensor holding its own rank; after the
    broadcast the tensor is printed again so the received value can be seen.
    """
    rank = dist.get_rank()
    size = dist.get_world_size()
    tensor = torch.tensor(rank)
    # Build the group from the actual world size instead of a fixed [0, 1, 2, 3],
    # so the demo also runs when main() is called with a size other than 4.
    group = dist.new_group(list(range(size)))
    print(f"I am {rank} of {size} with a tensor {tensor}")

    if rank == 0:
        print("************ Starting Communication ************")
    dist.broadcast(tensor=tensor, src=0, group=group)
    print("Rank ", rank, " has data ", tensor)


main(broadcast, size=4)


I am 0 of 4 with a tensor 0I am 1 of 4 with a tensor 1I am 2 of 4 with a tensor 2I am 3 of 4 with a tensor 3



************ Starting Communication ************
Rank Rank Rank Rank      2301    has data    has data  has data  tensor(0)
tensor(0)
tensor(0) has data 
 tensor(0)


Q3: In the above code, which rank is the one who broadcasts?
<br>
Task 1: If rank 0 wants to broadcast to only a random subset of the processes, write the new code that achieves this.

In [None]:
#Answer for Task 1
import random

def broadcast_random(seed=1234, group_size=2):
    """Broadcast rank 0's tensor to a random subset of the other ranks.

    Every process derives the same subset from ``seed``, so all of them agree
    on the group without exchanging any message.

    Args:
        seed: Seed for the subset selection; must be identical on every rank.
        group_size: Number of ranks (besides rank 0) that receive the broadcast.

    Raises:
        ValueError: If ``group_size`` is negative or exceeds the number of
            ranks other than rank 0.
    """
    rank = dist.get_rank()
    size = dist.get_world_size()
    if not 0 <= group_size < size:
        raise ValueError(
            f"group_size must be between 0 and {size - 1} for a world of {size} processes, "
            f"got {group_size}"
        )
    tensor = torch.tensor(rank)
    #print(f"I am {rank} of {size} with a tensor {tensor}")

    # A private generator gives the same draw as seeding the global one,
    # without disturbing random state used elsewhere in the notebook.
    rng = random.Random(seed)
    random_group = rng.sample(list(range(1, size)), group_size)
    random_group = random_group + [0]
    if rank == 0:
        print(f"Rank 0 broadcasts to the group {random_group}")
    # new_group is called by every process, whether or not it is a member.
    group = dist.new_group(random_group)

    if rank == 0 : print("**********\nStarting Communication\n************")
    # Only members take part in the collective; the others keep their tensor.
    if rank in random_group:
        dist.broadcast(tensor=tensor, src=0, group=group)
    print('Rank ', rank, ' has data ', tensor)

main(broadcast_random, size=4)

## Communication: reduce

In [35]:
def reduce():
    """Sum every rank's tensor onto rank 0 and print each rank's tensor afterwards.

    Rank ``r`` starts with the value ``r + 1``. After the reduction rank 0
    subtracts its own starting value, so it ends up with the sum contributed
    by the other ranks only.
    """
    rank = dist.get_rank()
    size = dist.get_world_size()
    tensor = torch.tensor(rank + 1)

    if rank == 0:
        # Remember rank 0's own contribution so it can be removed from the sum.
        tensor_old = tensor.clone()

    # Build the group from the actual world size instead of a fixed [0, 1, 2, 3],
    # so the demo also runs when main() is called with a size other than 4.
    group = dist.new_group(list(range(size)))
    print(f"I am {rank} of {size} with a tensor {tensor}")

    if rank == 0:
        print("************ Starting Communication ************\n")

    dist.reduce(tensor=tensor, dst=0, op=dist.ReduceOp.SUM, group=group)

    if rank == 0:
        tensor -= tensor_old

    print(f"Rank {rank} has data {tensor.item()}")


main(reduce, size=4)


I am 0 of 4 with a tensor 1I am 3 of 4 with a tensor 4I am 2 of 4 with a tensor 3I am 1 of 4 with a tensor 2



************ Starting Communication ************
Rank 3 has data 4
Rank 2 has data 7

Rank 1 has data 9Rank 0 has data 9



Q4: What does the above code achieve?
<br>
Q5: Check the values of every rank after "reduce", try to explain the reason.
<br>

Task 2 [Server-Client communication]: Write a function that runs for 10 iterations. In each iteration:
- rank 0 broadcasts to a random subset of all the processes, 
- the processes in the subset update their states by adding one unit, 
- rank 0 gets the average of the states from the processes in the subset.

In [None]:
#Answer for Task 2

def server_client_communication(group_size=2, iterations=10):
    """Simulate server/client rounds between rank 0 and random subsets of clients.

    In every iteration:
      1. rank 0 broadcasts its state to a random subset of the other ranks,
      2. each selected client adds one to the state it received,
      3. rank 0 collects the sum of the clients' states and averages it.

    All ranks derive the same subsets from fixed seeds, so they agree on the
    group membership without communicating.

    Args:
        group_size: Number of clients (ranks other than 0) selected per iteration.
        iterations: Number of rounds to run.

    Raises:
        ValueError: If ``group_size`` is not between 1 and the number of clients.
    """
    rank = dist.get_rank()
    size = dist.get_world_size()
    if not 1 <= group_size < size:
        raise ValueError(
            f"group_size must be between 1 and {size - 1} for a world of {size} processes, "
            f"got {group_size}"
        )
    tensor = torch.tensor(float(rank))

    # Private generators reproduce the draws of the seeded global generator
    # without clobbering random state used elsewhere.
    seed_source = random.Random(0)
    seeds = [seed_source.randint(0, 10000) for _ in range(iterations)]
    clients = list(range(1, size))

    for i, sd in enumerate(seeds):
        # Step 1: pick this round's clients and broadcast the server state to them.
        random_group = random.Random(sd).sample(clients, group_size)
        random_group = random_group + [0]
        if rank == 0: print(f"Iter {i}: Rank 0 broadcasts to the group {random_group}")
        # new_group is called by every process, whether or not it is a member.
        random_group_dist = dist.new_group(random_group)
        if rank not in random_group:
            # Not selected this round: skip the collectives of a group we are not in.
            continue
        dist.broadcast(tensor=tensor, src=0, group=random_group_dist)

        # Step 2: selected clients update their state.
        if rank != 0:
            tensor += 1

        # Step 3: rank 0 gathers the sum and removes its own contribution.
        # NOTE(review): the reduce output shown earlier in this notebook suggests that
        # non-destination ranks may be left with partial sums; that is harmless here
        # because a client's state is overwritten by the next broadcast it joins.
        if rank == 0: tensor_old = tensor.clone()
        dist.reduce(tensor=tensor, dst=0, op=dist.ReduceOp.SUM, group=random_group_dist)
        if rank == 0:
            tensor -= tensor_old
            tensor = tensor/group_size

    if rank == 0: print(f"The final value of Rank {0} is {tensor}")

main(server_client_communication, size=4)

## Communication: send and receive

In [None]:
def send_receive():
    """Demonstrate point-to-point messaging: rank 1 sends to rank 0, rank 3 sends to rank 2.

    Every rank starts with a scalar tensor holding ``rank + 1`` and prints its
    tensor value once its send or receive has completed.
    """
    me = dist.get_rank()
    world = dist.get_world_size()
    payload = torch.tensor(me + 1)
    print(f"I am {me} of {world} with a tensor {payload}")

    if me == 0:
        print("**********\nStarting Communication\n************")
        dist.recv(payload, src=1)
    elif me == 1:
        dist.send(payload, dst=0)
    elif me == 2:
        # No explicit source is given for this receive.
        dist.recv(payload)
    elif me == 3:
        dist.send(payload, dst=2)

    print('Rank ', me, ' has data ', payload.item())

main(send_receive, size=4)

## torch.distributed.launch()

In [None]:
%%writefile Launch.py
import os
import torch
import torch.distributed as dist
import argparse


def parse(argv=None):
    """Parse the command-line options of this script.

    Args:
        argv: Optional list of argument strings; when ``None`` the arguments
            are taken from the command line.

    Returns:
        The parsed namespace with ``func`` (name of the demo to run) and
        ``backend`` (torch.distributed backend, ``'gloo'`` unless overridden).
    """
    parser = argparse.ArgumentParser()
    # The choices mirror the keys of the dispatch table in the __main__ block, so a
    # missing or misspelled name is reported by argparse instead of a later KeyError.
    parser.add_argument('--func', type=str, required=True,
                        choices=['print_rank', 'broadcast'],
                        help='choose the function to execute')
    parser.add_argument('--backend', type=str, default='gloo', help='choose the backend')
    args = parser.parse_args(argv)
    return args

def print_rank():
    """Print a greeting that names this process's rank and the world size."""
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    greeting = 'Hello from process {} (out of {})!'.format(rank, world_size)
    print(greeting)

def broadcast():
    """Broadcast rank 0's tensor to every process and print what each rank holds.

    The tensor lives on a GPU when one is available for this process and on
    the CPU otherwise.
    """
    rank = dist.get_rank()
    size = dist.get_world_size()
    # Environment values must be strings; the previous code assigned an int to an
    # undefined ``current_env`` and raised NameError whenever the variable was unset.
    # NOTE(review): setting this after torch is imported may not change thread pools
    # that are already initialised -- confirm whether it is still needed here.
    os.environ.setdefault("OMP_NUM_THREADS", "1")
    # GPUs are indexed per machine, so prefer the local rank and fall back to the
    # global rank when LOCAL_RANK is not set.
    local_rank = int(os.environ.get("LOCAL_RANK", rank))
    if torch.cuda.is_available() and local_rank < torch.cuda.device_count():
        device = torch.device('cuda:'+str(local_rank))
    else:
        device = torch.device('cpu')
    tensor = torch.tensor(rank, device=device)
    # Include every rank so the script works for any --nproc_per_node value.
    group = dist.new_group(list(range(size)))
    #print(f"I am {rank} of {size} with a tensor {tensor.item()}")
    if rank == 0 : print("**********\nStarting Communication\n************")
    dist.broadcast(tensor=tensor, src=0, group=group)
    print('Rank ', rank, ' has data ', tensor)


# Script entry point: parse the options, join the process group, run the selected demo.
if __name__ == '__main__':
    args = parse()
    backend = args.backend
    # LOCAL_RANK is read from the environment (the torchrun command in the notebook is
    # expected to provide it); fail with a readable message when it is missing.
    if "LOCAL_RANK" not in os.environ:
        raise SystemExit(
            "LOCAL_RANK is not set; start this script with "
            "`torchrun --nproc_per_node=N Launch.py ...`"
        )
    local_rank = int(os.environ["LOCAL_RANK"])
    # Bind this process to its own GPU when the machine has one for it.
    if torch.cuda.is_available() and local_rank < torch.cuda.device_count():
        torch.cuda.set_device(local_rank)
    function_mapping = {'print_rank': print_rank, 'broadcast': broadcast}
    if args.func not in function_mapping:
        raise SystemExit(
            "--func must be one of {}, got {!r}".format(sorted(function_mapping), args.func)
        )
    dist.init_process_group(backend)
    try:
        function_mapping[args.func]()
    finally:
        # Release the process group even when the selected demo fails.
        dist.destroy_process_group()

Overwriting Launch.py


In [None]:
# Useful detail: 
# https://github.com/pytorch/pytorch/blob/master/torch/distributed/launch.py
!OMP_NUM_THREADS=1 torchrun --nproc_per_node=2 Launch.py --func "print_rank" --backend gloo

Hello from process 0 (out of 2)!
Hello from process 1 (out of 2)!


Q6: Which package is used for launching multiple processes in torch.distributed.launch? [check the source code in the detail link]
<br>
Task 3: Reserve two GPUs from NEF and try to run the script Launch.py.