In [103]:
import torch
from torch.nn import DataParallel, Linear
import torchvision
from torch.utils.data.dataset import Dataset
from torch.utils.data.dataloader import DataLoader
import numpy as np
import  matplotlib.pyplot as plt
from matplotlib.pyplot import imshow


In [104]:
# Export DistributedDataParallel_script.ipynb to a plain Python script so it can
# be launched with `python` in the cells below, then preview the top of the
# generated file with `head` to confirm the conversion succeeded.
!jupyter nbconvert --to script DistributedDataParallel_script.ipynb
!head DistributedDataParallel_script.py

[NbConvertApp] Converting notebook DistributedDataParallel_script.ipynb to script
[NbConvertApp] Writing 11661 bytes to DistributedDataParallel_script.py
#!/usr/bin/env python
# coding: utf-8

# In[5]:


import torch
from torch.nn import DataParallel, Linear
import torchvision
from torch.utils.data.dataset import Dataset


#### Single GPU training

In [107]:
# Baseline: time one run of the exported script with --issingle, which the
# script reports as "Single GPU training" (no rank/size/backend flags passed).
%time !python DistributedDataParallel_script.py --issingle

args =  Namespace(backend='gloo', batch=16, epochs=1, global_rank=None, isaml=False, issingle=True, local_rank=None, lr=0.001, size=None)
Single GPU training
loading data
Dataset MNIST
    Number of datapoints: 60000
    Root location: ./MNIST_data
    Split: Train
60000 10000
(784,) 5
using device cuda
100%|██████████████████████████████████████| 3750/3750 [00:12<00:00, 295.50it/s]
loss at epoch 0 is 0.013191968202590942
shape = torch.Size([128, 784]), mean of values = -0.003071286715567112
shape = torch.Size([128]), mean of values = 0.021458515897393227
shape = torch.Size([64, 128]), mean of values = 0.00920057762414217
shape = torch.Size([64]), mean of values = 0.035079047083854675
shape = torch.Size([10, 64]), mean of values = -0.024419544264674187
shape = torch.Size([10]), mean of values = -0.017619147896766663
100%|████████████████████████████████████████| 625/625 [00:01<00:00, 504.37it/s]
[7. 2. 1. 0. 4. 1. 4. 9. 5. 9.] [7. 2. 1. 0. 4. 1. 4. 9. 5. 9.]
validation loss is 0.153097

#### Single GPU distributed training

In [105]:
# Time the distributed code path with a single process (rank 0, world size 1)
# using the 'gloo' backend, for comparison against the --issingle baseline.
%time !python DistributedDataParallel_script.py --local_rank 0 --global_rank 0 --size 1 --backend 'gloo'

args =  Namespace(backend='gloo', batch=16, epochs=1, global_rank=0, isaml=False, issingle=False, local_rank=0, lr=0.001, size=1)
starting distibuted training Namespace(backend='gloo', batch=16, epochs=1, global_rank=0, isaml=False, issingle=False, local_rank=0, lr=0.001, size=1)
inside train_distrib rank 0 and world_size 1
loading data
Dataset MNIST
    Number of datapoints: 60000
    Root location: ./MNIST_data/0/
    Split: Train
60000 10000
(784,) 5
setting up
MASTER_ADDR = localhost
MASTER_PORT = 12355
using device 0
100%|██████████████████████████████████████| 3750/3750 [00:14<00:00, 251.31it/s]
loss at epoch 0 is 0.0966588482260704
shape = torch.Size([128, 784]), mean of values = -0.0026552591007202864
shape = torch.Size([128]), mean of values = 0.020683109760284424
shape = torch.Size([64, 128]), mean of values = 0.008393129333853722
shape = torch.Size([64]), mean of values = 0.025294966995716095
shape = torch.Size([10, 64]), mean of values = -0.028980279341340065
shape = torch.

In [106]:
# Same single-process distributed run (rank 0, world size 1) as the gloo cell,
# but with the 'nccl' backend, so the two backends can be compared directly.
%time !python DistributedDataParallel_script.py --local_rank 0 --global_rank 0 --size 1 --backend 'nccl'

args =  Namespace(backend='nccl', batch=16, epochs=1, global_rank=0, isaml=False, issingle=False, local_rank=0, lr=0.001, size=1)
starting distibuted training Namespace(backend='nccl', batch=16, epochs=1, global_rank=0, isaml=False, issingle=False, local_rank=0, lr=0.001, size=1)
inside train_distrib rank 0 and world_size 1
loading data
Dataset MNIST
    Number of datapoints: 60000
    Root location: ./MNIST_data/0/
    Split: Train
60000 10000
(784,) 5
setting up
MASTER_ADDR = localhost
MASTER_PORT = 12355
using device 0
100%|██████████████████████████████████████| 3750/3750 [00:14<00:00, 267.42it/s]
loss at epoch 0 is 0.0966588482260704
shape = torch.Size([128, 784]), mean of values = -0.0026552591007202864
shape = torch.Size([128]), mean of values = 0.020683109760284424
shape = torch.Size([64, 128]), mean of values = 0.008393129333853722
shape = torch.Size([64]), mean of values = 0.025294966995716095
shape = torch.Size([10, 64]), mean of values = -0.028980279341340065
shape = torch.

### Multi-process distributed training

In [108]:
# Time a distributed run declared with world size 2 and --batch 8 (the other
# runs used the default batch of 16); no --backend is given, so the script's
# default is used (the args printout shows 'gloo').
# NOTE(review): only rank 0 is launched from this cell — whether the script
# starts the second rank itself is not visible here; confirm.
%time !python DistributedDataParallel_script.py --local_rank 0 --global_rank 0 --size 2 --batch 8

args =  Namespace(backend='gloo', batch=8, epochs=1, global_rank=0, isaml=False, issingle=False, local_rank=0, lr=0.001, size=2)
starting distibuted training Namespace(backend='gloo', batch=8, epochs=1, global_rank=0, isaml=False, issingle=False, local_rank=0, lr=0.001, size=2)
inside train_distrib rank 0 and world_size 2
loading data
Dataset MNIST
    Number of datapoints: 60000
    Root location: ./MNIST_data/0/
    Split: Train
60000 10000
(784,) 5
setting up
MASTER_ADDR = localhost
MASTER_PORT = 12355
using device 0
100%|██████████████████████████████████████| 3750/3750 [00:15<00:00, 247.44it/s]
loss at epoch 0 is 0.17599113285541534
shape = torch.Size([128, 784]), mean of values = -0.0026919872034341097
shape = torch.Size([128]), mean of values = 0.021903974935412407
shape = torch.Size([64, 128]), mean of values = 0.00897801574319601
shape = torch.Size([64]), mean of values = 0.027450013905763626
shape = torch.Size([10, 64]), mean of values = -0.028402984142303467
shape = torch.Si