### summary
- 数据并行（数据分割、模型拷贝） vs 模型并行（模型分割）
    - 该模块在每个设备上复制，每个副本处理输入的一部分
    - 在反向传播过程中，每个副本的梯度会被累加（求和）到原始模块上
- DP => DDP
    - DP：nn.DataParallel
    - DDP: DistributedDataParallel
    - Use nn.parallel.DistributedDataParallel instead of multiprocessing or nn.DataParallel (see the PyTorch notes on Distributed Data Parallel).

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Toy problem dimensions: 5 input features per sample, 2 outputs from the model.
input_size, output_size = 5, 2

# 100 random samples in total, served in mini-batches of 30.
batch_size, data_size = 30, 100

In [2]:
# Prefer the first CUDA GPU when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
device

device(type='cuda', index=0)

dummy dataset

In [3]:
class RandomDataset(Dataset):
    """In-memory dataset of standard-normal random feature vectors.

    Args:
        size: Number of features per sample.
        length: Number of samples held by the dataset.
    """

    def __init__(self, size, length):
        # One row per sample, e.g. a (100, 5) tensor for length=100, size=5.
        self.data = torch.randn(length, size)
        self.len = length

    def __len__(self):
        """Return the number of samples (the ``length`` given at construction)."""
        return self.len

    def __getitem__(self, index):
        """Return the row of ``self.data`` at ``index``.

        An integer index yields a 1-D tensor with ``size`` elements.
        """
        return self.data[index]

# Shuffled mini-batch iterator over a freshly generated random dataset.
rand_loader = DataLoader(
    RandomDataset(size=input_size, length=data_size),
    batch_size=batch_size,
    shuffle=True,
)

In [4]:
# Pull a single batch from the loader and show its size.
next(iter(rand_loader)).size()

torch.Size([30, 5])

simple model

In [5]:
class Model(nn.Module):
    """Single fully connected layer that reports the batch sizes it sees.

    Args:
        input_size: Number of input features of the linear layer.
        output_size: Number of output features of the linear layer.
        *args: Forwarded unchanged to ``nn.Module.__init__``.
        **kwargs: Forwarded unchanged to ``nn.Module.__init__``.
    """

    def __init__(self, input_size, output_size, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.fc = nn.Linear(in_features=input_size, out_features=output_size)

    def forward(self, input):
        """Apply the linear layer, print input/output sizes, return the output."""
        result = self.fc(input)
        # Printed on every call so the size seen by this module instance is
        # visible in the cell output.
        seen_in, seen_out = input.size(), result.size()
        print("\tIn Model: input size", seen_in, "output size", seen_out)
        return result

### DataParallel

In [10]:
# Build the 5-in / 2-out model; when more than one GPU is visible, report the
# count and wrap the model in nn.DataParallel.
model = Model(input_size, output_size)
if torch.cuda.device_count() > 1:
    print(f"{torch.cuda.device_count()} gpus")
    model = nn.DataParallel(module=model)

9 gpus


In [11]:
# Display the model's repr to confirm whether it was wrapped in DataParallel.
model

DataParallel(
  (module): Model(
    (fc): Linear(in_features=5, out_features=2, bias=True)
  )
)

tensors: to(device)

In [20]:
# Tensor.to() hands back a new tensor on the target device and leaves the
# source tensor where it was, so `a.is_cuda` is printed both before and after
# the move for comparison with `b.is_cuda`.
a = torch.randn((3, 4))
print('a.is_cuda', a.is_cuda)
b = a.to(device="cuda:0")
print('a.is_cuda', a.is_cuda)
print('b.is_cuda', b.is_cuda)

a.is_cuda False
a.is_cuda False
b.is_cuda True


model: to(device)

In [22]:
# Calling .to() on a module moves the module's own parameters: after the call
# the original name `a` reports CUDA parameters as well, not only `b`.
a = Model(input_size=3, output_size=4)
print(next(a.parameters()).is_cuda)
b = a.to(device='cuda:0')
print(next(a.parameters()).is_cuda)
print(next(b.parameters()).is_cuda)

False
True
True


In [18]:
# Fresh 3x4 tensor of standard-normal samples.
a = torch.randn((3, 4))

run the model(forward)

In [25]:
# Move the model to the selected device, then push every batch through it.
# Model.forward prints the sizes it sees from the inside; the print below
# shows the sizes seen by the caller for the same batch.
# NOTE(review): `input` shadows the builtin of the same name at notebook
# scope; the name is kept as-is in case later cells rely on it.
model = model.to(device)
for data in rand_loader:
    input = data.to(device)
    output = model(input)
    print("Outside: input size", input.shape, "output_size", output.shape)

  return F.linear(input, self.weight, self.bias)


	In Model: input size torch.Size([4, 5]) output size torch.Size([4, 2])
	In Model: input size torch.Size([4, 5]) output size torch.Size([4, 2])
	In Model: input size torch.Size([4, 5]) output size torch.Size([4, 2])
	In Model: input size torch.Size([4, 5]) output size torch.Size([4, 2])
	In Model: input size torch.Size([4, 5]) output size torch.Size([4, 2])
	In Model: input size torch.Size([4, 5]) output size torch.Size([4, 2])
	In Model: input size torch.Size([4, 5]) output size torch.Size([4, 2])
	In Model: input size torch.Size([2, 5]) output size torch.Size([2, 2])
Outside: input size torch.Size([30, 5]) output_size torch.Size([30, 2])
	In Model: input size torch.Size([4, 5]) output size torch.Size([4, 2])
	In Model: input size torch.Size([4, 5]) output size torch.Size([4, 2])
	In Model: input size torch.Size([4, 5]) output size torch.Size([4, 2])
	In Model: input size torch.Size([4, 5]) output size torch.Size([4, 2])
	In Model: input size torch.Size([4, 5]) output size torch.Size(