# 1. Outline

- 数据并行是切数据，模型并行是切模型
    - 模型并行：单卡放不下一份模型
    - 将一份大模型，不同的层切分到不同的卡上

- device_map：Huggingface

- 模型并行：on ResNet

# 2. huggingface的支持

## 2.1 device_map

{"auto", "balanced", "balanced_low_0", "sequential"}

- auto
    - GPU > CPU > Disk

In [1]:
import transformers
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig, AutoTokenizer, AutoModelForCausalLM

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
model = LlamaForCausalLM.from_pretrained("decapoda-research/llama-7b-hf",
                                         load_in_8bit=True,
                                         device_map="auto")

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


OSError: We couldn't connect to 'https://huggingface.co' to load this file, couldn't find it in the cached files and it looks like decapoda-research/llama-7b-hf is not the path to a directory containing a file named config.json.
Checkout your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'.

## 3. toy example

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim

In [6]:
class ToyModel(nn.Module):
    def __init__(self):
        super(ToyModel, self).__init__()

        self.net1 = torch.nn.Linear(10000, 10).to("cuda:0")
        self.relu = torch.nn.ReLU()
        self.net2 = torch.nn.Linear(10, 5).to("cuda:1")

    def forward(self, x):
        x = self.relu(self.net1(x.to("cuda:0")))
        return self.net2(x.to("cuda:1"))

In [None]:
"""
数据在哪个cuda, 使用的model就在哪个cuda
labels与输出进行计算loss, 输出在cuda:1, 所以labels也需要在cuda:1
"""

model = ToyModel()
loss_fn = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.001)

optimizer.zero_grad()
outputs = model(torch.randn(20, 10000))
labels = torch.randn(20, 5).to("cuda:1")
loss_fn(outputs, labels).backward()
optimizer.step()

## 4. split ResNet

model = ResNet(block, layers, **kwargs)

In [8]:
from torch import nn
from torchvision.models.resnet import ResNet, Bottleneck

In [10]:
"""
resnet-50
"""
model = ResNet(Bottleneck, [3,4,6,3])
model

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [11]:
from torchsummary import summary

In [12]:
summary(model, input_size=(3, 128, 128), device='cpu')

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 64, 64, 64]           9,408
       BatchNorm2d-2           [-1, 64, 64, 64]             128
              ReLU-3           [-1, 64, 64, 64]               0
         MaxPool2d-4           [-1, 64, 32, 32]               0
            Conv2d-5           [-1, 64, 32, 32]           4,096
       BatchNorm2d-6           [-1, 64, 32, 32]             128
              ReLU-7           [-1, 64, 32, 32]               0
            Conv2d-8           [-1, 64, 32, 32]          36,864
       BatchNorm2d-9           [-1, 64, 32, 32]             128
             ReLU-10           [-1, 64, 32, 32]               0
           Conv2d-11          [-1, 256, 32, 32]          16,384
      BatchNorm2d-12          [-1, 256, 32, 32]             512
           Conv2d-13          [-1, 256, 32, 32]          16,384
      BatchNorm2d-14          [-1, 256,

### 4.1 自定义模型并行

In [14]:
t = torch.rand((2, 3, 4))
print(t.shape)
t.view(t.size(0), -1).shape

torch.Size([2, 3, 4])


torch.Size([2, 12])

In [15]:
from typing import Callable, List, Type


from torch.nn.modules import Module
from torchvision.models.resnet import BasicBlock, Bottleneck


class ModelParallelResNet50(ResNet):
    def __init__(self, num_classes=1000):
        super().__init__(Bottleneck, [3,4,6,3], num_classes=num_classes)

        self.seq1 = nn.Sequential(
            self.conv1,
            self.bn1,
            self.relu,
            self.maxpool,
            self.layer1,
            self.layer2
        ).to("cuda:0")

        self.seq2 = nn.Sequential(
            self.layer3,
            self.layer4,
            self.avgpool
        ).to("cuda:1")

        self.fc.to("cuda:1")

    def forward(self, x):
        x = self.seq2(self.seq1(x).to("cuda:1"))
        return self.fc(x.view(x.size(0), -1))

### 4.2 train pipeline

In [16]:
nums_classes = 1000

In [17]:
ont_hot_indices = torch.LongTensor(5).random_(0, nums_classes).view(5, 1)
ont_hot_indices

tensor([[177],
        [567],
        [598],
        [745],
        [105]])

In [18]:
labels = torch.zeros(5, nums_classes).scatter_(1, ont_hot_indices, 1)
labels

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [None]:
num_batches = 3
batchs_size = 120
image_w = 128
image_h = 128

def train(model):
    model.train(True)
    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(model.parameters(), lr=0.001)

    ont_hot_indices = torch.LongTensor(batchs_size).random_(0, nums_classes).view(batchs_size, 1)

    for _ in range(num_batches):
        inputs = torch.randn(batchs_size, 3, image_w, image_h)
        labels = torch.zeros(batchs_size, nums_classes).scatter_(1, ont_hot_indices, 1)

        optimizer.zero_grad()
        outputs = model(inputs.to("cuda:0"))
        labels = labels.to(outputs.device)
        loss_fn(outputs, labels).backward()
        optimizer.step()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import timeit

"""
模型并行多gpu性能是要比单gpu差一些
"""

num_repeat = 10

stmt = "train(model)"

# 模型并行
setup = "model = ModelParallelResNet50()"
mp_run_times = timeit.repeat(
    stmt, setup, number=1, repeat=num_repeat, globals=globals()
)
mp_mean, mp_std = np.mean(mp_run_times), np.std(mp_run_times)

# 单卡
setup = "import torchvision.models as models;" + \
        "model = models.resnet50(num_classes=num_classes).to('cuda:0')"
rn_run_times = timeit.repeat(
    stmt, setup, number=1, repeat=num_repeat, globals=globals()
)
rn_mean, rn_std = np.mean(rn_run_times), np.std(rn_run_times)