### Mixed Precision

In [1]:
import torch
import torch.nn as nn

class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.w1 = nn.Linear(512, 512, bias=False)
        self.w2 = nn.Linear(512, 1, bias=False)
    
    def forward(self, x):
        z1 = self.w1(x)
        z2 = self.w2(z1)
        return z2

In [2]:
from torch.optim import SGD

fp32_model= Net().to("cuda")
optimizer = SGD(fp32_model.parameters(), lr=1e-2)

In [3]:
f"GPU = {torch.cuda.memory_allocated(0) / (1024 ** 2)} MB"

'GPU = 1.001953125 MB'

In [4]:
fp16_model = Net().half().to("cuda")
fp16_model.load_state_dict(fp32_model.state_dict())

<All keys matched successfully>

In [5]:
f"GPU = {torch.cuda.memory_allocated(0) / (1024 ** 2)} MB"

'GPU = 1.5029296875 MB'

In [6]:
import torch

# example input sizes
batch_size, hidden_size = 4, 512

# create dummy data (bsz=4, hid=512)
x = torch.randn(batch_size,hidden_size, dtype=torch.half, device="cuda") 

# do forward
z2 = fp16_model(x)

# check dtypr of output logits
f"logits type = {z2.dtype}"

'logits type = torch.float16'

In [7]:
# craete dummy data (bsz=4)
y = torch.tensor([[1.9], [9.5], [0.9], [1.2]], dtype=torch.half, device="cuda")

# compute mean square error loss
L = torch.nn.functional.mse_loss(z2, y)

# check dtype of loss
f"loss type = {L.dtype}"

'loss type = torch.float16'

In [8]:
# loss scaling
L *= 1024

# do backward
L.backward()

In [9]:
print(f'before: {fp32_model.w1.weight}\n')
optimizer.step()
print(f'after: {fp32_model.w1.weight}\n')

before: Parameter containing:
tensor([[-0.0283, -0.0093,  0.0111,  ...,  0.0222,  0.0309, -0.0075],
        [ 0.0058,  0.0112,  0.0040,  ...,  0.0198,  0.0111,  0.0109],
        [-0.0315, -0.0137,  0.0191,  ..., -0.0410,  0.0124, -0.0407],
        ...,
        [ 0.0271, -0.0025, -0.0171,  ...,  0.0044,  0.0420,  0.0172],
        [ 0.0396,  0.0372, -0.0292,  ...,  0.0309, -0.0390,  0.0402],
        [-0.0374,  0.0217,  0.0248,  ..., -0.0145, -0.0097, -0.0148]],
       device='cuda:0', requires_grad=True)

after: Parameter containing:
tensor([[-0.0283, -0.0093,  0.0111,  ...,  0.0222,  0.0309, -0.0075],
        [ 0.0058,  0.0112,  0.0040,  ...,  0.0198,  0.0111,  0.0109],
        [-0.0315, -0.0137,  0.0191,  ..., -0.0410,  0.0124, -0.0407],
        ...,
        [ 0.0271, -0.0025, -0.0171,  ...,  0.0044,  0.0420,  0.0172],
        [ 0.0396,  0.0372, -0.0292,  ...,  0.0309, -0.0390,  0.0402],
        [-0.0374,  0.0217,  0.0248,  ..., -0.0145, -0.0097, -0.0148]],
       device='cuda:0', requ

In [10]:
# copy gradient to FP32 model
fp32_model.w1.weight.grad = fp16_model.w1.weight.grad.float()
fp32_model.w2.weight.grad = fp16_model.w2.weight.grad.float()

In [11]:
print(f'before: {fp32_model.w1.weight}\n')
optimizer.step()
print(f'after: {fp32_model.w1.weight}\n')

before: Parameter containing:
tensor([[-0.0283, -0.0093,  0.0111,  ...,  0.0222,  0.0309, -0.0075],
        [ 0.0058,  0.0112,  0.0040,  ...,  0.0198,  0.0111,  0.0109],
        [-0.0315, -0.0137,  0.0191,  ..., -0.0410,  0.0124, -0.0407],
        ...,
        [ 0.0271, -0.0025, -0.0171,  ...,  0.0044,  0.0420,  0.0172],
        [ 0.0396,  0.0372, -0.0292,  ...,  0.0309, -0.0390,  0.0402],
        [-0.0374,  0.0217,  0.0248,  ..., -0.0145, -0.0097, -0.0148]],
       device='cuda:0', requires_grad=True)

after: Parameter containing:
tensor([[-0.6399, -1.1324, -2.4314,  ..., -3.1628,  1.4747, -1.5225],
        [-0.5933, -1.0895, -2.3885,  ..., -3.1002,  1.4261, -1.4741],
        [ 0.3779,  0.7382,  1.6541,  ...,  2.0902, -0.9539,  0.9737],
        ...,
        [-0.0872, -0.2125, -0.4740,  ..., -0.5913,  0.3122, -0.2662],
        [ 0.0162, -0.0058, -0.1226,  ..., -0.0910,  0.0162, -0.0178],
        [ 0.1802,  0.4217,  0.8942,  ...,  1.1192, -0.5238,  0.5246]],
       device='cuda:0', requ

### ZeRO-DP

In [13]:
from IPython.display import HTML

HTML("""
<div align="middle">
<video width="80%" controls>
      <source src="../images/zero_video.mp4" type="video/mp4">
</video></div>""")